diff --git a/README b/README index bfd1bb6..9e68b9a 100644 --- a/README +++ b/README @@ -278,7 +278,7 @@ the following information: * Copying -Copyright (C) 2009-2023 Sergey Poznyakoff +Copyright (C) 2009-2025 Sergey Poznyakoff Permission is granted to anyone to make or distribute verbatim copies of this document as received, in any medium, provided that the diff --git a/wordsplit.3 b/wordsplit.3 index 4f86f1a..06af7bd 100644 --- a/wordsplit.3 +++ b/wordsplit.3 @@ -1,5 +1,5 @@ .\" This file is part of wordsplit -*- nroff -*- -.\" Copyright (C) 2009-2021 Sergey Poznyakoff +.\" Copyright (C) 2009-2025 Sergey Poznyakoff .\" .\" Wordsplit is free software; you can redistribute it and/or modify .\" it under the terms of the GNU General Public License as published by @@ -14,7 +14,7 @@ .\" You should have received a copy of the GNU General Public License .\" along with wordsplit. If not, see . .\" -.TH WORDSPLIT 3 "June 22, 2023" "WORDSPLIT" "Wordsplit User Reference" +.TH WORDSPLIT 3 "March 15, 2025" "WORDSPLIT" "Wordsplit User Reference" .SH NAME wordsplit \- split string into words .SH SYNOPSIS @@ -558,6 +558,43 @@ the last word. For example, if the input to the above fragment were "is" "the time for all good men" .EE +.SH COMPATIBILITY QUIRKS +If +.B WRDSF_RETURN_DELIMS +is set and +.B WRDSF_SQUEEZE_DELIMS +is not, +.B wordsplit +returns an empty word between each pair of contiguous delimiters. +Consider, for example, the following fragment: +.PP +.EX +struct wordsplit ws; +ws.ws_delim = ":"; +wordsplit(str, &ws, WRDSF_DELIM | WRDSF_RETURN_DELIMS); +.EE +.PP +If \fIstr\fR contained \fBroot:0:0::/root:/bin/sh\fR, the +resulting \fBws.ws_wordv\fR array would be: +.PP +.EX +{ "root", ":", "0", ":", "0", ":", "", ":", "/root", ":", "/bin/sh" } +.EE +.PP +Notice the empty word at index 6.
Earlier versions of +.B wordsplit +(up to v1.1-7-g0e1a09c) behaved differently: several contiguous +delimiters were returned one after another, without empty words in +between, like this: +.PP +.EX +{ "root", ":", "0", ":", "0", ":", ":", "/root", ":", "/bin/sh" } +.EE +.PP +To request this behavior, use the +.B WRDSO_RETDELNOTEMPTY +option. Its use is not advised, except to +ensure backward compatibility with earlier wordsplit versions. .SH WORDSPLIT_T STRUCTURE The data type \fBwordsplit_t\fR has three members that contain output data upon return from \fBwordsplit\fR or \fBwordsplit_len\fR, @@ -1256,7 +1293,7 @@ Backtick command expansion is not supported. .SH "BUG REPORTS" Report bugs to . .SH COPYRIGHT -Copyright \(co 2009-2019 Sergey Poznyakoff +Copyright \(co 2009\(en2025 Sergey Poznyakoff .br .na License GPLv3+: GNU GPL version 3 or later diff --git a/wordsplit.at b/wordsplit.at index 38114c5..c22711b 100644 --- a/wordsplit.at +++ b/wordsplit.at @@ -1,5 +1,5 @@ # Test suite for wordsplit -*- Autotest -*- -# Copyright (C) 2014-2023 Sergey Poznyakoff +# Copyright (C) 2014-2025 Sergey Poznyakoff # # Wordsplit is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -496,6 +496,24 @@ TOTAL: 9 TESTWSP([custom, with returned & squeezed delimiters],[], [-delim : -nows -trimnl -return_delims -nosqueeze_delims], [semicolon: separated::list: of :words], +[NF: 11 +0: semicolon +1: : +2: " separated" +3: : +4: "" +5: : +6: list +7: : +8: " of " +9: : +10: words +TOTAL: 11 +]) + +TESTWSP([custom, with returned & squeezed delimiters (compat)],[], +[-delim : -nows -trimnl -return_delims -retdelnotempty -nosqueeze_delims], +[semicolon: separated::list: of :words], [NF: 10 0: semicolon 1: : @@ -510,6 +528,17 @@ TESTWSP([custom, with returned & squeezed delimiters],[], TOTAL: 10 ]) +TESTWSP([with maxwords limit],[], +[-nodefault -delim : -trimnl -maxwords 4], +[foo::baz:qux], +[NF: 4 +0: foo +1: ""
+2: baz +3: qux +TOTAL: 4 +]) + WSPGROUP(wsp-sed) TESTWSP([sed expressions],[],[-sed], @@ -922,6 +951,21 @@ TESTWSP([maxwords return_delims -squeeze_delims],[], [NF: 8 0: foo 1: : +2: "" +3: : +4: "" +5: : +6: bar +7: -:baz:qux- +TOTAL: 8 +]) + +TESTWSP([maxwords return_delims -squeeze_delims (compat)],[], +[-trimnl -maxwords 8 -return_delims -retdelnotempty -nosqueeze_delims -delim :-], +[foo:::bar-:baz:qux-], +[NF: 8 +0: foo +1: : 2: : 3: : 4: bar @@ -1161,7 +1205,6 @@ TESTWSP([alternate value],[wsp-alt wsp-alt03], TOTAL: 2 ]) - m4_popdef([TESTWSP]) m4_popdef([wspnum]) m4_popdef([wspid]) diff --git a/wordsplit.c b/wordsplit.c index 59830f0..2ce6d3b 100644 --- a/wordsplit.c +++ b/wordsplit.c @@ -1,5 +1,5 @@ /* wordsplit - a word splitter - Copyright (C) 2009-2023 Sergey Poznyakoff + Copyright (C) 2009-2025 Sergey Poznyakoff This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -185,7 +185,7 @@ _wsplt_subsplit (struct wordsplit *wsp, struct wordsplit *wss, wss->ws_options = wsp->ws_options & ~WRDSO_MAXWORDS; wss->ws_namechar = wsp->ws_namechar; - + flags |= WRDSF_DELIM | WRDSF_ALLOC_DIE | WRDSF_ERROR @@ -330,7 +330,7 @@ wordsplit_init (struct wordsplit *wsp, const char *input, size_t len, wsp->ws_escape[WRDSX_WORD] = wordsplit_escape[WS_ESC_C_WS]; wsp->ws_escape[WRDSX_QUOTE] = wordsplit_escape[WS_ESC_C]; wsp->ws_options |= WRDSO_OESC_QUOTE | WRDSO_OESC_WORD - | WRDSO_XESC_QUOTE | WRDSO_XESC_WORD; + | WRDSO_XESC_QUOTE | WRDSO_XESC_WORD; } else { @@ -353,7 +353,7 @@ wordsplit_init (struct wordsplit *wsp, const char *input, size_t len, } else wsp->ws_namechar = NULL; - + wsp->ws_endp = 0; wsp->ws_wordi = 0; @@ -794,7 +794,6 @@ wordsplit_finish (struct wordsplit *wsp) is set. 
*/ again: - delim = 0; /* Delimiter being processed (if any) */ n = 0; /* Number of words processed so far */ p = wsp->ws_head; /* Current node */ @@ -803,56 +802,48 @@ wordsplit_finish (struct wordsplit *wsp) struct wordsplit_node *next = p->next; if (p->flags & _WSNF_DELIM) { - if (wsp->ws_flags & WRDSF_RETURN_DELIMS) + if (wsp->ws_flags & WRDSF_SQUEEZE_DELIMS) { - if (wsp->ws_flags & WRDSF_SQUEEZE_DELIMS) + if (next) { - char const *s = wsnode_ptr (wsp, p); - if (delim) + if ((next->flags & _WSNF_DELIM) && + (wsnode_ptr (wsp, p))[0] == (wsnode_ptr (wsp, next))[0]) { - if (delim == *s) - { - wsnode_remove (wsp, p); - p = next; - continue; - } - else - { - delim = 0; - n++; /* Count this node; it will be returned */ - } - } - else - { - delim = *s; + wsnode_remove (wsp, p); p = next; continue; } } + else if (wsp->ws_flags & WRDSF_INCREMENTAL) + goto restart; } - else if (wsp->ws_options & WRDSO_MAXWORDS) + else if ((next && (next->flags & _WSNF_DELIM)) && + (!(wsp->ws_options & WRDSO_RETDELNOTEMPTY))) + { + int rc; + struct wordsplit_node *nulnode; + if ((rc = wsnode_new (wsp, &nulnode)) != 0) + return rc; + nulnode->flags = _WSNF_NULL | _WSNF_NOEXPAND; + wsnode_insert (wsp, nulnode, p, 0); + next = nulnode; + } + + if ((wsp->ws_options & WRDSO_MAXWORDS) && + !(wsp->ws_flags & WRDSF_RETURN_DELIMS)) { wsnode_remove (wsp, p); p = next; continue; } } - else - { - if (delim) - { - /* Last node was a delimiter or a compressed run of delimiters; - Count it, and clear the delimiter marker */ - n++; - delim = 0; - } - if (wsp->ws_options & WRDSO_MAXWORDS) - { - if (wsp->ws_wordi + n + 1 == wsp->ws_maxwords) - break; - } - } + n++; + + if ((wsp->ws_options & WRDSO_MAXWORDS) && + (wsp->ws_wordi + n == wsp->ws_maxwords)) + break; + if (wsp->ws_flags & WRDSF_INCREMENTAL) p = NULL; /* Break the loop */ else @@ -875,6 +866,7 @@ wordsplit_finish (struct wordsplit *wsp) if (wsp->ws_flags & WRDSF_INCREMENTAL) { /* Restart the processing, if there's any input left. 
*/ + restart: if (wsp->ws_endp < wsp->ws_len) { int rc; @@ -1072,7 +1064,7 @@ wsplt_env_find (struct wordsplit *wsp, const char *name, size_t len) break; } } - else + else { /* Usual (A=B) environment. */ for (i = 0; wsp->ws_env[i]; i++) @@ -1680,7 +1672,7 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, return 1; wsnode_insert (wsp, newnode, *ptail, 0); *ptail = newnode; - newnode->flags = _WSNF_NULL; + newnode->flags = _WSNF_NULL | _WSNF_NOEXPAND; } else { @@ -1726,7 +1718,7 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, return 1; wsnode_insert (wsp, newnode, *ptail, 0); *ptail = newnode; - newnode->flags = _WSNF_NULL; + newnode->flags = _WSNF_NULL | _WSNF_NOEXPAND; } return 0; } @@ -1897,7 +1889,7 @@ expcmd (struct wordsplit *wsp, const char *str, size_t len, return 1; wsnode_insert (wsp, newnode, *ptail, 0); *ptail = newnode; - newnode->flags = _WSNF_NULL; + newnode->flags = _WSNF_NULL | _WSNF_NOEXPAND; } else { @@ -1928,7 +1920,7 @@ expcmd (struct wordsplit *wsp, const char *str, size_t len, return 1; wsnode_insert (wsp, newnode, *ptail, 0); *ptail = newnode; - newnode->flags = _WSNF_NULL; + newnode->flags = _WSNF_NULL | _WSNF_NOEXPAND; } return 0; } @@ -1983,7 +1975,7 @@ wordsplit_trimws (struct wordsplit *wsp) n > p->v.segm.beg && ISWS (wsp->ws_input[n - 1]); n--); p->v.segm.end = n; if (p->v.segm.beg == p->v.segm.end) - p->flags |= _WSNF_NULL; + p->flags |= _WSNF_NULL | _WSNF_NOEXPAND; } wsnode_nullelim (wsp); @@ -2355,7 +2347,7 @@ scan_word (struct wordsplit *wsp, size_t start, int consume_all) else if (WSP_RETURN_DELIMS (wsp)) { i++; - flags |= _WSNF_DELIM; + flags |= _WSNF_DELIM | _WSNF_NOEXPAND; } else if (!(wsp->ws_flags & WRDSF_SQUEEZE_DELIMS)) flags |= _WSNF_EMPTYOK; diff --git a/wordsplit.h b/wordsplit.h index 768df34..c5eb54c 100644 --- a/wordsplit.h +++ b/wordsplit.h @@ -1,5 +1,5 @@ /* wordsplit - a word splitter - Copyright (C) 2009-2023 Sergey Poznyakoff + Copyright (C) 2009-2025 Sergey Poznyakoff This program is 
free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -254,6 +254,16 @@ struct wordsplit /* ws_namechar member is initialized */ #define WRDSO_NAMECHAR 0x00010000 +/* If WRDSF_RETURN_DELIMS is set and WRDSF_SQUEEZE_DELIMS is not, wordsplit + returns an empty word between each pair of contiguous delimiters. This + is consistent with the behavior when WRDSF_RETURN_DELIMS is not set. + However, earlier versions (v1.1) behaved differently: several contiguous + delimiters were returned one after another, without empty words in between. + The WRDSO_RETDELNOTEMPTY option mimics that behavior. Its use is not + advised, except to ensure backward compatibility with earlier wordsplit + versions. */ +#define WRDSO_RETDELNOTEMPTY 0x00020000 + #define WRDSO_BSKEEP WRDSO_BSKEEP_WORD #define WRDSO_OESC WRDSO_OESC_WORD #define WRDSO_XESC WRDSO_XESC_WORD diff --git a/wsp.c b/wsp.c index 58a9c8d..79d577e 100644 --- a/wsp.c +++ b/wsp.c @@ -1,5 +1,5 @@ /* wsp - test program for wordsplit - Copyright (C) 2014-2023 Sergey Poznyakoff + Copyright (C) 2014-2025 Sergey Poznyakoff Wordsplit is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -458,6 +458,7 @@ struct wsopt opttab[] = { { "nocmdsplit", WRDSO_NOCMDSPLIT, ws_boolean, setfn_option }, { "maxwords", WRDSO_MAXWORDS, ws_required_argument, setfn_maxwords }, { "namechar", WRDSO_NAMECHAR, ws_required_argument, setfn_namechar }, + { "retdelnotempty", WRDSO_RETDELNOTEMPTY, ws_boolean, setfn_option }, /* String options */ { "delim", WRDSF_DELIM, ws_required_argument, setfn_delim }, { "comment", WRDSF_COMMENT,ws_required_argument, setfn_comment },