Fix handling of empty words when WRDSF_RETURN_DELIMS or WRDSO_MAXWORDS are in effect

* README: Update.
* wordsplit.3: Document changes.
* wordsplit.at: Test backward compatibility quirk.
* wordsplit.c: Make sure NULL and DELIM nodes are protected from
expansions.
(wordsplit_finish): Ensure the output array produced
with WRDSF_RETURN_DELIMS is consistent with that produced without this
flag.  Provide new option, WRDSO_RETDELNOTEMPTY, to request old buggy
behavior.
* wordsplit.h (WRDSO_RETDELNOTEMPTY): New option.
* wsp.c: New tests.
This commit is contained in:
Sergey Poznyakoff 2025-03-15 23:05:25 +02:00
parent 0e1a09c4c7
commit 8f3eb3433e
6 changed files with 138 additions and 55 deletions

2
README
View file

@ -278,7 +278,7 @@ the following information:
* Copying
Copyright (C) 2009-2023 Sergey Poznyakoff
Copyright (C) 2009-2025 Sergey Poznyakoff
Permission is granted to anyone to make or distribute verbatim copies
of this document as received, in any medium, provided that the

View file

@ -1,5 +1,5 @@
.\" This file is part of wordsplit -*- nroff -*-
.\" Copyright (C) 2009-2021 Sergey Poznyakoff
.\" Copyright (C) 2009-2025 Sergey Poznyakoff
.\"
.\" Wordsplit is free software; you can redistribute it and/or modify
.\" it under the terms of the GNU General Public License as published by
@ -14,7 +14,7 @@
.\" You should have received a copy of the GNU General Public License
.\" along with wordsplit. If not, see <http://www.gnu.org/licenses/>.
.\"
.TH WORDSPLIT 3 "June 22, 2023" "WORDSPLIT" "Wordsplit User Reference"
.TH WORDSPLIT 3 "March 15, 2025" "WORDSPLIT" "Wordsplit User Reference"
.SH NAME
wordsplit \- split string into words
.SH SYNOPSIS
@ -558,6 +558,43 @@ the last word. For example, if the input to the above fragment were
"is"
"the time for all good men"
.EE
.SH COMPATIBILITY QUIRKS
If
.B WRDSF_RETURN_DELIMS
is set and
.B WRDSF_SQUEEZE_DELIMS
is not,
.B wordsplit
returns an empty word between each pair of contiguous delimiters.
Consider, for example, the following fragment:
.PP
.EX
struct wordsplit ws;
ws.ws_delim = ":";
wordsplit(str, &ws, WRDSF_DELIM | WRDSF_RETURN_DELIMS);
.EE
.PP
If \fIstr\fR contained \fBroot:x:0:0::/root:/bin/sh\fR, the
resulting \fBws.ws_wordv\fR array would be:
.PP
.EX
{ "root", ":", "0", ":", "0", ":", "", ":", "/root", ":", "/bin/sh" }
.EE
.PP
Notice the empty word at index 6. Earlier versions of
.B wordsplit
(up to v1.1-7-g0e1a09c) behaved differently: several contiguous
delimiters were returned one after another, without empty words in
between, like this:
.PP
.EX
{ "root", ":", "0", ":", "0", ":", ":", "/root", ":", "/bin/sh" }
.EE
.PP
To request this behavior, use the
.B WRDSO_RETDELNOTEMPTY
option. Its use is not advised, except to
ensure backward compatibility with earlier wordsplit versions.
.SH WORDSPLIT_T STRUCTURE
The data type \fBwordsplit_t\fR has three members that contain
output data upon return from \fBwordsplit\fR or \fBwordsplit_len\fR,
@ -1256,7 +1293,7 @@ Backtick command expansion is not supported.
.SH "BUG REPORTS"
Report bugs to <gray@gnu.org>.
.SH COPYRIGHT
Copyright \(co 2009-2019 Sergey Poznyakoff
Copyright \(co 2009\(en2025 Sergey Poznyakoff
.br
.na
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>

View file

@ -1,5 +1,5 @@
# Test suite for wordsplit -*- Autotest -*-
# Copyright (C) 2014-2023 Sergey Poznyakoff
# Copyright (C) 2014-2025 Sergey Poznyakoff
#
# Wordsplit is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -496,6 +496,24 @@ TOTAL: 9
TESTWSP([custom, with returned & squeezed delimiters],[],
[-delim : -nows -trimnl -return_delims -nosqueeze_delims],
[semicolon: separated::list: of :words],
[NF: 11
0: semicolon
1: :
2: " separated"
3: :
4: ""
5: :
6: list
7: :
8: " of "
9: :
10: words
TOTAL: 11
])
TESTWSP([custom, with returned & squeezed delimiters (compat)],[],
[-delim : -nows -trimnl -return_delims -retdelnotempty -nosqueeze_delims],
[semicolon: separated::list: of :words],
[NF: 10
0: semicolon
1: :
@ -510,6 +528,17 @@ TESTWSP([custom, with returned & squeezed delimiters],[],
TOTAL: 10
])
TESTWSP([with maxwords limit],[],
[-nodefault -delim : -trimnl -maxwords 4],
[foo::baz:qux],
[NF: 4
0: foo
1: ""
2: baz
3: qux
TOTAL: 4
])
WSPGROUP(wsp-sed)
TESTWSP([sed expressions],[],[-sed],
@ -922,6 +951,21 @@ TESTWSP([maxwords return_delims -squeeze_delims],[],
[NF: 8
0: foo
1: :
2: ""
3: :
4: ""
5: :
6: bar
7: -:baz:qux-
TOTAL: 8
])
TESTWSP([maxwords return_delims -squeeze_delims (compat)],[],
[-trimnl -maxwords 8 -return_delims -retdelnotempty -nosqueeze_delims -delim :-],
[foo:::bar-:baz:qux-],
[NF: 8
0: foo
1: :
2: :
3: :
4: bar
@ -1161,7 +1205,6 @@ TESTWSP([alternate value],[wsp-alt wsp-alt03],
TOTAL: 2
])
m4_popdef([TESTWSP])
m4_popdef([wspnum])
m4_popdef([wspid])

View file

@ -1,5 +1,5 @@
/* wordsplit - a word splitter
Copyright (C) 2009-2023 Sergey Poznyakoff
Copyright (C) 2009-2025 Sergey Poznyakoff
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
@ -185,7 +185,7 @@ _wsplt_subsplit (struct wordsplit *wsp, struct wordsplit *wss,
wss->ws_options = wsp->ws_options & ~WRDSO_MAXWORDS;
wss->ws_namechar = wsp->ws_namechar;
flags |= WRDSF_DELIM
| WRDSF_ALLOC_DIE
| WRDSF_ERROR
@ -330,7 +330,7 @@ wordsplit_init (struct wordsplit *wsp, const char *input, size_t len,
wsp->ws_escape[WRDSX_WORD] = wordsplit_escape[WS_ESC_C_WS];
wsp->ws_escape[WRDSX_QUOTE] = wordsplit_escape[WS_ESC_C];
wsp->ws_options |= WRDSO_OESC_QUOTE | WRDSO_OESC_WORD
| WRDSO_XESC_QUOTE | WRDSO_XESC_WORD;
| WRDSO_XESC_QUOTE | WRDSO_XESC_WORD;
}
else
{
@ -353,7 +353,7 @@ wordsplit_init (struct wordsplit *wsp, const char *input, size_t len,
}
else
wsp->ws_namechar = NULL;
wsp->ws_endp = 0;
wsp->ws_wordi = 0;
@ -794,7 +794,6 @@ wordsplit_finish (struct wordsplit *wsp)
is set.
*/
again:
delim = 0; /* Delimiter being processed (if any) */
n = 0; /* Number of words processed so far */
p = wsp->ws_head; /* Current node */
@ -803,56 +802,48 @@ wordsplit_finish (struct wordsplit *wsp)
struct wordsplit_node *next = p->next;
if (p->flags & _WSNF_DELIM)
{
if (wsp->ws_flags & WRDSF_RETURN_DELIMS)
if (wsp->ws_flags & WRDSF_SQUEEZE_DELIMS)
{
if (wsp->ws_flags & WRDSF_SQUEEZE_DELIMS)
if (next)
{
char const *s = wsnode_ptr (wsp, p);
if (delim)
if ((next->flags & _WSNF_DELIM) &&
(wsnode_ptr (wsp, p))[0] == (wsnode_ptr (wsp, next))[0])
{
if (delim == *s)
{
wsnode_remove (wsp, p);
p = next;
continue;
}
else
{
delim = 0;
n++; /* Count this node; it will be returned */
}
}
else
{
delim = *s;
wsnode_remove (wsp, p);
p = next;
continue;
}
}
else if (wsp->ws_flags & WRDSF_INCREMENTAL)
goto restart;
}
else if (wsp->ws_options & WRDSO_MAXWORDS)
else if ((next && (next->flags & _WSNF_DELIM)) &&
(!(wsp->ws_options & WRDSO_RETDELNOTEMPTY)))
{
int rc;
struct wordsplit_node *nulnode;
if ((rc = wsnode_new (wsp, &nulnode)) != 0)
return rc;
nulnode->flags = _WSNF_NULL | _WSNF_NOEXPAND;
wsnode_insert (wsp, nulnode, p, 0);
next = nulnode;
}
if ((wsp->ws_options & WRDSO_MAXWORDS) &&
!(wsp->ws_flags & WRDSF_RETURN_DELIMS))
{
wsnode_remove (wsp, p);
p = next;
continue;
}
}
else
{
if (delim)
{
/* Last node was a delimiter or a compressed run of delimiters;
Count it, and clear the delimiter marker */
n++;
delim = 0;
}
if (wsp->ws_options & WRDSO_MAXWORDS)
{
if (wsp->ws_wordi + n + 1 == wsp->ws_maxwords)
break;
}
}
n++;
if ((wsp->ws_options & WRDSO_MAXWORDS) &&
(wsp->ws_wordi + n == wsp->ws_maxwords))
break;
if (wsp->ws_flags & WRDSF_INCREMENTAL)
p = NULL; /* Break the loop */
else
@ -875,6 +866,7 @@ wordsplit_finish (struct wordsplit *wsp)
if (wsp->ws_flags & WRDSF_INCREMENTAL)
{
/* Restart the processing, if there's any input left. */
restart:
if (wsp->ws_endp < wsp->ws_len)
{
int rc;
@ -1072,7 +1064,7 @@ wsplt_env_find (struct wordsplit *wsp, const char *name, size_t len)
break;
}
}
else
else
{
/* Usual (A=B) environment. */
for (i = 0; wsp->ws_env[i]; i++)
@ -1680,7 +1672,7 @@ expvar (struct wordsplit *wsp, const char *str, size_t len,
return 1;
wsnode_insert (wsp, newnode, *ptail, 0);
*ptail = newnode;
newnode->flags = _WSNF_NULL;
newnode->flags = _WSNF_NULL | _WSNF_NOEXPAND;
}
else
{
@ -1726,7 +1718,7 @@ expvar (struct wordsplit *wsp, const char *str, size_t len,
return 1;
wsnode_insert (wsp, newnode, *ptail, 0);
*ptail = newnode;
newnode->flags = _WSNF_NULL;
newnode->flags = _WSNF_NULL | _WSNF_NOEXPAND;
}
return 0;
}
@ -1897,7 +1889,7 @@ expcmd (struct wordsplit *wsp, const char *str, size_t len,
return 1;
wsnode_insert (wsp, newnode, *ptail, 0);
*ptail = newnode;
newnode->flags = _WSNF_NULL;
newnode->flags = _WSNF_NULL | _WSNF_NOEXPAND;
}
else
{
@ -1928,7 +1920,7 @@ expcmd (struct wordsplit *wsp, const char *str, size_t len,
return 1;
wsnode_insert (wsp, newnode, *ptail, 0);
*ptail = newnode;
newnode->flags = _WSNF_NULL;
newnode->flags = _WSNF_NULL | _WSNF_NOEXPAND;
}
return 0;
}
@ -1983,7 +1975,7 @@ wordsplit_trimws (struct wordsplit *wsp)
n > p->v.segm.beg && ISWS (wsp->ws_input[n - 1]); n--);
p->v.segm.end = n;
if (p->v.segm.beg == p->v.segm.end)
p->flags |= _WSNF_NULL;
p->flags |= _WSNF_NULL | _WSNF_NOEXPAND;
}
wsnode_nullelim (wsp);
@ -2355,7 +2347,7 @@ scan_word (struct wordsplit *wsp, size_t start, int consume_all)
else if (WSP_RETURN_DELIMS (wsp))
{
i++;
flags |= _WSNF_DELIM;
flags |= _WSNF_DELIM | _WSNF_NOEXPAND;
}
else if (!(wsp->ws_flags & WRDSF_SQUEEZE_DELIMS))
flags |= _WSNF_EMPTYOK;

View file

@ -1,5 +1,5 @@
/* wordsplit - a word splitter
Copyright (C) 2009-2023 Sergey Poznyakoff
Copyright (C) 2009-2025 Sergey Poznyakoff
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
@ -254,6 +254,16 @@ struct wordsplit
/* ws_namechar member is initialized */
#define WRDSO_NAMECHAR 0x00010000
/* If WRDSF_RETURN_DELIMS is set and WRDSF_SQUEEZE_DELIMS is not, wordsplit
returns an empty word between each pair of contiguous delimiters. This
behavior is consistent with that without the WRDSF_RETURN_DELIMS flag.
However, earlier versions (v1.1) behaved differently: several contiguous
delimiters were returned one after another, without empty words in between.
The WRDSO_RETDELNOTEMPTY option mimics that behavior. Its use is not
advised, except to ensure backward compatibility with earlier wordsplit
versions. */
#define WRDSO_RETDELNOTEMPTY 0x00020000
#define WRDSO_BSKEEP WRDSO_BSKEEP_WORD
#define WRDSO_OESC WRDSO_OESC_WORD
#define WRDSO_XESC WRDSO_XESC_WORD

3
wsp.c
View file

@ -1,5 +1,5 @@
/* wsp - test program for wordsplit
Copyright (C) 2014-2023 Sergey Poznyakoff
Copyright (C) 2014-2025 Sergey Poznyakoff
Wordsplit is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
@ -458,6 +458,7 @@ struct wsopt opttab[] = {
{ "nocmdsplit", WRDSO_NOCMDSPLIT, ws_boolean, setfn_option },
{ "maxwords", WRDSO_MAXWORDS, ws_required_argument, setfn_maxwords },
{ "namechar", WRDSO_NAMECHAR, ws_required_argument, setfn_namechar },
{ "retdelnotempty", WRDSO_RETDELNOTEMPTY, ws_boolean, setfn_option },
/* String options */
{ "delim", WRDSF_DELIM, ws_required_argument, setfn_delim },
{ "comment", WRDSF_COMMENT,ws_required_argument, setfn_comment },