mirror of
git://git.gnu.org.ua/wordsplit.git
synced 2025-04-25 16:19:54 +03:00
Fix handling of empty words when WRDSF_RETURN_DELIMS or WRDSO_MAXWORDS are in effect
* README: Update. * wordsplit.3: Document changes. * wordsplit.at: Test backward compatibility quirk. * wordsplit.c: Make sure NULL and DELIM nodes are protected from expansions. (wordsplit_finish): Ensure the output array produced with WRDSF_RETURN_DELIMS is consistent with that produced without this flag. Provide new option, WRDSO_RETDELNOTEMPTY, to request old buggy behavior. * wordsplit.h (WRDSO_RETDELNOTEMPTY): New option. * wsp.c: New tests.
This commit is contained in:
parent
0e1a09c4c7
commit
8f3eb3433e
6 changed files with 138 additions and 55 deletions
2
README
2
README
|
@ -278,7 +278,7 @@ the following information:
|
|||
|
||||
* Copying
|
||||
|
||||
Copyright (C) 2009-2023 Sergey Poznyakoff
|
||||
Copyright (C) 2009-2025 Sergey Poznyakoff
|
||||
|
||||
Permission is granted to anyone to make or distribute verbatim copies
|
||||
of this document as received, in any medium, provided that the
|
||||
|
|
43
wordsplit.3
43
wordsplit.3
|
@ -1,5 +1,5 @@
|
|||
.\" This file is part of wordsplit -*- nroff -*-
|
||||
.\" Copyright (C) 2009-2021 Sergey Poznyakoff
|
||||
.\" Copyright (C) 2009-2025 Sergey Poznyakoff
|
||||
.\"
|
||||
.\" Wordsplit is free software; you can redistribute it and/or modify
|
||||
.\" it under the terms of the GNU General Public License as published by
|
||||
|
@ -14,7 +14,7 @@
|
|||
.\" You should have received a copy of the GNU General Public License
|
||||
.\" along with wordsplit. If not, see <http://www.gnu.org/licenses/>.
|
||||
.\"
|
||||
.TH WORDSPLIT 3 "June 22, 2023" "WORDSPLIT" "Wordsplit User Reference"
|
||||
.TH WORDSPLIT 3 "March 15, 2025" "WORDSPLIT" "Wordsplit User Reference"
|
||||
.SH NAME
|
||||
wordsplit \- split string into words
|
||||
.SH SYNOPSIS
|
||||
|
@ -558,6 +558,43 @@ the last word. For example, if the input to the above fragment were
|
|||
"is"
|
||||
"the time for all good men"
|
||||
.EE
|
||||
.SH COMPATIBILITY QUIRKS
|
||||
If
|
||||
.B WRDSF_RETURN_DELIMS
|
||||
is set and
|
||||
.B WRDSF_SQUEEZE_DELIMS
|
||||
is not,
|
||||
.B wordsplit
|
||||
returns an empty word between each pair of contiguous delimiters.
|
||||
Consider, for example, the following fragment:
|
||||
.PP
|
||||
.EX
|
||||
struct wordsplit ws;
|
||||
ws.ws_delim = ":";
|
||||
wordsplit(str, &ws, WRDSF_DELIM | WRDSF_RETURN_DELIMS);
|
||||
.EE
|
||||
.PP
|
||||
If \fIstr\fR contained \fBroot:x:0:0::/root:/bin/sh\fR, the
|
||||
resulting \fBws.ws_wordv\fR array would be:
|
||||
.PP
|
||||
.EX
|
||||
{ "root", ":", "0", ":", "0", ":", "", ":", "/root", ":", "/bin/sh" }
|
||||
.EE
|
||||
.PP
|
||||
Notice the empty word at index 6. Earlier versions of
|
||||
.B wordsplit
|
||||
(up to v1.1-7-g0e1a09c) behaved differently: several contiguous
|
||||
delimiters were returned one after another, without empty words in
|
||||
between, like this:
|
||||
.PP
|
||||
.EX
|
||||
{ "root", ":", "0", ":", "0", ":", ":", "/root", ":", "/bin/sh" }
|
||||
.EE
|
||||
.PP
|
||||
To request this behavior, use the
|
||||
.B WRDSO_RETDELNOTEMPTY
|
||||
option. Its use is not advised, except to
|
||||
ensure backward compatibility with earlier wordsplit versions.
|
||||
.SH WORDSPLIT_T STRUCTURE
|
||||
The data type \fBwordsplit_t\fR has three members that contain
|
||||
output data upon return from \fBwordsplit\fR or \fBwordsplit_len\fR,
|
||||
|
@ -1256,7 +1293,7 @@ Backtick command expansion is not supported.
|
|||
.SH "BUG REPORTS"
|
||||
Report bugs to <gray@gnu.org>.
|
||||
.SH COPYRIGHT
|
||||
Copyright \(co 2009-2019 Sergey Poznyakoff
|
||||
Copyright \(co 2009\(en2025 Sergey Poznyakoff
|
||||
.br
|
||||
.na
|
||||
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
|
||||
|
|
47
wordsplit.at
47
wordsplit.at
|
@ -1,5 +1,5 @@
|
|||
# Test suite for wordsplit -*- Autotest -*-
|
||||
# Copyright (C) 2014-2023 Sergey Poznyakoff
|
||||
# Copyright (C) 2014-2025 Sergey Poznyakoff
|
||||
#
|
||||
# Wordsplit is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
@ -496,6 +496,24 @@ TOTAL: 9
|
|||
TESTWSP([custom, with returned & squeezed delimiters],[],
|
||||
[-delim : -nows -trimnl -return_delims -nosqueeze_delims],
|
||||
[semicolon: separated::list: of :words],
|
||||
[NF: 11
|
||||
0: semicolon
|
||||
1: :
|
||||
2: " separated"
|
||||
3: :
|
||||
4: ""
|
||||
5: :
|
||||
6: list
|
||||
7: :
|
||||
8: " of "
|
||||
9: :
|
||||
10: words
|
||||
TOTAL: 11
|
||||
])
|
||||
|
||||
TESTWSP([custom, with returned & squeezed delimiters (compat)],[],
|
||||
[-delim : -nows -trimnl -return_delims -retdelnotempty -nosqueeze_delims],
|
||||
[semicolon: separated::list: of :words],
|
||||
[NF: 10
|
||||
0: semicolon
|
||||
1: :
|
||||
|
@ -510,6 +528,17 @@ TESTWSP([custom, with returned & squeezed delimiters],[],
|
|||
TOTAL: 10
|
||||
])
|
||||
|
||||
TESTWSP([with maxwords limit],[],
|
||||
[-nodefault -delim : -trimnl -maxwords 4],
|
||||
[foo::baz:qux],
|
||||
[NF: 4
|
||||
0: foo
|
||||
1: ""
|
||||
2: baz
|
||||
3: qux
|
||||
TOTAL: 4
|
||||
])
|
||||
|
||||
WSPGROUP(wsp-sed)
|
||||
|
||||
TESTWSP([sed expressions],[],[-sed],
|
||||
|
@ -922,6 +951,21 @@ TESTWSP([maxwords return_delims -squeeze_delims],[],
|
|||
[NF: 8
|
||||
0: foo
|
||||
1: :
|
||||
2: ""
|
||||
3: :
|
||||
4: ""
|
||||
5: :
|
||||
6: bar
|
||||
7: -:baz:qux-
|
||||
TOTAL: 8
|
||||
])
|
||||
|
||||
TESTWSP([maxwords return_delims -squeeze_delims (compat)],[],
|
||||
[-trimnl -maxwords 8 -return_delims -retdelnotempty -nosqueeze_delims -delim :-],
|
||||
[foo:::bar-:baz:qux-],
|
||||
[NF: 8
|
||||
0: foo
|
||||
1: :
|
||||
2: :
|
||||
3: :
|
||||
4: bar
|
||||
|
@ -1161,7 +1205,6 @@ TESTWSP([alternate value],[wsp-alt wsp-alt03],
|
|||
TOTAL: 2
|
||||
])
|
||||
|
||||
|
||||
m4_popdef([TESTWSP])
|
||||
m4_popdef([wspnum])
|
||||
m4_popdef([wspid])
|
||||
|
|
80
wordsplit.c
80
wordsplit.c
|
@ -1,5 +1,5 @@
|
|||
/* wordsplit - a word splitter
|
||||
Copyright (C) 2009-2023 Sergey Poznyakoff
|
||||
Copyright (C) 2009-2025 Sergey Poznyakoff
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by the
|
||||
|
@ -330,7 +330,7 @@ wordsplit_init (struct wordsplit *wsp, const char *input, size_t len,
|
|||
wsp->ws_escape[WRDSX_WORD] = wordsplit_escape[WS_ESC_C_WS];
|
||||
wsp->ws_escape[WRDSX_QUOTE] = wordsplit_escape[WS_ESC_C];
|
||||
wsp->ws_options |= WRDSO_OESC_QUOTE | WRDSO_OESC_WORD
|
||||
| WRDSO_XESC_QUOTE | WRDSO_XESC_WORD;
|
||||
| WRDSO_XESC_QUOTE | WRDSO_XESC_WORD;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -794,7 +794,6 @@ wordsplit_finish (struct wordsplit *wsp)
|
|||
is set.
|
||||
*/
|
||||
again:
|
||||
delim = 0; /* Delimiter being processed (if any) */
|
||||
n = 0; /* Number of words processed so far */
|
||||
p = wsp->ws_head; /* Current node */
|
||||
|
||||
|
@ -803,56 +802,48 @@ wordsplit_finish (struct wordsplit *wsp)
|
|||
struct wordsplit_node *next = p->next;
|
||||
if (p->flags & _WSNF_DELIM)
|
||||
{
|
||||
if (wsp->ws_flags & WRDSF_RETURN_DELIMS)
|
||||
if (wsp->ws_flags & WRDSF_SQUEEZE_DELIMS)
|
||||
{
|
||||
if (wsp->ws_flags & WRDSF_SQUEEZE_DELIMS)
|
||||
if (next)
|
||||
{
|
||||
char const *s = wsnode_ptr (wsp, p);
|
||||
if (delim)
|
||||
if ((next->flags & _WSNF_DELIM) &&
|
||||
(wsnode_ptr (wsp, p))[0] == (wsnode_ptr (wsp, next))[0])
|
||||
{
|
||||
if (delim == *s)
|
||||
{
|
||||
wsnode_remove (wsp, p);
|
||||
p = next;
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
delim = 0;
|
||||
n++; /* Count this node; it will be returned */
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
delim = *s;
|
||||
wsnode_remove (wsp, p);
|
||||
p = next;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if (wsp->ws_flags & WRDSF_INCREMENTAL)
|
||||
goto restart;
|
||||
}
|
||||
else if (wsp->ws_options & WRDSO_MAXWORDS)
|
||||
else if ((next && (next->flags & _WSNF_DELIM)) &&
|
||||
(!(wsp->ws_options & WRDSO_RETDELNOTEMPTY)))
|
||||
{
|
||||
int rc;
|
||||
struct wordsplit_node *nulnode;
|
||||
if ((rc = wsnode_new (wsp, &nulnode)) != 0)
|
||||
return rc;
|
||||
nulnode->flags = _WSNF_NULL | _WSNF_NOEXPAND;
|
||||
wsnode_insert (wsp, nulnode, p, 0);
|
||||
next = nulnode;
|
||||
}
|
||||
|
||||
if ((wsp->ws_options & WRDSO_MAXWORDS) &&
|
||||
!(wsp->ws_flags & WRDSF_RETURN_DELIMS))
|
||||
{
|
||||
wsnode_remove (wsp, p);
|
||||
p = next;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (delim)
|
||||
{
|
||||
/* Last node was a delimiter or a compressed run of delimiters;
|
||||
Count it, and clear the delimiter marker */
|
||||
n++;
|
||||
delim = 0;
|
||||
}
|
||||
if (wsp->ws_options & WRDSO_MAXWORDS)
|
||||
{
|
||||
if (wsp->ws_wordi + n + 1 == wsp->ws_maxwords)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
n++;
|
||||
|
||||
if ((wsp->ws_options & WRDSO_MAXWORDS) &&
|
||||
(wsp->ws_wordi + n == wsp->ws_maxwords))
|
||||
break;
|
||||
|
||||
if (wsp->ws_flags & WRDSF_INCREMENTAL)
|
||||
p = NULL; /* Break the loop */
|
||||
else
|
||||
|
@ -875,6 +866,7 @@ wordsplit_finish (struct wordsplit *wsp)
|
|||
if (wsp->ws_flags & WRDSF_INCREMENTAL)
|
||||
{
|
||||
/* Restart the processing, if there's any input left. */
|
||||
restart:
|
||||
if (wsp->ws_endp < wsp->ws_len)
|
||||
{
|
||||
int rc;
|
||||
|
@ -1680,7 +1672,7 @@ expvar (struct wordsplit *wsp, const char *str, size_t len,
|
|||
return 1;
|
||||
wsnode_insert (wsp, newnode, *ptail, 0);
|
||||
*ptail = newnode;
|
||||
newnode->flags = _WSNF_NULL;
|
||||
newnode->flags = _WSNF_NULL | _WSNF_NOEXPAND;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1726,7 +1718,7 @@ expvar (struct wordsplit *wsp, const char *str, size_t len,
|
|||
return 1;
|
||||
wsnode_insert (wsp, newnode, *ptail, 0);
|
||||
*ptail = newnode;
|
||||
newnode->flags = _WSNF_NULL;
|
||||
newnode->flags = _WSNF_NULL | _WSNF_NOEXPAND;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -1897,7 +1889,7 @@ expcmd (struct wordsplit *wsp, const char *str, size_t len,
|
|||
return 1;
|
||||
wsnode_insert (wsp, newnode, *ptail, 0);
|
||||
*ptail = newnode;
|
||||
newnode->flags = _WSNF_NULL;
|
||||
newnode->flags = _WSNF_NULL | _WSNF_NOEXPAND;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1928,7 +1920,7 @@ expcmd (struct wordsplit *wsp, const char *str, size_t len,
|
|||
return 1;
|
||||
wsnode_insert (wsp, newnode, *ptail, 0);
|
||||
*ptail = newnode;
|
||||
newnode->flags = _WSNF_NULL;
|
||||
newnode->flags = _WSNF_NULL | _WSNF_NOEXPAND;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -1983,7 +1975,7 @@ wordsplit_trimws (struct wordsplit *wsp)
|
|||
n > p->v.segm.beg && ISWS (wsp->ws_input[n - 1]); n--);
|
||||
p->v.segm.end = n;
|
||||
if (p->v.segm.beg == p->v.segm.end)
|
||||
p->flags |= _WSNF_NULL;
|
||||
p->flags |= _WSNF_NULL | _WSNF_NOEXPAND;
|
||||
}
|
||||
|
||||
wsnode_nullelim (wsp);
|
||||
|
@ -2355,7 +2347,7 @@ scan_word (struct wordsplit *wsp, size_t start, int consume_all)
|
|||
else if (WSP_RETURN_DELIMS (wsp))
|
||||
{
|
||||
i++;
|
||||
flags |= _WSNF_DELIM;
|
||||
flags |= _WSNF_DELIM | _WSNF_NOEXPAND;
|
||||
}
|
||||
else if (!(wsp->ws_flags & WRDSF_SQUEEZE_DELIMS))
|
||||
flags |= _WSNF_EMPTYOK;
|
||||
|
|
12
wordsplit.h
12
wordsplit.h
|
@ -1,5 +1,5 @@
|
|||
/* wordsplit - a word splitter
|
||||
Copyright (C) 2009-2023 Sergey Poznyakoff
|
||||
Copyright (C) 2009-2025 Sergey Poznyakoff
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by the
|
||||
|
@ -254,6 +254,16 @@ struct wordsplit
|
|||
/* ws_namechar member is initialized */
|
||||
#define WRDSO_NAMECHAR 0x00010000
|
||||
|
||||
/* If WRDSF_RETURN_DELIMS is set and WRDSF_SQUEEZE_DELIMS is not, wordsplit
|
||||
returns an empty word between each pair of contiguous delimiters. This
|
||||
behavior is consistent with that without the WRDSF_RETURN_DELIMS flag.
|
||||
However, earlier versions (v1.1) behaved differently: several contiguous
|
||||
delimiters were returned one after another, without empty words in between.
|
||||
The WRDSO_RETDELNOTEMPTY option mimics that behavior. It is not advised to
|
||||
be used, except to ensure backward compatibility with earlier wordsplit
|
||||
versions. */
|
||||
#define WRDSO_RETDELNOTEMPTY 0x00020000
|
||||
|
||||
#define WRDSO_BSKEEP WRDSO_BSKEEP_WORD
|
||||
#define WRDSO_OESC WRDSO_OESC_WORD
|
||||
#define WRDSO_XESC WRDSO_XESC_WORD
|
||||
|
|
3
wsp.c
3
wsp.c
|
@ -1,5 +1,5 @@
|
|||
/* wsp - test program for wordsplit
|
||||
Copyright (C) 2014-2023 Sergey Poznyakoff
|
||||
Copyright (C) 2014-2025 Sergey Poznyakoff
|
||||
|
||||
Wordsplit is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by the
|
||||
|
@ -458,6 +458,7 @@ struct wsopt opttab[] = {
|
|||
{ "nocmdsplit", WRDSO_NOCMDSPLIT, ws_boolean, setfn_option },
|
||||
{ "maxwords", WRDSO_MAXWORDS, ws_required_argument, setfn_maxwords },
|
||||
{ "namechar", WRDSO_NAMECHAR, ws_required_argument, setfn_namechar },
|
||||
{ "retdelnotempty", WRDSO_RETDELNOTEMPTY, ws_boolean, setfn_option },
|
||||
/* String options */
|
||||
{ "delim", WRDSF_DELIM, ws_required_argument, setfn_delim },
|
||||
{ "comment", WRDSF_COMMENT,ws_required_argument, setfn_comment },
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue