diff --git a/doc/wordsplit.3 b/doc/wordsplit.3 index a391b81..c3149fe 100644 --- a/doc/wordsplit.3 +++ b/doc/wordsplit.3 @@ -1,5 +1,5 @@ .\" This file is part of grecs -*- nroff -*- -.\" Copyright (C) 2007-2016 Sergey Poznyakoff +.\" Copyright (C) 2007-2018 Sergey Poznyakoff .\" .\" Grecs is free software; you can redistribute it and/or modify .\" it under the terms of the GNU General Public License as published by @@ -14,7 +14,7 @@ .\" You should have received a copy of the GNU General Public License .\" along with Grecs. If not, see . .\" -.TH WORDSPLIT 3 "February 20, 2018" "GRECS" "Grecs User Reference" +.TH WORDSPLIT 3 "May 22, 2018" "GRECS" "Grecs User Reference" .SH NAME wordsplit \- split string into words .SH SYNOPSIS @@ -133,6 +133,17 @@ if (rc != WRDSE_NOINPUT) wordsplit_free(&ws); .EE +.SH OPTIONS +The number of flags is limited to 32 (the width of \fBuint32_t\fR data +type) and each bit is occupied by a corresponding flag. However, the +number of features \fBwordsplit\fR provides requires still +more. Additional features can be requested by setting a corresponding +\fIoption bit\fR in the \fBws_options\fR field of the \fBstruct +wordsplit\fR argument. To inform wordsplit functions that this field +is initialized, the \fBWRDSF_OPTIONS\fR flag must be set. +.PP +Option symbolic names begin with \fBWRDSO_\fR. They are discussed in +detail in the subsequent chapters. .SH EXPANSION Expansion is performed on the input after it has been split into words. There are several kinds of expansion, which of them are @@ -392,7 +403,29 @@ for each such word using .PP When matching a pattern, the dot at the start of a name or immediately following a slash must be matched explicitly, unless -the \fBWRDSO_DOTGLOB\fR option is set, +the \fBWRDSO_DOTGLOB\fR option is set. 
+.SH LIMITING THE NUMBER OF WORDS +The maximum number of words to be returned can be limited by setting +the \fBws_maxwords\fR member to the desired count, and setting the +\fBWRDSO_MAXWORDS\fR option, e.g.: +.sp +.EX +struct wordsplit ws; +ws.ws_maxwords = 3; +ws.ws_options = WRDSO_MAXWORDS; +wordsplit(str, &ws, WRDSF_DEFFLAGS|WRDSF_OPTIONS); +.EE +.PP +If the actual number of words in the expanded input is greater than +the supplied limit, the trailing part of the input will be returned in +the last word. For example, if the input to the above fragment were +\fBNow is the time for all good men\fR, then the returned words would be: +.sp +.EX +"Now" +"is" +"the time for all good men" +.EE .SH WORDSPLIT_T STRUCTURE The data type \fBwordsplit_t\fR has three members that contain output data upon return from \fBwordsplit\fR or \fBwordsplit_len\fR, @@ -410,6 +443,12 @@ from \fBwordsplit\fR. Array of resulting words. Accessible upon successful return from \fBwordsplit\fR. .TP +.BI "size_t " ws_wordi +Total number of words processed. This field is intended for use with +.B WRDSF_INCREMENTAL +flag. If that flag is not set, the following relation holds: +.BR "ws_wordi == ws_wordc - ws_offs" . +.TP .BI "int " ws_errno Error code, if the invocation of \fBwordsplit\fR or \fBwordsplit_len\fR failed. This is the same value as returned from @@ -435,6 +474,12 @@ flag is set, this member specifies the number of initial elements in to fill with NULLs. These elements are not counted in the returned .IR ws_wordc . .TP +.BI "size_t " ws_maxwords +Maximum number of words to return. For this field to take effect, the +\fBWRDSO_MAXWORDS\fR option and \fBWRDSF_OPTIONS\fR flag must be set. +For a detailed discussion, see the chapter +.BR "LIMITING THE NUMBER OF WORDS" . +.TP .BI "int " ws_flags Contains flags passed to wordsplit on entry. 
Can be used as a read-only member when using \fBwordsplit\fR in incremental mode or @@ -804,6 +849,12 @@ Quote removal: handle octal escapes in doubly-quoted strings. .TP .B WRDSO_XESC_QUOTE Quote removal: handle hex escapes in doubly-quoted strings. +.TP +.B WRDSO_MAXWORDS +The \fBws_maxwords\fR member is initialized. This is used to control +the number of words returned by a call to \fBwordsplit\fR. For a +detailed discussion, refer to the chapter +.BR "LIMITING THE NUMBER OF WORDS" . .SH "ERROR CODES" .TP .BR WRDSE_OK ", " WRDSE_EOF @@ -974,7 +1025,7 @@ Sergey Poznyakoff .SH "BUG REPORTS" Report bugs to . .SH COPYRIGHT -Copyright \(co 2009-2014 Sergey Poznyakoff +Copyright \(co 2009-2018 Sergey Poznyakoff .br .na License GPLv3+: GNU GPL version 3 or later diff --git a/include/wordsplit.h b/include/wordsplit.h index 8726cf1..d13ec3c 100644 --- a/include/wordsplit.h +++ b/include/wordsplit.h @@ -1,5 +1,5 @@ /* wordsplit - a word splitter - Copyright (C) 2009-2016 Sergey Poznyakoff + Copyright (C) 2009-2018 Sergey Poznyakoff This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -41,6 +41,11 @@ struct wordsplit int ws_flags; /* [Input] Flags passed to wordsplit. */ int ws_options; /* [Input] (WRDSF_OPTIONS) Additional options. */ + size_t ws_maxwords; /* [Input] (WRDSO_MAXWORDS) Return at most that + many words */ + size_t ws_wordi; /* [Output] (WRDSF_INCREMENTAL) Total number of + words returned so far */ + const char *ws_delim; /* [Input] (WRDSF_DELIM) Word delimiters. */ const char *ws_comment; /* [Input] (WRDSF_COMMENT) Comment characters. 
*/ const char *ws_escape[2]; /* [Input] (WRDSF_ESCAPE) Characters to be escaped @@ -198,6 +203,9 @@ struct wordsplit /* Handle hex escapes in words */ #define WRDSO_XESC_WORD 0x00000040 +/* ws_maxwords field is initialized */ +#define WRDSO_MAXWORDS 0x00000080 + /* Keep backslash in unrecognized escape sequences in quoted strings */ #define WRDSO_BSKEEP_QUOTE 0x00000100 /* Handle octal escapes in quoted strings */ diff --git a/src/wordsplit.c b/src/wordsplit.c index 5cd8daa..dab6c4d 100644 --- a/src/wordsplit.c +++ b/src/wordsplit.c @@ -1,5 +1,5 @@ /* wordsplit - a word splitter - Copyright (C) 2009-2016 Sergey Poznyakoff + Copyright (C) 2009-2018 Sergey Poznyakoff This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -52,7 +52,10 @@ #define ISVARBEG(c) (ISALPHA(c) || c == '_') #define ISVARCHR(c) (ISALNUM(c) || c == '_') - + +#define WSP_RETURN_DELIMS(wsp) \ + ((wsp)->ws_flags & WRDSF_RETURN_DELIMS || ((wsp)->ws_options & WRDSO_MAXWORDS)) + #define ALLOC_INIT 128 #define ALLOC_INCR 128 @@ -104,11 +107,18 @@ static int wordsplit_run (const char *command, size_t length, struct wordsplit *wsp, int flags, int lvl); +static int wordsplit_init (struct wordsplit *wsp, const char *input, size_t len, + int flags); +static int wordsplit_process_list (struct wordsplit *wsp, size_t start); +static int wordsplit_finish (struct wordsplit *wsp); + static int _wsplt_subsplit (struct wordsplit *wsp, struct wordsplit *wss, char const *str, int len, - int flags) + int flags, int finalize) { + int rc; + wss->ws_delim = wsp->ws_delim; wss->ws_debug = wsp->ws_debug; wss->ws_error = wsp->ws_error; @@ -138,8 +148,23 @@ _wsplt_subsplit (struct wordsplit *wsp, struct wordsplit *wss, | WRDSF_ERROR | WRDSF_DEBUG | (wsp->ws_flags & (WRDSF_SHOWDBG | WRDSF_SHOWERR | WRDSF_OPTIONS)); - - return wordsplit_run (str, len, wss, flags, wsp->ws_lvl + 1); + + rc = wordsplit_init (wss, str, len, flags); + if (rc) 
+ return rc; + wss->ws_lvl = wsp->ws_lvl + 1; + rc = wordsplit_process_list (wss, 0); + if (rc) + { + wordsplit_free_nodes (wss); + return rc; + } + if (finalize) + { + rc = wordsplit_finish (wss); + wordsplit_free_nodes (wss); + } + return rc; } static void @@ -173,7 +198,6 @@ wordsplit_init0 (struct wordsplit *wsp) } wsp->ws_errno = 0; - wsp->ws_head = wsp->ws_tail = NULL; } char wordsplit_c_escape_tab[] = "\\\\\"\"a\ab\bf\fn\nr\rt\tv\v"; @@ -262,9 +286,14 @@ wordsplit_init (struct wordsplit *wsp, const char *input, size_t len, } wsp->ws_endp = 0; + wsp->ws_wordi = 0; + if (wsp->ws_flags & WRDSF_REUSE) + wordsplit_free_nodes (wsp); + wsp->ws_head = wsp->ws_tail = NULL; + wordsplit_init0 (wsp); - + return 0; } @@ -441,6 +470,14 @@ wsnode_remove (struct wordsplit *wsp, struct wordsplit_node *node) node->next = node->prev = NULL; } +static struct wordsplit_node * +wsnode_tail (struct wordsplit_node *p) +{ + while (p && p->next) + p = p->next; + return p; +} + static void wsnode_insert (struct wordsplit *wsp, struct wordsplit_node *node, struct wordsplit_node *anchor, int before) @@ -456,22 +493,24 @@ wsnode_insert (struct wordsplit *wsp, struct wordsplit_node *node, wsnode_insert (wsp, node, anchor->prev, 0); else { + struct wordsplit_node *tail = wsnode_tail (node); node->prev = NULL; - node->next = anchor; - anchor->prev = node; + tail->next = anchor; + anchor->prev = tail; wsp->ws_head = node; } } else { struct wordsplit_node *p; + struct wordsplit_node *tail = wsnode_tail (node); p = anchor->next; if (p) - p->prev = node; + p->prev = tail; else - wsp->ws_tail = node; - node->next = p; + wsp->ws_tail = tail; + tail->next = p; node->prev = anchor; anchor->next = node; } @@ -538,6 +577,9 @@ coalesce_segment (struct wordsplit *wsp, struct wordsplit_node *node) char *buf, *cur; int stop; + if (!(node->flags & _WSNF_JOIN)) + return 0; + for (p = node; p && (p->flags & _WSNF_JOIN); p = p->next) { len += wsnode_len (p); @@ -598,9 +640,7 @@ wsnode_quoteremoval (struct 
wordsplit *wsp) int unquote; if (wsp->ws_flags & WRDSF_QUOTE) - { - unquote = !(p->flags & _WSNF_NOEXPAND); - } + unquote = !(p->flags & _WSNF_NOEXPAND); else unquote = 0; @@ -638,24 +678,161 @@ wsnode_coalesce (struct wordsplit *wsp) return 0; } +static int +wsnode_tail_coalesce (struct wordsplit *wsp, struct wordsplit_node *p) +{ + if (p->next) + { + struct wordsplit_node *np = p; + while (np && np->next) + { + np->flags |= _WSNF_JOIN; + np = np->next; + } + if (coalesce_segment (wsp, p)) + return 1; + } + return 0; +} + +static size_t skip_delim (struct wordsplit *wsp); + static int wordsplit_finish (struct wordsplit *wsp) { struct wordsplit_node *p; size_t n; + int delim; - n = 0; + /* Postprocess delimiters. It would be rather simple, if it weren't for + the incremental operation. - for (p = wsp->ws_head; p; p = p->next) - n++; + Nodes of type _WSNF_DELIM get inserted to the node list if either + WRDSF_RETURN_DELIMS flag or WRDSO_MAXWORDS option is set. + + The following cases should be distinguished: + + 1. If both WRDSF_SQUEEZE_DELIMS and WRDSF_RETURN_DELIMS are set, compress + any runs of similar delimiter nodes to a single node. The nodes are + 'similar' if they point to the same delimiter character. + + If WRDSO_MAXWORDS option is set, stop compressing when + ws_wordi + 1 == ws_maxwords, and coalesce the rest of nodes into + a single last node. + + 2. If WRDSO_MAXWORDS option is set, but WRDSF_RETURN_DELIMS is not, + remove any delimiter nodes. Stop operation when + ws_wordi + 1 == ws_maxwords, and coalesce the rest of nodes into + a single last node. + + 3. If incremental operation is in progress, restart the loop any time + a delimiter node is about to be returned, unless WRDSF_RETURN_DELIMS + is set. 
+ */ + again: + delim = 0; /* Delimiter being processed (if any) */ + n = 0; /* Number of words processed so far */ + p = wsp->ws_head; /* Current node */ + + while (p) + { + struct wordsplit_node *next = p->next; + if (p->flags & _WSNF_DELIM) + { + if (wsp->ws_flags & WRDSF_RETURN_DELIMS) + { + if (wsp->ws_flags & WRDSF_SQUEEZE_DELIMS) + { + char const *s = wsnode_ptr (wsp, p); + if (delim) + { + if (delim == *s) + { + wsnode_remove (wsp, p); + p = next; + continue; + } + else + { + delim = 0; + n++; /* Count this node; it will be returned */ + } + } + else + { + delim = *s; + p = next; + continue; + } + } + } + else if (wsp->ws_options & WRDSO_MAXWORDS) + { + wsnode_remove (wsp, p); + p = next; + continue; + } + } + else + { + if (delim) + { + /* Last node was a delimiter or a compressed run of delimiters; + Count it, and clear the delimiter marker */ + n++; + delim = 0; + } + if (wsp->ws_options & WRDSO_MAXWORDS) + { + if (wsp->ws_wordi + n + 1 == wsp->ws_maxwords) + break; + } + } + n++; + if (wsp->ws_flags & WRDSF_INCREMENTAL) + p = NULL; /* Break the loop */ + else + p = next; + } + + if (p) + { + /* We're here if WRDSO_MAXWORDS is in effect and wsp->ws_maxwords + words have already been collected. Reconstruct a single final + node from the remaining nodes. */ + if (wsnode_tail_coalesce (wsp, p)) + return wsp->ws_errno; + n++; + } + + if (n == 0 && (wsp->ws_flags & WRDSF_INCREMENTAL)) + { + /* The loop above have eliminated all nodes. Restart the + processing, if there's any input left. 
*/ + if (wsp->ws_endp < wsp->ws_len) + { + int rc; + if (wsp->ws_flags & WRDSF_SHOWDBG) + wsp->ws_debug (_("Restarting")); + rc = wordsplit_process_list (wsp, skip_delim (wsp)); + if (rc) + return rc; + } + else + { + wsp->ws_errno = WRDSE_EOF; /* input exhausted */ + return WRDSE_EOF; + } + goto again; + } if (alloc_space (wsp, n + 1)) - return 1; + return wsp->ws_errno; - for (p = wsp->ws_head; p; p = p->next) + while (wsp->ws_head) { - const char *str = wsnode_ptr (wsp, p); - size_t slen = wsnode_len (p); + const char *str = wsnode_ptr (wsp, wsp->ws_head); + size_t slen = wsnode_len (wsp->ws_head); char *newstr = malloc (slen + 1); /* Assign newstr first, even if it is NULL. This way @@ -667,8 +844,13 @@ wordsplit_finish (struct wordsplit *wsp) memcpy (newstr, str, slen); newstr[slen] = 0; - wsp->ws_wordc++; + wsnode_remove (wsp, wsp->ws_head); + wsp->ws_wordc++; + wsp->ws_wordi++; + + if (wsp->ws_flags & WRDSF_INCREMENTAL) + break; } wsp->ws_wordv[wsp->ws_offs + wsp->ws_wordc] = NULL; return 0; @@ -1067,7 +1249,7 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, rc = _wsplt_subsplit (wsp, &ws, defstr, size, WRDSF_NOSPLIT | WRDSF_WS | WRDSF_QUOTE | (wsp->ws_flags & - (WRDSF_NOVAR | WRDSF_NOCMD))); + (WRDSF_NOVAR | WRDSF_NOCMD)), 1); if (rc) return rc; free (value); @@ -1088,7 +1270,8 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, rc = _wsplt_subsplit (wsp, &ws, defstr, size, WRDSF_NOSPLIT | WRDSF_WS | WRDSF_QUOTE | (wsp->ws_flags & - (WRDSF_NOVAR | WRDSF_NOCMD))); + (WRDSF_NOVAR | WRDSF_NOCMD)), + 1); if (rc) return rc; @@ -1113,7 +1296,8 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, WRDSF_NOSPLIT | WRDSF_WS | WRDSF_QUOTE | (wsp->ws_flags & - (WRDSF_NOVAR | WRDSF_NOCMD))); + (WRDSF_NOVAR | WRDSF_NOCMD)), + 1); if (rc == 0) wsp->ws_error ("%.*s: %s", (int) i, str, ws.ws_wordv[0]); @@ -1184,11 +1368,13 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, else { struct wordsplit ws; - int i, rc; + int rc; rc = _wsplt_subsplit (wsp, 
&ws, value, strlen (value), WRDSF_NOVAR | WRDSF_NOCMD | - WRDSF_QUOTE); + WRDSF_QUOTE + | (WSP_RETURN_DELIMS (wsp) ? WRDSF_RETURN_DELIMS : 0) , + 0); free (value); if (rc) { @@ -1196,19 +1382,9 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, wordsplit_free (&ws); return 1; } - for (i = 0; i < ws.ws_wordc; i++) - { - if (wsnode_new (wsp, &newnode)) - return 1; - wsnode_insert (wsp, newnode, *ptail, 0); - *ptail = newnode; - newnode->flags = _WSNF_WORD | - _WSNF_NOEXPAND | - (i + 1 < ws.ws_wordc ? (flg & ~_WSNF_JOIN) : flg); - newnode->v.word = strdup (ws.ws_wordv[i]); - if (!newnode->v.word) - return _wsplt_nomem (wsp); - } + wsnode_insert (wsp, ws.ws_head, *ptail, 0); + *ptail = ws.ws_tail; + ws.ws_head = ws.ws_tail = NULL; wordsplit_free (&ws); } } @@ -1327,7 +1503,7 @@ wordsplit_varexp (struct wordsplit *wsp) for (p = wsp->ws_head; p;) { struct wordsplit_node *next = p->next; - if (!(p->flags & _WSNF_NOEXPAND)) + if (!(p->flags & (_WSNF_NOEXPAND|_WSNF_DELIM))) if (node_expand (wsp, p, begin_var_p, expvar)) return 1; p = next; @@ -1366,8 +1542,7 @@ expcmd (struct wordsplit *wsp, const char *str, size_t len, { struct wordsplit ws; - rc = _wsplt_subsplit (wsp, &ws, str, j, - WRDSF_WS | WRDSF_QUOTE); + rc = _wsplt_subsplit (wsp, &ws, str, j, WRDSF_WS | WRDSF_QUOTE, 1); if (rc) { _wsplt_seterr_sub (wsp, &ws); @@ -1418,11 +1593,13 @@ expcmd (struct wordsplit *wsp, const char *str, size_t len, else { struct wordsplit ws; - int i, rc; + int rc; rc = _wsplt_subsplit (wsp, &ws, value, strlen (value), - WRDSF_NOVAR | WRDSF_NOCMD | - WRDSF_WS | WRDSF_QUOTE); + WRDSF_NOVAR | WRDSF_NOCMD + | WRDSF_WS | WRDSF_QUOTE + | (WSP_RETURN_DELIMS (wsp) ? 
WRDSF_RETURN_DELIMS : 0), + 0); free (value); if (rc) { @@ -1430,19 +1607,9 @@ expcmd (struct wordsplit *wsp, const char *str, size_t len, wordsplit_free (&ws); return 1; } - for (i = 0; i < ws.ws_wordc; i++) - { - if (wsnode_new (wsp, &newnode)) - return 1; - wsnode_insert (wsp, newnode, *ptail, 0); - *ptail = newnode; - newnode->flags = _WSNF_WORD | - _WSNF_NOEXPAND | - (i + 1 < ws.ws_wordc ? (flg & ~_WSNF_JOIN) : flg); - newnode->v.word = strdup (ws.ws_wordv[i]); - if (!newnode->v.word) - return _wsplt_nomem (wsp); - } + wsnode_insert (wsp, ws.ws_head, *ptail, 0); + *ptail = ws.ws_tail; + ws.ws_head = ws.ws_tail = NULL; wordsplit_free (&ws); } } @@ -1736,33 +1903,24 @@ skip_sed_expr (const char *command, size_t i, size_t len) return i; } -static size_t +/* wsp->ws_endp points to a delimiter character. If RETURN_DELIMS + is true, return its value, otherwise return the index past it. */ +static inline size_t +skip_delim_internal (struct wordsplit *wsp, int return_delims) +{ + return return_delims ? 
wsp->ws_endp : wsp->ws_endp + 1; +} + +static inline size_t skip_delim (struct wordsplit *wsp) { - size_t start = wsp->ws_endp; - if (wsp->ws_flags & WRDSF_SQUEEZE_DELIMS) - { - if ((wsp->ws_flags & WRDSF_RETURN_DELIMS) && - ISDELIM (wsp, wsp->ws_input[start])) - { - int delim = wsp->ws_input[start]; - do - start++; - while (start < wsp->ws_len && delim == wsp->ws_input[start]); - } - else - { - do - start++; - while (start < wsp->ws_len && ISDELIM (wsp, wsp->ws_input[start])); - } - start--; - } + return skip_delim_internal (wsp, WSP_RETURN_DELIMS (wsp)); +} - if (!(wsp->ws_flags & WRDSF_RETURN_DELIMS)) - start++; - - return start; +static inline size_t +skip_delim_real (struct wordsplit *wsp) +{ + return skip_delim_internal (wsp, wsp->ws_flags & WRDSF_RETURN_DELIMS); } #define _WRDS_EOF 0 @@ -1770,7 +1928,7 @@ skip_delim (struct wordsplit *wsp) #define _WRDS_ERR 2 static int -scan_qstring (struct wordsplit *wsp, size_t start, size_t * end) +scan_qstring (struct wordsplit *wsp, size_t start, size_t *end) { size_t j; const char *command = wsp->ws_input; @@ -1799,14 +1957,15 @@ scan_qstring (struct wordsplit *wsp, size_t start, size_t * end) } static int -scan_word (struct wordsplit *wsp, size_t start) +scan_word (struct wordsplit *wsp, size_t start, int consume_all) { size_t len = wsp->ws_len; const char *command = wsp->ws_input; const char *comment = wsp->ws_comment; int join = 0; int flags = 0; - + struct wordsplit_node *np = wsp->ws_tail; + size_t i = start; if (i >= len) @@ -1823,7 +1982,7 @@ scan_word (struct wordsplit *wsp, size_t start) flags = _WSNF_SEXP; i = skip_sed_expr (command, i, len); } - else if (!ISDELIM (wsp, command[i])) + else if (consume_all || !ISDELIM (wsp, command[i])) { while (i < len) { @@ -1874,13 +2033,13 @@ scan_word (struct wordsplit *wsp, size_t start) continue; } - if (ISDELIM (wsp, command[i])) + if (!consume_all && ISDELIM (wsp, command[i])) break; else i++; } } - else if (wsp->ws_flags & WRDSF_RETURN_DELIMS) + else if 
(WSP_RETURN_DELIMS (wsp)) { i++; flags |= _WSNF_DELIM; @@ -1895,6 +2054,18 @@ scan_word (struct wordsplit *wsp, size_t start) wsp->ws_endp = i; if (wsp->ws_flags & WRDSF_INCREMENTAL) return _WRDS_EOF; + + if (consume_all) + { + if (!np) + np = wsp->ws_head; + while (np) + { + np->flags |= _WSNF_QUOTE; + np = np->next; + } + } + return _WRDS_OK; } @@ -2114,15 +2285,17 @@ struct exptab wordsplit. The EXPOPT_NEG option negates this test so that expansion is performed if its associated flag bit is not set in struct wordsplit. */ #define EXPOPT_NEG 0x01 +/* All bits in flag must be set in order for entry to match */ +#define EXPOPT_ALLOF 0x02 /* Coalesce the input list before running the expansion. */ -#define EXPOPT_COALESCE 0x02 +#define EXPOPT_COALESCE 0x04 static struct exptab exptab[] = { { N_("WS trimming"), WRDSF_WS, 0, wordsplit_trimws }, { N_("command substitution"), WRDSF_NOCMD, EXPOPT_NEG|EXPOPT_COALESCE, wordsplit_cmdexp }, - { N_("coalesce list"), 0, EXPOPT_NEG|EXPOPT_COALESCE, + { N_("coalesce list"), 0, EXPOPT_NEG|EXPOPT_COALESCE, NULL }, { N_("tilde expansion"), WRDSF_PATHEXPAND, 0, wordsplit_tildexpand }, @@ -2136,24 +2309,43 @@ static struct exptab exptab[] = { wordsplit_pathexpand }, { NULL } }; - + +static inline int +exptab_matches(struct exptab *p, struct wordsplit *wsp) +{ + int result; + + result = (wsp->ws_flags & p->flag); + if (p->opt & EXPOPT_ALLOF) + result = result == p->flag; + if (p->opt & EXPOPT_NEG) + result = !result; + + return result; +} + static int wordsplit_process_list (struct wordsplit *wsp, size_t start) { struct exptab *p; + + if (wsp->ws_flags & WRDSF_SHOWDBG) + wsp->ws_debug (_("(%02d) Input:%.*s;"), + wsp->ws_lvl, (int) wsp->ws_len, wsp->ws_input); - if (wsp->ws_flags & WRDSF_NOSPLIT) + if ((wsp->ws_flags & WRDSF_NOSPLIT) + || ((wsp->ws_options & WRDSO_MAXWORDS) + && wsp->ws_wordi + 1 == wsp->ws_maxwords)) { - /* Treat entire input as a quoted argument */ - if (wordsplit_add_segm (wsp, start, wsp->ws_len, _WSNF_QUOTE)) + /* 
Treat entire input as a single word */ + if (scan_word (wsp, start, 1) == _WRDS_ERR) return wsp->ws_errno; - wsp->ws_endp = wsp->ws_len; } else { int rc; - while ((rc = scan_word (wsp, start)) == _WRDS_OK) + while ((rc = scan_word (wsp, start, 0)) == _WRDS_OK) start = skip_delim (wsp); /* Make sure tail element is not joinable */ if (wsp->ws_tail) @@ -2170,8 +2362,7 @@ wordsplit_process_list (struct wordsplit *wsp, size_t start) for (p = exptab; p->descr; p++) { - if ((p->opt & EXPOPT_NEG) - ? !(wsp->ws_flags & p->flag) : (wsp->ws_flags & p->flag)) + if (exptab_matches(p, wsp)) { if (p->opt & EXPOPT_COALESCE) { @@ -2205,63 +2396,35 @@ wordsplit_run (const char *command, size_t length, struct wordsplit *wsp, { int rc; size_t start; - const char *cmdptr; - size_t cmdlen; if (!command) { if (!(flags & WRDSF_INCREMENTAL)) return EINVAL; - start = skip_delim (wsp); + if (wsp->ws_head) + return wordsplit_finish (wsp); + + start = skip_delim_real (wsp); if (wsp->ws_endp == wsp->ws_len) return _wsplt_seterr (wsp, WRDSE_NOINPUT); - cmdptr = wsp->ws_input + wsp->ws_endp; - cmdlen = wsp->ws_len - wsp->ws_endp; wsp->ws_flags |= WRDSF_REUSE; wordsplit_init0 (wsp); } else { - cmdptr = command; - cmdlen = length; start = 0; - rc = wordsplit_init (wsp, cmdptr, cmdlen, flags); + rc = wordsplit_init (wsp, command, length, flags); if (rc) return rc; wsp->ws_lvl = lvl; } - if (wsp->ws_flags & WRDSF_SHOWDBG) - wsp->ws_debug (_("(%02d) Input:%.*s;"), wsp->ws_lvl, (int) cmdlen, cmdptr); - rc = wordsplit_process_list (wsp, start); - if (rc == 0 && (flags & WRDSF_INCREMENTAL)) - { - while (!wsp->ws_head && wsp->ws_endp < wsp->ws_len) - { - start = skip_delim (wsp); - if (wsp->ws_flags & WRDSF_SHOWDBG) - { - cmdptr = wsp->ws_input + wsp->ws_endp; - cmdlen = wsp->ws_len - wsp->ws_endp; - wsp->ws_debug (_("(%02d) Restart:%.*s;"), - wsp->ws_lvl, (int) cmdlen, cmdptr); - } - rc = wordsplit_process_list (wsp, start); - if (rc) - break; - } - } if (rc) - { - wordsplit_free_nodes (wsp); - return 
rc; - } - wordsplit_finish (wsp); - wordsplit_free_nodes (wsp); - return wsp->ws_errno; + return rc; + return wordsplit_finish (wsp); } int @@ -2323,6 +2486,7 @@ wordsplit_clearerr (struct wordsplit *ws) void wordsplit_free (struct wordsplit *ws) { + wordsplit_free_nodes (ws); wordsplit_free_words (ws); free (ws->ws_wordv); ws->ws_wordv = NULL; diff --git a/tests/wordsplit.at b/tests/wordsplit.at index 49d47e9..d4328e3 100644 --- a/tests/wordsplit.at +++ b/tests/wordsplit.at @@ -1,5 +1,5 @@ # This file is part of grecs -*- Autotest -*- -# Copyright (C) 2014-2016 Sergey Poznyakoff +# Copyright (C) 2014-2018 Sergey Poznyakoff # # Grecs is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -61,12 +61,14 @@ TESTWSP([simple input],[wsp-simple],[], 0: 1 1: 2 2: 3 +TOTAL: 3 ]) TESTWSP([quoted space],[wsp-quoted],[], [quoted\ space], [NF: 1 0: "quoted space" +TOTAL: 1 ]) TESTWSP([tab character],[wsp-tab],[], @@ -74,6 +76,7 @@ TESTWSP([tab character],[wsp-tab],[], [NF: 2 0: a 1: tab\tcharacter +TOTAL: 2 ]) WSPGROUP(wsp-escape) @@ -81,6 +84,7 @@ TESTWSP([octal and hex escapes],[],[], [\157\143\164\141\154\40and\x20\x68\x65\x78], [NF: 1 0: "octal and hex" +TOTAL: 1 ]) TESTWSP([octal and hex escapes 2],[],[], @@ -89,12 +93,14 @@ TESTWSP([octal and hex escapes 2],[],[], 0: "octal " 1: and 2: " hex" +TOTAL: 3 ]) TESTWSP([escape representation],[],[], [A\x3-\48\39], [NF: 1 0: A\003-\0048\0039 +TOTAL: 1 ]) WSPGROUP() @@ -110,11 +116,13 @@ piec szesc], 0: jeden 1: dwa 2: trzy +TOTAL: 3 NF: 4 0: jeden 1: dwa 2: trzy 3: cztery +TOTAL: 1 NF: 6 0: jeden 1: dwa @@ -122,6 +130,7 @@ NF: 6 3: cztery 4: piec 5: szesc +TOTAL: 2 ]) TESTWSP([dooffs],[wsp-doofs ],[dooffs 3 jeden dwa trzy], @@ -132,6 +141,7 @@ TESTWSP([dooffs],[wsp-doofs ],[dooffs 3 jeden dwa trzy], (2): trzy 3: cztery 4: piec +TOTAL: 2 ]) WSPGROUP(wsp-var) @@ -142,6 +152,7 @@ TESTWSP([variable substitutions: single var],[],[], 0: a 1: bar 2: test 
+TOTAL: 3 ], [], [FOO=bar]) @@ -153,6 +164,7 @@ TESTWSP([variable substitutions: concatenated vars],[], 0: a 1: stringent 2: test +TOTAL: 3 ], [], [FOO=str BAR=ing]) @@ -164,6 +176,7 @@ TESTWSP([variable substitutions: field splitting],[],[], 1: variable 2: substitution 3: test +TOTAL: 4 ], [], [FOO="variable substitution"]) @@ -174,6 +187,7 @@ TESTWSP([variable substitutions: double-quoted variable],[],[], 0: a 1: "variable substitution" 2: test +TOTAL: 3 ], [], [FOO="variable substitution"]) @@ -184,6 +198,7 @@ TESTWSP([variable substitutions: single-quoted variable],[],[], 0: a 1: $FOO 2: test +TOTAL: 3 ], [], [FOO="variable substitution"]) @@ -194,6 +209,7 @@ TESTWSP([undefined variables 1],[],[], 0: a 1: test 2: ab +TOTAL: 3 ], [], [unset FOO;]) @@ -205,6 +221,7 @@ TESTWSP([undefined variables 2],[],[keepundef], 1: $FOO 2: test 3: a${FOO}b +TOTAL: 4 ], [], [unset FOO;]) @@ -212,6 +229,7 @@ TESTWSP([undefined variables 2],[],[keepundef], TESTWSP([warn about undefined variables],[],[warnundef], [$FOO], [NF: 0 +TOTAL: 0 ], [warning: undefined variable `FOO' ], @@ -228,6 +246,7 @@ TESTWSP([disable variable expansion],[],[novar], [$FOO], [NF: 1 0: $FOO +TOTAL: 1 ], [], [FOO=bar]) @@ -238,6 +257,7 @@ TESTWSP([K/V environment],[wsp-env-kv wsp-env_kv], [NF: 2 0: bar 1: aqux +TOTAL: 2 ], [], [FOO=bar BAZ=qux]) @@ -246,6 +266,7 @@ TESTWSP([nosplit with expansion],[wsp-var-nosplit],[nosplit], [a $FOO test], [NF: 1 0: "a variable expansion test\n" +TOTAL: 1 ], [], [FOO="variable expansion"]) @@ -254,6 +275,7 @@ TESTWSP([nosplit without expansion],[],[nosplit novar], [a $FOO test], [NF: 1 0: "a $FOO test\n" +TOTAL: 1 ], [], [FOO="variable expansion"]) @@ -262,6 +284,7 @@ TESTWSP([default value (defined)],[],[], [${FOO:-bar}], [NF: 1 0: qux +TOTAL: 1 ], [], [FOO=qux]) @@ -270,12 +293,14 @@ TESTWSP([default value],[],[], [${FOO:-bar}], [NF: 1 0: bar +TOTAL: 1 ]) TESTWSP([default value (defined)],[],[], [${FOO:-bar}], [NF: 1 0: qux +TOTAL: 1 ], [], [FOO=qux]) @@ -284,6 +309,7 
@@ TESTWSP([default value (:- null)],[],[], [${FOO:-bar}], [NF: 1 0: bar +TOTAL: 1 ], [], [FOO=]) @@ -291,6 +317,7 @@ TESTWSP([default value (:- null)],[],[], TESTWSP([default value (- null)],[],[], [${FOO-bar}], [NF: 0 +TOTAL: 0 ], [], [FOO=]) @@ -299,6 +326,7 @@ TESTWSP([default value (- null, unset)],[],[], [${FOO-bar}], [NF: 1 0: bar +TOTAL: 1 ]) TESTWSP([assign default values],[],[], @@ -306,8 +334,10 @@ TESTWSP([assign default values],[],[], $FOO], [NF: 1 0: bar +TOTAL: 1 NF: 1 0: bar +TOTAL: 1 ]) TESTWSP([default error message (var defined)],[],[], @@ -316,6 +346,7 @@ TESTWSP([default error message (var defined)],[],[], 0: a 1: bar 2: test +TOTAL: 3 ], [], [FOO=bar]) @@ -323,6 +354,7 @@ TESTWSP([default error message (var defined)],[],[], TESTWSP([default error message],[],[], [${FOO:?}], [NF: 0 +TOTAL: 0 ], [FOO: variable null or not set ]) @@ -333,6 +365,7 @@ TESTWSP([custom error message (defined)],[wsp-custom-err wsp-custom-err00],[], 0: a 1: bar 2: test +TOTAL: 3 ], [], [FOO=bar]) @@ -342,6 +375,7 @@ TESTWSP([custom error message],[wsp-custom-err wsp-custom-err01],[], [NF: 2 0: a 1: test +TOTAL: 2 ], [FOO: please define it ]) @@ -352,6 +386,7 @@ TESTWSP([alternate value (defined)],[wsp-alt wsp-alt00],[], 0: a 1: isset 2: test +TOTAL: 3 ], [], [FOO=bar]) @@ -361,6 +396,7 @@ TESTWSP([alternate value],[wsp-alt wsp-alt01],[], [NF: 2 0: a 1: test +TOTAL: 2 ], [], [unset FOO;]) @@ -373,6 +409,7 @@ TESTWSP([getvar],[wsp-getvar], 1: bar 2: quux 3: end +TOTAL: 4 ], [], [], @@ -388,6 +425,7 @@ TESTWSP([getvar and env],[wsp-getvar], 3: quux 4: zwar 5: end +TOTAL: 6 ], [], [TVAR=12 y=zwar], @@ -399,6 +437,7 @@ TESTWSP([getvar, alternate value],[wsp-getvar], [NF: 2 0: a 1: isset +TOTAL: 2 ]) WSPGROUP() @@ -408,6 +447,7 @@ TESTWSP([ignore quotes],[wsp-ignore-quotes ],[-quote], [NF: 2 0: "\"a" 1: "text\"" +TOTAL: 2 ]) WSPGROUP(wsp-delim) @@ -421,6 +461,7 @@ TESTWSP([custom delimiters (squeeze)],[], 2: list 3: " of " 4: words +TOTAL: 5 ]) TESTWSP([custom delimiters (no 
squeeze)],[], @@ -433,6 +474,7 @@ TESTWSP([custom delimiters (no squeeze)],[], 3: list 4: " of " 5: words +TOTAL: 6 ]) TESTWSP([custom, with returned delimiters],[], @@ -448,6 +490,7 @@ TESTWSP([custom, with returned delimiters],[], 6: " of " 7: : 8: words +TOTAL: 9 ]) TESTWSP([custom, with returned & squeezed delimiters],[], @@ -464,6 +507,7 @@ TESTWSP([custom, with returned & squeezed delimiters],[], 7: " of " 8: : 9: words +TOTAL: 10 ]) WSPGROUP(wsp-sed) @@ -474,6 +518,7 @@ TESTWSP([sed expressions],[],[sed], 0: arg1 1: "s/foo/bar/g;s/bar baz/quz quux/" 2: arg2 +TOTAL: 3 ]) WSPGROUP() @@ -485,6 +530,7 @@ TESTWSP([C escapes on],[wcp-c-escape],[cescapes], 1: form\ffeed 2: and 3: new\nline +TOTAL: 4 ]) TESTWSP([C escapes off],[wcp-c-escape-off],[-cescapes], @@ -494,6 +540,7 @@ TESTWSP([C escapes off],[wcp-c-escape-off],[-cescapes], 1: formffeed 2: and 3: newnline +TOTAL: 4 ]) TESTWSP([ws elimination],[wsp-ws-elim],[delim ' ()' ws return_delims], @@ -503,6 +550,7 @@ TESTWSP([ws elimination],[wsp-ws-elim],[delim ' ()' ws return_delims], 1: list 2: items 3: ) +TOTAL: 4 ]) TESTWSP([ws elimination + return delim],[wsp-ws-elim-ret], @@ -516,12 +564,14 @@ TESTWSP([ws elimination + return delim],[wsp-ws-elim-ret], 4: quux 5: : 6: baaz +TOTAL: 7 ]) TESTWSP([empty quotes],[wsp-empty-quotes],[delim : ws return_delims], [t=""], [NF: 1 0: t= +TOTAL: 1 ]) TESTWSP([delimiter following empty quotes], @@ -531,6 +581,7 @@ TESTWSP([delimiter following empty quotes], 0: t= 1: : 2: r +TOTAL: 3 ]) TESTWSP([suppress ws trimming within quotes], @@ -543,6 +594,7 @@ TESTWSP([suppress ws trimming within quotes], 2: nonewline 3: , 4: "formatfield=In message %{text}, " +TOTAL: 5 ]) TESTWSP([unescape], @@ -553,6 +605,7 @@ TESTWSP([unescape], 0: \\Seen 1: "quote \"" 2: "bs \\" +TOTAL: 3 ]) TESTWSP([unescape: word/quote], @@ -566,6 +619,7 @@ TESTWSP([unescape: word/quote], 3: "31 A" 4: 3x31 5: 101 +TOTAL: 6 ]) TESTWSP([dquote],[],[-default novar nocmd dquote], @@ -575,6 +629,7 @@ 
TESTWSP([dquote],[],[-default novar nocmd dquote], 1: "quoted example" 2: isn't 3: it +TOTAL: 4 ]) TESTWSP([squote],[],[-default novar nocmd squote], @@ -584,6 +639,7 @@ TESTWSP([squote],[],[-default novar nocmd squote], 1: "quoted example" 2: "isn\"t" 3: it +TOTAL: 4 ]) WSPGROUP(wsp-incr) @@ -595,10 +651,13 @@ TESTWSP([incremental],[],[incremental], ], [NF: 1 0: incremental +TOTAL: 1 NF: 1 0: "input test" +TOTAL: 2 NF: 1 0: line +TOTAL: 3 ], [input exhausted ]) @@ -610,13 +669,16 @@ TESTWSP([incremental append],[],[incremental append], ], [NF: 1 0: incremental +TOTAL: 1 NF: 2 0: incremental 1: "input test" +TOTAL: 2 NF: 3 0: incremental 1: "input test" 2: line +TOTAL: 3 ], [input exhausted ]) @@ -629,10 +691,13 @@ TESTWSP([incremental ws], ], [NF: 1 0: a +TOTAL: 1 NF: 1 0: list +TOTAL: 2 NF: 1 0: test +TOTAL: 3 ], [input exhausted ]) @@ -641,7 +706,8 @@ TESTWSP([incremental nosplit],[],[incremental nosplit], [incremental "input test" line ], [NF: 1 -0: "incremental \"input test\" line" +0: "incremental input test line" +TOTAL: 1 ], [input exhausted ]) @@ -664,6 +730,7 @@ EOT 1: dir 2: dir/file 3: end +TOTAL: 4 ]) AT_CLEANUP @@ -682,6 +749,7 @@ EOT 0: begin 1: "dir dir/file" 2: end +TOTAL: 3 ]) AT_CLEANUP @@ -699,6 +767,7 @@ EOT [NF: 2 0: begin(dir 1: dir/file)end +TOTAL: 2 ]) AT_CLEANUP @@ -715,6 +784,7 @@ EOT [0], [NF: 1 0: "begin(dir dir/file)end" +TOTAL: 1 ]) AT_CLEANUP @@ -735,6 +805,7 @@ EOT 2: dir 3: dir/file 4: end +TOTAL: 5 ]) AT_CLEANUP @@ -751,6 +822,7 @@ EOT [0], [NF: 1 0: "begin(dir dir/file)end" +TOTAL: 1 ]) AT_CLEANUP @@ -771,6 +843,7 @@ EOT 1: foo 2: bar 3: baz +TOTAL: 4 ]) AT_CLEANUP @@ -792,6 +865,7 @@ EOT 1: dir/1.c 2: dir/2.c 3: end +TOTAL: 4 ]) AT_CLEANUP @@ -811,6 +885,7 @@ EOT 0: begin 1: dir/*.d 2: end +TOTAL: 3 ]) AT_CLEANUP @@ -829,6 +904,7 @@ EOT [NF: 2 0: begin 1: end +TOTAL: 2 ]) AT_CLEANUP @@ -858,6 +934,7 @@ TESTWSP([append],[],[-- extra arguments follow], 3: extra 4: arguments 5: follow +TOTAL: 3 ]) TESTWSP([append + dooffs + 
env],[], @@ -873,8 +950,69 @@ TESTWSP([append + dooffs + env],[], 6: extra 7: arguments 8: follow +TOTAL: 4 ]) +# Maxwords +TESTWSP([maxwords],[], +[trimnl maxwords 3], +[ws_maxwords limits the number of returned words], +[NF: 3 +0: ws_maxwords +1: limits +2: "the number of returned words" +TOTAL: 3 +]) + +TESTWSP([maxwords return_delims],[], +[trimnl maxwords 8 return_delims delim :-], +[foo:::bar-:baz-quux:ux:zu], +[NF: 8 +0: foo +1: : +2: bar +3: - +4: : +5: baz +6: - +7: quux:ux:zu +TOTAL: 8 +]) + +TESTWSP([maxwords return_delims -squeeze_delims],[], +[trimnl maxwords 8 return_delims -squeeze_delims delim :-], +[foo:::bar-:baz:qux-], +[NF: 8 +0: foo +1: : +2: : +3: : +4: bar +5: - +6: : +7: baz:qux- +TOTAL: 8 +]) + +TESTWSP([maxwords incremental],[], +[trimnl maxwords 3 incremental], +[foo bar baz qux uz + + +], +[NF: 1 +0: foo +TOTAL: 1 +NF: 1 +0: bar +TOTAL: 2 +NF: 1 +0: "baz qux uz" +TOTAL: 3 +], +[input exhausted +]) + m4_popdef([TESTWSP]) m4_popdef([wspnum]) m4_popdef([wspid]) diff --git a/tests/wsp.c b/tests/wsp.c index 84efc13..a96fb7f 100644 --- a/tests/wsp.c +++ b/tests/wsp.c @@ -520,6 +520,30 @@ main (int argc, char **argv) ws.ws_options |= flag; continue; } + + if (strcmp (opt, "maxwords") == 0) + { + char *p; + wsflags |= WRDSF_OPTIONS; + ws.ws_options |= WRDSO_MAXWORDS; + + i++; + + if (i == argc) + { + fprintf (stderr, "%s: missing arguments for %s\n", + progname, opt); + exit (1); + } + ws.ws_maxwords = strtoul (argv[i], &p, 10); + if (*p) + { + fprintf (stderr, "%s: invalid number: %s\n", + progname, argv[i]); + exit (1); + } + continue; + } if (strchr (opt, '=')) { @@ -640,6 +664,7 @@ main (int argc, char **argv) print_qword (ws.ws_wordv[i], plaintext_option); putchar ('\n'); } + printf ("TOTAL: %lu\n", (unsigned long) ws.ws_wordi); } return 0; }