From 7eaa3c45bedf204a274f5de2da4269d510d2bd56 Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff Date: Wed, 24 Jul 2019 13:19:20 +0300 Subject: [PATCH] Allow the caller to modify variable name constituents. This new feature makes it possible to expand variables with dots or other unusual characters in their names. * README: Update. * wordsplit.3: Document the use of the ws_namechar member. * wordsplit.c (is_name_char): New static function. (ISVARCHR): Removed. Use is_name_char instead. (_wsplt_seterr): Set errno to EINVAL if WRDSE_USAGE is returned. (_wsplt_subsplit): Clear the WRDSO_MAXWORDS option in the subsplit. Pass ws_namechar. (wordsplit_init): Check for valid ws_namechar content. (expvar): Take into account ws_namechar when scanning variable name. Fix name length passed to the _wsplt_setctxerr call. * wordsplit.h (ws_namechar): New member. (WRDSO_NAMECHAR): New option bit. * wsp.c: New options: -D to define an "extra" environment entry (possibly containing characters not allowed by the shell), and -namechar to define additional variable name constituents. * wordsplit.at: Test namechar modifications. --- README | 4 +- wordsplit.3 | 63 +++++++++++++++++++++++- wordsplit.at | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++ wordsplit.c | 121 +++++++++++++++++++++++----------------------- wordsplit.h | 6 +++ wsp.c | 118 ++++++++++++++++++++++++++++++++++++++------- 6 files changed, 363 insertions(+), 81 deletions(-) diff --git a/README b/README index 96ffbec..16f2c04 100644 --- a/README +++ b/README @@ -230,8 +230,8 @@ the suite. It was therefore decided that it would be advisable to have wordsplit as a separate package which could be easily included in another project without incurring unnecessary overhead. -Currently the work is underway on incorporating it into existing -projects. +By the end of July 2019, all mentioned packages switched to using +wordsplit as a submodule. 
* References diff --git a/wordsplit.3 b/wordsplit.3 index e742030..337170f 100644 --- a/wordsplit.3 +++ b/wordsplit.3 @@ -14,7 +14,7 @@ .\" You should have received a copy of the GNU General Public License .\" along with wordsplit. If not, see . .\" -.TH WORDSPLIT 3 "July 9, 2019" "WORDSPLIT" "Wordsplit User Reference" +.TH WORDSPLIT 3 "July 24, 2019" "WORDSPLIT" "Wordsplit User Reference" .SH NAME wordsplit \- split string into words .SH SYNOPSIS @@ -460,6 +460,43 @@ for each such word using When matching a pattern, the dot at the start of a name or immediately following a slash must be matched explicitly, unless the \fBWRDSO_DOTGLOB\fR option is set. +.SH VARIABLE NAMES +By default a shell-like lexical structure of a variable name is +assumed. A valid variable name begins with an alphabetical +character or underscore and contains alphabetical characters, digits +and underscores. +.PP +The set of characters that constitute a variable name can be +augmented. To do so, initialize the \fBws_namechar\fR member to the +C string containing the characters to be added, set the +\fBWRDSO_NAMECHAR\fR bit in \fBws_options\fR and set the +\fBWRDSF_OPTIONS\fR bit in the \fIflags\fR argument. +.PP +For example, to allow colon in variable names, do: +.PP +.EX +struct wordsplit ws; +ws.ws_namechar = ":"; +ws.ws_options = WRDSO_NAMECHAR; +wordsplit(str, &ws, WRDSF_DEFFLAGS|WRDSF_OPTIONS); +.EE +.PP +Certain characters cannot be allowed to be a name constituent. These +are: +.BR $ , +.BR { , +.BR } , +.BR * , +.BR @ , +.BR \- , +.BR + , +.BR ? , +and +.BR = . +If any of these appears in \fBws_namechar\fR, the \fBwordsplit\fR (and +\fBwordsplit_len\fR) function will return the +.B WRDSE_USAGE +error. .SH LIMITING THE NUMBER OF WORDS The maximum number of words to be returned can be limited by setting the \fBws_maxwords\fR member to the desired count, and setting the @@ -608,6 +645,18 @@ tabulation character and \fB\\n\fR into newline. 
.B WRDSF_ESCAPE flag must be set if this member is initialized. .TP +.BI "const char *" ws_namechar +Lists characters that are allowed in a variable name, in addition to +alphanumerics and underscore. The +.B WRDSO_NAMECHAR +bit must be set in +.B ws_options +for this to take effect. +.sp +See the chapter +.BR "VARIABLE NAMES" , +for a detailed discussion. +.TP .BI "void (*" ws_alloc_die ") (wordsplit_t *)" This function is called when .B wordsplit @@ -984,6 +1033,18 @@ positional argument references. A negative argument reference has the form \fB${-\fIN\fB}\fR. It is expanded to the value of the argument with index \fB\fIws_paramc\fR \- \fIN\fR, i.e. \fIN\fRth if counting from the end. +.TP +.B WRDSO_NAMECHAR +When set, indicates that the +.B ws_namechar +member of the +.B wordsplit_t +struct has been initialized. +.sp +This member allows you to modify the notion of what characters can be +part of a valid variable name. See the chapter +.BR "VARIABLE NAMES" , +for a detailed discussion. .SH "ERROR CODES" .TP .BR WRDSE_OK ", " WRDSE_EOF diff --git a/wordsplit.at b/wordsplit.at index d7d8bc9..aa2c87d 100644 --- a/wordsplit.at +++ b/wordsplit.at @@ -1021,6 +1021,138 @@ NF: 1 TOTAL: 1 ]) +# Namechar modification tests + +TESTWSP([namechar modification],[], +[-namechar ".:" -Dx.y=one -Dx:foo=bar], +[one is $x.y, foo is $x:foo], +[NF: 6 +0: one +1: is +2: one, +3: foo +4: is +5: bar +TOTAL: 6 +]) + +AT_BANNER([namechar modification]) +TESTWSP([default value],[], +[-namechar ":."], +[${x:foo:-bar}], +[NF: 1 +0: bar +TOTAL: 1 +]) + +TESTWSP([default value (defined)],[], +[-namechar ":." -Dx:foo=qux], +[${x:foo:-bar}], +[NF: 1 +0: qux +TOTAL: 1 +], +[]) + +TESTWSP([default value (:- null)],[], +[-namechar ":." -Dx:foo=], +[${x:foo:-bar}], +[NF: 1 +0: bar +TOTAL: 1 +], +[]) + +TESTWSP([default value (- null)],[], +[-namechar ":." 
-Dx:foo=], +[${x:foo-bar}], +[NF: 0 +TOTAL: 0 +], +[]) + +TESTWSP([default value (- null, unset)],[], +[-namechar ":."], +[${x:foo-bar}], +[NF: 1 +0: bar +TOTAL: 1 +]) + +TESTWSP([assign default values],[], +[-namechar ":."], +[${x:foo=bar} +$x:foo], +[NF: 1 +0: bar +TOTAL: 1 +NF: 1 +0: bar +TOTAL: 1 +]) + +TESTWSP([default error message (var defined)],[], +[-namechar ":." -Dx:foo=bar], +[a ${x:foo:?} test], +[NF: 3 +0: a +1: bar +2: test +TOTAL: 3 +]) + +TESTWSP([default error message],[], +[-namechar ":."], +[${x:foo:?}], +[NF: 0 +TOTAL: 0 +], +[x:foo: variable null or not set +]) + +TESTWSP([custom error message (defined)],[wsp-custom-err wsp-custom-err03], +[-namechar ":." -Dx:foo=bar], +[a ${x:foo:?please define it} test], +[NF: 3 +0: a +1: bar +2: test +TOTAL: 3 +]) + +TESTWSP([custom error message],[wsp-custom-err wsp-custom-err04], +[-namechar ":."], +[a ${x:foo:?please define it} test], +[NF: 2 +0: a +1: test +TOTAL: 2 +], +[x:foo: please define it +]) + +TESTWSP([alternate value (defined)],[wsp-alt wsp-alt02], +[-namechar ":." 
-Dx:foo=bar], +[a ${x:foo:+isset} test], +[NF: 3 +0: a +1: isset +2: test +TOTAL: 3 +], +[], +[FOO=bar]) + +TESTWSP([alternate value],[wsp-alt wsp-alt03], +[-namechar ":."], +[a ${x:foo:+isset} test], +[NF: 2 +0: a +1: test +TOTAL: 2 +]) + + m4_popdef([TESTWSP]) m4_popdef([wspnum]) m4_popdef([wspid]) diff --git a/wordsplit.c b/wordsplit.c index d3ec9e1..99a8b4f 100644 --- a/wordsplit.c +++ b/wordsplit.c @@ -52,7 +52,14 @@ #define ISPRINT(c) (' ' <= ((unsigned) (c)) && ((unsigned) (c)) <= 127) #define ISVARBEG(c) (ISALPHA(c) || c == '_') -#define ISVARCHR(c) (ISALNUM(c) || c == '_') +static inline int +is_name_char (struct wordsplit *wsp, int c) +{ + return ISALNUM (c) + || c == '_' + || ((wsp->ws_options & WRDSO_NAMECHAR) + && strchr (wsp->ws_namechar, c)); +} #define WSP_RETURN_DELIMS(wsp) \ ((wsp)->ws_flags & WRDSF_RETURN_DELIMS || ((wsp)->ws_options & WRDSO_MAXWORDS)) @@ -92,6 +99,8 @@ _wsplt_seterr (struct wordsplit *wsp, int ec) wsp->ws_errno = ec; if (wsp->ws_flags & WRDSF_SHOWERR) wordsplit_perror (wsp); + if (ec == WRDSE_USAGE) + errno = EINVAL; return ec; } @@ -172,8 +181,9 @@ _wsplt_subsplit (struct wordsplit *wsp, struct wordsplit *wss, flags |= wsp->ws_flags & WRDSF_CLOSURE; } - wss->ws_options = wsp->ws_options; - + wss->ws_options = wsp->ws_options & ~WRDSO_MAXWORDS; + wss->ws_namechar = wsp->ws_namechar; + flags |= WRDSF_DELIM | WRDSF_ALLOC_DIE | WRDSF_ERROR @@ -260,11 +270,7 @@ wordsplit_init (struct wordsplit *wsp, const char *input, size_t len, if (!(wsp->ws_flags & WRDSF_NOCMD)) { if (!wsp->ws_command) - { - _wsplt_seterr (wsp, WRDSE_USAGE); - errno = EINVAL; - return wsp->ws_errno; - } + return _wsplt_seterr (wsp, WRDSE_USAGE); } if (wsp->ws_flags & WRDSF_SHOWDBG) @@ -333,6 +339,14 @@ wordsplit_init (struct wordsplit *wsp, const char *input, size_t len, wsp->ws_paramidx = wsp->ws_paramsiz = 0; wsp->ws_parambuf = NULL; + if (wsp->ws_options & WRDSO_NAMECHAR) + { + if (wsp->ws_namechar[strcspn(wsp->ws_namechar, "${}*@-+?=")]) + return 
_wsplt_seterr (wsp, WRDSE_USAGE); + } + else + wsp->ws_namechar = NULL; + wsp->ws_endp = 0; wsp->ws_wordi = 0; @@ -1387,7 +1401,7 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, if (ISVARBEG (str[0])) { for (i = 1; i < len; i++) - if (!ISVARCHR (str[i])) + if (!is_name_char (wsp, str[i])) break; *pend = str + i - 1; } @@ -1429,21 +1443,12 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, && (str[1] == '-' && ISDIGIT (str[2]))))) != 0)) { + int i0 = str[0] == '-' ? 1 : 0; str++; len--; - for (i = str[0] == '-' ? 1 : 0; i < len; i++) + for (i = i0; i < len; i++) { - if (str[i] == ':') - { - size_t j; - - defstr = str + i + 1; - if (find_closing_paren (str, i + 1, len, &j, "{}")) - return _wsplt_seterr (wsp, WRDSE_CBRACE); - *pend = str + j; - break; - } - else if (str[i] == '}') + if (str[i] == '}') { defstr = NULL; *pend = str + i; @@ -1456,6 +1461,8 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, defstr = str + i; if (find_closing_paren (str, i, len, &j, "{}")) return _wsplt_seterr (wsp, WRDSE_CBRACE); + if (i > i0 + 1 && str[i-1] == ':') + i--; *pend = str + j; break; } @@ -1473,8 +1480,10 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, return expvar_recover (wsp, str - 1, ptail, pend, flg); } } - else if (!ISVARCHR (str[i])) + else if (!is_name_char (wsp, str[i])) { + if (str[i] == ':' && i + 1 < len && strchr ("-+?=", str[i+1])) + continue; return expvar_recover (wsp, str - 1, ptail, pend, flg); } } @@ -1495,54 +1504,46 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, i - its length defstr - default replacement str */ - if (defstr && strchr("-+?=", defstr[0]) == 0) + if (is_param) { - rc = WRDSE_UNDEF; - defstr = NULL; + if (param_idx >= 0 && param_idx < wsp->ws_paramc) + { + value = strdup (wsp->ws_paramv[param_idx]); + if (!value) + rc = WRDSE_NOSPACE; + else + rc = WRDSE_OK; + } + else + rc = WRDSE_UNDEF; } else { - if (is_param) + if (wsp->ws_flags & WRDSF_GETVAR) { - if (param_idx >= 0 
&& param_idx < wsp->ws_paramc) + if (wsp->ws_options & WRDSO_GETVARPREF) { - value = strdup (wsp->ws_paramv[param_idx]); - if (!value) - rc = WRDSE_NOSPACE; - else - rc = WRDSE_OK; + rc = wsplt_env_getvar (wsp, str, i, &value); + if (rc == WRDSE_UNDEF) + rc = wsplt_env_lookup (wsp, str, i, &value); } else - rc = WRDSE_UNDEF; + { + rc = wsplt_env_lookup (wsp, str, i, &value); + if (rc == WRDSE_UNDEF) + rc = wsplt_env_getvar (wsp, str, i, &value); + } } else - { - if (wsp->ws_flags & WRDSF_GETVAR) - { - if (wsp->ws_options & WRDSO_GETVARPREF) - { - rc = wsplt_env_getvar (wsp, str, i, &value); - if (rc == WRDSE_UNDEF) - rc = wsplt_env_lookup (wsp, str, i, &value); - } - else - { - rc = wsplt_env_lookup (wsp, str, i, &value); - if (rc == WRDSE_UNDEF) - rc = wsplt_env_getvar (wsp, str, i, &value); - } - } - else - rc = wsplt_env_lookup (wsp, str, i, &value); - } + rc = wsplt_env_lookup (wsp, str, i, &value); + } - if (rc == WRDSE_OK - && (!value || value[0] == 0) - && defstr && defstr[-1] == ':') - { - free (value); - rc = WRDSE_UNDEF; - } + if (rc == WRDSE_OK + && (!value || value[0] == 0) + && defstr && defstr[-1] == ':') + { + free (value); + rc = WRDSE_UNDEF; } switch (rc) @@ -1628,7 +1629,7 @@ expvar (struct wordsplit *wsp, const char *str, size_t len, } else if (wsp->ws_flags & WRDSF_UNDEF) { - _wsplt_setctxerr (wsp, WRDSE_UNDEF, str, *pend - str + 1); + _wsplt_setctxerr (wsp, WRDSE_UNDEF, str, i); return 1; } else diff --git a/wordsplit.h b/wordsplit.h index 3451979..7c14cea 100644 --- a/wordsplit.h +++ b/wordsplit.h @@ -52,6 +52,10 @@ struct wordsplit const char *ws_comment; /* [Input] (WRDSF_COMMENT) Comment characters. */ const char *ws_escape[2]; /* [Input] (WRDSF_ESCAPE) Characters to be escaped with backslash. */ + const char *ws_namechar; /* [Input] (WRDSO_NAMECHAR) Characters that can + be parts of a variable name, in addition to + alphanumerics and underscore. 
*/ + void (*ws_alloc_die) (wordsplit_t *wsp); /* [Input] (WRDSF_ALLOC_DIE) Function called when out of memory. Must not return. */ @@ -247,6 +251,8 @@ struct wordsplit /* Enable negative positional indices (${-1} is the last positional parameter) */ #define WRDSO_PARAM_NEGIDX 0x00008000 +/* ws_namechar member is initialized */ +#define WRDSO_NAMECHAR 0x00010000 #define WRDSO_BSKEEP WRDSO_BSKEEP_WORD #define WRDSO_OESC WRDSO_OESC_WORD diff --git a/wsp.c b/wsp.c index cea7980..75fd6f5 100644 --- a/wsp.c +++ b/wsp.c @@ -41,7 +41,14 @@ enum env_type { env_none, /* No environment */ env_null, /* Null environment */ - env_sys /* Use system environment */ + env_sys, /* Use system environment */ + env_extra /* Use small built-in "extra" environment */ + }; + +enum + { + MAX_F_ENV = 16, + MAX_X_ENV = 16 }; struct wsclosure @@ -54,9 +61,14 @@ struct wsclosure the argv array. The ws.ws_dooffs field gives the number of such variables. Forces the WRDSF_DOOFFS flag. */ - char **fenvbase; /* Environment for testing the ws_getenv function */ + char *fenvbase[MAX_F_ENV+1]; + /* Environment for testing the ws_getenv function */ int fenvidx; /* Number of variables in fenvbase */ - int fenvmax; /* Size of fenbase (entries) */ + + char *xenvbase[MAX_X_ENV+1]; + /* Extra environment variables */ + int xenvidx; /* Number of variables in xenvbase */ + int append_start; /* First argument to append (index in argv) */ int append_count; /* Number of arguments to append */ }; @@ -110,7 +122,7 @@ getwsopt (int argc, char **argv, struct wsopt *wso, struct wsclosure *wsc) { if (strchr (opt, '=')) { - assert (wsc->fenvidx < wsc->fenvmax - 1); + assert (wsc->fenvidx < MAX_F_ENV); wsc->fenvbase[wsc->fenvidx++] = opt; return 0; } @@ -118,6 +130,31 @@ getwsopt (int argc, char **argv, struct wsopt *wso, struct wsclosure *wsc) return EOF; } + if (strncmp (opt, "-D", 2) == 0) + { + char *asgn; + + if (opt[2]) + asgn = opt + 2; + else if (wsoptind == argc) + { + fprintf (stderr, "%s: missing arguments for 
-D\n", + progname); + exit (1); + } + else + asgn = argv[wsoptind++]; + + if (strchr (asgn, '=')) + { + assert (wsc->xenvidx < MAX_F_ENV); + wsc->xenvbase[wsc->xenvidx++] = asgn; + return 0; + } + wsoptind--; + return EOF; + } + if (strcmp (opt, "--version") == 0) { print_version (); @@ -305,6 +342,14 @@ setfn_maxwords (int flag, int neg, char *arg, struct wsclosure *wsc) } } +static void +setfn_namechar (int flag, int neg, char *arg, struct wsclosure *wsc) +{ + wsc->wsflags |= WRDSF_OPTIONS; + wsc->ws.ws_options |= WRDSO_NAMECHAR; + wsc->ws.ws_namechar = arg; +} + static void setfn_global (int flag, int neg, char *arg, struct wsclosure *wsc) { @@ -323,6 +368,8 @@ setfn_env (int flag, int neg, char *arg, struct wsclosure *wsc) wsc->env_type = env_null; else if (strcmp (arg, "sys") == 0) wsc->env_type = env_sys; + else if (strcmp (arg, "extra") == 0) + wsc->env_type = env_extra; else { fprintf (stderr, "%s: environment flag: %s\n", progname, arg); @@ -400,6 +447,7 @@ struct wsopt opttab[] = { { "novarsplit", WRDSO_NOVARSPLIT, ws_boolean, setfn_option }, { "nocmdsplit", WRDSO_NOCMDSPLIT, ws_boolean, setfn_option }, { "maxwords", WRDSO_MAXWORDS, ws_required_argument, setfn_maxwords }, + { "namechar", WRDSO_NAMECHAR, ws_required_argument, setfn_namechar }, /* String options */ { "delim", WRDSF_DELIM, ws_required_argument, setfn_delim }, { "comment", WRDSF_COMMENT,ws_required_argument, setfn_comment }, @@ -420,7 +468,7 @@ help (void) { size_t i; - printf ("usage: %s [options] [VAR=VALUE...] [-- EXTRA...]\n", progname); + printf ("usage: %s [options] [-D VAR=VALUE ...] [VAR=VALUE...] 
[-- EXTRA...]\n", progname); printf ("options are:\n"); for (i = 0; opttab[i].name; i++) { @@ -480,28 +528,28 @@ print_qword (const char *word, int plaintext) /* Convert environment to K/V form */ static char ** -make_env_kv () +make_env_kv (char **origenv) { size_t i, j, size; char **newenv; /* Count the number of entries */ - for (i = 0; environ[i]; i++) + for (i = 0; origenv[i]; i++) ; size = i * 2 + 1; newenv = calloc (size, sizeof (newenv[0])); assert (newenv != NULL); - for (i = j = 0; environ[i]; i++) + for (i = j = 0; origenv[i]; i++) { - size_t len = strcspn (environ[i], "="); + size_t len = strcspn (origenv[i], "="); char *p = malloc (len+1); assert (p != NULL); - memcpy (p, environ[i], len); + memcpy (p, origenv[i], len); p[len] = 0; newenv[j++] = p; - p = strdup (environ[i] + len + 1); + p = strdup (origenv[i] + len + 1); assert (p != NULL); newenv[j++] = p; } @@ -626,7 +674,6 @@ int main (int argc, char **argv) { struct wsclosure wsc; - char *fenvbase[128]; char buf[1024], *ptr, *saved_ptr; int next_call = 0; @@ -634,9 +681,8 @@ main (int argc, char **argv) wsc.wsflags = 0; wsc.env_type = env_sys; wsc.offarg = 0; - wsc.fenvbase = fenvbase; - wsc.fenvmax = sizeof (fenvbase) / sizeof (fenvbase[0]); wsc.fenvidx = 0; + wsc.xenvidx = 0; wsc.ws.ws_options = 0; wsc.wsflags = (WRDSF_DEFFLAGS & ~WRDSF_NOVAR) | WRDSF_ENOMEMABRT | @@ -647,12 +693,14 @@ main (int argc, char **argv) while (getwsopt (argc, argv, opttab, &wsc) != EOF) ; + wsc.fenvbase[wsc.fenvidx] = NULL; + wsc.xenvbase[wsc.xenvidx] = NULL; + if (wsc.fenvidx > 0) { - wsc.fenvbase[wsc.fenvidx] = NULL; wsc.wsflags |= WRDSF_GETVAR | WRDSF_CLOSURE; wsc.ws.ws_getvar = wsp_getvar; - wsc.ws.ws_closure = fenvbase; + wsc.ws.ws_closure = wsc.fenvbase; } if (wsoptind < argc) @@ -674,11 +722,45 @@ main (int argc, char **argv) break; case env_sys: + { + char **newenv; + + if (wsc.xenvidx) + { + size_t i, j; + for (i = 0; environ[i]; i++) + ; + newenv = calloc (i + wsc.xenvidx + 1, sizeof (*newenv)); + assert 
(newenv != NULL); + for (i = 0; environ[i]; i++) + { + newenv[i] = strdup (environ[i]); + assert (newenv[i] != NULL); + } + for (j = 0; j < wsc.xenvidx; j++, i++) + { + newenv[i] = strdup (wsc.xenvbase[j]); + assert (newenv[i] != NULL); + } + newenv[i] = NULL; + } + else + newenv = environ; + + wsc.wsflags |= WRDSF_ENV; + if (wsc.wsflags & WRDSF_ENV_KV) + wsc.ws.ws_env = (const char **) make_env_kv (newenv); + else + wsc.ws.ws_env = (const char **) newenv; + } + break; + + case env_extra: wsc.wsflags |= WRDSF_ENV; if (wsc.wsflags & WRDSF_ENV_KV) - wsc.ws.ws_env = (const char **) make_env_kv (); + wsc.ws.ws_env = (const char **) make_env_kv (wsc.xenvbase); else - wsc.ws.ws_env = (const char **) environ; + wsc.ws.ws_env = (const char **) wsc.xenvbase; break; }