From 271b3b3b639774ce63b5e68139a0dccf3304685b Mon Sep 17 00:00:00 2001
From: Sergey Poznyakoff
Date: Wed, 11 Mar 2009 12:35:52 +0200
Subject: [PATCH] Start writing wordsplit.

* gconf/wordsplit.c, gconf/wordsplit.h: New files.
* tests/wordsplit.at: New file.
* tests/wstest.c, tests/wsbatch.c: New files.
---
Review changes to the original submission:
 - scan_word reports an unclosed quote as -WRDSE_QUOTE; the plain value
   equals _WRDS_WORD and was taken for a word.
 - scan_word returns WRDSE_EOF when only whitespace is left, instead of
   yielding an empty word, and resumes scanning right after a closing
   quote.
 - skip_sed_expr and wordsplit_unquote_copy stay within the bytes they
   are given.
 - wordsplit_free frees the words as well as the array.
 - wordsplit.h gets an include guard and declares wordsplit_len.
The blob ids on the index lines are those of the original submission.

 wordsplit.c | 670 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 wordsplit.h |  95 ++++++++
 2 files changed, 765 insertions(+)
 create mode 100644 wordsplit.c
 create mode 100644 wordsplit.h

diff --git a/wordsplit.c b/wordsplit.c
new file mode 100644
index 0000000..a5bb13e
--- /dev/null
+++ b/wordsplit.c
@@ -0,0 +1,670 @@
+/* wordsplit - a word splitter
+   Copyright (C) 2009 Sergey Poznyakoff
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License along
+   with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+/* NOTE(review): the header names on the #include lines were lost when
+   this patch was extracted.  The list below is reconstructed from the
+   identifiers the code uses; confirm it against the original tree.  */
+#include <errno.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wordsplit.h>
+#include <error.h>
+#include <gettext.h>
+#define _(msgid) gettext (msgid)
+#include <c-ctype.h>
+#include <xalloc.h>
+
+/* Whitespace recognized when WRDSF_WS is set.  */
+#define isws(c) ((c)==' '||(c)=='\t'||(c)=='\n')
+/* True if C occurs in DELIM.  strchr also matches the terminating
+   NUL, so a NUL character counts as a delimiter.  */
+#define isdelim(c,delim) (strchr(delim,(c))!=NULL)
+
+/* Per-word state kept in ws_flags above the public WRDSF_ bits.  */
+#define _ARGCV_WORD_SED_EXPR 0x10000
+#define _ARGCV_WORD_MASK     0xf0000
+
+/* Initial size of ws_wordv and its growth step, in slots.  */
+#define ALLOC_INIT 128
+#define ALLOC_INCR 128
+
+/* Prepare WSP for splitting the LEN bytes at INPUT according to FLAGS.
+   Fields the caller did not mark as initialized (ws_offs, ws_delim,
+   ws_comment) are set to their defaults.  With WRDSF_REUSE the existing
+   ws_wordv is kept; unless WRDSF_APPEND is given too, ws_wordc is reset
+   to 0, and the words stored by the previous call are not freed here.
+   Return 0 on success.  Unless both WRDSF_NOVAR and WRDSF_NOCMD are
+   set, return 1 with errno set to EINVAL.  */
+static int
+wordsplit_init (struct wordsplit *wsp, const char *input, size_t len,
+                int flags)
+{
+  wsp->ws_flags = flags;
+  if ((wsp->ws_flags & (WRDSF_NOVAR|WRDSF_NOCMD))
+      != (WRDSF_NOVAR|WRDSF_NOCMD))
+    {
+      if (wsp->ws_flags & WRDSF_SHOWERR)
+        error (0, 0,
+               _("variable expansion and command substitution "
+                 "are not yet supported"));
+      errno = EINVAL;
+      return 1;
+    }
+
+  wsp->ws_input = input;
+  wsp->ws_len = len;
+
+  if (!(wsp->ws_flags & WRDSF_DOOFFS))
+    wsp->ws_offs = 0;
+
+  if (!(wsp->ws_flags & WRDSF_DELIM))
+    wsp->ws_delim = " ";
+
+  if (!(wsp->ws_flags & WRDSF_COMMENT))
+    wsp->ws_comment = NULL;
+
+  if (wsp->ws_flags & WRDSF_REUSE)
+    {
+      wsp->ws_wordn = wsp->ws_wordc + 1;
+      if (wsp->ws_flags & WRDSF_DOOFFS)
+        wsp->ws_wordn += wsp->ws_offs;
+      if (!(wsp->ws_flags & WRDSF_APPEND))
+        wsp->ws_wordc = 0;
+    }
+  else
+    {
+      wsp->ws_wordv = NULL;
+      wsp->ws_wordc = 0;
+      wsp->ws_wordn = 0;
+    }
+
+  wsp->ws_endp = 0;
+  return 0;
+}
+
+/* Make sure ws_wordv has room for one more entry after the ws_wordc
+   words already stored (and the ws_offs leading slots, if WRDSF_DOOFFS
+   is set).  Return 0 on success.  On allocation failure return 1 with
+   errno set to ENOMEM, or call xalloc_die if WRDSF_ENOMEMABRT is set.  */
+static int
+alloc_space (struct wordsplit *wsp)
+{
+  size_t offs = (wsp->ws_flags & WRDSF_DOOFFS) ? wsp->ws_offs : 0;
+  char **ptr;
+  size_t newalloc;
+
+  if (wsp->ws_wordv == NULL)
+    {
+      newalloc = offs + ALLOC_INIT;
+      ptr = calloc (newalloc, sizeof (ptr[0]));
+    }
+  else if (wsp->ws_wordn < offs + wsp->ws_wordc + 1)
+    {
+      newalloc = offs + wsp->ws_wordc + ALLOC_INCR;
+      ptr = realloc (wsp->ws_wordv, newalloc * sizeof (ptr[0]));
+    }
+  else
+    return 0;
+
+  if (ptr)
+    {
+      wsp->ws_wordn = newalloc;
+      wsp->ws_wordv = ptr;
+    }
+  else
+    {
+      if (wsp->ws_flags & WRDSF_ENOMEMABRT)
+        xalloc_die ();
+      else if (wsp->ws_flags & WRDSF_SHOWERR)
+        error (0, 0, _("memory exhausted"));
+      errno = ENOMEM;
+      return 1;
+    }
+  return 0;
+}
+
+/* Skip the sed-style substitution (an 's', a punctuation delimiter, two
+   delimited parts and trailing alphanumeric flags) that starts at offset
+   I of COMMAND, and any further ones joined to it by semicolons.  LEN
+   is the length of COMMAND.  Return the offset at which scanning
+   stopped, which is never greater than LEN.  */
+static size_t
+skip_sed_expr (const char *command, size_t i, size_t len)
+{
+  int state;
+
+  do
+    {
+      int delim;
+
+      if (i < len && command[i] == ';')
+        i++;
+      /* Check the bound before looking at the characters.  */
+      if (!(i + 3 < len && command[i] == 's' && c_ispunct (command[i+1])))
+        break;
+
+      delim = command[++i];
+      state = 1;
+      for (i++; i < len; i++)
+        {
+          if (state == 3)
+            {
+              if (command[i] == delim || !c_isalnum (command[i]))
+                break;
+            }
+          else if (command[i] == '\\')
+            i++;
+          else if (command[i] == delim)
+            state++;
+        }
+    }
+  while (state == 3 && i < len && command[i] == ';');
+
+  /* A backslash in the last position steps I past the end.  */
+  return i > len ? len : i;
+}
+
+/* Return the offset at which the next scan starts, given that the
+   previous one stopped at ws_endp.  With WRDSF_SQUEEZE_DELIMS, advance
+   over the delimiters that immediately follow ws_endp, stopping on the
+   last of them.  Unless WRDSF_RETURN_DELIMS is set, step over the
+   character at the resulting position.  */
+static size_t
+skip_delim (struct wordsplit *wsp)
+{
+  size_t start = wsp->ws_endp;
+  if (wsp->ws_flags & WRDSF_SQUEEZE_DELIMS)
+    {
+      do
+        start++;
+      while (start < wsp->ws_len
+             && isdelim (wsp->ws_input[start], wsp->ws_delim));
+      start--;
+    }
+
+  if (!(wsp->ws_flags & WRDSF_RETURN_DELIMS))
+    start++;
+
+  return start;
+}
+
+/* Results of scan_word.  WRDSE_EOF (0) ends the scan, the positive
+   values below keep it going, and errors are reported as a negated
+   WRDSE_ code: WRDSE_QUOTE has the same value as _WRDS_WORD, so
+   returning it unchanged would be taken for a word.  */
+#define _WRDS_WORD 1
+#define _WRDS_CONT 2
+
+/* Find the next word of ws_input at or after offset *PSTART.
+   Return _WRDS_WORD and store the offset of its first character in
+   *PSTART and the offset following its last character in *PEND.
+   Return _WRDS_CONT if only a comment was found, WRDSE_EOF at the end
+   of input, and -WRDSE_QUOTE if a quote is left unclosed.  Unless
+   WRDSE_EOF is returned, ws_endp is set to the offset where the scan
+   stopped.  */
+static int
+scan_word (struct wordsplit *wsp, size_t *pstart, size_t *pend)
+{
+  size_t start = *pstart;
+  size_t len = wsp->ws_len;
+  const char *command = wsp->ws_input;
+  const char *delim = wsp->ws_delim;
+  const char *comment = wsp->ws_comment;
+
+  size_t i = start;
+
+  if (i >= len)
+    return WRDSE_EOF;
+
+  if (wsp->ws_flags & WRDSF_WS)
+    {
+      /* Skip initial whitespace */
+      while (i < len && isws (command[i]))
+        i++;
+      /* Only whitespace was left.  Stop here instead of reading
+         command[len] and reporting a spurious empty word.  */
+      if (i >= len)
+        return WRDSE_EOF;
+    }
+
+  start = i;
+
+  wsp->ws_flags &= ~_ARGCV_WORD_MASK;
+
+  if (wsp->ws_flags & WRDSF_SED_EXPR
+      && i + 3 < len && command[i] == 's' && c_ispunct (command[i+1]))
+    {
+      wsp->ws_flags |= _ARGCV_WORD_SED_EXPR;
+      i = skip_sed_expr (command, i, len);
+    }
+  else if (!isdelim (command[i], delim))
+    {
+      while (i < len)
+        {
+          if (comment && strchr (comment, command[i]) != NULL)
+            {
+              size_t j;
+              for (j = i + 1; j < len && command[j] != '\n'; j++)
+                ;
+              *pstart = start;
+              *pend = i;
+              wsp->ws_endp = j;
+              return i > start ? _WRDS_WORD : _WRDS_CONT;
+            }
+
+          if (wsp->ws_flags & WRDSF_QUOTE)
+            {
+              if (command[i] == '\\')
+                {
+                  if (++i == len)
+                    break;
+                  i++;
+                  continue;
+                }
+
+              if (command[i] == '\'' || command[i] == '"')
+                {
+                  size_t j;
+                  for (j = i + 1; j < len && command[j] != command[i]; j++)
+                    if (command[j] == '\\')
+                      j++;
+                  if (j < len && command[j] == command[i])
+                    {
+                      /* Resume right after the closing quote, so that
+                         whatever follows it (the end of input, another
+                         quote, a backslash) is examined like any other
+                         character.  */
+                      i = j + 1;
+                      continue;
+                    }
+                  else
+                    {
+                      wsp->ws_endp = i;
+                      if (wsp->ws_flags & WRDSF_SHOWERR)
+                        error (0, 0,
+                               _("missing closing %c (start near #%lu)"),
+                               command[i], (unsigned long) i);
+                      return -WRDSE_QUOTE;
+                    }
+                }
+            }
+
+          if (((wsp->ws_flags & WRDSF_WS) && isws (command[i]))
+              || isdelim (command[i], delim))
+            break;
+          else
+            i++;
+        }
+    }
+  else if (wsp->ws_flags & WRDSF_RETURN_DELIMS)
+    i++;
+
+  *pstart = start;
+  *pend = i;
+  wsp->ws_endp = i;
+
+  return _WRDS_WORD;
+}
+
+/* Escape table: each pair is the letter that follows a backslash and
+   the character the sequence stands for.  */
+static const char quote_transtab[] = "\\\\a\ab\bf\fn\nr\rt\tv\v";
+
+/* Return the character denoted by a backslash followed by C, or C
+   itself if C is not one of the known escape letters.  */
+int
+wordsplit_unquote_char (int c)
+{
+  const char *p;
+
+  for (p = quote_transtab; *p; p += 2)
+    {
+      if (*p == c)
+        return p[1];
+    }
+  return c;
+}
+
+/* Return the letter that, preceded by a backslash, denotes C, or -1 if
+   C has no such escape sequence.  */
+int
+wordsplit_quote_char (int c)
+{
+  const char *p;
+
+  /* Walk the table forwards: stepping backwards from its end would
+     move the pointer before the start of the array.  */
+  for (p = quote_transtab; *p; p += 2)
+    {
+      if (p[1] == c)
+        return p[0];
+    }
+  return -1;
+}
+
+/* Value of the hexadecimal digit C, or 255 if C is not one.  */
+#define to_num(c) \
+  (isdigit (c) ? (c) - '0' : (isxdigit (c) ? toupper (c) - 'A' + 10 : 255))
+
+/* Convert at most CNT digits of base BASE starting at SRC.  Store the
+   value in *PVAL and return the number of characters converted.  */
+static int
+xtonum (int *pval, const char *src, int base, int cnt)
+{
+  int i, val;
+
+  for (i = 0, val = 0; i < cnt; i++, src++)
+    {
+      int n = *(const unsigned char *) src;
+      if (n > 127 || (n = to_num (n)) >= base)
+        break;
+      val = val*base + n;
+    }
+  *pval = val;
+  return i;
+}
+
+/* Return the number of bytes wordsplit_quote_copy writes for STR when
+   given the same QUOTE_HEX.  Set *QUOTE to 1 if STR contains a space or
+   a double quote, and to 0 otherwise.  */
+size_t
+wordsplit_quoted_length (const char *str, int quote_hex, int *quote)
+{
+  size_t len = 0;
+
+  *quote = 0;
+  /* Characters go to isprint as unsigned char: a negative argument
+     other than EOF is undefined.  */
+  for (; *str; str++)
+    {
+      if (*str == ' ')
+        {
+          len++;
+          *quote = 1;
+        }
+      else if (*str == '"')
+        {
+          len += 2;
+          *quote = 1;
+        }
+      else if (*str != '\t' && *str != '\\' && isprint ((unsigned char) *str))
+        len++;
+      else if (quote_hex)
+        len += 3;
+      else
+        {
+          if (wordsplit_quote_char (*str) != -1)
+            len += 2;
+          else
+            len += 4;
+        }
+    }
+  return len;
+}
+
+/* Copy the N bytes at SRC to DST, dropping paired quotes and replacing
+   backslash escapes (\xHH, up to three octal digits, and the letters
+   known to wordsplit_unquote_char) with the characters they denote.
+   The result is NUL-terminated and never longer than N bytes, so DST
+   needs room for N + 1.  Nothing at or after SRC[N] is read.  */
+void
+wordsplit_unquote_copy (char *dst, const char *src, size_t n)
+{
+  size_t i = 0;
+  int c;
+  int expect_delim = 0;
+
+  while (i < n)
+    {
+      switch (src[i])
+        {
+        case '\'':
+        case '"':
+          if (!expect_delim)
+            {
+              size_t j;
+
+              /* Treat it as an opening quote only if its pair occurs
+                 within the N bytes being copied.  */
+              for (j = i + 1; j < n && src[j] != src[i]; j++)
+                if (src[j] == '\\')
+                  j++;
+              if (j < n)
+                expect_delim = src[i++];
+              else
+                *dst++ = src[i++];
+            }
+          else if (expect_delim == src[i])
+            {
+              /* Closing quote: drop it and leave the quoted state, so
+                 that a later quoted part is recognized as well.  */
+              expect_delim = 0;
+              ++i;
+            }
+          else
+            *dst++ = src[i++];
+          break;
+
+        case '\\':
+          ++i;
+          if (i >= n)
+            /* A backslash that ends the input is dropped.  */
+            break;
+          if (src[i] == 'x' || src[i] == 'X')
+            {
+              /* At most two hexadecimal digits, if that many are left.  */
+              size_t left = n - i - 1;
+              int off = xtonum (&c, src + i + 1, 16,
+                                left < 2 ? (int) left : 2);
+
+              if (off == 0)
+                {
+                  *dst++ = '\\';
+                  *dst++ = src[i++];
+                }
+              else
+                {
+                  *dst++ = c;
+                  i += off + 1;
+                }
+            }
+          else if ((unsigned char)src[i] < 128 && isdigit (src[i]))
+            {
+              /* At most three octal digits, if that many are left.  */
+              size_t left = n - i;
+              int off = xtonum (&c, src + i, 8,
+                                left < 3 ? (int) left : 3);
+
+              if (off == 0)
+                {
+                  *dst++ = '\\';
+                  *dst++ = src[i++];
+                }
+              else
+                {
+                  *dst++ = c;
+                  i += off;
+                }
+            }
+          else
+            *dst++ = wordsplit_unquote_char (src[i++]);
+          break;
+
+        default:
+          *dst++ = src[i++];
+        }
+    }
+  *dst = 0;
+}
+
+/* Write the quoted form of SRC to DST: a double quote is preceded by a
+   backslash, and a tab, a backslash or a non-printable character is
+   written as %XX if QUOTE_HEX is set, otherwise as a backslash escape
+   (a letter known to wordsplit_quote_char or three octal digits).
+   DST is not NUL-terminated; wordsplit_quoted_length gives the number
+   of bytes written.  */
+void
+wordsplit_quote_copy (char *dst, const char *src, int quote_hex)
+{
+  for (; *src; src++)
+    {
+      if (*src == '"')
+        {
+          *dst++ = '\\';
+          *dst++ = *src;
+        }
+      else if (*src != '\t' && *src != '\\' && isprint ((unsigned char) *src))
+        *dst++ = *src;
+      else
+        {
+          char tmp[4];
+
+          if (quote_hex)
+            {
+              snprintf (tmp, sizeof tmp, "%%%02X", *(const unsigned char *) src);
+              memcpy (dst, tmp, 3);
+              dst += 3;
+            }
+          else
+            {
+              int c = wordsplit_quote_char (*src);
+              *dst++ = '\\';
+              if (c != -1)
+                *dst++ = c;
+              else
+                {
+                  snprintf (tmp, sizeof tmp, "%03o", *(const unsigned char *) src);
+                  memcpy (dst, tmp, 3);
+                  dst += 3;
+                }
+            }
+        }
+    }
+}
+
+/* Split the LEN bytes at COMMAND into words according to FLAGS.  The
+   words, each allocated with malloc, are stored in ws_wordv of WSP
+   starting at index ws_offs and followed by a NULL; ws_wordc is their
+   number.  Return WRDSE_EOF (0) on success, WRDSE_QUOTE if a quote is
+   left unclosed, WRDSE_NOSPACE if memory is exhausted, and 1 with errno
+   set to EINVAL if FLAGS lack WRDSF_NOVAR or WRDSF_NOCMD.  */
+int
+wordsplit_len (const char *command, size_t len, struct wordsplit *wsp,
+               int flags)
+{
+  int rc;
+  size_t start = 0, end = 0;
+
+  rc = wordsplit_init (wsp, command, len, flags);
+  if (rc)
+    return rc;
+
+  for (; (rc = scan_word (wsp, &start, &end)) > 0; start = skip_delim (wsp))
+    {
+      int unquote = 1;
+      size_t n;
+      char *p;
+
+      if (rc == _WRDS_CONT)
+        continue;
+
+      if (alloc_space (wsp))
+        return WRDSE_NOSPACE;
+
+      n = end - start;
+
+      if (wsp->ws_flags & WRDSF_QUOTE &&
+          !(wsp->ws_flags & _ARGCV_WORD_SED_EXPR))
+        {
+          /* Strip the quotes enclosing the whole word.  At least two
+             characters are needed for a pair.  */
+          if (start + 1 < end
+              && (command[start] == '"' || command[start] == '\'')
+              && command[end-1] == command[start])
+            {
+              unquote = command[start] == '"';
+              start++;
+              n -= 2;
+            }
+        }
+      else
+        unquote = 0;
+
+      p = malloc (n + 1);
+      if (!p)
+        {
+          if (wsp->ws_flags & WRDSF_ENOMEMABRT)
+            xalloc_die ();
+          if (wsp->ws_flags & WRDSF_SHOWERR)
+            error (0, 0, _("memory exhausted"));
+          if (!(wsp->ws_flags & WRDSF_REUSE))
+            wordsplit_free (wsp);
+          errno = ENOMEM;
+          return WRDSE_NOSPACE;
+        }
+
+      if (unquote)
+        wordsplit_unquote_copy (p, &command[start], n);
+      else
+        {
+          memcpy (p, &command[start], n);
+          p[n] = 0;
+        }
+      wsp->ws_wordv[wsp->ws_offs + wsp->ws_wordc] = p;
+      wsp->ws_wordc++;
+    }
+  if (alloc_space (wsp))
+    return WRDSE_NOSPACE;
+  wsp->ws_wordv[wsp->ws_offs + wsp->ws_wordc] = NULL;
+  /* FIXME: if (rc) free(ws) */
+  /* scan_word reports errors as negated WRDSE_ codes.  */
+  return rc < 0 ? -rc : rc;
+}
+
+/* Split the NUL-terminated string COMMAND.  See wordsplit_len.  */
+int
+wordsplit (const char *command, struct wordsplit *ws, int flags)
+{
+  return wordsplit_len (command, strlen (command), ws, flags);
+}
+
+/* Release the words stored in WS together with the ws_wordv array.
+   The words are allocated by wordsplit_len and owned by WS, so freeing
+   the array alone would leak them.  */
+void
+wordsplit_free (struct wordsplit *ws)
+{
+  if (ws->ws_wordv)
+    {
+      size_t i;
+
+      /* Words start at index ws_offs, which is where wordsplit_len
+         stores them.  */
+      for (i = 0; i < ws->ws_wordc; i++)
+        free (ws->ws_wordv[ws->ws_offs + i]);
+      free (ws->ws_wordv);
+    }
+  ws->ws_wordv = NULL;
+  ws->ws_wordc = 0;
+  ws->ws_wordn = 0;
+}
diff --git a/wordsplit.h b/wordsplit.h
new file mode 100644
index 0000000..d4d1f0c
--- /dev/null
+++ b/wordsplit.h
@@ -0,0 +1,95 @@
+/* wordsplit - a word splitter
+   Copyright (C) 2009 Sergey Poznyakoff
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License along
+   with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifndef WORDSPLIT_H
+#define WORDSPLIT_H
+
+#include <stddef.h>
+
+struct wordsplit
+{
+  size_t ws_wordc;          /* Number of words stored in ws_wordv */
+  char **ws_wordv;          /* Array of words, NULL-terminated */
+  size_t ws_offs;           /* Index of the first word in ws_wordv */
+  size_t ws_wordn;          /* Number of slots allocated for ws_wordv */
+  int ws_flags;             /* WRDSF_ flags */
+  const char *ws_delim;     /* Word delimiters */
+  const char *ws_comment;   /* Characters that start a comment */
+
+  const char *ws_input;     /* Input being split */
+  size_t ws_len;            /* Length of ws_input */
+  size_t ws_endp;           /* Offset where the last scan stopped */
+};
+
+/* Append the words found to the array resulting from a previous
+   call. */
+#define WRDSF_APPEND            0x0001
+/* Insert ws_offs initial NULLs in the array ws_wordv.
+   (These are not counted in the returned ws_wordc.) */
+#define WRDSF_DOOFFS            0x0002
+/* Don't do command substitution. Reserved for future use. */
+#define WRDSF_NOCMD             0x0004
+/* The parameter p resulted from a previous call to
+   wordsplit(), and wordsplit_free() was not called. Reuse the
+   allocated storage. */
+#define WRDSF_REUSE             0x0008
+/* Print errors */
+#define WRDSF_SHOWERR           0x0010
+/* Consider it an error if an undefined shell variable
+   is expanded. */
+#define WRDSF_UNDEF             0x0020
+
+/* Don't do variable expansion. Reserved for future use. */
+#define WRDSF_NOVAR             0x0040
+/* Abort on ENOMEM error */
+#define WRDSF_ENOMEMABRT        0x0080
+/* Treat whitespace as delimiters */
+#define WRDSF_WS                0x0100
+/* Handle quotes and escape directives */
+#define WRDSF_QUOTE             0x0200
+/* Replace each input sequence of repeated delimiters with a single
+   delimiter */
+#define WRDSF_SQUEEZE_DELIMS    0x0400
+/* Return delimiters */
+#define WRDSF_RETURN_DELIMS     0x0800
+/* Treat sed expressions as words */
+#define WRDSF_SED_EXPR          0x1000
+/* ws_delim field is initialized */
+#define WRDSF_DELIM             0x2000
+/* ws_comment field is initialized */
+#define WRDSF_COMMENT           0x4000
+
+#define WRDSF_DEFFLAGS \
+  (WRDSF_NOVAR | WRDSF_NOCMD | \
+   WRDSF_WS | WRDSF_QUOTE | WRDSF_SQUEEZE_DELIMS)
+
+/* Return codes of wordsplit and wordsplit_len */
+#define WRDSE_EOF        0
+#define WRDSE_QUOTE      1
+#define WRDSE_NOSPACE    2
+
+int wordsplit (const char *s, struct wordsplit *p, int flags);
+int wordsplit_len (const char *s, size_t len, struct wordsplit *p,
+                   int flags);
+void wordsplit_free (struct wordsplit *p);
+
+int wordsplit_unquote_char (int c);
+int wordsplit_quote_char (int c);
+size_t wordsplit_quoted_length (const char *str, int quote_hex, int *quote);
+void wordsplit_unquote_copy (char *dst, const char *src, size_t n);
+void wordsplit_quote_copy (char *dst, const char *src, int quote_hex);
+
+#endif /* WORDSPLIT_H */