mirror of
git://git.gnu.org.ua/wordsplit.git
synced 2025-04-26 08:39:54 +03:00
Start writing wordsplit.
* gconf/wordsplit.c, gconf/wordsplit.h: New files. * tests/wordsplit.at: New file. * tests/wstest.c, tests/wsbatch.c: New files.
This commit is contained in:
commit
271b3b3b63
2 changed files with 656 additions and 0 deletions
568
wordsplit.c
Normal file
568
wordsplit.c
Normal file
|
@ -0,0 +1,568 @@
|
|||
/* wordsplit - a word splitter
|
||||
Copyright (C) 2009 Sergey Poznyakoff
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by the
|
||||
Free Software Foundation; either version 3 of the License, or (at your
|
||||
option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program. If not, see <http://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include <config.h>
|
||||
#endif
|
||||
|
||||
#include <ctype.h>
|
||||
#include <c-ctype.h>
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <wordsplit.h>
|
||||
|
||||
#include <error.h>
|
||||
#include <gettext.h>
|
||||
#define _(msgid) gettext (msgid)
|
||||
#include <xalloc.h>
|
||||
|
||||
#define isws(c) ((c)==' '||(c)=='\t'||(c)=='\n')
|
||||
#define isdelim(c,delim) (strchr(delim,(c))!=NULL)
|
||||
|
||||
#define _ARGCV_WORD_SED_EXPR 0x10000
|
||||
#define _ARGCV_WORD_MASK 0xf0000
|
||||
|
||||
#define ALLOC_INIT 128
|
||||
#define ALLOC_INCR 128
|
||||
|
||||
static int
|
||||
wordsplit_init (struct wordsplit *wsp, const char *input, size_t len,
|
||||
int flags)
|
||||
{
|
||||
wsp->ws_flags = flags;
|
||||
if ((wsp->ws_flags & (WRDSF_NOVAR|WRDSF_NOCMD))
|
||||
!= (WRDSF_NOVAR|WRDSF_NOCMD))
|
||||
{
|
||||
if (wsp->ws_flags & WRDSF_SHOWERR)
|
||||
error (0, 0,
|
||||
_("variable expansion and command substitution "
|
||||
"are not yet supported"));
|
||||
errno = EINVAL;
|
||||
return 1;
|
||||
}
|
||||
|
||||
wsp->ws_input = input;
|
||||
wsp->ws_len = len;
|
||||
|
||||
if (!(wsp->ws_flags & WRDSF_DOOFFS))
|
||||
wsp->ws_offs = 0;
|
||||
|
||||
if (!(wsp->ws_flags & WRDSF_DELIM))
|
||||
wsp->ws_delim = " ";
|
||||
|
||||
if (!(wsp->ws_flags & WRDSF_COMMENT))
|
||||
wsp->ws_comment = NULL;
|
||||
|
||||
if (wsp->ws_flags & WRDSF_REUSE)
|
||||
{
|
||||
wsp->ws_wordn = wsp->ws_wordc + 1;
|
||||
if (wsp->ws_flags & WRDSF_DOOFFS)
|
||||
wsp->ws_wordn += wsp->ws_offs;
|
||||
if (!(wsp->ws_flags & WRDSF_APPEND))
|
||||
wsp->ws_wordc = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
wsp->ws_wordv = NULL;
|
||||
wsp->ws_wordc = 0;
|
||||
wsp->ws_wordn = 0;
|
||||
}
|
||||
|
||||
wsp->ws_endp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
alloc_space (struct wordsplit *wsp)
|
||||
{
|
||||
size_t offs = (wsp->ws_flags & WRDSF_DOOFFS) ? wsp->ws_offs : 0;
|
||||
char **ptr;
|
||||
size_t newalloc;
|
||||
|
||||
if (wsp->ws_wordv == NULL)
|
||||
{
|
||||
newalloc = offs + ALLOC_INIT;
|
||||
ptr = calloc (newalloc, sizeof (ptr[0]));
|
||||
}
|
||||
else if (wsp->ws_wordn < offs + wsp->ws_wordc + 1)
|
||||
{
|
||||
newalloc = offs + wsp->ws_wordc + ALLOC_INCR;
|
||||
ptr = realloc (wsp->ws_wordv, newalloc * sizeof (ptr[0]));
|
||||
}
|
||||
else
|
||||
return 0;
|
||||
|
||||
if (ptr)
|
||||
{
|
||||
wsp->ws_wordn = newalloc;
|
||||
wsp->ws_wordv = ptr;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (wsp->ws_flags & WRDSF_ENOMEMABRT)
|
||||
xalloc_die ();
|
||||
else if (wsp->ws_flags & WRDSF_SHOWERR)
|
||||
error (0, 0, _("memory exhausted"));
|
||||
errno = ENOMEM;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Skip a sequence of sed-style substitutions ("sXpatXreplXflags",
   optionally chained with ';') that starts at offset I of COMMAND.
   LEN is the number of valid bytes in COMMAND.

   STATE counts the delimiters seen after the leading 's': 1 while in
   the pattern, 2 while in the replacement and 3 while in the trailing
   flags.  A backslash hides the character that follows it.

   Return the offset of the first byte past the recognized expressions.
   The result never exceeds LEN, and no byte at or past LEN is read.  */
static int
skip_sed_expr (const char *command, size_t i, size_t len)
{
  int state = 0;

  do
    {
      int delim;

      if (i < len && command[i] == ';')
        i++;
      /* Verify the bound first, so that COMMAND[I] is never read at or
         beyond LEN */
      if (!(i + 3 < len && command[i] == 's' && c_ispunct (command[i + 1])))
        break;

      delim = command[++i];
      state = 1;
      for (i++; i < len; i++)
        {
          if (state == 3)
            {
              /* Flags end at a delimiter or at the first
                 non-alphanumeric character */
              if (command[i] == delim || !c_isalnum (command[i]))
                break;
            }
          else if (command[i] == '\\')
            i++;
          else if (command[i] == delim)
            state++;
        }
    }
  while (state == 3 && i < len && command[i] == ';');

  /* A backslash in the last position pushes I one past LEN */
  if (i > len)
    i = len;
  return i;
}
|
||||
|
||||
static size_t
|
||||
skip_delim (struct wordsplit *wsp)
|
||||
{
|
||||
size_t start = wsp->ws_endp;
|
||||
if (wsp->ws_flags & WRDSF_SQUEEZE_DELIMS)
|
||||
{
|
||||
do
|
||||
start++;
|
||||
while (start < wsp->ws_len
|
||||
&& isdelim (wsp->ws_input[start], wsp->ws_delim));
|
||||
start--;
|
||||
}
|
||||
|
||||
if (!(wsp->ws_flags & WRDSF_RETURN_DELIMS))
|
||||
start++;
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
#define _WRDS_WORD 1
|
||||
#define _WRDS_CONT 2
|
||||
|
||||
static int
|
||||
scan_word (struct wordsplit *wsp, size_t *pstart, size_t *pend)
|
||||
{
|
||||
size_t start = *pstart;
|
||||
size_t len = wsp->ws_len;
|
||||
const char *command = wsp->ws_input;
|
||||
const char *delim = wsp->ws_delim;
|
||||
const char *comment = wsp->ws_comment;
|
||||
|
||||
size_t i = start;
|
||||
|
||||
if (i >= len)
|
||||
return WRDSE_EOF;
|
||||
|
||||
if (wsp->ws_flags & WRDSF_WS)
|
||||
{
|
||||
/* Skip initial whitespace */
|
||||
while (i < len && isws (command[i]))
|
||||
i++;
|
||||
}
|
||||
|
||||
start = i;
|
||||
|
||||
wsp->ws_flags &= ~_ARGCV_WORD_MASK;
|
||||
|
||||
if (wsp->ws_flags & WRDSF_SED_EXPR
|
||||
&& command[i] == 's' && i + 3 < len && c_ispunct (command[i+1]))
|
||||
{
|
||||
wsp->ws_flags |= _ARGCV_WORD_SED_EXPR;
|
||||
i = skip_sed_expr (command, i, len);
|
||||
}
|
||||
else if (!isdelim (command[i], delim))
|
||||
{
|
||||
while (i < len)
|
||||
{
|
||||
if (comment && strchr (comment, command[i]) != NULL)
|
||||
{
|
||||
size_t j;
|
||||
for (j = i + 1; j < len && command[j] != '\n'; j++)
|
||||
;
|
||||
*pstart = start;
|
||||
*pend = i;
|
||||
wsp->ws_endp = j;
|
||||
return i > start ? _WRDS_WORD : _WRDS_CONT;
|
||||
}
|
||||
|
||||
if (wsp->ws_flags & WRDSF_QUOTE)
|
||||
{
|
||||
if (command[i] == '\\')
|
||||
{
|
||||
if (++i == len)
|
||||
break;
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (command[i] == '\'' || command[i] == '"')
|
||||
{
|
||||
size_t j;
|
||||
for (j = i + 1; j < len && command[j] != command[i]; j++)
|
||||
if (command[j] == '\\')
|
||||
j++;
|
||||
if (j < len && command[j] == command[i])
|
||||
i = j + 1;
|
||||
else
|
||||
{
|
||||
wsp->ws_endp = i;
|
||||
if (wsp->ws_flags & WRDSF_SHOWERR)
|
||||
error (0, 0,
|
||||
_("missing closing %c (start near #%lu)"),
|
||||
command[i], (unsigned long) i);
|
||||
return WRDSE_QUOTE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (((wsp->ws_flags & WRDSF_WS) && isws (command[i]))
|
||||
|| isdelim (command[i], delim))
|
||||
break;
|
||||
else
|
||||
i++;
|
||||
}
|
||||
}
|
||||
else if (wsp->ws_flags & WRDSF_RETURN_DELIMS)
|
||||
i++;
|
||||
|
||||
*pstart = start;
|
||||
*pend = i;
|
||||
wsp->ws_endp = i;
|
||||
|
||||
return _WRDS_WORD;
|
||||
}
|
||||
|
||||
/* Escape translation table.  It consists of two-character entries: the
   letter that follows a backslash in the escaped form, then the character
   that this escape sequence stands for.  */
static char quote_transtab[] = "\\\\a\ab\bf\fn\nr\rt\tv\v";

/* Return the character denoted by the escape sequence "\C".  If C is not
   a known escape letter, return C itself.  */
int
wordsplit_unquote_char (int c)
{
  size_t k;

  /* Escape letters live at the even positions */
  for (k = 0; quote_transtab[k] != '\0'; k += 2)
    if (quote_transtab[k] == c)
      return quote_transtab[k + 1];

  return c;
}

/* Return the escape letter to be written after a backslash in order to
   represent character C, or -1 if C has no such representation.  */
int
wordsplit_quote_char (int c)
{
  size_t k;

  /* Raw characters live at the odd positions */
  for (k = 1; k < sizeof (quote_transtab) - 1; k += 2)
    if (quote_transtab[k] == c)
      return quote_transtab[k - 1];

  return -1;
}
|
||||
|
||||
/* Numeric value of the digit character C (decimal or hexadecimal, either
   case), or 255 if C is not a digit at all.  */
#define to_num(c) \
  (isdigit(c) ? (c) - '0' : (isxdigit(c) ? toupper(c) - 'A' + 10 : 255 ))

/* Convert at most CNT leading characters of SRC, taken as digits in the
   given BASE, to a number.  Conversion stops at the first character that
   is not ASCII or is not a valid digit in BASE.

   Store the value in *PVAL (0 if nothing was converted) and return the
   number of characters consumed.  */
static int
xtonum (int *pval, const char *src, int base, int cnt)
{
  const unsigned char *cur = (const unsigned char *) src;
  int consumed = 0;
  int acc = 0;

  while (consumed < cnt)
    {
      int digit = cur[consumed];

      if (digit > 127)
        break;
      digit = to_num (digit);
      if (digit >= base)
        break;

      acc = acc * base + digit;
      consumed++;
    }

  *pval = acc;
  return consumed;
}
|
||||
|
||||
/* Compute the number of bytes the quoted form of STR occupies, as
   produced by wordsplit_quote_copy with the same QUOTE_HEX.  The
   terminating NUL is not counted.

   Per input character the count is:
     1  for a space or any other printable character except '"', tab
        and backslash;
     2  for a double quote;
     3  for anything else when QUOTE_HEX is nonzero;
     2  for anything else that wordsplit_quote_char can express;
     4  otherwise.

   *QUOTE is set to 1 if STR contains a space or a double quote, and to
   0 otherwise.  */
size_t
wordsplit_quoted_length (const char *str, int quote_hex, int *quote)
{
  size_t len = 0;

  *quote = 0;
  for (; *str; str++)
    {
      if (*str == ' ')
        {
          len++;
          *quote = 1;
        }
      else if (*str == '"')
        {
          len += 2;
          *quote = 1;
        }
      /* The argument of isprint must be representable as unsigned char;
         passing a negative plain char is undefined behavior */
      else if (*str != '\t' && *str != '\\'
               && isprint ((unsigned char) *str))
        len++;
      else if (quote_hex)
        len += 3;
      else if (wordsplit_quote_char (*str) != -1)
        len += 2;
      else
        len += 4;
    }
  return len;
}
|
||||
|
||||
/* Copy the first N bytes of SRC to DST, removing quoting, and terminate
   DST with a NUL.  DST must have room for at least N + 1 bytes.

   The following transformations are applied:
     - a pair of matching quotes (' or ") is removed, provided that both
       quotes lie within the first N bytes; a quote without a match is
       copied verbatim, as is a quote of the other kind appearing inside
       a quoted part;
     - \xHH (one or two hex digits) and \OOO (one to three octal digits)
       are replaced with the character having that code;
     - \C, for any other C, is replaced as per wordsplit_unquote_char;
     - a backslash in the very last position is dropped.

   No byte at or beyond SRC[N] is examined.  */
void
wordsplit_unquote_copy (char *dst, const char *src, size_t n)
{
  size_t i = 0;
  int c;
  int expect_delim = 0;

  while (i < n)
    {
      switch (src[i])
        {
        case '\'':
        case '"':
          if (!expect_delim)
            {
              size_t j;

              /* Look for the closing quote, staying within N bytes.  A
                 backslash hides the character that follows it.  */
              for (j = i + 1; j < n && src[j] != src[i]; j++)
                if (src[j] == '\\')
                  j++;
              if (j < n)
                expect_delim = src[i++];
              else
                *dst++ = src[i++];
            }
          else if (expect_delim == src[i])
            {
              /* Closing quote: leave the quoted state, so that quotes
                 appearing later are handled afresh */
              expect_delim = 0;
              ++i;
            }
          else
            *dst++ = src[i++];
          break;

        case '\\':
          ++i;
          if (i >= n)
            /* Trailing backslash: there is nothing to escape */
            break;
          if (src[i] == 'x' || src[i] == 'X')
            {
              if (n - i < 2)
                {
                  *dst++ = '\\';
                  *dst++ = src[i++];
                }
              else
                {
                  /* Never parse more digits than are left in SRC */
                  int cnt = (n - i - 1 < 2) ? 1 : 2;
                  int off = xtonum (&c, src + i + 1, 16, cnt);
                  if (off == 0)
                    {
                      *dst++ = '\\';
                      *dst++ = src[i++];
                    }
                  else
                    {
                      *dst++ = c;
                      i += off + 1;
                    }
                }
            }
          else if ((unsigned char) src[i] < 128
                   && isdigit ((unsigned char) src[i]))
            {
              int cnt = (n - i < 3) ? (int) (n - i) : 3;
              int off = xtonum (&c, src + i, 8, cnt);
              if (off == 0)
                {
                  /* A decimal digit that is not octal (8 or 9) */
                  *dst++ = '\\';
                  *dst++ = src[i++];
                }
              else
                {
                  *dst++ = c;
                  i += off;
                }
            }
          else
            *dst++ = wordsplit_unquote_char (src[i++]);
          break;

        default:
          *dst++ = src[i++];
        }
    }
  *dst = 0;
}
|
||||
|
||||
/* Copy the NUL-terminated string SRC to DST, quoting it on the way.

   A double quote is prefixed with a backslash.  Other printable
   characters, except tab and backslash, are copied verbatim.  Each of the
   remaining characters is written as "%XX" (two uppercase hex digits) if
   QUOTE_HEX is nonzero, otherwise as a backslash followed by the letter
   returned by wordsplit_quote_char or, failing that, by three octal
   digits.

   DST is NOT NUL-terminated by this function.  The number of bytes
   written equals wordsplit_quoted_length (SRC, QUOTE_HEX, ...).  */
void
wordsplit_quote_copy (char *dst, const char *src, int quote_hex)
{
  for (; *src; src++)
    {
      if (*src == '"')
        {
          *dst++ = '\\';
          *dst++ = *src;
        }
      /* The argument of isprint must be representable as unsigned char;
         passing a negative plain char is undefined behavior */
      else if (*src != '\t' && *src != '\\'
               && isprint ((unsigned char) *src))
        *dst++ = *src;
      else
        {
          char tmp[4];

          if (quote_hex)
            {
              snprintf (tmp, sizeof tmp, "%%%02X", *(unsigned char*)src);
              memcpy (dst, tmp, 3);
              dst += 3;
            }
          else
            {
              int c = wordsplit_quote_char (*src);
              *dst++ = '\\';
              if (c != -1)
                *dst++ = c;
              else
                {
                  snprintf (tmp, sizeof tmp, "%03o", *(unsigned char*)src);
                  memcpy (dst, tmp, 3);
                  dst += 3;
                }
            }
        }
    }
}
|
||||
|
||||
int
|
||||
wordsplit_len (const char *command, size_t len, struct wordsplit *wsp,
|
||||
int flags)
|
||||
{
|
||||
int rc;
|
||||
size_t start = 0, end = 0;
|
||||
|
||||
rc = wordsplit_init (wsp, command, len, flags);
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
for (; (rc = scan_word (wsp, &start, &end)) > 0; start = skip_delim (wsp))
|
||||
{
|
||||
int unquote = 1;
|
||||
size_t n;
|
||||
char *p;
|
||||
|
||||
if (rc == _WRDS_CONT)
|
||||
continue;
|
||||
|
||||
if (alloc_space (wsp))
|
||||
return WRDSE_NOSPACE;
|
||||
|
||||
n = end - start;
|
||||
|
||||
if (wsp->ws_flags & WRDSF_QUOTE &&
|
||||
!(wsp->ws_flags & _ARGCV_WORD_SED_EXPR))
|
||||
{
|
||||
if (start < end
|
||||
&& (command[start] == '"' || command[start] == '\'')
|
||||
&& command[end-1] == command[start])
|
||||
{
|
||||
unquote = command[start] == '"';
|
||||
start++;
|
||||
n -= 2;
|
||||
}
|
||||
}
|
||||
else
|
||||
unquote = 0;
|
||||
|
||||
p = malloc (n + 1);
|
||||
if (!p)
|
||||
{
|
||||
if (wsp->ws_flags & WRDSF_ENOMEMABRT)
|
||||
xalloc_die ();
|
||||
if (wsp->ws_flags & WRDSF_SHOWERR)
|
||||
error (0, 0, _("memory exhausted"));
|
||||
if (!(wsp->ws_flags & WRDSF_REUSE))
|
||||
wordsplit_free (wsp);
|
||||
errno = ENOMEM;
|
||||
return WRDSE_NOSPACE;
|
||||
}
|
||||
|
||||
if (unquote)
|
||||
wordsplit_unquote_copy (p, &command[start], n);
|
||||
else
|
||||
{
|
||||
memcpy (p, &command[start], n);
|
||||
p[n] = 0;
|
||||
}
|
||||
wsp->ws_wordv[wsp->ws_offs + wsp->ws_wordc] = p;
|
||||
wsp->ws_wordc++;
|
||||
|
||||
;
|
||||
}
|
||||
if (alloc_space (wsp))
|
||||
return WRDSE_NOSPACE;
|
||||
wsp->ws_wordv[wsp->ws_offs + wsp->ws_wordc] = NULL;
|
||||
/* FIXME: if (rc) free(ws) */
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Split the NUL-terminated string COMMAND into words.  This is
   wordsplit_len applied to the entire string; see there for the meaning
   of WS, FLAGS and the return value.  */
int
wordsplit (const char *command, struct wordsplit *ws, int flags)
{
  size_t length = strlen (command);

  return wordsplit_len (command, length, ws, flags);
}
|
||||
|
||||
void
|
||||
wordsplit_free (struct wordsplit *ws)
|
||||
{
|
||||
free (ws->ws_wordv);
|
||||
ws->ws_wordv = NULL;
|
||||
}
|
||||
|
||||
|
||||
|
88
wordsplit.h
Normal file
88
wordsplit.h
Normal file
|
@ -0,0 +1,88 @@
|
|||
/* wordsplit - a word splitter
|
||||
Copyright (C) 2009 Sergey Poznyakoff
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by the
|
||||
Free Software Foundation; either version 3 of the License, or (at your
|
||||
option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program. If not, see <http://www.gnu.org/licenses/>. */
|
||||
|
||||
/* State and result of a word splitting operation.  */
struct wordsplit
{
  /* Result */
  size_t ws_wordc;         /* Number of words stored in ws_wordv */
  char **ws_wordv;         /* Word vector; a NULL follows the last word */
  size_t ws_offs;          /* Number of slots to skip at the start of
                              ws_wordv; used only with WRDSF_DOOFFS */
  size_t ws_wordn;         /* Number of slots allocated for ws_wordv */

  /* Settings */
  int ws_flags;            /* WRDSF_ flags controlling the operation */
  const char *ws_delim;    /* Word delimiters; a single space unless
                              WRDSF_DELIM is set */
  const char *ws_comment;  /* Characters that start a comment extending
                              to the end of line; used only with
                              WRDSF_COMMENT */

  /* Scanner state */
  const char *ws_input;    /* Text being split */
  size_t ws_len;           /* Length of ws_input, in bytes */
  size_t ws_endp;          /* Offset in ws_input where the last scan
                              stopped */
};
|
||||
|
||||
/* Append the words found to the array resulting from a previous
|
||||
call. */
|
||||
#define WRDSF_APPEND 0x0001
|
||||
/* Insert ws_offs initial NULLs in the array ws_wordv.
|
||||
(These are not counted in the returned ws_wordc.) */
|
||||
#define WRDSF_DOOFFS 0x0002
|
||||
/* Don't do command substitution. Reserved for future use. */
|
||||
#define WRDSF_NOCMD 0x0004
|
||||
/* The parameter p resulted from a previous call to
|
||||
wordsplit(), and wordsplit_free() was not called. Reuse the
|
||||
allocated storage. */
|
||||
#define WRDSF_REUSE 0x0008
|
||||
/* Print errors */
|
||||
#define WRDSF_SHOWERR 0x0010
|
||||
/* Consider it an error if an undefined shell variable
|
||||
is expanded. */
|
||||
#define WRDSF_UNDEF 0x0020
|
||||
|
||||
/* Don't do variable expansion. Reserved for future use. */
|
||||
#define WRDSF_NOVAR 0x0040
|
||||
/* Abort on ENOMEM error */
|
||||
#define WRDSF_ENOMEMABRT 0x0080
|
||||
/* Treat whitespace as delimiters */
|
||||
#define WRDSF_WS 0x0100
|
||||
/* Handle quotes and escape directives */
|
||||
#define WRDSF_QUOTE 0x0200
|
||||
/* Replace each input sequence of repeated delimiters with a single
|
||||
delimiter */
|
||||
#define WRDSF_SQUEEZE_DELIMS 0x0400
|
||||
/* Return delimiters */
|
||||
#define WRDSF_RETURN_DELIMS 0x0800
|
||||
/* Treat sed expressions as words */
|
||||
#define WRDSF_SED_EXPR 0x1000
|
||||
/* ws_delim field is initialized */
|
||||
#define WRDSF_DELIM 0x2000
|
||||
/* ws_comment field is initialized */
|
||||
#define WRDSF_COMMENT 0x4000
|
||||
|
||||
#define WRDSF_DEFFLAGS \
|
||||
(WRDSF_NOVAR | WRDSF_NOCMD | \
|
||||
WRDSF_WS | WRDSF_QUOTE | WRDSF_SQUEEZE_DELIMS)
|
||||
|
||||
#define WRDSE_EOF 0
|
||||
#define WRDSE_QUOTE 1
|
||||
#define WRDSE_NOSPACE 2
|
||||
|
||||
int wordsplit (const char *s, struct wordsplit *p, int flags);
|
||||
void wordsplit_free (struct wordsplit *p);
|
||||
|
||||
int wordsplit_unquote_char (int c);
|
||||
int wordsplit_quote_char (int c);
|
||||
size_t wordsplit_quoted_length (const char *str, int quote_hex, int *quote);
|
||||
void wordsplit_unquote_copy (char *dst, const char *src, size_t n);
|
||||
void wordsplit_quote_copy (char *dst, const char *src, int quote_hex);
|
||||
|
||||
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue