phobos/regexp.d
2007-09-10 02:16:36 +00:00

2421 lines
47 KiB
D

// Regular Expressions
// Copyright (c) 2000-2001 by Digital Mars
// All Rights Reserved
// Written by Walter Bright
// www.digitalmars.com
/*
Escape sequences:
\nnn starts out a 1, 2 or 3 digit octal sequence,
where n is an octal digit. If nnn is larger than
0377, then the 3rd digit is not part of the sequence
and is not consumed.
For maximal portability, use exactly 3 digits.
\xXX starts out a 1 or 2 digit hex sequence. X
is a hex character. If the first character after the \x
is not a hex character, the value of the sequence is 'x'
and the XX are not consumed.
For maximal portability, use exactly 2 digits.
\uUUUU is a unicode sequence. There are exactly
4 hex characters after the \u, if any are not, then
the value of the sequence is 'u', and the UUUU are not
consumed.
Character classes:
[a-b], where a is greater than b, will produce
an error.
*/
//debug = regexp; // uncomment to turn on debugging printf's
import c.stdio;
import ctype;
import outbuffer;
/************************************
* Exception type for regular expression failures.
* RegExp.error() constructs and throws one of these; compile() and the
* parse routines report every pattern problem through error().
*/
class RegExpError : Error
{
// msg: text describing the problem, forwarded unchanged to Error's constructor
this(char[] msg)
{
super(msg);
}
}
// Half-open index range [rm_so, rm_eo) into the searched string for one match
// or one parenthesized subexpression. test() sets both fields to -1 before
// every match attempt, so -1 marks a subexpression that did not participate.
struct regmatch_t
{
int rm_so; // index of start of match
int rm_eo; // index past end of match
}
alias char tchar; // so we can make a wchar version
class RegExp
{
// Construct a RegExp by compiling pattern[] with the given attribute letters
// (see compile() for the letters accepted). Throws RegExpError if compile()
// reports an error.
public this(tchar[] pattern, tchar[] attributes)
{
// Until a pattern needs more than one entry, pmatch is a 1-element view
// of gmatch, so no separate allocation is made here.
pmatch = (&gmatch)[0 .. 1];
compile(pattern, attributes);
}
// State visible to callers. pmatch[0] describes the whole match and
// pmatch[n + 1] the n'th parenthesized subexpression (see REparen in trymatch()).
uint re_nsub; // number of parenthesized subexpression matches
regmatch_t[] pmatch; // array [re_nsub + 1]
tchar[] input; // the string to search
// per instance:
tchar[] pattern; // source text of the regular expression
tchar[] flags; // source text of the attributes parameter
int errors; // reset to 0 by compile(), incremented by error()
uint attributes; // bitwise OR of REA values, built by compile()
// Bit flags stored in the attributes field.
// compile() maps 'g', 'i' and 'm' to the first three; no attribute letter
// handled by compile() sets dotmatchlf.
enum REA
{
global = 1, // has the g attribute
ignoreCase = 2, // has the i attribute
multiline = 4, // if treat as multiple lines separated
// by newlines, or as a single line
dotmatchlf = 8, // if . matches \n
}
// Everything below is private unless explicitly marked public.
private:
uint src; // current source index in input[]
uint src_start; // starting index for match in input[]
uint p; // position of parser in pattern[]
regmatch_t gmatch; // match for the entire regular expression
// (serves as storage for pmatch[0])
ubyte[] program; // pattern[] compiled into regular expression program
OutBuffer buf; // code buffer the parse routines emit into; compile() creates and deletes it
/******************************************/
// Opcodes
// Each instruction in program[] is one opcode byte followed by its operands.
// Operand layouts, as read by trymatch() and printProgram():
//   REchar, REichar             one char
//   REwchar, REiwchar           one wchar
//   REstring                    uint length, then that many characters
//   REtestbit, REbit, REnotbit  ushort max character, ushort bitmap length
//                               in bytes, then the bitmap
//   RErange, REnotrange         uint length, then that many characters
//   REor                        uint length of the first alternative
//   REgoto                      uint number of bytes to skip
//   REnm, REnmq                 uint len, uint n, uint m, then len bytes of
//                               sub-program
//   REparen                     uint len, uint n, then len bytes of sub-program
//   REbackref                   one byte, subexpression index
// All other opcodes have no operands. The numeric values follow declaration
// order, so entries must not be reordered.
enum : ubyte
{
REend, // end of program
REchar, // single character
REichar, // single character, case insensitive
REwchar, // single wide character
REiwchar, // single wide character, case insensitive
REanychar, // any character
REanystar, // ".*"
REstring, // string of characters
REtestbit, // any in bitmap, non-consuming
REbit, // any in the bit map
REnotbit, // any not in the bit map
RErange, // any in the string
REnotrange, // any not in the string
REor, // a | b
REplus, // 1 or more
REstar, // 0 or more
REquest, // 0 or 1
REnm, // n..m
REnmq, // n..m, non-greedy version
REbol, // beginning of line
REeol, // end of line
REparen, // parenthesized subexpression
REgoto, // goto offset
REwordboundary, // \b
REnotwordboundary, // \B
REdigit, // \d
REnotdigit, // \D
REspace, // \s
REnotspace, // \S
REword, // \w
REnotword, // \W
REbackref, // \1 .. \9
};
// Word-character predicate used by \w, \W, \b and \B: non-zero for an
// underscore or anything isalnum() accepts.
// BUG: should this include '$'?
private int isword(tchar c)
{
    return c == '_' || isalnum(c);
}
// Upper repeat bound meaning "unbounded"; parsePiece() uses it as m for *, + and {n,}
private uint inf = ~0u;
/*********************************
 * Compile pattern[] into program[], using the attribute letters in
 * attributes[]: 'g' (global), 'i' (ignore case), 'm' (multiline).
 * Also resets the match state: input becomes null and pmatch[0] becomes 0..0.
 * Throws RegExpError on error (an unknown or repeated attribute letter, or
 * anything reported by the parse routines).
 */
void compile(tchar[] pattern, tchar[] attributes)
{
    //printf("RegExp.compile('%.*s', '%.*s')\n", pattern, attributes);

    // Translate the attribute letters into REA flag bits
    this.attributes = 0;
    for (uint k = 0; k < attributes.length; k++)
    {
        REA flag;

        switch (attributes[k])
        {
            case 'g':
                flag = REA.global;
                break;
            case 'i':
                flag = REA.ignoreCase;
                break;
            case 'm':
                flag = REA.multiline;
                break;
            default:
                error("unrecognized attribute");
                return;
        }
        if (this.attributes & flag)
        {
            error("redundant attribute");
            return;
        }
        this.attributes |= flag;
    }

    // Reset the per-pattern state, remembering the previous group count so
    // pmatch[] is only regrown when the new pattern needs more entries
    uint prevNsub = re_nsub;
    input = null;
    this.pattern = pattern;
    this.flags = attributes;
    re_nsub = 0;
    errors = 0;
    p = 0;

    // Parse into a scratch buffer, then take ownership of its bytes
    buf = new OutBuffer();
    buf.reserve(pattern.length * 8);
    parseRegexp();
    if (p < pattern.length)
        error("unmatched ')'");
    optimize();
    program = buf.data;
    buf.data = null;
    delete buf;

    if (re_nsub > prevNsub)
    {
        // pmatch may still be the 1-element view of gmatch set up by the
        // constructor; drop that view before resizing
        if (pmatch === &gmatch)
            pmatch = null;
        pmatch.length = re_nsub + 1;
    }
    pmatch[0].rm_so = 0;
    pmatch[0].rm_eo = 0;
}
/********************************************
 * Split string[] into an array of strings, using the regular expression as
 * the separator. After each separator, one entry per parenthesized
 * subexpression is appended as well (an empty slice when the subexpression
 * is empty or unset).
 * Returns:
 *	array of slices into string[]
 */
public tchar[][] split(tchar[] string)
{
    debug(regexp) printf("regexp.split()\n");

    tchar[][] pieces;

    if (!string.length)
    {
        // Empty input yields itself as the only piece, unless the pattern
        // matches it
        if (!test(string))
            pieces ~= string;
        return pieces;
    }

    int start = 0;	// beginning of the piece currently being collected
    int scan = 0;	// where the next search begins

    while (scan != string.length)
    {
        if (test(string, scan))
        {
            int matchEnd = pmatch[0].rm_eo;

            scan = pmatch[0].rm_so;
            if (matchEnd != start)
            {
                pieces ~= string[start .. scan];
                for (int g = 1; g < pmatch.length; g++)
                {
                    int so = pmatch[g].rm_so;
                    int eo = pmatch[g].rm_eo;

                    if (so == eo)
                        so = eo = 0;	// -1 gives array bounds error
                    pieces ~= string[so .. eo];
                }
                start = matchEnd;
                scan = matchEnd;
                continue;
            }
        }
        scan++;
    }
    pieces ~= string[start .. string.length];
    return pieces;
}
// Unit test for split(): a non-greedy and a greedy star separator, then a
// tag-like pattern with two subexpressions whose captures are interleaved
// with the pieces in the result.
unittest
{
debug(regexp) printf("regexp.split.unittest()\n");
RegExp r = new RegExp("a*?", null);
tchar[][] result;
tchar[] j;
int i;
result = r.split("ab");
assert(result.length == 2);
i = string.cmp(result[0], "a");
assert(i == 0);
i = string.cmp(result[1], "b");
assert(i == 0);
r = new RegExp("a*", null);
result = r.split("ab");
assert(result.length == 2);
i = string.cmp(result[0], "");
assert(i == 0);
i = string.cmp(result[1], "b");
assert(i == 0);
r = new RegExp('<(\/)?([^<>]+)>', null);
result = r.split("a<b>font</b>bar<TAG>hello</TAG>");
for (i = 0; i < result.length; i++)
{
//debug(regexp) printf("result[%d] = '%.*s'\n", i, result[i]);
}
// Joining with ',' makes the whole result comparable as one string
j = join(result, ",");
//printf("j = '%.*s'\n", j);
i = string.cmp(j, "a,,b,font,/,b,bar,,TAG,hello,/,TAG,");
assert(i == 0);
}
/*************************************************
 * Search string[] for a match, using test(string).
 * Returns:
 *	>=0	index of the start of the match
 *	-1	no match
 */
public int search(tchar[] string)
{
    if (test(string))
        return pmatch[0].rm_so;
    return -1;		// no match
}
// Unit test for search(): one string that contains the pattern and one that does not.
unittest
{
debug(regexp) printf("regexp.search.unittest()\n");
int i;
RegExp r = new RegExp("abc", null);
i = r.search("xabcy");
assert(i == 1);
i = r.search("cba");
assert(i == -1);
}
/*************************************************
 * Search string[] for match.
 * Returns:
 *	if global, an array holding the text of every match, found by
 *	searching repeatedly from the end of the previous match
 *	if not global, the same value as exec(string)
 */
public tchar[][] match(tchar[] string)
{
    if (!(attributes & REA.global))
        return exec(string);

    tchar[][] found;
    int next = 0;

    while (test(string, next))
    {
        int stop = pmatch[0].rm_eo;

        found ~= input[pmatch[0].rm_so .. stop];
        if (next == stop)
            next++;		// always consume some source
        else
            next = stop;
    }
    return found;
}
// Unit test for match(): the same pattern without and with the g attribute.
unittest
{
debug(regexp) printf("regexp.match.unittest()\n");
int i;
tchar[][] result;
tchar[] j;
RegExp r;
// Without g only the first match is reported
r = new RegExp("a[bc]", null);
result = r.match("1ab2ac3");
j = join(result, ",");
i = string.cmp(j, "ab");
assert(i == 0);
// With g every match is reported
r = new RegExp("a[bc]", "g");
result = r.match("1ab2ac3");
j = join(result, ",");
i = string.cmp(j, "ab,ac");
assert(i == 0);
}
/*************************************************
 * Find regular expression matches in string[]. Replace those matches
 * with a new string composed of format[] merged with the result of the
 * matches (the merging is done by replace(format), defined elsewhere in
 * this class).
 * If global, replace all matches. Otherwise, replace first match.
 * Return the new string.
 */
public tchar[] replace(tchar[] string, tchar[] format)
{
    tchar[] output = string;
    int next = 0;	// where the next search in string[] starts
    int shift = 0;	// how far match positions have moved in output[]
			// relative to string[] because of earlier replacements

    while (test(string, next))
    {
        int so = pmatch[0].rm_so;
        int eo = pmatch[0].rm_eo;
        tchar[] replacement = replace(format);

        output = replaceSlice(output, output[shift + so .. shift + eo], replacement);
        if (!(attributes & REA.global))
            break;

        shift += replacement.length - (eo - so);
        if (next == eo)
            next++;		// always consume some source
        else
            next = eo;
    }
    return output;
}
// Unit test for replace(): global replacement with a format that embeds the
// matched text via $&.
unittest
{
debug(regexp) printf("regexp.replace.unittest()\n");
int i;
tchar[] result;
RegExp r;
r = new RegExp("a[bc]", "g");
result = r.replace("1ab2ac3", "x$&y");
i = string.cmp(result, "1xaby2xacy3");
assert(i == 0);
}
/*************************************************
 * Search string[] for match, starting at the beginning of string[].
 * Subsequent calls to exec() (no argument) continue from the end of the
 * match found here.
 * Returns:
 *	array of slices into string[] representing matches,
 *	null if there is no match
 */
public tchar[][] exec(tchar[] string)
{
    input = string;

    // exec() resumes at pmatch[0].rm_eo. Without this reset a new string
    // would be searched from wherever the previous search ended, or from -1
    // (an immediate failure) if the previous search found nothing.
    pmatch[0].rm_so = 0;
    pmatch[0].rm_eo = 0;
    return exec();
}
/*************************************************
 * Search input[] for the next match, continuing from the end of the
 * previous one (see test()).
 * Returns:
 *	array of slices into input[]: entry 0 is the whole match, the
 *	remaining entries are the parenthesized subexpressions; an
 *	entry whose start and end coincide is null
 *	null if there is no match
 */
public tchar[][] exec()
{
    if (!test())
        return null;

    tchar[][] slices;

    slices = new tchar[][pmatch.length];
    for (int k = 0; k < pmatch.length; k++)
    {
        int so = pmatch[k].rm_so;
        int eo = pmatch[k].rm_eo;

        if (so != eo)
            slices[k] = input[so .. eo];
        else
            slices[k] = null;
    }
    return slices;
}
/************************************************
 * Search string[] for match, starting at the beginning of string[].
 * Returns:
 *	0	no match
 *	!=0	match
 */
public int test(tchar[] string)
{
    // Start at index 0 rather than at pmatch[0].rm_eo. That field belongs to
    // the previous search: it holds the end of the last match (which may lie
    // past the end of this string), or -1 after a failed search, in which
    // case the next search would fail without looking at string[] at all.
    return test(string, 0);
}
/************************************************
* Pick up where last test() left off, and search again.
* The search runs over input[] and starts at pmatch[0].rm_eo, the end of
* the previous match.
* Returns:
* 0 no match
* !=0 match
*/
public int test()
{
return test(input, pmatch[0].rm_eo);
}
/************************************************
* Test input[] starting at startindex against compiled in pattern[].
* Tries successive start positions from startindex up to and including
* input.length. On success pmatch[0] holds the bounds of the match and
* the other pmatch[] entries hold the subexpressions. If the search loop
* runs and finds nothing, pmatch[0 .. re_nsub + 1] are left at -1.
* Returns:
* 0 no match
* !=0 match
*/
int test(char[] string, int startindex)
{
tchar firstc;
uint si;
input = string;
debug (regexp) printf("RegExp.test(input[] = '%.*s', startindex = %d)\n", input, startindex);
pmatch[0].rm_so = 0;
pmatch[0].rm_eo = 0;
if (startindex < 0 || startindex > input.length)
{
return 0; // fail
}
debug(regexp) printProgram(program);
// First character optimization
// If the program begins with a literal character, start positions that
// do not hold that character can be skipped without running trymatch().
// firstc == 0 means the optimization is off.
firstc = 0;
if (program[0] == REchar)
{
firstc = program[1];
if (attributes & REA.ignoreCase && isalpha(firstc))
firstc = 0;
}
for (si = startindex; ; si++)
{
if (firstc)
{
if (si == input.length)
break; // no match
if (input[si] != firstc)
{
// chr() advances si to the next occurrence of firstc
si++;
if (!chr(si, firstc)) // if first character not found
break; // no match
}
}
// Mark every match entry as unset before this attempt
for (int i = 0; i < re_nsub + 1; i++)
{
pmatch[i].rm_so = -1;
pmatch[i].rm_eo = -1;
}
src_start = src = si;
if (trymatch(0, program.length))
{
// trymatch() leaves src just past the matched text
pmatch[0].rm_so = si;
pmatch[0].rm_eo = src;
debug(regexp) printf("start = %d, end = %d\n", gmatch.rm_so, gmatch.rm_eo);
return 1;
}
// If possible match must start at beginning, we are done
if (program[0] == REbol || program[0] == REanystar)
{
if (attributes & REA.multiline)
{
// Scan for the next \n
// (the loop's si++ then moves to the character after it)
if (!chr(si, \n))
break; // no match if '\n' not found
}
else
break;
}
// The empty position at the end of input has been tried too
if (si == input.length)
break;
debug(regexp) printf("Starting new try: '%.*s'\n", input[si + 1 .. input.length]);
}
return 0; // no match
}
// Advance si through input[] until it indexes an occurrence of c.
// Returns 1 with si at that occurrence, or 0 with si == input.length
// (si is left alone if it was already at or past the end).
int chr(inout uint si, tchar c)
{
    while (si < input.length)
    {
        if (input[si] == c)
            return 1;
        si++;
    }
    return 0;
}
// Debugging aid: print a disassembly of the compiled program prog[], one
// instruction per line, each prefixed with its offset. For instructions
// that carry a length, "pc=>" shows the offset just past the code they
// cover. The body is compiled only under debug(regexp); otherwise this
// function does nothing.
void printProgram(ubyte[] prog)
{
    debug(regexp)
    {
        uint pc;
        uint len;
        uint n;
        uint m;
        ushort *pu;
        uint *puint;

        printf("printProgram()\n");
        for (pc = 0; pc < prog.length; )
        {
            printf("%3d: ", pc);
            //printf("prog[pc] = %d, REchar = %d, REnmq = %d\n", prog[pc], REchar, REnmq);
            switch (prog[pc])
            {
                case REchar:
                    printf("\tREchar '%c'\n", prog[pc + 1]);
                    pc += 1 + char.size;
                    break;

                case REichar:
                    printf("\tREichar '%c'\n", prog[pc + 1]);
                    pc += 1 + char.size;
                    break;

                case REwchar:
                    printf("\tREwchar '%c'\n", *(wchar *)&prog[pc + 1]);
                    pc += 1 + wchar.size;
                    break;

                case REiwchar:
                    printf("\tREiwchar '%c'\n", *(wchar *)&prog[pc + 1]);
                    pc += 1 + wchar.size;
                    break;

                case REanychar:
                    printf("\tREanychar\n");
                    pc++;
                    break;

                case REstring:
                    len = *(uint *)&prog[pc + 1];
                    printf("\tREstring x%x, '%.*s'\n", len,
                        (&prog[pc + 1 + uint.size])[0 .. len]);
                    pc += 1 + uint.size + len * tchar.size;
                    break;

                case REtestbit:
                    pu = (ushort *)&prog[pc + 1];
                    printf("\tREtestbit %d, %d\n", pu[0], pu[1]);
                    len = pu[1];
                    pc += 1 + 2 * ushort.size + len;
                    break;

                case REbit:
                    pu = (ushort *)&prog[pc + 1];
                    len = pu[1];
                    printf("\tREbit cmax=%02x, len=%d:", pu[0], len);
                    for (n = 0; n < len; n++)
                        printf(" %02x", prog[pc + 1 + 2 * ushort.size + n]);
                    printf("\n");
                    pc += 1 + 2 * ushort.size + len;
                    break;

                case REnotbit:
                    pu = (ushort *)&prog[pc + 1];
                    printf("\tREnotbit %d, %d\n", pu[0], pu[1]);
                    len = pu[1];
                    pc += 1 + 2 * ushort.size + len;
                    break;

                case RErange:
                    len = *(uint *)&prog[pc + 1];
                    printf("\tRErange %d\n", len);
                    // BUG: REAignoreCase?
                    pc += 1 + uint.size + len;
                    break;

                case REnotrange:
                    len = *(uint *)&prog[pc + 1];
                    printf("\tREnotrange %d\n", len);
                    // BUG: REAignoreCase?
                    pc += 1 + uint.size + len;
                    break;

                case REbol:
                    printf("\tREbol\n");
                    pc++;
                    break;

                case REeol:
                    printf("\tREeol\n");
                    pc++;
                    break;

                case REor:
                    len = *(uint *)&prog[pc + 1];
                    printf("\tREor %d, pc=>%d\n", len, pc + 1 + uint.size + len);
                    pc += 1 + uint.size;
                    break;

                case REgoto:
                    len = *(uint *)&prog[pc + 1];
                    printf("\tREgoto %d, pc=>%d\n", len, pc + 1 + uint.size + len);
                    pc += 1 + uint.size;
                    break;

                case REanystar:
                    printf("\tREanystar\n");
                    pc++;
                    break;

                case REnm:
                case REnmq:
                    // len, n, m, ()
                    puint = (uint *)&prog[pc + 1];
                    len = puint[0];
                    n = puint[1];
                    m = puint[2];
                    printf("\tREnm%.*s len=%d, n=%u, m=%u, pc=>%d\n",
                        (prog[pc] == REnmq) ? "q" : " ",
                        len, n, m, pc + 1 + uint.size * 3 + len);
                    pc += 1 + uint.size * 3;
                    break;

                case REparen:
                    // len, n, ()
                    puint = (uint *)&prog[pc + 1];
                    len = puint[0];
                    n = puint[1];
                    printf("\tREparen len=%d n=%d, pc=>%d\n", len, n, pc + 1 + uint.size * 2 + len);
                    pc += 1 + uint.size * 2;
                    break;

                case REend:
                    printf("\tREend\n");
                    return;

                case REwordboundary:
                    printf("\tREwordboundary\n");
                    pc++;
                    break;

                case REnotwordboundary:
                    printf("\tREnotwordboundary\n");
                    pc++;
                    break;

                case REdigit:
                    printf("\tREdigit\n");
                    pc++;
                    break;

                case REnotdigit:
                    printf("\tREnotdigit\n");
                    pc++;
                    break;

                case REspace:
                    printf("\tREspace\n");
                    pc++;
                    break;

                case REnotspace:
                    printf("\tREnotspace\n");
                    pc++;
                    break;

                case REword:
                    printf("\tREword\n");
                    pc++;
                    break;

                case REnotword:
                    printf("\tREnotword\n");
                    pc++;
                    break;

                case REbackref:
                    // The operand is the byte after this opcode, not the
                    // second byte of the whole program
                    printf("\tREbackref %d\n", prog[pc + 1]);
                    pc += 2;
                    break;

                default:
                    assert(0);
            }
        }
    }
}
/**************************************************
 * Match input[], starting at src, against the section program[pc .. pcend].
 * Recurses for alternation, repetition and parenthesized subexpressions.
 * On success src is left just past the text that was consumed; on failure
 * src is restored to its value on entry. pmatch[n + 1] is filled in for
 * every REparen n that matches along the way.
 * Returns:
 *	1 if successful match
 *	0 no match
 */
int trymatch(int pc, int pcend)
{
    int srcsave;
    uint len;
    uint n;
    uint m;
    uint count;
    uint pop;
    uint ss;
    regmatch_t *psave;
    uint c1;
    uint c2;
    ushort* pu;
    uint* puint;

    debug(regexp)
        printf("RegExp.trymatch(pc = %d, src = '%.*s', pcend = %d)\n",
            pc, input[src .. input.length], pcend);
    srcsave = src;
    psave = null;
    for (;;)
    {
        if (pc == pcend)		// if done matching
        {
            debug(regexp) printf("\tprogend\n");
            return 1;
        }

        //printf("\top = %d\n", program[pc]);
        switch (program[pc])
        {
            case REchar:
                if (src == input.length)
                    goto Lnomatch;
                debug(regexp) printf("\tREchar '%c', src = '%c'\n", program[pc + 1], input[src]);
                if (program[pc + 1] != input[src])
                    goto Lnomatch;
                src++;
                pc += 1 + char.size;
                break;

            case REichar:
                if (src == input.length)
                    goto Lnomatch;
                debug(regexp) printf("\tREichar '%c', src = '%c'\n", program[pc + 1], input[src]);
                // The operand was upper-cased by parseAtom(), so only the
                // input character needs folding
                c1 = program[pc + 1];
                c2 = input[src];
                if (c1 != c2)
                {
                    if (islower((tchar)c2))
                        c2 = ctype.toupper((tchar)c2);
                    else
                        goto Lnomatch;
                    if (c1 != c2)
                        goto Lnomatch;
                }
                src++;
                pc += 1 + char.size;
                break;

            case REwchar:
                // End-of-input is checked before the trace so that the
                // trace never indexes input[] out of bounds
                if (src == input.length)
                    goto Lnomatch;
                debug(regexp) printf("\tREwchar '%c', src = '%c'\n", *((wchar *)&program[pc + 1]), input[src]);
                if (*((wchar *)&program[pc + 1]) != input[src])
                    goto Lnomatch;
                src++;
                pc += 1 + wchar.size;
                break;

            case REiwchar:
                if (src == input.length)
                    goto Lnomatch;
                debug(regexp) printf("\tREiwchar '%c', src = '%c'\n", *((wchar *)&program[pc + 1]), input[src]);
                c1 = *((wchar *)&program[pc + 1]);
                c2 = input[src];
                if (c1 != c2)
                {
                    if (islower(cast(tchar)c2))
                        c2 = ctype.toupper(cast(tchar)c2);
                    else
                        goto Lnomatch;
                    if (c1 != c2)
                        goto Lnomatch;
                }
                src++;
                pc += 1 + wchar.size;
                break;

            case REanychar:
                debug(regexp) printf("\tREanychar\n");
                if (src == input.length)
                    goto Lnomatch;
                if (!(attributes & REA.dotmatchlf) && input[src] == (tchar)\n)
                    goto Lnomatch;
                src++;
                pc++;
                break;

            case REstring:
                len = *(uint *)&program[pc + 1];
                debug(regexp) printf("\tREstring x%x, '%.*s'\n", len,
                    (&program[pc + 1 + uint.size])[0 .. len]);
                if (src + len > input.length)
                    goto Lnomatch;
                if (memcmp(&program[pc + 1 + uint.size], &input[src], len * tchar.size))
                    goto Lnomatch;
                src += len;
                pc += 1 + uint.size + len * tchar.size;
                break;

            case REtestbit:
                // Like REbit, but does not consume the character
                pu = ((ushort *)&program[pc + 1]);
                if (src == input.length)
                    goto Lnomatch;
                debug(regexp) printf("\tREtestbit %d, %d, '%c', x%02x\n",
                    pu[0], pu[1], input[src], input[src]);
                len = pu[1];
                c1 = input[src];
                //printf("[x%02x]=x%02x, x%02x\n", c1 >> 3, ((&program[pc + 1 + 4])[c1 >> 3] ), (1 << (c1 & 7)));
                if (c1 <= pu[0] &&
                    !((&(program[pc + 1 + 4]))[c1 >> 3] & (1 << (c1 & 7))))
                    goto Lnomatch;
                pc += 1 + 2 * ushort.size + len;
                break;

            case REbit:
                pu = ((ushort *)&program[pc + 1]);
                if (src == input.length)
                    goto Lnomatch;
                debug(regexp) printf("\tREbit %d, %d, '%c'\n",
                    pu[0], pu[1], input[src]);
                len = pu[1];
                c1 = input[src];
                if (c1 > pu[0])
                    goto Lnomatch;
                if (!((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7))))
                    goto Lnomatch;
                src++;
                pc += 1 + 2 * ushort.size + len;
                break;

            case REnotbit:
                pu = ((ushort *)&program[pc + 1]);
                if (src == input.length)
                    goto Lnomatch;
                debug(regexp) printf("\tREnotbit %d, %d, '%c'\n",
                    pu[0], pu[1], input[src]);
                len = pu[1];
                c1 = input[src];
                if (c1 <= pu[0] &&
                    ((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7))))
                    goto Lnomatch;
                src++;
                pc += 1 + 2 * ushort.size + len;
                break;

            case RErange:
                len = *(uint *)&program[pc + 1];
                debug(regexp) printf("\tRErange %d\n", len);
                if (src == input.length)
                    goto Lnomatch;
                // BUG: REA.ignoreCase?
                if (memchr((char*)&program[pc + 1 + uint.size], input[src], len) == null)
                    goto Lnomatch;
                src++;
                pc += 1 + uint.size + len;
                break;

            case REnotrange:
                len = *(uint *)&program[pc + 1];
                debug(regexp) printf("\tREnotrange %d\n", len);
                if (src == input.length)
                    goto Lnomatch;
                // BUG: REA.ignoreCase?
                if (memchr((char*)&program[pc + 1 + uint.size], input[src], len) != null)
                    goto Lnomatch;
                src++;
                pc += 1 + uint.size + len;
                break;

            case REbol:
                debug(regexp) printf("\tREbol\n");
                if (src == 0)
                {
                }
                else if (attributes & REA.multiline)
                {
                    if (input[src - 1] != \n)
                        goto Lnomatch;
                }
                else
                    goto Lnomatch;
                pc++;
                break;

            case REeol:
                debug(regexp) printf("\tREeol\n");
                if (src == input.length)
                {
                }
                else if (attributes & REA.multiline && input[src] == \n)
                    src++;	// note: the newline itself is consumed
                else
                    goto Lnomatch;
                pc++;
                break;

            case REor:
                len = ((uint *)&program[pc + 1])[0];
                debug(regexp) printf("\tREor %d\n", len);
                pop = pc + 1 + uint.size;
                ss = src;
                if (trymatch(pop, pcend))
                {
                    if (pcend != program.length)
                    {
                        int s;

                        s = src;
                        if (trymatch(pcend, program.length))
                        {
                            debug(regexp) printf("\tfirst operand matched\n");
                            src = s;
                            return 1;
                        }
                        else
                        {
                            // If second branch doesn't match to end, take first anyway
                            src = ss;
                            if (!trymatch(pop + len, program.length))
                            {
                                debug(regexp) printf("\tfirst operand matched\n");
                                src = s;
                                return 1;
                            }
                        }
                        src = ss;
                    }
                    else
                    {
                        debug(regexp) printf("\tfirst operand matched\n");
                        return 1;
                    }
                }
                pc = pop + len;		// proceed with 2nd branch
                break;

            case REgoto:
                debug(regexp) printf("\tREgoto\n");
                len = ((uint *)&program[pc + 1])[0];
                pc += 1 + uint.size + len;
                break;

            case REanystar:
                debug(regexp) printf("\tREanystar\n");
                pc++;
                for (;;)
                {
                    int s1;
                    int s2;

                    s1 = src;
                    if (src == input.length)
                        break;
                    if (!(attributes & REA.dotmatchlf) && input[src] == \n)
                        break;
                    src++;
                    s2 = src;
                    // If no match after consumption, but it
                    // did match before, then no match
                    if (!trymatch(pc, program.length))
                    {
                        src = s1;
                        // BUG: should we save/restore pmatch[]?
                        if (trymatch(pc, program.length))
                        {
                            src = s1;		// no match
                            break;
                        }
                    }
                    src = s2;
                }
                break;

            case REnm:
            case REnmq:
                // len, n, m, ()
                puint = (uint *)&program[pc + 1];
                len = puint[0];
                n = puint[1];
                m = puint[2];
                debug(regexp) printf("\tREnm%s len=%d, n=%u, m=%u\n", (program[pc] == REnmq) ? (char*)"q" : (char*)"", len, n, m);
                pop = pc + 1 + uint.size * 3;
                // The first n repetitions are mandatory
                for (count = 0; count < n; count++)
                {
                    if (!trymatch(pop, pop + len))
                        goto Lnomatch;
                }
                // Scratch copy of pmatch[], needed only if optional
                // repetitions remain
                if (!psave && count < m)
                {
                    //version (Win32)
                    psave = (regmatch_t *)alloca((re_nsub + 1) * regmatch_t.size);
                    //else
                    //psave = new regmatch_t[re_nsub + 1];
                }
                if (program[pc] == REnmq)	// if minimal munch
                {
                    for (; count < m; count++)
                    {
                        int s1;

                        memcpy(psave, pmatch, (re_nsub + 1) * regmatch_t.size);
                        s1 = src;
                        // Stop repeating as soon as the rest of the
                        // program matches from here
                        if (trymatch(pop + len, program.length))
                        {
                            src = s1;
                            memcpy(pmatch, psave, (re_nsub + 1) * regmatch_t.size);
                            break;
                        }
                        if (!trymatch(pop, pop + len))
                        {
                            debug(regexp) printf("\tdoesn't match subexpression\n");
                            break;
                        }
                        // If source is not consumed, don't
                        // infinite loop on the match
                        if (s1 == src)
                        {
                            debug(regexp) printf("\tsource is not consumed\n");
                            break;
                        }
                    }
                }
                else				// maximal munch
                {
                    for (; count < m; count++)
                    {
                        int s1;
                        int s2;

                        memcpy(psave, pmatch, (re_nsub + 1) * regmatch_t.size);
                        s1 = src;
                        if (!trymatch(pop, pop + len))
                        {
                            debug(regexp) printf("\tdoesn't match subexpression\n");
                            break;
                        }
                        s2 = src;
                        // If source is not consumed, don't
                        // infinite loop on the match
                        if (s1 == s2)
                        {
                            debug(regexp) printf("\tsource is not consumed\n");
                            break;
                        }
                        // If no match after consumption, but it
                        // did match before, then no match
                        if (!trymatch(pop + len, program.length))
                        {
                            src = s1;
                            if (trymatch(pop + len, program.length))
                            {
                                src = s1;		// no match
                                memcpy(pmatch, psave, (re_nsub + 1) * regmatch_t.size);
                                break;
                            }
                        }
                        src = s2;
                    }
                }
                debug(regexp) printf("\tREnm len=%d, n=%u, m=%u, DONE count=%d\n", len, n, m, count);
                pc = pop + len;
                break;

            case REparen:
                // len, ()
                debug(regexp) printf("\tREparen\n");
                puint = (uint *)&program[pc + 1];
                len = puint[0];
                n = puint[1];
                pop = pc + 1 + uint.size * 2;
                ss = src;
                if (!trymatch(pop, pop + len))
                    goto Lnomatch;
                pmatch[n + 1].rm_so = ss;
                pmatch[n + 1].rm_eo = src;
                pc = pop + len;
                break;

            case REend:
                debug(regexp) printf("\tREend\n");
                return 1;		// successful match

            case REwordboundary:
                debug(regexp) printf("\tREwordboundary\n");
                // At either end of input[] this always succeeds
                if (src > 0 && src < input.length)
                {
                    c1 = input[src - 1];
                    c2 = input[src];
                    if (!(
                          (isword((tchar)c1) && !isword((tchar)c2)) ||
                          (!isword((tchar)c1) && isword((tchar)c2))
                         )
                       )
                        goto Lnomatch;
                }
                pc++;
                break;

            case REnotwordboundary:
                debug(regexp) printf("\tREnotwordboundary\n");
                // At either end of input[] this always fails
                if (src == 0 || src == input.length)
                    goto Lnomatch;
                c1 = input[src - 1];
                c2 = input[src];
                if (
                    (isword((tchar)c1) && !isword((tchar)c2)) ||
                    (!isword((tchar)c1) && isword((tchar)c2))
                   )
                    goto Lnomatch;
                pc++;
                break;

            case REdigit:
                debug(regexp) printf("\tREdigit\n");
                if (src == input.length)
                    goto Lnomatch;
                if (!isdigit(input[src]))
                    goto Lnomatch;
                src++;
                pc++;
                break;

            case REnotdigit:
                debug(regexp) printf("\tREnotdigit\n");
                if (src == input.length)
                    goto Lnomatch;
                if (isdigit(input[src]))
                    goto Lnomatch;
                src++;
                pc++;
                break;

            case REspace:
                debug(regexp) printf("\tREspace\n");
                if (src == input.length)
                    goto Lnomatch;
                if (!isspace(input[src]))
                    goto Lnomatch;
                src++;
                pc++;
                break;

            case REnotspace:
                debug(regexp) printf("\tREnotspace\n");
                if (src == input.length)
                    goto Lnomatch;
                if (isspace(input[src]))
                    goto Lnomatch;
                src++;
                pc++;
                break;

            case REword:
                debug(regexp) printf("\tREword\n");
                if (src == input.length)
                    goto Lnomatch;
                if (!isword(input[src]))
                    goto Lnomatch;
                src++;
                pc++;
                break;

            case REnotword:
                debug(regexp) printf("\tREnotword\n");
                if (src == input.length)
                    goto Lnomatch;
                if (isword(input[src]))
                    goto Lnomatch;
                src++;
                pc++;
                break;

            case REbackref:
            {
                n = program[pc + 1];
                debug(regexp) printf("\tREbackref %d\n", n);
                int so = pmatch[n + 1].rm_so;
                int eo = pmatch[n + 1].rm_eo;
                len = eo - so;
                if (src + len > input.length)
                    goto Lnomatch;
                // A zero-length reference always matches. It is skipped
                // rather than compared because the group may be unset
                // (so == eo == -1) or src may equal input.length, and then
                // input[so .. eo], &input[so] or &input[src] would index
                // outside input[].
                if (len != 0)
                {
                    if (attributes & REA.ignoreCase)
                    {
                        if (icmp(input[src .. src + len], input[so .. eo]))
                            goto Lnomatch;
                    }
                    else if (memcmp(&input[src], &input[so], len * tchar.size))
                        goto Lnomatch;
                }
                src += len;
                pc += 2;
                break;
            }

            default:
                assert(0);
        }
    }

Lnomatch:
    debug(regexp) printf("\tnomatch pc=%d\n", pc);
    src = srcsave;
    return 0;
}
/* =================== Compiler ================== */
// Parse a sequence of pieces and '|' alternations starting at pattern[p],
// emitting code into buf. Stops without consuming at ')', or at the end of
// the pattern, where REend is appended. Always returns 1; errors are
// reported by the routines it calls.
int parseRegexp()
{ uint offset;
uint gotooffset;
uint len1;
uint len2;
//printf("parseRegexp() '%.*s'\n", pattern[p .. pattern.length]);
// offset marks where this (sub)expression's code begins in buf
offset = buf.offset;
for (;;)
{
assert(p <= pattern.length);
if (p == pattern.length)
{ buf.write(REend);
return 1;
}
switch (pattern[p])
{
case ')':
return 1;
case '|':
// Everything emitted since offset is the left alternative. Resulting layout:
//   REor len1 | left alternative | REgoto len2 | right alternative
p++;
gotooffset = buf.offset;
// REgoto lets the left alternative skip over the right one; its
// operand is patched once the right side's length is known
buf.write(REgoto);
buf.write((uint)0);
len1 = buf.offset - offset;
// Make room at offset for the REor opcode and its operand
// (presumably buf.spread shifts the existing bytes up; OutBuffer is
// defined outside this file)
buf.spread(offset, 1 + uint.size);
gotooffset += 1 + uint.size;
parseRegexp();
len2 = buf.offset - (gotooffset + 1 + uint.size);
buf.data[offset] = REor;
((uint *)&buf.data[offset + 1])[0] = len1;
((uint *)&buf.data[gotooffset + 1])[0] = len2;
break;
default:
parsePiece();
break;
}
}
}
// Parse one atom plus an optional quantifier: *, +, ?, {n}, {n,} or {n,m},
// each of which may be followed by ? to select the non-greedy form.
// A quantified atom is wrapped as REnm/REnmq len n m <atom code>; ".*" not
// followed by '?' and not at the very end of the pattern is emitted as the
// single opcode REanystar instead.
// Returns 1. A malformed {n,m}, including a count that does not fit in a
// uint, is reported through error(), which throws RegExpError.
int parsePiece()
{
    uint offset;
    uint len;
    uint n;
    uint m;
    ubyte op;
    int plength = pattern.length;

    //printf("parsePiece() '%.*s'\n", pattern[p .. pattern.length]);
    offset = buf.offset;
    parseAtom();
    if (p == plength)
        return 1;
    switch (pattern[p])
    {
        case '*':
            // Special optimization: replace .* with REanystar
            if (buf.offset - offset == 1 &&
                buf.data[offset] == REanychar &&
                p + 1 < plength &&
                pattern[p + 1] != '?')
            {
                buf.data[offset] = REanystar;
                p++;
                break;
            }
            n = 0;
            m = inf;
            goto Lnm;

        case '+':
            n = 1;
            m = inf;
            goto Lnm;

        case '?':
            n = 0;
            m = 1;
            goto Lnm;

        case '{':	// {n} {n,} {n,m}
            p++;
            if (p == plength || !isdigit(pattern[p]))
                goto Lerr;
            n = 0;
            do
            {
                uint digit = pattern[p] - '0';

                // Reject a count too large for a uint instead of letting
                // it wrap around to an unrelated value
                if (n > (~0u - digit) / 10)
                    goto Lerr;
                n = n * 10 + digit;
                p++;
                if (p == plength)
                    goto Lerr;
            } while (isdigit(pattern[p]));
            if (pattern[p] == '}')		// {n}
            {
                m = n;
                goto Lnm;
            }
            if (pattern[p] != ',')
                goto Lerr;
            p++;
            if (p == plength)
                goto Lerr;
            if (pattern[p] == '}')		// {n,}
            {
                m = inf;
                goto Lnm;
            }
            if (!isdigit(pattern[p]))
                goto Lerr;
            m = 0;				// {n,m}
            do
            {
                uint digit = pattern[p] - '0';

                if (m > (~0u - digit) / 10)
                    goto Lerr;
                m = m * 10 + digit;
                p++;
                if (p == plength)
                    goto Lerr;
            } while (isdigit(pattern[p]));
            if (pattern[p] != '}')
                goto Lerr;
            goto Lnm;

        Lnm:
            p++;
            op = REnm;
            if (p < plength && pattern[p] == '?')
            {
                op = REnmq;	// minimal munch version
                p++;
            }
            // Insert the repeat header in front of the atom's code
            len = buf.offset - offset;
            buf.spread(offset, 1 + uint.size * 3);
            buf.data[offset] = op;
            uint* puint = (uint *)&buf.data[offset + 1];
            puint[0] = len;
            puint[1] = n;
            puint[2] = m;
            break;

        default:
            break;
    }
    return 1;

Lerr:
    error("badly formed {n,m}");
    return 0;		// not reached, error() throws
}
// Parse a single atom at pattern[p] and emit its code into buf: a
// parenthesized subexpression, a character class, '.', '^', '$', a
// backslash escape or back reference, or a literal character. A run of
// literal characters is merged into one REstring where possible.
// Does nothing if p is already at the end of the pattern.
// Returns 1 on success. Errors are reported through error(), which throws
// RegExpError, so the "return 0" statements after it are not reached.
int parseAtom()
{
    ubyte op;
    uint offset;
    tchar c;

    //printf("parseAtom() '%.*s'\n", pattern[p .. pattern.length]);
    if (p < pattern.length)
    {
        c = pattern[p];
        switch (c)
        {
            case '*':
            case '+':
            case '?':
                error("*+? not allowed in atom");
                p++;
                return 0;

            case '(':
                p++;
                buf.write(REparen);
                offset = buf.offset;
                buf.write((uint)0);		// reserve space for length
                buf.write(re_nsub);
                re_nsub++;
                parseRegexp();
                // Patch in the length of the subexpression's code
                *(uint *)&buf.data[offset] =
                    buf.offset - (offset + uint.size * 2);
                if (p == pattern.length || pattern[p] != ')')
                {
                    error("')' expected");
                    return 0;
                }
                p++;
                break;

            case '[':
                if (!parseRange())
                    return 0;
                break;

            case '.':
                p++;
                buf.write(REanychar);
                break;

            case '^':
                p++;
                buf.write(REbol);
                break;

            case '$':
                p++;
                buf.write(REeol);
                break;

            case '\':
                p++;
                if (p == pattern.length)
                {
                    error("no character past '\\'");
                    return 0;
                }
                c = pattern[p];
                switch (c)
                {
                    case 'b':    op = REwordboundary;	 goto Lop;
                    case 'B':    op = REnotwordboundary; goto Lop;
                    case 'd':    op = REdigit;		 goto Lop;
                    case 'D':    op = REnotdigit;	 goto Lop;
                    case 's':    op = REspace;		 goto Lop;
                    case 'S':    op = REnotspace;	 goto Lop;
                    case 'w':    op = REword;		 goto Lop;
                    case 'W':    op = REnotword;	 goto Lop;

                    Lop:
                        buf.write(op);
                        p++;
                        break;

                    case 'f':
                    case 'n':
                    case 'r':
                    case 't':
                    case 'v':
                    case 'c':
                    case 'x':
                    case 'u':
                    case '0':
                        c = escape();
                        goto Lbyte;

                    case '1': case '2': case '3':
                    case '4': case '5': case '6':
                    case '7': case '8': case '9':
                        // Back reference; only groups opened so far
                        // can be referenced
                        c -= '1';
                        if (c < re_nsub)
                        {
                            buf.write(REbackref);
                            buf.write((ubyte)c);
                        }
                        else
                        {
                            error("no matching back reference");
                            return 0;
                        }
                        p++;
                        break;

                    default:
                        // Any other escaped character stands for itself
                        p++;
                        goto Lbyte;
                }
                break;

            default:
                p++;
            Lbyte:
                op = REchar;
                if (attributes & REA.ignoreCase)
                {
                    if (isalpha(c))
                    {
                        op = REichar;
                        c = ctype.toupper(c);
                    }
                }
                if (op == REchar && c <= 0xFF)
                {
                    // Look ahead and see if we can make this into
                    // an REstring
                    int q;
                    int len;

                    for (q = p; q < pattern.length; ++q)
                    {
                        tchar qc = pattern[q];

                        // REstring is compared byte for byte, so under
                        // ignoreCase a letter must end the run and be
                        // emitted as its own REichar; otherwise it would
                        // be matched case-sensitively
                        if ((attributes & REA.ignoreCase) && isalpha(qc))
                            break;

                        switch (qc)
                        {
                            case '{':
                            case '*':
                            case '+':
                            case '?':
                                // A quantifier binds to the character just
                                // before it, which therefore cannot be
                                // part of the string
                                if (q == p)
                                    goto Lchar;
                                q--;
                                break;

                            case '(':	case ')':
                            case '|':
                            case '[':	case ']':
                            case '.':	case '^':
                            case '$':	case '\':
                            case '}':
                                break;

                            default:
                                continue;
                        }
                        break;
                    }
                    len = q - p;
                    if (len > 0)
                    {
                        debug(regexp) printf("writing string len %d, c = '%c', pattern[p] = '%c'\n", len+1, c, pattern[p]);
                        buf.reserve(5 + (1 + len) * tchar.size);
                        buf.write(REstring);
                        buf.write(len + 1);
                        buf.write(c);
                        buf.write(pattern[p .. p + len]);
                        p = q;
                        break;
                    }
                }
                if (c & ~0xFF)
                {
                    // Convert to wchar opcode
                    op = (op == REchar) ? REwchar : REiwchar;
                    buf.write(op);
                    buf.write(c);
                }
                else
                {
                 Lchar:
                    debug(regexp) printf("It's an REchar '%c'\n", c);
                    buf.write(op);
                    buf.write((char)c);
                }
                break;
        }
    }
    return 1;
}
private:
// Helper for parseRange(): builds a character-class bitmap directly inside
// an OutBuffer, growing the buffer as higher character codes are added.
//   maxc  highest index the bitmap has been sized for
//   maxb  number of bitmap bytes written to buf so far
//   base  address of the first bitmap byte inside buf.data
//   bits  view of the bitmap as a bit array, valid for [0 .. maxc]
class Range
{
    uint maxc;
    uint maxb;
    OutBuffer buf;
    ubyte* base;
    bit[] bits;

    // The bitmap starts at buf's current write position
    this(OutBuffer buf)
    {
        this.buf = buf;
        if (buf.data.length)
            this.base = &buf.data[buf.offset];
    }

    // Make sure bits[u] can be indexed, appending zeroed bytes to buf as
    // needed. Never shrinks the bitmap.
    void setbitmax(uint u)
    {
        uint b;

        if (u > maxc)
        {
            maxc = u;
            b = u / 8;
            if (b >= maxb)
            {
                uint baseoff;

                // fill0 may move buf.data, so remember base as an offset
                // and recompute the pointer afterwards
                baseoff = base ? base - &buf.data[0] : 0;
                buf.fill0(b - maxb + 1);
                base = &buf.data[baseoff];
                maxb = b + 1;
            }
            // Re-slice on every increase of maxc, not only when bytes were
            // added: otherwise bits.length lags behind maxc and a higher
            // bit in an already-allocated byte cannot be indexed
            bits = ((bit*)this.base)[0 .. maxc + 1];
        }
    }

    // Set bit u, growing the bitmap first
    void setbit2(uint u)
    {
        setbitmax(u + 1);
        //printf("setbit2 [x%02x] |= x%02x\n", u >> 3, 1 << (u & 7));
        bits[u] = 1;
    }
};
// Parse a character class. On entry pattern[p] is the opening '['; on
// success p is just past the closing ']'.
// Emits REbit (or REnotbit for [^...]) followed by two ushorts (highest
// character covered, bitmap length in bytes) and the bitmap itself.
// Supports literals, a-b ranges, the class escapes \d \D \s \S \w \W and
// the escapes handled by escape(). A leading ']' or '-' is a literal.
// Returns 1 on success. Errors are reported through error(), which throws
// RegExpError, so the "return 0" statements after it are not reached.
int parseRange()
{
    ubyte op;
    int c;
    int c2;
    uint i;
    uint cmax;
    uint offset;

    cmax = 0x7F;
    p++;
    op = REbit;
    if (p == pattern.length)
        goto Lerr;
    if (pattern[p] == '^')
    {
        p++;
        op = REnotbit;
        if (p == pattern.length)
            goto Lerr;
    }
    buf.write(op);
    offset = buf.offset;
    buf.write(cast(uint)0);		// reserve space for length
    buf.reserve(128 / 8);
    Range r = new Range(buf);
    if (op == REnotbit)
        r.setbit2(0);
    switch (pattern[p])
    {
        case ']':
        case '-':
            c = pattern[p];
            p++;
            r.setbit2(c);
            break;

        default:
            break;
    }

    // rs tracks what has been seen since the last completed item:
    //   start     nothing pending
    //   rliteral  a literal is held in c; it may turn out to begin a range
    //   dash      the literal in c was followed by '-'
    enum RS { start, rliteral, dash };
    RS rs;

    rs = RS.start;
    for (;;)
    {
        if (p == pattern.length)
            goto Lerr;
        switch (pattern[p])
        {
            case ']':
                // Flush whatever is still pending
                switch (rs)
                {
                    case RS.dash:
                        r.setbit2('-');
                    case RS.rliteral:
                        r.setbit2(c);
                        break;
                    case RS.start:
                        break;
                }
                p++;
                break;

            case '\':
                p++;
                r.setbitmax(cmax);
                if (p == pattern.length)
                    goto Lerr;
                switch (pattern[p])
                {
                    case 'd':
                        for (i = '0'; i <= '9'; i++)
                            r.bits[i] = 1;
                        goto Lrs;

                    case 'D':
                        for (i = 1; i < '0'; i++)
                            r.bits[i] = 1;
                        for (i = '9' + 1; i <= cmax; i++)
                            r.bits[i] = 1;
                        goto Lrs;

                    case 's':
                        for (i = 0; i <= cmax; i++)
                            if (isspace(i))
                                r.bits[i] = 1;
                        goto Lrs;

                    case 'S':
                        for (i = 1; i <= cmax; i++)
                            if (!isspace(i))
                                r.bits[i] = 1;
                        goto Lrs;

                    case 'w':
                        for (i = 0; i <= cmax; i++)
                            if (isword((tchar)i))
                                r.bits[i] = 1;
                        goto Lrs;

                    case 'W':
                        for (i = 1; i <= cmax; i++)
                            if (!isword((tchar)i))
                                r.bits[i] = 1;
                        goto Lrs;

                    Lrs:
                        // A class escape cannot be a range endpoint, so
                        // flush anything pending as plain literals
                        switch (rs)
                        {
                            case RS.dash:
                                r.setbit2('-');
                            case RS.rliteral:
                                r.setbit2(c);
                                break;
                            default:
                                // RS.start: nothing pending. The case is
                                // needed because a switch with no matching
                                // case and no default is a runtime error.
                                break;
                        }
                        rs = RS.start;
                        // Step past the class letter; otherwise the next
                        // iteration would add it as a literal
                        p++;
                        continue;

                    default:
                        break;
                }
                c2 = escape();
                goto Lrange;

            case '-':
                p++;
                if (rs == RS.start)
                    goto Lrange;
                else if (rs == RS.rliteral)
                    rs = RS.dash;
                else if (rs == RS.dash)
                {
                    r.setbit2(c);
                    r.setbit2('-');
                    rs = RS.start;
                }
                continue;

            default:
                c2 = pattern[p];
                p++;
            Lrange:
                switch (rs)
                {
                    case RS.rliteral:
                        r.setbit2(c);
                    case RS.start:
                        c = c2;
                        rs = RS.rliteral;
                        break;

                    case RS.dash:
                        if (c > c2)
                        {
                            error("inverted range in character class");
                            return 0;
                        }
                        r.setbitmax(c2);
                        //printf("c = %x, c2 = %x\n",c,c2);
                        for (; c <= c2; c++)
                            r.bits[c] = 1;
                        rs = RS.start;
                        break;
                }
                continue;
        }
        break;
    }
    if (attributes & REA.ignoreCase)
    {
        // BUG: what about wchar?
        r.setbitmax(0x7F);
        for (c = 'a'; c <= 'z'; c++)
        {
            if (r.bits[c])
                r.bits[c + 'A' - 'a'] = 1;
            else if (r.bits[c + 'A' - 'a'])
                r.bits[c] = 1;
        }
    }
    // Write the header last. The ignoreCase pass above can still enlarge
    // the bitmap, and the header must describe its final size, or the
    // extra bitmap bytes would be read as opcodes.
    //printf("maxc = %d, maxb = %d\n",r.maxc,r.maxb);
    ((ushort *)&buf.data[offset])[0] = (ushort)r.maxc;
    ((ushort *)&buf.data[offset])[1] = (ushort)r.maxb;
    return 1;

Lerr:
    error("invalid range");
    return 0;
}
// Report a compilation error: counts it in errors, then throws a
// RegExpError carrying msg. Never returns normally.
void error(char[] msg)
{
errors++;
throw new RegExpError(msg);
}
// Decode the escape sequence whose introducing '\' has already been
// consumed; on entry p indexes the character after the '\'.
// Handles \b \f \n \r \t \v, \cX (control letter), \nnn (octal),
// \xXX (hex) and \uUUUU (unicode), following the rules in the comment at
// the top of this file; any other character stands for itself.
// Returns the character value, with p advanced past what was consumed.
// Throws RegExpError (via error()) if \c is not followed by a letter.
int escape()
in
{
    assert(p < pattern.length);
}
body
{
    int c;
    int i;
    tchar tc;

    c = pattern[p];		// none of the cases are multibyte
    switch (c)
    {
        case 'b':    c = \b;	break;
        case 'f':    c = \f;	break;
        case 'n':    c = \n;	break;
        case 'r':    c = \r;	break;
        case 't':    c = \t;	break;
        case 'v':    c = \v;	break;

        // BUG: Perl does \a and \e too, should we?

        case 'c':
            ++p;
            if (p == pattern.length)
                goto Lretc;
            c = pattern[p];
            // Note: we are deliberately not allowing wchar letters
            if (!(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')))
            {
                error("letter expected following \\c");
                return 0;
            }
            c &= 0x1F;
            break;

        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
            c -= '0';
            for (i = 0; i < 2; i++)
            {
                p++;
                if (p == pattern.length)
                    goto Lretc;
                tc = pattern[p];
                if ('0' <= tc && tc <= '7')
                {
                    c = c * 8 + (tc - '0');
                    // Treat overflow as if last
                    // digit was not an octal digit.
                    // 0377 (0xFF) itself is still a valid value.
                    if (c > 0xFF)
                    {
                        c >>= 3;
                        return c;
                    }
                }
                else
                    return c;
            }
            break;

        case 'x':
            c = 0;
            for (i = 0; i < 2; i++)
            {
                p++;
                if (p == pattern.length)
                {
                    // Pattern ends right after \x: same result as when a
                    // non-hex character follows
                    if (i == 0)
                        return 'x';
                    goto Lretc;
                }
                tc = pattern[p];
                if ('0' <= tc && tc <= '9')
                    c = c * 16 + (tc - '0');
                else if ('a' <= tc && tc <= 'f')
                    c = c * 16 + (tc - 'a' + 10);
                else if ('A' <= tc && tc <= 'F')
                    c = c * 16 + (tc - 'A' + 10);
                else if (i == 0)	// if no hex digits after \x
                {
                    // Not a valid \xXX sequence
                    return 'x';
                }
                else
                    return c;
            }
            break;

        case 'u':
            c = 0;
            for (i = 0; i < 4; i++)
            {
                p++;
                if (p == pattern.length)
                {
                    // Fewer than 4 digits before the pattern ends: not a
                    // valid \uXXXX sequence. Give back the digits read so
                    // far, as in the invalid-digit case below.
                    p -= i;
                    return 'u';
                }
                tc = pattern[p];
                if ('0' <= tc && tc <= '9')
                    c = c * 16 + (tc - '0');
                else if ('a' <= tc && tc <= 'f')
                    c = c * 16 + (tc - 'a' + 10);
                else if ('A' <= tc && tc <= 'F')
                    c = c * 16 + (tc - 'A' + 10);
                else
                {
                    // Not a valid \uXXXX sequence
                    p -= i;
                    return 'u';
                }
            }
            break;

        default:
            break;
    }
    p++;
Lretc:
    return c;
}
/* ==================== optimizer ======================= */
// Try to put an REtestbit "first character" filter in front of the
// compiled program held in buf.
// Leading REbol opcodes are stepped over. If the first remaining opcode
// is REor, REnm, REnmq, REparen or REgoto, startchars() is asked for the
// set of possible leading characters and, on success, an REtestbit
// instruction (opcode, ushort maxc, ushort maxb, bit bytes) is spliced
// in at that position. Every other known opcode leaves the program
// untouched.
void optimize()
{
    ubyte[] code;
    int pc;

    debug(regexp) printf("RegExp.optimize()\n");
    code = buf.toBytes();
    pc = 0;
    while (1)
    {
	//printf("\tprog[%d] = %d, %d\n", i, prog[i], REstring);
	switch (code[pc])
	{
	    case REbol:
		// look at whatever follows the anchor
		pc++;
		continue;

	    case REor:
	    case REnm:
	    case REnmq:
	    case REparen:
	    case REgoto:
	    {
		OutBuffer filterbuf = new OutBuffer;
		Range lead = new Range(filterbuf);
		uint at;
		int bitsAt;
		ushort* header;

		at = pc;
		if (!startchars(lead, code[pc .. code.length]))
		    return;		// no filter possible

		debug(regexp) printf("\tfilter built\n");
		// open a gap for: opcode + 2 ushorts + bit bytes
		buf.spread(at, 1 + 4 + lead.maxb);
		buf.data[at] = REtestbit;
		header = (ushort *)&buf.data[at + 1];
		header[0] = (ushort)lead.maxc;
		header[1] = (ushort)lead.maxb;
		bitsAt = at + 1 + 4;
		buf.data[bitsAt .. bitsAt + lead.maxb] = lead.base[0 .. lead.maxb];
		return;
	    }

	    case REend:
	    case REanychar:
	    case REanystar:
	    case REbackref:
	    case REeol:
	    case REchar:
	    case REichar:
	    case REwchar:
	    case REiwchar:
	    case REstring:
	    case REtestbit:
	    case REbit:
	    case REnotbit:
	    case RErange:
	    case REnotrange:
	    case REwordboundary:
	    case REnotwordboundary:
	    case REdigit:
	    case REnotdigit:
	    case REspace:
	    case REnotspace:
	    case REword:
	    case REnotword:
		// nothing to gain (or nothing we can do) for these
		return;

	    default:
		assert(0);
	}
    }
}
/////////////////////////////////////////
// OR the leading character bits into r.
// Limit the character range from 0..7F,
// trymatch() will allow through anything over maxc.
// Return 1 if success, 0 if we can't build a filter or
// if there is no point to one.
//
// prog is the (sub-)program to examine. Running off the end of prog
// without hitting an opcode that decides the answer means everything in
// prog can match the empty string; what follows prog is not visible
// here, so no correct filter can be built and 0 is returned.
//
// NOTE(review): the REbit/REnotbit handling assumes that
// r.setbitmax(u) makes r.base[0 .. r.maxb] cover at least the
// characters 0..u, as the other users of setbitmax in this file
// suggest - confirm against class Range.

int startchars(Range r, ubyte[] prog)
{   tchar c;
    uint maxc;
    uint maxb;
    uint len;
    uint b;
    uint n;
    ubyte* pop;
    int i;

    //printf("RegExp.startchars(prog = %p, progend = %p)\n", prog, progend);
    for (i = 0; i < prog.length;)
    {
	switch (prog[i])
	{
	    case REchar:
		c = prog[i + 1];
		if (c <= 0x7F)
		    r.setbit2(c);
		return 1;

	    case REichar:
		c = prog[i + 1];
		if (c <= 0x7F)
		{   r.setbit2(c);
		    r.setbit2(ctype.tolower((tchar)c));
		}
		return 1;

	    case REwchar:
	    case REiwchar:
		return 1;

	    case REanychar:
		return 0;		// no point

	    case REstring:
		len = *(uint *)&prog[i + 1];
		assert(len);
		c = *(tchar *)&prog[i + 1 + uint.size];
		debug(regexp) printf("\tREstring %d, '%c'\n", len, c);
		if (c <= 0x7F)
		    r.setbit2(c);
		return 1;

	    case REtestbit:
	    case REbit:
		// layout: ushort maxc, ushort maxb, maxb bytes of bits
		maxc = ((ushort *)&prog[i + 1])[0];
		maxb = ((ushort *)&prog[i + 1])[1];
		// Only 0..7F is filtered. A class reaching above 7F must
		// still contribute its low members, so widen r to the
		// whole window instead of reusing whatever size r
		// happened to have (possibly none at all).
		if (maxc > 0x7F)
		    maxc = 0x7F;
		r.setbitmax(maxc);
		if (maxb > r.maxb)
		    maxb = r.maxb;	// never write past r's bits
		for (b = 0; b < maxb; b++)
		    r.base[b] |= prog[i + 1 + 4 + b];
		return 1;

	    case REnotbit:
		// layout: ushort maxc, ushort maxb, maxb bytes of bits
		maxb = ((ushort *)&prog[i + 1])[1];
		// A negated class matches every character that is not in
		// its bit set, including those above its own maxc, so
		// the whole filter window has to be filled in.
		r.setbitmax(0x7F);
		for (b = 0; b < r.maxb; b++)
		{
		    if (b < maxb)
			r.base[b] |= ~prog[i + 1 + 4 + b];
		    else
			r.base[b] |= 0xFF;	// beyond the stored set: all match
		}
		return 1;

	    case REbol:
	    case REeol:
		return 0;

	    case REor:
		// len, first alternative (len bytes), second alternative
		len = ((uint *)&prog[i + 1])[0];
		return startchars(r, prog[i + 1 + uint.size .. prog.length]) &&
		       startchars(r, prog[i + 1 + uint.size + len .. prog.length]);

	    case REgoto:
		len = ((uint *)&prog[i + 1])[0];
		i += 1 + uint.size + len;
		break;

	    case REanystar:
		return 0;

	    case REnm:
	    case REnmq:
		// len, n, m, ()
		len = ((uint *)&prog[i + 1])[0];
		n   = ((uint *)&prog[i + 1])[1];
		pop = &prog[i + 1 + uint.size * 3];
		if (!startchars(r, pop[0 .. len]))
		    return 0;
		if (n)
		    return 1;		// at least one repetition is required
		// zero repetitions allowed: what follows can start a match too
		i += 1 + uint.size * 3 + len;
		break;

	    case REparen:
		// len, n, ()
		len = ((uint *)&prog[i + 1])[0];
		pop = &prog[0] + i + 1 + uint.size * 2;
		return startchars(r, pop[0 .. len]);

	    case REend:
		return 0;

	    case REwordboundary:
	    case REnotwordboundary:
		return 0;

	    case REdigit:
		r.setbitmax('9');
		for (c = '0'; c <= '9'; c++)
		    r.bits[c] = 1;
		return 1;

	    case REnotdigit:
		r.setbitmax(0x7F);
		// everything below '0' ('0' itself is a digit) ...
		for (c = 0; c < '0'; c++)
		    r.bits[c] = 1;
		// ... and everything above '9'
		for (c = '9' + 1; c <= r.maxc; c++)
		    r.bits[c] = 1;
		return 1;

	    case REspace:
		r.setbitmax(0x7F);
		for (c = 0; c <= r.maxc; c++)
		    if (isspace(c))
			r.bits[c] = 1;
		return 1;

	    case REnotspace:
		r.setbitmax(0x7F);
		for (c = 0; c <= r.maxc; c++)
		    if (!isspace(c))
			r.bits[c] = 1;
		return 1;

	    case REword:
		r.setbitmax(0x7F);
		for (c = 0; c <= r.maxc; c++)
		    if (isword((tchar)c))
			r.bits[c] = 1;
		return 1;

	    case REnotword:
		r.setbitmax(0x7F);
		for (c = 0; c <= r.maxc; c++)
		    if (!isword((tchar)c))
			r.bits[c] = 1;
		return 1;

	    case REbackref:
		return 0;
	}
    }
    // Ran off the end of prog: it can match without consuming a
    // character, so the characters collected so far are not the
    // complete set of possible leading characters.
    return 0;
}
/* ==================== replace ======================= */
/************************************
 * Build a replacement string from format for the current match
 * (input and pmatch[]), using the old style syntax:
 *	&	replace with the match
 *	\n	replace with the nth parenthesized match, n is 1..9
 *		(nothing is inserted if n is larger than re_nsub or
 *		that match is empty)
 *	\c	replace with char c
 * A \ that is the last character of format is copied as is.
 */

public tchar[] replaceOld(tchar[] format)
{
    OutBuffer sink;
    tchar ch;
    uint pos;

    //printf("replace: this = %p so = %d, eo = %d\n", this, pmatch[0].rm_so, pmatch[0].rm_eo);
    //printf("3input = '%.*s'\n", input);
    sink = new OutBuffer();
    sink.reserve(format.length * tchar.size);
    pos = 0;
    while (pos < format.length)
    {
	ch = format[pos];
	pos++;			// pos now indexes the next unread character
	switch (ch)
	{
	    case '&':
		//printf("match = '%.*s'\n", input[pmatch[0].rm_so .. pmatch[0].rm_eo]);
		sink.write(input[pmatch[0].rm_so .. pmatch[0].rm_eo]);
		break;

	    case '\':
		if (pos < format.length)
		{
		    // take the escaped character
		    ch = format[pos];
		    pos++;
		    if (ch >= '1' && ch <= '9')
		    {	uint group;

			group = ch - '0';
			if (group <= re_nsub && pmatch[group].rm_so != pmatch[group].rm_eo)
			    sink.write(input[pmatch[group].rm_so .. pmatch[group].rm_eo]);
			break;
		    }
		}
		// either a trailing \ or \c for a non-digit c
		sink.write(ch);
		break;

	    default:
		sink.write(ch);
		break;
	}
    }
    return cast(tchar[])sink.toBytes();
}
// This version of replace uses:
// $$ $
// $& The matched substring.
// $` The portion of string that precedes the matched substring.
// $' The portion of string that follows the matched substring.
// $n The nth capture, where n is a single digit 1-9
// and $n is not followed by a decimal digit.
// $nn The nnth capture, where nn is a two-digit decimal
// number 01-99.
// If nnth capture is undefined or more than the number
// of parenthesized subexpressions, use the empty
// string instead.
//
// Any other $ are left as is.
public tchar[] replace(tchar[] format)
{
    regmatch_t[] captures;

    // whole match plus the re_nsub parenthesized subexpressions
    captures = pmatch[0 .. re_nsub + 1];
    return replace3(format, input, captures);
}
// Worker for replace(). It only uses its arguments, no state of the
// RegExp object:
//	format	replacement text with $ sequences
//	input	the string that was searched
//	pmatch	pmatch[0] is the whole match, pmatch[n] the nth capture
// Returns the expanded replacement text.
//
//	$$	a single $
//	$&	the matched substring
//	$`	the portion of input preceding the match
//	$'	the portion of input following the match
//	$n $nn	the nth / nnth capture; the empty string if there is no
//		such capture. $0 and $00 are not captures and are
//		copied as is.
//	any other $ is copied as is
private tchar[] replace3(tchar[] format, tchar[] input, regmatch_t[] pmatch)
{
    OutBuffer buf;
    tchar[] result;
    tchar c;
    uint c2;
    int rm_so;
    int rm_eo;
    int i;
    int f;

//    printf("replace3(format = '%.*s', input = '%.*s')\n", format, input);
    buf = new OutBuffer();
    buf.reserve(format.length * tchar.size);
    for (f = 0; f < format.length; f++)
    {
	c = format[f];
	if (c != '$')
	{
	    buf.write(c);
	    continue;
	}
	++f;
	if (f == format.length)
	{
	    // lone $ at the end of format
	    buf.write(cast(tchar)'$');
	    break;
	}
	c = format[f];
	switch (c)
	{
	    case '$':
		// $$ stands for one $
		buf.write(cast(tchar)'$');
		break;

	    case '&':
		rm_so = pmatch[0].rm_so;
		rm_eo = pmatch[0].rm_eo;
		goto Lstring;

	    case '`':
		rm_so = 0;
		rm_eo = pmatch[0].rm_so;
		goto Lstring;

	    case \':
		rm_so = pmatch[0].rm_eo;
		rm_eo = input.length;
		goto Lstring;

	    case '0': case '1': case '2': case '3': case '4':
	    case '5': case '6': case '7': case '8': case '9':
		i = c - '0';
		if (f + 1 < format.length)
		{
		    c2 = format[f + 1];
		    if (c2 >= '0' && c2 <= '9')
		    {	i = i * 10 + (c2 - '0');
			// Only a real $nn reference consumes the second
			// digit; for $00 it stays in format and is
			// copied by the next iteration.
			if (i != 0)
			    f++;
		    }
		}
		if (i == 0)
		{
		    // $0 is not a capture: copy it as is. Whatever
		    // follows is handled normally by the next
		    // iteration, so it is neither skipped nor
		    // written twice.
		    buf.write(cast(tchar)'$');
		    buf.write(c);
		    break;
		}
		if (i < pmatch.length)
		{   rm_so = pmatch[i].rm_so;
		    rm_eo = pmatch[i].rm_eo;
		    goto Lstring;
		}
		// no such capture: insert nothing
		break;

	    Lstring:
		if (rm_so != rm_eo)
		    buf.write(input[rm_so .. rm_eo]);
		break;

	    default:
		buf.write(cast(tchar)'$');
		buf.write(c);
		break;
	}
    }
    result = (tchar[])buf.toBytes();
    return result;
}
}