// Regular Expressions /* * Copyright (C) 2000-2005 by Digital Mars, www.digitalmars.com * Written by Walter Bright * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages * arising from the use of this software. * * Permission is granted to anyone to use this software for any purpose, * including commercial applications, and to alter it and redistribute it * freely, subject to the following restrictions: * * o The origin of this software must not be misrepresented; you must not * claim that you wrote the original software. If you use this software * in a product, an acknowledgment in the product documentation would be * appreciated but is not required. * o Altered source versions must be plainly marked as such, and must not * be misrepresented as being the original software. * o This notice may not be removed or altered from any source * distribution. */ /********************************************** * $(LINK2 "../../ctg/regular.html", Regular expressions) * are a powerful method of string pattern matching. * The regular expression * language used is the same as that commonly used, however, some of the very * advanced forms may behave slightly differently. * * In the following guide, $(I pattern)[] refers to a * $(LINK2 "../../ctg/regular.html", regular expression). * The $(I attributes)[] refers to a string controlling the interpretation of the regular expression. It consists of a sequence of one or more of the following characters:
Attribute Action
$(B g) global; repeat over the whole input string
$(B i) case insensitive
$(B m) treat as multiple lines separated by newlines
* Macros: * WIKI = StdRegexp */ /* Escape sequences: \nnn starts out a 1, 2 or 3 digit octal sequence, where n is an octal digit. If nnn is larger than 0377, then the 3rd digit is not part of the sequence and is not consumed. For maximal portability, use exactly 3 digits. \xXX starts out a 1 or 2 digit hex sequence. X is a hex character. If the first character after the \x is not a hex character, the value of the sequence is 'x' and the XX are not consumed. For maximal portability, use exactly 2 digits. \uUUUU is a unicode sequence. There are exactly 4 hex characters after the \u, if any are not, then the value of the sequence is 'u', and the UUUU are not consumed. Character classes: [a-b], where a is greater than b, will produce an error. References: http://www.unicode.org/unicode/reports/tr18/ */ module std.regexp; //debug = regexp; // uncomment to turn on debugging printf's private { import std.c.stdio; import std.c.stdlib; import std.string; import std.ctype; import std.outbuffer; import std.bitarray; } /** Regular expression to extract an _email address */ const char[] email = r"[a-zA-Z]([.]?([[a-zA-Z0-9_]-]+)*)?@([[a-zA-Z0-9_]\-_]+\.)+[a-zA-Z]{2,6}"; /** Regular expression to extract a _url */ const char[] url = r"(([h|H][t|T]|[f|F])[t|T][p|P]([s|S]?)\:\/\/|~/|/)?([\w]+:\w+@)?(([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?)?((/?\w+/)+|/?)(\w+\.[\w]{3,4})?([,]\w+)*((\?\w+=\w+)?(&\w+=\w+)*([,]\w*)*)?"; /************************************ * One of these gets thrown on compilation errors */ class RegExpException : Exception { this(char[] msg) { super(msg); } } struct regmatch_t { int rm_so; // index of start of match int rm_eo; // index past end of match } private alias char rchar; // so we can make a wchar version /****************************************************** * Search string for matches with regular expression * pattern with attributes. * Replace each match with string generated from format. * Params: * string = String to search. * pattern = Regular expression pattern. * format = Replacement string format. * attributes = Regular expression attributes. * Returns: the resulting string. */ char[] sub(char[] string, char[] pattern, char[] format, char[] attributes = null) { RegExp r = new RegExp(pattern, attributes); char[] result = r.replace(string, format); delete r; return result; } unittest { debug(regexp) printf("regexp.sub.unittest\n"); char[] r = sub("hello", "ll", "ss"); assert(r == "hesso"); } /******************************************************* * Search string for matches with regular expression * pattern with attributes. * Pass each match to delegate dg. * Replace each match with the return value from dg. * Params: * string = String to search. * pattern = Regular expression pattern. * dg = Delegate * attributes = Regular expression attributes. * Returns: the resulting string. */ char[] sub(char[] string, char[] pattern, char[] delegate(RegExp) dg, char[] attributes = null) { RegExp r = new RegExp(pattern, attributes); rchar[] result; int lastindex; int offset; result = string; lastindex = 0; offset = 0; while (r.test(string, lastindex)) { int so = r.pmatch[0].rm_so; int eo = r.pmatch[0].rm_eo; rchar[] replacement = dg(r); result = replaceSlice(result, result[offset + so .. offset + eo], replacement); if (r.attributes & RegExp.REA.global) { offset += replacement.length - (eo - so); if (lastindex == eo) lastindex++; // always consume some source else lastindex = eo; } else break; } delete r; return result; } unittest { debug(regexp) printf("regexp.sub.unittest\n"); char[] foo(RegExp r) { return "ss"; } char[] r = sub("hello", "ll", delegate char[](RegExp r) { return "ss"; }); assert(r == "hesso"); } /************************************************* * Search string[] for first match with pattern[] with attributes[]. * Params: * string = String to search. * pattern = Regular expression pattern. * attributes = Regular expression attributes. * Returns: * index into string[] of match if found, -1 if no match. */ int find(rchar[] string, char[] pattern, char[] attributes = null) { int i = -1; RegExp r = new RegExp(pattern, attributes); if (r.test(string)) { i = r.pmatch[0].rm_so; } delete r; return i; } unittest { debug(regexp) printf("regexp.find.unittest\n"); int i; i = find("xabcy", "abc"); assert(i == 1); i = find("cba", "abc"); assert(i == -1); } /************************************************* * Search string[] for last match with pattern[] with attributes[]. * Params: * string = String to search. * pattern = Regular expression pattern. * attributes = Regular expression attributes. * Returns: * index into string[] of match if found, -1 if no match. */ int rfind(rchar[] string, char[] pattern, char[] attributes = null) { int i = -1; int lastindex = 0; RegExp r = new RegExp(pattern, attributes); while (r.test(string, lastindex)) { int eo = r.pmatch[0].rm_eo; i = r.pmatch[0].rm_so; if (lastindex == eo) lastindex++; // always consume some source else lastindex = eo; } delete r; return i; } unittest { int i; debug(regexp) printf("regexp.rfind.unittest\n"); i = rfind("abcdefcdef", "c"); assert(i == 6); i = rfind("abcdefcdef", "cd"); assert(i == 6); i = rfind("abcdefcdef", "x"); assert(i == -1); i = rfind("abcdefcdef", "xy"); assert(i == -1); i = rfind("abcdefcdef", ""); assert(i == 10); } /******************************************** * Split string[] into an array of strings, using the regular * expression pattern[] with attributes[] as the separator. * string = String to search. * pattern = Regular expression pattern. * attributes = Regular expression attributes. * Returns: * array of slices into string[] */ char[][] split(char[] string, char[] pattern, char[] attributes = null) { RegExp r = new RegExp(pattern, attributes); char[][] result = r.split(string); delete r; return result; } unittest { debug(regexp) printf("regexp.split.unittest()\n"); char[][] result; result = split("ab", "a*"); assert(result.length == 2); assert(result[0] == ""); assert(result[1] == "b"); } /**************************************************** * Search string[] for first match with pattern[] with attributes[]. * Params: * string = String to search. * pattern = Regular expression pattern. * attributes = Regular expression attributes. * Returns: * corresponding RegExp if found, null if not. * Example: * --- * import std.stdio; * import std.regexp; * * void main() * { * if (m; std.regexp.search("abcdef", "c")) * { * writefln("%s[%s]%s", m.pre, m.match(0), m.post); * } * } * // Prints: * // ab[c]def * --- */ RegExp search(char[] string, char[] pattern, char[] attributes = null) { RegExp r = new RegExp(pattern, attributes); if (r.test(string)) { } else { delete r; r = null; } return r; } /* ********************************* RegExp ******************************** */ /***************************** * RegExp is a class to handle regular expressions. * * It is the core foundation for adding powerful string pattern matching * capabilities to programs like grep, text editors, awk, sed, etc. */ class RegExp { /***** * Construct a RegExp object. Compile pattern * with attributes into * an internal form for fast execution. * Params: * pattern = regular expression * attributes = _attributes * Throws: RegExpException if there are any compilation errors. */ public this(rchar[] pattern, rchar[] attributes = null) { pmatch = (&gmatch)[0 .. 1]; compile(pattern, attributes); } /***** * Generate instance of RegExp. * Params: * pattern = regular expression * attributes = _attributes * Throws: RegExpException if there are any compilation errors. */ public static RegExp opCall(rchar[] pattern, rchar[] attributes = null) { return new RegExp(pattern, attributes); } /************************************ * Set up for start of foreach loop. * Returns: * search() returns instance of RegExp set up to _search string[]. * Example: * --- * import std.stdio; * import std.regexp; * * void main() * { * foreach(m; RegExp("ab").search("abcabcabab")) * { * writefln("%s[%s]%s", m.pre, m.match(0), m.post); * } * } * // Prints: * // [ab]cabcabab * // abc[ab]cabab * // abcabc[ab]ab * // abcabcab[ab] * --- */ public RegExp search(rchar[] string) { input = string; pmatch[0].rm_eo = 0; return this; } /** ditto */ public int opApply(int delegate(inout RegExp) dg) { int result; RegExp r = this; while (test()) { result = dg(r); if (result) break; } return result; } /****************** * Retrieve match n. * * n==0 means the matched substring, n>0 means the * n'th parenthesized subexpression. * if n is larger than the number of parenthesized subexpressions, * null is returned. */ public char[] match(size_t n) { if (n >= pmatch.length) return null; else { size_t rm_so, rm_eo; rm_so = pmatch[n].rm_so; rm_eo = pmatch[n].rm_eo; if (rm_so == rm_eo) return null; return input[rm_so .. rm_eo]; } } /******************* * Return the slice of the input that precedes the matched substring. */ public char[] pre() { return input[0 .. pmatch[0].rm_so]; } /******************* * Return the slice of the input that follows the matched substring. */ public char[] post() { return input[pmatch[0].rm_eo .. $]; } uint re_nsub; // number of parenthesized subexpression matches regmatch_t[] pmatch; // array [re_nsub + 1] rchar[] input; // the string to search // per instance: rchar[] pattern; // source text of the regular expression rchar[] flags; // source text of the attributes parameter int errors; uint attributes; enum REA { global = 1, // has the g attribute ignoreCase = 2, // has the i attribute multiline = 4, // if treat as multiple lines separated // by newlines, or as a single line dotmatchlf = 8, // if . matches \n } private: size_t src; // current source index in input[] size_t src_start; // starting index for match in input[] size_t p; // position of parser in pattern[] regmatch_t gmatch; // match for the entire regular expression // (serves as storage for pmatch[0]) ubyte[] program; // pattern[] compiled into regular expression program OutBuffer buf; /******************************************/ // Opcodes enum : ubyte { REend, // end of program REchar, // single character REichar, // single character, case insensitive REdchar, // single UCS character REidchar, // single wide character, case insensitive REanychar, // any character REanystar, // ".*" REstring, // string of characters REistring, // string of characters, case insensitive REtestbit, // any in bitmap, non-consuming REbit, // any in the bit map REnotbit, // any not in the bit map RErange, // any in the string REnotrange, // any not in the string REor, // a | b REplus, // 1 or more REstar, // 0 or more REquest, // 0 or 1 REnm, // n..m REnmq, // n..m, non-greedy version REbol, // beginning of line REeol, // end of line REparen, // parenthesized subexpression REgoto, // goto offset REwordboundary, REnotwordboundary, REdigit, REnotdigit, REspace, REnotspace, REword, REnotword, REbackref, }; // BUG: should this include '$'? private int isword(dchar c) { return isalnum(c) || c == '_'; } private uint inf = ~0u; /* ******************************** * Throws RegExpException on error */ public void compile(rchar[] pattern, rchar[] attributes) { //printf("RegExp.compile('%.*s', '%.*s')\n", pattern, attributes); this.attributes = 0; foreach (rchar c; attributes) { REA att; switch (c) { case 'g': att = REA.global; break; case 'i': att = REA.ignoreCase; break; case 'm': att = REA.multiline; break; default: error("unrecognized attribute"); return; } if (this.attributes & att) { error("redundant attribute"); return; } this.attributes |= att; } input = null; this.pattern = pattern; this.flags = attributes; uint oldre_nsub = re_nsub; re_nsub = 0; errors = 0; buf = new OutBuffer(); buf.reserve(pattern.length * 8); p = 0; parseRegexp(); if (p < pattern.length) { error("unmatched ')'"); } optimize(); program = buf.data; buf.data = null; delete buf; if (re_nsub > oldre_nsub) { if (pmatch is &gmatch) pmatch = null; pmatch.length = re_nsub + 1; } pmatch[0].rm_so = 0; pmatch[0].rm_eo = 0; } /******************************************** * Split string[] into an array of strings, using the regular * expression as the separator. * Returns: * array of slices into string[] */ public rchar[][] split(rchar[] string) { debug(regexp) printf("regexp.split()\n"); rchar[][] result; if (string.length) { int p = 0; int q; for (q = p; q != string.length;) { if (test(string, q)) { int e; q = pmatch[0].rm_so; e = pmatch[0].rm_eo; if (e != p) { result ~= string[p .. q]; for (int i = 1; i < pmatch.length; i++) { int so = pmatch[i].rm_so; int eo = pmatch[i].rm_eo; if (so == eo) { so = 0; // -1 gives array bounds error eo = 0; } result ~= string[so .. eo]; } q = p = e; continue; } } q++; } result ~= string[p .. string.length]; } else if (!test(string)) result ~= string; return result; } unittest { debug(regexp) printf("regexp.split.unittest()\n"); RegExp r = new RegExp("a*?", null); rchar[][] result; rchar[] j; int i; result = r.split("ab"); assert(result.length == 2); i = std.string.cmp(result[0], "a"); assert(i == 0); i = std.string.cmp(result[1], "b"); assert(i == 0); r = new RegExp("a*", null); result = r.split("ab"); assert(result.length == 2); i = std.string.cmp(result[0], ""); assert(i == 0); i = std.string.cmp(result[1], "b"); assert(i == 0); r = new RegExp("<(\\/)?([^<>]+)>", null); result = r.split("afontbarhello"); for (i = 0; i < result.length; i++) { //debug(regexp) printf("result[%d] = '%.*s'\n", i, result[i]); } j = join(result, ","); //printf("j = '%.*s'\n", j); i = std.string.cmp(j, "a,,b,font,/,b,bar,,TAG,hello,/,TAG,"); assert(i == 0); r = new RegExp("a[bc]", null); result = r.match("123ab"); j = join(result, ","); i = std.string.cmp(j, "ab"); assert(i == 0); result = r.match("ac"); j = join(result, ","); i = std.string.cmp(j, "ac"); assert(i == 0); } /************************************************* * Search string[] for match with regular expression. * Returns: * index of match if successful, -1 if not found */ public int find(rchar[] string) { int i; i = test(string); if (i) i = pmatch[0].rm_so; else i = -1; // no match return i; } //deprecated alias find search; unittest { debug(regexp) printf("regexp.find.unittest()\n"); int i; RegExp r = new RegExp("abc", null); i = r.find("xabcy"); assert(i == 1); i = r.find("cba"); assert(i == -1); } /************************************************* * Search string[] for match. * Returns: * If global attribute, return same value as exec(string). * If not global attribute, return array of all matches. */ public rchar[][] match(rchar[] string) { rchar[][] result; if (attributes & REA.global) { int lastindex = 0; while (test(string, lastindex)) { int eo = pmatch[0].rm_eo; result ~= input[pmatch[0].rm_so .. eo]; if (lastindex == eo) lastindex++; // always consume some source else lastindex = eo; } } else { result = exec(string); } return result; } unittest { debug(regexp) printf("regexp.match.unittest()\n"); int i; rchar[][] result; rchar[] j; RegExp r; r = new RegExp("a[bc]", null); result = r.match("1ab2ac3"); j = join(result, ","); i = std.string.cmp(j, "ab"); assert(i == 0); r = new RegExp("a[bc]", "g"); result = r.match("1ab2ac3"); j = join(result, ","); i = std.string.cmp(j, "ab,ac"); assert(i == 0); } /************************************************* * Find regular expression matches in string[]. Replace those matches * with a new _string composed of format[] merged with the result of the * matches. * If global, replace all matches. Otherwise, replace first match. * Returns: the new _string */ public rchar[] replace(rchar[] string, rchar[] format) { rchar[] result; int lastindex; int offset; result = string; lastindex = 0; offset = 0; for (;;) { if (!test(string, lastindex)) break; int so = pmatch[0].rm_so; int eo = pmatch[0].rm_eo; rchar[] replacement = replace(format); result = replaceSlice(result, result[offset + so .. offset + eo], replacement); if (attributes & REA.global) { offset += replacement.length - (eo - so); if (lastindex == eo) lastindex++; // always consume some source else lastindex = eo; } else break; } return result; } unittest { debug(regexp) printf("regexp.replace.unittest()\n"); int i; rchar[] result; RegExp r; r = new RegExp("a[bc]", "g"); result = r.replace("1ab2ac3", "x$&y"); i = std.string.cmp(result, "1xaby2xacy3"); assert(i == 0); } /************************************************* * Search string[] for match. * Returns: * array of slices into string[] representing matches */ public rchar[][] exec(rchar[] string) { debug(regexp) printf("regexp.exec(string = '%.*s')\n", string); input = string; pmatch[0].rm_so = 0; pmatch[0].rm_eo = 0; return exec(); } /************************************************* * Pick up where last exec(string) or exec() left off, * searching string[] for next match. * Returns: * array of slices into string[] representing matches */ public rchar[][] exec() { if (!test()) return null; rchar[][] result; result = new rchar[][pmatch.length]; for (int i = 0; i < pmatch.length; i++) { if (pmatch[i].rm_so == pmatch[i].rm_eo) result[i] = null; else result[i] = input[pmatch[i].rm_so .. pmatch[i].rm_eo]; } return result; } /************************************************ * Search string[] for match. * Returns: 0 for no match, !=0 for match */ public int test(rchar[] string) { return test(string, 0 /*pmatch[0].rm_eo*/); } /************************************************ * Pick up where last test(string) or test() left off, and search again. * Returns: 0 for no match, !=0 for match */ public int test() { return test(input, pmatch[0].rm_eo); } /************************************************ * Test string[] starting at startindex against regular expression. * Returns: 0 for no match, !=0 for match */ public int test(char[] string, int startindex) { char firstc; uint si; input = string; debug (regexp) printf("RegExp.test(input[] = '%.*s', startindex = %d)\n", input, startindex); pmatch[0].rm_so = 0; pmatch[0].rm_eo = 0; if (startindex < 0 || startindex > input.length) { return 0; // fail } //debug(regexp) printProgram(program); // First character optimization firstc = 0; if (program[0] == REchar) { firstc = program[1]; if (attributes & REA.ignoreCase && isalpha(firstc)) firstc = 0; } for (si = startindex; ; si++) { if (firstc) { if (si == input.length) break; // no match if (input[si] != firstc) { si++; if (!chr(si, firstc)) // if first character not found break; // no match } } for (int i = 0; i < re_nsub + 1; i++) { pmatch[i].rm_so = -1; pmatch[i].rm_eo = -1; } src_start = src = si; if (trymatch(0, program.length)) { pmatch[0].rm_so = si; pmatch[0].rm_eo = src; //debug(regexp) printf("start = %d, end = %d\n", gmatch.rm_so, gmatch.rm_eo); return 1; } // If possible match must start at beginning, we are done if (program[0] == REbol || program[0] == REanystar) { if (attributes & REA.multiline) { // Scan for the next \n if (!chr(si, '\n')) break; // no match if '\n' not found } else break; } if (si == input.length) break; //debug(regexp) printf("Starting new try: '%.*s'\n", input[si + 1 .. input.length]); } return 0; // no match } int chr(inout uint si, rchar c) { for (; si < input.length; si++) { if (input[si] == c) return 1; } return 0; } void printProgram(ubyte[] prog) { //debug(regexp) { uint pc; uint len; uint n; uint m; ushort *pu; uint *puint; printf("printProgram()\n"); for (pc = 0; pc < prog.length; ) { printf("%3d: ", pc); //printf("prog[pc] = %d, REchar = %d, REnmq = %d\n", prog[pc], REchar, REnmq); switch (prog[pc]) { case REchar: printf("\tREchar '%c'\n", prog[pc + 1]); pc += 1 + char.sizeof; break; case REichar: printf("\tREichar '%c'\n", prog[pc + 1]); pc += 1 + char.sizeof; break; case REdchar: printf("\tREdchar '%c'\n", *cast(dchar *)&prog[pc + 1]); pc += 1 + dchar.sizeof; break; case REidchar: printf("\tREidchar '%c'\n", *cast(dchar *)&prog[pc + 1]); pc += 1 + dchar.sizeof; break; case REanychar: printf("\tREanychar\n"); pc++; break; case REstring: len = *cast(uint *)&prog[pc + 1]; printf("\tREstring x%x, '%.*s'\n", len, (&prog[pc + 1 + uint.sizeof])[0 .. len]); pc += 1 + uint.sizeof + len * rchar.sizeof; break; case REistring: len = *cast(uint *)&prog[pc + 1]; printf("\tREistring x%x, '%.*s'\n", len, (&prog[pc + 1 + uint.sizeof])[0 .. len]); pc += 1 + uint.sizeof + len * rchar.sizeof; break; case REtestbit: pu = cast(ushort *)&prog[pc + 1]; printf("\tREtestbit %d, %d\n", pu[0], pu[1]); len = pu[1]; pc += 1 + 2 * ushort.sizeof + len; break; case REbit: pu = cast(ushort *)&prog[pc + 1]; len = pu[1]; printf("\tREbit cmax=%02x, len=%d:", pu[0], len); for (n = 0; n < len; n++) printf(" %02x", prog[pc + 1 + 2 * ushort.sizeof + n]); printf("\n"); pc += 1 + 2 * ushort.sizeof + len; break; case REnotbit: pu = cast(ushort *)&prog[pc + 1]; printf("\tREnotbit %d, %d\n", pu[0], pu[1]); len = pu[1]; pc += 1 + 2 * ushort.sizeof + len; break; case RErange: len = *cast(uint *)&prog[pc + 1]; printf("\tRErange %d\n", len); // BUG: REAignoreCase? pc += 1 + uint.sizeof + len; break; case REnotrange: len = *cast(uint *)&prog[pc + 1]; printf("\tREnotrange %d\n", len); // BUG: REAignoreCase? pc += 1 + uint.sizeof + len; break; case REbol: printf("\tREbol\n"); pc++; break; case REeol: printf("\tREeol\n"); pc++; break; case REor: len = *cast(uint *)&prog[pc + 1]; printf("\tREor %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len); pc += 1 + uint.sizeof; break; case REgoto: len = *cast(uint *)&prog[pc + 1]; printf("\tREgoto %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len); pc += 1 + uint.sizeof; break; case REanystar: printf("\tREanystar\n"); pc++; break; case REnm: case REnmq: // len, n, m, () puint = cast(uint *)&prog[pc + 1]; len = puint[0]; n = puint[1]; m = puint[2]; printf("\tREnm%.*s len=%d, n=%u, m=%u, pc=>%d\n", (prog[pc] == REnmq) ? "q" : " ", len, n, m, pc + 1 + uint.sizeof * 3 + len); pc += 1 + uint.sizeof * 3; break; case REparen: // len, n, () puint = cast(uint *)&prog[pc + 1]; len = puint[0]; n = puint[1]; printf("\tREparen len=%d n=%d, pc=>%d\n", len, n, pc + 1 + uint.sizeof * 2 + len); pc += 1 + uint.sizeof * 2; break; case REend: printf("\tREend\n"); return; case REwordboundary: printf("\tREwordboundary\n"); pc++; break; case REnotwordboundary: printf("\tREnotwordboundary\n"); pc++; break; case REdigit: printf("\tREdigit\n"); pc++; break; case REnotdigit: printf("\tREnotdigit\n"); pc++; break; case REspace: printf("\tREspace\n"); pc++; break; case REnotspace: printf("\tREnotspace\n"); pc++; break; case REword: printf("\tREword\n"); pc++; break; case REnotword: printf("\tREnotword\n"); pc++; break; case REbackref: printf("\tREbackref %d\n", prog[1]); pc += 2; break; default: assert(0); } } } } /************************************************** * Match input against a section of the program[]. * Returns: * 1 if successful match * 0 no match */ int trymatch(int pc, int pcend) { int srcsave; uint len; uint n; uint m; uint count; uint pop; uint ss; regmatch_t *psave; uint c1; uint c2; ushort* pu; uint* puint; debug(regexp) printf("RegExp.trymatch(pc = %d, src = '%.*s', pcend = %d)\n", pc, input[src .. input.length], pcend); srcsave = src; psave = null; for (;;) { if (pc == pcend) // if done matching { debug(regex) printf("\tprogend\n"); return 1; } //printf("\top = %d\n", program[pc]); switch (program[pc]) { case REchar: if (src == input.length) goto Lnomatch; debug(regexp) printf("\tREchar '%c', src = '%c'\n", program[pc + 1], input[src]); if (program[pc + 1] != input[src]) goto Lnomatch; src++; pc += 1 + char.sizeof; break; case REichar: if (src == input.length) goto Lnomatch; debug(regexp) printf("\tREichar '%c', src = '%c'\n", program[pc + 1], input[src]); c1 = program[pc + 1]; c2 = input[src]; if (c1 != c2) { if (islower(cast(rchar)c2)) c2 = std.ctype.toupper(cast(rchar)c2); else goto Lnomatch; if (c1 != c2) goto Lnomatch; } src++; pc += 1 + char.sizeof; break; case REdchar: debug(regexp) printf("\tREdchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]); if (src == input.length) goto Lnomatch; if (*(cast(dchar *)&program[pc + 1]) != input[src]) goto Lnomatch; src++; pc += 1 + dchar.sizeof; break; case REidchar: debug(regexp) printf("\tREidchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]); if (src == input.length) goto Lnomatch; c1 = *(cast(dchar *)&program[pc + 1]); c2 = input[src]; if (c1 != c2) { if (islower(cast(rchar)c2)) c2 = std.ctype.toupper(cast(rchar)c2); else goto Lnomatch; if (c1 != c2) goto Lnomatch; } src++; pc += 1 + dchar.sizeof; break; case REanychar: debug(regexp) printf("\tREanychar\n"); if (src == input.length) goto Lnomatch; if (!(attributes & REA.dotmatchlf) && input[src] == cast(rchar)'\n') goto Lnomatch; src++; pc++; break; case REstring: len = *cast(uint *)&program[pc + 1]; debug(regexp) printf("\tREstring x%x, '%.*s'\n", len, (&program[pc + 1 + uint.sizeof])[0 .. len]); if (src + len > input.length) goto Lnomatch; if (memcmp(&program[pc + 1 + uint.sizeof], &input[src], len * rchar.sizeof)) goto Lnomatch; src += len; pc += 1 + uint.sizeof + len * rchar.sizeof; break; case REistring: len = *cast(uint *)&program[pc + 1]; debug(regexp) printf("\tREistring x%x, '%.*s'\n", len, (&program[pc + 1 + uint.sizeof])[0 .. len]); if (src + len > input.length) goto Lnomatch; version (Win32) { if (memicmp(cast(char*)&program[pc + 1 + uint.sizeof], &input[src], len * rchar.sizeof)) goto Lnomatch; } else { if (icmp((cast(char*)&program[pc + 1 + uint.sizeof])[0..len], input[src .. src + len])) goto Lnomatch; } src += len; pc += 1 + uint.sizeof + len * rchar.sizeof; break; case REtestbit: pu = (cast(ushort *)&program[pc + 1]); debug(regexp) printf("\tREtestbit %d, %d, '%c', x%02x\n", pu[0], pu[1], input[src], input[src]); if (src == input.length) goto Lnomatch; len = pu[1]; c1 = input[src]; //printf("[x%02x]=x%02x, x%02x\n", c1 >> 3, ((&program[pc + 1 + 4])[c1 >> 3] ), (1 << (c1 & 7))); if (c1 <= pu[0] && !((&(program[pc + 1 + 4]))[c1 >> 3] & (1 << (c1 & 7)))) goto Lnomatch; pc += 1 + 2 * ushort.sizeof + len; break; case REbit: pu = (cast(ushort *)&program[pc + 1]); debug(regexp) printf("\tREbit %d, %d, '%c'\n", pu[0], pu[1], input[src]); if (src == input.length) goto Lnomatch; len = pu[1]; c1 = input[src]; if (c1 > pu[0]) goto Lnomatch; if (!((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7)))) goto Lnomatch; src++; pc += 1 + 2 * ushort.sizeof + len; break; case REnotbit: pu = (cast(ushort *)&program[pc + 1]); debug(regexp) printf("\tREnotbit %d, %d, '%c'\n", pu[0], pu[1], input[src]); if (src == input.length) goto Lnomatch; len = pu[1]; c1 = input[src]; if (c1 <= pu[0] && ((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7)))) goto Lnomatch; src++; pc += 1 + 2 * ushort.sizeof + len; break; case RErange: len = *cast(uint *)&program[pc + 1]; debug(regexp) printf("\tRErange %d\n", len); if (src == input.length) goto Lnomatch; // BUG: REA.ignoreCase? if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) == null) goto Lnomatch; src++; pc += 1 + uint.sizeof + len; break; case REnotrange: len = *cast(uint *)&program[pc + 1]; debug(regexp) printf("\tREnotrange %d\n", len); if (src == input.length) goto Lnomatch; // BUG: REA.ignoreCase? if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) != null) goto Lnomatch; src++; pc += 1 + uint.sizeof + len; break; case REbol: debug(regexp) printf("\tREbol\n"); if (src == 0) { } else if (attributes & REA.multiline) { if (input[src - 1] != '\n') goto Lnomatch; } else goto Lnomatch; pc++; break; case REeol: debug(regexp) printf("\tREeol\n"); if (src == input.length) { } else if (attributes & REA.multiline && input[src] == '\n') src++; else goto Lnomatch; pc++; break; case REor: len = (cast(uint *)&program[pc + 1])[0]; debug(regexp) printf("\tREor %d\n", len); pop = pc + 1 + uint.sizeof; ss = src; if (trymatch(pop, pcend)) { if (pcend != program.length) { int s; s = src; if (trymatch(pcend, program.length)) { debug(regexp) printf("\tfirst operand matched\n"); src = s; return 1; } else { // If second branch doesn't match to end, take first anyway src = ss; if (!trymatch(pop + len, program.length)) { debug(regexp) printf("\tfirst operand matched\n"); src = s; return 1; } } src = ss; } else { debug(regexp) printf("\tfirst operand matched\n"); return 1; } } pc = pop + len; // proceed with 2nd branch break; case REgoto: debug(regexp) printf("\tREgoto\n"); len = (cast(uint *)&program[pc + 1])[0]; pc += 1 + uint.sizeof + len; break; case REanystar: debug(regexp) printf("\tREanystar\n"); pc++; for (;;) { int s1; int s2; s1 = src; if (src == input.length) break; if (!(attributes & REA.dotmatchlf) && input[src] == '\n') break; src++; s2 = src; // If no match after consumption, but it // did match before, then no match if (!trymatch(pc, program.length)) { src = s1; // BUG: should we save/restore pmatch[]? if (trymatch(pc, program.length)) { src = s1; // no match break; } } src = s2; } break; case REnm: case REnmq: // len, n, m, () puint = cast(uint *)&program[pc + 1]; len = puint[0]; n = puint[1]; m = puint[2]; debug(regexp) printf("\tREnm%s len=%d, n=%u, m=%u\n", (program[pc] == REnmq) ? cast(char*)"q" : cast(char*)"", len, n, m); pop = pc + 1 + uint.sizeof * 3; for (count = 0; count < n; count++) { if (!trymatch(pop, pop + len)) goto Lnomatch; } if (!psave && count < m) { //version (Win32) psave = cast(regmatch_t *)alloca((re_nsub + 1) * regmatch_t.sizeof); //else //psave = new regmatch_t[re_nsub + 1]; } if (program[pc] == REnmq) // if minimal munch { for (; count < m; count++) { int s1; memcpy(psave, pmatch, (re_nsub + 1) * regmatch_t.sizeof); s1 = src; if (trymatch(pop + len, program.length)) { src = s1; memcpy(pmatch, psave, (re_nsub + 1) * regmatch_t.sizeof); break; } if (!trymatch(pop, pop + len)) { debug(regexp) printf("\tdoesn't match subexpression\n"); break; } // If source is not consumed, don't // infinite loop on the match if (s1 == src) { debug(regexp) printf("\tsource is not consumed\n"); break; } } } else // maximal munch { for (; count < m; count++) { int s1; int s2; memcpy(psave, pmatch, (re_nsub + 1) * regmatch_t.sizeof); s1 = src; if (!trymatch(pop, pop + len)) { debug(regexp) printf("\tdoesn't match subexpression\n"); break; } s2 = src; // If source is not consumed, don't // infinite loop on the match if (s1 == s2) { debug(regexp) printf("\tsource is not consumed\n"); break; } // If no match after consumption, but it // did match before, then no match if (!trymatch(pop + len, program.length)) { src = s1; if (trymatch(pop + len, program.length)) { src = s1; // no match memcpy(pmatch, psave, (re_nsub + 1) * regmatch_t.sizeof); break; } } src = s2; } } debug(regexp) printf("\tREnm len=%d, n=%u, m=%u, DONE count=%d\n", len, n, m, count); pc = pop + len; break; case REparen: // len, () debug(regexp) printf("\tREparen\n"); puint = cast(uint *)&program[pc + 1]; len = puint[0]; n = puint[1]; pop = pc + 1 + uint.sizeof * 2; ss = src; if (!trymatch(pop, pop + len)) goto Lnomatch; pmatch[n + 1].rm_so = ss; pmatch[n + 1].rm_eo = src; pc = pop + len; break; case REend: debug(regexp) printf("\tREend\n"); return 1; // successful match case REwordboundary: debug(regexp) printf("\tREwordboundary\n"); if (src > 0 && src < input.length) { c1 = input[src - 1]; c2 = input[src]; if (!( (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) || (!isword(cast(rchar)c1) && isword(cast(rchar)c2)) ) ) goto Lnomatch; } pc++; break; case REnotwordboundary: debug(regexp) printf("\tREnotwordboundary\n"); if (src == 0 || src == input.length) goto Lnomatch; c1 = input[src - 1]; c2 = input[src]; if ( (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) || (!isword(cast(rchar)c1) && isword(cast(rchar)c2)) ) goto Lnomatch; pc++; break; case REdigit: debug(regexp) printf("\tREdigit\n"); if (src == input.length) goto Lnomatch; if (!isdigit(input[src])) goto Lnomatch; src++; pc++; break; case REnotdigit: debug(regexp) printf("\tREnotdigit\n"); if (src == input.length) goto Lnomatch; if (isdigit(input[src])) goto Lnomatch; src++; pc++; break; case REspace: debug(regexp) printf("\tREspace\n"); if (src == input.length) goto Lnomatch; if (!isspace(input[src])) goto Lnomatch; src++; pc++; break; case REnotspace: debug(regexp) printf("\tREnotspace\n"); if (src == input.length) goto Lnomatch; if (isspace(input[src])) goto Lnomatch; src++; pc++; break; case REword: debug(regexp) printf("\tREword\n"); if (src == input.length) goto Lnomatch; if (!isword(input[src])) goto Lnomatch; src++; pc++; break; case REnotword: debug(regexp) printf("\tREnotword\n"); if (src == input.length) goto Lnomatch; if (isword(input[src])) goto Lnomatch; src++; pc++; break; case REbackref: { n = program[pc + 1]; debug(regexp) printf("\tREbackref %d\n", n); int so = pmatch[n + 1].rm_so; int eo = pmatch[n + 1].rm_eo; len = eo - so; if (src + len > input.length) goto Lnomatch; else if (attributes & REA.ignoreCase) { if (icmp(input[src .. src + len], input[so .. eo])) goto Lnomatch; } else if (memcmp(&input[src], &input[so], len * rchar.sizeof)) goto Lnomatch; src += len; pc += 2; break; } default: assert(0); } } Lnomatch: debug(regexp) printf("\tnomatch pc=%d\n", pc); src = srcsave; return 0; } /* =================== Compiler ================== */ int parseRegexp() { uint offset; uint gotooffset; uint len1; uint len2; //printf("parseRegexp() '%.*s'\n", pattern[p .. pattern.length]); offset = buf.offset; for (;;) { assert(p <= pattern.length); if (p == pattern.length) { buf.write(REend); return 1; } switch (pattern[p]) { case ')': return 1; case '|': p++; gotooffset = buf.offset; buf.write(REgoto); buf.write(cast(uint)0); len1 = buf.offset - offset; buf.spread(offset, 1 + uint.sizeof); gotooffset += 1 + uint.sizeof; parseRegexp(); len2 = buf.offset - (gotooffset + 1 + uint.sizeof); buf.data[offset] = REor; (cast(uint *)&buf.data[offset + 1])[0] = len1; (cast(uint *)&buf.data[gotooffset + 1])[0] = len2; break; default: parsePiece(); break; } } } int parsePiece() { uint offset; uint len; uint n; uint m; ubyte op; int plength = pattern.length; //printf("parsePiece() '%.*s'\n", pattern[p .. pattern.length]); offset = buf.offset; parseAtom(); if (p == plength) return 1; switch (pattern[p]) { case '*': // Special optimization: replace .* with REanystar if (buf.offset - offset == 1 && buf.data[offset] == REanychar && p + 1 < plength && pattern[p + 1] != '?') { buf.data[offset] = REanystar; p++; break; } n = 0; m = inf; goto Lnm; case '+': n = 1; m = inf; goto Lnm; case '?': n = 0; m = 1; goto Lnm; case '{': // {n} {n,} {n,m} p++; if (p == plength || !isdigit(pattern[p])) goto Lerr; n = 0; do { // BUG: handle overflow n = n * 10 + pattern[p] - '0'; p++; if (p == plength) goto Lerr; } while (isdigit(pattern[p])); if (pattern[p] == '}') // {n} { m = n; goto Lnm; } if (pattern[p] != ',') goto Lerr; p++; if (p == plength) goto Lerr; if (pattern[p] == /*{*/ '}') // {n,} { m = inf; goto Lnm; } if (!isdigit(pattern[p])) goto Lerr; m = 0; // {n,m} do { // BUG: handle overflow m = m * 10 + pattern[p] - '0'; p++; if (p == plength) goto Lerr; } while (isdigit(pattern[p])); if (pattern[p] != /*{*/ '}') goto Lerr; goto Lnm; Lnm: p++; op = REnm; if (p < plength && pattern[p] == '?') { op = REnmq; // minimal munch version p++; } len = buf.offset - offset; buf.spread(offset, 1 + uint.sizeof * 3); buf.data[offset] = op; uint* puint = cast(uint *)&buf.data[offset + 1]; puint[0] = len; puint[1] = n; puint[2] = m; break; default: break; } return 1; Lerr: error("badly formed {n,m}"); } int parseAtom() { ubyte op; uint offset; rchar c; //printf("parseAtom() '%.*s'\n", pattern[p .. pattern.length]); if (p < pattern.length) { c = pattern[p]; switch (c) { case '*': case '+': case '?': error("*+? not allowed in atom"); p++; return 0; case '(': p++; buf.write(REparen); offset = buf.offset; buf.write(cast(uint)0); // reserve space for length buf.write(re_nsub); re_nsub++; parseRegexp(); *cast(uint *)&buf.data[offset] = buf.offset - (offset + uint.sizeof * 2); if (p == pattern.length || pattern[p] != ')') { error("')' expected"); return 0; } p++; break; case '[': if (!parseRange()) return 0; break; case '.': p++; buf.write(REanychar); break; case '^': p++; buf.write(REbol); break; case '$': p++; buf.write(REeol); break; case '\\': p++; if (p == pattern.length) { error("no character past '\\'"); return 0; } c = pattern[p]; switch (c) { case 'b': op = REwordboundary; goto Lop; case 'B': op = REnotwordboundary; goto Lop; case 'd': op = REdigit; goto Lop; case 'D': op = REnotdigit; goto Lop; case 's': op = REspace; goto Lop; case 'S': op = REnotspace; goto Lop; case 'w': op = REword; goto Lop; case 'W': op = REnotword; goto Lop; Lop: buf.write(op); p++; break; case 'f': case 'n': case 'r': case 't': case 'v': case 'c': case 'x': case 'u': case '0': c = escape(); goto Lbyte; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': c -= '1'; if (c < re_nsub) { buf.write(REbackref); buf.write(cast(ubyte)c); } else { error("no matching back reference"); return 0; } p++; break; default: p++; goto Lbyte; } break; default: p++; Lbyte: op = REchar; if (attributes & REA.ignoreCase) { if (isalpha(c)) { op = REichar; c = std.ctype.toupper(c); } } if (op == REchar && c <= 0xFF) { // Look ahead and see if we can make this into // an REstring int q; int len; for (q = p; q < pattern.length; ++q) { rchar qc = pattern[q]; switch (qc) { case '{': case '*': case '+': case '?': if (q == p) goto Lchar; q--; break; case '(': case ')': case '|': case '[': case ']': case '.': case '^': case '$': case '\\': case '}': break; default: continue; } break; } len = q - p; if (len > 0) { debug(regexp) printf("writing string len %d, c = '%c', pattern[p] = '%c'\n", len+1, c, pattern[p]); buf.reserve(5 + (1 + len) * rchar.sizeof); buf.write((attributes & REA.ignoreCase) ? REistring : REstring); buf.write(len + 1); buf.write(c); buf.write(pattern[p .. p + len]); p = q; break; } } if (c >= 0x80) { // Convert to dchar opcode op = (op == REchar) ? REdchar : REidchar; buf.write(op); buf.write(c); } else { Lchar: debug(regexp) printf("It's an REchar '%c'\n", c); buf.write(op); buf.write(cast(char)c); } break; } } return 1; } private: class Range { uint maxc; uint maxb; OutBuffer buf; ubyte* base; BitArray bits; this(OutBuffer buf) { this.buf = buf; if (buf.data.length) this.base = &buf.data[buf.offset]; } void setbitmax(uint u) { uint b; //printf("setbitmax(x%x), maxc = x%x\n", u, maxc); if (u > maxc) { maxc = u; b = u / 8; if (b >= maxb) { uint u; u = base ? base - &buf.data[0] : 0; buf.fill0(b - maxb + 1); base = &buf.data[u]; maxb = b + 1; //bits = (cast(bit*)this.base)[0 .. maxc + 1]; bits.ptr = cast(uint*)this.base; } bits.len = maxc + 1; } } void setbit2(uint u) { setbitmax(u + 1); //printf("setbit2 [x%02x] |= x%02x\n", u >> 3, 1 << (u & 7)); bits[u] = 1; } }; int parseRange() { ubyte op; int c; int c2; uint i; uint cmax; uint offset; cmax = 0x7F; p++; op = REbit; if (p == pattern.length) goto Lerr; if (pattern[p] == '^') { p++; op = REnotbit; if (p == pattern.length) goto Lerr; } buf.write(op); offset = buf.offset; buf.write(cast(uint)0); // reserve space for length buf.reserve(128 / 8); Range r = new Range(buf); if (op == REnotbit) r.setbit2(0); switch (pattern[p]) { case ']': case '-': c = pattern[p]; p++; r.setbit2(c); break; default: break; } enum RS { start, rliteral, dash }; RS rs; rs = RS.start; for (;;) { if (p == pattern.length) goto Lerr; switch (pattern[p]) { case ']': switch (rs) { case RS.dash: r.setbit2('-'); case RS.rliteral: r.setbit2(c); break; case RS.start: break; } p++; break; case '\\': p++; r.setbitmax(cmax); if (p == pattern.length) goto Lerr; switch (pattern[p]) { case 'd': for (i = '0'; i <= '9'; i++) r.bits[i] = 1; goto Lrs; case 'D': for (i = 1; i < '0'; i++) r.bits[i] = 1; for (i = '9' + 1; i <= cmax; i++) r.bits[i] = 1; goto Lrs; case 's': for (i = 0; i <= cmax; i++) if (isspace(i)) r.bits[i] = 1; goto Lrs; case 'S': for (i = 1; i <= cmax; i++) if (!isspace(i)) r.bits[i] = 1; goto Lrs; case 'w': for (i = 0; i <= cmax; i++) if (isword(cast(rchar)i)) r.bits[i] = 1; goto Lrs; case 'W': for (i = 1; i <= cmax; i++) if (!isword(cast(rchar)i)) r.bits[i] = 1; goto Lrs; Lrs: switch (rs) { case RS.dash: r.setbit2('-'); case RS.rliteral: r.setbit2(c); break; default: break; } rs = RS.start; continue; default: break; } c2 = escape(); goto Lrange; case '-': p++; if (rs == RS.start) goto Lrange; else if (rs == RS.rliteral) rs = RS.dash; else if (rs == RS.dash) { r.setbit2(c); r.setbit2('-'); rs = RS.start; } continue; default: c2 = pattern[p]; p++; Lrange: switch (rs) { case RS.rliteral: r.setbit2(c); case RS.start: c = c2; rs = RS.rliteral; break; case RS.dash: if (c > c2) { error("inverted range in character class"); return 0; } r.setbitmax(c2); //printf("c = %x, c2 = %x\n",c,c2); for (; c <= c2; c++) r.bits[c] = 1; rs = RS.start; break; } continue; } break; } //printf("maxc = %d, maxb = %d\n",r.maxc,r.maxb); (cast(ushort *)&buf.data[offset])[0] = cast(ushort)r.maxc; (cast(ushort *)&buf.data[offset])[1] = cast(ushort)r.maxb; if (attributes & REA.ignoreCase) { // BUG: what about dchar? r.setbitmax(0x7F); for (c = 'a'; c <= 'z'; c++) { if (r.bits[c]) r.bits[c + 'A' - 'a'] = 1; else if (r.bits[c + 'A' - 'a']) r.bits[c] = 1; } } return 1; Lerr: error("invalid range"); return 0; } void error(char[] msg) { errors++; debug(regexp) printf("error: %.*s\n", msg); //assert(0); //*(char*)0=0; throw new RegExpException(msg); } // p is following the \ char int escape() in { assert(p < pattern.length); } body { int c; int i; rchar tc; c = pattern[p]; // none of the cases are multibyte switch (c) { case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'v': c = '\v'; break; // BUG: Perl does \a and \e too, should we? case 'c': ++p; if (p == pattern.length) goto Lretc; c = pattern[p]; // Note: we are deliberately not allowing dchar letters if (!(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))) { Lcerr: error("letter expected following \\c"); return 0; } c &= 0x1F; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': c -= '0'; for (i = 0; i < 2; i++) { p++; if (p == pattern.length) goto Lretc; tc = pattern[p]; if ('0' <= tc && tc <= '7') { c = c * 8 + (tc - '0'); // Treat overflow as if last // digit was not an octal digit if (c >= 0xFF) { c >>= 3; return c; } } else return c; } break; case 'x': c = 0; for (i = 0; i < 2; i++) { p++; if (p == pattern.length) goto Lretc; tc = pattern[p]; if ('0' <= tc && tc <= '9') c = c * 16 + (tc - '0'); else if ('a' <= tc && tc <= 'f') c = c * 16 + (tc - 'a' + 10); else if ('A' <= tc && tc <= 'F') c = c * 16 + (tc - 'A' + 10); else if (i == 0) // if no hex digits after \x { // Not a valid \xXX sequence return 'x'; } else return c; } break; case 'u': c = 0; for (i = 0; i < 4; i++) { p++; if (p == pattern.length) goto Lretc; tc = pattern[p]; if ('0' <= tc && tc <= '9') c = c * 16 + (tc - '0'); else if ('a' <= tc && tc <= 'f') c = c * 16 + (tc - 'a' + 10); else if ('A' <= tc && tc <= 'F') c = c * 16 + (tc - 'A' + 10); else { // Not a valid \uXXXX sequence p -= i; return 'u'; } } break; default: break; } p++; Lretc: return c; } /* ==================== optimizer ======================= */ void optimize() { ubyte[] prog; int i; debug(regexp) printf("RegExp.optimize()\n"); prog = buf.toBytes(); for (i = 0; 1;) { //printf("\tprog[%d] = %d, %d\n", i, prog[i], REstring); switch (prog[i]) { case REend: case REanychar: case REanystar: case REbackref: case REeol: case REchar: case REichar: case REdchar: case REidchar: case REstring: case REistring: case REtestbit: case REbit: case REnotbit: case RErange: case REnotrange: case REwordboundary: case REnotwordboundary: case REdigit: case REnotdigit: case REspace: case REnotspace: case REword: case REnotword: return; case REbol: i++; continue; case REor: case REnm: case REnmq: case REparen: case REgoto: { OutBuffer bitbuf = new OutBuffer; Range r = new Range(bitbuf); uint offset; offset = i; if (starrchars(r, prog[i .. prog.length])) { debug(regexp) printf("\tfilter built\n"); buf.spread(offset, 1 + 4 + r.maxb); buf.data[offset] = REtestbit; (cast(ushort *)&buf.data[offset + 1])[0] = cast(ushort)r.maxc; (cast(ushort *)&buf.data[offset + 1])[1] = cast(ushort)r.maxb; i = offset + 1 + 4; buf.data[i .. i + r.maxb] = r.base[0 .. r.maxb]; } return; } default: assert(0); } } } ///////////////////////////////////////// // OR the leading character bits into r. // Limit the character range from 0..7F, // trymatch() will allow through anything over maxc. // Return 1 if success, 0 if we can't build a filter or // if there is no point to one. int starrchars(Range r, ubyte[] prog) { rchar c; uint maxc; uint maxb; uint len; uint b; uint n; uint m; ubyte* pop; int i; //printf("RegExp.starrchars(prog = %p, progend = %p)\n", prog, progend); for (i = 0; i < prog.length;) { switch (prog[i]) { case REchar: c = prog[i + 1]; if (c <= 0x7F) r.setbit2(c); return 1; case REichar: c = prog[i + 1]; if (c <= 0x7F) { r.setbit2(c); r.setbit2(std.ctype.tolower(cast(rchar)c)); } return 1; case REdchar: case REidchar: return 1; case REanychar: return 0; // no point case REstring: len = *cast(uint *)&prog[i + 1]; assert(len); c = *cast(rchar *)&prog[i + 1 + uint.sizeof]; debug(regexp) printf("\tREstring %d, '%c'\n", len, c); if (c <= 0x7F) r.setbit2(c); return 1; case REistring: len = *cast(uint *)&prog[i + 1]; assert(len); c = *cast(rchar *)&prog[i + 1 + uint.sizeof]; debug(regexp) printf("\tREistring %d, '%c'\n", len, c); if (c <= 0x7F) { r.setbit2(std.ctype.toupper(cast(rchar)c)); r.setbit2(std.ctype.tolower(cast(rchar)c)); } return 1; case REtestbit: case REbit: maxc = (cast(ushort *)&prog[i + 1])[0]; maxb = (cast(ushort *)&prog[i + 1])[1]; if (maxc <= 0x7F) r.setbitmax(maxc); else maxb = r.maxb; for (b = 0; b < maxb; b++) r.base[b] |= prog[i + 1 + 4 + b]; return 1; case REnotbit: maxc = (cast(ushort *)&prog[i + 1])[0]; maxb = (cast(ushort *)&prog[i + 1])[1]; if (maxc <= 0x7F) r.setbitmax(maxc); else maxb = r.maxb; for (b = 0; b < maxb; b++) r.base[b] |= ~prog[i + 1 + 4 + b]; return 1; case REbol: case REeol: return 0; case REor: len = (cast(uint *)&prog[i + 1])[0]; return starrchars(r, prog[i + 1 + uint.sizeof .. prog.length]) && starrchars(r, prog[i + 1 + uint.sizeof + len .. prog.length]); case REgoto: len = (cast(uint *)&prog[i + 1])[0]; i += 1 + uint.sizeof + len; break; case REanystar: return 0; case REnm: case REnmq: // len, n, m, () len = (cast(uint *)&prog[i + 1])[0]; n = (cast(uint *)&prog[i + 1])[1]; m = (cast(uint *)&prog[i + 1])[2]; pop = &prog[i + 1 + uint.sizeof * 3]; if (!starrchars(r, pop[0 .. len])) return 0; if (n) return 1; i += 1 + uint.sizeof * 3 + len; break; case REparen: // len, () len = (cast(uint *)&prog[i + 1])[0]; n = (cast(uint *)&prog[i + 1])[1]; pop = &prog[0] + i + 1 + uint.sizeof * 2; return starrchars(r, pop[0 .. len]); case REend: return 0; case REwordboundary: case REnotwordboundary: return 0; case REdigit: r.setbitmax('9'); for (c = '0'; c <= '9'; c++) r.bits[c] = 1; return 1; case REnotdigit: r.setbitmax(0x7F); for (c = 0; c <= '0'; c++) r.bits[c] = 1; for (c = '9' + 1; c <= r.maxc; c++) r.bits[c] = 1; return 1; case REspace: r.setbitmax(0x7F); for (c = 0; c <= r.maxc; c++) if (isspace(c)) r.bits[c] = 1; return 1; case REnotspace: r.setbitmax(0x7F); for (c = 0; c <= r.maxc; c++) if (!isspace(c)) r.bits[c] = 1; return 1; case REword: r.setbitmax(0x7F); for (c = 0; c <= r.maxc; c++) if (isword(cast(rchar)c)) r.bits[c] = 1; return 1; case REnotword: r.setbitmax(0x7F); for (c = 0; c <= r.maxc; c++) if (!isword(cast(rchar)c)) r.bits[c] = 1; return 1; case REbackref: return 0; } } return 1; } /* ==================== replace ======================= */ /*********************** * After a match is found with test(), this function * will take the match results and, using the format * string, generate and return a new string. * The format string has the formatting characters: * $(TR $(TH Format) $(TH Replaced With)) $(TR $(TD $(B $$)) $(TD $) ) $(TR $(TD $(B $&)) $(TD The matched substring.) ) $(TR $(TD $(B $`)) $(TD The portion of string that precedes the matched substring.) ) $(TR $(TD $(B $')) $(TD The portion of string that follows the matched substring.) ) $(TR $(TD $(B $n)) $(TD The nth capture, where n is a single digit 1-9 and $n is not followed by a decimal digit.) ) $(TR $(TD $(B $nn)) $(TD The nnth capture, where nn is a two-digit decimal number 01-99. If nnth capture is undefined or more than the number of parenthesized subexpressions, use the empty string instead.) )
Any other $ are left as is. */ public rchar[] replace(rchar[] format) { return replace3(format, input, pmatch[0 .. re_nsub + 1]); } // Static version that doesn't require a RegExp object to be created private static rchar[] replace3(rchar[] format, rchar[] input, regmatch_t[] pmatch) { OutBuffer buf; rchar[] result; rchar c; uint c2; int rm_so; int rm_eo; int i; int f; // printf("replace3(format = '%.*s', input = '%.*s')\n", format, input); buf = new OutBuffer(); buf.reserve(format.length * rchar.sizeof); for (f = 0; f < format.length; f++) { c = format[f]; L1: if (c != '$') { buf.write(c); continue; } ++f; if (f == format.length) { buf.write(cast(rchar)'$'); break; } c = format[f]; switch (c) { case '&': rm_so = pmatch[0].rm_so; rm_eo = pmatch[0].rm_eo; goto Lstring; case '`': rm_so = 0; rm_eo = pmatch[0].rm_so; goto Lstring; case '\'': rm_so = pmatch[0].rm_eo; rm_eo = input.length; goto Lstring; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': i = c - '0'; if (f + 1 == format.length) { if (i == 0) { buf.write(cast(rchar)'$'); buf.write(c); continue; } } else { c2 = format[f + 1]; if (c2 >= '0' && c2 <= '9') { i = (c - '0') * 10 + (c2 - '0'); f++; } if (i == 0) { buf.write(cast(rchar)'$'); buf.write(c); c = c2; goto L1; } } if (i < pmatch.length) { rm_so = pmatch[i].rm_so; rm_eo = pmatch[i].rm_eo; goto Lstring; } break; Lstring: if (rm_so != rm_eo) buf.write(input[rm_so .. rm_eo]); break; default: buf.write(cast(rchar)'$'); buf.write(c); break; } } result = cast(rchar[])buf.toBytes(); return result; } /************************************ * Like replace(char[] format), but uses old style formatting:
Format Description
& replace with the match
\n replace with the nth parenthesized match, n is 1..9
\c replace with char c.
*/ public rchar[] replaceOld(rchar[] format) { OutBuffer buf; rchar[] result; rchar c; //printf("replace: this = %p so = %d, eo = %d\n", this, pmatch[0].rm_so, pmatch[0].rm_eo); //printf("3input = '%.*s'\n", input); buf = new OutBuffer(); buf.reserve(format.length * rchar.sizeof); for (uint i; i < format.length; i++) { c = format[i]; switch (c) { case '&': //printf("match = '%.*s'\n", input[pmatch[0].rm_so .. pmatch[0].rm_eo]); buf.write(input[pmatch[0].rm_so .. pmatch[0].rm_eo]); break; case '\\': if (i + 1 < format.length) { c = format[++i]; if (c >= '1' && c <= '9') { uint i; i = c - '0'; if (i <= re_nsub && pmatch[i].rm_so != pmatch[i].rm_eo) buf.write(input[pmatch[i].rm_so .. pmatch[i].rm_eo]); break; } } buf.write(c); break; default: buf.write(c); break; } } result = cast(rchar[])buf.toBytes(); return result; } }