From cf6ba6ebdad0ff768c6d7bcc9e0010e6dfeb5a18 Mon Sep 17 00:00:00 2001 From: jmdavis Date: Sat, 23 Feb 2013 18:07:54 -0800 Subject: [PATCH] Finally remove std.regexp. It's been deprecated for some time now, and we no longer have anything in Phobos which depends on it. So, it's long past time for it to be fully removed. --- index.d | 2 +- posix.mak | 2 +- std/regexp.d | 3434 -------------------------------------------------- unittest.d | 4 +- win32.mak | 8 +- win64.mak | 8 +- 6 files changed, 8 insertions(+), 3450 deletions(-) delete mode 100644 std/regexp.d diff --git a/index.d b/index.d index ccb6d6694..8d9b2a295 100644 --- a/index.d +++ b/index.d @@ -190,7 +190,7 @@ $(V1
Recursively search file system and (currently Windows only) FTP sites. ) -
std.regexp +
std.regex
The usual regular expression functions.
std.socket diff --git a/posix.mak b/posix.mak index d393b09c9..e1754a60d 100644 --- a/posix.mak +++ b/posix.mak @@ -164,7 +164,7 @@ STD_MODULES = $(addprefix std/, algorithm array ascii base64 bigint \ cpuid cstream ctype csv datetime demangle encoding exception \ file format functional getopt json math mathspecial md5 \ metastrings mmfile numeric outbuffer parallelism path perf \ - process random range regex regexp signals socket socketstream \ + process random range regex signals socket socketstream \ stdint stdio stdiobase stream string syserror system traits \ typecons typetuple uni uri utf uuid variant xml zip zlib) diff --git a/std/regexp.d b/std/regexp.d deleted file mode 100644 index 87726cfb2..000000000 --- a/std/regexp.d +++ /dev/null @@ -1,3434 +0,0 @@ -// Written in the D programming language. -// Regular Expressions. - -/** - * $(RED Deprecated. - * Please use $(LINK2 std_regex.html, std.regex) instead.) - * - * $(LINK2 http://www.digitalmars.com/ctg/regular.html, Regular - * expressions) are a powerful method of string pattern matching. The - * regular expression language used in this library is the same as - * that commonly used, however, some of the very advanced forms may - * behave slightly differently. The standard observed is the $(WEB - * www.ecma-international.org/publications/standards/Ecma-262.htm, - * ECMA standard) for regular expressions. - * - * std.regexp is designed to work only with valid UTF strings as input. - * To validate untrusted input, use std.utf.validate(). - * - * In the following guide, $(I pattern)[] refers to a - * $(LINK2 http://www.digitalmars.com/ctg/regular.html, regular expression). - * The $(I attributes)[] refers to - * a string controlling the interpretation - * of the regular expression. - * It consists of a sequence of one or more - * of the following characters: - * - * - * - * $(TR $(TH Attribute) $(TH Action)) - * - * $(TD $(B g)) - * $(TD global; repeat over the whole input string) - * - * - * $(TD $(B i)) - * $(TD case insensitive) - * - * - * $(TD $(B m)) - * $(TD treat as multiple lines separated by newlines) - * - *
Attribute Characters
- * - * The $(I format)[] string has the formatting characters: - * - * - * - * $(TR $(TH Format) $(TH Replaced With)) - * $(TR - * $(TD $(B $$)) $(TD $) - * ) - * $(TR - * $(TD $(B $&)) $(TD The matched substring.) - * ) - * $(TR - * $(TD $(B $`)) $(TD The portion of string that precedes the matched substring.) - * ) - * $(TR - * $(TD $(B $')) $(TD The portion of string that follows the matched substring.) - * ) - * $(TR - * $(TD $(B $(DOLLAR))$(I n)) $(TD The $(I n)th capture, where $(I n) - * is a single digit 1-9 - * and $$(I n) is not followed by a decimal digit.) - * ) - * $(TR - * $(TD $(B $(DOLLAR))$(I nn)) $(TD The $(I nn)th capture, where $(I nn) - * is a two-digit decimal - * number 01-99. - * If $(I nn)th capture is undefined or more than the number - * of parenthesized subexpressions, use the empty - * string instead.) - * ) - *
Formatting Characters
- * - * Any other $ are left as is. - * - * References: - * $(LINK2 http://en.wikipedia.org/wiki/Regular_expressions, Wikipedia) - * Macros: - * WIKI = StdRegexp - * DOLLAR = $ - * - * Copyright: Copyright Digital Mars 2000 - 2011. - * License: Boost License 1.0. - * Authors: $(WEB digitalmars.com, Walter Bright) - * Source: $(PHOBOSSRC std/_regexp.d) - */ -/* Copyright Digital Mars 2000 - 2011. - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ - -/* - Escape sequences: - - \nnn starts out a 1, 2 or 3 digit octal sequence, - where n is an octal digit. If nnn is larger than - 0377, then the 3rd digit is not part of the sequence - and is not consumed. - For maximal portability, use exactly 3 digits. - - \xXX starts out a 1 or 2 digit hex sequence. X - is a hex character. If the first character after the \x - is not a hex character, the value of the sequence is 'x' - and the XX are not consumed. - For maximal portability, use exactly 2 digits. - - \uUUUU is a unicode sequence. There are exactly - 4 hex characters after the \u, if any are not, then - the value of the sequence is 'u', and the UUUU are not - consumed. - - Character classes: - - [a-b], where a is greater than b, will produce - an error. - - References: - - http://www.unicode.org/unicode/reports/tr18/ -*/ - -module std.regexp; - -pragma(msg, "Notice: As of Phobos 2.055, std.regexp has been deprecated. " ~ - "Please use std.regex instead."); - -//debug = regexp; // uncomment to turn on debugging printf's - -private -{ - import core.stdc.stdio; - import core.stdc.stdlib; - import core.stdc.string; - import std.algorithm; - import std.array; - import std.stdio; - import std.string; - import std.ascii; - import std.outbuffer; - import std.bitmanip; - import std.utf; - import std.algorithm; - import std.array; - import std.traits; -} - -deprecated: - -/** Regular expression to extract an _email address. - * References: - * $(LINK2 http://www.regular-expressions.info/email.html, How to Find or Validate an Email Address)$(BR) - * $(LINK2 http://tools.ietf.org/html/rfc2822#section-3.4.1, RFC 2822 Internet Message Format) - */ -string email = - r"[a-zA-Z]([.]?([[a-zA-Z0-9_]-]+)*)?@([[a-zA-Z0-9_]\-_]+\.)+[a-zA-Z]{2,6}"; - -/** Regular expression to extract a _url */ -string url = r"(([h|H][t|T]|[f|F])[t|T][p|P]([s|S]?)\:\/\/|~/|/)?([\w]+:\w+@)?(([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?)?((/?\w+/)+|/?)(\w+\.[\w]{3,4})?([,]\w+)*((\?\w+=\w+)?(&\w+=\w+)*([,]\w*)*)?"; - -/************************************ - * One of these gets thrown on compilation errors - */ - -class RegExpException : Exception -{ - this(string msg) - { - super(msg); - } -} - -struct regmatch_t -{ - ptrdiff_t rm_so; // index of start of match - ptrdiff_t rm_eo; // index past end of match -} - -private alias char rchar; // so we can make a wchar version - -/****************************************************** - * Search string for matches with regular expression - * pattern with attributes. - * Replace each match with string generated from format. - * Params: - * s = String to search. - * pattern = Regular expression pattern. - * format = Replacement string format. - * attributes = Regular expression attributes. - * Returns: - * the resulting string - * Example: - * Replace the letters 'a' with the letters 'ZZ'. - * --- - * s = "Strap a rocket engine on a chicken." - * sub(s, "a", "ZZ") // result: StrZZp a rocket engine on a chicken. - * sub(s, "a", "ZZ", "g") // result: StrZZp ZZ rocket engine on ZZ chicken. - * --- - * The replacement format can reference the matches using - * the $&, $$, $', $`, $0 .. $99 notation: - * --- - * sub(s, "[ar]", "[$&]", "g") // result: St[r][a]p [a] [r]ocket engine on [a] chi - * --- - */ - -string sub(string s, string pattern, string format, string attributes = null) -{ - auto r = new RegExp(pattern, attributes); - auto result = r.replace(s, format); - delete r; - return result; -} - -unittest -{ - debug(regexp) printf("regexp.sub.unittest\n"); - - string r = sub("hello", "ll", "ss"); - assert(r == "hesso"); -} - -/******************************************************* - * Search string for matches with regular expression - * pattern with attributes. - * Pass each match to delegate dg. - * Replace each match with the return value from dg. - * Params: - * s = String to search. - * pattern = Regular expression pattern. - * dg = Delegate - * attributes = Regular expression attributes. - * Returns: the resulting string. - * Example: - * Capitalize the letters 'a' and 'r': - * --- - * s = "Strap a rocket engine on a chicken."; - * sub(s, "[ar]", - * delegate char[] (RegExp m) - * { - * return toUpper(m[0]); - * }, - * "g"); // result: StRAp A Rocket engine on A chicken. - * --- - */ - -string sub(string s, string pattern, string delegate(RegExp) dg, string attributes = null) -{ - auto r = new RegExp(pattern, attributes); - - string result = s; - size_t lastindex = 0; - size_t offset = 0; - - while (r.test(s, lastindex)) - { - auto so = r.pmatch[0].rm_so; - auto eo = r.pmatch[0].rm_eo; - - string replacement = dg(r); - - // Optimize by using std.string.replace if possible - Dave Fladebo - string slice = result[offset + so .. offset + eo]; - if (r.attributes & RegExp.REA.global && // global, so replace all - !(r.attributes & RegExp.REA.ignoreCase) && // not ignoring case - !(r.attributes & RegExp.REA.multiline) && // not multiline - pattern == slice) // simple pattern (exact match, no special characters) - { - debug(regexp) - printf("result: %.*s, pattern: %.*s, slice: %.*s, replacement: %.*s\n", - result.length, result.ptr, - pattern.length, pattern.ptr, - slice.length, slice.ptr, - replacement.length, replacement.ptr); - result = replace(result,slice,replacement); - break; - } - - result = replaceSlice(result, result[offset + so .. offset + eo], replacement); - - if (r.attributes & RegExp.REA.global) - { - offset += replacement.length - (eo - so); - - if (lastindex == eo) - lastindex++; // always consume some source - else - lastindex = eo; - } - else - break; - } - delete r; - - return result; -} - -unittest -{ - debug(regexp) printf("regexp.sub.unittest\n"); - - string foo(RegExp r) { return "ss"; } - - auto r = sub("hello", "ll", delegate string(RegExp r) { return "ss"; }); - assert(r == "hesso"); - - r = sub("hello", "l", delegate string(RegExp r) { return "l"; }, "g"); - assert(r == "hello"); - - auto s = sub("Strap a rocket engine on a chicken.", - "[ar]", - delegate string (RegExp m) - { - return std.string.toUpper(m[0]); - }, - "g"); - assert(s == "StRAp A Rocket engine on A chicken."); -} - - -/************************************************* - * Search $(D_PARAM s[]) for first match with $(D_PARAM pattern). - * Params: - * s = String to search. - * pattern = Regular expression pattern. - * Returns: - * index into s[] of match if found, -1 if no match. - * Example: - * --- - * auto s = "abcabcabab"; - * find(s, RegExp("b")); // match, returns 1 - * find(s, RegExp("f")); // no match, returns -1 - * --- - */ - -ptrdiff_t find(string s, RegExp pattern) -{ - return pattern.test(s) - ? pattern.pmatch[0].rm_so - : -1; -} - -unittest -{ - debug(regexp) printf("regexp.find.unittest\n"); - - auto i = find("xabcy", RegExp("abc")); - assert(i == 1); - i = find("cba", RegExp("abc")); - assert(i == -1); -} - -/** - Returns: - - Same as $(D_PARAM find(s, RegExp(pattern, attributes))). - - WARNING: - - This function is scheduled for deprecation due to unnecessary - ambiguity with the homonym function in std.string. Instead of - $(D_PARAM std.regexp.find(s, p, a)), you may want to use $(D_PARAM - find(s, RegExp(p, a))). -*/ - -ptrdiff_t -find(string s, string pattern, string attributes = null) -{ - auto r = new RegExp(pattern, attributes); - scope(exit) delete r; - return r.test(s) ? r.pmatch[0].rm_so : -1; -} - -unittest -{ - debug(regexp) printf("regexp.find.unittest\n"); - - auto i = find("xabcy", "abc"); - assert(i == 1); - i = find("cba", "abc"); - assert(i == -1); -} - -/************************************************* - * Search $(D_PARAM s[]) for last match with $(D_PARAM pattern). - * Params: - * s = String to search. - * pattern = Regular expression pattern. - * Returns: - * index into s[] of match if found, -1 if no match. - * Example: - * --- - * auto s = "abcabcabab"; - * rfind(s, RegExp("b")); // match, returns 9 - * rfind(s, RegExp("f")); // no match, returns -1 - * --- - */ - -ptrdiff_t rfind(string s, RegExp pattern) -{ - ptrdiff_t i = -1, lastindex = 0; - - while (pattern.test(s, lastindex)) - { - auto eo = pattern.pmatch[0].rm_eo; - i = pattern.pmatch[0].rm_so; - if (lastindex == eo) - lastindex++; // always consume some source - else - lastindex = eo; - } - return i; -} - -unittest -{ - ptrdiff_t i; - - debug(regexp) printf("regexp.rfind.unittest\n"); - i = rfind("abcdefcdef", RegExp("c")); - assert(i == 6); - i = rfind("abcdefcdef", RegExp("cd")); - assert(i == 6); - i = rfind("abcdefcdef", RegExp("x")); - assert(i == -1); - i = rfind("abcdefcdef", RegExp("xy")); - assert(i == -1); - i = rfind("abcdefcdef", RegExp("")); - assert(i == 10); -} - -/************************************************* -Returns: - - Same as $(D_PARAM rfind(s, RegExp(pattern, attributes))). - -WARNING: - -This function is scheduled for deprecation due to unnecessary -ambiguity with the homonym function in std.string. Instead of -$(D_PARAM std.regexp.rfind(s, p, a)), you may want to use $(D_PARAM -rfind(s, RegExp(p, a))). -*/ - -ptrdiff_t -rfind(string s, string pattern, string attributes = null) -{ - typeof(return) i = -1, lastindex = 0; - - auto r = new RegExp(pattern, attributes); - while (r.test(s, lastindex)) - { - auto eo = r.pmatch[0].rm_eo; - i = r.pmatch[0].rm_so; - if (lastindex == eo) - lastindex++; // always consume some source - else - lastindex = eo; - } - delete r; - return i; -} - -unittest -{ - ptrdiff_t i; - - debug(regexp) printf("regexp.rfind.unittest\n"); - i = rfind("abcdefcdef", "c"); - assert(i == 6); - i = rfind("abcdefcdef", "cd"); - assert(i == 6); - i = rfind("abcdefcdef", "x"); - assert(i == -1); - i = rfind("abcdefcdef", "xy"); - assert(i == -1); - i = rfind("abcdefcdef", ""); - assert(i == 10); -} - - -/******************************************** - * Split s[] into an array of strings, using the regular - * expression $(D_PARAM pattern) as the separator. - * Params: - * s = String to search. - * pattern = Regular expression pattern. - * Returns: - * array of slices into s[] - * Example: - * --- - * foreach (s; split("abcabcabab", RegExp("C.", "i"))) - * { - * writefln("s = '%s'", s); - * } - * // Prints: - * // s = 'ab' - * // s = 'b' - * // s = 'bab' - * --- - */ - -string[] split(string s, RegExp pattern) -{ - return pattern.split(s); -} - -unittest -{ - debug(regexp) printf("regexp.split.unittest()\n"); - string[] result; - - result = split("ab", RegExp("a*")); - assert(result.length == 2); - assert(result[0] == ""); - assert(result[1] == "b"); - - foreach (i, s; split("abcabcabab", RegExp("C.", "i"))) - { - //writefln("s[%d] = '%s'", i, s); - if (i == 0) assert(s == "ab"); - else if (i == 1) assert(s == "b"); - else if (i == 2) assert(s == "bab"); - else assert(0); - } -} - -/******************************************** - Returns: - Same as $(D_PARAM split(s, RegExp(pattern, attributes))). - -WARNING: - -This function is scheduled for deprecation due to unnecessary -ambiguity with the homonym function in std.string. Instead of -$(D_PARAM std.regexp.split(s, p, a)), you may want to use $(D_PARAM -split(s, RegExp(p, a))). -*/ - -string[] split(string s, string pattern, string attributes = null) -{ - auto r = new RegExp(pattern, attributes); - auto result = r.split(s); - delete r; - return result; -} - -unittest -{ - debug(regexp) printf("regexp.split.unittest()\n"); - string[] result; - - result = split("ab", "a*"); - assert(result.length == 2); - assert(result[0] == ""); - assert(result[1] == "b"); - - foreach (i, s; split("abcabcabab", "C.", "i")) - { - //writefln("s[%d] = '%s'", i, s.length, s.ptr); - if (i == 0) assert(s == "ab"); - else if (i == 1) assert(s == "b"); - else if (i == 2) assert(s == "bab"); - else assert(0); - } -} - -/**************************************************** - * Search s[] for first match with pattern[] with attributes[]. - * Params: - * s = String to search. - * pattern = Regular expression pattern. - * attributes = Regular expression attributes. - * Returns: - * corresponding RegExp if found, null if not. - * Example: - * --- - * import std.stdio; - * import std.regexp; - * - * void main() - * { - * if (auto m = std.regexp.search("abcdef", "c")) - * { - * writefln("%s[%s]%s", m.pre, m[0], m.post); - * } - * } - * // Prints: - * // ab[c]def - * --- - */ - -RegExp search(string s, string pattern, string attributes = null) -{ - auto r = new RegExp(pattern, attributes); - if (!r.test(s)) - { delete r; - assert(r is null); - } - return r; -} - -unittest -{ - debug(regexp) printf("regexp.string.unittest()\n"); - - if (auto m = std.regexp.search("abcdef", "c()")) - { - auto result = std.string.format("%s[%s]%s", m.pre, m[0], m.post); - assert(result == "ab[c]def"); - assert(m[1] == null); - assert(m[2] == null); - } - else - assert(0); - - if (auto n = std.regexp.search("abcdef", "g")) - { - assert(0); - } -} - -/* ********************************* RegExp ******************************** */ - -/***************************** - * RegExp is a class to handle regular expressions. - * - * It is the core foundation for adding powerful string pattern matching - * capabilities to programs like grep, text editors, awk, sed, etc. - */ -class RegExp -{ - /***** - * Construct a RegExp object. Compile pattern - * with attributes into - * an internal form for fast execution. - * Params: - * pattern = regular expression - * attributes = _attributes - * Throws: RegExpException if there are any compilation errors. - * Example: - * Declare two variables and assign to them a RegExp object: - * --- - * auto r = new RegExp("pattern"); - * auto s = new RegExp(r"p[1-5]\s*"); - * --- - */ - public this(string pattern, string attributes = null) - { - pmatch = (&gmatch)[0 .. 1]; - compile(pattern, attributes); - } - - /***** - * Generate instance of RegExp. - * Params: - * pattern = regular expression - * attributes = _attributes - * Throws: RegExpException if there are any compilation errors. - * Example: - * Declare two variables and assign to them a RegExp object: - * --- - * auto r = RegExp("pattern"); - * auto s = RegExp(r"p[1-5]\s*"); - * --- - */ - public static RegExp opCall(string pattern, string attributes = null) - { - return new RegExp(pattern, attributes); - } - - unittest - { - debug(regexp) printf("regexp.opCall.unittest()\n"); - auto r1 = RegExp("hello", "m"); - string msg; - try - { - auto r2 = RegExp("hello", "q"); - assert(0); - } - catch (RegExpException ree) - { - msg = ree.toString(); - //writefln("message: %s", ree); - } - assert(std.algorithm.countUntil(msg, "unrecognized attribute") >= 0); - } - - /************************************ - * Set up for start of foreach loop. - * Returns: - * search() returns instance of RegExp set up to _search string[]. - * Example: - * --- - * import std.stdio; - * import std.regexp; - * - * void main() - * { - * foreach(m; RegExp("ab").search("abcabcabab")) - * { - * writefln("%s[%s]%s", m.pre, m[0], m.post); - * } - * } - * // Prints: - * // [ab]cabcabab - * // abc[ab]cabab - * // abcabc[ab]ab - * // abcabcab[ab] - * --- - */ - - public RegExp search(string string) - { - input = string; - pmatch[0].rm_eo = 0; - return this; - } - - /** ditto */ - public int opApply(scope int delegate(ref RegExp) dg) - { - int result; - RegExp r = this; - - while (test()) - { - result = dg(r); - if (result) - break; - } - - return result; - } - - unittest - { - debug(regexp) printf("regexp.search.unittest()\n"); - - int i; - foreach(m; RegExp("ab").search("abcabcabab")) - { - auto s = std.string.format("%s[%s]%s", m.pre, m[0], m.post); - if (i == 0) assert(s == "[ab]cabcabab"); - else if (i == 1) assert(s == "abc[ab]cabab"); - else if (i == 2) assert(s == "abcabc[ab]ab"); - else if (i == 3) assert(s == "abcabcab[ab]"); - else assert(0); - i++; - } - } - - /****************** - * Retrieve match n. - * - * n==0 means the matched substring, n>0 means the - * n'th parenthesized subexpression. - * if n is larger than the number of parenthesized subexpressions, - * null is returned. - */ - public string opIndex(size_t n) - { - if (n >= pmatch.length) - return null; - else - { - auto rm_so = pmatch[n].rm_so; - auto rm_eo = pmatch[n].rm_eo; - if (rm_so == rm_eo) - return null; - return input[rm_so .. rm_eo]; - } - } - - /** - Same as $(D_PARAM opIndex(n)). - - WARNING: - - Scheduled for deprecation due to confusion with overloaded - $(D_PARAM match(string)). Instead of $(D_PARAM regex.match(n)) - you may want to use $(D_PARAM regex[n]). - */ - public string match(size_t n) - { - return this[n]; - } - - /******************* - * Return the slice of the input that precedes the matched substring. - */ - public @property string pre() - { - return input[0 .. pmatch[0].rm_so]; - } - - /******************* - * Return the slice of the input that follows the matched substring. - */ - public @property string post() - { - return input[pmatch[0].rm_eo .. $]; - } - - uint re_nsub; // number of parenthesized subexpression matches - regmatch_t[] pmatch; // array [re_nsub + 1] - - string input; // the string to search - - // per instance: - - string pattern; // source text of the regular expression - - string flags; // source text of the attributes parameter - - int errors; - - uint attributes; - - enum REA - { - global = 1, // has the g attribute - ignoreCase = 2, // has the i attribute - multiline = 4, // if treat as multiple lines separated - // by newlines, or as a single line - dotmatchlf = 8, // if . matches \n - } - - -private: - size_t src; // current source index in input[] - size_t src_start; // starting index for match in input[] - size_t p; // position of parser in pattern[] - regmatch_t gmatch; // match for the entire regular expression - // (serves as storage for pmatch[0]) - - const(ubyte)[] program; // pattern[] compiled into regular expression program - OutBuffer buf; - - - - -/******************************************/ - -// Opcodes - - enum : ubyte - { - REend, // end of program - REchar, // single character - REichar, // single character, case insensitive - REdchar, // single UCS character - REidchar, // single wide character, case insensitive - REanychar, // any character - REanystar, // ".*" - REstring, // string of characters - REistring, // string of characters, case insensitive - REtestbit, // any in bitmap, non-consuming - REbit, // any in the bit map - REnotbit, // any not in the bit map - RErange, // any in the string - REnotrange, // any not in the string - REor, // a | b - REplus, // 1 or more - REstar, // 0 or more - REquest, // 0 or 1 - REnm, // n..m - REnmq, // n..m, non-greedy version - REbol, // beginning of line - REeol, // end of line - REparen, // parenthesized subexpression - REgoto, // goto offset - - REwordboundary, - REnotwordboundary, - REdigit, - REnotdigit, - REspace, - REnotspace, - REword, - REnotword, - REbackref, - }; - -// BUG: should this include '$'? - private int isword(dchar c) { return isAlphaNum(c) || c == '_'; } - - private uint inf = ~0u; - -/* ******************************** - * Throws RegExpException on error - */ - - public void compile(string pattern, string attributes) - { - //printf("RegExp.compile('%.*s', '%.*s')\n", pattern.length, pattern.ptr, attributes.length, attributes.ptr); - - this.attributes = 0; - foreach (rchar c; attributes) - { REA att; - - switch (c) - { - case 'g': att = REA.global; break; - case 'i': att = REA.ignoreCase; break; - case 'm': att = REA.multiline; break; - default: - error("unrecognized attribute"); - return; - } - if (this.attributes & att) - { error("redundant attribute"); - return; - } - this.attributes |= att; - } - - input = null; - - this.pattern = pattern; - this.flags = attributes; - - uint oldre_nsub = re_nsub; - re_nsub = 0; - errors = 0; - - buf = new OutBuffer(); - buf.reserve(pattern.length * 8); - p = 0; - parseRegexp(); - if (p < pattern.length) - { error("unmatched ')'"); - } - // @@@ SKIPPING OPTIMIZATION SOLVES BUG 941 @@@ - //optimize(); - program = buf.data; - buf.data = null; - delete buf; - - if (re_nsub > oldre_nsub) - { - if (pmatch.ptr is &gmatch) - pmatch = null; - pmatch.length = re_nsub + 1; - } - pmatch[0].rm_so = 0; - pmatch[0].rm_eo = 0; - } - -/******************************************** - * Split s[] into an array of strings, using the regular - * expression as the separator. - * Returns: - * array of slices into s[] - */ - - public string[] split(string s) - { - debug(regexp) printf("regexp.split()\n"); - - string[] result; - - if (s.length) - { - ptrdiff_t p, q; - for (q = p; q != s.length;) - { - if (test(s, q)) - { - q = pmatch[0].rm_so; - auto e = pmatch[0].rm_eo; - if (e != p) - { - result ~= s[p .. q]; - for (size_t i = 1; i < pmatch.length; i++) - { - auto so = pmatch[i].rm_so; - auto eo = pmatch[i].rm_eo; - if (so == eo) - { so = 0; // -1 gives array bounds error - eo = 0; - } - result ~= s[so .. eo]; - } - q = p = e; - continue; - } - } - q++; - } - result ~= s[p .. s.length]; - } - else if (!test(s)) - result ~= s; - return result; - } - - unittest - { - debug(regexp) printf("regexp.split.unittest()\n"); - - auto r = new RegExp("a*?", null); - string[] result; - string j; - int i; - - result = r.split("ab"); - - assert(result.length == 2); - i = std.algorithm.cmp(result[0], "a"); - assert(i == 0); - i = std.algorithm.cmp(result[1], "b"); - assert(i == 0); - - r = new RegExp("a*", null); - result = r.split("ab"); - assert(result.length == 2); - i = std.algorithm.cmp(result[0], ""); - assert(i == 0); - i = std.algorithm.cmp(result[1], "b"); - assert(i == 0); - - r = new RegExp("<(\\/)?([^<>]+)>", null); - result = r.split("afontbarhello"); - - debug(regexp) - { - for (i = 0; i < result.length; i++) - printf("result[%d] = '%.*s'\n", i, result[i].length, result[i].ptr); - } - - j = join(result, ","); - //printf("j = '%.*s'\n", j.length, j.ptr); - i = std.algorithm.cmp(j, "a,,b,font,/,b,bar,,TAG,hello,/,TAG,"); - assert(i == 0); - - r = new RegExp("a[bc]", null); - result = r.match("123ab"); - j = join(result, ","); - i = std.algorithm.cmp(j, "ab"); - assert(i == 0); - - result = r.match("ac"); - j = join(result, ","); - i = std.algorithm.cmp(j, "ac"); - assert(i == 0); - } - -/************************************************* - * Search string[] for match with regular expression. - * Returns: - * index of match if successful, -1 if not found - */ - - public ptrdiff_t find(string string) - { - if (test(string)) - return pmatch[0].rm_so; - else - return -1; // no match - } - -//deprecated alias find search; - - unittest - { - debug(regexp) printf("regexp.find.unittest()\n"); - - RegExp r = new RegExp("abc", null); - auto i = r.find("xabcy"); - assert(i == 1); - i = r.find("cba"); - assert(i == -1); - } - - -/************************************************* - * Search s[] for match. - * Returns: - * If global attribute, return same value as exec(s). - * If not global attribute, return array of all matches. - */ - - public string[] match(string s) - { - string[] result; - - if (attributes & REA.global) - { - ptrdiff_t lastindex = 0; - - while (test(s, lastindex)) - { - auto eo = pmatch[0].rm_eo; - - result ~= input[pmatch[0].rm_so .. eo]; - if (lastindex == eo) - lastindex++; // always consume some source - else - lastindex = eo; - } - } - else - { - result = exec(s); - } - return result; - } - - unittest - { - debug(regexp) printf("regexp.match.unittest()\n"); - - int i; - string[] result; - string j; - RegExp r; - - r = new RegExp("a[bc]", null); - result = r.match("1ab2ac3"); - j = join(result, ","); - i = std.algorithm.cmp(j, "ab"); - assert(i == 0); - - r = new RegExp("a[bc]", "g"); - result = r.match("1ab2ac3"); - j = join(result, ","); - i = std.algorithm.cmp(j, "ab,ac"); - assert(i == 0); - } - - -/************************************************* - * Find regular expression matches in s[]. Replace those matches - * with a new string composed of format[] merged with the result of the - * matches. - * If global, replace all matches. Otherwise, replace first match. - * Returns: the new string - */ - - public string replace(string s, string format) - { - debug(regexp) printf("string = %.*s, format = %.*s\n", s.length, s.ptr, format.length, format.ptr); - - string result = s; - ptrdiff_t lastindex = 0; - size_t offset = 0; - - for (;;) - { - if (!test(s, lastindex)) - break; - - auto so = pmatch[0].rm_so; - auto eo = pmatch[0].rm_eo; - - string replacement = replace(format); - - // Optimize by using replace if possible - Dave Fladebo - string slice = result[offset + so .. offset + eo]; - if (attributes & REA.global && // global, so replace all - !(attributes & REA.ignoreCase) && // not ignoring case - !(attributes & REA.multiline) && // not multiline - pattern == slice && // simple pattern (exact match, no special characters) - format == replacement) // simple format, not $ formats - { - debug(regexp) - { - auto sss = result[offset + so .. offset + eo]; - printf("pattern: %.*s, slice: %.*s, format: %.*s, replacement: %.*s\n", - pattern.length, pattern.ptr, sss.length, sss.ptr, format.length, format.ptr, replacement.length, replacement.ptr); - } - result = std.array.replace(result,slice,replacement); - break; - } - - result = replaceSlice(result, result[offset + so .. offset + eo], replacement); - - if (attributes & REA.global) - { - offset += replacement.length - (eo - so); - - if (lastindex == eo) - lastindex++; // always consume some source - else - lastindex = eo; - } - else - break; - } - - return result; - } - - unittest - { - debug(regexp) printf("regexp.replace.unittest()\n"); - - int i; - string result; - RegExp r; - - r = new RegExp("a[bc]", "g"); - result = r.replace("1ab2ac3", "x$&y"); - i = std.algorithm.cmp(result, "1xaby2xacy3"); - assert(i == 0); - - r = new RegExp("ab", "g"); - result = r.replace("1ab2ac3", "xy"); - i = std.algorithm.cmp(result, "1xy2ac3"); - assert(i == 0); - } - - -/************************************************* - * Search string[] for match. - * Returns: - * array of slices into string[] representing matches - */ - - public string[] exec(string s) - { - debug(regexp) printf("regexp.exec(string = '%.*s')\n", s.length, s.ptr); - input = s; - pmatch[0].rm_so = 0; - pmatch[0].rm_eo = 0; - return exec(); - } - -/************************************************* - * Pick up where last exec(string) or exec() left off, - * searching string[] for next match. - * Returns: - * array of slices into string[] representing matches - */ - - public string[] exec() - { - if (!test()) - return null; - - auto result = new string[pmatch.length]; - for (int i = 0; i < pmatch.length; i++) - { - if (pmatch[i].rm_so == pmatch[i].rm_eo) - result[i] = null; - else - result[i] = input[pmatch[i].rm_so .. pmatch[i].rm_eo]; - } - - return result; - } - -/************************************************ - * Search s[] for match. - * Returns: 0 for no match, !=0 for match - * Example: ---- -import std.stdio; -import std.regexp; -import std.string; - -int grep(int delegate(char[]) pred, char[][] list) -{ - int count; - foreach (s; list) - { if (pred(s)) - ++count; - } - return count; -} - -void main() -{ - auto x = grep(&RegExp("[Ff]oo").test, - std.string.split("mary had a foo lamb")); - writefln(x); -} ---- -* which prints: 1 -*/ - //@@@ -public bool test(string s) - { - return test(s, 0 /*pmatch[0].rm_eo*/) != 0; - } - -/************************************************ - * Pick up where last test(string) or test() left off, and search again. - * Returns: 0 for no match, !=0 for match - */ - - public int test() - { - return test(input, pmatch[0].rm_eo); - } - -/************************************************ - * Test s[] starting at startindex against regular expression. - * Returns: 0 for no match, !=0 for match - */ - - public int test(string s, size_t startindex) - { - char firstc; - - input = s; - debug (regexp) printf("RegExp.test(input[] = '%.*s', startindex = %zd)\n", input.length, input.ptr, startindex); - pmatch[0].rm_so = 0; - pmatch[0].rm_eo = 0; - if (startindex < 0 || startindex > input.length) - { - return 0; // fail - } - //debug(regexp) printProgram(program); - - // First character optimization - firstc = 0; - if (program[0] == REchar) - { - firstc = program[1]; - if (attributes & REA.ignoreCase && isAlpha(firstc)) - firstc = 0; - } - - for (auto si = startindex; ; si++) - { - if (firstc) - { - if (si == input.length) - break; // no match - if (input[si] != firstc) - { - si++; - if (!chr(si, firstc)) // if first character not found - break; // no match - } - } - for (size_t i = 0; i < re_nsub + 1; i++) - { - pmatch[i].rm_so = -1; - pmatch[i].rm_eo = -1; - } - src_start = src = si; - if (trymatch(0, program.length)) - { - pmatch[0].rm_so = si; - pmatch[0].rm_eo = src; - //debug(regexp) printf("start = %d, end = %d\n", gmatch.rm_so, gmatch.rm_eo); - return 1; - } - // If possible match must start at beginning, we are done - if (program[0] == REbol || program[0] == REanystar) - { - if (attributes & REA.multiline) - { - // Scan for the next \n - if (!chr(si, '\n')) - break; // no match if '\n' not found - } - else - break; - } - if (si == input.length) - break; - debug(regexp) - { - auto sss = input[si + 1 .. input.length]; - printf("Starting new try: '%.*s'\n", sss.length, sss.ptr); - } - } - return 0; // no match - } - - /** - Returns whether string $(D_PARAM s) matches $(D_PARAM this). - */ - alias test opEquals; -// bool opEquals(string s) -// { -// return test(s); -// } - - unittest - { - assert("abc" == RegExp(".b.")); - assert("abc" != RegExp(".b..")); - } - - int chr(ref size_t si, rchar c) - { - for (; si < input.length; si++) - { - if (input[si] == c) - return 1; - } - return 0; - } - - - void printProgram(const(ubyte)[] prog) - { - //debug(regexp) - { - size_t len; - uint n; - uint m; - ushort *pu; - uint *puint; - char[] str; - - printf("printProgram()\n"); - for (size_t pc = 0; pc < prog.length; ) - { - printf("%3d: ", pc); - - //printf("prog[pc] = %d, REchar = %d, REnmq = %d\n", prog[pc], REchar, REnmq); - switch (prog[pc]) - { - case REchar: - printf("\tREchar '%c'\n", prog[pc + 1]); - pc += 1 + char.sizeof; - break; - - case REichar: - printf("\tREichar '%c'\n", prog[pc + 1]); - pc += 1 + char.sizeof; - break; - - case REdchar: - printf("\tREdchar '%c'\n", *cast(dchar *)&prog[pc + 1]); - pc += 1 + dchar.sizeof; - break; - - case REidchar: - printf("\tREidchar '%c'\n", *cast(dchar *)&prog[pc + 1]); - pc += 1 + dchar.sizeof; - break; - - case REanychar: - printf("\tREanychar\n"); - pc++; - break; - - case REstring: - len = *cast(size_t *)&prog[pc + 1]; - str = (cast(char*)&prog[pc + 1 + size_t.sizeof])[0 .. len]; - printf("\tREstring x%x, '%.*s'\n", len, str.length, str.ptr); - pc += 1 + size_t.sizeof + len * rchar.sizeof; - break; - - case REistring: - len = *cast(size_t *)&prog[pc + 1]; - str = (cast(char*)&prog[pc + 1 + size_t.sizeof])[0 .. len]; - printf("\tREistring x%x, '%.*s'\n", len, str.length, str.ptr); - pc += 1 + size_t.sizeof + len * rchar.sizeof; - break; - - case REtestbit: - pu = cast(ushort *)&prog[pc + 1]; - printf("\tREtestbit %d, %d\n", pu[0], pu[1]); - len = pu[1]; - pc += 1 + 2 * ushort.sizeof + len; - break; - - case REbit: - pu = cast(ushort *)&prog[pc + 1]; - len = pu[1]; - printf("\tREbit cmax=%02x, len=%d:", pu[0], len); - for (n = 0; n < len; n++) - printf(" %02x", prog[pc + 1 + 2 * ushort.sizeof + n]); - printf("\n"); - pc += 1 + 2 * ushort.sizeof + len; - break; - - case REnotbit: - pu = cast(ushort *)&prog[pc + 1]; - printf("\tREnotbit %d, %d\n", pu[0], pu[1]); - len = pu[1]; - pc += 1 + 2 * ushort.sizeof + len; - break; - - case RErange: - len = *cast(uint *)&prog[pc + 1]; - printf("\tRErange %d\n", len); - // BUG: REAignoreCase? - pc += 1 + uint.sizeof + len; - break; - - case REnotrange: - len = *cast(uint *)&prog[pc + 1]; - printf("\tREnotrange %d\n", len); - // BUG: REAignoreCase? - pc += 1 + uint.sizeof + len; - break; - - case REbol: - printf("\tREbol\n"); - pc++; - break; - - case REeol: - printf("\tREeol\n"); - pc++; - break; - - case REor: - len = *cast(uint *)&prog[pc + 1]; - printf("\tREor %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len); - pc += 1 + uint.sizeof; - break; - - case REgoto: - len = *cast(uint *)&prog[pc + 1]; - printf("\tREgoto %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len); - pc += 1 + uint.sizeof; - break; - - case REanystar: - printf("\tREanystar\n"); - pc++; - break; - - case REnm: - case REnmq: - // len, n, m, () - puint = cast(uint *)&prog[pc + 1]; - len = puint[0]; - n = puint[1]; - m = puint[2]; - printf("\tREnm%s len=%d, n=%u, m=%u, pc=>%d\n", - (prog[pc] == REnmq) ? "q".ptr : " ".ptr, - len, n, m, pc + 1 + uint.sizeof * 3 + len); - pc += 1 + uint.sizeof * 3; - break; - - case REparen: - // len, n, () - puint = cast(uint *)&prog[pc + 1]; - len = puint[0]; - n = puint[1]; - printf("\tREparen len=%d n=%d, pc=>%d\n", len, n, pc + 1 + uint.sizeof * 2 + len); - pc += 1 + uint.sizeof * 2; - break; - - case REend: - printf("\tREend\n"); - return; - - case REwordboundary: - printf("\tREwordboundary\n"); - pc++; - break; - - case REnotwordboundary: - printf("\tREnotwordboundary\n"); - pc++; - break; - - case REdigit: - printf("\tREdigit\n"); - pc++; - break; - - case REnotdigit: - printf("\tREnotdigit\n"); - pc++; - break; - - case REspace: - printf("\tREspace\n"); - pc++; - break; - - case REnotspace: - printf("\tREnotspace\n"); - pc++; - break; - - case REword: - printf("\tREword\n"); - pc++; - break; - - case REnotword: - printf("\tREnotword\n"); - pc++; - break; - - case REbackref: - printf("\tREbackref %d\n", prog[1]); - pc += 2; - break; - - default: - assert(0); - } - } - } - } - - -/************************************************** - * Match input against a section of the program[]. - * Returns: - * 1 if successful match - * 0 no match - */ - - int trymatch(size_t pc, size_t pcend) - { - size_t len; - size_t n; - size_t m; - size_t count; - size_t pop; - size_t ss; - regmatch_t *psave; - size_t c1; - size_t c2; - ushort* pu; - uint* puint; - - debug(regexp) - { - auto sss = input[src .. input.length]; - printf("RegExp.trymatch(pc = %zd, src = '%.*s', pcend = %zd)\n", pc, sss.length, sss.ptr, pcend); - } - auto srcsave = src; - psave = null; - for (;;) - { - if (pc == pcend) // if done matching - { debug(regex) printf("\tprogend\n"); - return 1; - } - - //printf("\top = %d\n", program[pc]); - switch (program[pc]) - { - case REchar: - if (src == input.length) - goto Lnomatch; - debug(regexp) printf("\tREchar '%c', src = '%c'\n", program[pc + 1], input[src]); - if (program[pc + 1] != input[src]) - goto Lnomatch; - src++; - pc += 1 + char.sizeof; - break; - - case REichar: - if (src == input.length) - goto Lnomatch; - debug(regexp) printf("\tREichar '%c', src = '%c'\n", program[pc + 1], input[src]); - c1 = program[pc + 1]; - c2 = input[src]; - if (c1 != c2) - { - if (isLower(cast(rchar)c2)) - c2 = std.ascii.toUpper(cast(rchar)c2); - else - goto Lnomatch; - if (c1 != c2) - goto Lnomatch; - } - src++; - pc += 1 + char.sizeof; - break; - - case REdchar: - debug(regexp) printf("\tREdchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]); - if (src == input.length) - goto Lnomatch; - if (*(cast(dchar *)&program[pc + 1]) != input[src]) - goto Lnomatch; - src++; - pc += 1 + dchar.sizeof; - break; - - case REidchar: - debug(regexp) printf("\tREidchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]); - if (src == input.length) - goto Lnomatch; - c1 = *(cast(dchar *)&program[pc + 1]); - c2 = input[src]; - if (c1 != c2) - { - if (isLower(cast(rchar)c2)) - c2 = std.ascii.toUpper(cast(rchar)c2); - else - goto Lnomatch; - if (c1 != c2) - goto Lnomatch; - } - src++; - pc += 1 + dchar.sizeof; - break; - - case REanychar: - debug(regexp) printf("\tREanychar\n"); - if (src == input.length) - goto Lnomatch; - if (!(attributes & REA.dotmatchlf) && input[src] == cast(rchar)'\n') - goto Lnomatch; - src += std.utf.stride(input, src); - //src++; - pc++; - break; - - case REstring: - len = *cast(size_t *)&program[pc + 1]; - debug(regexp) - { - auto sss2 = (&program[pc + 1 + size_t.sizeof])[0 .. len]; - printf("\tREstring x%x, '%.*s'\n", len, sss2.length, sss2.ptr); - } - if (src + len > input.length) - goto Lnomatch; - if (memcmp(&program[pc + 1 + size_t.sizeof], &input[src], len * rchar.sizeof)) - goto Lnomatch; - src += len; - pc += 1 + size_t.sizeof + len * rchar.sizeof; - break; - - case REistring: - len = *cast(size_t *)&program[pc + 1]; - debug(regexp) - { - auto sss2 = (&program[pc + 1 + size_t.sizeof])[0 .. len]; - printf("\tREistring x%x, '%.*s'\n", len, sss2.length, sss2.ptr); - } - if (src + len > input.length) - goto Lnomatch; - if (icmp((cast(char*)&program[pc + 1 + size_t.sizeof])[0..len], - input[src .. src + len])) - goto Lnomatch; - src += len; - pc += 1 + size_t.sizeof + len * rchar.sizeof; - break; - - case REtestbit: - pu = (cast(ushort *)&program[pc + 1]); - if (src == input.length) - goto Lnomatch; - debug(regexp) printf("\tREtestbit %d, %d, '%c', x%02x\n", - pu[0], pu[1], input[src], input[src]); - len = pu[1]; - c1 = input[src]; - //printf("[x%02x]=x%02x, x%02x\n", c1 >> 3, ((&program[pc + 1 + 4])[c1 >> 3] ), (1 << (c1 & 7))); - if (c1 <= pu[0] && - !((&(program[pc + 1 + 4]))[c1 >> 3] & (1 << (c1 & 7)))) - goto Lnomatch; - pc += 1 + 2 * ushort.sizeof + len; - break; - - case REbit: - pu = (cast(ushort *)&program[pc + 1]); - if (src == input.length) - goto Lnomatch; - debug(regexp) printf("\tREbit %d, %d, '%c'\n", - pu[0], pu[1], input[src]); - len = pu[1]; - c1 = input[src]; - if (c1 > pu[0]) - goto Lnomatch; - if (!((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7)))) - goto Lnomatch; - src++; - pc += 1 + 2 * ushort.sizeof + len; - break; - - case REnotbit: - pu = (cast(ushort *)&program[pc + 1]); - if (src == input.length) - goto Lnomatch; - debug(regexp) printf("\tREnotbit %d, %d, '%c'\n", - pu[0], pu[1], input[src]); - len = pu[1]; - c1 = input[src]; - if (c1 <= pu[0] && - ((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7)))) - goto Lnomatch; - src++; - pc += 1 + 2 * ushort.sizeof + len; - break; - - case RErange: - len = *cast(uint *)&program[pc + 1]; - debug(regexp) printf("\tRErange %d\n", len); - if (src == input.length) - goto Lnomatch; - // BUG: REA.ignoreCase? - if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) == null) - goto Lnomatch; - src++; - pc += 1 + uint.sizeof + len; - break; - - case REnotrange: - len = *cast(uint *)&program[pc + 1]; - debug(regexp) printf("\tREnotrange %d\n", len); - if (src == input.length) - goto Lnomatch; - // BUG: REA.ignoreCase? - if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) != null) - goto Lnomatch; - src++; - pc += 1 + uint.sizeof + len; - break; - - case REbol: - debug(regexp) printf("\tREbol\n"); - if (src == 0) - { - } - else if (attributes & REA.multiline) - { - if (input[src - 1] != '\n') - goto Lnomatch; - } - else - goto Lnomatch; - pc++; - break; - - case REeol: - debug(regexp) printf("\tREeol\n"); - if (src == input.length) - { - } - else if (attributes & REA.multiline && input[src] == '\n') - src++; - else - goto Lnomatch; - pc++; - break; - - case REor: - len = (cast(uint *)&program[pc + 1])[0]; - debug(regexp) printf("\tREor %d\n", len); - pop = pc + 1 + uint.sizeof; - ss = src; - if (trymatch(pop, pcend)) - { - if (pcend != program.length) - { - auto s = src; - if (trymatch(pcend, program.length)) - { debug(regexp) printf("\tfirst operand matched\n"); - src = s; - return 1; - } - else - { - // If second branch doesn't match to end, take first anyway - src = ss; - if (!trymatch(pop + len, program.length)) - { - debug(regexp) printf("\tfirst operand matched\n"); - src = s; - return 1; - } - } - src = ss; - } - else - { debug(regexp) printf("\tfirst operand matched\n"); - return 1; - } - } - pc = pop + len; // proceed with 2nd branch - break; - - case REgoto: - debug(regexp) printf("\tREgoto\n"); - len = (cast(uint *)&program[pc + 1])[0]; - pc += 1 + uint.sizeof + len; - break; - - case REanystar: - debug(regexp) printf("\tREanystar\n"); - pc++; - for (;;) - { - auto s1 = src; - if (src == input.length) - break; - if (!(attributes & REA.dotmatchlf) && input[src] == '\n') - break; - src++; - auto s2 = src; - - // If no match after consumption, but it - // did match before, then no match - if (!trymatch(pc, program.length)) - { - src = s1; - // BUG: should we save/restore pmatch[]? - if (trymatch(pc, program.length)) - { - src = s1; // no match - break; - } - } - src = s2; - } - break; - - case REnm: - case REnmq: - // len, n, m, () - puint = cast(uint *)&program[pc + 1]; - len = puint[0]; - n = puint[1]; - m = puint[2]; - debug(regexp) printf("\tREnm%s len=%d, n=%u, m=%u\n", - (program[pc] == REnmq) ? "q".ptr : "".ptr, len, n, m); - pop = pc + 1 + uint.sizeof * 3; - for (count = 0; count < n; count++) - { - if (!trymatch(pop, pop + len)) - goto Lnomatch; - } - if (!psave && count < m) - { - //version (Win32) - psave = cast(regmatch_t *)alloca((re_nsub + 1) * regmatch_t.sizeof); - //else - //psave = new regmatch_t[re_nsub + 1]; - } - if (program[pc] == REnmq) // if minimal munch - { - for (; count < m; count++) - { - memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof); - auto s1 = src; - - if (trymatch(pop + len, program.length)) - { - src = s1; - memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof); - break; - } - - if (!trymatch(pop, pop + len)) - { debug(regexp) printf("\tdoesn't match subexpression\n"); - break; - } - - // If source is not consumed, don't - // infinite loop on the match - if (s1 == src) - { debug(regexp) printf("\tsource is not consumed\n"); - break; - } - } - } - else // maximal munch - { - for (; count < m; count++) - { - memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof); - auto s1 = src; - if (!trymatch(pop, pop + len)) - { debug(regexp) printf("\tdoesn't match subexpression\n"); - break; - } - auto s2 = src; - - // If source is not consumed, don't - // infinite loop on the match - if (s1 == s2) - { debug(regexp) printf("\tsource is not consumed\n"); - break; - } - - // If no match after consumption, but it - // did match before, then no match - if (!trymatch(pop + len, program.length)) - { - src = s1; - if (trymatch(pop + len, program.length)) - { - src = s1; // no match - memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof); - break; - } - } - src = s2; - } - } - debug(regexp) printf("\tREnm len=%d, n=%u, m=%u, DONE count=%d\n", len, n, m, count); - pc = pop + len; - break; - - case REparen: - // len, () - debug(regexp) printf("\tREparen\n"); - puint = cast(uint *)&program[pc + 1]; - len = puint[0]; - n = puint[1]; - pop = pc + 1 + uint.sizeof * 2; - ss = src; - if (!trymatch(pop, pop + len)) - goto Lnomatch; - pmatch[n + 1].rm_so = ss; - pmatch[n + 1].rm_eo = src; - pc = pop + len; - break; - - case REend: - debug(regexp) printf("\tREend\n"); - return 1; // successful match - - case REwordboundary: - debug(regexp) printf("\tREwordboundary\n"); - if (src > 0 && src < input.length) - { - c1 = input[src - 1]; - c2 = input[src]; - if (!( - (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) || - (!isword(cast(rchar)c1) && isword(cast(rchar)c2)) - ) - ) - goto Lnomatch; - } - pc++; - break; - - case REnotwordboundary: - debug(regexp) printf("\tREnotwordboundary\n"); - if (src == 0 || src == input.length) - goto Lnomatch; - c1 = input[src - 1]; - c2 = input[src]; - if ( - (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) || - (!isword(cast(rchar)c1) && isword(cast(rchar)c2)) - ) - goto Lnomatch; - pc++; - break; - - case REdigit: - debug(regexp) printf("\tREdigit\n"); - if (src == input.length) - goto Lnomatch; - if (!isDigit(input[src])) - goto Lnomatch; - src++; - pc++; - break; - - case REnotdigit: - debug(regexp) printf("\tREnotdigit\n"); - if (src == input.length) - goto Lnomatch; - if (isDigit(input[src])) - goto Lnomatch; - src++; - pc++; - break; - - case REspace: - debug(regexp) printf("\tREspace\n"); - if (src == input.length) - goto Lnomatch; - if (!isWhite(input[src])) - goto Lnomatch; - src++; - pc++; - break; - - case REnotspace: - debug(regexp) printf("\tREnotspace\n"); - if (src == input.length) - goto Lnomatch; - if (isWhite(input[src])) - goto Lnomatch; - src++; - pc++; - break; - - case REword: - debug(regexp) printf("\tREword\n"); - if (src == input.length) - goto Lnomatch; - if (!isword(input[src])) - goto Lnomatch; - src++; - pc++; - break; - - case REnotword: - debug(regexp) printf("\tREnotword\n"); - if (src == input.length) - goto Lnomatch; - if (isword(input[src])) - goto Lnomatch; - src++; - pc++; - break; - - case REbackref: - { - n = program[pc + 1]; - debug(regexp) printf("\tREbackref %d\n", n); - - auto so = pmatch[n + 1].rm_so; - auto eo = pmatch[n + 1].rm_eo; - len = eo - so; - if (src + len > input.length) - goto Lnomatch; - else if (attributes & REA.ignoreCase) - { - if (icmp(input[src .. src + len], input[so .. eo])) - goto Lnomatch; - } - else if (memcmp(&input[src], &input[so], len * rchar.sizeof)) - goto Lnomatch; - src += len; - pc += 2; - break; - } - - default: - assert(0); - } - } - - Lnomatch: - debug(regexp) printf("\tnomatch pc=%d\n", pc); - src = srcsave; - return 0; - } - -/* =================== Compiler ================== */ - - int parseRegexp() - { - size_t gotooffset; - uint len1; - uint len2; - - debug(regexp) - { - auto sss = pattern[p .. pattern.length]; - printf("parseRegexp() '%.*s'\n", sss.length, sss.ptr); - } - auto offset = buf.offset; - for (;;) - { - assert(p <= pattern.length); - if (p == pattern.length) - { buf.write(REend); - return 1; - } - switch (pattern[p]) - { - case ')': - return 1; - - case '|': - p++; - gotooffset = buf.offset; - buf.write(REgoto); - buf.write(cast(uint)0); - len1 = cast(uint)(buf.offset - offset); - buf.spread(offset, 1 + uint.sizeof); - gotooffset += 1 + uint.sizeof; - parseRegexp(); - len2 = cast(uint)(buf.offset - (gotooffset + 1 + uint.sizeof)); - buf.data[offset] = REor; - (cast(uint *)&buf.data[offset + 1])[0] = len1; - (cast(uint *)&buf.data[gotooffset + 1])[0] = len2; - break; - - default: - parsePiece(); - break; - } - } - } - - int parsePiece() - { - uint len; - uint n; - uint m; - ubyte op; - auto plength = pattern.length; - - debug(regexp) - { - auto sss = pattern[p .. pattern.length]; - printf("parsePiece() '%.*s'\n", sss.length, sss.ptr); - } - auto offset = buf.offset; - parseAtom(); - if (p == plength) - return 1; - switch (pattern[p]) - { - case '*': - // Special optimization: replace .* with REanystar - if (buf.offset - offset == 1 && - buf.data[offset] == REanychar && - p + 1 < plength && - pattern[p + 1] != '?') - { - buf.data[offset] = REanystar; - p++; - break; - } - - n = 0; - m = inf; - goto Lnm; - - case '+': - n = 1; - m = inf; - goto Lnm; - - case '?': - n = 0; - m = 1; - goto Lnm; - - case '{': // {n} {n,} {n,m} - p++; - if (p == plength || !isDigit(pattern[p])) - goto Lerr; - n = 0; - do - { - // BUG: handle overflow - n = n * 10 + pattern[p] - '0'; - p++; - if (p == plength) - goto Lerr; - } while (isDigit(pattern[p])); - if (pattern[p] == '}') // {n} - { m = n; - goto Lnm; - } - if (pattern[p] != ',') - goto Lerr; - p++; - if (p == plength) - goto Lerr; - if (pattern[p] == /*{*/ '}') // {n,} - { m = inf; - goto Lnm; - } - if (!isDigit(pattern[p])) - goto Lerr; - m = 0; // {n,m} - do - { - // BUG: handle overflow - m = m * 10 + pattern[p] - '0'; - p++; - if (p == plength) - goto Lerr; - } while (isDigit(pattern[p])); - if (pattern[p] != /*{*/ '}') - goto Lerr; - goto Lnm; - - Lnm: - p++; - op = REnm; - if (p < plength && pattern[p] == '?') - { op = REnmq; // minimal munch version - p++; - } - len = cast(uint)(buf.offset - offset); - buf.spread(offset, 1 + uint.sizeof * 3); - buf.data[offset] = op; - uint* puint = cast(uint *)&buf.data[offset + 1]; - puint[0] = len; - puint[1] = n; - puint[2] = m; - break; - - default: - break; - } - return 1; - - Lerr: - error("badly formed {n,m}"); - assert(0); - } - - int parseAtom() - { ubyte op; - size_t offset; - rchar c; - - debug(regexp) - { - auto sss = pattern[p .. pattern.length]; - printf("parseAtom() '%.*s'\n", sss.length, sss.ptr); - } - if (p < pattern.length) - { - c = pattern[p]; - switch (c) - { - case '*': - case '+': - case '?': - error("*+? not allowed in atom"); - p++; - return 0; - - case '(': - p++; - buf.write(REparen); - offset = buf.offset; - buf.write(cast(uint)0); // reserve space for length - buf.write(re_nsub); - re_nsub++; - parseRegexp(); - *cast(uint *)&buf.data[offset] = - cast(uint)(buf.offset - (offset + uint.sizeof * 2)); - if (p == pattern.length || pattern[p] != ')') - { - error("')' expected"); - return 0; - } - p++; - break; - - case '[': - if (!parseRange()) - return 0; - break; - - case '.': - p++; - buf.write(REanychar); - break; - - case '^': - p++; - buf.write(REbol); - break; - - case '$': - p++; - buf.write(REeol); - break; - - case '\\': - p++; - if (p == pattern.length) - { error("no character past '\\'"); - return 0; - } - c = pattern[p]; - switch (c) - { - case 'b': op = REwordboundary; goto Lop; - case 'B': op = REnotwordboundary; goto Lop; - case 'd': op = REdigit; goto Lop; - case 'D': op = REnotdigit; goto Lop; - case 's': op = REspace; goto Lop; - case 'S': op = REnotspace; goto Lop; - case 'w': op = REword; goto Lop; - case 'W': op = REnotword; goto Lop; - - Lop: - buf.write(op); - p++; - break; - - case 'f': - case 'n': - case 'r': - case 't': - case 'v': - case 'c': - case 'x': - case 'u': - case '0': - c = cast(char)escape(); - goto Lbyte; - - case '1': case '2': case '3': - case '4': case '5': case '6': - case '7': case '8': case '9': - c -= '1'; - if (c < re_nsub) - { buf.write(REbackref); - buf.write(cast(ubyte)c); - } - else - { error("no matching back reference"); - return 0; - } - p++; - break; - - default: - p++; - goto Lbyte; - } - break; - - default: - p++; - Lbyte: - op = REchar; - if (attributes & REA.ignoreCase) - { - if (isAlpha(c)) - { - op = REichar; - c = cast(char)std.ascii.toUpper(c); - } - } - if (op == REchar && c <= 0xFF) - { - // Look ahead and see if we can make this into - // an REstring - auto q = p; - for (; q < pattern.length; ++q) - { rchar qc = pattern[q]; - - switch (qc) - { - case '{': - case '*': - case '+': - case '?': - if (q == p) - goto Lchar; - q--; - break; - - case '(': case ')': - case '|': - case '[': case ']': - case '.': case '^': - case '$': case '\\': - case '}': - break; - - default: - continue; - } - break; - } - auto len = q - p; - if (len > 0) - { - debug(regexp) printf("writing string len %d, c = '%c', pattern[p] = '%c'\n", len+1, c, pattern[p]); - buf.reserve(5 + (1 + len) * rchar.sizeof); - buf.write((attributes & REA.ignoreCase) ? REistring : REstring); - buf.write(len + 1); - buf.write(c); - buf.write(pattern[p .. p + len]); - p = q; - break; - } - } - if (c >= 0x80) - { - // Convert to dchar opcode - op = (op == REchar) ? REdchar : REidchar; - buf.write(op); - buf.write(c); - } - else - { - Lchar: - debug(regexp) printf("It's an REchar '%c'\n", c); - buf.write(op); - buf.write(cast(char)c); - } - break; - } - } - return 1; - } - -private: - class Range - { - size_t maxc; - size_t maxb; - OutBuffer buf; - ubyte* base; - BitArray bits; - - this(OutBuffer buf) - { - this.buf = buf; - if (buf.data.length) - this.base = &buf.data[buf.offset]; - } - - void setbitmax(size_t u) - { - //printf("setbitmax(x%x), maxc = x%x\n", u, maxc); - if (u > maxc) - { - maxc = u; - auto b = u / 8; - if (b >= maxb) - { - auto u2 = base ? base - &buf.data[0] : 0; - buf.fill0(b - maxb + 1); - base = &buf.data[u2]; - maxb = b + 1; - //bits = (cast(bit*)this.base)[0 .. maxc + 1]; - bits.ptr = cast(size_t*)this.base; - } - bits.len = maxc + 1; - } - } - - void setbit2(size_t u) - { - setbitmax(u + 1); - //printf("setbit2 [x%02x] |= x%02x\n", u >> 3, 1 << (u & 7)); - bits[u] = 1; - } - - }; - - int parseRange() - { - int c; - int c2; - uint i; - uint cmax; - - cmax = 0x7F; - p++; - ubyte op = REbit; - if (p == pattern.length) - goto Lerr; - if (pattern[p] == '^') - { p++; - op = REnotbit; - if (p == pattern.length) - goto Lerr; - } - buf.write(op); - auto offset = buf.offset; - buf.write(cast(uint)0); // reserve space for length - buf.reserve(128 / 8); - auto r = new Range(buf); - if (op == REnotbit) - r.setbit2(0); - switch (pattern[p]) - { - case ']': - case '-': - c = pattern[p]; - p++; - r.setbit2(c); - break; - - default: - break; - } - - enum RS { start, rliteral, dash } - RS rs; - - rs = RS.start; - for (;;) - { - if (p == pattern.length) - goto Lerr; - switch (pattern[p]) - { - case ']': - switch (rs) - { case RS.dash: - r.setbit2('-'); - goto case; - case RS.rliteral: - r.setbit2(c); - break; - case RS.start: - break; - default: - assert(0); - } - p++; - break; - - case '\\': - p++; - r.setbitmax(cmax); - if (p == pattern.length) - goto Lerr; - switch (pattern[p]) - { - case 'd': - for (i = '0'; i <= '9'; i++) - r.bits[i] = 1; - goto Lrs; - - case 'D': - for (i = 1; i < '0'; i++) - r.bits[i] = 1; - for (i = '9' + 1; i <= cmax; i++) - r.bits[i] = 1; - goto Lrs; - - case 's': - for (i = 0; i <= cmax; i++) - if (isWhite(i)) - r.bits[i] = 1; - goto Lrs; - - case 'S': - for (i = 1; i <= cmax; i++) - if (!isWhite(i)) - r.bits[i] = 1; - goto Lrs; - - case 'w': - for (i = 0; i <= cmax; i++) - if (isword(cast(rchar)i)) - r.bits[i] = 1; - goto Lrs; - - case 'W': - for (i = 1; i <= cmax; i++) - if (!isword(cast(rchar)i)) - r.bits[i] = 1; - goto Lrs; - - Lrs: - switch (rs) - { case RS.dash: - r.setbit2('-'); - goto case; - case RS.rliteral: - r.setbit2(c); - break; - default: - break; - } - rs = RS.start; - continue; - - default: - break; - } - c2 = escape(); - goto Lrange; - - case '-': - p++; - if (rs == RS.start) - goto Lrange; - else if (rs == RS.rliteral) - rs = RS.dash; - else if (rs == RS.dash) - { - r.setbit2(c); - r.setbit2('-'); - rs = RS.start; - } - continue; - - default: - c2 = pattern[p]; - p++; - Lrange: - switch (rs) - { case RS.rliteral: - r.setbit2(c); - goto case; - case RS.start: - c = c2; - rs = RS.rliteral; - break; - - case RS.dash: - if (c > c2) - { error("inverted range in character class"); - return 0; - } - r.setbitmax(c2); - //printf("c = %x, c2 = %x\n",c,c2); - for (; c <= c2; c++) - r.bits[c] = 1; - rs = RS.start; - break; - - default: - assert(0); - } - continue; - } - break; - } - if (attributes & REA.ignoreCase) - { - // BUG: what about dchar? - r.setbitmax(0x7F); - for (c = 'a'; c <= 'z'; c++) - { - if (r.bits[c]) - r.bits[c + 'A' - 'a'] = 1; - else if (r.bits[c + 'A' - 'a']) - r.bits[c] = 1; - } - } - //printf("maxc = %d, maxb = %d\n",r.maxc,r.maxb); - (cast(ushort *)&buf.data[offset])[0] = cast(ushort)r.maxc; - (cast(ushort *)&buf.data[offset])[1] = cast(ushort)r.maxb; - return 1; - - Lerr: - error("invalid range"); - return 0; - } - - void error(string msg) - { - errors++; - debug(regexp) printf("error: %.*s\n", msg.length, msg.ptr); -//assert(0); -//*(char*)0=0; - throw new RegExpException(msg); - } - -// p is following the \ char - int escape() - in - { - assert(p < pattern.length); - } - body - { int c; - int i; - rchar tc; - - c = pattern[p]; // none of the cases are multibyte - switch (c) - { - case 'b': c = '\b'; break; - case 'f': c = '\f'; break; - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; - case 'v': c = '\v'; break; - - // BUG: Perl does \a and \e too, should we? - - case 'c': - ++p; - if (p == pattern.length) - goto Lretc; - c = pattern[p]; - // Note: we are deliberately not allowing dchar letters - if (!(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))) - { - Lcerr: - error("letter expected following \\c"); - return 0; - } - c &= 0x1F; - break; - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - c -= '0'; - for (i = 0; i < 2; i++) - { - p++; - if (p == pattern.length) - goto Lretc; - tc = pattern[p]; - if ('0' <= tc && tc <= '7') - { c = c * 8 + (tc - '0'); - // Treat overflow as if last - // digit was not an octal digit - if (c >= 0xFF) - { c >>= 3; - return c; - } - } - else - return c; - } - break; - - case 'x': - c = 0; - for (i = 0; i < 2; i++) - { - p++; - if (p == pattern.length) - goto Lretc; - tc = pattern[p]; - if ('0' <= tc && tc <= '9') - c = c * 16 + (tc - '0'); - else if ('a' <= tc && tc <= 'f') - c = c * 16 + (tc - 'a' + 10); - else if ('A' <= tc && tc <= 'F') - c = c * 16 + (tc - 'A' + 10); - else if (i == 0) // if no hex digits after \x - { - // Not a valid \xXX sequence - return 'x'; - } - else - return c; - } - break; - - case 'u': - c = 0; - for (i = 0; i < 4; i++) - { - p++; - if (p == pattern.length) - goto Lretc; - tc = pattern[p]; - if ('0' <= tc && tc <= '9') - c = c * 16 + (tc - '0'); - else if ('a' <= tc && tc <= 'f') - c = c * 16 + (tc - 'a' + 10); - else if ('A' <= tc && tc <= 'F') - c = c * 16 + (tc - 'A' + 10); - else - { - // Not a valid \uXXXX sequence - p -= i; - return 'u'; - } - } - break; - - default: - break; - } - p++; - Lretc: - return c; - } - -/* ==================== optimizer ======================= */ - - void optimize() - { ubyte[] prog; - - debug(regexp) printf("RegExp.optimize()\n"); - prog = buf.toBytes(); - for (size_t i = 0; 1;) - { - //printf("\tprog[%d] = %d, %d\n", i, prog[i], REstring); - switch (prog[i]) - { - case REend: - case REanychar: - case REanystar: - case REbackref: - case REeol: - case REchar: - case REichar: - case REdchar: - case REidchar: - case REstring: - case REistring: - case REtestbit: - case REbit: - case REnotbit: - case RErange: - case REnotrange: - case REwordboundary: - case REnotwordboundary: - case REdigit: - case REnotdigit: - case REspace: - case REnotspace: - case REword: - case REnotword: - return; - - case REbol: - i++; - continue; - - case REor: - case REnm: - case REnmq: - case REparen: - case REgoto: - { - auto bitbuf = new OutBuffer; - auto r = new Range(bitbuf); - auto offset = i; - if (starrchars(r, prog[i .. prog.length])) - { - debug(regexp) printf("\tfilter built\n"); - buf.spread(offset, 1 + 4 + r.maxb); - buf.data[offset] = REtestbit; - (cast(ushort *)&buf.data[offset + 1])[0] = cast(ushort)r.maxc; - (cast(ushort *)&buf.data[offset + 1])[1] = cast(ushort)r.maxb; - i = offset + 1 + 4; - buf.data[i .. i + r.maxb] = r.base[0 .. r.maxb]; - } - return; - } - default: - assert(0); - } - } - } - -///////////////////////////////////////// -// OR the leading character bits into r. -// Limit the character range from 0..7F, -// trymatch() will allow through anything over maxc. -// Return 1 if success, 0 if we can't build a filter or -// if there is no point to one. - - int starrchars(Range r, const(ubyte)[] prog) - { rchar c; - uint maxc; - size_t maxb; - size_t len; - uint b; - uint n; - uint m; - const(ubyte)* pop; - - //printf("RegExp.starrchars(prog = %p, progend = %p)\n", prog, progend); - for (size_t i = 0; i < prog.length;) - { - switch (prog[i]) - { - case REchar: - c = prog[i + 1]; - if (c <= 0x7F) - r.setbit2(c); - return 1; - - case REichar: - c = prog[i + 1]; - if (c <= 0x7F) - { r.setbit2(c); - r.setbit2(std.ascii.toLower(cast(rchar)c)); - } - return 1; - - case REdchar: - case REidchar: - return 1; - - case REanychar: - return 0; // no point - - case REstring: - len = *cast(size_t *)&prog[i + 1]; - assert(len); - c = *cast(rchar *)&prog[i + 1 + size_t.sizeof]; - debug(regexp) printf("\tREstring %d, '%c'\n", len, c); - if (c <= 0x7F) - r.setbit2(c); - return 1; - - case REistring: - len = *cast(size_t *)&prog[i + 1]; - assert(len); - c = *cast(rchar *)&prog[i + 1 + size_t.sizeof]; - debug(regexp) printf("\tREistring %d, '%c'\n", len, c); - if (c <= 0x7F) - { r.setbit2(std.ascii.toUpper(cast(rchar)c)); - r.setbit2(std.ascii.toLower(cast(rchar)c)); - } - return 1; - - case REtestbit: - case REbit: - maxc = (cast(ushort *)&prog[i + 1])[0]; - maxb = (cast(ushort *)&prog[i + 1])[1]; - if (maxc <= 0x7F) - r.setbitmax(maxc); - else - maxb = r.maxb; - for (b = 0; b < maxb; b++) - r.base[b] |= prog[i + 1 + 4 + b]; - return 1; - - case REnotbit: - maxc = (cast(ushort *)&prog[i + 1])[0]; - maxb = (cast(ushort *)&prog[i + 1])[1]; - if (maxc <= 0x7F) - r.setbitmax(maxc); - else - maxb = r.maxb; - for (b = 0; b < maxb; b++) - r.base[b] |= ~prog[i + 1 + 4 + b]; - return 1; - - case REbol: - case REeol: - return 0; - - case REor: - len = (cast(uint *)&prog[i + 1])[0]; - return starrchars(r, prog[i + 1 + uint.sizeof .. prog.length]) && - starrchars(r, prog[i + 1 + uint.sizeof + len .. prog.length]); - - case REgoto: - len = (cast(uint *)&prog[i + 1])[0]; - i += 1 + uint.sizeof + len; - break; - - case REanystar: - return 0; - - case REnm: - case REnmq: - // len, n, m, () - len = (cast(uint *)&prog[i + 1])[0]; - n = (cast(uint *)&prog[i + 1])[1]; - m = (cast(uint *)&prog[i + 1])[2]; - pop = &prog[i + 1 + uint.sizeof * 3]; - if (!starrchars(r, pop[0 .. len])) - return 0; - if (n) - return 1; - i += 1 + uint.sizeof * 3 + len; - break; - - case REparen: - // len, () - len = (cast(uint *)&prog[i + 1])[0]; - n = (cast(uint *)&prog[i + 1])[1]; - pop = &prog[0] + i + 1 + uint.sizeof * 2; - return starrchars(r, pop[0 .. len]); - - case REend: - return 0; - - case REwordboundary: - case REnotwordboundary: - return 0; - - case REdigit: - r.setbitmax('9'); - for (c = '0'; c <= '9'; c++) - r.bits[c] = 1; - return 1; - - case REnotdigit: - r.setbitmax(0x7F); - for (c = 0; c <= '0'; c++) - r.bits[c] = 1; - for (c = '9' + 1; c <= r.maxc; c++) - r.bits[c] = 1; - return 1; - - case REspace: - r.setbitmax(0x7F); - for (c = 0; c <= r.maxc; c++) - if (isWhite(c)) - r.bits[c] = 1; - return 1; - - case REnotspace: - r.setbitmax(0x7F); - for (c = 0; c <= r.maxc; c++) - if (!isWhite(c)) - r.bits[c] = 1; - return 1; - - case REword: - r.setbitmax(0x7F); - for (c = 0; c <= r.maxc; c++) - if (isword(cast(rchar)c)) - r.bits[c] = 1; - return 1; - - case REnotword: - r.setbitmax(0x7F); - for (c = 0; c <= r.maxc; c++) - if (!isword(cast(rchar)c)) - r.bits[c] = 1; - return 1; - - case REbackref: - return 0; - - default: - assert(0); - } - } - return 1; - } - -/* ==================== replace ======================= */ - -/*********************** - * After a match is found with test(), this function - * will take the match results and, using the format - * string, generate and return a new string. - */ - - public string replace(string format) - { - return replace3(format, input, pmatch[0 .. re_nsub + 1]); - } - -// Static version that doesn't require a RegExp object to be created - - public static string replace3(string format, string input, regmatch_t[] pmatch) - { - string result; - size_t c2; - ptrdiff_t rm_so, rm_eo, i; - -// printf("replace3(format = '%.*s', input = '%.*s')\n", format.length, format.ptr, input.length, input.ptr); - result.length = format.length; - result.length = 0; - for (size_t f = 0; f < format.length; f++) - { - char c = format[f]; - L1: - if (c != '$') - { - result ~= c; - continue; - } - ++f; - if (f == format.length) - { - result ~= '$'; - break; - } - c = format[f]; - switch (c) - { - case '&': - rm_so = pmatch[0].rm_so; - rm_eo = pmatch[0].rm_eo; - goto Lstring; - - case '`': - rm_so = 0; - rm_eo = pmatch[0].rm_so; - goto Lstring; - - case '\'': - rm_so = pmatch[0].rm_eo; - rm_eo = input.length; - goto Lstring; - - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - i = c - '0'; - if (f + 1 == format.length) - { - if (i == 0) - { - result ~= '$'; - result ~= c; - continue; - } - } - else - { - c2 = format[f + 1]; - if (c2 >= '0' && c2 <= '9') - { - i = (c - '0') * 10 + (c2 - '0'); - f++; - } - if (i == 0) - { - result ~= '$'; - result ~= c; - c = cast(char)c2; - goto L1; - } - } - - if (i < pmatch.length) - { rm_so = pmatch[i].rm_so; - rm_eo = pmatch[i].rm_eo; - goto Lstring; - } - break; - - Lstring: - if (rm_so != rm_eo) - result ~= input[rm_so .. rm_eo]; - break; - - default: - result ~= '$'; - result ~= c; - break; - } - } - return result; - } - -/************************************ - * Like replace(char[] format), but uses old style formatting: - - - - - - - -
Format - Description -
& - replace with the match -
\n - replace with the nth parenthesized match, n is 1..9 -
\c - replace with char c. -
-*/ - - public string replaceOld(string format) - { - string result; - -//printf("replace: this = %p so = %d, eo = %d\n", this, pmatch[0].rm_so, pmatch[0].rm_eo); -//printf("3input = '%.*s'\n", input.length, input.ptr); - result.length = format.length; - result.length = 0; - for (size_t i; i < format.length; i++) - { - char c = format[i]; - switch (c) - { - case '&': - { - auto sss = input[pmatch[0].rm_so .. pmatch[0].rm_eo]; - //printf("match = '%.*s'\n", sss.length, sss.ptr); - result ~= sss; - } - break; - - case '\\': - if (i + 1 < format.length) - { - c = format[++i]; - if (c >= '1' && c <= '9') - { uint j; - - j = c - '0'; - if (j <= re_nsub && pmatch[j].rm_so != pmatch[j].rm_eo) - result ~= input[pmatch[j].rm_so .. pmatch[j].rm_eo]; - break; - } - } - result ~= c; - break; - - default: - result ~= c; - break; - } - } - return result; - } - -} - -unittest -{ // Created and placed in public domain by Don Clugston - - auto m = search("aBC r s", `bc\x20r[\40]s`, "i"); - assert(m.pre=="a"); - assert(m[0]=="BC r s"); - auto m2 = search("7xxyxxx", `^\d([a-z]{2})\D\1`); - assert(m2[0]=="7xxyxx"); - // Just check the parsing. - auto m3 = search("dcbxx", `ca|b[\d\]\D\s\S\w-\W]`); - auto m4 = search("xy", `[^\ca-\xFa\r\n\b\f\t\v\0123]{2,485}$`); - auto m5 = search("xxx", `^^\r\n\b{13,}\f{4}\t\v\u02aF3a\w\W`); - auto m6 = search("xxy", `.*y`); - assert(m6[0]=="xxy"); - auto m7 = search("QWDEfGH", "(ca|b|defg)+", "i"); - assert(m7[0]=="DEfG"); - auto m8 = search("dcbxx", `a?\B\s\S`); - auto m9 = search("dcbxx", `[-w]`); - auto m10 = search("dcbsfd", `aB[c-fW]dB|\d|\D|\u012356|\w|\W|\s|\S`, "i"); - auto m11 = search("dcbsfd", `[]a-]`); - m.replaceOld(`a&b\1c`); - m.replace(`a$&b$'$1c`); -} - -// Andrei -//------------------------------------------------------------------------------ - -struct Pattern(Char) -{ - immutable(Char)[] pattern; - - this(immutable(Char)[] pattern) - { - this.pattern = pattern; - } -} - -Pattern!(Char) pattern(Char)(immutable(Char)[] pat) -{ - return typeof(return)(pat); -} - -struct Splitter(Range) -{ - Range _input; - size_t _chunkLength; - RegExp _rx; - - private Range search() - { - //rx = std.regexp.search(_input, "(" ~ _separator.pattern ~ ")"); - auto i = std.regexp.find(cast(string) _input, _rx); - return _input[i >= 0 ? i : _input.length .. _input.length]; - } - - private void advance() - { - //writeln("(" ~ _separator.pattern ~ ")"); - //writeln(_input); - //assert(_rx[0].length > 0); - _chunkLength += _rx[0].length; - } - - this(Range input, Pattern!(char) separator) - { - _input = input; - _rx = RegExp(separator.pattern); - _chunkLength = _input.length - search().length; - } - - ref auto opSlice() - { - return this; - } - - @property Range front() - { - return _input[0 .. _chunkLength]; - } - - @property bool empty() - { - return _input.empty; - } - - void popFront() - { - if (_chunkLength == _input.length) - { - _input = _input[_chunkLength .. _input.length]; - return; - } - advance(); - _input = _input[_chunkLength .. _input.length]; - _chunkLength = _input.length - search().length; - } -} - -Splitter!(Range) splitter(Range)(Range r, Pattern!(char) pat) -{ - static assert(is(Unqual!(typeof(Range.init[0])) == char), - Unqual!(typeof(Range.init[0])).stringof); - return typeof(return)(cast(string) r, pat); -} - -unittest -{ - auto s1 = ", abc, de, fg, hi, "; - auto sp2 = splitter(s1, pattern(", *")); - //foreach (e; sp2) writeln("[", e, "]"); - assert(equal(sp2, ["", "abc", "de", "fg", "hi"][])); -} - -unittest -{ - auto str= "foo"; - string[] re_strs= [ - r"^(h|a|)fo[oas]$", - r"^(a|b|)fo[oas]$", - r"^(a|)foo$", - r"(a|)foo", - r"^(h|)foo$", - r"(h|)foo", - r"(h|a|)fo[oas]", - r"^(a|b|)fo[o]$", - r"[abf][ops](o|oo|)(h|a|)", - r"(h|)[abf][ops](o|oo|)", - r"(c|)[abf][ops](o|oo|)" - ]; - - foreach (re_str; re_strs) { - auto re= new RegExp(re_str); - auto matches= cast(bool)re.test(str); - assert(matches); - //writefln("'%s' matches '%s' ? %s", str, re_str, matches); - } - - for (char c='a'; c<='z'; ++c) { - auto re_str= "("~c~"|)foo"; - auto re= new RegExp(re_str); - auto matches= cast(bool)re.test(str); - assert(matches); - //writefln("'%s' matches '%s' ? %s", str, re_str, matches); - } -} diff --git a/unittest.d b/unittest.d index 6953cc728..6da374fcc 100644 --- a/unittest.d +++ b/unittest.d @@ -39,7 +39,7 @@ public import std.path; public import std.perf; public import std.process; public import std.random; -public import std.regexp; +public import std.regex; public import std.signals; //public import std.slist; public import std.socket; @@ -82,7 +82,7 @@ else std.conv.to!double("1.0"); // std.conv OutBuffer b = new OutBuffer(); // outbuffer std.ctype.tolower('A'); // ctype - RegExp r = new RegExp(null, null); // regexp + auto r = regex(""); // regex uint ranseed = std.random.unpredictableSeed; thisTid; int a[]; diff --git a/win32.mak b/win32.mak index 45d95c69e..b18a77179 100644 --- a/win32.mak +++ b/win32.mak @@ -117,7 +117,7 @@ SRC_STD_3= std\csv.d std\math.d std\complex.d std\numeric.d std\bigint.d \ SRC_STD_3a= std\signals.d std\typetuple.d std\traits.d \ std\encoding.d std\xml.d \ - std\random.d std\regexp.d \ + std\random.d \ std\exception.d \ std\compiler.d std\cpuid.d \ std\system.d std\concurrency.d @@ -156,7 +156,7 @@ SRC_STD= std\zlib.d std\zip.d std\stdint.d std\container.d std\conv.d std\utf.d std\outbuffer.d std\md5.d std\base64.d \ std\mmfile.d \ std\syserror.d \ - std\regexp.d std\random.d std\stream.d std\process.d \ + std\random.d std\stream.d std\process.d \ std\socket.d std\socketstream.d std\format.d \ std\stdio.d std\perf.d std\uni.d std\uuid.d \ std\cstream.d std\demangle.d \ @@ -319,7 +319,6 @@ DOCS= $(DOC)\object.html \ $(DOC)\std_random.html \ $(DOC)\std_range.html \ $(DOC)\std_regex.html \ - $(DOC)\std_regexp.html \ $(DOC)\std_signals.html \ $(DOC)\std_socket.html \ $(DOC)\std_socketstream.html \ @@ -580,9 +579,6 @@ $(DOC)\std_range.html : $(STDDOC) std\range.d $(DOC)\std_regex.html : $(STDDOC) std\regex.d $(DMD) -c -o- $(DDOCFLAGS) -Df$(DOC)\std_regex.html $(STDDOC) std\regex.d -$(DOC)\std_regexp.html : $(STDDOC) std\regexp.d - $(DMD) -c -o- $(DDOCFLAGS) -Df$(DOC)\std_regexp.html $(STDDOC) std\regexp.d - $(DOC)\std_signals.html : $(STDDOC) std\signals.d $(DMD) -c -o- $(DDOCFLAGS) -Df$(DOC)\std_signals.html $(STDDOC) std\signals.d diff --git a/win64.mak b/win64.mak index 69c16f38b..494e003c7 100644 --- a/win64.mak +++ b/win64.mak @@ -120,7 +120,7 @@ SRC_STD_3a= std\uni.d std\base64.d std\md5.d std\ctype.d std\ascii.d \ SRC_STD_3b= std\signals.d std\typetuple.d std\traits.d \ std\encoding.d std\xml.d \ - std\random.d std\regexp.d \ + std\random.d \ std\exception.d \ std\compiler.d std\cpuid.d \ std\system.d std\concurrency.d @@ -178,7 +178,7 @@ SRC_STD= std\zlib.d std\zip.d std\stdint.d std\container.d std\conv.d std\utf.d std\outbuffer.d std\md5.d std\base64.d \ std\mmfile.d \ std\syserror.d \ - std\regexp.d std\random.d std\stream.d std\process.d \ + std\random.d std\stream.d std\process.d \ std\socket.d std\socketstream.d std\format.d \ std\stdio.d std\perf.d std\uni.d std\uuid.d \ std\cstream.d std\demangle.d \ @@ -341,7 +341,6 @@ DOCS= $(DOC)\object.html \ $(DOC)\std_random.html \ $(DOC)\std_range.html \ $(DOC)\std_regex.html \ - $(DOC)\std_regexp.html \ $(DOC)\std_signals.html \ $(DOC)\std_socket.html \ $(DOC)\std_socketstream.html \ @@ -633,9 +632,6 @@ $(DOC)\std_range.html : $(STDDOC) std\range.d $(DOC)\std_regex.html : $(STDDOC) std\regex.d $(DMD) -c -o- $(DDOCFLAGS) -Df$(DOC)\std_regex.html $(STDDOC) std\regex.d -$(DOC)\std_regexp.html : $(STDDOC) std\regexp.d - $(DMD) -c -o- $(DDOCFLAGS) -Df$(DOC)\std_regexp.html $(STDDOC) std\regexp.d - $(DOC)\std_signals.html : $(STDDOC) std\signals.d $(DMD) -c -o- $(DDOCFLAGS) -Df$(DOC)\std_signals.html $(STDDOC) std\signals.d