diff --git a/index.d b/index.d
index ccb6d6694..8d9b2a295 100644
--- a/index.d
+++ b/index.d
@@ -190,7 +190,7 @@ $(V1
Recursively search file system and (currently Windows
only) FTP sites.
)
- std.regexp
+ std.regex
The usual regular expression functions.
std.socket
diff --git a/posix.mak b/posix.mak
index d393b09c9..e1754a60d 100644
--- a/posix.mak
+++ b/posix.mak
@@ -164,7 +164,7 @@ STD_MODULES = $(addprefix std/, algorithm array ascii base64 bigint \
cpuid cstream ctype csv datetime demangle encoding exception \
file format functional getopt json math mathspecial md5 \
metastrings mmfile numeric outbuffer parallelism path perf \
- process random range regex regexp signals socket socketstream \
+ process random range regex signals socket socketstream \
stdint stdio stdiobase stream string syserror system traits \
typecons typetuple uni uri utf uuid variant xml zip zlib)
diff --git a/std/regexp.d b/std/regexp.d
deleted file mode 100644
index 87726cfb2..000000000
--- a/std/regexp.d
+++ /dev/null
@@ -1,3434 +0,0 @@
-// Written in the D programming language.
-// Regular Expressions.
-
-/**
- * $(RED Deprecated.
- * Please use $(LINK2 std_regex.html, std.regex) instead.)
- *
- * $(LINK2 http://www.digitalmars.com/ctg/regular.html, Regular
- * expressions) are a powerful method of string pattern matching. The
- * regular expression language used in this library is the same as
- * that commonly used, however, some of the very advanced forms may
- * behave slightly differently. The standard observed is the $(WEB
- * www.ecma-international.org/publications/standards/Ecma-262.htm,
- * ECMA standard) for regular expressions.
- *
- * std.regexp is designed to work only with valid UTF strings as input.
- * To validate untrusted input, use std.utf.validate().
- *
- * In the following guide, $(I pattern)[] refers to a
- * $(LINK2 http://www.digitalmars.com/ctg/regular.html, regular expression).
- * The $(I attributes)[] refers to
- * a string controlling the interpretation
- * of the regular expression.
- * It consists of a sequence of one or more
- * of the following characters:
- *
- * <table border=1 cellspacing=0 cellpadding=5>
- * <caption>Attribute Characters</caption>
- * $(TR $(TH Attribute) $(TH Action))
- * <tr>
- * $(TD $(B g))
- * $(TD global; repeat over the whole input string)
- * </tr>
- * <tr>
- * $(TD $(B i))
- * $(TD case insensitive)
- * </tr>
- * <tr>
- * $(TD $(B m))
- * $(TD treat as multiple lines separated by newlines)
- * </tr>
- * </table>
- *
- * The $(I format)[] string has the formatting characters:
- *
- * <table border=1 cellspacing=0 cellpadding=5>
- * <caption>Formatting Characters</caption>
- * $(TR $(TH Format) $(TH Replaced With))
- * $(TR
- * $(TD $(B $$)) $(TD $)
- * )
- * $(TR
- * $(TD $(B $&)) $(TD The matched substring.)
- * )
- * $(TR
- * $(TD $(B $`)) $(TD The portion of string that precedes the matched substring.)
- * )
- * $(TR
- * $(TD $(B $')) $(TD The portion of string that follows the matched substring.)
- * )
- * $(TR
- * $(TD $(B $(DOLLAR))$(I n)) $(TD The $(I n)th capture, where $(I n)
- * is a single digit 1-9
- * and $(DOLLAR)$(I n) is not followed by a decimal digit.)
- * )
- * $(TR
- * $(TD $(B $(DOLLAR))$(I nn)) $(TD The $(I nn)th capture, where $(I nn)
- * is a two-digit decimal
- * number 01-99.
- * If $(I nn)th capture is undefined or more than the number
- * of parenthesized subexpressions, use the empty
- * string instead.)
- * )
- * </table>
- *
- * Any other $ are left as is.
- *
- * References:
- * $(LINK2 http://en.wikipedia.org/wiki/Regular_expressions, Wikipedia)
- * Macros:
- * WIKI = StdRegexp
- * DOLLAR = $
- *
- * Copyright: Copyright Digital Mars 2000 - 2011.
- * License: Boost License 1.0.
- * Authors: $(WEB digitalmars.com, Walter Bright)
- * Source: $(PHOBOSSRC std/_regexp.d)
- */
-/* Copyright Digital Mars 2000 - 2011.
- * Distributed under the Boost Software License, Version 1.0.
- * (See accompanying file LICENSE_1_0.txt or copy at
- * http://www.boost.org/LICENSE_1_0.txt)
- */
-
-/*
- Escape sequences:
-
- \nnn starts out a 1, 2 or 3 digit octal sequence,
- where n is an octal digit. If nnn is larger than
- 0377, then the 3rd digit is not part of the sequence
- and is not consumed.
- For maximal portability, use exactly 3 digits.
-
- \xXX starts out a 1 or 2 digit hex sequence. X
- is a hex character. If the first character after the \x
- is not a hex character, the value of the sequence is 'x'
- and the XX are not consumed.
- For maximal portability, use exactly 2 digits.
-
- \uUUUU is a unicode sequence. There are exactly
- 4 hex characters after the \u, if any are not, then
- the value of the sequence is 'u', and the UUUU are not
- consumed.
-
- Character classes:
-
- [a-b], where a is greater than b, will produce
- an error.
-
- References:
-
- http://www.unicode.org/unicode/reports/tr18/
-*/
-
-module std.regexp;
-
-pragma(msg, "Notice: As of Phobos 2.055, std.regexp has been deprecated. " ~
- "Please use std.regex instead.");
-
-//debug = regexp; // uncomment to turn on debugging printf's
-
-private
-{
- import core.stdc.stdio;
- import core.stdc.stdlib;
- import core.stdc.string;
- import std.algorithm;
- import std.array;
- import std.stdio;
- import std.string;
- import std.ascii;
- import std.outbuffer;
- import std.bitmanip;
- import std.utf;
- import std.algorithm;
- import std.array;
- import std.traits;
-}
-
-deprecated:
-
-/** Regular expression to extract an _email address.
- * References:
- * $(LINK2 http://www.regular-expressions.info/email.html, How to Find or Validate an Email Address)$(BR)
- * $(LINK2 http://tools.ietf.org/html/rfc2822#section-3.4.1, RFC 2822 Internet Message Format)
- */
-string email =
- r"[a-zA-Z]([.]?([[a-zA-Z0-9_]-]+)*)?@([[a-zA-Z0-9_]\-_]+\.)+[a-zA-Z]{2,6}";
-// email and url are plain pattern strings: they must be compiled (for
-// example with RegExp(email)) before they can be used for matching.
-
-// The optional scheme group of url accepts http, https, ftp and ftps, each
-// letter in either case.
-// NOTE(review): classes such as [h|H] also contain a literal '|'; presumably
-// [hH] was intended -- confirm before relying on this pattern.
-/** Regular expression to extract a _url */
-string url = r"(([h|H][t|T]|[f|F])[t|T][p|P]([s|S]?)\:\/\/|~/|/)?([\w]+:\w+@)?(([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?)?((/?\w+/)+|/?)(\w+\.[\w]{3,4})?([,]\w+)*((\?\w+=\w+)?(&\w+=\w+)*([,]\w*)*)?";
-
-/************************************
- * One of these gets thrown on compilation errors
- */
-
-class RegExpException : Exception
-{
-    /// Builds the exception from the text describing what went wrong.
-    this(string msg) { super(msg); }
-}
-
-// Half-open index range [rm_so, rm_eo) of one match or capture inside the
-// input string; RegExp slices the input as input[rm_so .. rm_eo].
-// RegExp.test() resets both fields to -1 before every match attempt.
-struct regmatch_t
-{
- ptrdiff_t rm_so; // index of start of match
- ptrdiff_t rm_eo; // index past end of match
-}
-
-private alias char rchar; // so we can make a wchar version
-
-/******************************************************
- * Search string for matches with regular expression
- * pattern with attributes.
- * Replace each match with string generated from format.
- * Params:
- * s = String to search.
- * pattern = Regular expression pattern.
- * format = Replacement string format.
- * attributes = Regular expression attributes.
- * Returns:
- * the resulting string
- * Example:
- * Replace the letters 'a' with the letters 'ZZ'.
- * ---
- * s = "Strap a rocket engine on a chicken."
- * sub(s, "a", "ZZ") // result: StrZZp a rocket engine on a chicken.
- * sub(s, "a", "ZZ", "g") // result: StrZZp ZZ rocket engine on ZZ chicken.
- * ---
- * The replacement format can reference the matches using
- * the $&, $$, $', $`, $0 .. $99 notation:
- * ---
- * sub(s, "[ar]", "[$&]", "g") // result: St[r][a]p [a] [r]ocket engine on [a] chicken.
- * ---
- */
-
-string sub(string s, string pattern, string format, string attributes = null)
-{
-    // Compile once, run the format-based replacement, then release the
-    // matcher explicitly before handing the new string back.
-    RegExp engine = new RegExp(pattern, attributes);
-    string replaced = engine.replace(s, format);
-    delete engine;
-    return replaced;
-}
-
-unittest
-{
-    debug(regexp) printf("regexp.sub.unittest\n");
-
-    // No "g" attribute: only the first match is rewritten.
-    assert(sub("hello", "ll", "ss") == "hesso");
-}
-
-/*******************************************************
- * Search string for matches with regular expression
- * pattern with attributes.
- * Pass each match to delegate dg.
- * Replace each match with the return value from dg.
- * Params:
- * s = String to search.
- * pattern = Regular expression pattern.
- * dg = Delegate
- * attributes = Regular expression attributes.
- * Returns: the resulting string.
- * Example:
- * Capitalize the letters 'a' and 'r':
- * ---
- * s = "Strap a rocket engine on a chicken.";
- * sub(s, "[ar]",
- * delegate string (RegExp m)
- * {
- * return toUpper(m[0]);
- * },
- * "g"); // result: StRAp A Rocket engine on A chicken.
- * ---
- */
-
-string sub(string s, string pattern, string delegate(RegExp) dg, string attributes = null)
-{
-    // Every match is handed to dg and replaced with dg's return value.
-    //
-    // An earlier revision short-circuited to a plain substring replace
-    // whenever the matched text happened to equal the pattern source. That
-    // was wrong for patterns containing metacharacters ("a.c" matching the
-    // literal text "a.c" stopped the scan and left later matches such as
-    // "axc" untouched) and it invoked dg only once. The shortcut is gone:
-    // each match is now processed individually.
-    auto r = new RegExp(pattern, attributes);
-
-    string result = s;
-    size_t lastindex = 0;   // where the next search starts, as an index into s
-    size_t offset = 0;      // (index in result) - (index in s); modular arithmetic
-
-    while (r.test(s, lastindex))
-    {
-        auto so = r.pmatch[0].rm_so;
-        auto eo = r.pmatch[0].rm_eo;
-
-        string replacement = dg(r);
-        result = replaceSlice(result, result[offset + so .. offset + eo], replacement);
-
-        if (!(r.attributes & RegExp.REA.global))
-            break;          // not global: only the first match is replaced
-
-        offset += replacement.length - (eo - so);
-
-        if (lastindex == eo)
-            lastindex++;    // always consume some source
-        else
-            lastindex = eo;
-    }
-    delete r;
-
-    return result;
-}
-
-unittest
-{
-    debug(regexp) printf("regexp.sub.unittest\n");
-
-    auto r = sub("hello", "ll", delegate string(RegExp r) { return "ss"; });
-    assert(r == "hesso");
-
-    r = sub("hello", "l", delegate string(RegExp r) { return "l"; }, "g");
-    assert(r == "hello");
-
-    auto s = sub("Strap a rocket engine on a chicken.",
-                 "[ar]",
-                 delegate string (RegExp m)
-                 {
-                     return std.string.toUpper(m[0]);
-                 },
-                 "g");
-    assert(s == "StRAp A Rocket engine on A chicken.");
-
-    // Regression: a match whose text equals the pattern source must not end
-    // the scan; the later match "axc" still has to be replaced.
-    r = sub("abc a.c axc", "a.c", delegate string(RegExp m) { return "X"; }, "g");
-    assert(r == "X X X");
-
-    // Regression: the delegate runs once per match, even for literal patterns.
-    int calls;
-    r = sub("aaa", "a", delegate string(RegExp m) { ++calls; return "b"; }, "g");
-    assert(r == "bbb");
-    assert(calls == 3);
-}
-
-
-/*************************************************
- * Search $(D_PARAM s[]) for first match with $(D_PARAM pattern).
- * Params:
- * s = String to search.
- * pattern = Regular expression pattern.
- * Returns:
- * index into s[] of match if found, -1 if no match.
- * Example:
- * ---
- * auto s = "abcabcabab";
- * find(s, RegExp("b")); // match, returns 1
- * find(s, RegExp("f")); // no match, returns -1
- * ---
- */
-
-ptrdiff_t find(string s, RegExp pattern)
-{
-    // Guard clause: report -1 as soon as the pattern fails to match.
-    if (!pattern.test(s))
-        return -1;
-    return pattern.pmatch[0].rm_so;
-}
-
-unittest
-{
-    debug(regexp) printf("regexp.find.unittest\n");
-
-    assert(find("xabcy", RegExp("abc")) == 1);
-    assert(find("cba", RegExp("abc")) == -1);
-}
-
-/**
- Returns:
-
- Same as $(D_PARAM find(s, RegExp(pattern, attributes))).
-
- WARNING:
-
- This function is scheduled for deprecation due to unnecessary
- ambiguity with the homonym function in std.string. Instead of
- $(D_PARAM std.regexp.find(s, p, a)), you may want to use $(D_PARAM
- find(s, RegExp(p, a))).
-*/
-
-ptrdiff_t
-find(string s, string pattern, string attributes = null)
-{
-    // The temporary matcher is released on every exit path.
-    auto compiled = new RegExp(pattern, attributes);
-    scope(exit) delete compiled;
-
-    if (compiled.test(s))
-        return compiled.pmatch[0].rm_so;
-    return -1;
-}
-
-unittest
-{
-    debug(regexp) printf("regexp.find.unittest\n");
-
-    assert(find("xabcy", "abc") == 1);
-    assert(find("cba", "abc") == -1);
-}
-
-/*************************************************
- * Search $(D_PARAM s[]) for last match with $(D_PARAM pattern).
- * Params:
- * s = String to search.
- * pattern = Regular expression pattern.
- * Returns:
- * index into s[] of match if found, -1 if no match.
- * Example:
- * ---
- * auto s = "abcabcabab";
- * rfind(s, RegExp("b")); // match, returns 9
- * rfind(s, RegExp("f")); // no match, returns -1
- * ---
- */
-
-ptrdiff_t rfind(string s, RegExp pattern)
-{
-    // Walk forward over every match and remember where the latest one began.
-    ptrdiff_t lastStart = -1;
-    ptrdiff_t from = 0;
-
-    while (pattern.test(s, from))
-    {
-        lastStart = pattern.pmatch[0].rm_so;
-        auto matchEnd = pattern.pmatch[0].rm_eo;
-
-        // An empty match at 'from' would loop forever, so step one past it.
-        from = (from == matchEnd) ? from + 1 : matchEnd;
-    }
-    return lastStart;
-}
-
-unittest
-{
-    debug(regexp) printf("regexp.rfind.unittest\n");
-
-    assert(rfind("abcdefcdef", RegExp("c")) == 6);
-    assert(rfind("abcdefcdef", RegExp("cd")) == 6);
-    assert(rfind("abcdefcdef", RegExp("x")) == -1);
-    assert(rfind("abcdefcdef", RegExp("xy")) == -1);
-    assert(rfind("abcdefcdef", RegExp("")) == 10);
-}
-
-/*************************************************
-Returns:
-
- Same as $(D_PARAM rfind(s, RegExp(pattern, attributes))).
-
-WARNING:
-
-This function is scheduled for deprecation due to unnecessary
-ambiguity with the homonym function in std.string. Instead of
-$(D_PARAM std.regexp.rfind(s, p, a)), you may want to use $(D_PARAM
-rfind(s, RegExp(p, a))).
-*/
-
-ptrdiff_t
-rfind(string s, string pattern, string attributes = null)
-{
-    typeof(return) lastStart = -1;
-    typeof(return) from = 0;
-
-    // Walk forward over every match, keeping the start of the latest one.
-    auto compiled = new RegExp(pattern, attributes);
-    while (compiled.test(s, from))
-    {
-        lastStart = compiled.pmatch[0].rm_so;
-        auto matchEnd = compiled.pmatch[0].rm_eo;
-
-        // Step past an empty match so the scan always advances.
-        from = (from == matchEnd) ? from + 1 : matchEnd;
-    }
-    delete compiled;
-    return lastStart;
-}
-
-unittest
-{
-    debug(regexp) printf("regexp.rfind.unittest\n");
-
-    assert(rfind("abcdefcdef", "c") == 6);
-    assert(rfind("abcdefcdef", "cd") == 6);
-    assert(rfind("abcdefcdef", "x") == -1);
-    assert(rfind("abcdefcdef", "xy") == -1);
-    assert(rfind("abcdefcdef", "") == 10);
-}
-
-
-/********************************************
- * Split s[] into an array of strings, using the regular
- * expression $(D_PARAM pattern) as the separator.
- * Params:
- * s = String to search.
- * pattern = Regular expression pattern.
- * Returns:
- * array of slices into s[]
- * Example:
- * ---
- * foreach (s; split("abcabcabab", RegExp("C.", "i")))
- * {
- * writefln("s = '%s'", s);
- * }
- * // Prints:
- * // s = 'ab'
- * // s = 'b'
- * // s = 'bab'
- * ---
- */
-
-string[] split(string s, RegExp pattern)
-{
-    // Thin forwarder: the member function does all the work.
-    string[] pieces = pattern.split(s);
-    return pieces;
-}
-
-unittest
-{
-    debug(regexp) printf("regexp.split.unittest()\n");
-
-    string[] parts = split("ab", RegExp("a*"));
-    assert(parts.length == 2);
-    assert(parts[0] == "");
-    assert(parts[1] == "b");
-
-    foreach (idx, piece; split("abcabcabab", RegExp("C.", "i")))
-    {
-        //writefln("s[%d] = '%s'", idx, piece);
-        switch (idx)
-        {
-            case 0:  assert(piece == "ab");  break;
-            case 1:  assert(piece == "b");   break;
-            case 2:  assert(piece == "bab"); break;
-            default: assert(0);
-        }
-    }
-}
-
-/********************************************
- Returns:
- Same as $(D_PARAM split(s, RegExp(pattern, attributes))).
-
-WARNING:
-
-This function is scheduled for deprecation due to unnecessary
-ambiguity with the homonym function in std.string. Instead of
-$(D_PARAM std.regexp.split(s, p, a)), you may want to use $(D_PARAM
-split(s, RegExp(p, a))).
-*/
-
-string[] split(string s, string pattern, string attributes = null)
-{
-    // Compile a throwaway matcher, split with it, then release it.
-    RegExp compiled = new RegExp(pattern, attributes);
-    string[] pieces = compiled.split(s);
-    delete compiled;
-    return pieces;
-}
-
-unittest
-{
-    debug(regexp) printf("regexp.split.unittest()\n");
-
-    string[] parts = split("ab", "a*");
-    assert(parts.length == 2);
-    assert(parts[0] == "");
-    assert(parts[1] == "b");
-
-    foreach (idx, piece; split("abcabcabab", "C.", "i"))
-    {
-        switch (idx)
-        {
-            case 0:  assert(piece == "ab");  break;
-            case 1:  assert(piece == "b");   break;
-            case 2:  assert(piece == "bab"); break;
-            default: assert(0);
-        }
-    }
-}
-
-/****************************************************
- * Search s[] for first match with pattern[] with attributes[].
- * Params:
- * s = String to search.
- * pattern = Regular expression pattern.
- * attributes = Regular expression attributes.
- * Returns:
- * corresponding RegExp if found, null if not.
- * Example:
- * ---
- * import std.stdio;
- * import std.regexp;
- *
- * void main()
- * {
- * if (auto m = std.regexp.search("abcdef", "c"))
- * {
- * writefln("%s[%s]%s", m.pre, m[0], m.post);
- * }
- * }
- * // Prints:
- * // ab[c]def
- * ---
- */
-
-RegExp search(string s, string pattern, string attributes = null)
-{
-    auto matcher = new RegExp(pattern, attributes);
-    if (matcher.test(s))
-        return matcher;         // matched: hand the positioned matcher back
-
-    // No match: release the matcher; the assert documents that the
-    // reference is null afterwards, so null is what gets returned.
-    delete matcher;
-    assert(matcher is null);
-    return matcher;
-}
-
-unittest
-{
-    debug(regexp) printf("regexp.string.unittest()\n");
-
-    auto hit = std.regexp.search("abcdef", "c()");
-    assert(hit !is null);
-    assert(std.string.format("%s[%s]%s", hit.pre, hit[0], hit.post) == "ab[c]def");
-    assert(hit[1] == null);     // the empty capture reads back as null
-    assert(hit[2] == null);     // out-of-range capture index
-
-    assert(std.regexp.search("abcdef", "g") is null);
-}
-
-/* ********************************* RegExp ******************************** */
-
-/*****************************
- * RegExp is a class to handle regular expressions.
- *
- * It is the core foundation for adding powerful string pattern matching
- * capabilities to programs like grep, text editors, awk, sed, etc.
- */
-class RegExp
-{
- /*****
- * Construct a RegExp object. Compile pattern
- * with attributes into
- * an internal form for fast execution.
- * Params:
- * pattern = regular expression
- * attributes = _attributes
- * Throws: RegExpException if there are any compilation errors.
- * Example:
- * Declare two variables and assign to them a RegExp object:
- * ---
- * auto r = new RegExp("pattern");
- * auto s = new RegExp(r"p[1-5]\s*");
- * ---
- */
-    public this(string pattern, string attributes = null)
-    {
-        // Until compile() needs room for captures, pmatch is a one-element
-        // view over the gmatch member, so no array is allocated.
-        this.pmatch = (&this.gmatch)[0 .. 1];
-        this.compile(pattern, attributes);
-    }
-
- /*****
- * Generate instance of RegExp.
- * Params:
- * pattern = regular expression
- * attributes = _attributes
- * Throws: RegExpException if there are any compilation errors.
- * Example:
- * Declare two variables and assign to them a RegExp object:
- * ---
- * auto r = RegExp("pattern");
- * auto s = RegExp(r"p[1-5]\s*");
- * ---
- */
-    public static RegExp opCall(string pattern, string attributes = null)
-    {
-        // Lets callers write RegExp(...) without spelling out 'new'.
-        RegExp created = new RegExp(pattern, attributes);
-        return created;
-    }
-
-    unittest
-    {
-        debug(regexp) printf("regexp.opCall.unittest()\n");
-        auto valid = RegExp("hello", "m");
-
-        // 'q' is not a known attribute, so construction has to throw.
-        string text;
-        bool threw = false;
-        try
-        {
-            auto invalid = RegExp("hello", "q");
-        }
-        catch (RegExpException ree)
-        {
-            threw = true;
-            text = ree.toString();
-            //writefln("message: %s", ree);
-        }
-        assert(threw);
-        assert(std.algorithm.countUntil(text, "unrecognized attribute") >= 0);
-    }
-
- /************************************
- * Set up for start of foreach loop.
- * Returns:
- * search() returns instance of RegExp set up to _search string[].
- * Example:
- * ---
- * import std.stdio;
- * import std.regexp;
- *
- * void main()
- * {
- * foreach(m; RegExp("ab").search("abcabcabab"))
- * {
- * writefln("%s[%s]%s", m.pre, m[0], m.post);
- * }
- * }
- * // Prints:
- * // [ab]cabcabab
- * // abc[ab]cabab
- * // abcabc[ab]ab
- * // abcabcab[ab]
- * ---
- */
-
-    public RegExp search(string string)
-    {
-        // Rewind the "end of previous match" marker and remember the text,
-        // so the next test() call starts at the beginning of it.
-        this.pmatch[0].rm_eo = 0;
-        this.input = string;
-        return this;
-    }
-
- /** ditto */
-    public int opApply(scope int delegate(ref RegExp) dg)
-    {
-        // Hand this matcher to the loop body once per successive match; a
-        // non-zero return from the body stops the iteration and is passed on.
-        int rc = 0;
-        RegExp self = this;
-
-        while (rc == 0 && test())
-            rc = dg(self);
-
-        return rc;
-    }
-
-    unittest
-    {
-        debug(regexp) printf("regexp.search.unittest()\n");
-
-        auto expected = ["[ab]cabcabab",
-                         "abc[ab]cabab",
-                         "abcabc[ab]ab",
-                         "abcabcab[ab]"];
-        size_t seen;
-        foreach(m; RegExp("ab").search("abcabcabab"))
-        {
-            assert(seen < expected.length);
-            assert(std.string.format("%s[%s]%s", m.pre, m[0], m.post) == expected[seen]);
-            ++seen;
-        }
-    }
-
- /******************
- * Retrieve match n.
- *
- * n==0 means the matched substring, n>0 means the
- * n'th parenthesized subexpression.
- * if n is larger than the number of parenthesized subexpressions,
- * null is returned.
- */
-    public string opIndex(size_t n)
-    {
-        // Out-of-range indices and empty or unset captures all yield null.
-        if (n < pmatch.length)
-        {
-            auto m = pmatch[n];
-            if (m.rm_so != m.rm_eo)
-                return input[m.rm_so .. m.rm_eo];
-        }
-        return null;
-    }
-
- /**
- Same as $(D_PARAM opIndex(n)).
-
- WARNING:
-
- Scheduled for deprecation due to confusion with overloaded
- $(D_PARAM match(string)). Instead of $(D_PARAM regex.match(n))
- you may want to use $(D_PARAM regex[n]).
- */
-    public string match(size_t n)
-    {
-        // Named alias for the index operator.
-        return opIndex(n);
-    }
-
- /*******************
- * Return the slice of the input that precedes the matched substring.
- */
-    public @property string pre()
-    {
-        // Everything before the start of the whole-pattern match.
-        auto matchStart = pmatch[0].rm_so;
-        return input[0 .. matchStart];
-    }
-
- /*******************
- * Return the slice of the input that follows the matched substring.
- */
-    public @property string post()
-    {
-        // Everything from the end of the whole-pattern match onwards.
-        auto matchEnd = pmatch[0].rm_eo;
-        return input[matchEnd .. input.length];
-    }
-
- uint re_nsub; // number of parenthesized subexpression matches
- // pmatch[0] is the whole match, pmatch[1..] the captures. The constructor
- // points it at gmatch; compile() reallocates it when captures are needed.
- regmatch_t[] pmatch; // array [re_nsub + 1]
-
- string input; // the string to search
-
- // per instance:
-
- string pattern; // source text of the regular expression
-
- string flags; // source text of the attributes parameter
-
- int errors; // reset to 0 at the start of compile()
-
- uint attributes; // bitwise OR of REA values, built by compile() from the attribute string
-
- // Bit flags stored in 'attributes'. compile() maps 'g', 'i' and 'm' to the
- // first three; no attribute character handled there sets dotmatchlf.
- enum REA
- {
- global = 1, // has the g attribute
- ignoreCase = 2, // has the i attribute
- multiline = 4, // if treat as multiple lines separated
- // by newlines, or as a single line
- dotmatchlf = 8, // if . matches \n
- }
-
-
-private:
- size_t src; // current source index in input[]
- size_t src_start; // starting index for match in input[]
- size_t p; // position of parser in pattern[]
- regmatch_t gmatch; // match for the entire regular expression
- // (serves as storage for pmatch[0])
-
- const(ubyte)[] program; // pattern[] compiled into regular expression program
- OutBuffer buf; // scratch buffer used while compiling; compile() deletes it once program is taken
-
-
-
-
-/******************************************/
-
-// Opcodes
-
- // Instruction codes of the compiled program[]. test() looks at program[0]:
- // REchar enables its first-character scan, REbol and REanystar let it stop
- // retrying at later start positions (unless multiline).
- enum : ubyte
- {
- REend, // end of program
- REchar, // single character
- REichar, // single character, case insensitive
- REdchar, // single UCS character
- REidchar, // single wide character, case insensitive
- REanychar, // any character
- REanystar, // ".*"
- REstring, // string of characters
- REistring, // string of characters, case insensitive
- REtestbit, // any in bitmap, non-consuming
- REbit, // any in the bit map
- REnotbit, // any not in the bit map
- RErange, // any in the string
- REnotrange, // any not in the string
- REor, // a | b
- REplus, // 1 or more
- REstar, // 0 or more
- REquest, // 0 or 1
- REnm, // n..m
- REnmq, // n..m, non-greedy version
- REbol, // beginning of line
- REeol, // end of line
- REparen, // parenthesized subexpression
- REgoto, // goto offset
-
- // NOTE(review): the opcodes below carry no description here; their names
- // suggest the \b \B \d \D \s \S \w \W escapes and back references --
- // confirm against the parser.
- REwordboundary,
- REnotwordboundary,
- REdigit,
- REnotdigit,
- REspace,
- REnotspace,
- REword,
- REnotword,
- REbackref,
- };
-
-// BUG: should this include '$'?
-    private int isword(dchar c)
-    {
-        // Word characters are the underscore plus letters and digits.
-        if (c == '_')
-            return 1;
-        return isAlphaNum(c) ? 1 : 0;
-    }
-
- private uint inf = ~0u;
-
-/* ********************************
- * Throws RegExpException on error
- */
-
-    public void compile(string pattern, string attributes)
-    {
-        // Translates the attribute characters into REA flags, compiles
-        // pattern into program[] and sizes pmatch for the new capture count.
-        // Errors are reported through error(); the opCall unittest shows
-        // that an unknown attribute surfaces as a RegExpException.
-        //printf("RegExp.compile('%.*s', '%.*s')\n", pattern.length, pattern.ptr, attributes.length, attributes.ptr);
-
-        this.attributes = 0;
-        foreach (rchar c; attributes)
-        {
-            REA att;
-
-            switch (c)
-            {
-                case 'g': att = REA.global;     break;
-                case 'i': att = REA.ignoreCase; break;
-                case 'm': att = REA.multiline;  break;
-                default:
-                    error("unrecognized attribute");
-                    return;
-            }
-            if (this.attributes & att)
-            {
-                error("redundant attribute");
-                return;
-            }
-            this.attributes |= att;
-        }
-
-        input = null;
-
-        this.pattern = pattern;
-        this.flags = attributes;
-
-        re_nsub = 0;
-        errors = 0;
-
-        buf = new OutBuffer();
-        buf.reserve(pattern.length * 8);
-        p = 0;
-        parseRegexp();
-        if (p < pattern.length)
-        {
-            error("unmatched ')'");
-        }
-        // @@@ SKIPPING OPTIMIZATION SOLVES BUG 941 @@@
-        //optimize();
-        program = buf.data;
-        buf.data = null;
-        delete buf;
-
-        // Keep pmatch at exactly re_nsub + 1 entries. It used to be resized
-        // only when the capture count grew, so recompiling with fewer groups
-        // left stale capture slots that exec() and opIndex() still exposed.
-        if (pmatch.length != re_nsub + 1)
-        {
-            // Never resize the view over the gmatch member in place.
-            if (pmatch.ptr is &gmatch)
-                pmatch = null;
-            pmatch.length = re_nsub + 1;
-        }
-        pmatch[0].rm_so = 0;
-        pmatch[0].rm_eo = 0;
-    }
-
-/********************************************
- * Split s[] into an array of strings, using the regular
- * expression as the separator.
- * Returns:
- * array of slices into s[]
- */
-
- // Cuts s at every match of this expression. For each separator found, the
- // text before it is appended, followed by one entry per capture group
- // (empty or unset captures become empty slices). The text after the last
- // separator is appended at the end, even when it is empty.
- public string[] split(string s)
- {
- debug(regexp) printf("regexp.split()\n");
-
- string[] result;
-
- if (s.length)
- {
- // p = start of the piece being collected, q = where to search next
- ptrdiff_t p, q;
- for (q = p; q != s.length;)
- {
- if (test(s, q))
- {
- q = pmatch[0].rm_so;
- auto e = pmatch[0].rm_eo;
- // A match that ends exactly at p is empty and sits at the piece
- // start; it is not used as a separator, the scan just moves on.
- if (e != p)
- {
- result ~= s[p .. q];
- for (size_t i = 1; i < pmatch.length; i++)
- {
- auto so = pmatch[i].rm_so;
- auto eo = pmatch[i].rm_eo;
- if (so == eo)
- { so = 0; // -1 gives array bounds error
- eo = 0;
- }
- result ~= s[so .. eo];
- }
- // Resume collecting and searching right after the separator.
- q = p = e;
- continue;
- }
- }
- q++;
- }
- result ~= s[p .. s.length];
- }
- // Empty input: yields one (empty) piece unless the pattern matches the
- // empty string, in which case the result stays empty.
- else if (!test(s))
- result ~= s;
- return result;
- }
-
-    unittest
-    {
-        debug(regexp) printf("regexp.split.unittest()\n");
-
-        auto r = new RegExp("a*?", null);
-        string[] result;
-        string j;
-        int i;
-
-        // Non-greedy a*? only produces empty matches, so the cut falls
-        // between the characters.
-        result = r.split("ab");
-
-        assert(result.length == 2);
-        i = std.algorithm.cmp(result[0], "a");
-        assert(i == 0);
-        i = std.algorithm.cmp(result[1], "b");
-        assert(i == 0);
-
-        // Greedy a* consumes the leading "a" as the separator.
-        r = new RegExp("a*", null);
-        result = r.split("ab");
-        assert(result.length == 2);
-        i = std.algorithm.cmp(result[0], "");
-        assert(i == 0);
-        i = std.algorithm.cmp(result[1], "b");
-        assert(i == 0);
-
-        // Tag-like separators: the capture groups (optional slash, tag name)
-        // are interleaved with the text pieces. The input has to contain the
-        // tags, otherwise nothing matches and the expected join below cannot
-        // be produced.
-        r = new RegExp("<(\\/)?([^<>]+)>", null);
-        result = r.split("a<b>font</b>bar<TAG>hello</TAG>");
-
-        debug(regexp)
-        {
-            for (i = 0; i < result.length; i++)
-                printf("result[%d] = '%.*s'\n", i, result[i].length, result[i].ptr);
-        }
-
-        j = join(result, ",");
-        //printf("j = '%.*s'\n", j.length, j.ptr);
-        i = std.algorithm.cmp(j, "a,,b,font,/,b,bar,,TAG,hello,/,TAG,");
-        assert(i == 0);
-
-        // match() without the global attribute returns the first match only.
-        r = new RegExp("a[bc]", null);
-        result = r.match("123ab");
-        j = join(result, ",");
-        i = std.algorithm.cmp(j, "ab");
-        assert(i == 0);
-
-        result = r.match("ac");
-        j = join(result, ",");
-        i = std.algorithm.cmp(j, "ac");
-        assert(i == 0);
-    }
-
-/*************************************************
- * Search string[] for match with regular expression.
- * Returns:
- * index of match if successful, -1 if not found
- */
-
-    public ptrdiff_t find(string string)
-    {
-        // Start index of the first match, or -1 when nothing matches.
-        return test(string) ? pmatch[0].rm_so : -1;
-    }
-
-//deprecated alias find search;
-
-    unittest
-    {
-        debug(regexp) printf("regexp.find.unittest()\n");
-
-        RegExp matcher = new RegExp("abc", null);
-        assert(matcher.find("xabcy") == 1);
-        assert(matcher.find("cba") == -1);
-    }
-
-
-/*************************************************
- * Search s[] for match.
- * Returns:
- * If global attribute, return array of all matches.
- * If not global attribute, return same value as exec(s).
- */
-
-    public string[] match(string s)
-    {
-        // Without the global attribute this is simply exec(s).
-        if (!(attributes & REA.global))
-            return exec(s);
-
-        // Global: collect the text of every successive match.
-        string[] found;
-        ptrdiff_t from = 0;
-
-        while (test(s, from))
-        {
-            auto matchEnd = pmatch[0].rm_eo;
-            found ~= input[pmatch[0].rm_so .. matchEnd];
-
-            // Step past an empty match so the scan always advances.
-            from = (from == matchEnd) ? from + 1 : matchEnd;
-        }
-        return found;
-    }
-
-    unittest
-    {
-        debug(regexp) printf("regexp.match.unittest()\n");
-
-        // Not global: only the first match is reported.
-        auto single = new RegExp("a[bc]", null);
-        assert(std.algorithm.cmp(join(single.match("1ab2ac3"), ","), "ab") == 0);
-
-        // Global: every match is reported.
-        auto every = new RegExp("a[bc]", "g");
-        assert(std.algorithm.cmp(join(every.match("1ab2ac3"), ","), "ab,ac") == 0);
-    }
-
-
-/*************************************************
- * Find regular expression matches in s[]. Replace those matches
- * with a new string composed of format[] merged with the result of the
- * matches.
- * If global, replace all matches. Otherwise, replace first match.
- * Returns: the new string
- */
-
-    public string replace(string s, string format)
-    {
-        // Replaces the first match in s (every match with the global
-        // attribute) by the expansion of format for that match and returns
-        // the new string.
-        //
-        // An earlier revision switched to a plain substring replace whenever
-        // the matched text equalled the pattern source and the format
-        // expanded to itself. For patterns with metacharacters that ended the
-        // scan early: "a.c" applied to "a.c abc" rewrote the literal "a.c"
-        // and never reached "abc". Every match is now handled individually.
-        debug(regexp) printf("string = %.*s, format = %.*s\n", s.length, s.ptr, format.length, format.ptr);
-
-        string result = s;
-        ptrdiff_t lastindex = 0;    // where the next search starts, as an index into s
-        size_t offset = 0;          // (index in result) - (index in s); modular arithmetic
-
-        while (test(s, lastindex))
-        {
-            auto so = pmatch[0].rm_so;
-            auto eo = pmatch[0].rm_eo;
-
-            string replacement = replace(format);
-            result = replaceSlice(result, result[offset + so .. offset + eo], replacement);
-
-            if (!(attributes & REA.global))
-                break;              // not global: only the first match is replaced
-
-            offset += replacement.length - (eo - so);
-
-            if (lastindex == eo)
-                lastindex++;        // always consume some source
-            else
-                lastindex = eo;
-        }
-
-        return result;
-    }
-
-    unittest
-    {
-        debug(regexp) printf("regexp.replace.unittest()\n");
-
-        int i;
-        string result;
-        RegExp r;
-
-        r = new RegExp("a[bc]", "g");
-        result = r.replace("1ab2ac3", "x$&y");
-        i = std.algorithm.cmp(result, "1xaby2xacy3");
-        assert(i == 0);
-
-        r = new RegExp("ab", "g");
-        result = r.replace("1ab2ac3", "xy");
-        i = std.algorithm.cmp(result, "1xy2ac3");
-        assert(i == 0);
-
-        // Regression: matched text equal to the pattern source must not end
-        // the scan; the later match "abc" still has to be replaced.
-        r = new RegExp("a.c", "g");
-        result = r.replace("a.c abc", "X");
-        assert(result == "X X");
-    }
-
-
-/*************************************************
- * Search string[] for match.
- * Returns:
- * array of slices into string[] representing matches
- */
-
-    public string[] exec(string s)
-    {
-        debug(regexp) printf("regexp.exec(string = '%.*s')\n", s.length, s.ptr);
-
-        // Install the new input and rewind the previous-match marker, then
-        // let the continuing overload do the search.
-        this.input = s;
-        this.pmatch[0].rm_eo = 0;
-        this.pmatch[0].rm_so = 0;
-        return exec();
-    }
-
-/*************************************************
- * Pick up where last exec(string) or exec() left off,
- * searching string[] for next match.
- * Returns:
- * array of slices into string[] representing matches
- */
-
-    public string[] exec()
-    {
-        if (!test())
-            return null;
-
-        // One entry per pmatch slot; slots that are empty or did not take
-        // part in the match keep the array's default value, null.
-        auto slices = new string[pmatch.length];
-        foreach (idx, ref m; pmatch)
-        {
-            if (m.rm_so != m.rm_eo)
-                slices[idx] = input[m.rm_so .. m.rm_eo];
-        }
-        return slices;
-    }
-
-/************************************************
- * Search s[] for match.
- * Returns: false for no match, true for match
- * Example:
----
-import std.stdio;
-import std.regexp;
-import std.string;
-
-int grep(int delegate(char[]) pred, char[][] list)
-{
- int count;
- foreach (s; list)
- { if (pred(s))
- ++count;
- }
- return count;
-}
-
-void main()
-{
- auto x = grep(&RegExp("[Ff]oo").test,
- std.string.split("mary had a foo lamb"));
- writefln(x);
-}
----
-* which prints: 1
-*/
- //@@@
-public bool test(string s)
-    {
-        // Always search from the start of s and fold the int result to bool.
-        int matched = test(s, 0 /*pmatch[0].rm_eo*/);
-        return matched != 0;
-    }
-
-/************************************************
- * Pick up where last test(string) or test() left off, and search again.
- * Returns: 0 for no match, !=0 for match
- */
-
-    public int test()
-    {
-        // Continue in the current input right after the previous match.
-        auto resumeAt = pmatch[0].rm_eo;
-        return test(input, resumeAt);
-    }
-
-/************************************************
- * Test s[] starting at startindex against regular expression.
- * Returns: 0 for no match, !=0 for match
- */
-
- // Tries the compiled program at successive start positions of s, beginning
- // at startindex. On success pmatch[0] holds the bounds of the match and 1
- // is returned; otherwise 0. s becomes the current input either way.
- public int test(string s, size_t startindex)
- {
- char firstc;
-
- input = s;
- debug (regexp) printf("RegExp.test(input[] = '%.*s', startindex = %zd)\n", input.length, input.ptr, startindex);
- pmatch[0].rm_so = 0;
- pmatch[0].rm_eo = 0;
- // startindex is unsigned, so only the upper-bound check can ever fire;
- // startindex == input.length is allowed so empty matches at the end work.
- if (startindex < 0 || startindex > input.length)
- {
- return 0; // fail
- }
- //debug(regexp) printProgram(program);
-
- // First character optimization
- // If the program starts with a literal character, remember it so start
- // positions can be skipped quickly; disabled for letters when matching
- // case-insensitively.
- firstc = 0;
- if (program[0] == REchar)
- {
- firstc = program[1];
- if (attributes & REA.ignoreCase && isAlpha(firstc))
- firstc = 0;
- }
-
- for (auto si = startindex; ; si++)
- {
- if (firstc)
- {
- if (si == input.length)
- break; // no match
- if (input[si] != firstc)
- {
- // chr() advances si to the next occurrence of firstc
- si++;
- if (!chr(si, firstc)) // if first character not found
- break; // no match
- }
- }
- // Mark every capture as "did not participate" before the attempt.
- for (size_t i = 0; i < re_nsub + 1; i++)
- {
- pmatch[i].rm_so = -1;
- pmatch[i].rm_eo = -1;
- }
- src_start = src = si;
- if (trymatch(0, program.length))
- {
- // src was advanced by trymatch to the end of the match
- pmatch[0].rm_so = si;
- pmatch[0].rm_eo = src;
- //debug(regexp) printf("start = %d, end = %d\n", gmatch.rm_so, gmatch.rm_eo);
- return 1;
- }
- // If possible match must start at beginning, we are done
- if (program[0] == REbol || program[0] == REanystar)
- {
- if (attributes & REA.multiline)
- {
- // Scan for the next \n
- // (the loop's si++ then steps past it)
- if (!chr(si, '\n'))
- break; // no match if '\n' not found
- }
- else
- break;
- }
- if (si == input.length)
- break;
- debug(regexp)
- {
- auto sss = input[si + 1 .. input.length];
- printf("Starting new try: '%.*s'\n", sss.length, sss.ptr);
- }
- }
- return 0; // no match
- }
-
- /**
- Returns whether string $(D_PARAM s) matches $(D_PARAM this).
- */
- alias test opEquals;
-// bool opEquals(string s)
-// {
-// return test(s);
-// }
-
- unittest
- {
- assert("abc" == RegExp(".b."));
- assert("abc" != RegExp(".b.."));
- }
-
- int chr(ref size_t si, rchar c)
- {
- for (; si < input.length; si++)
- {
- if (input[si] == c)
- return 1;
- }
- return 0;
- }
-
-
- void printProgram(const(ubyte)[] prog)
- {
- //debug(regexp)
- {
- size_t len;
- uint n;
- uint m;
- ushort *pu;
- uint *puint;
- char[] str;
-
- printf("printProgram()\n");
- for (size_t pc = 0; pc < prog.length; )
- {
- printf("%3d: ", pc);
-
- //printf("prog[pc] = %d, REchar = %d, REnmq = %d\n", prog[pc], REchar, REnmq);
- switch (prog[pc])
- {
- case REchar:
- printf("\tREchar '%c'\n", prog[pc + 1]);
- pc += 1 + char.sizeof;
- break;
-
- case REichar:
- printf("\tREichar '%c'\n", prog[pc + 1]);
- pc += 1 + char.sizeof;
- break;
-
- case REdchar:
- printf("\tREdchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
- pc += 1 + dchar.sizeof;
- break;
-
- case REidchar:
- printf("\tREidchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
- pc += 1 + dchar.sizeof;
- break;
-
- case REanychar:
- printf("\tREanychar\n");
- pc++;
- break;
-
- case REstring:
- len = *cast(size_t *)&prog[pc + 1];
- str = (cast(char*)&prog[pc + 1 + size_t.sizeof])[0 .. len];
- printf("\tREstring x%x, '%.*s'\n", len, str.length, str.ptr);
- pc += 1 + size_t.sizeof + len * rchar.sizeof;
- break;
-
- case REistring:
- len = *cast(size_t *)&prog[pc + 1];
- str = (cast(char*)&prog[pc + 1 + size_t.sizeof])[0 .. len];
- printf("\tREistring x%x, '%.*s'\n", len, str.length, str.ptr);
- pc += 1 + size_t.sizeof + len * rchar.sizeof;
- break;
-
- case REtestbit:
- pu = cast(ushort *)&prog[pc + 1];
- printf("\tREtestbit %d, %d\n", pu[0], pu[1]);
- len = pu[1];
- pc += 1 + 2 * ushort.sizeof + len;
- break;
-
- case REbit:
- pu = cast(ushort *)&prog[pc + 1];
- len = pu[1];
- printf("\tREbit cmax=%02x, len=%d:", pu[0], len);
- for (n = 0; n < len; n++)
- printf(" %02x", prog[pc + 1 + 2 * ushort.sizeof + n]);
- printf("\n");
- pc += 1 + 2 * ushort.sizeof + len;
- break;
-
- case REnotbit:
- pu = cast(ushort *)&prog[pc + 1];
- printf("\tREnotbit %d, %d\n", pu[0], pu[1]);
- len = pu[1];
- pc += 1 + 2 * ushort.sizeof + len;
- break;
-
- case RErange:
- len = *cast(uint *)&prog[pc + 1];
- printf("\tRErange %d\n", len);
- // BUG: REAignoreCase?
- pc += 1 + uint.sizeof + len;
- break;
-
- case REnotrange:
- len = *cast(uint *)&prog[pc + 1];
- printf("\tREnotrange %d\n", len);
- // BUG: REAignoreCase?
- pc += 1 + uint.sizeof + len;
- break;
-
- case REbol:
- printf("\tREbol\n");
- pc++;
- break;
-
- case REeol:
- printf("\tREeol\n");
- pc++;
- break;
-
- case REor:
- len = *cast(uint *)&prog[pc + 1];
- printf("\tREor %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len);
- pc += 1 + uint.sizeof;
- break;
-
- case REgoto:
- len = *cast(uint *)&prog[pc + 1];
- printf("\tREgoto %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len);
- pc += 1 + uint.sizeof;
- break;
-
- case REanystar:
- printf("\tREanystar\n");
- pc++;
- break;
-
- case REnm:
- case REnmq:
- // len, n, m, ()
- puint = cast(uint *)&prog[pc + 1];
- len = puint[0];
- n = puint[1];
- m = puint[2];
- printf("\tREnm%s len=%d, n=%u, m=%u, pc=>%d\n",
- (prog[pc] == REnmq) ? "q".ptr : " ".ptr,
- len, n, m, pc + 1 + uint.sizeof * 3 + len);
- pc += 1 + uint.sizeof * 3;
- break;
-
- case REparen:
- // len, n, ()
- puint = cast(uint *)&prog[pc + 1];
- len = puint[0];
- n = puint[1];
- printf("\tREparen len=%d n=%d, pc=>%d\n", len, n, pc + 1 + uint.sizeof * 2 + len);
- pc += 1 + uint.sizeof * 2;
- break;
-
- case REend:
- printf("\tREend\n");
- return;
-
- case REwordboundary:
- printf("\tREwordboundary\n");
- pc++;
- break;
-
- case REnotwordboundary:
- printf("\tREnotwordboundary\n");
- pc++;
- break;
-
- case REdigit:
- printf("\tREdigit\n");
- pc++;
- break;
-
- case REnotdigit:
- printf("\tREnotdigit\n");
- pc++;
- break;
-
- case REspace:
- printf("\tREspace\n");
- pc++;
- break;
-
- case REnotspace:
- printf("\tREnotspace\n");
- pc++;
- break;
-
- case REword:
- printf("\tREword\n");
- pc++;
- break;
-
- case REnotword:
- printf("\tREnotword\n");
- pc++;
- break;
-
- case REbackref:
- printf("\tREbackref %d\n", prog[1]);
- pc += 2;
- break;
-
- default:
- assert(0);
- }
- }
- }
- }
-
-
-/**************************************************
- * Match input against a section of the program[].
- * Returns:
- * 1 if successful match
- * 0 no match
- */
-
- int trymatch(size_t pc, size_t pcend)
- {
- size_t len;
- size_t n;
- size_t m;
- size_t count;
- size_t pop;
- size_t ss;
- regmatch_t *psave;
- size_t c1;
- size_t c2;
- ushort* pu;
- uint* puint;
-
- debug(regexp)
- {
- auto sss = input[src .. input.length];
- printf("RegExp.trymatch(pc = %zd, src = '%.*s', pcend = %zd)\n", pc, sss.length, sss.ptr, pcend);
- }
- auto srcsave = src;
- psave = null;
- for (;;)
- {
- if (pc == pcend) // if done matching
- { debug(regex) printf("\tprogend\n");
- return 1;
- }
-
- //printf("\top = %d\n", program[pc]);
- switch (program[pc])
- {
- case REchar:
- if (src == input.length)
- goto Lnomatch;
- debug(regexp) printf("\tREchar '%c', src = '%c'\n", program[pc + 1], input[src]);
- if (program[pc + 1] != input[src])
- goto Lnomatch;
- src++;
- pc += 1 + char.sizeof;
- break;
-
- case REichar:
- if (src == input.length)
- goto Lnomatch;
- debug(regexp) printf("\tREichar '%c', src = '%c'\n", program[pc + 1], input[src]);
- c1 = program[pc + 1];
- c2 = input[src];
- if (c1 != c2)
- {
- if (isLower(cast(rchar)c2))
- c2 = std.ascii.toUpper(cast(rchar)c2);
- else
- goto Lnomatch;
- if (c1 != c2)
- goto Lnomatch;
- }
- src++;
- pc += 1 + char.sizeof;
- break;
-
- case REdchar:
- debug(regexp) printf("\tREdchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
- if (src == input.length)
- goto Lnomatch;
- if (*(cast(dchar *)&program[pc + 1]) != input[src])
- goto Lnomatch;
- src++;
- pc += 1 + dchar.sizeof;
- break;
-
- case REidchar:
- debug(regexp) printf("\tREidchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
- if (src == input.length)
- goto Lnomatch;
- c1 = *(cast(dchar *)&program[pc + 1]);
- c2 = input[src];
- if (c1 != c2)
- {
- if (isLower(cast(rchar)c2))
- c2 = std.ascii.toUpper(cast(rchar)c2);
- else
- goto Lnomatch;
- if (c1 != c2)
- goto Lnomatch;
- }
- src++;
- pc += 1 + dchar.sizeof;
- break;
-
- case REanychar:
- debug(regexp) printf("\tREanychar\n");
- if (src == input.length)
- goto Lnomatch;
- if (!(attributes & REA.dotmatchlf) && input[src] == cast(rchar)'\n')
- goto Lnomatch;
- src += std.utf.stride(input, src);
- //src++;
- pc++;
- break;
-
- case REstring:
- len = *cast(size_t *)&program[pc + 1];
- debug(regexp)
- {
- auto sss2 = (&program[pc + 1 + size_t.sizeof])[0 .. len];
- printf("\tREstring x%x, '%.*s'\n", len, sss2.length, sss2.ptr);
- }
- if (src + len > input.length)
- goto Lnomatch;
- if (memcmp(&program[pc + 1 + size_t.sizeof], &input[src], len * rchar.sizeof))
- goto Lnomatch;
- src += len;
- pc += 1 + size_t.sizeof + len * rchar.sizeof;
- break;
-
- case REistring:
- len = *cast(size_t *)&program[pc + 1];
- debug(regexp)
- {
- auto sss2 = (&program[pc + 1 + size_t.sizeof])[0 .. len];
- printf("\tREistring x%x, '%.*s'\n", len, sss2.length, sss2.ptr);
- }
- if (src + len > input.length)
- goto Lnomatch;
- if (icmp((cast(char*)&program[pc + 1 + size_t.sizeof])[0..len],
- input[src .. src + len]))
- goto Lnomatch;
- src += len;
- pc += 1 + size_t.sizeof + len * rchar.sizeof;
- break;
-
- case REtestbit:
- pu = (cast(ushort *)&program[pc + 1]);
- if (src == input.length)
- goto Lnomatch;
- debug(regexp) printf("\tREtestbit %d, %d, '%c', x%02x\n",
- pu[0], pu[1], input[src], input[src]);
- len = pu[1];
- c1 = input[src];
- //printf("[x%02x]=x%02x, x%02x\n", c1 >> 3, ((&program[pc + 1 + 4])[c1 >> 3] ), (1 << (c1 & 7)));
- if (c1 <= pu[0] &&
- !((&(program[pc + 1 + 4]))[c1 >> 3] & (1 << (c1 & 7))))
- goto Lnomatch;
- pc += 1 + 2 * ushort.sizeof + len;
- break;
-
- case REbit:
- pu = (cast(ushort *)&program[pc + 1]);
- if (src == input.length)
- goto Lnomatch;
- debug(regexp) printf("\tREbit %d, %d, '%c'\n",
- pu[0], pu[1], input[src]);
- len = pu[1];
- c1 = input[src];
- if (c1 > pu[0])
- goto Lnomatch;
- if (!((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7))))
- goto Lnomatch;
- src++;
- pc += 1 + 2 * ushort.sizeof + len;
- break;
-
- case REnotbit:
- pu = (cast(ushort *)&program[pc + 1]);
- if (src == input.length)
- goto Lnomatch;
- debug(regexp) printf("\tREnotbit %d, %d, '%c'\n",
- pu[0], pu[1], input[src]);
- len = pu[1];
- c1 = input[src];
- if (c1 <= pu[0] &&
- ((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7))))
- goto Lnomatch;
- src++;
- pc += 1 + 2 * ushort.sizeof + len;
- break;
-
- case RErange:
- len = *cast(uint *)&program[pc + 1];
- debug(regexp) printf("\tRErange %d\n", len);
- if (src == input.length)
- goto Lnomatch;
- // BUG: REA.ignoreCase?
- if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) == null)
- goto Lnomatch;
- src++;
- pc += 1 + uint.sizeof + len;
- break;
-
- case REnotrange:
- len = *cast(uint *)&program[pc + 1];
- debug(regexp) printf("\tREnotrange %d\n", len);
- if (src == input.length)
- goto Lnomatch;
- // BUG: REA.ignoreCase?
- if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) != null)
- goto Lnomatch;
- src++;
- pc += 1 + uint.sizeof + len;
- break;
-
- case REbol:
- debug(regexp) printf("\tREbol\n");
- if (src == 0)
- {
- }
- else if (attributes & REA.multiline)
- {
- if (input[src - 1] != '\n')
- goto Lnomatch;
- }
- else
- goto Lnomatch;
- pc++;
- break;
-
- case REeol:
- debug(regexp) printf("\tREeol\n");
- if (src == input.length)
- {
- }
- else if (attributes & REA.multiline && input[src] == '\n')
- src++;
- else
- goto Lnomatch;
- pc++;
- break;
-
- case REor:
- len = (cast(uint *)&program[pc + 1])[0];
- debug(regexp) printf("\tREor %d\n", len);
- pop = pc + 1 + uint.sizeof;
- ss = src;
- if (trymatch(pop, pcend))
- {
- if (pcend != program.length)
- {
- auto s = src;
- if (trymatch(pcend, program.length))
- { debug(regexp) printf("\tfirst operand matched\n");
- src = s;
- return 1;
- }
- else
- {
- // If second branch doesn't match to end, take first anyway
- src = ss;
- if (!trymatch(pop + len, program.length))
- {
- debug(regexp) printf("\tfirst operand matched\n");
- src = s;
- return 1;
- }
- }
- src = ss;
- }
- else
- { debug(regexp) printf("\tfirst operand matched\n");
- return 1;
- }
- }
- pc = pop + len; // proceed with 2nd branch
- break;
-
- case REgoto:
- debug(regexp) printf("\tREgoto\n");
- len = (cast(uint *)&program[pc + 1])[0];
- pc += 1 + uint.sizeof + len;
- break;
-
- case REanystar:
- debug(regexp) printf("\tREanystar\n");
- pc++;
- for (;;)
- {
- auto s1 = src;
- if (src == input.length)
- break;
- if (!(attributes & REA.dotmatchlf) && input[src] == '\n')
- break;
- src++;
- auto s2 = src;
-
- // If no match after consumption, but it
- // did match before, then no match
- if (!trymatch(pc, program.length))
- {
- src = s1;
- // BUG: should we save/restore pmatch[]?
- if (trymatch(pc, program.length))
- {
- src = s1; // no match
- break;
- }
- }
- src = s2;
- }
- break;
-
- case REnm:
- case REnmq:
- // len, n, m, ()
- puint = cast(uint *)&program[pc + 1];
- len = puint[0];
- n = puint[1];
- m = puint[2];
- debug(regexp) printf("\tREnm%s len=%d, n=%u, m=%u\n",
- (program[pc] == REnmq) ? "q".ptr : "".ptr, len, n, m);
- pop = pc + 1 + uint.sizeof * 3;
- for (count = 0; count < n; count++)
- {
- if (!trymatch(pop, pop + len))
- goto Lnomatch;
- }
- if (!psave && count < m)
- {
- //version (Win32)
- psave = cast(regmatch_t *)alloca((re_nsub + 1) * regmatch_t.sizeof);
- //else
- //psave = new regmatch_t[re_nsub + 1];
- }
- if (program[pc] == REnmq) // if minimal munch
- {
- for (; count < m; count++)
- {
- memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof);
- auto s1 = src;
-
- if (trymatch(pop + len, program.length))
- {
- src = s1;
- memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof);
- break;
- }
-
- if (!trymatch(pop, pop + len))
- { debug(regexp) printf("\tdoesn't match subexpression\n");
- break;
- }
-
- // If source is not consumed, don't
- // infinite loop on the match
- if (s1 == src)
- { debug(regexp) printf("\tsource is not consumed\n");
- break;
- }
- }
- }
- else // maximal munch
- {
- for (; count < m; count++)
- {
- memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof);
- auto s1 = src;
- if (!trymatch(pop, pop + len))
- { debug(regexp) printf("\tdoesn't match subexpression\n");
- break;
- }
- auto s2 = src;
-
- // If source is not consumed, don't
- // infinite loop on the match
- if (s1 == s2)
- { debug(regexp) printf("\tsource is not consumed\n");
- break;
- }
-
- // If no match after consumption, but it
- // did match before, then no match
- if (!trymatch(pop + len, program.length))
- {
- src = s1;
- if (trymatch(pop + len, program.length))
- {
- src = s1; // no match
- memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof);
- break;
- }
- }
- src = s2;
- }
- }
- debug(regexp) printf("\tREnm len=%d, n=%u, m=%u, DONE count=%d\n", len, n, m, count);
- pc = pop + len;
- break;
-
- case REparen:
- // len, ()
- debug(regexp) printf("\tREparen\n");
- puint = cast(uint *)&program[pc + 1];
- len = puint[0];
- n = puint[1];
- pop = pc + 1 + uint.sizeof * 2;
- ss = src;
- if (!trymatch(pop, pop + len))
- goto Lnomatch;
- pmatch[n + 1].rm_so = ss;
- pmatch[n + 1].rm_eo = src;
- pc = pop + len;
- break;
-
- case REend:
- debug(regexp) printf("\tREend\n");
- return 1; // successful match
-
- case REwordboundary:
- debug(regexp) printf("\tREwordboundary\n");
- if (src > 0 && src < input.length)
- {
- c1 = input[src - 1];
- c2 = input[src];
- if (!(
- (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) ||
- (!isword(cast(rchar)c1) && isword(cast(rchar)c2))
- )
- )
- goto Lnomatch;
- }
- pc++;
- break;
-
- case REnotwordboundary:
- debug(regexp) printf("\tREnotwordboundary\n");
- if (src == 0 || src == input.length)
- goto Lnomatch;
- c1 = input[src - 1];
- c2 = input[src];
- if (
- (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) ||
- (!isword(cast(rchar)c1) && isword(cast(rchar)c2))
- )
- goto Lnomatch;
- pc++;
- break;
-
- case REdigit:
- debug(regexp) printf("\tREdigit\n");
- if (src == input.length)
- goto Lnomatch;
- if (!isDigit(input[src]))
- goto Lnomatch;
- src++;
- pc++;
- break;
-
- case REnotdigit:
- debug(regexp) printf("\tREnotdigit\n");
- if (src == input.length)
- goto Lnomatch;
- if (isDigit(input[src]))
- goto Lnomatch;
- src++;
- pc++;
- break;
-
- case REspace:
- debug(regexp) printf("\tREspace\n");
- if (src == input.length)
- goto Lnomatch;
- if (!isWhite(input[src]))
- goto Lnomatch;
- src++;
- pc++;
- break;
-
- case REnotspace:
- debug(regexp) printf("\tREnotspace\n");
- if (src == input.length)
- goto Lnomatch;
- if (isWhite(input[src]))
- goto Lnomatch;
- src++;
- pc++;
- break;
-
- case REword:
- debug(regexp) printf("\tREword\n");
- if (src == input.length)
- goto Lnomatch;
- if (!isword(input[src]))
- goto Lnomatch;
- src++;
- pc++;
- break;
-
- case REnotword:
- debug(regexp) printf("\tREnotword\n");
- if (src == input.length)
- goto Lnomatch;
- if (isword(input[src]))
- goto Lnomatch;
- src++;
- pc++;
- break;
-
- case REbackref:
- {
- n = program[pc + 1];
- debug(regexp) printf("\tREbackref %d\n", n);
-
- auto so = pmatch[n + 1].rm_so;
- auto eo = pmatch[n + 1].rm_eo;
- len = eo - so;
- if (src + len > input.length)
- goto Lnomatch;
- else if (attributes & REA.ignoreCase)
- {
- if (icmp(input[src .. src + len], input[so .. eo]))
- goto Lnomatch;
- }
- else if (memcmp(&input[src], &input[so], len * rchar.sizeof))
- goto Lnomatch;
- src += len;
- pc += 2;
- break;
- }
-
- default:
- assert(0);
- }
- }
-
- Lnomatch:
- debug(regexp) printf("\tnomatch pc=%d\n", pc);
- src = srcsave;
- return 0;
- }
-
-/* =================== Compiler ================== */
-
- int parseRegexp()
- {
- size_t gotooffset;
- uint len1;
- uint len2;
-
- debug(regexp)
- {
- auto sss = pattern[p .. pattern.length];
- printf("parseRegexp() '%.*s'\n", sss.length, sss.ptr);
- }
- auto offset = buf.offset;
- for (;;)
- {
- assert(p <= pattern.length);
- if (p == pattern.length)
- { buf.write(REend);
- return 1;
- }
- switch (pattern[p])
- {
- case ')':
- return 1;
-
- case '|':
- p++;
- gotooffset = buf.offset;
- buf.write(REgoto);
- buf.write(cast(uint)0);
- len1 = cast(uint)(buf.offset - offset);
- buf.spread(offset, 1 + uint.sizeof);
- gotooffset += 1 + uint.sizeof;
- parseRegexp();
- len2 = cast(uint)(buf.offset - (gotooffset + 1 + uint.sizeof));
- buf.data[offset] = REor;
- (cast(uint *)&buf.data[offset + 1])[0] = len1;
- (cast(uint *)&buf.data[gotooffset + 1])[0] = len2;
- break;
-
- default:
- parsePiece();
- break;
- }
- }
- }
-
- int parsePiece()
- {
- uint len;
- uint n;
- uint m;
- ubyte op;
- auto plength = pattern.length;
-
- debug(regexp)
- {
- auto sss = pattern[p .. pattern.length];
- printf("parsePiece() '%.*s'\n", sss.length, sss.ptr);
- }
- auto offset = buf.offset;
- parseAtom();
- if (p == plength)
- return 1;
- switch (pattern[p])
- {
- case '*':
- // Special optimization: replace .* with REanystar
- if (buf.offset - offset == 1 &&
- buf.data[offset] == REanychar &&
- p + 1 < plength &&
- pattern[p + 1] != '?')
- {
- buf.data[offset] = REanystar;
- p++;
- break;
- }
-
- n = 0;
- m = inf;
- goto Lnm;
-
- case '+':
- n = 1;
- m = inf;
- goto Lnm;
-
- case '?':
- n = 0;
- m = 1;
- goto Lnm;
-
- case '{': // {n} {n,} {n,m}
- p++;
- if (p == plength || !isDigit(pattern[p]))
- goto Lerr;
- n = 0;
- do
- {
- // BUG: handle overflow
- n = n * 10 + pattern[p] - '0';
- p++;
- if (p == plength)
- goto Lerr;
- } while (isDigit(pattern[p]));
- if (pattern[p] == '}') // {n}
- { m = n;
- goto Lnm;
- }
- if (pattern[p] != ',')
- goto Lerr;
- p++;
- if (p == plength)
- goto Lerr;
- if (pattern[p] == /*{*/ '}') // {n,}
- { m = inf;
- goto Lnm;
- }
- if (!isDigit(pattern[p]))
- goto Lerr;
- m = 0; // {n,m}
- do
- {
- // BUG: handle overflow
- m = m * 10 + pattern[p] - '0';
- p++;
- if (p == plength)
- goto Lerr;
- } while (isDigit(pattern[p]));
- if (pattern[p] != /*{*/ '}')
- goto Lerr;
- goto Lnm;
-
- Lnm:
- p++;
- op = REnm;
- if (p < plength && pattern[p] == '?')
- { op = REnmq; // minimal munch version
- p++;
- }
- len = cast(uint)(buf.offset - offset);
- buf.spread(offset, 1 + uint.sizeof * 3);
- buf.data[offset] = op;
- uint* puint = cast(uint *)&buf.data[offset + 1];
- puint[0] = len;
- puint[1] = n;
- puint[2] = m;
- break;
-
- default:
- break;
- }
- return 1;
-
- Lerr:
- error("badly formed {n,m}");
- assert(0);
- }
-
- int parseAtom()
- { ubyte op;
- size_t offset;
- rchar c;
-
- debug(regexp)
- {
- auto sss = pattern[p .. pattern.length];
- printf("parseAtom() '%.*s'\n", sss.length, sss.ptr);
- }
- if (p < pattern.length)
- {
- c = pattern[p];
- switch (c)
- {
- case '*':
- case '+':
- case '?':
- error("*+? not allowed in atom");
- p++;
- return 0;
-
- case '(':
- p++;
- buf.write(REparen);
- offset = buf.offset;
- buf.write(cast(uint)0); // reserve space for length
- buf.write(re_nsub);
- re_nsub++;
- parseRegexp();
- *cast(uint *)&buf.data[offset] =
- cast(uint)(buf.offset - (offset + uint.sizeof * 2));
- if (p == pattern.length || pattern[p] != ')')
- {
- error("')' expected");
- return 0;
- }
- p++;
- break;
-
- case '[':
- if (!parseRange())
- return 0;
- break;
-
- case '.':
- p++;
- buf.write(REanychar);
- break;
-
- case '^':
- p++;
- buf.write(REbol);
- break;
-
- case '$':
- p++;
- buf.write(REeol);
- break;
-
- case '\\':
- p++;
- if (p == pattern.length)
- { error("no character past '\\'");
- return 0;
- }
- c = pattern[p];
- switch (c)
- {
- case 'b': op = REwordboundary; goto Lop;
- case 'B': op = REnotwordboundary; goto Lop;
- case 'd': op = REdigit; goto Lop;
- case 'D': op = REnotdigit; goto Lop;
- case 's': op = REspace; goto Lop;
- case 'S': op = REnotspace; goto Lop;
- case 'w': op = REword; goto Lop;
- case 'W': op = REnotword; goto Lop;
-
- Lop:
- buf.write(op);
- p++;
- break;
-
- case 'f':
- case 'n':
- case 'r':
- case 't':
- case 'v':
- case 'c':
- case 'x':
- case 'u':
- case '0':
- c = cast(char)escape();
- goto Lbyte;
-
- case '1': case '2': case '3':
- case '4': case '5': case '6':
- case '7': case '8': case '9':
- c -= '1';
- if (c < re_nsub)
- { buf.write(REbackref);
- buf.write(cast(ubyte)c);
- }
- else
- { error("no matching back reference");
- return 0;
- }
- p++;
- break;
-
- default:
- p++;
- goto Lbyte;
- }
- break;
-
- default:
- p++;
- Lbyte:
- op = REchar;
- if (attributes & REA.ignoreCase)
- {
- if (isAlpha(c))
- {
- op = REichar;
- c = cast(char)std.ascii.toUpper(c);
- }
- }
- if (op == REchar && c <= 0xFF)
- {
- // Look ahead and see if we can make this into
- // an REstring
- auto q = p;
- for (; q < pattern.length; ++q)
- { rchar qc = pattern[q];
-
- switch (qc)
- {
- case '{':
- case '*':
- case '+':
- case '?':
- if (q == p)
- goto Lchar;
- q--;
- break;
-
- case '(': case ')':
- case '|':
- case '[': case ']':
- case '.': case '^':
- case '$': case '\\':
- case '}':
- break;
-
- default:
- continue;
- }
- break;
- }
- auto len = q - p;
- if (len > 0)
- {
- debug(regexp) printf("writing string len %d, c = '%c', pattern[p] = '%c'\n", len+1, c, pattern[p]);
- buf.reserve(5 + (1 + len) * rchar.sizeof);
- buf.write((attributes & REA.ignoreCase) ? REistring : REstring);
- buf.write(len + 1);
- buf.write(c);
- buf.write(pattern[p .. p + len]);
- p = q;
- break;
- }
- }
- if (c >= 0x80)
- {
- // Convert to dchar opcode
- op = (op == REchar) ? REdchar : REidchar;
- buf.write(op);
- buf.write(c);
- }
- else
- {
- Lchar:
- debug(regexp) printf("It's an REchar '%c'\n", c);
- buf.write(op);
- buf.write(cast(char)c);
- }
- break;
- }
- }
- return 1;
- }
-
-private:
- class Range
- {
- size_t maxc;
- size_t maxb;
- OutBuffer buf;
- ubyte* base;
- BitArray bits;
-
- this(OutBuffer buf)
- {
- this.buf = buf;
- if (buf.data.length)
- this.base = &buf.data[buf.offset];
- }
-
- void setbitmax(size_t u)
- {
- //printf("setbitmax(x%x), maxc = x%x\n", u, maxc);
- if (u > maxc)
- {
- maxc = u;
- auto b = u / 8;
- if (b >= maxb)
- {
- auto u2 = base ? base - &buf.data[0] : 0;
- buf.fill0(b - maxb + 1);
- base = &buf.data[u2];
- maxb = b + 1;
- //bits = (cast(bit*)this.base)[0 .. maxc + 1];
- bits.ptr = cast(size_t*)this.base;
- }
- bits.len = maxc + 1;
- }
- }
-
- void setbit2(size_t u)
- {
- setbitmax(u + 1);
- //printf("setbit2 [x%02x] |= x%02x\n", u >> 3, 1 << (u & 7));
- bits[u] = 1;
- }
-
- };
-
- int parseRange()
- {
- int c;
- int c2;
- uint i;
- uint cmax;
-
- cmax = 0x7F;
- p++;
- ubyte op = REbit;
- if (p == pattern.length)
- goto Lerr;
- if (pattern[p] == '^')
- { p++;
- op = REnotbit;
- if (p == pattern.length)
- goto Lerr;
- }
- buf.write(op);
- auto offset = buf.offset;
- buf.write(cast(uint)0); // reserve space for length
- buf.reserve(128 / 8);
- auto r = new Range(buf);
- if (op == REnotbit)
- r.setbit2(0);
- switch (pattern[p])
- {
- case ']':
- case '-':
- c = pattern[p];
- p++;
- r.setbit2(c);
- break;
-
- default:
- break;
- }
-
- enum RS { start, rliteral, dash }
- RS rs;
-
- rs = RS.start;
- for (;;)
- {
- if (p == pattern.length)
- goto Lerr;
- switch (pattern[p])
- {
- case ']':
- switch (rs)
- { case RS.dash:
- r.setbit2('-');
- goto case;
- case RS.rliteral:
- r.setbit2(c);
- break;
- case RS.start:
- break;
- default:
- assert(0);
- }
- p++;
- break;
-
- case '\\':
- p++;
- r.setbitmax(cmax);
- if (p == pattern.length)
- goto Lerr;
- switch (pattern[p])
- {
- case 'd':
- for (i = '0'; i <= '9'; i++)
- r.bits[i] = 1;
- goto Lrs;
-
- case 'D':
- for (i = 1; i < '0'; i++)
- r.bits[i] = 1;
- for (i = '9' + 1; i <= cmax; i++)
- r.bits[i] = 1;
- goto Lrs;
-
- case 's':
- for (i = 0; i <= cmax; i++)
- if (isWhite(i))
- r.bits[i] = 1;
- goto Lrs;
-
- case 'S':
- for (i = 1; i <= cmax; i++)
- if (!isWhite(i))
- r.bits[i] = 1;
- goto Lrs;
-
- case 'w':
- for (i = 0; i <= cmax; i++)
- if (isword(cast(rchar)i))
- r.bits[i] = 1;
- goto Lrs;
-
- case 'W':
- for (i = 1; i <= cmax; i++)
- if (!isword(cast(rchar)i))
- r.bits[i] = 1;
- goto Lrs;
-
- Lrs:
- switch (rs)
- { case RS.dash:
- r.setbit2('-');
- goto case;
- case RS.rliteral:
- r.setbit2(c);
- break;
- default:
- break;
- }
- rs = RS.start;
- continue;
-
- default:
- break;
- }
- c2 = escape();
- goto Lrange;
-
- case '-':
- p++;
- if (rs == RS.start)
- goto Lrange;
- else if (rs == RS.rliteral)
- rs = RS.dash;
- else if (rs == RS.dash)
- {
- r.setbit2(c);
- r.setbit2('-');
- rs = RS.start;
- }
- continue;
-
- default:
- c2 = pattern[p];
- p++;
- Lrange:
- switch (rs)
- { case RS.rliteral:
- r.setbit2(c);
- goto case;
- case RS.start:
- c = c2;
- rs = RS.rliteral;
- break;
-
- case RS.dash:
- if (c > c2)
- { error("inverted range in character class");
- return 0;
- }
- r.setbitmax(c2);
- //printf("c = %x, c2 = %x\n",c,c2);
- for (; c <= c2; c++)
- r.bits[c] = 1;
- rs = RS.start;
- break;
-
- default:
- assert(0);
- }
- continue;
- }
- break;
- }
- if (attributes & REA.ignoreCase)
- {
- // BUG: what about dchar?
- r.setbitmax(0x7F);
- for (c = 'a'; c <= 'z'; c++)
- {
- if (r.bits[c])
- r.bits[c + 'A' - 'a'] = 1;
- else if (r.bits[c + 'A' - 'a'])
- r.bits[c] = 1;
- }
- }
- //printf("maxc = %d, maxb = %d\n",r.maxc,r.maxb);
- (cast(ushort *)&buf.data[offset])[0] = cast(ushort)r.maxc;
- (cast(ushort *)&buf.data[offset])[1] = cast(ushort)r.maxb;
- return 1;
-
- Lerr:
- error("invalid range");
- return 0;
- }
-
- void error(string msg)
- {
- errors++;
- debug(regexp) printf("error: %.*s\n", msg.length, msg.ptr);
-//assert(0);
-//*(char*)0=0;
- throw new RegExpException(msg);
- }
-
-// p is following the \ char
- int escape()
- in
- {
- assert(p < pattern.length);
- }
- body
- { int c;
- int i;
- rchar tc;
-
- c = pattern[p]; // none of the cases are multibyte
- switch (c)
- {
- case 'b': c = '\b'; break;
- case 'f': c = '\f'; break;
- case 'n': c = '\n'; break;
- case 'r': c = '\r'; break;
- case 't': c = '\t'; break;
- case 'v': c = '\v'; break;
-
- // BUG: Perl does \a and \e too, should we?
-
- case 'c':
- ++p;
- if (p == pattern.length)
- goto Lretc;
- c = pattern[p];
- // Note: we are deliberately not allowing dchar letters
- if (!(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')))
- {
- Lcerr:
- error("letter expected following \\c");
- return 0;
- }
- c &= 0x1F;
- break;
-
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- c -= '0';
- for (i = 0; i < 2; i++)
- {
- p++;
- if (p == pattern.length)
- goto Lretc;
- tc = pattern[p];
- if ('0' <= tc && tc <= '7')
- { c = c * 8 + (tc - '0');
- // Treat overflow as if last
- // digit was not an octal digit
- if (c >= 0xFF)
- { c >>= 3;
- return c;
- }
- }
- else
- return c;
- }
- break;
-
- case 'x':
- c = 0;
- for (i = 0; i < 2; i++)
- {
- p++;
- if (p == pattern.length)
- goto Lretc;
- tc = pattern[p];
- if ('0' <= tc && tc <= '9')
- c = c * 16 + (tc - '0');
- else if ('a' <= tc && tc <= 'f')
- c = c * 16 + (tc - 'a' + 10);
- else if ('A' <= tc && tc <= 'F')
- c = c * 16 + (tc - 'A' + 10);
- else if (i == 0) // if no hex digits after \x
- {
- // Not a valid \xXX sequence
- return 'x';
- }
- else
- return c;
- }
- break;
-
- case 'u':
- c = 0;
- for (i = 0; i < 4; i++)
- {
- p++;
- if (p == pattern.length)
- goto Lretc;
- tc = pattern[p];
- if ('0' <= tc && tc <= '9')
- c = c * 16 + (tc - '0');
- else if ('a' <= tc && tc <= 'f')
- c = c * 16 + (tc - 'a' + 10);
- else if ('A' <= tc && tc <= 'F')
- c = c * 16 + (tc - 'A' + 10);
- else
- {
- // Not a valid \uXXXX sequence
- p -= i;
- return 'u';
- }
- }
- break;
-
- default:
- break;
- }
- p++;
- Lretc:
- return c;
- }
-
-/* ==================== optimizer ======================= */
-
- void optimize()
- { ubyte[] prog;
-
- debug(regexp) printf("RegExp.optimize()\n");
- prog = buf.toBytes();
- for (size_t i = 0; 1;)
- {
- //printf("\tprog[%d] = %d, %d\n", i, prog[i], REstring);
- switch (prog[i])
- {
- case REend:
- case REanychar:
- case REanystar:
- case REbackref:
- case REeol:
- case REchar:
- case REichar:
- case REdchar:
- case REidchar:
- case REstring:
- case REistring:
- case REtestbit:
- case REbit:
- case REnotbit:
- case RErange:
- case REnotrange:
- case REwordboundary:
- case REnotwordboundary:
- case REdigit:
- case REnotdigit:
- case REspace:
- case REnotspace:
- case REword:
- case REnotword:
- return;
-
- case REbol:
- i++;
- continue;
-
- case REor:
- case REnm:
- case REnmq:
- case REparen:
- case REgoto:
- {
- auto bitbuf = new OutBuffer;
- auto r = new Range(bitbuf);
- auto offset = i;
- if (starrchars(r, prog[i .. prog.length]))
- {
- debug(regexp) printf("\tfilter built\n");
- buf.spread(offset, 1 + 4 + r.maxb);
- buf.data[offset] = REtestbit;
- (cast(ushort *)&buf.data[offset + 1])[0] = cast(ushort)r.maxc;
- (cast(ushort *)&buf.data[offset + 1])[1] = cast(ushort)r.maxb;
- i = offset + 1 + 4;
- buf.data[i .. i + r.maxb] = r.base[0 .. r.maxb];
- }
- return;
- }
- default:
- assert(0);
- }
- }
- }
-
-/////////////////////////////////////////
-// Build a first-character filter for the compiled program fragment prog.
-//
-// ORs into r the bit of every character in 0..7F that can start a match
-// of prog; trymatch() will allow through anything over maxc.
-// Return 1 if success, 0 if we can't build a filter or
-// if there is no point to one.
-
-    int starrchars(Range r, const(ubyte)[] prog)
-    {   rchar c;
-        uint maxc;
-        size_t maxb;
-        size_t len;
-        uint b;
-        uint n;
-        uint m;
-        const(ubyte)* pop;
-
-        //printf("RegExp.starrchars(prog = %p, progend = %p)\n", prog, progend);
-        for (size_t i = 0; i < prog.length;)
-        {
-            switch (prog[i])
-            {
-                // Single literal character: the operand byte follows the opcode.
-                case REchar:
-                    c = prog[i + 1];
-                    if (c <= 0x7F)
-                        r.setbit2(c);
-                    return 1;
-
-                // Case-insensitive literal: admit the stored character and
-                // its lower-case form.
-                case REichar:
-                    c = prog[i + 1];
-                    if (c <= 0x7F)
-                    {   r.setbit2(c);
-                        r.setbit2(std.ascii.toLower(cast(rchar)c));
-                    }
-                    return 1;
-
-                // No bits are set for these opcodes, yet the filter is
-                // still reported as usable.
-                case REdchar:
-                case REidchar:
-                    return 1;
-
-                case REanychar:
-                    return 0;           // no point
-
-                // String operand: a size_t length, then the characters.
-                // Only the first character matters for the filter.
-                case REstring:
-                    len = *cast(size_t *)&prog[i + 1];
-                    assert(len);
-                    c = *cast(rchar *)&prog[i + 1 + size_t.sizeof];
-                    debug(regexp) printf("\tREstring %d, '%c'\n", len, c);
-                    if (c <= 0x7F)
-                        r.setbit2(c);
-                    return 1;
-
-                // Case-insensitive string: admit both cases of its first character.
-                case REistring:
-                    len = *cast(size_t *)&prog[i + 1];
-                    assert(len);
-                    c = *cast(rchar *)&prog[i + 1 + size_t.sizeof];
-                    debug(regexp) printf("\tREistring %d, '%c'\n", len, c);
-                    if (c <= 0x7F)
-                    {   r.setbit2(std.ascii.toUpper(cast(rchar)c));
-                        r.setbit2(std.ascii.toLower(cast(rchar)c));
-                    }
-                    return 1;
-
-                // Bit-set operand: two ushorts (maxc, maxb) followed by the
-                // bitmap bytes, which are OR'ed into r's bitmap.
-                case REtestbit:
-                case REbit:
-                    maxc = (cast(ushort *)&prog[i + 1])[0];
-                    maxb = (cast(ushort *)&prog[i + 1])[1];
-                    if (maxc <= 0x7F)
-                        r.setbitmax(maxc);
-                    else
-                        maxb = r.maxb;  // copy no more bytes than r reports
-                    for (b = 0; b < maxb; b++)
-                        r.base[b] |= prog[i + 1 + 4 + b];
-                    return 1;
-
-                // Negated bit-set: same layout, the complement is OR'ed in.
-                case REnotbit:
-                    maxc = (cast(ushort *)&prog[i + 1])[0];
-                    maxb = (cast(ushort *)&prog[i + 1])[1];
-                    if (maxc <= 0x7F)
-                        r.setbitmax(maxc);
-                    else
-                        maxb = r.maxb;
-                    for (b = 0; b < maxb; b++)
-                        r.base[b] |= ~prog[i + 1 + 4 + b];
-                    return 1;
-
-                case REbol:
-                case REeol:
-                    return 0;
-
-                // Alternation: the second branch starts len bytes after the
-                // operand; both branches must produce a filter.
-                case REor:
-                    len = (cast(uint *)&prog[i + 1])[0];
-                    return starrchars(r, prog[i + 1 + uint.sizeof .. prog.length]) &&
-                           starrchars(r, prog[i + 1 + uint.sizeof + len .. prog.length]);
-
-                // Jump: resume scanning len bytes past the operand.
-                case REgoto:
-                    len = (cast(uint *)&prog[i + 1])[0];
-                    i += 1 + uint.sizeof + len;
-                    break;
-
-                case REanystar:
-                    return 0;
-
-                case REnm:
-                case REnmq:
-                    // len, n, m, ()
-                    len = (cast(uint *)&prog[i + 1])[0];
-                    n   = (cast(uint *)&prog[i + 1])[1];
-                    m   = (cast(uint *)&prog[i + 1])[2];
-                    pop = &prog[i + 1 + uint.sizeof * 3];
-                    if (!starrchars(r, pop[0 .. len]))
-                        return 0;
-                    // n is presumably the minimum repeat count: when it is
-                    // non-zero the sub-pattern's first characters suffice.
-                    if (n)
-                        return 1;
-                    // Otherwise also collect the first characters of what
-                    // follows the repeated sub-pattern.
-                    i += 1 + uint.sizeof * 3 + len;
-                    break;
-
-                case REparen:
-                    // len, ()
-                    len = (cast(uint *)&prog[i + 1])[0];
-                    n   = (cast(uint *)&prog[i + 1])[1];
-                    pop = &prog[0] + i + 1 + uint.sizeof * 2;
-                    return starrchars(r, pop[0 .. len]);
-
-                case REend:
-                    return 0;
-
-                case REwordboundary:
-                case REnotwordboundary:
-                    return 0;
-
-                case REdigit:
-                    r.setbitmax('9');
-                    for (c = '0'; c <= '9'; c++)
-                        r.bits[c] = 1;
-                    return 1;
-
-                case REnotdigit:
-                    r.setbitmax(0x7F);
-                    // Everything below '0' (exclusive: '0' itself is a digit
-                    // and must not be admitted) ...
-                    for (c = 0; c < '0'; c++)
-                        r.bits[c] = 1;
-                    // ... and everything above '9'.
-                    for (c = '9' + 1; c <= r.maxc; c++)
-                        r.bits[c] = 1;
-                    return 1;
-
-                case REspace:
-                    r.setbitmax(0x7F);
-                    for (c = 0; c <= r.maxc; c++)
-                        if (isWhite(c))
-                            r.bits[c] = 1;
-                    return 1;
-
-                case REnotspace:
-                    r.setbitmax(0x7F);
-                    for (c = 0; c <= r.maxc; c++)
-                        if (!isWhite(c))
-                            r.bits[c] = 1;
-                    return 1;
-
-                case REword:
-                    r.setbitmax(0x7F);
-                    for (c = 0; c <= r.maxc; c++)
-                        if (isword(cast(rchar)c))
-                            r.bits[c] = 1;
-                    return 1;
-
-                case REnotword:
-                    r.setbitmax(0x7F);
-                    for (c = 0; c <= r.maxc; c++)
-                        if (!isword(cast(rchar)c))
-                            r.bits[c] = 1;
-                    return 1;
-
-                case REbackref:
-                    return 0;
-
-                default:
-                    assert(0);
-            }
-        }
-        return 1;
-    }
-
-/* ==================== replace ======================= */
-
-/***********************
- * Builds the replacement text for a match previously found with test().
- * The format string is expanded against this object's input and its
- * recorded match offsets, and the result is returned as a new string.
- */
-
-    public string replace(string format)
-    {
-        // Slot 0 is the whole match; slots 1 .. re_nsub are the sub-matches.
-        auto groups = pmatch[0 .. re_nsub + 1];
-        return replace3(format, input, groups);
-    }
-
-// Static version that doesn't require a RegExp object to be created:
-// expands format against input using the match offsets in pmatch.
-//
-// Sequences recognized in format:
-//   $&        the whole match (pmatch[0])
-//   $`        the part of input before the match
-//   $'        the part of input after the match
-//   $n, $nn   sub-match n / nn; an index >= pmatch.length adds nothing
-//   $0, $00   copied literally
-// Any other "$x", and a "$" at the very end, are copied literally.
-
-    public static string replace3(string format, string input, regmatch_t[] pmatch)
-    {
-        string result;
-        ptrdiff_t rm_so, rm_eo, i;
-
-        result.length = format.length;
-        result.length = 0;
-        for (size_t f = 0; f < format.length; f++)
-        {
-            char c = format[f];
-            if (c != '$')
-            {
-                result ~= c;
-                continue;
-            }
-            ++f;
-            if (f == format.length)
-            {
-                result ~= '$';
-                break;
-            }
-            c = format[f];
-            switch (c)
-            {
-                case '&':
-                    rm_so = pmatch[0].rm_so;
-                    rm_eo = pmatch[0].rm_eo;
-                    goto Lstring;
-
-                case '`':
-                    rm_so = 0;
-                    rm_eo = pmatch[0].rm_so;
-                    goto Lstring;
-
-                case '\'':
-                    rm_so = pmatch[0].rm_eo;
-                    rm_eo = input.length;
-                    goto Lstring;
-
-                case '0': case '1': case '2': case '3': case '4':
-                case '5': case '6': case '7': case '8': case '9':
-                    i = c - '0';
-                    if (f + 1 == format.length)
-                    {
-                        if (i == 0)
-                        {
-                            result ~= '$';
-                            result ~= c;
-                            continue;
-                        }
-                    }
-                    else
-                    {
-                        immutable char c2 = format[f + 1];
-                        immutable bool twoDigits = c2 >= '0' && c2 <= '9';
-                        if (twoDigits)
-                        {
-                            i = (c - '0') * 10 + (c2 - '0');
-                            f++;        // f now indexes the second digit
-                        }
-                        if (i == 0)
-                        {
-                            // "$0" / "$00" are literal text.  With a second
-                            // digit it has been consumed, so emit it here.
-                            // Otherwise f still indexes the '0' and the loop
-                            // increment moves on to the following character,
-                            // which is then handled exactly once (it used to
-                            // be processed twice, e.g. "$0x" gave "$0xx").
-                            result ~= '$';
-                            result ~= c;
-                            if (twoDigits)
-                                result ~= c2;
-                            continue;
-                        }
-                    }
-
-                    if (i < pmatch.length)
-                    {   rm_so = pmatch[i].rm_so;
-                        rm_eo = pmatch[i].rm_eo;
-                        goto Lstring;
-                    }
-                    break;
-
-                Lstring:
-                    // Append input[rm_so .. rm_eo] unless it is empty.
-                    if (rm_so != rm_eo)
-                        result ~= input[rm_so .. rm_eo];
-                    break;
-
-                default:
-                    result ~= '$';
-                    result ~= c;
-                    break;
-            }
-        }
-        return result;
-    }
-
-/************************************
- * Like replace(char[] format), but uses old style formatting:
- *
- *   Format   Description
- *   &        replace with the match
- *   \n       replace with the nth parenthesized match, n is 1..9
- *   \c       replace with char c.
- *
- * A \n whose n exceeds re_nsub, or whose sub-match is empty, adds nothing.
- * A backslash that is the last character of format is copied as is.
- */
-
-    public string replaceOld(string format)
-    {
-        string output;
-
-        output.length = format.length;
-        output.length = 0;
-
-        size_t pos = 0;
-        while (pos < format.length)
-        {
-            char ch = format[pos];
-
-            if (ch == '&')
-            {
-                // The whole match.
-                output ~= input[pmatch[0].rm_so .. pmatch[0].rm_eo];
-            }
-            else if (ch == '\\' && pos + 1 < format.length)
-            {
-                // Escape: look at the character after the backslash.
-                ch = format[++pos];
-                if (ch >= '1' && ch <= '9')
-                {
-                    uint group = ch - '0';
-                    if (group <= re_nsub && pmatch[group].rm_so != pmatch[group].rm_eo)
-                        output ~= input[pmatch[group].rm_so .. pmatch[group].rm_eo];
-                }
-                else
-                {
-                    output ~= ch;
-                }
-            }
-            else
-            {
-                // Ordinary character, or a trailing backslash.
-                output ~= ch;
-            }
-            pos++;
-        }
-        return output;
-    }
-
-}
-
-unittest
-{   // Created and placed in public domain by Don Clugston
-
-    // Case-insensitive search; checks the prefix and the whole match.
-    auto first = search("aBC r s", `bc\x20r[\40]s`, "i");
-    assert(first.pre == "a");
-    assert(first[0] == "BC r s");
-
-    // Counted group followed by a back-reference to it.
-    auto backref = search("7xxyxxx", `^\d([a-z]{2})\D\1`);
-    assert(backref[0] == "7xxyxx");
-
-    // Just check the parsing.
-    auto parsed1 = search("dcbxx", `ca|b[\d\]\D\s\S\w-\W]`);
-    auto parsed2 = search("xy", `[^\ca-\xFa\r\n\b\f\t\v\0123]{2,485}$`);
-    auto parsed3 = search("xxx", `^^\r\n\b{13,}\f{4}\t\v\u02aF3a\w\W`);
-
-    auto greedy = search("xxy", `.*y`);
-    assert(greedy[0] == "xxy");
-
-    auto repeatedAlt = search("QWDEfGH", "(ca|b|defg)+", "i");
-    assert(repeatedAlt[0] == "DEfG");
-
-    // More patterns that are only run, with no assertion on the result.
-    auto parsed4 = search("dcbxx", `a?\B\s\S`);
-    auto parsed5 = search("dcbxx", `[-w]`);
-    auto parsed6 = search("dcbsfd", `aB[c-fW]dB|\d|\D|\u012356|\w|\W|\s|\S`, "i");
-    auto parsed7 = search("dcbsfd", `[]a-]`);
-
-    // Exercise both replacement syntaxes on the first result.
-    first.replaceOld(`a&b\1c`);
-    first.replace(`a$&b$'$1c`);
-}
-
-// Andrei
-//------------------------------------------------------------------------------
-
-/// Wrapper that tags a string of Char as a regular expression pattern.
-struct Pattern(Char)
-{
-    /// The pattern text, stored exactly as given.
-    immutable(Char)[] pattern;
-
-    /// Stores pattern; nothing else is done with it here.
-    this(immutable(Char)[] pattern) { this.pattern = pattern; }
-}
-
-/// Convenience factory: wraps pat in a Pattern, deducing Char from the argument.
-Pattern!(Char) pattern(Char)(immutable(Char)[] pat)
-{
-    return Pattern!(Char)(pat);
-}
-
-/// Range that yields the pieces of an input lying between matches of a
-/// separator regular expression.
-struct Splitter(Range)
-{
-    Range _input;           // text not yet consumed
-    size_t _chunkLength;    // length of the current front piece
-    RegExp _rx;             // separator expression
-
-    // Returns the tail of _input beginning at the next separator match,
-    // or an empty tail when find() reports a negative index.
-    private Range search()
-    {
-        //rx = std.regexp.search(_input, "(" ~ _separator.pattern ~ ")");
-        auto at = std.regexp.find(cast(string) _input, _rx);
-        if (at >= 0)
-            return _input[at .. _input.length];
-        return _input[_input.length .. _input.length];
-    }
-
-    // Extends the current chunk over the separator that was just matched.
-    // NOTE(review): a zero-length match would add nothing here; the check
-    // below is disabled in the original -- confirm callers never pass a
-    // pattern that can match the empty string.
-    private void advance()
-    {
-        //writeln("(" ~ _separator.pattern ~ ")");
-        //writeln(_input);
-        //assert(_rx[0].length > 0);
-        _chunkLength += _rx[0].length;
-    }
-
-    // Compiles the separator and measures the first piece.
-    this(Range input, Pattern!(char) separator)
-    {
-        _input = input;
-        _rx = RegExp(separator.pattern);
-        _chunkLength = _input.length - search().length;
-    }
-
-    ref auto opSlice()
-    {
-        return this;
-    }
-
-    // The current piece: everything before the next separator.
-    @property Range front()
-    {
-        return _input[0 .. _chunkLength];
-    }
-
-    @property bool empty()
-    {
-        return _input.empty;
-    }
-
-    // Drops the current piece. When the piece is not the rest of the
-    // input, the separator after it is dropped too and the next piece is
-    // measured.
-    void popFront()
-    {
-        immutable lastPiece = (_chunkLength == _input.length);
-        if (!lastPiece)
-            advance();
-        _input = _input[_chunkLength .. _input.length];
-        if (!lastPiece)
-            _chunkLength = _input.length - search().length;
-    }
-}
-
-/// Creates a Splitter over r using the separator pattern pat.
-/// Only ranges whose elements are (possibly qualified) char are accepted;
-/// for any other element type compilation fails with that type's name.
-Splitter!(Range) splitter(Range)(Range r, Pattern!(char) pat)
-{
-    alias Unqual!(typeof(Range.init[0])) Elem;
-    static assert(is(Elem == char), Elem.stringof);
-    return Splitter!(Range)(cast(string) r, pat);
-}
-
-unittest
-{
-    // Split on a comma followed by any number of spaces. The text begins
-    // with a separator, so the first piece is empty.
-    auto text = ", abc, de, fg, hi, ";
-    auto pieces = splitter(text, pattern(", *"));
-    //foreach (e; pieces) writeln("[", e, "]");
-    auto expected = ["", "abc", "de", "fg", "hi"];
-    assert(equal(pieces, expected[]));
-}
-
-unittest
-{
-    // Every pattern below contains an alternation with an empty branch
-    // and is asserted to match "foo".
-    auto subject = "foo";
-
-    // Compiles source and asserts that it matches subject.
-    void mustMatch(string source)
-    {
-        auto rx = new RegExp(source);
-        auto matched = cast(bool) rx.test(subject);
-        assert(matched);
-        //writefln("'%s' matches '%s' ? %s", subject, source, matched);
-    }
-
-    string[] sources = [
-        r"^(h|a|)fo[oas]$",
-        r"^(a|b|)fo[oas]$",
-        r"^(a|)foo$",
-        r"(a|)foo",
-        r"^(h|)foo$",
-        r"(h|)foo",
-        r"(h|a|)fo[oas]",
-        r"^(a|b|)fo[o]$",
-        r"[abf][ops](o|oo|)(h|a|)",
-        r"(h|)[abf][ops](o|oo|)",
-        r"(c|)[abf][ops](o|oo|)"
-    ];
-
-    foreach (source; sources)
-        mustMatch(source);
-
-    // The same shape with every lower-case letter as the non-empty branch.
-    for (char letter = 'a'; letter <= 'z'; ++letter)
-    {
-        auto source = "(" ~ letter ~ "|)foo";
-        mustMatch(source);
-    }
-}
diff --git a/unittest.d b/unittest.d
index 6953cc728..6da374fcc 100644
--- a/unittest.d
+++ b/unittest.d
@@ -39,7 +39,7 @@ public import std.path;
public import std.perf;
public import std.process;
public import std.random;
-public import std.regexp;
+public import std.regex;
public import std.signals;
//public import std.slist;
public import std.socket;
@@ -82,7 +82,7 @@ else
std.conv.to!double("1.0"); // std.conv
OutBuffer b = new OutBuffer(); // outbuffer
std.ctype.tolower('A'); // ctype
- RegExp r = new RegExp(null, null); // regexp
+ auto r = regex(""); // regex
uint ranseed = std.random.unpredictableSeed;
thisTid;
int a[];
diff --git a/win32.mak b/win32.mak
index 45d95c69e..b18a77179 100644
--- a/win32.mak
+++ b/win32.mak
@@ -117,7 +117,7 @@ SRC_STD_3= std\csv.d std\math.d std\complex.d std\numeric.d std\bigint.d \
SRC_STD_3a= std\signals.d std\typetuple.d std\traits.d \
std\encoding.d std\xml.d \
- std\random.d std\regexp.d \
+ std\random.d \
std\exception.d \
std\compiler.d std\cpuid.d \
std\system.d std\concurrency.d
@@ -156,7 +156,7 @@ SRC_STD= std\zlib.d std\zip.d std\stdint.d std\container.d std\conv.d std\utf.d
std\outbuffer.d std\md5.d std\base64.d \
std\mmfile.d \
std\syserror.d \
- std\regexp.d std\random.d std\stream.d std\process.d \
+ std\random.d std\stream.d std\process.d \
std\socket.d std\socketstream.d std\format.d \
std\stdio.d std\perf.d std\uni.d std\uuid.d \
std\cstream.d std\demangle.d \
@@ -319,7 +319,6 @@ DOCS= $(DOC)\object.html \
$(DOC)\std_random.html \
$(DOC)\std_range.html \
$(DOC)\std_regex.html \
- $(DOC)\std_regexp.html \
$(DOC)\std_signals.html \
$(DOC)\std_socket.html \
$(DOC)\std_socketstream.html \
@@ -580,9 +579,6 @@ $(DOC)\std_range.html : $(STDDOC) std\range.d
$(DOC)\std_regex.html : $(STDDOC) std\regex.d
$(DMD) -c -o- $(DDOCFLAGS) -Df$(DOC)\std_regex.html $(STDDOC) std\regex.d
-$(DOC)\std_regexp.html : $(STDDOC) std\regexp.d
- $(DMD) -c -o- $(DDOCFLAGS) -Df$(DOC)\std_regexp.html $(STDDOC) std\regexp.d
-
$(DOC)\std_signals.html : $(STDDOC) std\signals.d
$(DMD) -c -o- $(DDOCFLAGS) -Df$(DOC)\std_signals.html $(STDDOC) std\signals.d
diff --git a/win64.mak b/win64.mak
index 69c16f38b..494e003c7 100644
--- a/win64.mak
+++ b/win64.mak
@@ -120,7 +120,7 @@ SRC_STD_3a= std\uni.d std\base64.d std\md5.d std\ctype.d std\ascii.d \
SRC_STD_3b= std\signals.d std\typetuple.d std\traits.d \
std\encoding.d std\xml.d \
- std\random.d std\regexp.d \
+ std\random.d \
std\exception.d \
std\compiler.d std\cpuid.d \
std\system.d std\concurrency.d
@@ -178,7 +178,7 @@ SRC_STD= std\zlib.d std\zip.d std\stdint.d std\container.d std\conv.d std\utf.d
std\outbuffer.d std\md5.d std\base64.d \
std\mmfile.d \
std\syserror.d \
- std\regexp.d std\random.d std\stream.d std\process.d \
+ std\random.d std\stream.d std\process.d \
std\socket.d std\socketstream.d std\format.d \
std\stdio.d std\perf.d std\uni.d std\uuid.d \
std\cstream.d std\demangle.d \
@@ -341,7 +341,6 @@ DOCS= $(DOC)\object.html \
$(DOC)\std_random.html \
$(DOC)\std_range.html \
$(DOC)\std_regex.html \
- $(DOC)\std_regexp.html \
$(DOC)\std_signals.html \
$(DOC)\std_socket.html \
$(DOC)\std_socketstream.html \
@@ -633,9 +632,6 @@ $(DOC)\std_range.html : $(STDDOC) std\range.d
$(DOC)\std_regex.html : $(STDDOC) std\regex.d
$(DMD) -c -o- $(DDOCFLAGS) -Df$(DOC)\std_regex.html $(STDDOC) std\regex.d
-$(DOC)\std_regexp.html : $(STDDOC) std\regexp.d
- $(DMD) -c -o- $(DDOCFLAGS) -Df$(DOC)\std_regexp.html $(STDDOC) std\regexp.d
-
$(DOC)\std_signals.html : $(STDDOC) std\signals.d
$(DMD) -c -o- $(DDOCFLAGS) -Df$(DOC)\std_signals.html $(STDDOC) std\signals.d