phobos/std/regexp.d
jmdavis 85ee64d968 Changed deprecation not on std.regexp.
One of the overloads of std.file.listDir still uses std.regexp and
doesn't have a replacement yet. It's scheduled for deprecation with no
date (since it has no proper replacement yet). So, we can't remove
std.regexp in March as scheduled. We'll remove it after the std.file
issue has been resolved.
2012-03-10 19:09:35 -08:00

3433 lines
98 KiB
D

// Written in the D programming language.
// Regular Expressions.
/**
* $(RED Deprecated.
* Please use $(LINK2 std_regex.html, std.regex) instead.)
*
* $(LINK2 http://www.digitalmars.com/ctg/regular.html, Regular
* expressions) are a powerful method of string pattern matching. The
* regular expression language used in this library is the same as
* that commonly used, however, some of the very advanced forms may
* behave slightly differently. The standard observed is the $(WEB
* www.ecma-international.org/publications/standards/Ecma-262.htm,
* ECMA standard) for regular expressions.
*
* std.regexp is designed to work only with valid UTF strings as input.
* To validate untrusted input, use std.utf.validate().
*
* In the following guide, $(I pattern)[] refers to a
* $(LINK2 http://www.digitalmars.com/ctg/regular.html, regular expression).
* The $(I attributes)[] refers to
* a string controlling the interpretation
* of the regular expression.
* It consists of a sequence of one or more
* of the following characters:
*
* <table border=1 cellspacing=0 cellpadding=5>
* <caption>Attribute Characters</caption>
* $(TR $(TH Attribute) $(TH Action))
* <tr>
* $(TD $(B g))
* $(TD global; repeat over the whole input string)
* </tr>
* <tr>
* $(TD $(B i))
* $(TD case insensitive)
* </tr>
* <tr>
* $(TD $(B m))
* $(TD treat as multiple lines separated by newlines)
* </tr>
* </table>
*
* The $(I format)[] string has the formatting characters:
*
* <table border=1 cellspacing=0 cellpadding=5>
* <caption>Formatting Characters</caption>
* $(TR $(TH Format) $(TH Replaced With))
* $(TR
* $(TD $(B $$)) $(TD $)
* )
* $(TR
* $(TD $(B $&amp;)) $(TD The matched substring.)
* )
* $(TR
* $(TD $(B $`)) $(TD The portion of string that precedes the matched substring.)
* )
* $(TR
* $(TD $(B $')) $(TD The portion of string that follows the matched substring.)
* )
* $(TR
* $(TD $(B $(DOLLAR))$(I n)) $(TD The $(I n)th capture, where $(I n)
* is a single digit 1-9
* and $$(I n) is not followed by a decimal digit.)
* )
* $(TR
* $(TD $(B $(DOLLAR))$(I nn)) $(TD The $(I nn)th capture, where $(I nn)
* is a two-digit decimal
* number 01-99.
* If $(I nn)th capture is undefined or more than the number
* of parenthesized subexpressions, use the empty
* string instead.)
* )
* </table>
*
* Any other $ are left as is.
*
* References:
* $(LINK2 http://en.wikipedia.org/wiki/Regular_expressions, Wikipedia)
* Macros:
* WIKI = StdRegexp
* DOLLAR = $
*
* Copyright: Copyright Digital Mars 2000 - 2011.
* License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
* Authors: $(WEB digitalmars.com, Walter Bright)
* Source: $(PHOBOSSRC std/_regexp.d)
*/
/* Copyright Digital Mars 2000 - 2011.
* Distributed under the Boost Software License, Version 1.0.
* (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
/*
Escape sequences:
\nnn starts out a 1, 2 or 3 digit octal sequence,
where n is an octal digit. If nnn is larger than
0377, then the 3rd digit is not part of the sequence
and is not consumed.
For maximal portability, use exactly 3 digits.
\xXX starts out a 1 or 2 digit hex sequence. X
is a hex character. If the first character after the \x
is not a hex character, the value of the sequence is 'x'
and the XX are not consumed.
For maximal portability, use exactly 2 digits.
\uUUUU is a unicode sequence. There are exactly
4 hex characters after the \u, if any are not, then
the value of the sequence is 'u', and the UUUU are not
consumed.
Character classes:
[a-b], where a is greater than b, will produce
an error.
References:
http://www.unicode.org/unicode/reports/tr18/
*/
module std.regexp;
pragma(msg, "Notice: As of Phobos 2.055, std.regexp has been deprecated. " ~
"Please use std.regex instead.");
//debug = regexp; // uncomment to turn on debugging printf's
private
{
import core.stdc.stdio;
import core.stdc.stdlib;
import core.stdc.string;
import std.array;
import std.stdio;
import std.string;
import std.ascii;
import std.outbuffer;
import std.bitmanip;
import std.utf;
import std.algorithm;
import std.array;
import std.traits;
}
deprecated:
/** Regular expression to extract an _email address.
* References:
* $(LINK2 http://www.regular-expressions.info/email.html, How to Find or Validate an Email Address)$(BR)
* $(LINK2 http://tools.ietf.org/html/rfc2822#section-3.4.1, RFC 2822 Internet Message Format)
*/
string email =
r"[a-zA-Z]([.]?([[a-zA-Z0-9_]-]+)*)?@([[a-zA-Z0-9_]\-_]+\.)+[a-zA-Z]{2,6}";
/** Regular expression to extract a _url */
string url = r"(([h|H][t|T]|[f|F])[t|T][p|P]([s|S]?)\:\/\/|~/|/)?([\w]+:\w+@)?(([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?)?((/?\w+/)+|/?)(\w+\.[\w]{3,4})?([,]\w+)*((\?\w+=\w+)?(&\w+=\w+)*([,]\w*)*)?";
/************************************
* One of these gets thrown on compilation errors
*/
class RegExpException : Exception
{
this(string msg)
{
super(msg);
}
}
struct regmatch_t
{
sizediff_t rm_so; // index of start of match
sizediff_t rm_eo; // index past end of match
}
private alias char rchar; // so we can make a wchar version
/******************************************************
* Search string for matches with regular expression
* pattern with attributes.
* Replace each match with string generated from format.
* Params:
* s = String to search.
* pattern = Regular expression pattern.
* format = Replacement string format.
* attributes = Regular expression attributes.
* Returns:
* the resulting string
* Example:
* Replace the letters 'a' with the letters 'ZZ'.
* ---
* s = "Strap a rocket engine on a chicken."
* sub(s, "a", "ZZ") // result: StrZZp a rocket engine on a chicken.
* sub(s, "a", "ZZ", "g") // result: StrZZp ZZ rocket engine on ZZ chicken.
* ---
* The replacement format can reference the matches using
* the $&amp;, $$, $', $`, $0 .. $99 notation:
* ---
* sub(s, "[ar]", "[$&]", "g") // result: St[r][a]p [a] [r]ocket engine on [a] chi
* ---
*/
string sub(string s, string pattern, string format, string attributes = null)
{
auto r = new RegExp(pattern, attributes);
auto result = r.replace(s, format);
delete r;
return result;
}
unittest
{
debug(regexp) printf("regexp.sub.unittest\n");
string r = sub("hello", "ll", "ss");
assert(r == "hesso");
}
/*******************************************************
* Search string for matches with regular expression
* pattern with attributes.
* Pass each match to delegate dg.
* Replace each match with the return value from dg.
* Params:
* s = String to search.
* pattern = Regular expression pattern.
* dg = Delegate
* attributes = Regular expression attributes.
* Returns: the resulting string.
* Example:
* Capitalize the letters 'a' and 'r':
* ---
* s = "Strap a rocket engine on a chicken.";
* sub(s, "[ar]",
* delegate char[] (RegExp m)
* {
* return toUpper(m[0]);
* },
* "g"); // result: StRAp A Rocket engine on A chicken.
* ---
*/
string sub(string s, string pattern, string delegate(RegExp) dg, string attributes = null)
{
auto r = new RegExp(pattern, attributes);
string result = s;
size_t lastindex = 0;
size_t offset = 0;
while (r.test(s, lastindex))
{
auto so = r.pmatch[0].rm_so;
auto eo = r.pmatch[0].rm_eo;
string replacement = dg(r);
// Optimize by using std.string.replace if possible - Dave Fladebo
string slice = result[offset + so .. offset + eo];
if (r.attributes & RegExp.REA.global && // global, so replace all
!(r.attributes & RegExp.REA.ignoreCase) && // not ignoring case
!(r.attributes & RegExp.REA.multiline) && // not multiline
pattern == slice) // simple pattern (exact match, no special characters)
{
debug(regexp)
printf("result: %.*s, pattern: %.*s, slice: %.*s, replacement: %.*s\n",
result.length, result.ptr,
pattern.length, pattern.ptr,
slice.length, slice.ptr,
replacement.length, replacement.ptr);
result = replace(result,slice,replacement);
break;
}
result = replaceSlice(result, result[offset + so .. offset + eo], replacement);
if (r.attributes & RegExp.REA.global)
{
offset += replacement.length - (eo - so);
if (lastindex == eo)
lastindex++; // always consume some source
else
lastindex = eo;
}
else
break;
}
delete r;
return result;
}
unittest
{
debug(regexp) printf("regexp.sub.unittest\n");
string foo(RegExp r) { return "ss"; }
auto r = sub("hello", "ll", delegate string(RegExp r) { return "ss"; });
assert(r == "hesso");
r = sub("hello", "l", delegate string(RegExp r) { return "l"; }, "g");
assert(r == "hello");
auto s = sub("Strap a rocket engine on a chicken.",
"[ar]",
delegate string (RegExp m)
{
return std.string.toUpper(m[0]);
},
"g");
assert(s == "StRAp A Rocket engine on A chicken.");
}
/*************************************************
* Search $(D_PARAM s[]) for first match with $(D_PARAM pattern).
* Params:
* s = String to search.
* pattern = Regular expression pattern.
* Returns:
* index into s[] of match if found, -1 if no match.
* Example:
* ---
* auto s = "abcabcabab";
* find(s, RegExp("b")); // match, returns 1
* find(s, RegExp("f")); // no match, returns -1
* ---
*/
sizediff_t find(string s, RegExp pattern)
{
return pattern.test(s)
? pattern.pmatch[0].rm_so
: -1;
}
unittest
{
debug(regexp) printf("regexp.find.unittest\n");
auto i = find("xabcy", RegExp("abc"));
assert(i == 1);
i = find("cba", RegExp("abc"));
assert(i == -1);
}
/**
Returns:
Same as $(D_PARAM find(s, RegExp(pattern, attributes))).
WARNING:
This function is scheduled for deprecation due to unnecessary
ambiguity with the homonym function in std.string. Instead of
$(D_PARAM std.regexp.find(s, p, a)), you may want to use $(D_PARAM
find(s, RegExp(p, a))).
*/
sizediff_t
find(string s, string pattern, string attributes = null)
{
auto r = new RegExp(pattern, attributes);
scope(exit) delete r;
return r.test(s) ? r.pmatch[0].rm_so : -1;
}
unittest
{
debug(regexp) printf("regexp.find.unittest\n");
auto i = find("xabcy", "abc");
assert(i == 1);
i = find("cba", "abc");
assert(i == -1);
}
/*************************************************
* Search $(D_PARAM s[]) for last match with $(D_PARAM pattern).
* Params:
* s = String to search.
* pattern = Regular expression pattern.
* Returns:
* index into s[] of match if found, -1 if no match.
* Example:
* ---
* auto s = "abcabcabab";
* rfind(s, RegExp("b")); // match, returns 9
* rfind(s, RegExp("f")); // no match, returns -1
* ---
*/
sizediff_t rfind(string s, RegExp pattern)
{
sizediff_t i = -1, lastindex = 0;
while (pattern.test(s, lastindex))
{
auto eo = pattern.pmatch[0].rm_eo;
i = pattern.pmatch[0].rm_so;
if (lastindex == eo)
lastindex++; // always consume some source
else
lastindex = eo;
}
return i;
}
unittest
{
sizediff_t i;
debug(regexp) printf("regexp.rfind.unittest\n");
i = rfind("abcdefcdef", RegExp("c"));
assert(i == 6);
i = rfind("abcdefcdef", RegExp("cd"));
assert(i == 6);
i = rfind("abcdefcdef", RegExp("x"));
assert(i == -1);
i = rfind("abcdefcdef", RegExp("xy"));
assert(i == -1);
i = rfind("abcdefcdef", RegExp(""));
assert(i == 10);
}
/*************************************************
Returns:
Same as $(D_PARAM rfind(s, RegExp(pattern, attributes))).
WARNING:
This function is scheduled for deprecation due to unnecessary
ambiguity with the homonym function in std.string. Instead of
$(D_PARAM std.regexp.rfind(s, p, a)), you may want to use $(D_PARAM
rfind(s, RegExp(p, a))).
*/
sizediff_t
rfind(string s, string pattern, string attributes = null)
{
typeof(return) i = -1, lastindex = 0;
auto r = new RegExp(pattern, attributes);
while (r.test(s, lastindex))
{
auto eo = r.pmatch[0].rm_eo;
i = r.pmatch[0].rm_so;
if (lastindex == eo)
lastindex++; // always consume some source
else
lastindex = eo;
}
delete r;
return i;
}
unittest
{
sizediff_t i;
debug(regexp) printf("regexp.rfind.unittest\n");
i = rfind("abcdefcdef", "c");
assert(i == 6);
i = rfind("abcdefcdef", "cd");
assert(i == 6);
i = rfind("abcdefcdef", "x");
assert(i == -1);
i = rfind("abcdefcdef", "xy");
assert(i == -1);
i = rfind("abcdefcdef", "");
assert(i == 10);
}
/********************************************
* Split s[] into an array of strings, using the regular
* expression $(D_PARAM pattern) as the separator.
* Params:
* s = String to search.
* pattern = Regular expression pattern.
* Returns:
* array of slices into s[]
* Example:
* ---
* foreach (s; split("abcabcabab", RegExp("C.", "i")))
* {
* writefln("s = '%s'", s);
* }
* // Prints:
* // s = 'ab'
* // s = 'b'
* // s = 'bab'
* ---
*/
string[] split(string s, RegExp pattern)
{
return pattern.split(s);
}
unittest
{
debug(regexp) printf("regexp.split.unittest()\n");
string[] result;
result = split("ab", RegExp("a*"));
assert(result.length == 2);
assert(result[0] == "");
assert(result[1] == "b");
foreach (i, s; split("abcabcabab", RegExp("C.", "i")))
{
//writefln("s[%d] = '%s'", i, s);
if (i == 0) assert(s == "ab");
else if (i == 1) assert(s == "b");
else if (i == 2) assert(s == "bab");
else assert(0);
}
}
/********************************************
Returns:
Same as $(D_PARAM split(s, RegExp(pattern, attributes))).
WARNING:
This function is scheduled for deprecation due to unnecessary
ambiguity with the homonym function in std.string. Instead of
$(D_PARAM std.regexp.split(s, p, a)), you may want to use $(D_PARAM
split(s, RegExp(p, a))).
*/
string[] split(string s, string pattern, string attributes = null)
{
auto r = new RegExp(pattern, attributes);
auto result = r.split(s);
delete r;
return result;
}
unittest
{
debug(regexp) printf("regexp.split.unittest()\n");
string[] result;
result = split("ab", "a*");
assert(result.length == 2);
assert(result[0] == "");
assert(result[1] == "b");
foreach (i, s; split("abcabcabab", "C.", "i"))
{
//writefln("s[%d] = '%s'", i, s.length, s.ptr);
if (i == 0) assert(s == "ab");
else if (i == 1) assert(s == "b");
else if (i == 2) assert(s == "bab");
else assert(0);
}
}
/****************************************************
* Search s[] for first match with pattern[] with attributes[].
* Params:
* s = String to search.
* pattern = Regular expression pattern.
* attributes = Regular expression attributes.
* Returns:
* corresponding RegExp if found, null if not.
* Example:
* ---
* import std.stdio;
* import std.regexp;
*
* void main()
* {
* if (auto m = std.regexp.search("abcdef", "c"))
* {
* writefln("%s[%s]%s", m.pre, m[0], m.post);
* }
* }
* // Prints:
* // ab[c]def
* ---
*/
RegExp search(string s, string pattern, string attributes = null)
{
auto r = new RegExp(pattern, attributes);
if (!r.test(s))
{ delete r;
assert(r is null);
}
return r;
}
unittest
{
debug(regexp) printf("regexp.string.unittest()\n");
if (auto m = std.regexp.search("abcdef", "c()"))
{
auto result = std.string.format("%s[%s]%s", m.pre, m[0], m.post);
assert(result == "ab[c]def");
assert(m[1] == null);
assert(m[2] == null);
}
else
assert(0);
if (auto n = std.regexp.search("abcdef", "g"))
{
assert(0);
}
}
/* ********************************* RegExp ******************************** */
/*****************************
* RegExp is a class to handle regular expressions.
*
* It is the core foundation for adding powerful string pattern matching
* capabilities to programs like grep, text editors, awk, sed, etc.
*/
class RegExp
{
/*****
* Construct a RegExp object. Compile pattern
* with <i>attributes</i> into
* an internal form for fast execution.
* Params:
* pattern = regular expression
* attributes = _attributes
* Throws: RegExpException if there are any compilation errors.
* Example:
* Declare two variables and assign to them a RegExp object:
* ---
* auto r = new RegExp("pattern");
* auto s = new RegExp(r"p[1-5]\s*");
* ---
*/
public this(string pattern, string attributes = null)
{
pmatch = (&gmatch)[0 .. 1];
compile(pattern, attributes);
}
/*****
* Generate instance of RegExp.
* Params:
* pattern = regular expression
* attributes = _attributes
* Throws: RegExpException if there are any compilation errors.
* Example:
* Declare two variables and assign to them a RegExp object:
* ---
* auto r = RegExp("pattern");
* auto s = RegExp(r"p[1-5]\s*");
* ---
*/
public static RegExp opCall(string pattern, string attributes = null)
{
return new RegExp(pattern, attributes);
}
unittest
{
debug(regexp) printf("regexp.opCall.unittest()\n");
auto r1 = RegExp("hello", "m");
string msg;
try
{
auto r2 = RegExp("hello", "q");
assert(0);
}
catch (RegExpException ree)
{
msg = ree.toString();
//writefln("message: %s", ree);
}
assert(std.algorithm.countUntil(msg, "unrecognized attribute") >= 0);
}
/************************************
* Set up for start of foreach loop.
* Returns:
* search() returns instance of RegExp set up to _search string[].
* Example:
* ---
* import std.stdio;
* import std.regexp;
*
* void main()
* {
* foreach(m; RegExp("ab").search("abcabcabab"))
* {
* writefln("%s[%s]%s", m.pre, m[0], m.post);
* }
* }
* // Prints:
* // [ab]cabcabab
* // abc[ab]cabab
* // abcabc[ab]ab
* // abcabcab[ab]
* ---
*/
public RegExp search(string string)
{
input = string;
pmatch[0].rm_eo = 0;
return this;
}
/** ditto */
public int opApply(scope int delegate(ref RegExp) dg)
{
int result;
RegExp r = this;
while (test())
{
result = dg(r);
if (result)
break;
}
return result;
}
unittest
{
debug(regexp) printf("regexp.search.unittest()\n");
int i;
foreach(m; RegExp("ab").search("abcabcabab"))
{
auto s = std.string.format("%s[%s]%s", m.pre, m[0], m.post);
if (i == 0) assert(s == "[ab]cabcabab");
else if (i == 1) assert(s == "abc[ab]cabab");
else if (i == 2) assert(s == "abcabc[ab]ab");
else if (i == 3) assert(s == "abcabcab[ab]");
else assert(0);
i++;
}
}
/******************
* Retrieve match n.
*
* n==0 means the matched substring, n>0 means the
* n'th parenthesized subexpression.
* if n is larger than the number of parenthesized subexpressions,
* null is returned.
*/
public string opIndex(size_t n)
{
if (n >= pmatch.length)
return null;
else
{
auto rm_so = pmatch[n].rm_so;
auto rm_eo = pmatch[n].rm_eo;
if (rm_so == rm_eo)
return null;
return input[rm_so .. rm_eo];
}
}
/**
Same as $(D_PARAM opIndex(n)).
WARNING:
Scheduled for deprecation due to confusion with overloaded
$(D_PARAM match(string)). Instead of $(D_PARAM regex.match(n))
you may want to use $(D_PARAM regex[n]).
*/
public string match(size_t n)
{
return this[n];
}
/*******************
* Return the slice of the input that precedes the matched substring.
*/
public @property string pre()
{
return input[0 .. pmatch[0].rm_so];
}
/*******************
* Return the slice of the input that follows the matched substring.
*/
public @property string post()
{
return input[pmatch[0].rm_eo .. $];
}
uint re_nsub; // number of parenthesized subexpression matches
regmatch_t[] pmatch; // array [re_nsub + 1]
string input; // the string to search
// per instance:
string pattern; // source text of the regular expression
string flags; // source text of the attributes parameter
int errors;
uint attributes;
enum REA
{
global = 1, // has the g attribute
ignoreCase = 2, // has the i attribute
multiline = 4, // if treat as multiple lines separated
// by newlines, or as a single line
dotmatchlf = 8, // if . matches \n
}
private:
size_t src; // current source index in input[]
size_t src_start; // starting index for match in input[]
size_t p; // position of parser in pattern[]
regmatch_t gmatch; // match for the entire regular expression
// (serves as storage for pmatch[0])
const(ubyte)[] program; // pattern[] compiled into regular expression program
OutBuffer buf;
/******************************************/
// Opcodes
enum : ubyte
{
REend, // end of program
REchar, // single character
REichar, // single character, case insensitive
REdchar, // single UCS character
REidchar, // single wide character, case insensitive
REanychar, // any character
REanystar, // ".*"
REstring, // string of characters
REistring, // string of characters, case insensitive
REtestbit, // any in bitmap, non-consuming
REbit, // any in the bit map
REnotbit, // any not in the bit map
RErange, // any in the string
REnotrange, // any not in the string
REor, // a | b
REplus, // 1 or more
REstar, // 0 or more
REquest, // 0 or 1
REnm, // n..m
REnmq, // n..m, non-greedy version
REbol, // beginning of line
REeol, // end of line
REparen, // parenthesized subexpression
REgoto, // goto offset
REwordboundary,
REnotwordboundary,
REdigit,
REnotdigit,
REspace,
REnotspace,
REword,
REnotword,
REbackref,
};
// BUG: should this include '$'?
private int isword(dchar c) { return isAlphaNum(c) || c == '_'; }
private uint inf = ~0u;
/* ********************************
* Throws RegExpException on error
*/
public void compile(string pattern, string attributes)
{
//printf("RegExp.compile('%.*s', '%.*s')\n", pattern.length, pattern.ptr, attributes.length, attributes.ptr);
this.attributes = 0;
foreach (rchar c; attributes)
{ REA att;
switch (c)
{
case 'g': att = REA.global; break;
case 'i': att = REA.ignoreCase; break;
case 'm': att = REA.multiline; break;
default:
error("unrecognized attribute");
return;
}
if (this.attributes & att)
{ error("redundant attribute");
return;
}
this.attributes |= att;
}
input = null;
this.pattern = pattern;
this.flags = attributes;
uint oldre_nsub = re_nsub;
re_nsub = 0;
errors = 0;
buf = new OutBuffer();
buf.reserve(pattern.length * 8);
p = 0;
parseRegexp();
if (p < pattern.length)
{ error("unmatched ')'");
}
// @@@ SKIPPING OPTIMIZATION SOLVES BUG 941 @@@
//optimize();
program = buf.data;
buf.data = null;
delete buf;
if (re_nsub > oldre_nsub)
{
if (pmatch.ptr is &gmatch)
pmatch = null;
pmatch.length = re_nsub + 1;
}
pmatch[0].rm_so = 0;
pmatch[0].rm_eo = 0;
}
/********************************************
* Split s[] into an array of strings, using the regular
* expression as the separator.
* Returns:
* array of slices into s[]
*/
public string[] split(string s)
{
debug(regexp) printf("regexp.split()\n");
string[] result;
if (s.length)
{
sizediff_t p, q;
for (q = p; q != s.length;)
{
if (test(s, q))
{
q = pmatch[0].rm_so;
auto e = pmatch[0].rm_eo;
if (e != p)
{
result ~= s[p .. q];
for (size_t i = 1; i < pmatch.length; i++)
{
auto so = pmatch[i].rm_so;
auto eo = pmatch[i].rm_eo;
if (so == eo)
{ so = 0; // -1 gives array bounds error
eo = 0;
}
result ~= s[so .. eo];
}
q = p = e;
continue;
}
}
q++;
}
result ~= s[p .. s.length];
}
else if (!test(s))
result ~= s;
return result;
}
unittest
{
debug(regexp) printf("regexp.split.unittest()\n");
auto r = new RegExp("a*?", null);
string[] result;
string j;
int i;
result = r.split("ab");
assert(result.length == 2);
i = std.string.cmp(result[0], "a");
assert(i == 0);
i = std.string.cmp(result[1], "b");
assert(i == 0);
r = new RegExp("a*", null);
result = r.split("ab");
assert(result.length == 2);
i = std.string.cmp(result[0], "");
assert(i == 0);
i = std.string.cmp(result[1], "b");
assert(i == 0);
r = new RegExp("<(\\/)?([^<>]+)>", null);
result = r.split("a<b>font</b>bar<TAG>hello</TAG>");
debug(regexp)
{
for (i = 0; i < result.length; i++)
printf("result[%d] = '%.*s'\n", i, result[i].length, result[i].ptr);
}
j = join(result, ",");
//printf("j = '%.*s'\n", j.length, j.ptr);
i = std.string.cmp(j, "a,,b,font,/,b,bar,,TAG,hello,/,TAG,");
assert(i == 0);
r = new RegExp("a[bc]", null);
result = r.match("123ab");
j = join(result, ",");
i = std.string.cmp(j, "ab");
assert(i == 0);
result = r.match("ac");
j = join(result, ",");
i = std.string.cmp(j, "ac");
assert(i == 0);
}
/*************************************************
* Search string[] for match with regular expression.
* Returns:
* index of match if successful, -1 if not found
*/
public sizediff_t find(string string)
{
if (test(string))
return pmatch[0].rm_so;
else
return -1; // no match
}
//deprecated alias find search;
unittest
{
debug(regexp) printf("regexp.find.unittest()\n");
RegExp r = new RegExp("abc", null);
auto i = r.find("xabcy");
assert(i == 1);
i = r.find("cba");
assert(i == -1);
}
/*************************************************
* Search s[] for match.
* Returns:
* If global attribute, return same value as exec(s).
* If not global attribute, return array of all matches.
*/
public string[] match(string s)
{
string[] result;
if (attributes & REA.global)
{
sizediff_t lastindex = 0;
while (test(s, lastindex))
{
auto eo = pmatch[0].rm_eo;
result ~= input[pmatch[0].rm_so .. eo];
if (lastindex == eo)
lastindex++; // always consume some source
else
lastindex = eo;
}
}
else
{
result = exec(s);
}
return result;
}
unittest
{
debug(regexp) printf("regexp.match.unittest()\n");
int i;
string[] result;
string j;
RegExp r;
r = new RegExp("a[bc]", null);
result = r.match("1ab2ac3");
j = join(result, ",");
i = std.string.cmp(j, "ab");
assert(i == 0);
r = new RegExp("a[bc]", "g");
result = r.match("1ab2ac3");
j = join(result, ",");
i = std.string.cmp(j, "ab,ac");
assert(i == 0);
}
/*************************************************
* Find regular expression matches in s[]. Replace those matches
* with a new string composed of format[] merged with the result of the
* matches.
* If global, replace all matches. Otherwise, replace first match.
* Returns: the new string
*/
public string replace(string s, string format)
{
debug(regexp) printf("string = %.*s, format = %.*s\n", s.length, s.ptr, format.length, format.ptr);
string result = s;
sizediff_t lastindex = 0;
size_t offset = 0;
for (;;)
{
if (!test(s, lastindex))
break;
auto so = pmatch[0].rm_so;
auto eo = pmatch[0].rm_eo;
string replacement = replace(format);
// Optimize by using replace if possible - Dave Fladebo
string slice = result[offset + so .. offset + eo];
if (attributes & REA.global && // global, so replace all
!(attributes & REA.ignoreCase) && // not ignoring case
!(attributes & REA.multiline) && // not multiline
pattern == slice && // simple pattern (exact match, no special characters)
format == replacement) // simple format, not $ formats
{
debug(regexp)
{
auto sss = result[offset + so .. offset + eo];
printf("pattern: %.*s, slice: %.*s, format: %.*s, replacement: %.*s\n",
pattern.length, pattern.ptr, sss.length, sss.ptr, format.length, format.ptr, replacement.length, replacement.ptr);
}
result = std.array.replace(result,slice,replacement);
break;
}
result = replaceSlice(result, result[offset + so .. offset + eo], replacement);
if (attributes & REA.global)
{
offset += replacement.length - (eo - so);
if (lastindex == eo)
lastindex++; // always consume some source
else
lastindex = eo;
}
else
break;
}
return result;
}
unittest
{
debug(regexp) printf("regexp.replace.unittest()\n");
int i;
string result;
RegExp r;
r = new RegExp("a[bc]", "g");
result = r.replace("1ab2ac3", "x$&y");
i = std.string.cmp(result, "1xaby2xacy3");
assert(i == 0);
r = new RegExp("ab", "g");
result = r.replace("1ab2ac3", "xy");
i = std.string.cmp(result, "1xy2ac3");
assert(i == 0);
}
/*************************************************
* Search string[] for match.
* Returns:
* array of slices into string[] representing matches
*/
public string[] exec(string s)
{
debug(regexp) printf("regexp.exec(string = '%.*s')\n", s.length, s.ptr);
input = s;
pmatch[0].rm_so = 0;
pmatch[0].rm_eo = 0;
return exec();
}
/*************************************************
* Pick up where last exec(string) or exec() left off,
* searching string[] for next match.
* Returns:
* array of slices into string[] representing matches
*/
public string[] exec()
{
if (!test())
return null;
auto result = new string[pmatch.length];
for (int i = 0; i < pmatch.length; i++)
{
if (pmatch[i].rm_so == pmatch[i].rm_eo)
result[i] = null;
else
result[i] = input[pmatch[i].rm_so .. pmatch[i].rm_eo];
}
return result;
}
/************************************************
* Search s[] for match.
* Returns: 0 for no match, !=0 for match
* Example:
---
import std.stdio;
import std.regexp;
import std.string;
int grep(int delegate(char[]) pred, char[][] list)
{
int count;
foreach (s; list)
{ if (pred(s))
++count;
}
return count;
}
void main()
{
auto x = grep(&RegExp("[Ff]oo").test,
std.string.split("mary had a foo lamb"));
writefln(x);
}
---
* which prints: 1
*/
//@@@
public bool test(string s)
{
return test(s, 0 /*pmatch[0].rm_eo*/) != 0;
}
/************************************************
* Pick up where last test(string) or test() left off, and search again.
* Returns: 0 for no match, !=0 for match
*/
public int test()
{
return test(input, pmatch[0].rm_eo);
}
/************************************************
* Test s[] starting at startindex against regular expression.
* Returns: 0 for no match, !=0 for match
*/
public int test(string s, size_t startindex)
{
char firstc;
input = s;
debug (regexp) printf("RegExp.test(input[] = '%.*s', startindex = %zd)\n", input.length, input.ptr, startindex);
pmatch[0].rm_so = 0;
pmatch[0].rm_eo = 0;
if (startindex < 0 || startindex > input.length)
{
return 0; // fail
}
//debug(regexp) printProgram(program);
// First character optimization
firstc = 0;
if (program[0] == REchar)
{
firstc = program[1];
if (attributes & REA.ignoreCase && isAlpha(firstc))
firstc = 0;
}
for (auto si = startindex; ; si++)
{
if (firstc)
{
if (si == input.length)
break; // no match
if (input[si] != firstc)
{
si++;
if (!chr(si, firstc)) // if first character not found
break; // no match
}
}
for (size_t i = 0; i < re_nsub + 1; i++)
{
pmatch[i].rm_so = -1;
pmatch[i].rm_eo = -1;
}
src_start = src = si;
if (trymatch(0, program.length))
{
pmatch[0].rm_so = si;
pmatch[0].rm_eo = src;
//debug(regexp) printf("start = %d, end = %d\n", gmatch.rm_so, gmatch.rm_eo);
return 1;
}
// If possible match must start at beginning, we are done
if (program[0] == REbol || program[0] == REanystar)
{
if (attributes & REA.multiline)
{
// Scan for the next \n
if (!chr(si, '\n'))
break; // no match if '\n' not found
}
else
break;
}
if (si == input.length)
break;
debug(regexp)
{
auto sss = input[si + 1 .. input.length];
printf("Starting new try: '%.*s'\n", sss.length, sss.ptr);
}
}
return 0; // no match
}
/**
Returns whether string $(D_PARAM s) matches $(D_PARAM this).
*/
alias test opEquals;
// bool opEquals(string s)
// {
// return test(s);
// }
unittest
{
assert("abc" == RegExp(".b."));
assert("abc" != RegExp(".b.."));
}
int chr(ref size_t si, rchar c)
{
for (; si < input.length; si++)
{
if (input[si] == c)
return 1;
}
return 0;
}
void printProgram(const(ubyte)[] prog)
{
//debug(regexp)
{
size_t len;
uint n;
uint m;
ushort *pu;
uint *puint;
char[] str;
printf("printProgram()\n");
for (size_t pc = 0; pc < prog.length; )
{
printf("%3d: ", pc);
//printf("prog[pc] = %d, REchar = %d, REnmq = %d\n", prog[pc], REchar, REnmq);
switch (prog[pc])
{
case REchar:
printf("\tREchar '%c'\n", prog[pc + 1]);
pc += 1 + char.sizeof;
break;
case REichar:
printf("\tREichar '%c'\n", prog[pc + 1]);
pc += 1 + char.sizeof;
break;
case REdchar:
printf("\tREdchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
pc += 1 + dchar.sizeof;
break;
case REidchar:
printf("\tREidchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
pc += 1 + dchar.sizeof;
break;
case REanychar:
printf("\tREanychar\n");
pc++;
break;
case REstring:
len = *cast(size_t *)&prog[pc + 1];
str = (cast(char*)&prog[pc + 1 + size_t.sizeof])[0 .. len];
printf("\tREstring x%x, '%.*s'\n", len, str.length, str.ptr);
pc += 1 + size_t.sizeof + len * rchar.sizeof;
break;
case REistring:
len = *cast(size_t *)&prog[pc + 1];
str = (cast(char*)&prog[pc + 1 + size_t.sizeof])[0 .. len];
printf("\tREistring x%x, '%.*s'\n", len, str.length, str.ptr);
pc += 1 + size_t.sizeof + len * rchar.sizeof;
break;
case REtestbit:
pu = cast(ushort *)&prog[pc + 1];
printf("\tREtestbit %d, %d\n", pu[0], pu[1]);
len = pu[1];
pc += 1 + 2 * ushort.sizeof + len;
break;
case REbit:
pu = cast(ushort *)&prog[pc + 1];
len = pu[1];
printf("\tREbit cmax=%02x, len=%d:", pu[0], len);
for (n = 0; n < len; n++)
printf(" %02x", prog[pc + 1 + 2 * ushort.sizeof + n]);
printf("\n");
pc += 1 + 2 * ushort.sizeof + len;
break;
case REnotbit:
pu = cast(ushort *)&prog[pc + 1];
printf("\tREnotbit %d, %d\n", pu[0], pu[1]);
len = pu[1];
pc += 1 + 2 * ushort.sizeof + len;
break;
case RErange:
len = *cast(uint *)&prog[pc + 1];
printf("\tRErange %d\n", len);
// BUG: REAignoreCase?
pc += 1 + uint.sizeof + len;
break;
case REnotrange:
len = *cast(uint *)&prog[pc + 1];
printf("\tREnotrange %d\n", len);
// BUG: REAignoreCase?
pc += 1 + uint.sizeof + len;
break;
case REbol:
printf("\tREbol\n");
pc++;
break;
case REeol:
printf("\tREeol\n");
pc++;
break;
case REor:
len = *cast(uint *)&prog[pc + 1];
printf("\tREor %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len);
pc += 1 + uint.sizeof;
break;
case REgoto:
len = *cast(uint *)&prog[pc + 1];
printf("\tREgoto %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len);
pc += 1 + uint.sizeof;
break;
case REanystar:
printf("\tREanystar\n");
pc++;
break;
case REnm:
case REnmq:
// len, n, m, ()
puint = cast(uint *)&prog[pc + 1];
len = puint[0];
n = puint[1];
m = puint[2];
printf("\tREnm%s len=%d, n=%u, m=%u, pc=>%d\n",
(prog[pc] == REnmq) ? "q".ptr : " ".ptr,
len, n, m, pc + 1 + uint.sizeof * 3 + len);
pc += 1 + uint.sizeof * 3;
break;
case REparen:
// len, n, ()
puint = cast(uint *)&prog[pc + 1];
len = puint[0];
n = puint[1];
printf("\tREparen len=%d n=%d, pc=>%d\n", len, n, pc + 1 + uint.sizeof * 2 + len);
pc += 1 + uint.sizeof * 2;
break;
case REend:
printf("\tREend\n");
return;
case REwordboundary:
printf("\tREwordboundary\n");
pc++;
break;
case REnotwordboundary:
printf("\tREnotwordboundary\n");
pc++;
break;
case REdigit:
printf("\tREdigit\n");
pc++;
break;
case REnotdigit:
printf("\tREnotdigit\n");
pc++;
break;
case REspace:
printf("\tREspace\n");
pc++;
break;
case REnotspace:
printf("\tREnotspace\n");
pc++;
break;
case REword:
printf("\tREword\n");
pc++;
break;
case REnotword:
printf("\tREnotword\n");
pc++;
break;
case REbackref:
printf("\tREbackref %d\n", prog[1]);
pc += 2;
break;
default:
assert(0);
}
}
}
}
/**************************************************
* Match input against a section of the program[].
* Returns:
* 1 if successful match
* 0 no match
*/
int trymatch(size_t pc, size_t pcend)
{
size_t len;
size_t n;
size_t m;
size_t count;
size_t pop;
size_t ss;
regmatch_t *psave;
size_t c1;
size_t c2;
ushort* pu;
uint* puint;
debug(regexp)
{
auto sss = input[src .. input.length];
printf("RegExp.trymatch(pc = %zd, src = '%.*s', pcend = %zd)\n", pc, sss.length, sss.ptr, pcend);
}
auto srcsave = src;
psave = null;
for (;;)
{
if (pc == pcend) // if done matching
{ debug(regex) printf("\tprogend\n");
return 1;
}
//printf("\top = %d\n", program[pc]);
switch (program[pc])
{
case REchar:
if (src == input.length)
goto Lnomatch;
debug(regexp) printf("\tREchar '%c', src = '%c'\n", program[pc + 1], input[src]);
if (program[pc + 1] != input[src])
goto Lnomatch;
src++;
pc += 1 + char.sizeof;
break;
case REichar:
if (src == input.length)
goto Lnomatch;
debug(regexp) printf("\tREichar '%c', src = '%c'\n", program[pc + 1], input[src]);
c1 = program[pc + 1];
c2 = input[src];
if (c1 != c2)
{
if (isLower(cast(rchar)c2))
c2 = std.ascii.toUpper(cast(rchar)c2);
else
goto Lnomatch;
if (c1 != c2)
goto Lnomatch;
}
src++;
pc += 1 + char.sizeof;
break;
case REdchar:
debug(regexp) printf("\tREdchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
if (src == input.length)
goto Lnomatch;
if (*(cast(dchar *)&program[pc + 1]) != input[src])
goto Lnomatch;
src++;
pc += 1 + dchar.sizeof;
break;
case REidchar:
debug(regexp) printf("\tREidchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
if (src == input.length)
goto Lnomatch;
c1 = *(cast(dchar *)&program[pc + 1]);
c2 = input[src];
if (c1 != c2)
{
if (isLower(cast(rchar)c2))
c2 = std.ascii.toUpper(cast(rchar)c2);
else
goto Lnomatch;
if (c1 != c2)
goto Lnomatch;
}
src++;
pc += 1 + dchar.sizeof;
break;
case REanychar:
debug(regexp) printf("\tREanychar\n");
if (src == input.length)
goto Lnomatch;
if (!(attributes & REA.dotmatchlf) && input[src] == cast(rchar)'\n')
goto Lnomatch;
src += std.utf.stride(input, src);
//src++;
pc++;
break;
case REstring:
len = *cast(size_t *)&program[pc + 1];
debug(regexp)
{
auto sss2 = (&program[pc + 1 + size_t.sizeof])[0 .. len];
printf("\tREstring x%x, '%.*s'\n", len, sss2.length, sss2.ptr);
}
if (src + len > input.length)
goto Lnomatch;
if (memcmp(&program[pc + 1 + size_t.sizeof], &input[src], len * rchar.sizeof))
goto Lnomatch;
src += len;
pc += 1 + size_t.sizeof + len * rchar.sizeof;
break;
case REistring:
len = *cast(size_t *)&program[pc + 1];
debug(regexp)
{
auto sss2 = (&program[pc + 1 + size_t.sizeof])[0 .. len];
printf("\tREistring x%x, '%.*s'\n", len, sss2.length, sss2.ptr);
}
if (src + len > input.length)
goto Lnomatch;
if (icmp((cast(char*)&program[pc + 1 + size_t.sizeof])[0..len],
input[src .. src + len]))
goto Lnomatch;
src += len;
pc += 1 + size_t.sizeof + len * rchar.sizeof;
break;
case REtestbit:
pu = (cast(ushort *)&program[pc + 1]);
if (src == input.length)
goto Lnomatch;
debug(regexp) printf("\tREtestbit %d, %d, '%c', x%02x\n",
pu[0], pu[1], input[src], input[src]);
len = pu[1];
c1 = input[src];
//printf("[x%02x]=x%02x, x%02x\n", c1 >> 3, ((&program[pc + 1 + 4])[c1 >> 3] ), (1 << (c1 & 7)));
if (c1 <= pu[0] &&
!((&(program[pc + 1 + 4]))[c1 >> 3] & (1 << (c1 & 7))))
goto Lnomatch;
pc += 1 + 2 * ushort.sizeof + len;
break;
case REbit:
pu = (cast(ushort *)&program[pc + 1]);
if (src == input.length)
goto Lnomatch;
debug(regexp) printf("\tREbit %d, %d, '%c'\n",
pu[0], pu[1], input[src]);
len = pu[1];
c1 = input[src];
if (c1 > pu[0])
goto Lnomatch;
if (!((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7))))
goto Lnomatch;
src++;
pc += 1 + 2 * ushort.sizeof + len;
break;
case REnotbit:
pu = (cast(ushort *)&program[pc + 1]);
if (src == input.length)
goto Lnomatch;
debug(regexp) printf("\tREnotbit %d, %d, '%c'\n",
pu[0], pu[1], input[src]);
len = pu[1];
c1 = input[src];
if (c1 <= pu[0] &&
((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7))))
goto Lnomatch;
src++;
pc += 1 + 2 * ushort.sizeof + len;
break;
case RErange:
len = *cast(uint *)&program[pc + 1];
debug(regexp) printf("\tRErange %d\n", len);
if (src == input.length)
goto Lnomatch;
// BUG: REA.ignoreCase?
if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) == null)
goto Lnomatch;
src++;
pc += 1 + uint.sizeof + len;
break;
case REnotrange:
len = *cast(uint *)&program[pc + 1];
debug(regexp) printf("\tREnotrange %d\n", len);
if (src == input.length)
goto Lnomatch;
// BUG: REA.ignoreCase?
if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) != null)
goto Lnomatch;
src++;
pc += 1 + uint.sizeof + len;
break;
case REbol:
debug(regexp) printf("\tREbol\n");
if (src == 0)
{
}
else if (attributes & REA.multiline)
{
if (input[src - 1] != '\n')
goto Lnomatch;
}
else
goto Lnomatch;
pc++;
break;
case REeol:
debug(regexp) printf("\tREeol\n");
if (src == input.length)
{
}
else if (attributes & REA.multiline && input[src] == '\n')
src++;
else
goto Lnomatch;
pc++;
break;
case REor:
len = (cast(uint *)&program[pc + 1])[0];
debug(regexp) printf("\tREor %d\n", len);
pop = pc + 1 + uint.sizeof;
ss = src;
if (trymatch(pop, pcend))
{
if (pcend != program.length)
{
auto s = src;
if (trymatch(pcend, program.length))
{ debug(regexp) printf("\tfirst operand matched\n");
src = s;
return 1;
}
else
{
// If second branch doesn't match to end, take first anyway
src = ss;
if (!trymatch(pop + len, program.length))
{
debug(regexp) printf("\tfirst operand matched\n");
src = s;
return 1;
}
}
src = ss;
}
else
{ debug(regexp) printf("\tfirst operand matched\n");
return 1;
}
}
pc = pop + len; // proceed with 2nd branch
break;
case REgoto:
debug(regexp) printf("\tREgoto\n");
len = (cast(uint *)&program[pc + 1])[0];
pc += 1 + uint.sizeof + len;
break;
case REanystar:
debug(regexp) printf("\tREanystar\n");
pc++;
for (;;)
{
auto s1 = src;
if (src == input.length)
break;
if (!(attributes & REA.dotmatchlf) && input[src] == '\n')
break;
src++;
auto s2 = src;
// If no match after consumption, but it
// did match before, then no match
if (!trymatch(pc, program.length))
{
src = s1;
// BUG: should we save/restore pmatch[]?
if (trymatch(pc, program.length))
{
src = s1; // no match
break;
}
}
src = s2;
}
break;
case REnm:
case REnmq:
// len, n, m, ()
puint = cast(uint *)&program[pc + 1];
len = puint[0];
n = puint[1];
m = puint[2];
debug(regexp) printf("\tREnm%s len=%d, n=%u, m=%u\n",
(program[pc] == REnmq) ? "q".ptr : "".ptr, len, n, m);
pop = pc + 1 + uint.sizeof * 3;
for (count = 0; count < n; count++)
{
if (!trymatch(pop, pop + len))
goto Lnomatch;
}
if (!psave && count < m)
{
//version (Win32)
psave = cast(regmatch_t *)alloca((re_nsub + 1) * regmatch_t.sizeof);
//else
//psave = new regmatch_t[re_nsub + 1];
}
if (program[pc] == REnmq) // if minimal munch
{
for (; count < m; count++)
{
memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof);
auto s1 = src;
if (trymatch(pop + len, program.length))
{
src = s1;
memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof);
break;
}
if (!trymatch(pop, pop + len))
{ debug(regexp) printf("\tdoesn't match subexpression\n");
break;
}
// If source is not consumed, don't
// infinite loop on the match
if (s1 == src)
{ debug(regexp) printf("\tsource is not consumed\n");
break;
}
}
}
else // maximal munch
{
for (; count < m; count++)
{
memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof);
auto s1 = src;
if (!trymatch(pop, pop + len))
{ debug(regexp) printf("\tdoesn't match subexpression\n");
break;
}
auto s2 = src;
// If source is not consumed, don't
// infinite loop on the match
if (s1 == s2)
{ debug(regexp) printf("\tsource is not consumed\n");
break;
}
// If no match after consumption, but it
// did match before, then no match
if (!trymatch(pop + len, program.length))
{
src = s1;
if (trymatch(pop + len, program.length))
{
src = s1; // no match
memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof);
break;
}
}
src = s2;
}
}
debug(regexp) printf("\tREnm len=%d, n=%u, m=%u, DONE count=%d\n", len, n, m, count);
pc = pop + len;
break;
case REparen:
// len, ()
debug(regexp) printf("\tREparen\n");
puint = cast(uint *)&program[pc + 1];
len = puint[0];
n = puint[1];
pop = pc + 1 + uint.sizeof * 2;
ss = src;
if (!trymatch(pop, pop + len))
goto Lnomatch;
pmatch[n + 1].rm_so = ss;
pmatch[n + 1].rm_eo = src;
pc = pop + len;
break;
case REend:
debug(regexp) printf("\tREend\n");
return 1; // successful match
case REwordboundary:
debug(regexp) printf("\tREwordboundary\n");
if (src > 0 && src < input.length)
{
c1 = input[src - 1];
c2 = input[src];
if (!(
(isword(cast(rchar)c1) && !isword(cast(rchar)c2)) ||
(!isword(cast(rchar)c1) && isword(cast(rchar)c2))
)
)
goto Lnomatch;
}
pc++;
break;
case REnotwordboundary:
debug(regexp) printf("\tREnotwordboundary\n");
if (src == 0 || src == input.length)
goto Lnomatch;
c1 = input[src - 1];
c2 = input[src];
if (
(isword(cast(rchar)c1) && !isword(cast(rchar)c2)) ||
(!isword(cast(rchar)c1) && isword(cast(rchar)c2))
)
goto Lnomatch;
pc++;
break;
case REdigit:
debug(regexp) printf("\tREdigit\n");
if (src == input.length)
goto Lnomatch;
if (!isDigit(input[src]))
goto Lnomatch;
src++;
pc++;
break;
case REnotdigit:
debug(regexp) printf("\tREnotdigit\n");
if (src == input.length)
goto Lnomatch;
if (isDigit(input[src]))
goto Lnomatch;
src++;
pc++;
break;
case REspace:
debug(regexp) printf("\tREspace\n");
if (src == input.length)
goto Lnomatch;
if (!isWhite(input[src]))
goto Lnomatch;
src++;
pc++;
break;
case REnotspace:
debug(regexp) printf("\tREnotspace\n");
if (src == input.length)
goto Lnomatch;
if (isWhite(input[src]))
goto Lnomatch;
src++;
pc++;
break;
case REword:
debug(regexp) printf("\tREword\n");
if (src == input.length)
goto Lnomatch;
if (!isword(input[src]))
goto Lnomatch;
src++;
pc++;
break;
case REnotword:
debug(regexp) printf("\tREnotword\n");
if (src == input.length)
goto Lnomatch;
if (isword(input[src]))
goto Lnomatch;
src++;
pc++;
break;
case REbackref:
{
n = program[pc + 1];
debug(regexp) printf("\tREbackref %d\n", n);
auto so = pmatch[n + 1].rm_so;
auto eo = pmatch[n + 1].rm_eo;
len = eo - so;
if (src + len > input.length)
goto Lnomatch;
else if (attributes & REA.ignoreCase)
{
if (icmp(input[src .. src + len], input[so .. eo]))
goto Lnomatch;
}
else if (memcmp(&input[src], &input[so], len * rchar.sizeof))
goto Lnomatch;
src += len;
pc += 2;
break;
}
default:
assert(0);
}
}
Lnomatch:
debug(regexp) printf("\tnomatch pc=%d\n", pc);
src = srcsave;
return 0;
}
/* =================== Compiler ================== */
int parseRegexp()
{
size_t gotooffset;
uint len1;
uint len2;
debug(regexp)
{
auto sss = pattern[p .. pattern.length];
printf("parseRegexp() '%.*s'\n", sss.length, sss.ptr);
}
auto offset = buf.offset;
for (;;)
{
assert(p <= pattern.length);
if (p == pattern.length)
{ buf.write(REend);
return 1;
}
switch (pattern[p])
{
case ')':
return 1;
case '|':
p++;
gotooffset = buf.offset;
buf.write(REgoto);
buf.write(cast(uint)0);
len1 = cast(uint)(buf.offset - offset);
buf.spread(offset, 1 + uint.sizeof);
gotooffset += 1 + uint.sizeof;
parseRegexp();
len2 = cast(uint)(buf.offset - (gotooffset + 1 + uint.sizeof));
buf.data[offset] = REor;
(cast(uint *)&buf.data[offset + 1])[0] = len1;
(cast(uint *)&buf.data[gotooffset + 1])[0] = len2;
break;
default:
parsePiece();
break;
}
}
}
int parsePiece()
{
uint len;
uint n;
uint m;
ubyte op;
auto plength = pattern.length;
debug(regexp)
{
auto sss = pattern[p .. pattern.length];
printf("parsePiece() '%.*s'\n", sss.length, sss.ptr);
}
auto offset = buf.offset;
parseAtom();
if (p == plength)
return 1;
switch (pattern[p])
{
case '*':
// Special optimization: replace .* with REanystar
if (buf.offset - offset == 1 &&
buf.data[offset] == REanychar &&
p + 1 < plength &&
pattern[p + 1] != '?')
{
buf.data[offset] = REanystar;
p++;
break;
}
n = 0;
m = inf;
goto Lnm;
case '+':
n = 1;
m = inf;
goto Lnm;
case '?':
n = 0;
m = 1;
goto Lnm;
case '{': // {n} {n,} {n,m}
p++;
if (p == plength || !isDigit(pattern[p]))
goto Lerr;
n = 0;
do
{
// BUG: handle overflow
n = n * 10 + pattern[p] - '0';
p++;
if (p == plength)
goto Lerr;
} while (isDigit(pattern[p]));
if (pattern[p] == '}') // {n}
{ m = n;
goto Lnm;
}
if (pattern[p] != ',')
goto Lerr;
p++;
if (p == plength)
goto Lerr;
if (pattern[p] == /*{*/ '}') // {n,}
{ m = inf;
goto Lnm;
}
if (!isDigit(pattern[p]))
goto Lerr;
m = 0; // {n,m}
do
{
// BUG: handle overflow
m = m * 10 + pattern[p] - '0';
p++;
if (p == plength)
goto Lerr;
} while (isDigit(pattern[p]));
if (pattern[p] != /*{*/ '}')
goto Lerr;
goto Lnm;
Lnm:
p++;
op = REnm;
if (p < plength && pattern[p] == '?')
{ op = REnmq; // minimal munch version
p++;
}
len = cast(uint)(buf.offset - offset);
buf.spread(offset, 1 + uint.sizeof * 3);
buf.data[offset] = op;
uint* puint = cast(uint *)&buf.data[offset + 1];
puint[0] = len;
puint[1] = n;
puint[2] = m;
break;
default:
break;
}
return 1;
Lerr:
error("badly formed {n,m}");
assert(0);
}
int parseAtom()
{ ubyte op;
size_t offset;
rchar c;
debug(regexp)
{
auto sss = pattern[p .. pattern.length];
printf("parseAtom() '%.*s'\n", sss.length, sss.ptr);
}
if (p < pattern.length)
{
c = pattern[p];
switch (c)
{
case '*':
case '+':
case '?':
error("*+? not allowed in atom");
p++;
return 0;
case '(':
p++;
buf.write(REparen);
offset = buf.offset;
buf.write(cast(uint)0); // reserve space for length
buf.write(re_nsub);
re_nsub++;
parseRegexp();
*cast(uint *)&buf.data[offset] =
cast(uint)(buf.offset - (offset + uint.sizeof * 2));
if (p == pattern.length || pattern[p] != ')')
{
error("')' expected");
return 0;
}
p++;
break;
case '[':
if (!parseRange())
return 0;
break;
case '.':
p++;
buf.write(REanychar);
break;
case '^':
p++;
buf.write(REbol);
break;
case '$':
p++;
buf.write(REeol);
break;
case '\\':
p++;
if (p == pattern.length)
{ error("no character past '\\'");
return 0;
}
c = pattern[p];
switch (c)
{
case 'b': op = REwordboundary; goto Lop;
case 'B': op = REnotwordboundary; goto Lop;
case 'd': op = REdigit; goto Lop;
case 'D': op = REnotdigit; goto Lop;
case 's': op = REspace; goto Lop;
case 'S': op = REnotspace; goto Lop;
case 'w': op = REword; goto Lop;
case 'W': op = REnotword; goto Lop;
Lop:
buf.write(op);
p++;
break;
case 'f':
case 'n':
case 'r':
case 't':
case 'v':
case 'c':
case 'x':
case 'u':
case '0':
c = cast(char)escape();
goto Lbyte;
case '1': case '2': case '3':
case '4': case '5': case '6':
case '7': case '8': case '9':
c -= '1';
if (c < re_nsub)
{ buf.write(REbackref);
buf.write(cast(ubyte)c);
}
else
{ error("no matching back reference");
return 0;
}
p++;
break;
default:
p++;
goto Lbyte;
}
break;
default:
p++;
Lbyte:
op = REchar;
if (attributes & REA.ignoreCase)
{
if (isAlpha(c))
{
op = REichar;
c = cast(char)std.ascii.toUpper(c);
}
}
if (op == REchar && c <= 0xFF)
{
// Look ahead and see if we can make this into
// an REstring
auto q = p;
for (; q < pattern.length; ++q)
{ rchar qc = pattern[q];
switch (qc)
{
case '{':
case '*':
case '+':
case '?':
if (q == p)
goto Lchar;
q--;
break;
case '(': case ')':
case '|':
case '[': case ']':
case '.': case '^':
case '$': case '\\':
case '}':
break;
default:
continue;
}
break;
}
auto len = q - p;
if (len > 0)
{
debug(regexp) printf("writing string len %d, c = '%c', pattern[p] = '%c'\n", len+1, c, pattern[p]);
buf.reserve(5 + (1 + len) * rchar.sizeof);
buf.write((attributes & REA.ignoreCase) ? REistring : REstring);
buf.write(len + 1);
buf.write(c);
buf.write(pattern[p .. p + len]);
p = q;
break;
}
}
if (c >= 0x80)
{
// Convert to dchar opcode
op = (op == REchar) ? REdchar : REidchar;
buf.write(op);
buf.write(c);
}
else
{
Lchar:
debug(regexp) printf("It's an REchar '%c'\n", c);
buf.write(op);
buf.write(cast(char)c);
}
break;
}
}
return 1;
}
private:
class Range
{
size_t maxc;
size_t maxb;
OutBuffer buf;
ubyte* base;
BitArray bits;
this(OutBuffer buf)
{
this.buf = buf;
if (buf.data.length)
this.base = &buf.data[buf.offset];
}
void setbitmax(size_t u)
{
//printf("setbitmax(x%x), maxc = x%x\n", u, maxc);
if (u > maxc)
{
maxc = u;
auto b = u / 8;
if (b >= maxb)
{
auto u2 = base ? base - &buf.data[0] : 0;
buf.fill0(b - maxb + 1);
base = &buf.data[u2];
maxb = b + 1;
//bits = (cast(bit*)this.base)[0 .. maxc + 1];
bits.ptr = cast(size_t*)this.base;
}
bits.len = maxc + 1;
}
}
void setbit2(size_t u)
{
setbitmax(u + 1);
//printf("setbit2 [x%02x] |= x%02x\n", u >> 3, 1 << (u & 7));
bits[u] = 1;
}
};
int parseRange()
{
int c;
int c2;
uint i;
uint cmax;
cmax = 0x7F;
p++;
ubyte op = REbit;
if (p == pattern.length)
goto Lerr;
if (pattern[p] == '^')
{ p++;
op = REnotbit;
if (p == pattern.length)
goto Lerr;
}
buf.write(op);
auto offset = buf.offset;
buf.write(cast(uint)0); // reserve space for length
buf.reserve(128 / 8);
auto r = new Range(buf);
if (op == REnotbit)
r.setbit2(0);
switch (pattern[p])
{
case ']':
case '-':
c = pattern[p];
p++;
r.setbit2(c);
break;
default:
break;
}
enum RS { start, rliteral, dash }
RS rs;
rs = RS.start;
for (;;)
{
if (p == pattern.length)
goto Lerr;
switch (pattern[p])
{
case ']':
switch (rs)
{ case RS.dash:
r.setbit2('-');
goto case;
case RS.rliteral:
r.setbit2(c);
break;
case RS.start:
break;
default:
assert(0);
}
p++;
break;
case '\\':
p++;
r.setbitmax(cmax);
if (p == pattern.length)
goto Lerr;
switch (pattern[p])
{
case 'd':
for (i = '0'; i <= '9'; i++)
r.bits[i] = 1;
goto Lrs;
case 'D':
for (i = 1; i < '0'; i++)
r.bits[i] = 1;
for (i = '9' + 1; i <= cmax; i++)
r.bits[i] = 1;
goto Lrs;
case 's':
for (i = 0; i <= cmax; i++)
if (isWhite(i))
r.bits[i] = 1;
goto Lrs;
case 'S':
for (i = 1; i <= cmax; i++)
if (!isWhite(i))
r.bits[i] = 1;
goto Lrs;
case 'w':
for (i = 0; i <= cmax; i++)
if (isword(cast(rchar)i))
r.bits[i] = 1;
goto Lrs;
case 'W':
for (i = 1; i <= cmax; i++)
if (!isword(cast(rchar)i))
r.bits[i] = 1;
goto Lrs;
Lrs:
switch (rs)
{ case RS.dash:
r.setbit2('-');
goto case;
case RS.rliteral:
r.setbit2(c);
break;
default:
break;
}
rs = RS.start;
continue;
default:
break;
}
c2 = escape();
goto Lrange;
case '-':
p++;
if (rs == RS.start)
goto Lrange;
else if (rs == RS.rliteral)
rs = RS.dash;
else if (rs == RS.dash)
{
r.setbit2(c);
r.setbit2('-');
rs = RS.start;
}
continue;
default:
c2 = pattern[p];
p++;
Lrange:
switch (rs)
{ case RS.rliteral:
r.setbit2(c);
goto case;
case RS.start:
c = c2;
rs = RS.rliteral;
break;
case RS.dash:
if (c > c2)
{ error("inverted range in character class");
return 0;
}
r.setbitmax(c2);
//printf("c = %x, c2 = %x\n",c,c2);
for (; c <= c2; c++)
r.bits[c] = 1;
rs = RS.start;
break;
default:
assert(0);
}
continue;
}
break;
}
if (attributes & REA.ignoreCase)
{
// BUG: what about dchar?
r.setbitmax(0x7F);
for (c = 'a'; c <= 'z'; c++)
{
if (r.bits[c])
r.bits[c + 'A' - 'a'] = 1;
else if (r.bits[c + 'A' - 'a'])
r.bits[c] = 1;
}
}
//printf("maxc = %d, maxb = %d\n",r.maxc,r.maxb);
(cast(ushort *)&buf.data[offset])[0] = cast(ushort)r.maxc;
(cast(ushort *)&buf.data[offset])[1] = cast(ushort)r.maxb;
return 1;
Lerr:
error("invalid range");
return 0;
}
void error(string msg)
{
errors++;
debug(regexp) printf("error: %.*s\n", msg.length, msg.ptr);
//assert(0);
//*(char*)0=0;
throw new RegExpException(msg);
}
// p is following the \ char
int escape()
in
{
assert(p < pattern.length);
}
body
{ int c;
int i;
rchar tc;
c = pattern[p]; // none of the cases are multibyte
switch (c)
{
case 'b': c = '\b'; break;
case 'f': c = '\f'; break;
case 'n': c = '\n'; break;
case 'r': c = '\r'; break;
case 't': c = '\t'; break;
case 'v': c = '\v'; break;
// BUG: Perl does \a and \e too, should we?
case 'c':
++p;
if (p == pattern.length)
goto Lretc;
c = pattern[p];
// Note: we are deliberately not allowing dchar letters
if (!(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')))
{
Lcerr:
error("letter expected following \\c");
return 0;
}
c &= 0x1F;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
c -= '0';
for (i = 0; i < 2; i++)
{
p++;
if (p == pattern.length)
goto Lretc;
tc = pattern[p];
if ('0' <= tc && tc <= '7')
{ c = c * 8 + (tc - '0');
// Treat overflow as if last
// digit was not an octal digit
if (c >= 0xFF)
{ c >>= 3;
return c;
}
}
else
return c;
}
break;
case 'x':
c = 0;
for (i = 0; i < 2; i++)
{
p++;
if (p == pattern.length)
goto Lretc;
tc = pattern[p];
if ('0' <= tc && tc <= '9')
c = c * 16 + (tc - '0');
else if ('a' <= tc && tc <= 'f')
c = c * 16 + (tc - 'a' + 10);
else if ('A' <= tc && tc <= 'F')
c = c * 16 + (tc - 'A' + 10);
else if (i == 0) // if no hex digits after \x
{
// Not a valid \xXX sequence
return 'x';
}
else
return c;
}
break;
case 'u':
c = 0;
for (i = 0; i < 4; i++)
{
p++;
if (p == pattern.length)
goto Lretc;
tc = pattern[p];
if ('0' <= tc && tc <= '9')
c = c * 16 + (tc - '0');
else if ('a' <= tc && tc <= 'f')
c = c * 16 + (tc - 'a' + 10);
else if ('A' <= tc && tc <= 'F')
c = c * 16 + (tc - 'A' + 10);
else
{
// Not a valid \uXXXX sequence
p -= i;
return 'u';
}
}
break;
default:
break;
}
p++;
Lretc:
return c;
}
/* ==================== optimizer ======================= */
void optimize()
{ ubyte[] prog;
debug(regexp) printf("RegExp.optimize()\n");
prog = buf.toBytes();
for (size_t i = 0; 1;)
{
//printf("\tprog[%d] = %d, %d\n", i, prog[i], REstring);
switch (prog[i])
{
case REend:
case REanychar:
case REanystar:
case REbackref:
case REeol:
case REchar:
case REichar:
case REdchar:
case REidchar:
case REstring:
case REistring:
case REtestbit:
case REbit:
case REnotbit:
case RErange:
case REnotrange:
case REwordboundary:
case REnotwordboundary:
case REdigit:
case REnotdigit:
case REspace:
case REnotspace:
case REword:
case REnotword:
return;
case REbol:
i++;
continue;
case REor:
case REnm:
case REnmq:
case REparen:
case REgoto:
{
auto bitbuf = new OutBuffer;
auto r = new Range(bitbuf);
auto offset = i;
if (starrchars(r, prog[i .. prog.length]))
{
debug(regexp) printf("\tfilter built\n");
buf.spread(offset, 1 + 4 + r.maxb);
buf.data[offset] = REtestbit;
(cast(ushort *)&buf.data[offset + 1])[0] = cast(ushort)r.maxc;
(cast(ushort *)&buf.data[offset + 1])[1] = cast(ushort)r.maxb;
i = offset + 1 + 4;
buf.data[i .. i + r.maxb] = r.base[0 .. r.maxb];
}
return;
}
default:
assert(0);
}
}
}
/////////////////////////////////////////
// OR the leading character bits into r.
// Limit the character range from 0..7F,
// trymatch() will allow through anything over maxc.
// Return 1 if success, 0 if we can't build a filter or
// if there is no point to one.
int starrchars(Range r, const(ubyte)[] prog)
{ rchar c;
uint maxc;
size_t maxb;
size_t len;
uint b;
uint n;
uint m;
const(ubyte)* pop;
//printf("RegExp.starrchars(prog = %p, progend = %p)\n", prog, progend);
for (size_t i = 0; i < prog.length;)
{
switch (prog[i])
{
case REchar:
c = prog[i + 1];
if (c <= 0x7F)
r.setbit2(c);
return 1;
case REichar:
c = prog[i + 1];
if (c <= 0x7F)
{ r.setbit2(c);
r.setbit2(std.ascii.toLower(cast(rchar)c));
}
return 1;
case REdchar:
case REidchar:
return 1;
case REanychar:
return 0; // no point
case REstring:
len = *cast(size_t *)&prog[i + 1];
assert(len);
c = *cast(rchar *)&prog[i + 1 + size_t.sizeof];
debug(regexp) printf("\tREstring %d, '%c'\n", len, c);
if (c <= 0x7F)
r.setbit2(c);
return 1;
case REistring:
len = *cast(size_t *)&prog[i + 1];
assert(len);
c = *cast(rchar *)&prog[i + 1 + size_t.sizeof];
debug(regexp) printf("\tREistring %d, '%c'\n", len, c);
if (c <= 0x7F)
{ r.setbit2(std.ascii.toUpper(cast(rchar)c));
r.setbit2(std.ascii.toLower(cast(rchar)c));
}
return 1;
case REtestbit:
case REbit:
maxc = (cast(ushort *)&prog[i + 1])[0];
maxb = (cast(ushort *)&prog[i + 1])[1];
if (maxc <= 0x7F)
r.setbitmax(maxc);
else
maxb = r.maxb;
for (b = 0; b < maxb; b++)
r.base[b] |= prog[i + 1 + 4 + b];
return 1;
case REnotbit:
maxc = (cast(ushort *)&prog[i + 1])[0];
maxb = (cast(ushort *)&prog[i + 1])[1];
if (maxc <= 0x7F)
r.setbitmax(maxc);
else
maxb = r.maxb;
for (b = 0; b < maxb; b++)
r.base[b] |= ~prog[i + 1 + 4 + b];
return 1;
case REbol:
case REeol:
return 0;
case REor:
len = (cast(uint *)&prog[i + 1])[0];
return starrchars(r, prog[i + 1 + uint.sizeof .. prog.length]) &&
starrchars(r, prog[i + 1 + uint.sizeof + len .. prog.length]);
case REgoto:
len = (cast(uint *)&prog[i + 1])[0];
i += 1 + uint.sizeof + len;
break;
case REanystar:
return 0;
case REnm:
case REnmq:
// len, n, m, ()
len = (cast(uint *)&prog[i + 1])[0];
n = (cast(uint *)&prog[i + 1])[1];
m = (cast(uint *)&prog[i + 1])[2];
pop = &prog[i + 1 + uint.sizeof * 3];
if (!starrchars(r, pop[0 .. len]))
return 0;
if (n)
return 1;
i += 1 + uint.sizeof * 3 + len;
break;
case REparen:
// len, ()
len = (cast(uint *)&prog[i + 1])[0];
n = (cast(uint *)&prog[i + 1])[1];
pop = &prog[0] + i + 1 + uint.sizeof * 2;
return starrchars(r, pop[0 .. len]);
case REend:
return 0;
case REwordboundary:
case REnotwordboundary:
return 0;
case REdigit:
r.setbitmax('9');
for (c = '0'; c <= '9'; c++)
r.bits[c] = 1;
return 1;
case REnotdigit:
r.setbitmax(0x7F);
for (c = 0; c <= '0'; c++)
r.bits[c] = 1;
for (c = '9' + 1; c <= r.maxc; c++)
r.bits[c] = 1;
return 1;
case REspace:
r.setbitmax(0x7F);
for (c = 0; c <= r.maxc; c++)
if (isWhite(c))
r.bits[c] = 1;
return 1;
case REnotspace:
r.setbitmax(0x7F);
for (c = 0; c <= r.maxc; c++)
if (!isWhite(c))
r.bits[c] = 1;
return 1;
case REword:
r.setbitmax(0x7F);
for (c = 0; c <= r.maxc; c++)
if (isword(cast(rchar)c))
r.bits[c] = 1;
return 1;
case REnotword:
r.setbitmax(0x7F);
for (c = 0; c <= r.maxc; c++)
if (!isword(cast(rchar)c))
r.bits[c] = 1;
return 1;
case REbackref:
return 0;
default:
assert(0);
}
}
return 1;
}
/* ==================== replace ======================= */
/***********************
* After a match is found with test(), this function
* will take the match results and, using the format
* string, generate and return a new string.
*/
public string replace(string format)
{
return replace3(format, input, pmatch[0 .. re_nsub + 1]);
}
// Static version that doesn't require a RegExp object to be created
public static string replace3(string format, string input, regmatch_t[] pmatch)
{
string result;
size_t c2;
sizediff_t rm_so, rm_eo, i;
// printf("replace3(format = '%.*s', input = '%.*s')\n", format.length, format.ptr, input.length, input.ptr);
result.length = format.length;
result.length = 0;
for (size_t f = 0; f < format.length; f++)
{
char c = format[f];
L1:
if (c != '$')
{
result ~= c;
continue;
}
++f;
if (f == format.length)
{
result ~= '$';
break;
}
c = format[f];
switch (c)
{
case '&':
rm_so = pmatch[0].rm_so;
rm_eo = pmatch[0].rm_eo;
goto Lstring;
case '`':
rm_so = 0;
rm_eo = pmatch[0].rm_so;
goto Lstring;
case '\'':
rm_so = pmatch[0].rm_eo;
rm_eo = input.length;
goto Lstring;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
i = c - '0';
if (f + 1 == format.length)
{
if (i == 0)
{
result ~= '$';
result ~= c;
continue;
}
}
else
{
c2 = format[f + 1];
if (c2 >= '0' && c2 <= '9')
{
i = (c - '0') * 10 + (c2 - '0');
f++;
}
if (i == 0)
{
result ~= '$';
result ~= c;
c = cast(char)c2;
goto L1;
}
}
if (i < pmatch.length)
{ rm_so = pmatch[i].rm_so;
rm_eo = pmatch[i].rm_eo;
goto Lstring;
}
break;
Lstring:
if (rm_so != rm_eo)
result ~= input[rm_so .. rm_eo];
break;
default:
result ~= '$';
result ~= c;
break;
}
}
return result;
}
/************************************
* Like replace(char[] format), but uses old style formatting:
<table border=1 cellspacing=0 cellpadding=5>
<th>Format
<th>Description
<tr>
<td><b>&</b>
<td>replace with the match
</tr>
<tr>
<td><b>\</b><i>n</i>
<td>replace with the <i>n</i>th parenthesized match, <i>n</i> is 1..9
</tr>
<tr>
<td><b>\</b><i>c</i>
<td>replace with char <i>c</i>.
</tr>
</table>
*/
public string replaceOld(string format)
{
string result;
//printf("replace: this = %p so = %d, eo = %d\n", this, pmatch[0].rm_so, pmatch[0].rm_eo);
//printf("3input = '%.*s'\n", input.length, input.ptr);
result.length = format.length;
result.length = 0;
for (size_t i; i < format.length; i++)
{
char c = format[i];
switch (c)
{
case '&':
{
auto sss = input[pmatch[0].rm_so .. pmatch[0].rm_eo];
//printf("match = '%.*s'\n", sss.length, sss.ptr);
result ~= sss;
}
break;
case '\\':
if (i + 1 < format.length)
{
c = format[++i];
if (c >= '1' && c <= '9')
{ uint j;
j = c - '0';
if (j <= re_nsub && pmatch[j].rm_so != pmatch[j].rm_eo)
result ~= input[pmatch[j].rm_so .. pmatch[j].rm_eo];
break;
}
}
result ~= c;
break;
default:
result ~= c;
break;
}
}
return result;
}
}
unittest
{ // Created and placed in public domain by Don Clugston
auto m = search("aBC r s", `bc\x20r[\40]s`, "i");
assert(m.pre=="a");
assert(m[0]=="BC r s");
auto m2 = search("7xxyxxx", `^\d([a-z]{2})\D\1`);
assert(m2[0]=="7xxyxx");
// Just check the parsing.
auto m3 = search("dcbxx", `ca|b[\d\]\D\s\S\w-\W]`);
auto m4 = search("xy", `[^\ca-\xFa\r\n\b\f\t\v\0123]{2,485}$`);
auto m5 = search("xxx", `^^\r\n\b{13,}\f{4}\t\v\u02aF3a\w\W`);
auto m6 = search("xxy", `.*y`);
assert(m6[0]=="xxy");
auto m7 = search("QWDEfGH", "(ca|b|defg)+", "i");
assert(m7[0]=="DEfG");
auto m8 = search("dcbxx", `a?\B\s\S`);
auto m9 = search("dcbxx", `[-w]`);
auto m10 = search("dcbsfd", `aB[c-fW]dB|\d|\D|\u012356|\w|\W|\s|\S`, "i");
auto m11 = search("dcbsfd", `[]a-]`);
m.replaceOld(`a&b\1c`);
m.replace(`a$&b$'$1c`);
}
// Andrei
//------------------------------------------------------------------------------
struct Pattern(Char)
{
immutable(Char)[] pattern;
this(immutable(Char)[] pattern)
{
this.pattern = pattern;
}
}
Pattern!(Char) pattern(Char)(immutable(Char)[] pat)
{
return typeof(return)(pat);
}
struct Splitter(Range)
{
Range _input;
size_t _chunkLength;
RegExp _rx;
private Range search()
{
//rx = std.regexp.search(_input, "(" ~ _separator.pattern ~ ")");
auto i = std.regexp.find(cast(string) _input, _rx);
return _input[i >= 0 ? i : _input.length .. _input.length];
}
private void advance()
{
//writeln("(" ~ _separator.pattern ~ ")");
//writeln(_input);
//assert(_rx[0].length > 0);
_chunkLength += _rx[0].length;
}
this(Range input, Pattern!(char) separator)
{
_input = input;
_rx = RegExp(separator.pattern);
_chunkLength = _input.length - search().length;
}
ref auto opSlice()
{
return this;
}
@property Range front()
{
return _input[0 .. _chunkLength];
}
@property bool empty()
{
return _input.empty;
}
void popFront()
{
if (_chunkLength == _input.length)
{
_input = _input[_chunkLength .. _input.length];
return;
}
advance();
_input = _input[_chunkLength .. _input.length];
_chunkLength = _input.length - search().length;
}
}
Splitter!(Range) splitter(Range)(Range r, Pattern!(char) pat)
{
static assert(is(Unqual!(typeof(Range.init[0])) == char),
Unqual!(typeof(Range.init[0])).stringof);
return typeof(return)(cast(string) r, pat);
}
unittest
{
auto s1 = ", abc, de, fg, hi, ";
auto sp2 = splitter(s1, pattern(", *"));
//foreach (e; sp2) writeln("[", e, "]");
assert(equal(sp2, ["", "abc", "de", "fg", "hi"][]));
}
unittest
{
auto str= "foo";
string[] re_strs= [
r"^(h|a|)fo[oas]$",
r"^(a|b|)fo[oas]$",
r"^(a|)foo$",
r"(a|)foo",
r"^(h|)foo$",
r"(h|)foo",
r"(h|a|)fo[oas]",
r"^(a|b|)fo[o]$",
r"[abf][ops](o|oo|)(h|a|)",
r"(h|)[abf][ops](o|oo|)",
r"(c|)[abf][ops](o|oo|)"
];
foreach (re_str; re_strs) {
auto re= new RegExp(re_str);
auto matches= cast(bool)re.test(str);
assert(matches);
//writefln("'%s' matches '%s' ? %s", str, re_str, matches);
}
for (char c='a'; c<='z'; ++c) {
auto re_str= "("~c~"|)foo";
auto re= new RegExp(re_str);
auto matches= cast(bool)re.test(str);
assert(matches);
//writefln("'%s' matches '%s' ? %s", str, re_str, matches);
}
}