mirror of
https://github.com/dlang/phobos.git
synced 2025-04-30 15:10:46 +03:00

std.contracts: added functions pointsTo() std.numeric: minor unittest fixes. std.bitmanip: fixed code bloat issue, reintroduced FloatRep and DoubleRep. std.conv: minor simplification of implementation. std.regexp: added reference to ECMA standard in the documentation. std.getopt: changed return type from bool to void, error is signaled by use of exceptions. std.functional: added unaryFun, binaryFun, adjoin. std.string: updated documentation, changed code to compile with warnings enabled. std.traits: changed FieldTypeTuple; added RepresentationTypeTuple, hasAliasing; fixed bug 1826; added call to flush() from within write; fixed unlisted bug in lines(). std.algorithm: added map, reduce, filter, inPlace, move, swap, overwriteAdjacent, find, findRange, findBoyerMoore, findAdjacent, findAmong, findAmongSorted, canFind, canFindAmong, canFindAmongSorted, count, equal, overlap, min, max, mismatch, EditOp, none, substitute, insert, remove, levenshteinDistance, levenshteinDistanceAndPath, copy, copyIf, iterSwap, swapRanges, reverse, rotate, SwapStrategy, Unstable, Semistable, Stable, eliminate, partition, nthElement, sort, schwartzSort, partialSort, isSorted, makeIndex, schwartzMakeIndex, lowerBound, upperBound, equalRange, canFindSorted. std.thread: fixed so it compiles with warnings enabled. std.file: made getSize() faster under Linux. std.random: fixed so it compiles with warnings enabled; improved function uniform so it deduces type generated from its arguments. std.format: added fixes to make formatting work with const data. std.path: minor documentation changes.
3309 lines
69 KiB
D
3309 lines
69 KiB
D
// Written in the D programming language.
|
|
|
|
// Regular Expressions
|
|
|
|
/*
|
|
* Copyright (C) 2000-2005 by Digital Mars, www.digitalmars.com
|
|
* Written by Walter Bright
|
|
*
|
|
* This software is provided 'as-is', without any express or implied
|
|
* warranty. In no event will the authors be held liable for any damages
|
|
* arising from the use of this software.
|
|
*
|
|
* Permission is granted to anyone to use this software for any purpose,
|
|
* including commercial applications, and to alter it and redistribute it
|
|
* freely, subject to the following restrictions:
|
|
*
|
|
* o The origin of this software must not be misrepresented; you must not
|
|
* claim that you wrote the original software. If you use this software
|
|
* in a product, an acknowledgment in the product documentation would be
|
|
* appreciated but is not required.
|
|
* o Altered source versions must be plainly marked as such, and must not
|
|
* be misrepresented as being the original software.
|
|
* o This notice may not be removed or altered from any source
|
|
* distribution.
|
|
*/
|
|
|
|
/**********************************************
|
|
* $(LINK2 http://www.digitalmars.com/ctg/regular.html, Regular
|
|
* expressions) are a powerful method of string pattern matching. The
|
|
* regular expression language used in this library is the same as
|
|
* that commonly used, however, some of the very advanced forms may
|
|
* behave slightly differently. The standard observed is the $(WEB
|
|
* www.ecma-international.org/publications/standards/Ecma-262.htm,
|
|
* ECMA standard) for regular expressions.
|
|
*
|
|
* std.regexp is designed to work only with valid UTF strings as input.
|
|
* To validate untrusted input, use std.utf.validate().
|
|
*
|
|
* In the following guide, $(I pattern)[] refers to a
|
|
* $(LINK2 http://www.digitalmars.com/ctg/regular.html, regular expression).
|
|
* The $(I attributes)[] refers to
|
|
a string controlling the interpretation
|
|
of the regular expression.
|
|
It consists of a sequence of one or more
|
|
of the following characters:
|
|
|
|
<table border=1 cellspacing=0 cellpadding=5>
|
|
<caption>Attribute Characters</caption>
|
|
$(TR $(TH Attribute) $(TH Action))
|
|
<tr>
|
|
$(TD $(B g))
|
|
$(TD global; repeat over the whole input string)
|
|
</tr>
|
|
<tr>
|
|
$(TD $(B i))
|
|
$(TD case insensitive)
|
|
</tr>
|
|
<tr>
|
|
$(TD $(B m))
|
|
$(TD treat as multiple lines separated by newlines)
|
|
</tr>
|
|
</table>
|
|
*
|
|
* The $(I format)[] string has the formatting characters:
|
|
*
|
|
* <table border=1 cellspacing=0 cellpadding=5>
|
|
<caption>Formatting Characters</caption>
|
|
$(TR $(TH Format) $(TH Replaced With))
|
|
$(TR
|
|
$(TD $(B $$)) $(TD $)
|
|
)
|
|
$(TR
|
|
$(TD $(B $&)) $(TD The matched substring.)
|
|
)
|
|
$(TR
|
|
$(TD $(B $`)) $(TD The portion of string that precedes the matched substring.)
|
|
)
|
|
$(TR
|
|
$(TD $(B $')) $(TD The portion of string that follows the matched substring.)
|
|
)
|
|
$(TR
|
|
$(TD $(B $(DOLLAR))$(I n)) $(TD The $(I n)th capture, where $(I n)
|
|
is a single digit 1-9
|
|
and $$(I n) is not followed by a decimal digit.)
|
|
)
|
|
$(TR
|
|
$(TD $(B $(DOLLAR))$(I nn)) $(TD The $(I nn)th capture, where $(I nn)
|
|
is a two-digit decimal
|
|
number 01-99.
|
|
If $(I nn)th capture is undefined or more than the number
|
|
of parenthesized subexpressions, use the empty
|
|
string instead.)
|
|
)
|
|
</table>
|
|
|
|
* Any other $ are left as is.
|
|
*
|
|
* References:
|
|
* $(LINK2 http://en.wikipedia.org/wiki/Regular_expressions, Wikipedia)
|
|
* Macros:
|
|
* WIKI = StdRegexp
|
|
* DOLLAR = $
|
|
*/
|
|
|
|
/*
|
|
Escape sequences:
|
|
|
|
\nnn starts out a 1, 2 or 3 digit octal sequence,
|
|
where n is an octal digit. If nnn is larger than
|
|
0377, then the 3rd digit is not part of the sequence
|
|
and is not consumed.
|
|
For maximal portability, use exactly 3 digits.
|
|
|
|
\xXX starts out a 1 or 2 digit hex sequence. X
|
|
is a hex character. If the first character after the \x
|
|
is not a hex character, the value of the sequence is 'x'
|
|
and the XX are not consumed.
|
|
For maximal portability, use exactly 2 digits.
|
|
|
|
\uUUUU is a unicode sequence. There are exactly
|
|
4 hex characters after the \u, if any are not, then
|
|
the value of the sequence is 'u', and the UUUU are not
|
|
consumed.
|
|
|
|
Character classes:
|
|
|
|
[a-b], where a is greater than b, will produce
|
|
an error.
|
|
|
|
References:
|
|
|
|
http://www.unicode.org/unicode/reports/tr18/
|
|
*/
|
|
|
|
module std.regexp;
|
|
|
|
//debug = regexp; // uncomment to turn on debugging printf's
|
|
|
|
private
|
|
{
|
|
import std.c.stdio;
|
|
import std.c.stdlib;
|
|
import std.c.string;
|
|
import std.stdio;
|
|
import std.string;
|
|
import std.ctype;
|
|
import std.outbuffer;
|
|
import std.bitmanip;
|
|
import std.utf;
|
|
}
|
|
|
|
/** Regular expression to extract an _email address.
|
|
* References:
|
|
* $(LINK2 http://www.regular-expressions.info/email.html, How to Find or Validate an Email Address)$(BR)
|
|
* $(LINK2 http://tools.ietf.org/html/rfc2822#section-3.4.1, RFC 2822 Internet Message Format)
|
|
*/
|
|
string email =
|
|
r"[a-zA-Z]([.]?([[a-zA-Z0-9_]-]+)*)?@([[a-zA-Z0-9_]\-_]+\.)+[a-zA-Z]{2,6}";
|
|
|
|
/** Regular expression to extract a _url */
|
|
string url = r"(([h|H][t|T]|[f|F])[t|T][p|P]([s|S]?)\:\/\/|~/|/)?([\w]+:\w+@)?(([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?)?((/?\w+/)+|/?)(\w+\.[\w]{3,4})?([,]\w+)*((\?\w+=\w+)?(&\w+=\w+)*([,]\w*)*)?";
|
|
|
|
/************************************
|
|
* One of these gets thrown on compilation errors
|
|
*/
|
|
|
|
class RegExpException : Exception
|
|
{
|
|
this(string msg)
|
|
{
|
|
super(msg);
|
|
}
|
|
}
|
|
|
|
struct regmatch_t
|
|
{
|
|
int rm_so; // index of start of match
|
|
int rm_eo; // index past end of match
|
|
}
|
|
|
|
private alias char rchar; // so we can make a wchar version
|
|
|
|
/******************************************************
|
|
* Search string for matches with regular expression
|
|
* pattern with attributes.
|
|
* Replace each match with string generated from format.
|
|
* Params:
|
|
* s = String to search.
|
|
* pattern = Regular expression pattern.
|
|
* format = Replacement string format.
|
|
* attributes = Regular expression attributes.
|
|
* Returns:
|
|
* the resulting string
|
|
* Example:
|
|
* Replace the letters 'a' with the letters 'ZZ'.
|
|
* ---
|
|
* s = "Strap a rocket engine on a chicken."
|
|
* sub(s, "a", "ZZ") // result: StrZZp a rocket engine on a chicken.
|
|
* sub(s, "a", "ZZ", "g") // result: StrZZp ZZ rocket engine on ZZ chicken.
|
|
* ---
|
|
* The replacement format can reference the matches using
|
|
* the $&, $$, $', $`, $0 .. $99 notation:
|
|
* ---
|
|
* sub(s, "[ar]", "[$&]", "g") // result: St[r][a]p [a] [r]ocket engine on [a] chi
|
|
* ---
|
|
*/
|
|
|
|
string sub(string s, string pattern, string format, string attributes = null)
|
|
{
|
|
auto r = new RegExp(pattern, attributes);
|
|
auto result = r.replace(s, format);
|
|
delete r;
|
|
return result;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.sub.unittest\n");
|
|
|
|
string r = sub("hello", "ll", "ss");
|
|
assert(r == "hesso");
|
|
}
|
|
|
|
/*******************************************************
|
|
* Search string for matches with regular expression
|
|
* pattern with attributes.
|
|
* Pass each match to delegate dg.
|
|
* Replace each match with the return value from dg.
|
|
* Params:
|
|
* s = String to search.
|
|
* pattern = Regular expression pattern.
|
|
* dg = Delegate
|
|
* attributes = Regular expression attributes.
|
|
* Returns: the resulting string.
|
|
* Example:
|
|
* Capitalize the letters 'a' and 'r':
|
|
* ---
|
|
* s = "Strap a rocket engine on a chicken.";
|
|
* sub(s, "[ar]",
|
|
* delegate char[] (RegExp m)
|
|
* {
|
|
* return toupper(m[0]);
|
|
* },
|
|
* "g"); // result: StRAp A Rocket engine on A chicken.
|
|
* ---
|
|
*/
|
|
|
|
string sub(string s, string pattern, string delegate(RegExp) dg, string attributes = null)
|
|
{
|
|
auto r = new RegExp(pattern, attributes);
|
|
string result;
|
|
int lastindex;
|
|
int offset;
|
|
|
|
result = s;
|
|
lastindex = 0;
|
|
offset = 0;
|
|
while (r.test(s, lastindex))
|
|
{
|
|
int so = r.pmatch[0].rm_so;
|
|
int eo = r.pmatch[0].rm_eo;
|
|
|
|
string replacement = dg(r);
|
|
|
|
// Optimize by using std.string.replace if possible - Dave Fladebo
|
|
string slice = result[offset + so .. offset + eo];
|
|
if (r.attributes & RegExp.REA.global && // global, so replace all
|
|
!(r.attributes & RegExp.REA.ignoreCase) && // not ignoring case
|
|
!(r.attributes & RegExp.REA.multiline) && // not multiline
|
|
pattern == slice) // simple pattern (exact match, no special characters)
|
|
{
|
|
debug(regexp)
|
|
printf("pattern: %.*s, slice: %.*s, replacement: %.*s\n",pattern,result[offset + so .. offset + eo],replacement);
|
|
result = std.string.replace(result,slice,replacement);
|
|
break;
|
|
}
|
|
|
|
result = replaceSlice(result, result[offset + so .. offset + eo], replacement);
|
|
|
|
if (r.attributes & RegExp.REA.global)
|
|
{
|
|
offset += replacement.length - (eo - so);
|
|
|
|
if (lastindex == eo)
|
|
lastindex++; // always consume some source
|
|
else
|
|
lastindex = eo;
|
|
}
|
|
else
|
|
break;
|
|
}
|
|
delete r;
|
|
|
|
return result;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.sub.unittest\n");
|
|
|
|
string foo(RegExp r) { return "ss"; }
|
|
|
|
auto r = sub("hello", "ll", delegate string(RegExp r) { return "ss"; });
|
|
assert(r == "hesso");
|
|
|
|
r = sub("hello", "l", delegate string(RegExp r) { return "l"; }, "g");
|
|
assert(r == "hello");
|
|
|
|
auto s = sub("Strap a rocket engine on a chicken.",
|
|
"[ar]",
|
|
delegate string (RegExp m)
|
|
{
|
|
return std.string.toupper(m[0]);
|
|
},
|
|
"g");
|
|
assert(s == "StRAp A Rocket engine on A chicken.");
|
|
}
|
|
|
|
|
|
/*************************************************
|
|
* Search $(D_PARAM s[]) for first match with $(D_PARAM pattern).
|
|
* Params:
|
|
* s = String to search.
|
|
* pattern = Regular expression pattern.
|
|
* Returns:
|
|
* index into s[] of match if found, -1 if no match.
|
|
* Example:
|
|
* ---
|
|
* auto s = "abcabcabab";
|
|
* find(s, RegExp("b")); // match, returns 1
|
|
* find(s, RegExp("f")); // no match, returns -1
|
|
* ---
|
|
*/
|
|
|
|
int find(string s, RegExp pattern)
|
|
{
|
|
return pattern.test(s)
|
|
? pattern.pmatch[0].rm_so
|
|
: -1;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.find.unittest\n");
|
|
|
|
auto i = find("xabcy", RegExp("abc"));
|
|
assert(i == 1);
|
|
i = find("cba", RegExp("abc"));
|
|
assert(i == -1);
|
|
}
|
|
|
|
/**
|
|
Returns:
|
|
|
|
Same as $(D_PARAM find(s, RegExp(pattern, attributes))).
|
|
|
|
WARNING:
|
|
|
|
This function is scheduled for deprecation due to unnecessary
|
|
ambiguity with the homonym function in std.string. Instead of
|
|
$(D_PARAM std.regexp.find(s, p, a)), you may want to use $(D_PARAM
|
|
find(s, RegExp(p, a))).
|
|
*/
|
|
|
|
int find(string s, string pattern, string attributes = null)
|
|
{
|
|
int i = -1;
|
|
|
|
auto r = new RegExp(pattern, attributes);
|
|
if (r.test(s))
|
|
{
|
|
i = r.pmatch[0].rm_so;
|
|
}
|
|
delete r;
|
|
return i;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.find.unittest\n");
|
|
|
|
auto i = find("xabcy", "abc");
|
|
assert(i == 1);
|
|
i = find("cba", "abc");
|
|
assert(i == -1);
|
|
}
|
|
|
|
/*************************************************
|
|
* Search $(D_PARAM s[]) for last match with $(D_PARAM pattern).
|
|
* Params:
|
|
* s = String to search.
|
|
* pattern = Regular expression pattern.
|
|
* Returns:
|
|
* index into s[] of match if found, -1 if no match.
|
|
* Example:
|
|
* ---
|
|
* auto s = "abcabcabab";
|
|
* rfind(s, RegExp("b")); // match, returns 9
|
|
* rfind(s, RegExp("f")); // no match, returns -1
|
|
* ---
|
|
*/
|
|
|
|
int rfind(string s, RegExp pattern)
|
|
{
|
|
int i = -1;
|
|
int lastindex = 0;
|
|
|
|
while (pattern.test(s, lastindex))
|
|
{ int eo = pattern.pmatch[0].rm_eo;
|
|
i = pattern.pmatch[0].rm_so;
|
|
if (lastindex == eo)
|
|
lastindex++; // always consume some source
|
|
else
|
|
lastindex = eo;
|
|
}
|
|
return i;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
int i;
|
|
|
|
debug(regexp) printf("regexp.rfind.unittest\n");
|
|
i = rfind("abcdefcdef", RegExp("c"));
|
|
assert(i == 6);
|
|
i = rfind("abcdefcdef", RegExp("cd"));
|
|
assert(i == 6);
|
|
i = rfind("abcdefcdef", RegExp("x"));
|
|
assert(i == -1);
|
|
i = rfind("abcdefcdef", RegExp("xy"));
|
|
assert(i == -1);
|
|
i = rfind("abcdefcdef", RegExp(""));
|
|
assert(i == 10);
|
|
}
|
|
|
|
/*************************************************
|
|
Returns:
|
|
|
|
Same as $(D_PARAM rfind(s, RegExp(pattern, attributes))).
|
|
|
|
WARNING:
|
|
|
|
This function is scheduled for deprecation due to unnecessary
|
|
ambiguity with the homonym function in std.string. Instead of
|
|
$(D_PARAM std.regexp.rfind(s, p, a)), you may want to use $(D_PARAM
|
|
rfind(s, RegExp(p, a))).
|
|
*/
|
|
|
|
int rfind(string s, string pattern, string attributes = null)
|
|
{
|
|
int i = -1;
|
|
int lastindex = 0;
|
|
|
|
auto r = new RegExp(pattern, attributes);
|
|
while (r.test(s, lastindex))
|
|
{ int eo = r.pmatch[0].rm_eo;
|
|
i = r.pmatch[0].rm_so;
|
|
if (lastindex == eo)
|
|
lastindex++; // always consume some source
|
|
else
|
|
lastindex = eo;
|
|
}
|
|
delete r;
|
|
return i;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
int i;
|
|
|
|
debug(regexp) printf("regexp.rfind.unittest\n");
|
|
i = rfind("abcdefcdef", "c");
|
|
assert(i == 6);
|
|
i = rfind("abcdefcdef", "cd");
|
|
assert(i == 6);
|
|
i = rfind("abcdefcdef", "x");
|
|
assert(i == -1);
|
|
i = rfind("abcdefcdef", "xy");
|
|
assert(i == -1);
|
|
i = rfind("abcdefcdef", "");
|
|
assert(i == 10);
|
|
}
|
|
|
|
|
|
/********************************************
|
|
* Split s[] into an array of strings, using the regular
|
|
* expression $(D_PARAM pattern) as the separator.
|
|
* Params:
|
|
* s = String to search.
|
|
* pattern = Regular expression pattern.
|
|
* Returns:
|
|
* array of slices into s[]
|
|
* Example:
|
|
* ---
|
|
* foreach (s; split("abcabcabab", RegExp("C.", "i")))
|
|
* {
|
|
* writefln("s = '%s'", s);
|
|
* }
|
|
* // Prints:
|
|
* // s = 'ab'
|
|
* // s = 'b'
|
|
* // s = 'bab'
|
|
* ---
|
|
*/
|
|
|
|
string[] split(string s, RegExp pattern)
|
|
{
|
|
return pattern.split(s);
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.split.unittest()\n");
|
|
string[] result;
|
|
|
|
result = split("ab", RegExp("a*"));
|
|
assert(result.length == 2);
|
|
assert(result[0] == "");
|
|
assert(result[1] == "b");
|
|
|
|
foreach (i, s; split("abcabcabab", RegExp("C.", "i")))
|
|
{
|
|
//writefln("s[%d] = '%s'", i, s);
|
|
if (i == 0) assert(s == "ab");
|
|
else if (i == 1) assert(s == "b");
|
|
else if (i == 2) assert(s == "bab");
|
|
else assert(0);
|
|
}
|
|
}
|
|
|
|
/********************************************
|
|
Returns:
|
|
Same as $(D_PARAM split(s, RegExp(pattern, attributes))).
|
|
|
|
WARNING:
|
|
|
|
This function is scheduled for deprecation due to unnecessary
|
|
ambiguity with the homonym function in std.string. Instead of
|
|
$(D_PARAM std.regexp.split(s, p, a)), you may want to use $(D_PARAM
|
|
split(s, RegExp(p, a))).
|
|
*/
|
|
|
|
string[] split(string s, string pattern, string attributes = null)
|
|
{
|
|
auto r = new RegExp(pattern, attributes);
|
|
auto result = r.split(s);
|
|
delete r;
|
|
return result;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.split.unittest()\n");
|
|
string[] result;
|
|
|
|
result = split("ab", "a*");
|
|
assert(result.length == 2);
|
|
assert(result[0] == "");
|
|
assert(result[1] == "b");
|
|
|
|
foreach (i, s; split("abcabcabab", "C.", "i"))
|
|
{
|
|
//writefln("s[%d] = '%s'", i, s);
|
|
if (i == 0) assert(s == "ab");
|
|
else if (i == 1) assert(s == "b");
|
|
else if (i == 2) assert(s == "bab");
|
|
else assert(0);
|
|
}
|
|
}
|
|
|
|
/****************************************************
|
|
* Search s[] for first match with pattern[] with attributes[].
|
|
* Params:
|
|
* s = String to search.
|
|
* pattern = Regular expression pattern.
|
|
* attributes = Regular expression attributes.
|
|
* Returns:
|
|
* corresponding RegExp if found, null if not.
|
|
* Example:
|
|
* ---
|
|
* import std.stdio;
|
|
* import std.regexp;
|
|
*
|
|
* void main()
|
|
* {
|
|
* if (auto m = std.regexp.search("abcdef", "c"))
|
|
* {
|
|
* writefln("%s[%s]%s", m.pre, m[0], m.post);
|
|
* }
|
|
* }
|
|
* // Prints:
|
|
* // ab[c]def
|
|
* ---
|
|
*/
|
|
|
|
RegExp search(string s, string pattern, string attributes = null)
|
|
{
|
|
auto r = new RegExp(pattern, attributes);
|
|
if (!r.test(s))
|
|
{ delete r;
|
|
assert(r is null);
|
|
}
|
|
return r;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.string.unittest()\n");
|
|
|
|
if (auto m = std.regexp.search("abcdef", "c()"))
|
|
{
|
|
auto result = std.string.format("%s[%s]%s", m.pre, m[0], m.post);
|
|
assert(result == "ab[c]def");
|
|
assert(m[1] == null);
|
|
assert(m[2] == null);
|
|
}
|
|
else
|
|
assert(0);
|
|
|
|
if (auto n = std.regexp.search("abcdef", "g"))
|
|
{
|
|
assert(0);
|
|
}
|
|
}
|
|
|
|
|
|
/* ********************************* RegExp ******************************** */
|
|
|
|
/*****************************
|
|
* RegExp is a class to handle regular expressions.
|
|
*
|
|
* It is the core foundation for adding powerful string pattern matching
|
|
* capabilities to programs like grep, text editors, awk, sed, etc.
|
|
*/
|
|
class RegExp
|
|
{
|
|
/*****
|
|
* Construct a RegExp object. Compile pattern
|
|
* with <i>attributes</i> into
|
|
* an internal form for fast execution.
|
|
* Params:
|
|
* pattern = regular expression
|
|
* attributes = _attributes
|
|
* Throws: RegExpException if there are any compilation errors.
|
|
* Example:
|
|
* Declare two variables and assign to them a RegExp object:
|
|
* ---
|
|
* auto r = new RegExp("pattern");
|
|
* auto s = new RegExp(r"p[1-5]\s*");
|
|
* ---
|
|
*/
|
|
public this(string pattern, string attributes = null)
|
|
{
|
|
pmatch = (&gmatch)[0 .. 1];
|
|
compile(pattern, attributes);
|
|
}
|
|
|
|
/*****
|
|
* Generate instance of RegExp.
|
|
* Params:
|
|
* pattern = regular expression
|
|
* attributes = _attributes
|
|
* Throws: RegExpException if there are any compilation errors.
|
|
* Example:
|
|
* Declare two variables and assign to them a RegExp object:
|
|
* ---
|
|
* auto r = RegExp("pattern");
|
|
* auto s = RegExp(r"p[1-5]\s*");
|
|
* ---
|
|
*/
|
|
public static RegExp opCall(string pattern, string attributes = null)
|
|
{
|
|
return new RegExp(pattern, attributes);
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.opCall.unittest()\n");
|
|
auto r1 = RegExp("hello", "m");
|
|
string msg;
|
|
try
|
|
{
|
|
auto r2 = RegExp("hello", "q");
|
|
assert(0);
|
|
}
|
|
catch (RegExpException ree)
|
|
{
|
|
msg = ree.toString();
|
|
//writefln("message: %s", ree);
|
|
}
|
|
assert(msg == "unrecognized attribute");
|
|
}
|
|
|
|
/************************************
|
|
* Set up for start of foreach loop.
|
|
* Returns:
|
|
* search() returns instance of RegExp set up to _search string[].
|
|
* Example:
|
|
* ---
|
|
* import std.stdio;
|
|
* import std.regexp;
|
|
*
|
|
* void main()
|
|
* {
|
|
* foreach(m; RegExp("ab").search("abcabcabab"))
|
|
* {
|
|
* writefln("%s[%s]%s", m.pre, m[0], m.post);
|
|
* }
|
|
* }
|
|
* // Prints:
|
|
* // [ab]cabcabab
|
|
* // abc[ab]cabab
|
|
* // abcabc[ab]ab
|
|
* // abcabcab[ab]
|
|
* ---
|
|
*/
|
|
|
|
public RegExp search(string string)
|
|
{
|
|
input = string;
|
|
pmatch[0].rm_eo = 0;
|
|
return this;
|
|
}
|
|
|
|
/** ditto */
|
|
public int opApply(int delegate(inout RegExp) dg)
|
|
{
|
|
int result;
|
|
RegExp r = this;
|
|
|
|
while (test())
|
|
{
|
|
result = dg(r);
|
|
if (result)
|
|
break;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.search.unittest()\n");
|
|
|
|
int i;
|
|
foreach(m; RegExp("ab").search("abcabcabab"))
|
|
{
|
|
auto s = std.string.format("%s[%s]%s", m.pre, m[0], m.post);
|
|
if (i == 0) assert(s == "[ab]cabcabab");
|
|
else if (i == 1) assert(s == "abc[ab]cabab");
|
|
else if (i == 2) assert(s == "abcabc[ab]ab");
|
|
else if (i == 3) assert(s == "abcabcab[ab]");
|
|
else assert(0);
|
|
i++;
|
|
}
|
|
}
|
|
|
|
/******************
|
|
* Retrieve match n.
|
|
*
|
|
* n==0 means the matched substring, n>0 means the
|
|
* n'th parenthesized subexpression.
|
|
* if n is larger than the number of parenthesized subexpressions,
|
|
* null is returned.
|
|
*/
|
|
public string opIndex(size_t n)
|
|
{
|
|
if (n >= pmatch.length)
|
|
return null;
|
|
else
|
|
{ size_t rm_so, rm_eo;
|
|
rm_so = pmatch[n].rm_so;
|
|
rm_eo = pmatch[n].rm_eo;
|
|
if (rm_so == rm_eo)
|
|
return null;
|
|
return input[rm_so .. rm_eo];
|
|
}
|
|
}
|
|
|
|
/**
|
|
Same as $(D_PARAM opIndex(n)).
|
|
|
|
WARNING:
|
|
|
|
Scheduled for deprecation due to confusion with overloaded
|
|
$(D_PARAM match(string)). Instead of $(D_PARAM regex.match(n))
|
|
you may want to use $(D_PARAM regex[n]).
|
|
*/
|
|
public string match(size_t n)
|
|
{
|
|
return this[n];
|
|
}
|
|
|
|
/*******************
|
|
* Return the slice of the input that precedes the matched substring.
|
|
*/
|
|
public string pre()
|
|
{
|
|
return input[0 .. pmatch[0].rm_so];
|
|
}
|
|
|
|
/*******************
|
|
* Return the slice of the input that follows the matched substring.
|
|
*/
|
|
public string post()
|
|
{
|
|
return input[pmatch[0].rm_eo .. $];
|
|
}
|
|
|
|
uint re_nsub; // number of parenthesized subexpression matches
|
|
regmatch_t[] pmatch; // array [re_nsub + 1]
|
|
|
|
string input; // the string to search
|
|
|
|
// per instance:
|
|
|
|
string pattern; // source text of the regular expression
|
|
|
|
string flags; // source text of the attributes parameter
|
|
|
|
int errors;
|
|
|
|
uint attributes;
|
|
|
|
enum REA
|
|
{
|
|
global = 1, // has the g attribute
|
|
ignoreCase = 2, // has the i attribute
|
|
multiline = 4, // if treat as multiple lines separated
|
|
// by newlines, or as a single line
|
|
dotmatchlf = 8, // if . matches \n
|
|
}
|
|
|
|
|
|
private:
|
|
size_t src; // current source index in input[]
|
|
size_t src_start; // starting index for match in input[]
|
|
size_t p; // position of parser in pattern[]
|
|
regmatch_t gmatch; // match for the entire regular expression
|
|
// (serves as storage for pmatch[0])
|
|
|
|
const(ubyte)[] program; // pattern[] compiled into regular expression program
|
|
OutBuffer buf;
|
|
|
|
|
|
|
|
|
|
/******************************************/
|
|
|
|
// Opcodes
|
|
|
|
enum : ubyte
|
|
{
|
|
REend, // end of program
|
|
REchar, // single character
|
|
REichar, // single character, case insensitive
|
|
REdchar, // single UCS character
|
|
REidchar, // single wide character, case insensitive
|
|
REanychar, // any character
|
|
REanystar, // ".*"
|
|
REstring, // string of characters
|
|
REistring, // string of characters, case insensitive
|
|
REtestbit, // any in bitmap, non-consuming
|
|
REbit, // any in the bit map
|
|
REnotbit, // any not in the bit map
|
|
RErange, // any in the string
|
|
REnotrange, // any not in the string
|
|
REor, // a | b
|
|
REplus, // 1 or more
|
|
REstar, // 0 or more
|
|
REquest, // 0 or 1
|
|
REnm, // n..m
|
|
REnmq, // n..m, non-greedy version
|
|
REbol, // beginning of line
|
|
REeol, // end of line
|
|
REparen, // parenthesized subexpression
|
|
REgoto, // goto offset
|
|
|
|
REwordboundary,
|
|
REnotwordboundary,
|
|
REdigit,
|
|
REnotdigit,
|
|
REspace,
|
|
REnotspace,
|
|
REword,
|
|
REnotword,
|
|
REbackref,
|
|
};
|
|
|
|
// BUG: should this include '$'?
|
|
private int isword(dchar c) { return isalnum(c) || c == '_'; }
|
|
|
|
private uint inf = ~0u;
|
|
|
|
/* ********************************
|
|
* Throws RegExpException on error
|
|
*/
|
|
|
|
public void compile(string pattern, string attributes)
|
|
{
|
|
//printf("RegExp.compile('%.*s', '%.*s')\n", pattern, attributes);
|
|
|
|
this.attributes = 0;
|
|
foreach (rchar c; attributes)
|
|
{ REA att;
|
|
|
|
switch (c)
|
|
{
|
|
case 'g': att = REA.global; break;
|
|
case 'i': att = REA.ignoreCase; break;
|
|
case 'm': att = REA.multiline; break;
|
|
default:
|
|
error("unrecognized attribute");
|
|
return;
|
|
}
|
|
if (this.attributes & att)
|
|
{ error("redundant attribute");
|
|
return;
|
|
}
|
|
this.attributes |= att;
|
|
}
|
|
|
|
input = null;
|
|
|
|
this.pattern = pattern;
|
|
this.flags = attributes;
|
|
|
|
uint oldre_nsub = re_nsub;
|
|
re_nsub = 0;
|
|
errors = 0;
|
|
|
|
buf = new OutBuffer();
|
|
buf.reserve(pattern.length * 8);
|
|
p = 0;
|
|
parseRegexp();
|
|
if (p < pattern.length)
|
|
{ error("unmatched ')'");
|
|
}
|
|
optimize();
|
|
program = buf.data;
|
|
buf.data = null;
|
|
delete buf;
|
|
|
|
if (re_nsub > oldre_nsub)
|
|
{
|
|
if (pmatch.ptr is &gmatch)
|
|
pmatch = null;
|
|
pmatch.length = re_nsub + 1;
|
|
}
|
|
pmatch[0].rm_so = 0;
|
|
pmatch[0].rm_eo = 0;
|
|
}
|
|
|
|
/********************************************
|
|
* Split s[] into an array of strings, using the regular
|
|
* expression as the separator.
|
|
* Returns:
|
|
* array of slices into s[]
|
|
*/
|
|
|
|
public string[] split(string s)
|
|
{
|
|
debug(regexp) printf("regexp.split()\n");
|
|
|
|
string[] result;
|
|
|
|
if (s.length)
|
|
{
|
|
int p = 0;
|
|
int q;
|
|
for (q = p; q != s.length;)
|
|
{
|
|
if (test(s, q))
|
|
{ int e;
|
|
|
|
q = pmatch[0].rm_so;
|
|
e = pmatch[0].rm_eo;
|
|
if (e != p)
|
|
{
|
|
result ~= s[p .. q];
|
|
for (int i = 1; i < pmatch.length; i++)
|
|
{
|
|
int so = pmatch[i].rm_so;
|
|
int eo = pmatch[i].rm_eo;
|
|
if (so == eo)
|
|
{ so = 0; // -1 gives array bounds error
|
|
eo = 0;
|
|
}
|
|
result ~= s[so .. eo];
|
|
}
|
|
q = p = e;
|
|
continue;
|
|
}
|
|
}
|
|
q++;
|
|
}
|
|
result ~= s[p .. s.length];
|
|
}
|
|
else if (!test(s))
|
|
result ~= s;
|
|
return result;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.split.unittest()\n");
|
|
|
|
auto r = new RegExp("a*?", null);
|
|
string[] result;
|
|
string j;
|
|
int i;
|
|
|
|
result = r.split("ab");
|
|
|
|
assert(result.length == 2);
|
|
i = std.string.cmp(result[0], "a");
|
|
assert(i == 0);
|
|
i = std.string.cmp(result[1], "b");
|
|
assert(i == 0);
|
|
|
|
r = new RegExp("a*", null);
|
|
result = r.split("ab");
|
|
assert(result.length == 2);
|
|
i = std.string.cmp(result[0], "");
|
|
assert(i == 0);
|
|
i = std.string.cmp(result[1], "b");
|
|
assert(i == 0);
|
|
|
|
r = new RegExp("<(\\/)?([^<>]+)>", null);
|
|
result = r.split("a<b>font</b>bar<TAG>hello</TAG>");
|
|
|
|
for (i = 0; i < result.length; i++)
|
|
{
|
|
//debug(regexp) printf("result[%d] = '%.*s'\n", i, result[i]);
|
|
}
|
|
|
|
j = join(result, ",");
|
|
//printf("j = '%.*s'\n", j);
|
|
i = std.string.cmp(j, "a,,b,font,/,b,bar,,TAG,hello,/,TAG,");
|
|
assert(i == 0);
|
|
|
|
r = new RegExp("a[bc]", null);
|
|
result = r.match("123ab");
|
|
j = join(result, ",");
|
|
i = std.string.cmp(j, "ab");
|
|
assert(i == 0);
|
|
|
|
result = r.match("ac");
|
|
j = join(result, ",");
|
|
i = std.string.cmp(j, "ac");
|
|
assert(i == 0);
|
|
}
|
|
|
|
/*************************************************
|
|
* Search string[] for match with regular expression.
|
|
* Returns:
|
|
* index of match if successful, -1 if not found
|
|
*/
|
|
|
|
public int find(string string)
|
|
{
|
|
int i;
|
|
|
|
i = test(string);
|
|
if (i)
|
|
i = pmatch[0].rm_so;
|
|
else
|
|
i = -1; // no match
|
|
return i;
|
|
}
|
|
|
|
//deprecated alias find search;
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.find.unittest()\n");
|
|
|
|
int i;
|
|
RegExp r = new RegExp("abc", null);
|
|
i = r.find("xabcy");
|
|
assert(i == 1);
|
|
i = r.find("cba");
|
|
assert(i == -1);
|
|
}
|
|
|
|
|
|
/*************************************************
|
|
* Search s[] for match.
|
|
* Returns:
|
|
* If global attribute, return same value as exec(s).
|
|
* If not global attribute, return array of all matches.
|
|
*/
|
|
|
|
public string[] match(string s)
|
|
{
|
|
string[] result;
|
|
|
|
if (attributes & REA.global)
|
|
{
|
|
int lastindex = 0;
|
|
|
|
while (test(s, lastindex))
|
|
{ int eo = pmatch[0].rm_eo;
|
|
|
|
result ~= input[pmatch[0].rm_so .. eo];
|
|
if (lastindex == eo)
|
|
lastindex++; // always consume some source
|
|
else
|
|
lastindex = eo;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
result = exec(s);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.match.unittest()\n");
|
|
|
|
int i;
|
|
string[] result;
|
|
string j;
|
|
RegExp r;
|
|
|
|
r = new RegExp("a[bc]", null);
|
|
result = r.match("1ab2ac3");
|
|
j = join(result, ",");
|
|
i = std.string.cmp(j, "ab");
|
|
assert(i == 0);
|
|
|
|
r = new RegExp("a[bc]", "g");
|
|
result = r.match("1ab2ac3");
|
|
j = join(result, ",");
|
|
i = std.string.cmp(j, "ab,ac");
|
|
assert(i == 0);
|
|
}
|
|
|
|
|
|
/*************************************************
|
|
* Find regular expression matches in s[]. Replace those matches
|
|
* with a new string composed of format[] merged with the result of the
|
|
* matches.
|
|
* If global, replace all matches. Otherwise, replace first match.
|
|
* Returns: the new string
|
|
*/
|
|
|
|
public string replace(string s, string format)
|
|
{
|
|
string result;
|
|
int lastindex;
|
|
int offset;
|
|
|
|
result = s;
|
|
lastindex = 0;
|
|
offset = 0;
|
|
for (;;)
|
|
{
|
|
if (!test(s, lastindex))
|
|
break;
|
|
|
|
int so = pmatch[0].rm_so;
|
|
int eo = pmatch[0].rm_eo;
|
|
|
|
string replacement = replace(format);
|
|
|
|
// Optimize by using std.string.replace if possible - Dave Fladebo
|
|
string slice = result[offset + so .. offset + eo];
|
|
if (attributes & REA.global && // global, so replace all
|
|
!(attributes & REA.ignoreCase) && // not ignoring case
|
|
!(attributes & REA.multiline) && // not multiline
|
|
pattern == slice && // simple pattern (exact match, no special characters)
|
|
format == replacement) // simple format, not $ formats
|
|
{
|
|
debug(regexp)
|
|
printf("pattern: %.*s, slice: %.*s, format: %.*s, replacement: %.*s\n",pattern,result[offset + so .. offset + eo],format,replacement);
|
|
result = std.string.replace(result,slice,replacement);
|
|
break;
|
|
}
|
|
|
|
result = replaceSlice(result, result[offset + so .. offset + eo], replacement);
|
|
|
|
if (attributes & REA.global)
|
|
{
|
|
offset += replacement.length - (eo - so);
|
|
|
|
if (lastindex == eo)
|
|
lastindex++; // always consume some source
|
|
else
|
|
lastindex = eo;
|
|
}
|
|
else
|
|
break;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(regexp) printf("regexp.replace.unittest()\n");
|
|
|
|
int i;
|
|
string result;
|
|
RegExp r;
|
|
|
|
r = new RegExp("a[bc]", "g");
|
|
result = r.replace("1ab2ac3", "x$&y");
|
|
i = std.string.cmp(result, "1xaby2xacy3");
|
|
assert(i == 0);
|
|
|
|
r = new RegExp("ab", "g");
|
|
result = r.replace("1ab2ac3", "xy");
|
|
i = std.string.cmp(result, "1xy2ac3");
|
|
assert(i == 0);
|
|
}
|
|
|
|
|
|
/*************************************************
|
|
* Search string[] for match.
|
|
* Returns:
|
|
* array of slices into string[] representing matches
|
|
*/
|
|
|
|
public string[] exec(string string)
|
|
{
|
|
debug(regexp) printf("regexp.exec(string = '%.*s')\n", string);
|
|
input = string;
|
|
pmatch[0].rm_so = 0;
|
|
pmatch[0].rm_eo = 0;
|
|
return exec();
|
|
}
|
|
|
|
/*************************************************
|
|
* Pick up where last exec(string) or exec() left off,
|
|
* searching string[] for next match.
|
|
* Returns:
|
|
* array of slices into string[] representing matches
|
|
*/
|
|
|
|
public string[] exec()
|
|
{
|
|
if (!test())
|
|
return null;
|
|
|
|
auto result = new string[pmatch.length];
|
|
for (int i = 0; i < pmatch.length; i++)
|
|
{
|
|
if (pmatch[i].rm_so == pmatch[i].rm_eo)
|
|
result[i] = null;
|
|
else
|
|
result[i] = input[pmatch[i].rm_so .. pmatch[i].rm_eo];
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/************************************************
|
|
* Search s[] for match.
|
|
* Returns: 0 for no match, !=0 for match
|
|
* Example:
|
|
---
|
|
import std.stdio;
|
|
import std.regexp;
|
|
import std.string;
|
|
|
|
int grep(int delegate(char[]) pred, char[][] list)
|
|
{
|
|
int count;
|
|
foreach (s; list)
|
|
{ if (pred(s))
|
|
++count;
|
|
}
|
|
return count;
|
|
}
|
|
|
|
void main()
|
|
{
|
|
auto x = grep(&RegExp("[Ff]oo").test,
|
|
std.string.split("mary had a foo lamb"));
|
|
writefln(x);
|
|
}
|
|
---
|
|
* which prints: 1
|
|
*/
|
|
|
|
public int test(string s)
|
|
{
|
|
return test(s, 0 /*pmatch[0].rm_eo*/);
|
|
}
|
|
|
|
/************************************************
|
|
* Pick up where last test(string) or test() left off, and search again.
|
|
* Returns: 0 for no match, !=0 for match
|
|
*/
|
|
|
|
public int test()
|
|
{
|
|
return test(input, pmatch[0].rm_eo);
|
|
}
|
|
|
|
/************************************************
|
|
* Test s[] starting at startindex against regular expression.
|
|
* Returns: 0 for no match, !=0 for match
|
|
*/
|
|
|
|
public int test(string s, int startindex)
|
|
{
|
|
char firstc;
|
|
uint si;
|
|
|
|
input = s;
|
|
debug (regexp) printf("RegExp.test(input[] = '%.*s', startindex = %d)\n", input, startindex);
|
|
pmatch[0].rm_so = 0;
|
|
pmatch[0].rm_eo = 0;
|
|
if (startindex < 0 || startindex > input.length)
|
|
{
|
|
return 0; // fail
|
|
}
|
|
//debug(regexp) printProgram(program);
|
|
|
|
// First character optimization
|
|
firstc = 0;
|
|
if (program[0] == REchar)
|
|
{
|
|
firstc = program[1];
|
|
if (attributes & REA.ignoreCase && isalpha(firstc))
|
|
firstc = 0;
|
|
}
|
|
|
|
for (si = startindex; ; si++)
|
|
{
|
|
if (firstc)
|
|
{
|
|
if (si == input.length)
|
|
break; // no match
|
|
if (input[si] != firstc)
|
|
{
|
|
si++;
|
|
if (!chr(si, firstc)) // if first character not found
|
|
break; // no match
|
|
}
|
|
}
|
|
for (int i = 0; i < re_nsub + 1; i++)
|
|
{
|
|
pmatch[i].rm_so = -1;
|
|
pmatch[i].rm_eo = -1;
|
|
}
|
|
src_start = src = si;
|
|
if (trymatch(0, program.length))
|
|
{
|
|
pmatch[0].rm_so = si;
|
|
pmatch[0].rm_eo = src;
|
|
//debug(regexp) printf("start = %d, end = %d\n", gmatch.rm_so, gmatch.rm_eo);
|
|
return 1;
|
|
}
|
|
// If possible match must start at beginning, we are done
|
|
if (program[0] == REbol || program[0] == REanystar)
|
|
{
|
|
if (attributes & REA.multiline)
|
|
{
|
|
// Scan for the next \n
|
|
if (!chr(si, '\n'))
|
|
break; // no match if '\n' not found
|
|
}
|
|
else
|
|
break;
|
|
}
|
|
if (si == input.length)
|
|
break;
|
|
//debug(regexp) printf("Starting new try: '%.*s'\n", input[si + 1 .. input.length]);
|
|
}
|
|
return 0; // no match
|
|
}
|
|
|
|
/**
|
|
Returns whether string $(D_PARAM s) matches $(D_PARAM this).
|
|
*/
|
|
alias test opEquals;
|
|
// int opEquals(string s)
|
|
// {
|
|
// return test(s);
|
|
// }
|
|
|
|
unittest
|
|
{
|
|
assert("abc" == RegExp(".b."));
|
|
assert("abc" != RegExp(".b.."));
|
|
}
|
|
|
|
int chr(inout uint si, rchar c)
|
|
{
|
|
for (; si < input.length; si++)
|
|
{
|
|
if (input[si] == c)
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
void printProgram(const(ubyte)[] prog)
|
|
{
|
|
//debug(regexp)
|
|
{
|
|
uint pc;
|
|
uint len;
|
|
uint n;
|
|
uint m;
|
|
ushort *pu;
|
|
uint *puint;
|
|
|
|
printf("printProgram()\n");
|
|
for (pc = 0; pc < prog.length; )
|
|
{
|
|
printf("%3d: ", pc);
|
|
|
|
//printf("prog[pc] = %d, REchar = %d, REnmq = %d\n", prog[pc], REchar, REnmq);
|
|
switch (prog[pc])
|
|
{
|
|
case REchar:
|
|
printf("\tREchar '%c'\n", prog[pc + 1]);
|
|
pc += 1 + char.sizeof;
|
|
break;
|
|
|
|
case REichar:
|
|
printf("\tREichar '%c'\n", prog[pc + 1]);
|
|
pc += 1 + char.sizeof;
|
|
break;
|
|
|
|
case REdchar:
|
|
printf("\tREdchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
|
|
pc += 1 + dchar.sizeof;
|
|
break;
|
|
|
|
case REidchar:
|
|
printf("\tREidchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
|
|
pc += 1 + dchar.sizeof;
|
|
break;
|
|
|
|
case REanychar:
|
|
printf("\tREanychar\n");
|
|
pc++;
|
|
break;
|
|
|
|
case REstring:
|
|
len = *cast(uint *)&prog[pc + 1];
|
|
printf("\tREstring x%x, '%.*s'\n", len,
|
|
(&prog[pc + 1 + uint.sizeof])[0 .. len]);
|
|
pc += 1 + uint.sizeof + len * rchar.sizeof;
|
|
break;
|
|
|
|
case REistring:
|
|
len = *cast(uint *)&prog[pc + 1];
|
|
printf("\tREistring x%x, '%.*s'\n", len,
|
|
(&prog[pc + 1 + uint.sizeof])[0 .. len]);
|
|
pc += 1 + uint.sizeof + len * rchar.sizeof;
|
|
break;
|
|
|
|
case REtestbit:
|
|
pu = cast(ushort *)&prog[pc + 1];
|
|
printf("\tREtestbit %d, %d\n", pu[0], pu[1]);
|
|
len = pu[1];
|
|
pc += 1 + 2 * ushort.sizeof + len;
|
|
break;
|
|
|
|
case REbit:
|
|
pu = cast(ushort *)&prog[pc + 1];
|
|
len = pu[1];
|
|
printf("\tREbit cmax=%02x, len=%d:", pu[0], len);
|
|
for (n = 0; n < len; n++)
|
|
printf(" %02x", prog[pc + 1 + 2 * ushort.sizeof + n]);
|
|
printf("\n");
|
|
pc += 1 + 2 * ushort.sizeof + len;
|
|
break;
|
|
|
|
case REnotbit:
|
|
pu = cast(ushort *)&prog[pc + 1];
|
|
printf("\tREnotbit %d, %d\n", pu[0], pu[1]);
|
|
len = pu[1];
|
|
pc += 1 + 2 * ushort.sizeof + len;
|
|
break;
|
|
|
|
case RErange:
|
|
len = *cast(uint *)&prog[pc + 1];
|
|
printf("\tRErange %d\n", len);
|
|
// BUG: REAignoreCase?
|
|
pc += 1 + uint.sizeof + len;
|
|
break;
|
|
|
|
case REnotrange:
|
|
len = *cast(uint *)&prog[pc + 1];
|
|
printf("\tREnotrange %d\n", len);
|
|
// BUG: REAignoreCase?
|
|
pc += 1 + uint.sizeof + len;
|
|
break;
|
|
|
|
case REbol:
|
|
printf("\tREbol\n");
|
|
pc++;
|
|
break;
|
|
|
|
case REeol:
|
|
printf("\tREeol\n");
|
|
pc++;
|
|
break;
|
|
|
|
case REor:
|
|
len = *cast(uint *)&prog[pc + 1];
|
|
printf("\tREor %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len);
|
|
pc += 1 + uint.sizeof;
|
|
break;
|
|
|
|
case REgoto:
|
|
len = *cast(uint *)&prog[pc + 1];
|
|
printf("\tREgoto %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len);
|
|
pc += 1 + uint.sizeof;
|
|
break;
|
|
|
|
case REanystar:
|
|
printf("\tREanystar\n");
|
|
pc++;
|
|
break;
|
|
|
|
case REnm:
|
|
case REnmq:
|
|
// len, n, m, ()
|
|
puint = cast(uint *)&prog[pc + 1];
|
|
len = puint[0];
|
|
n = puint[1];
|
|
m = puint[2];
|
|
printf("\tREnm%.*s len=%d, n=%u, m=%u, pc=>%d\n",
|
|
(prog[pc] == REnmq) ? "q" : " ",
|
|
len, n, m, pc + 1 + uint.sizeof * 3 + len);
|
|
pc += 1 + uint.sizeof * 3;
|
|
break;
|
|
|
|
case REparen:
|
|
// len, n, ()
|
|
puint = cast(uint *)&prog[pc + 1];
|
|
len = puint[0];
|
|
n = puint[1];
|
|
printf("\tREparen len=%d n=%d, pc=>%d\n", len, n, pc + 1 + uint.sizeof * 2 + len);
|
|
pc += 1 + uint.sizeof * 2;
|
|
break;
|
|
|
|
case REend:
|
|
printf("\tREend\n");
|
|
return;
|
|
|
|
case REwordboundary:
|
|
printf("\tREwordboundary\n");
|
|
pc++;
|
|
break;
|
|
|
|
case REnotwordboundary:
|
|
printf("\tREnotwordboundary\n");
|
|
pc++;
|
|
break;
|
|
|
|
case REdigit:
|
|
printf("\tREdigit\n");
|
|
pc++;
|
|
break;
|
|
|
|
case REnotdigit:
|
|
printf("\tREnotdigit\n");
|
|
pc++;
|
|
break;
|
|
|
|
case REspace:
|
|
printf("\tREspace\n");
|
|
pc++;
|
|
break;
|
|
|
|
case REnotspace:
|
|
printf("\tREnotspace\n");
|
|
pc++;
|
|
break;
|
|
|
|
case REword:
|
|
printf("\tREword\n");
|
|
pc++;
|
|
break;
|
|
|
|
case REnotword:
|
|
printf("\tREnotword\n");
|
|
pc++;
|
|
break;
|
|
|
|
case REbackref:
|
|
printf("\tREbackref %d\n", prog[1]);
|
|
pc += 2;
|
|
break;
|
|
|
|
default:
|
|
assert(0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/**************************************************
|
|
* Match input against a section of the program[].
|
|
* Returns:
|
|
* 1 if successful match
|
|
* 0 no match
|
|
*/
|
|
|
|
int trymatch(int pc, int pcend)
|
|
{ int srcsave;
|
|
uint len;
|
|
uint n;
|
|
uint m;
|
|
uint count;
|
|
uint pop;
|
|
uint ss;
|
|
regmatch_t *psave;
|
|
uint c1;
|
|
uint c2;
|
|
ushort* pu;
|
|
uint* puint;
|
|
|
|
debug(regexp)
|
|
printf("RegExp.trymatch(pc = %d, src = '%.*s', pcend = %d)\n",
|
|
pc, input[src .. input.length], pcend);
|
|
srcsave = src;
|
|
psave = null;
|
|
for (;;)
|
|
{
|
|
if (pc == pcend) // if done matching
|
|
{ debug(regex) printf("\tprogend\n");
|
|
return 1;
|
|
}
|
|
|
|
//printf("\top = %d\n", program[pc]);
|
|
switch (program[pc])
|
|
{
|
|
case REchar:
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
debug(regexp) printf("\tREchar '%c', src = '%c'\n", program[pc + 1], input[src]);
|
|
if (program[pc + 1] != input[src])
|
|
goto Lnomatch;
|
|
src++;
|
|
pc += 1 + char.sizeof;
|
|
break;
|
|
|
|
case REichar:
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
debug(regexp) printf("\tREichar '%c', src = '%c'\n", program[pc + 1], input[src]);
|
|
c1 = program[pc + 1];
|
|
c2 = input[src];
|
|
if (c1 != c2)
|
|
{
|
|
if (islower(cast(rchar)c2))
|
|
c2 = std.ctype.toupper(cast(rchar)c2);
|
|
else
|
|
goto Lnomatch;
|
|
if (c1 != c2)
|
|
goto Lnomatch;
|
|
}
|
|
src++;
|
|
pc += 1 + char.sizeof;
|
|
break;
|
|
|
|
case REdchar:
|
|
debug(regexp) printf("\tREdchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
if (*(cast(dchar *)&program[pc + 1]) != input[src])
|
|
goto Lnomatch;
|
|
src++;
|
|
pc += 1 + dchar.sizeof;
|
|
break;
|
|
|
|
case REidchar:
|
|
debug(regexp) printf("\tREidchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
c1 = *(cast(dchar *)&program[pc + 1]);
|
|
c2 = input[src];
|
|
if (c1 != c2)
|
|
{
|
|
if (islower(cast(rchar)c2))
|
|
c2 = std.ctype.toupper(cast(rchar)c2);
|
|
else
|
|
goto Lnomatch;
|
|
if (c1 != c2)
|
|
goto Lnomatch;
|
|
}
|
|
src++;
|
|
pc += 1 + dchar.sizeof;
|
|
break;
|
|
|
|
case REanychar:
|
|
debug(regexp) printf("\tREanychar\n");
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
if (!(attributes & REA.dotmatchlf) && input[src] == cast(rchar)'\n')
|
|
goto Lnomatch;
|
|
src += std.utf.stride(input, src);
|
|
//src++;
|
|
pc++;
|
|
break;
|
|
|
|
case REstring:
|
|
len = *cast(uint *)&program[pc + 1];
|
|
debug(regexp) printf("\tREstring x%x, '%.*s'\n", len,
|
|
(&program[pc + 1 + uint.sizeof])[0 .. len]);
|
|
if (src + len > input.length)
|
|
goto Lnomatch;
|
|
if (memcmp(&program[pc + 1 + uint.sizeof], &input[src], len * rchar.sizeof))
|
|
goto Lnomatch;
|
|
src += len;
|
|
pc += 1 + uint.sizeof + len * rchar.sizeof;
|
|
break;
|
|
|
|
case REistring:
|
|
len = *cast(uint *)&program[pc + 1];
|
|
debug(regexp) printf("\tREistring x%x, '%.*s'\n", len,
|
|
(&program[pc + 1 + uint.sizeof])[0 .. len]);
|
|
if (src + len > input.length)
|
|
goto Lnomatch;
|
|
version (Win32)
|
|
{
|
|
if (memicmp(cast(char*)&program[pc + 1 + uint.sizeof], &input[src], len * rchar.sizeof))
|
|
goto Lnomatch;
|
|
}
|
|
else
|
|
{
|
|
if (icmp((cast(char*)&program[pc + 1 + uint.sizeof])[0..len],
|
|
input[src .. src + len]))
|
|
goto Lnomatch;
|
|
}
|
|
src += len;
|
|
pc += 1 + uint.sizeof + len * rchar.sizeof;
|
|
break;
|
|
|
|
case REtestbit:
|
|
pu = (cast(ushort *)&program[pc + 1]);
|
|
debug(regexp) printf("\tREtestbit %d, %d, '%c', x%02x\n",
|
|
pu[0], pu[1], input[src], input[src]);
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
len = pu[1];
|
|
c1 = input[src];
|
|
//printf("[x%02x]=x%02x, x%02x\n", c1 >> 3, ((&program[pc + 1 + 4])[c1 >> 3] ), (1 << (c1 & 7)));
|
|
if (c1 <= pu[0] &&
|
|
!((&(program[pc + 1 + 4]))[c1 >> 3] & (1 << (c1 & 7))))
|
|
goto Lnomatch;
|
|
pc += 1 + 2 * ushort.sizeof + len;
|
|
break;
|
|
|
|
case REbit:
|
|
pu = (cast(ushort *)&program[pc + 1]);
|
|
debug(regexp) printf("\tREbit %d, %d, '%c'\n",
|
|
pu[0], pu[1], input[src]);
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
len = pu[1];
|
|
c1 = input[src];
|
|
if (c1 > pu[0])
|
|
goto Lnomatch;
|
|
if (!((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7))))
|
|
goto Lnomatch;
|
|
src++;
|
|
pc += 1 + 2 * ushort.sizeof + len;
|
|
break;
|
|
|
|
case REnotbit:
|
|
pu = (cast(ushort *)&program[pc + 1]);
|
|
debug(regexp) printf("\tREnotbit %d, %d, '%c'\n",
|
|
pu[0], pu[1], input[src]);
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
len = pu[1];
|
|
c1 = input[src];
|
|
if (c1 <= pu[0] &&
|
|
((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7))))
|
|
goto Lnomatch;
|
|
src++;
|
|
pc += 1 + 2 * ushort.sizeof + len;
|
|
break;
|
|
|
|
case RErange:
|
|
len = *cast(uint *)&program[pc + 1];
|
|
debug(regexp) printf("\tRErange %d\n", len);
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
// BUG: REA.ignoreCase?
|
|
if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) == null)
|
|
goto Lnomatch;
|
|
src++;
|
|
pc += 1 + uint.sizeof + len;
|
|
break;
|
|
|
|
case REnotrange:
|
|
len = *cast(uint *)&program[pc + 1];
|
|
debug(regexp) printf("\tREnotrange %d\n", len);
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
// BUG: REA.ignoreCase?
|
|
if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) != null)
|
|
goto Lnomatch;
|
|
src++;
|
|
pc += 1 + uint.sizeof + len;
|
|
break;
|
|
|
|
case REbol:
|
|
debug(regexp) printf("\tREbol\n");
|
|
if (src == 0)
|
|
{
|
|
}
|
|
else if (attributes & REA.multiline)
|
|
{
|
|
if (input[src - 1] != '\n')
|
|
goto Lnomatch;
|
|
}
|
|
else
|
|
goto Lnomatch;
|
|
pc++;
|
|
break;
|
|
|
|
case REeol:
|
|
debug(regexp) printf("\tREeol\n");
|
|
if (src == input.length)
|
|
{
|
|
}
|
|
else if (attributes & REA.multiline && input[src] == '\n')
|
|
src++;
|
|
else
|
|
goto Lnomatch;
|
|
pc++;
|
|
break;
|
|
|
|
case REor:
|
|
len = (cast(uint *)&program[pc + 1])[0];
|
|
debug(regexp) printf("\tREor %d\n", len);
|
|
pop = pc + 1 + uint.sizeof;
|
|
ss = src;
|
|
if (trymatch(pop, pcend))
|
|
{
|
|
if (pcend != program.length)
|
|
{ int s;
|
|
|
|
s = src;
|
|
if (trymatch(pcend, program.length))
|
|
{ debug(regexp) printf("\tfirst operand matched\n");
|
|
src = s;
|
|
return 1;
|
|
}
|
|
else
|
|
{
|
|
// If second branch doesn't match to end, take first anyway
|
|
src = ss;
|
|
if (!trymatch(pop + len, program.length))
|
|
{
|
|
debug(regexp) printf("\tfirst operand matched\n");
|
|
src = s;
|
|
return 1;
|
|
}
|
|
}
|
|
src = ss;
|
|
}
|
|
else
|
|
{ debug(regexp) printf("\tfirst operand matched\n");
|
|
return 1;
|
|
}
|
|
}
|
|
pc = pop + len; // proceed with 2nd branch
|
|
break;
|
|
|
|
case REgoto:
|
|
debug(regexp) printf("\tREgoto\n");
|
|
len = (cast(uint *)&program[pc + 1])[0];
|
|
pc += 1 + uint.sizeof + len;
|
|
break;
|
|
|
|
case REanystar:
|
|
debug(regexp) printf("\tREanystar\n");
|
|
pc++;
|
|
for (;;)
|
|
{ int s1;
|
|
int s2;
|
|
|
|
s1 = src;
|
|
if (src == input.length)
|
|
break;
|
|
if (!(attributes & REA.dotmatchlf) && input[src] == '\n')
|
|
break;
|
|
src++;
|
|
s2 = src;
|
|
|
|
// If no match after consumption, but it
|
|
// did match before, then no match
|
|
if (!trymatch(pc, program.length))
|
|
{
|
|
src = s1;
|
|
// BUG: should we save/restore pmatch[]?
|
|
if (trymatch(pc, program.length))
|
|
{
|
|
src = s1; // no match
|
|
break;
|
|
}
|
|
}
|
|
src = s2;
|
|
}
|
|
break;
|
|
|
|
case REnm:
|
|
case REnmq:
|
|
// len, n, m, ()
|
|
puint = cast(uint *)&program[pc + 1];
|
|
len = puint[0];
|
|
n = puint[1];
|
|
m = puint[2];
|
|
debug(regexp) printf("\tREnm%s len=%d, n=%u, m=%u\n", (program[pc] == REnmq) ? cast(char*)"q" : cast(char*)"", len, n, m);
|
|
pop = pc + 1 + uint.sizeof * 3;
|
|
for (count = 0; count < n; count++)
|
|
{
|
|
if (!trymatch(pop, pop + len))
|
|
goto Lnomatch;
|
|
}
|
|
if (!psave && count < m)
|
|
{
|
|
//version (Win32)
|
|
psave = cast(regmatch_t *)alloca((re_nsub + 1) * regmatch_t.sizeof);
|
|
//else
|
|
//psave = new regmatch_t[re_nsub + 1];
|
|
}
|
|
if (program[pc] == REnmq) // if minimal munch
|
|
{
|
|
for (; count < m; count++)
|
|
{ int s1;
|
|
|
|
memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof);
|
|
s1 = src;
|
|
|
|
if (trymatch(pop + len, program.length))
|
|
{
|
|
src = s1;
|
|
memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof);
|
|
break;
|
|
}
|
|
|
|
if (!trymatch(pop, pop + len))
|
|
{ debug(regexp) printf("\tdoesn't match subexpression\n");
|
|
break;
|
|
}
|
|
|
|
// If source is not consumed, don't
|
|
// infinite loop on the match
|
|
if (s1 == src)
|
|
{ debug(regexp) printf("\tsource is not consumed\n");
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else // maximal munch
|
|
{
|
|
for (; count < m; count++)
|
|
{ int s1;
|
|
int s2;
|
|
|
|
memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof);
|
|
s1 = src;
|
|
if (!trymatch(pop, pop + len))
|
|
{ debug(regexp) printf("\tdoesn't match subexpression\n");
|
|
break;
|
|
}
|
|
s2 = src;
|
|
|
|
// If source is not consumed, don't
|
|
// infinite loop on the match
|
|
if (s1 == s2)
|
|
{ debug(regexp) printf("\tsource is not consumed\n");
|
|
break;
|
|
}
|
|
|
|
// If no match after consumption, but it
|
|
// did match before, then no match
|
|
if (!trymatch(pop + len, program.length))
|
|
{
|
|
src = s1;
|
|
if (trymatch(pop + len, program.length))
|
|
{
|
|
src = s1; // no match
|
|
memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof);
|
|
break;
|
|
}
|
|
}
|
|
src = s2;
|
|
}
|
|
}
|
|
debug(regexp) printf("\tREnm len=%d, n=%u, m=%u, DONE count=%d\n", len, n, m, count);
|
|
pc = pop + len;
|
|
break;
|
|
|
|
case REparen:
|
|
// len, ()
|
|
debug(regexp) printf("\tREparen\n");
|
|
puint = cast(uint *)&program[pc + 1];
|
|
len = puint[0];
|
|
n = puint[1];
|
|
pop = pc + 1 + uint.sizeof * 2;
|
|
ss = src;
|
|
if (!trymatch(pop, pop + len))
|
|
goto Lnomatch;
|
|
pmatch[n + 1].rm_so = ss;
|
|
pmatch[n + 1].rm_eo = src;
|
|
pc = pop + len;
|
|
break;
|
|
|
|
case REend:
|
|
debug(regexp) printf("\tREend\n");
|
|
return 1; // successful match
|
|
|
|
case REwordboundary:
|
|
debug(regexp) printf("\tREwordboundary\n");
|
|
if (src > 0 && src < input.length)
|
|
{
|
|
c1 = input[src - 1];
|
|
c2 = input[src];
|
|
if (!(
|
|
(isword(cast(rchar)c1) && !isword(cast(rchar)c2)) ||
|
|
(!isword(cast(rchar)c1) && isword(cast(rchar)c2))
|
|
)
|
|
)
|
|
goto Lnomatch;
|
|
}
|
|
pc++;
|
|
break;
|
|
|
|
case REnotwordboundary:
|
|
debug(regexp) printf("\tREnotwordboundary\n");
|
|
if (src == 0 || src == input.length)
|
|
goto Lnomatch;
|
|
c1 = input[src - 1];
|
|
c2 = input[src];
|
|
if (
|
|
(isword(cast(rchar)c1) && !isword(cast(rchar)c2)) ||
|
|
(!isword(cast(rchar)c1) && isword(cast(rchar)c2))
|
|
)
|
|
goto Lnomatch;
|
|
pc++;
|
|
break;
|
|
|
|
case REdigit:
|
|
debug(regexp) printf("\tREdigit\n");
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
if (!isdigit(input[src]))
|
|
goto Lnomatch;
|
|
src++;
|
|
pc++;
|
|
break;
|
|
|
|
case REnotdigit:
|
|
debug(regexp) printf("\tREnotdigit\n");
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
if (isdigit(input[src]))
|
|
goto Lnomatch;
|
|
src++;
|
|
pc++;
|
|
break;
|
|
|
|
case REspace:
|
|
debug(regexp) printf("\tREspace\n");
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
if (!isspace(input[src]))
|
|
goto Lnomatch;
|
|
src++;
|
|
pc++;
|
|
break;
|
|
|
|
case REnotspace:
|
|
debug(regexp) printf("\tREnotspace\n");
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
if (isspace(input[src]))
|
|
goto Lnomatch;
|
|
src++;
|
|
pc++;
|
|
break;
|
|
|
|
case REword:
|
|
debug(regexp) printf("\tREword\n");
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
if (!isword(input[src]))
|
|
goto Lnomatch;
|
|
src++;
|
|
pc++;
|
|
break;
|
|
|
|
case REnotword:
|
|
debug(regexp) printf("\tREnotword\n");
|
|
if (src == input.length)
|
|
goto Lnomatch;
|
|
if (isword(input[src]))
|
|
goto Lnomatch;
|
|
src++;
|
|
pc++;
|
|
break;
|
|
|
|
case REbackref:
|
|
{
|
|
n = program[pc + 1];
|
|
debug(regexp) printf("\tREbackref %d\n", n);
|
|
|
|
int so = pmatch[n + 1].rm_so;
|
|
int eo = pmatch[n + 1].rm_eo;
|
|
len = eo - so;
|
|
if (src + len > input.length)
|
|
goto Lnomatch;
|
|
else if (attributes & REA.ignoreCase)
|
|
{
|
|
if (icmp(input[src .. src + len], input[so .. eo]))
|
|
goto Lnomatch;
|
|
}
|
|
else if (memcmp(&input[src], &input[so], len * rchar.sizeof))
|
|
goto Lnomatch;
|
|
src += len;
|
|
pc += 2;
|
|
break;
|
|
}
|
|
|
|
default:
|
|
assert(0);
|
|
}
|
|
}
|
|
|
|
Lnomatch:
|
|
debug(regexp) printf("\tnomatch pc=%d\n", pc);
|
|
src = srcsave;
|
|
return 0;
|
|
}
|
|
|
|
/* =================== Compiler ================== */
|
|
|
|
int parseRegexp()
|
|
{ uint offset;
|
|
uint gotooffset;
|
|
uint len1;
|
|
uint len2;
|
|
|
|
//printf("parseRegexp() '%.*s'\n", pattern[p .. pattern.length]);
|
|
offset = buf.offset;
|
|
for (;;)
|
|
{
|
|
assert(p <= pattern.length);
|
|
if (p == pattern.length)
|
|
{ buf.write(REend);
|
|
return 1;
|
|
}
|
|
switch (pattern[p])
|
|
{
|
|
case ')':
|
|
return 1;
|
|
|
|
case '|':
|
|
p++;
|
|
gotooffset = buf.offset;
|
|
buf.write(REgoto);
|
|
buf.write(cast(uint)0);
|
|
len1 = buf.offset - offset;
|
|
buf.spread(offset, 1 + uint.sizeof);
|
|
gotooffset += 1 + uint.sizeof;
|
|
parseRegexp();
|
|
len2 = buf.offset - (gotooffset + 1 + uint.sizeof);
|
|
buf.data[offset] = REor;
|
|
(cast(uint *)&buf.data[offset + 1])[0] = len1;
|
|
(cast(uint *)&buf.data[gotooffset + 1])[0] = len2;
|
|
break;
|
|
|
|
default:
|
|
parsePiece();
|
|
break;
|
|
}
|
|
}
|
|
assert(0);
|
|
}
|
|
|
|
int parsePiece()
|
|
{ uint offset;
|
|
uint len;
|
|
uint n;
|
|
uint m;
|
|
ubyte op;
|
|
int plength = pattern.length;
|
|
|
|
//printf("parsePiece() '%.*s'\n", pattern[p .. pattern.length]);
|
|
offset = buf.offset;
|
|
parseAtom();
|
|
if (p == plength)
|
|
return 1;
|
|
switch (pattern[p])
|
|
{
|
|
case '*':
|
|
// Special optimization: replace .* with REanystar
|
|
if (buf.offset - offset == 1 &&
|
|
buf.data[offset] == REanychar &&
|
|
p + 1 < plength &&
|
|
pattern[p + 1] != '?')
|
|
{
|
|
buf.data[offset] = REanystar;
|
|
p++;
|
|
break;
|
|
}
|
|
|
|
n = 0;
|
|
m = inf;
|
|
goto Lnm;
|
|
|
|
case '+':
|
|
n = 1;
|
|
m = inf;
|
|
goto Lnm;
|
|
|
|
case '?':
|
|
n = 0;
|
|
m = 1;
|
|
goto Lnm;
|
|
|
|
case '{': // {n} {n,} {n,m}
|
|
p++;
|
|
if (p == plength || !isdigit(pattern[p]))
|
|
goto Lerr;
|
|
n = 0;
|
|
do
|
|
{
|
|
// BUG: handle overflow
|
|
n = n * 10 + pattern[p] - '0';
|
|
p++;
|
|
if (p == plength)
|
|
goto Lerr;
|
|
} while (isdigit(pattern[p]));
|
|
if (pattern[p] == '}') // {n}
|
|
{ m = n;
|
|
goto Lnm;
|
|
}
|
|
if (pattern[p] != ',')
|
|
goto Lerr;
|
|
p++;
|
|
if (p == plength)
|
|
goto Lerr;
|
|
if (pattern[p] == /*{*/ '}') // {n,}
|
|
{ m = inf;
|
|
goto Lnm;
|
|
}
|
|
if (!isdigit(pattern[p]))
|
|
goto Lerr;
|
|
m = 0; // {n,m}
|
|
do
|
|
{
|
|
// BUG: handle overflow
|
|
m = m * 10 + pattern[p] - '0';
|
|
p++;
|
|
if (p == plength)
|
|
goto Lerr;
|
|
} while (isdigit(pattern[p]));
|
|
if (pattern[p] != /*{*/ '}')
|
|
goto Lerr;
|
|
goto Lnm;
|
|
|
|
Lnm:
|
|
p++;
|
|
op = REnm;
|
|
if (p < plength && pattern[p] == '?')
|
|
{ op = REnmq; // minimal munch version
|
|
p++;
|
|
}
|
|
len = buf.offset - offset;
|
|
buf.spread(offset, 1 + uint.sizeof * 3);
|
|
buf.data[offset] = op;
|
|
uint* puint = cast(uint *)&buf.data[offset + 1];
|
|
puint[0] = len;
|
|
puint[1] = n;
|
|
puint[2] = m;
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
return 1;
|
|
|
|
Lerr:
|
|
error("badly formed {n,m}");
|
|
assert(0);
|
|
}
|
|
|
|
int parseAtom()
|
|
{ ubyte op;
|
|
uint offset;
|
|
rchar c;
|
|
|
|
//printf("parseAtom() '%.*s'\n", pattern[p .. pattern.length]);
|
|
if (p < pattern.length)
|
|
{
|
|
c = pattern[p];
|
|
switch (c)
|
|
{
|
|
case '*':
|
|
case '+':
|
|
case '?':
|
|
error("*+? not allowed in atom");
|
|
p++;
|
|
return 0;
|
|
|
|
case '(':
|
|
p++;
|
|
buf.write(REparen);
|
|
offset = buf.offset;
|
|
buf.write(cast(uint)0); // reserve space for length
|
|
buf.write(re_nsub);
|
|
re_nsub++;
|
|
parseRegexp();
|
|
*cast(uint *)&buf.data[offset] =
|
|
buf.offset - (offset + uint.sizeof * 2);
|
|
if (p == pattern.length || pattern[p] != ')')
|
|
{
|
|
error("')' expected");
|
|
return 0;
|
|
}
|
|
p++;
|
|
break;
|
|
|
|
case '[':
|
|
if (!parseRange())
|
|
return 0;
|
|
break;
|
|
|
|
case '.':
|
|
p++;
|
|
buf.write(REanychar);
|
|
break;
|
|
|
|
case '^':
|
|
p++;
|
|
buf.write(REbol);
|
|
break;
|
|
|
|
case '$':
|
|
p++;
|
|
buf.write(REeol);
|
|
break;
|
|
|
|
case '\\':
|
|
p++;
|
|
if (p == pattern.length)
|
|
{ error("no character past '\\'");
|
|
return 0;
|
|
}
|
|
c = pattern[p];
|
|
switch (c)
|
|
{
|
|
case 'b': op = REwordboundary; goto Lop;
|
|
case 'B': op = REnotwordboundary; goto Lop;
|
|
case 'd': op = REdigit; goto Lop;
|
|
case 'D': op = REnotdigit; goto Lop;
|
|
case 's': op = REspace; goto Lop;
|
|
case 'S': op = REnotspace; goto Lop;
|
|
case 'w': op = REword; goto Lop;
|
|
case 'W': op = REnotword; goto Lop;
|
|
|
|
Lop:
|
|
buf.write(op);
|
|
p++;
|
|
break;
|
|
|
|
case 'f':
|
|
case 'n':
|
|
case 'r':
|
|
case 't':
|
|
case 'v':
|
|
case 'c':
|
|
case 'x':
|
|
case 'u':
|
|
case '0':
|
|
c = cast(char)escape();
|
|
goto Lbyte;
|
|
|
|
case '1': case '2': case '3':
|
|
case '4': case '5': case '6':
|
|
case '7': case '8': case '9':
|
|
c -= '1';
|
|
if (c < re_nsub)
|
|
{ buf.write(REbackref);
|
|
buf.write(cast(ubyte)c);
|
|
}
|
|
else
|
|
{ error("no matching back reference");
|
|
return 0;
|
|
}
|
|
p++;
|
|
break;
|
|
|
|
default:
|
|
p++;
|
|
goto Lbyte;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
p++;
|
|
Lbyte:
|
|
op = REchar;
|
|
if (attributes & REA.ignoreCase)
|
|
{
|
|
if (isalpha(c))
|
|
{
|
|
op = REichar;
|
|
c = cast(char)std.ctype.toupper(c);
|
|
}
|
|
}
|
|
if (op == REchar && c <= 0xFF)
|
|
{
|
|
// Look ahead and see if we can make this into
|
|
// an REstring
|
|
int q;
|
|
int len;
|
|
|
|
for (q = p; q < pattern.length; ++q)
|
|
{ rchar qc = pattern[q];
|
|
|
|
switch (qc)
|
|
{
|
|
case '{':
|
|
case '*':
|
|
case '+':
|
|
case '?':
|
|
if (q == p)
|
|
goto Lchar;
|
|
q--;
|
|
break;
|
|
|
|
case '(': case ')':
|
|
case '|':
|
|
case '[': case ']':
|
|
case '.': case '^':
|
|
case '$': case '\\':
|
|
case '}':
|
|
break;
|
|
|
|
default:
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
len = q - p;
|
|
if (len > 0)
|
|
{
|
|
debug(regexp) printf("writing string len %d, c = '%c', pattern[p] = '%c'\n", len+1, c, pattern[p]);
|
|
buf.reserve(5 + (1 + len) * rchar.sizeof);
|
|
buf.write((attributes & REA.ignoreCase) ? REistring : REstring);
|
|
buf.write(len + 1);
|
|
buf.write(c);
|
|
buf.write(pattern[p .. p + len]);
|
|
p = q;
|
|
break;
|
|
}
|
|
}
|
|
if (c >= 0x80)
|
|
{
|
|
// Convert to dchar opcode
|
|
op = (op == REchar) ? REdchar : REidchar;
|
|
buf.write(op);
|
|
buf.write(c);
|
|
}
|
|
else
|
|
{
|
|
Lchar:
|
|
debug(regexp) printf("It's an REchar '%c'\n", c);
|
|
buf.write(op);
|
|
buf.write(cast(char)c);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
private:
|
|
class Range
|
|
{
|
|
uint maxc;
|
|
uint maxb;
|
|
OutBuffer buf;
|
|
ubyte* base;
|
|
BitArray bits;
|
|
|
|
this(OutBuffer buf)
|
|
{
|
|
this.buf = buf;
|
|
if (buf.data.length)
|
|
this.base = &buf.data[buf.offset];
|
|
}
|
|
|
|
void setbitmax(uint u)
|
|
{ uint b;
|
|
|
|
//printf("setbitmax(x%x), maxc = x%x\n", u, maxc);
|
|
if (u > maxc)
|
|
{
|
|
maxc = u;
|
|
b = u / 8;
|
|
if (b >= maxb)
|
|
{ uint u2;
|
|
|
|
u2 = base ? base - &buf.data[0] : 0;
|
|
buf.fill0(b - maxb + 1);
|
|
base = &buf.data[u2];
|
|
maxb = b + 1;
|
|
//bits = (cast(bit*)this.base)[0 .. maxc + 1];
|
|
bits.ptr = cast(uint*)this.base;
|
|
}
|
|
bits.len = maxc + 1;
|
|
}
|
|
}
|
|
|
|
void setbit2(uint u)
|
|
{
|
|
setbitmax(u + 1);
|
|
//printf("setbit2 [x%02x] |= x%02x\n", u >> 3, 1 << (u & 7));
|
|
bits[u] = 1;
|
|
}
|
|
|
|
};
|
|
|
|
int parseRange()
|
|
{ ubyte op;
|
|
int c;
|
|
int c2;
|
|
uint i;
|
|
uint cmax;
|
|
uint offset;
|
|
|
|
cmax = 0x7F;
|
|
p++;
|
|
op = REbit;
|
|
if (p == pattern.length)
|
|
goto Lerr;
|
|
if (pattern[p] == '^')
|
|
{ p++;
|
|
op = REnotbit;
|
|
if (p == pattern.length)
|
|
goto Lerr;
|
|
}
|
|
buf.write(op);
|
|
offset = buf.offset;
|
|
buf.write(cast(uint)0); // reserve space for length
|
|
buf.reserve(128 / 8);
|
|
auto r = new Range(buf);
|
|
if (op == REnotbit)
|
|
r.setbit2(0);
|
|
switch (pattern[p])
|
|
{
|
|
case ']':
|
|
case '-':
|
|
c = pattern[p];
|
|
p++;
|
|
r.setbit2(c);
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
enum RS { start, rliteral, dash };
|
|
RS rs;
|
|
|
|
rs = RS.start;
|
|
for (;;)
|
|
{
|
|
if (p == pattern.length)
|
|
goto Lerr;
|
|
switch (pattern[p])
|
|
{
|
|
case ']':
|
|
switch (rs)
|
|
{ case RS.dash:
|
|
r.setbit2('-');
|
|
case RS.rliteral:
|
|
r.setbit2(c);
|
|
break;
|
|
case RS.start:
|
|
break;
|
|
default:
|
|
assert(0);
|
|
}
|
|
p++;
|
|
break;
|
|
|
|
case '\\':
|
|
p++;
|
|
r.setbitmax(cmax);
|
|
if (p == pattern.length)
|
|
goto Lerr;
|
|
switch (pattern[p])
|
|
{
|
|
case 'd':
|
|
for (i = '0'; i <= '9'; i++)
|
|
r.bits[i] = 1;
|
|
goto Lrs;
|
|
|
|
case 'D':
|
|
for (i = 1; i < '0'; i++)
|
|
r.bits[i] = 1;
|
|
for (i = '9' + 1; i <= cmax; i++)
|
|
r.bits[i] = 1;
|
|
goto Lrs;
|
|
|
|
case 's':
|
|
for (i = 0; i <= cmax; i++)
|
|
if (isspace(i))
|
|
r.bits[i] = 1;
|
|
goto Lrs;
|
|
|
|
case 'S':
|
|
for (i = 1; i <= cmax; i++)
|
|
if (!isspace(i))
|
|
r.bits[i] = 1;
|
|
goto Lrs;
|
|
|
|
case 'w':
|
|
for (i = 0; i <= cmax; i++)
|
|
if (isword(cast(rchar)i))
|
|
r.bits[i] = 1;
|
|
goto Lrs;
|
|
|
|
case 'W':
|
|
for (i = 1; i <= cmax; i++)
|
|
if (!isword(cast(rchar)i))
|
|
r.bits[i] = 1;
|
|
goto Lrs;
|
|
|
|
Lrs:
|
|
switch (rs)
|
|
{ case RS.dash:
|
|
r.setbit2('-');
|
|
case RS.rliteral:
|
|
r.setbit2(c);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
rs = RS.start;
|
|
continue;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
c2 = escape();
|
|
goto Lrange;
|
|
|
|
case '-':
|
|
p++;
|
|
if (rs == RS.start)
|
|
goto Lrange;
|
|
else if (rs == RS.rliteral)
|
|
rs = RS.dash;
|
|
else if (rs == RS.dash)
|
|
{
|
|
r.setbit2(c);
|
|
r.setbit2('-');
|
|
rs = RS.start;
|
|
}
|
|
continue;
|
|
|
|
default:
|
|
c2 = pattern[p];
|
|
p++;
|
|
Lrange:
|
|
switch (rs)
|
|
{ case RS.rliteral:
|
|
r.setbit2(c);
|
|
case RS.start:
|
|
c = c2;
|
|
rs = RS.rliteral;
|
|
break;
|
|
|
|
case RS.dash:
|
|
if (c > c2)
|
|
{ error("inverted range in character class");
|
|
return 0;
|
|
}
|
|
r.setbitmax(c2);
|
|
//printf("c = %x, c2 = %x\n",c,c2);
|
|
for (; c <= c2; c++)
|
|
r.bits[c] = 1;
|
|
rs = RS.start;
|
|
break;
|
|
|
|
default:
|
|
assert(0);
|
|
}
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
if (attributes & REA.ignoreCase)
|
|
{
|
|
// BUG: what about dchar?
|
|
r.setbitmax(0x7F);
|
|
for (c = 'a'; c <= 'z'; c++)
|
|
{
|
|
if (r.bits[c])
|
|
r.bits[c + 'A' - 'a'] = 1;
|
|
else if (r.bits[c + 'A' - 'a'])
|
|
r.bits[c] = 1;
|
|
}
|
|
}
|
|
//printf("maxc = %d, maxb = %d\n",r.maxc,r.maxb);
|
|
(cast(ushort *)&buf.data[offset])[0] = cast(ushort)r.maxc;
|
|
(cast(ushort *)&buf.data[offset])[1] = cast(ushort)r.maxb;
|
|
return 1;
|
|
|
|
Lerr:
|
|
error("invalid range");
|
|
return 0;
|
|
}
|
|
|
|
void error(string msg)
|
|
{
|
|
errors++;
|
|
debug(regexp) printf("error: %.*s\n", msg);
|
|
//assert(0);
|
|
//*(char*)0=0;
|
|
throw new RegExpException(msg);
|
|
}
|
|
|
|
// p is following the \ char
|
|
int escape()
|
|
in
|
|
{
|
|
assert(p < pattern.length);
|
|
}
|
|
body
|
|
{ int c;
|
|
int i;
|
|
rchar tc;
|
|
|
|
c = pattern[p]; // none of the cases are multibyte
|
|
switch (c)
|
|
{
|
|
case 'b': c = '\b'; break;
|
|
case 'f': c = '\f'; break;
|
|
case 'n': c = '\n'; break;
|
|
case 'r': c = '\r'; break;
|
|
case 't': c = '\t'; break;
|
|
case 'v': c = '\v'; break;
|
|
|
|
// BUG: Perl does \a and \e too, should we?
|
|
|
|
case 'c':
|
|
++p;
|
|
if (p == pattern.length)
|
|
goto Lretc;
|
|
c = pattern[p];
|
|
// Note: we are deliberately not allowing dchar letters
|
|
if (!(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')))
|
|
{
|
|
Lcerr:
|
|
error("letter expected following \\c");
|
|
return 0;
|
|
}
|
|
c &= 0x1F;
|
|
break;
|
|
|
|
case '0':
|
|
case '1':
|
|
case '2':
|
|
case '3':
|
|
case '4':
|
|
case '5':
|
|
case '6':
|
|
case '7':
|
|
c -= '0';
|
|
for (i = 0; i < 2; i++)
|
|
{
|
|
p++;
|
|
if (p == pattern.length)
|
|
goto Lretc;
|
|
tc = pattern[p];
|
|
if ('0' <= tc && tc <= '7')
|
|
{ c = c * 8 + (tc - '0');
|
|
// Treat overflow as if last
|
|
// digit was not an octal digit
|
|
if (c >= 0xFF)
|
|
{ c >>= 3;
|
|
return c;
|
|
}
|
|
}
|
|
else
|
|
return c;
|
|
}
|
|
break;
|
|
|
|
case 'x':
|
|
c = 0;
|
|
for (i = 0; i < 2; i++)
|
|
{
|
|
p++;
|
|
if (p == pattern.length)
|
|
goto Lretc;
|
|
tc = pattern[p];
|
|
if ('0' <= tc && tc <= '9')
|
|
c = c * 16 + (tc - '0');
|
|
else if ('a' <= tc && tc <= 'f')
|
|
c = c * 16 + (tc - 'a' + 10);
|
|
else if ('A' <= tc && tc <= 'F')
|
|
c = c * 16 + (tc - 'A' + 10);
|
|
else if (i == 0) // if no hex digits after \x
|
|
{
|
|
// Not a valid \xXX sequence
|
|
return 'x';
|
|
}
|
|
else
|
|
return c;
|
|
}
|
|
break;
|
|
|
|
case 'u':
|
|
c = 0;
|
|
for (i = 0; i < 4; i++)
|
|
{
|
|
p++;
|
|
if (p == pattern.length)
|
|
goto Lretc;
|
|
tc = pattern[p];
|
|
if ('0' <= tc && tc <= '9')
|
|
c = c * 16 + (tc - '0');
|
|
else if ('a' <= tc && tc <= 'f')
|
|
c = c * 16 + (tc - 'a' + 10);
|
|
else if ('A' <= tc && tc <= 'F')
|
|
c = c * 16 + (tc - 'A' + 10);
|
|
else
|
|
{
|
|
// Not a valid \uXXXX sequence
|
|
p -= i;
|
|
return 'u';
|
|
}
|
|
}
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
p++;
|
|
Lretc:
|
|
return c;
|
|
}
|
|
|
|
/* ==================== optimizer ======================= */
|
|
|
|
void optimize()
|
|
{ ubyte[] prog;
|
|
|
|
debug(regexp) printf("RegExp.optimize()\n");
|
|
prog = buf.toBytes();
|
|
for (size_t i = 0; 1;)
|
|
{
|
|
//printf("\tprog[%d] = %d, %d\n", i, prog[i], REstring);
|
|
switch (prog[i])
|
|
{
|
|
case REend:
|
|
case REanychar:
|
|
case REanystar:
|
|
case REbackref:
|
|
case REeol:
|
|
case REchar:
|
|
case REichar:
|
|
case REdchar:
|
|
case REidchar:
|
|
case REstring:
|
|
case REistring:
|
|
case REtestbit:
|
|
case REbit:
|
|
case REnotbit:
|
|
case RErange:
|
|
case REnotrange:
|
|
case REwordboundary:
|
|
case REnotwordboundary:
|
|
case REdigit:
|
|
case REnotdigit:
|
|
case REspace:
|
|
case REnotspace:
|
|
case REword:
|
|
case REnotword:
|
|
return;
|
|
|
|
case REbol:
|
|
i++;
|
|
continue;
|
|
|
|
case REor:
|
|
case REnm:
|
|
case REnmq:
|
|
case REparen:
|
|
case REgoto:
|
|
{
|
|
auto bitbuf = new OutBuffer;
|
|
auto r = new Range(bitbuf);
|
|
uint offset;
|
|
|
|
offset = i;
|
|
if (starrchars(r, prog[i .. prog.length]))
|
|
{
|
|
debug(regexp) printf("\tfilter built\n");
|
|
buf.spread(offset, 1 + 4 + r.maxb);
|
|
buf.data[offset] = REtestbit;
|
|
(cast(ushort *)&buf.data[offset + 1])[0] = cast(ushort)r.maxc;
|
|
(cast(ushort *)&buf.data[offset + 1])[1] = cast(ushort)r.maxb;
|
|
i = offset + 1 + 4;
|
|
buf.data[i .. i + r.maxb] = r.base[0 .. r.maxb];
|
|
}
|
|
return;
|
|
}
|
|
default:
|
|
assert(0);
|
|
}
|
|
}
|
|
}
|
|
|
|
/////////////////////////////////////////
|
|
// OR the leading character bits into r.
|
|
// Limit the character range from 0..7F,
|
|
// trymatch() will allow through anything over maxc.
|
|
// Return 1 if success, 0 if we can't build a filter or
|
|
// if there is no point to one.
|
|
|
|
int starrchars(Range r, const(ubyte)[] prog)
|
|
{ rchar c;
|
|
uint maxc;
|
|
uint maxb;
|
|
uint len;
|
|
uint b;
|
|
uint n;
|
|
uint m;
|
|
const(ubyte)* pop;
|
|
|
|
//printf("RegExp.starrchars(prog = %p, progend = %p)\n", prog, progend);
|
|
for (size_t i = 0; i < prog.length;)
|
|
{
|
|
switch (prog[i])
|
|
{
|
|
case REchar:
|
|
c = prog[i + 1];
|
|
if (c <= 0x7F)
|
|
r.setbit2(c);
|
|
return 1;
|
|
|
|
case REichar:
|
|
c = prog[i + 1];
|
|
if (c <= 0x7F)
|
|
{ r.setbit2(c);
|
|
r.setbit2(std.ctype.tolower(cast(rchar)c));
|
|
}
|
|
return 1;
|
|
|
|
case REdchar:
|
|
case REidchar:
|
|
return 1;
|
|
|
|
case REanychar:
|
|
return 0; // no point
|
|
|
|
case REstring:
|
|
len = *cast(uint *)&prog[i + 1];
|
|
assert(len);
|
|
c = *cast(rchar *)&prog[i + 1 + uint.sizeof];
|
|
debug(regexp) printf("\tREstring %d, '%c'\n", len, c);
|
|
if (c <= 0x7F)
|
|
r.setbit2(c);
|
|
return 1;
|
|
|
|
case REistring:
|
|
len = *cast(uint *)&prog[i + 1];
|
|
assert(len);
|
|
c = *cast(rchar *)&prog[i + 1 + uint.sizeof];
|
|
debug(regexp) printf("\tREistring %d, '%c'\n", len, c);
|
|
if (c <= 0x7F)
|
|
{ r.setbit2(std.ctype.toupper(cast(rchar)c));
|
|
r.setbit2(std.ctype.tolower(cast(rchar)c));
|
|
}
|
|
return 1;
|
|
|
|
case REtestbit:
|
|
case REbit:
|
|
maxc = (cast(ushort *)&prog[i + 1])[0];
|
|
maxb = (cast(ushort *)&prog[i + 1])[1];
|
|
if (maxc <= 0x7F)
|
|
r.setbitmax(maxc);
|
|
else
|
|
maxb = r.maxb;
|
|
for (b = 0; b < maxb; b++)
|
|
r.base[b] |= prog[i + 1 + 4 + b];
|
|
return 1;
|
|
|
|
case REnotbit:
|
|
maxc = (cast(ushort *)&prog[i + 1])[0];
|
|
maxb = (cast(ushort *)&prog[i + 1])[1];
|
|
if (maxc <= 0x7F)
|
|
r.setbitmax(maxc);
|
|
else
|
|
maxb = r.maxb;
|
|
for (b = 0; b < maxb; b++)
|
|
r.base[b] |= ~prog[i + 1 + 4 + b];
|
|
return 1;
|
|
|
|
case REbol:
|
|
case REeol:
|
|
return 0;
|
|
|
|
case REor:
|
|
len = (cast(uint *)&prog[i + 1])[0];
|
|
return starrchars(r, prog[i + 1 + uint.sizeof .. prog.length]) &&
|
|
starrchars(r, prog[i + 1 + uint.sizeof + len .. prog.length]);
|
|
|
|
case REgoto:
|
|
len = (cast(uint *)&prog[i + 1])[0];
|
|
i += 1 + uint.sizeof + len;
|
|
break;
|
|
|
|
case REanystar:
|
|
return 0;
|
|
|
|
case REnm:
|
|
case REnmq:
|
|
// len, n, m, ()
|
|
len = (cast(uint *)&prog[i + 1])[0];
|
|
n = (cast(uint *)&prog[i + 1])[1];
|
|
m = (cast(uint *)&prog[i + 1])[2];
|
|
pop = &prog[i + 1 + uint.sizeof * 3];
|
|
if (!starrchars(r, pop[0 .. len]))
|
|
return 0;
|
|
if (n)
|
|
return 1;
|
|
i += 1 + uint.sizeof * 3 + len;
|
|
break;
|
|
|
|
case REparen:
|
|
// len, ()
|
|
len = (cast(uint *)&prog[i + 1])[0];
|
|
n = (cast(uint *)&prog[i + 1])[1];
|
|
pop = &prog[0] + i + 1 + uint.sizeof * 2;
|
|
return starrchars(r, pop[0 .. len]);
|
|
|
|
case REend:
|
|
return 0;
|
|
|
|
case REwordboundary:
|
|
case REnotwordboundary:
|
|
return 0;
|
|
|
|
case REdigit:
|
|
r.setbitmax('9');
|
|
for (c = '0'; c <= '9'; c++)
|
|
r.bits[c] = 1;
|
|
return 1;
|
|
|
|
case REnotdigit:
|
|
r.setbitmax(0x7F);
|
|
for (c = 0; c <= '0'; c++)
|
|
r.bits[c] = 1;
|
|
for (c = '9' + 1; c <= r.maxc; c++)
|
|
r.bits[c] = 1;
|
|
return 1;
|
|
|
|
case REspace:
|
|
r.setbitmax(0x7F);
|
|
for (c = 0; c <= r.maxc; c++)
|
|
if (isspace(c))
|
|
r.bits[c] = 1;
|
|
return 1;
|
|
|
|
case REnotspace:
|
|
r.setbitmax(0x7F);
|
|
for (c = 0; c <= r.maxc; c++)
|
|
if (!isspace(c))
|
|
r.bits[c] = 1;
|
|
return 1;
|
|
|
|
case REword:
|
|
r.setbitmax(0x7F);
|
|
for (c = 0; c <= r.maxc; c++)
|
|
if (isword(cast(rchar)c))
|
|
r.bits[c] = 1;
|
|
return 1;
|
|
|
|
case REnotword:
|
|
r.setbitmax(0x7F);
|
|
for (c = 0; c <= r.maxc; c++)
|
|
if (!isword(cast(rchar)c))
|
|
r.bits[c] = 1;
|
|
return 1;
|
|
|
|
case REbackref:
|
|
return 0;
|
|
|
|
default:
|
|
assert(0);
|
|
}
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
/* ==================== replace ======================= */
|
|
|
|
/***********************
|
|
* After a match is found with test(), this function
|
|
* will take the match results and, using the format
|
|
* string, generate and return a new string.
|
|
*/
|
|
|
|
public string replace(string format)
|
|
{
|
|
return replace3(format, input, pmatch[0 .. re_nsub + 1]);
|
|
}
|
|
|
|
// Static version that doesn't require a RegExp object to be created
|
|
|
|
public static string replace3(string format, string input, regmatch_t[] pmatch)
|
|
{
|
|
string result;
|
|
uint c2;
|
|
int rm_so;
|
|
int rm_eo;
|
|
int i;
|
|
|
|
// printf("replace3(format = '%.*s', input = '%.*s')\n", format, input);
|
|
result.length = format.length;
|
|
result.length = 0;
|
|
for (size_t f = 0; f < format.length; f++)
|
|
{
|
|
char c = format[f];
|
|
L1:
|
|
if (c != '$')
|
|
{
|
|
result ~= c;
|
|
continue;
|
|
}
|
|
++f;
|
|
if (f == format.length)
|
|
{
|
|
result ~= '$';
|
|
break;
|
|
}
|
|
c = format[f];
|
|
switch (c)
|
|
{
|
|
case '&':
|
|
rm_so = pmatch[0].rm_so;
|
|
rm_eo = pmatch[0].rm_eo;
|
|
goto Lstring;
|
|
|
|
case '`':
|
|
rm_so = 0;
|
|
rm_eo = pmatch[0].rm_so;
|
|
goto Lstring;
|
|
|
|
case '\'':
|
|
rm_so = pmatch[0].rm_eo;
|
|
rm_eo = input.length;
|
|
goto Lstring;
|
|
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
i = c - '0';
|
|
if (f + 1 == format.length)
|
|
{
|
|
if (i == 0)
|
|
{
|
|
result ~= '$';
|
|
result ~= c;
|
|
continue;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
c2 = format[f + 1];
|
|
if (c2 >= '0' && c2 <= '9')
|
|
{ i = (c - '0') * 10 + (c2 - '0');
|
|
f++;
|
|
}
|
|
if (i == 0)
|
|
{
|
|
result ~= '$';
|
|
result ~= c;
|
|
c = cast(char)c2;
|
|
goto L1;
|
|
}
|
|
}
|
|
|
|
if (i < pmatch.length)
|
|
{ rm_so = pmatch[i].rm_so;
|
|
rm_eo = pmatch[i].rm_eo;
|
|
goto Lstring;
|
|
}
|
|
break;
|
|
|
|
Lstring:
|
|
if (rm_so != rm_eo)
|
|
result ~= input[rm_so .. rm_eo];
|
|
break;
|
|
|
|
default:
|
|
result ~= '$';
|
|
result ~= c;
|
|
break;
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/************************************
|
|
* Like replace(char[] format), but uses old style formatting:
|
|
<table border=1 cellspacing=0 cellpadding=5>
|
|
<th>Format
|
|
<th>Description
|
|
<tr>
|
|
<td><b>&</b>
|
|
<td>replace with the match
|
|
</tr>
|
|
<tr>
|
|
<td><b>\</b><i>n</i>
|
|
<td>replace with the <i>n</i>th parenthesized match, <i>n</i> is 1..9
|
|
</tr>
|
|
<tr>
|
|
<td><b>\</b><i>c</i>
|
|
<td>replace with char <i>c</i>.
|
|
</tr>
|
|
</table>
|
|
*/
|
|
|
|
public string replaceOld(string format)
|
|
{
|
|
string result;
|
|
|
|
//printf("replace: this = %p so = %d, eo = %d\n", this, pmatch[0].rm_so, pmatch[0].rm_eo);
|
|
//printf("3input = '%.*s'\n", input);
|
|
result.length = format.length;
|
|
result.length = 0;
|
|
for (size_t i; i < format.length; i++)
|
|
{
|
|
char c = format[i];
|
|
switch (c)
|
|
{
|
|
case '&':
|
|
//printf("match = '%.*s'\n", input[pmatch[0].rm_so .. pmatch[0].rm_eo]);
|
|
result ~= input[pmatch[0].rm_so .. pmatch[0].rm_eo];
|
|
break;
|
|
|
|
case '\\':
|
|
if (i + 1 < format.length)
|
|
{
|
|
c = format[++i];
|
|
if (c >= '1' && c <= '9')
|
|
{ uint j;
|
|
|
|
j = c - '0';
|
|
if (j <= re_nsub && pmatch[j].rm_so != pmatch[j].rm_eo)
|
|
result ~= input[pmatch[j].rm_so .. pmatch[j].rm_eo];
|
|
break;
|
|
}
|
|
}
|
|
result ~= c;
|
|
break;
|
|
|
|
default:
|
|
result ~= c;
|
|
break;
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
}
|
|
|
|
unittest
|
|
{ // Created and placed in public domain by Don Clugston
|
|
|
|
auto m = search("aBC r s", `bc\x20r[\40]s`, "i");
|
|
assert(m.pre=="a");
|
|
assert(m[0]=="BC r s");
|
|
auto m2 = search("7xxyxxx", `^\d([a-z]{2})\D\1`);
|
|
assert(m2[0]=="7xxyxx");
|
|
// Just check the parsing.
|
|
auto m3 = search("dcbxx", `ca|b[\d\]\D\s\S\w-\W]`);
|
|
auto m4 = search("xy", `[^\ca-\xFa\r\n\b\f\t\v\0123]{2,485}$`);
|
|
auto m5 = search("xxx", `^^\r\n\b{13,}\f{4}\t\v\u02aF3a\w\W`);
|
|
auto m6 = search("xxy", `.*y`);
|
|
assert(m6[0]=="xxy");
|
|
auto m7 = search("QWDEfGH", "(ca|b|defg)+", "i");
|
|
assert(m7[0]=="DEfG");
|
|
auto m8 = search("dcbxx", `a?\B\s\S`);
|
|
auto m9 = search("dcbxx", `[-w]`);
|
|
auto m10 = search("dcbsfd", `aB[c-fW]dB|\d|\D|\u012356|\w|\W|\s|\S`, "i");
|
|
auto m11 = search("dcbsfd", `[]a-]`);
|
|
m.replaceOld(`a&b\1c`);
|
|
m.replace(`a$&b$'$1c`);
|
|
}
|