Merge pull request #462 from blackwhale/regex-fixes

fix Issue 7300 - std.regex.ShiftOr!dchar.search is broken
This commit is contained in:
Andrei Alexandrescu 2012-03-30 06:26:15 -07:00
commit 84a847cbe1
5 changed files with 162 additions and 98 deletions

View file

@ -2,14 +2,17 @@ $(VERSION 059, ddd mm, 2012, =================================================,
$(LIBBUGSFIXED $(LIBBUGSFIXED
$(LI $(BUGZILLA 4604): A stack overflow with writeln) $(LI $(BUGZILLA 4604): A stack overflow with writeln)
$(LI $(BUGZILLA 5523): std.regex handles "\s" and "\W" (etc.) inside square brackets improperly)
$(LI $(BUGZILLA 5674): AssertError in std.regex) $(LI $(BUGZILLA 5674): AssertError in std.regex)
$(LI $(BUGZILLA 5652): Add \p and \P unicode properties to std.regex) $(LI $(BUGZILLA 5652): Add \p and \P unicode properties to std.regex)
$(LI $(BUGZILLA 5964): std.stdio.readln can throw a UnicodeException) $(LI $(BUGZILLA 5964): std.stdio.readln can throw a UnicodeException)
$(LI $(BUGZILLA 6217): [GSOC] result of std.algorithm.map is not movable)
$(LI $(BUGZILLA 6403): Upgrade std.regex to Unicode UTS #18 Level 1 support) $(LI $(BUGZILLA 6403): Upgrade std.regex to Unicode UTS #18 Level 1 support)
$(LI $(BUGZILLA 7111): New regex engine cannot match beginning of empty string) $(LI $(BUGZILLA 7111): New regex engine cannot match beginning of empty string)
$(LI $(BUGZILLA 7138): Can't call array() on dirEntries) $(LI $(BUGZILLA 7138): Can't call array() on dirEntries)
$(LI $(BUGZILLA 7264): Can't iterate result from 4-arg dirEntries as string) $(LI $(BUGZILLA 7264): Can't iterate result from 4-arg dirEntries as string)
$(LI $(BUGZILLA 7299): std.uni missing doc comments) $(LI $(BUGZILLA 7299): std.uni missing doc comments)
$(LI $(BUGZILLA 7300): std.regex.ShiftOr!dchar.search is broken)
$(LI $(BUGZILLA 7374): stdin.byLine() throws AssertError on empty input) $(LI $(BUGZILLA 7374): stdin.byLine() throws AssertError on empty input)
$(LI $(BUGZILLA 7628): std.format formatValue incorrect overload) $(LI $(BUGZILLA 7628): std.format formatValue incorrect overload)
$(LI $(BUGZILLA 7674): regex replace requires escaped format) $(LI $(BUGZILLA 7674): regex replace requires escaped format)

View file

@ -1406,7 +1406,7 @@ unittest
/// Ditto /// Ditto
T move(T)(ref T src) T move(T)(ref T src)
{ {
T result; T result=void;
move(src, result); move(src, result);
return result; return result;
} }
@ -8771,3 +8771,9 @@ unittest
//writeln(b[0]); //writeln(b[0]);
assert(b[0] == tuple(4.0, 2u)); assert(b[0] == tuple(4.0, 2u));
} }
unittest // Issue 6217
{
    // The range produced by map must be usable with the one-argument move.
    auto mapped = map!"a"([1,2,3]);
    mapped = move(mapped);
}

View file

@ -58,16 +58,6 @@ body
} }
} }
//ditto
@trusted void moveAllAlt(T)(T[] src, T[] dest)
{//moveAll is @system
if(__ctfe)
foreach(i,v; src)
dest[i] = v;
else
moveAll(src, dest);
}
//$(D Interval) represents an interval of codepoints: [a,b). //$(D Interval) represents an interval of codepoints: [a,b).
struct Interval struct Interval
{ {

View file

@ -5932,6 +5932,27 @@ unittest {
assert(equal(app.data, [1,2,3])); assert(equal(app.data, [1,2,3]));
} }
/**
Evaluates to $(D true) if $(D fn) can be called with one argument of type
$(D T1) and one argument of type $(D T2) in either order, and the two results
can be combined with $(D &&).
The following code should compile:
---
T1 t1;
T2 t2;
fn(t1, t2);
fn(t2, t1);
---
The check is done entirely at compile time: the probing function literal below
is only type-checked through $(D is(typeof(...))) and is never called.
*/
template isTwoWayCompatible(alias fn, T1, T2)
{
enum isTwoWayCompatible = is(typeof( (){
T1 e;
T2 v;
return fn(v,e) && fn(e,v);
}
));
}
/** /**
Policy used with the searching primitives $(D lowerBound), $(D Policy used with the searching primitives $(D lowerBound), $(D
upperBound), and $(D equalRange) of $(LREF SortedRange) below. upperBound), and $(D equalRange) of $(LREF SortedRange) below.
@ -6233,10 +6254,9 @@ if (isRandomAccessRange!Range)
---- ----
*/ */
auto lowerBound(SearchPolicy sp = SearchPolicy.binarySearch, V)(V value) auto lowerBound(SearchPolicy sp = SearchPolicy.binarySearch, V)(V value)
if (is(V : ElementType!Range)) if (isTwoWayCompatible!(predFun, ElementType!Range, V))
{ {
ElementType!Range v = value; return this[0 .. getTransitionIndex!(sp, geq)(value)];
return this[0 .. getTransitionIndex!(sp, geq)(v)];
} }
// upperBound // upperBound
@ -6257,10 +6277,9 @@ if (isRandomAccessRange!Range)
---- ----
*/ */
auto upperBound(SearchPolicy sp = SearchPolicy.binarySearch, V)(V value) auto upperBound(SearchPolicy sp = SearchPolicy.binarySearch, V)(V value)
if (is(V : ElementType!Range)) if (isTwoWayCompatible!(predFun, ElementType!Range, V))
{ {
ElementType!Range v = value; return this[getTransitionIndex!(sp, gt)(value) .. length];
return this[getTransitionIndex!(sp, gt)(v) .. length];
} }
// equalRange // equalRange
@ -6284,7 +6303,8 @@ if (isRandomAccessRange!Range)
assert(equal(r, [ 3, 3, 3 ])); assert(equal(r, [ 3, 3, 3 ]));
---- ----
*/ */
auto equalRange(V)(V value) if (is(V : ElementType!Range)) auto equalRange(V)(V value)
if (isTwoWayCompatible!(predFun, ElementType!Range, V))
{ {
size_t first = 0, count = _input.length; size_t first = 0, count = _input.length;
while (count > 0) while (count > 0)
@ -6339,7 +6359,8 @@ assert(equal(r[1], [ 3, 3, 3 ]));
assert(equal(r[2], [ 4, 4, 5, 6 ])); assert(equal(r[2], [ 4, 4, 5, 6 ]));
---- ----
*/ */
auto trisect(V)(V value) if (is(V : ElementType!Range)) auto trisect(V)(V value)
if (isTwoWayCompatible!(predFun, ElementType!Range, V))
{ {
size_t first = 0, count = _input.length; size_t first = 0, count = _input.length;
while (count > 0) while (count > 0)
@ -6445,6 +6466,19 @@ unittest
assert(equal(r[2], [ 40, 40, 50, 60 ])); assert(equal(r[2], [ 40, 40, 50, 60 ]));
} }
unittest
{
    // The needle may be a string of a different character width than the
    // elements; the predicate only has to accept both argument orders.
    auto words = [ "A", "AG", "B", "E", "F" ];

    auto parts = assumeSorted!"cmp(a,b) < 0"(words).trisect("B"w);
    assert(equal(parts[0], [ "A", "AG" ]));
    assert(equal(parts[1], [ "B" ]));
    assert(equal(parts[2], [ "E", "F" ]));

    parts = assumeSorted!"cmp(a,b) < 0"(words).trisect("A"d);
    assert(parts[0].empty);
    assert(equal(parts[1], [ "A" ]));
    assert(equal(parts[2], [ "AG", "B", "E", "F" ]));
}
unittest unittest
{ {
static void test(SearchPolicy pol)() static void test(SearchPolicy pol)()
@ -6536,6 +6570,8 @@ unittest
assert(equal(p, [0, 1, 2, 3, 4])); assert(equal(p, [0, 1, 2, 3, 4]));
p = assumeSorted(a).lowerBound(6); p = assumeSorted(a).lowerBound(6);
assert(equal(p, [ 0, 1, 2, 3, 4, 5])); assert(equal(p, [ 0, 1, 2, 3, 4, 5]));
p = assumeSorted(a).lowerBound(6.9);
assert(equal(p, [ 0, 1, 2, 3, 4, 5, 6]));
} }
unittest unittest
@ -6543,6 +6579,8 @@ unittest
int[] a = [ 1, 2, 3, 3, 3, 4, 4, 5, 6 ]; int[] a = [ 1, 2, 3, 3, 3, 4, 4, 5, 6 ];
auto p = assumeSorted(a).upperBound(3); auto p = assumeSorted(a).upperBound(3);
assert(equal(p, [4, 4, 5, 6 ])); assert(equal(p, [4, 4, 5, 6 ]));
p = assumeSorted(a).upperBound(4.2);
assert(equal(p, [ 5, 6 ]));
} }
unittest unittest
@ -6558,6 +6596,8 @@ unittest
assert(p.empty); assert(p.empty);
p = assumeSorted(a).equalRange(7); p = assumeSorted(a).equalRange(7);
assert(p.empty); assert(p.empty);
p = assumeSorted(a).equalRange(3.0);
assert(equal(p, [ 3, 3, 3]));
} }
unittest unittest

View file

@ -774,22 +774,12 @@ auto memoizeExpr(string expr)()
s.add(Interval(0,0x7f)); s.add(Interval(0,0x7f));
else else
{ {
version(fred_perfect_hashing) auto range = assumeSorted!((x,y) => ucmp(x.name, y.name) < 0)(unicodeProperties);
{
uint key = phash(name);
if(key >= PHASHNKEYS || ucmp(name,unicodeProperties[key].name) != 0)
enforce(0, "invalid property name");
s = cast(CodepointSet)unicodeProperties[key].set;
}
else
{
auto range = assumeSorted!((x,y){ return ucmp(x.name, y.name) < 0; })(unicodeProperties);
//creating empty Codepointset is a workaround //creating empty Codepointset is a workaround
auto eq = range.lowerBound(UnicodeProperty(cast(string)name,CodepointSet.init)).length; auto eq = range.lowerBound(UnicodeProperty(cast(string)name,CodepointSet.init)).length;
enforce(eq!=range.length && ucmp(name,range[eq].name)==0,"invalid property name"); enforce(eq!=range.length && ucmp(name,range[eq].name)==0,"invalid property name");
s = range[eq].set.dup; s = range[eq].set.dup;
} }
}
if(casefold) if(casefold)
s = caseEnclose(s); s = caseEnclose(s);
@ -873,15 +863,12 @@ struct Parser(R, bool CTFE=false)
if(isSomeString!S) if(isSomeString!S)
{ {
pat = origin = pattern; pat = origin = pattern;
//reserve slightly more than avg as sampled from unittests
if(!__ctfe) if(!__ctfe)
ir.reserve(pat.length); ir.reserve((pat.length*5+2)/4);
parseFlags(flags); parseFlags(flags);
_current = ' ';//a safe default for freeform parsing _current = ' ';//a safe default for freeform parsing
next(); next();
if(__ctfe)
parseRegex();
else
{
try try
{ {
parseRegex(); parseRegex();
@ -890,7 +877,6 @@ struct Parser(R, bool CTFE=false)
{ {
error(e.msg);//also adds pattern location error(e.msg);//also adds pattern location
} }
}
put(Bytecode(IR.End, 0)); put(Bytecode(IR.End, 0));
} }
@ -911,10 +897,8 @@ struct Parser(R, bool CTFE=false)
empty = true; empty = true;
return false; return false;
} }
//for CTFEability _current = pat.front;
size_t idx=0; pat.popFront();
_current = decode(pat, idx);
pat = pat[idx..$];
return true; return true;
} }
@ -1250,7 +1234,7 @@ struct Parser(R, bool CTFE=false)
default: default:
if(replace) if(replace)
{ {
moveAllAlt(ir[offset+1..$],ir[offset..$-1]); moveAll(ir[offset+1..$],ir[offset..$-1]);
ir.length -= 1; ir.length -= 1;
} }
return; return;
@ -1298,17 +1282,10 @@ struct Parser(R, bool CTFE=false)
counterDepth = std.algorithm.max(counterDepth, nesting+1); counterDepth = std.algorithm.max(counterDepth, nesting+1);
} }
else if(replace) else if(replace)
{
if(__ctfe)//CTFE workaround: no moveAll and length -= x;
{
ir = ir[0..offset] ~ ir[offset+1..$];
}
else
{ {
moveAll(ir[offset+1 .. $],ir[offset .. $-1]); moveAll(ir[offset+1 .. $],ir[offset .. $-1]);
ir.length -= 1; ir.length -= 1;
} }
}
put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len)); put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded"); enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded");
ir ~= ir[offset .. offset+len]; ir ~= ir[offset .. offset+len];
@ -2162,13 +2139,8 @@ private:
// //
/*
    Returns the group number stored in dict under the given name.

    dict is binary-searched by name with cmp, so it has to be sorted by
    NamedGroup.name in that same order.

    Throws (via enforce) with the message "no submatch named <name>" when
    no entry with that name exists.
*/
@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name)
{//equal is @system?
    auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length;
    //lowerBound can cover the whole range (name sorts after every entry,
    //or dict is empty), so check the index before touching dict[fnd]
    enforce(fnd < dict.length && equal(dict[fnd].name, name),
        text("no submatch named ", name));
    return dict[fnd].group;
}
@ -2766,7 +2738,7 @@ public:
// returns only valid UTF indexes // returns only valid UTF indexes
// (that given the haystack in question is valid UTF string) // (that given the haystack in question is valid UTF string)
@trusted size_t search(const(Char)[] haystack, size_t idx) @trusted size_t search(const(Char)[] haystack, size_t idx)
{ {//@BUG: apparently assumes little endian machines
assert(!empty); assert(!empty);
auto p = cast(const(ubyte)*)(haystack.ptr+idx); auto p = cast(const(ubyte)*)(haystack.ptr+idx);
uint state = uint.max; uint state = uint.max;
@ -2779,9 +2751,10 @@ public:
while(p != end) while(p != end)
{ {
if(!~state) if(!~state)
{ {//speed up seeking first matching place
for(;;) for(;;)
{ {
assert(p <= end, text(p," vs ", end));
p = cast(ubyte*)memchr(p, fChar, end - p); p = cast(ubyte*)memchr(p, fChar, end - p);
if(!p) if(!p)
return haystack.length; return haystack.length;
@ -2796,31 +2769,40 @@ public:
{ {
state = (state<<1) | table[p[1]]; state = (state<<1) | table[p[1]];
state = (state<<1) | table[p[2]]; state = (state<<1) | table[p[2]];
p += 3; p += 4;
} }
} else
//first char is already tested, see if that's all p++;
if(!(state & limit))//division rounds down for dchar //first char is tested, see if that's all
if(!(state & limit))
return (p-cast(ubyte*)haystack.ptr)/Char.sizeof return (p-cast(ubyte*)haystack.ptr)/Char.sizeof
-length+1; -length;
}
else
{//have some bits/states for possible matches,
//use the usual shift-or cycle
static if(charSize == 3) static if(charSize == 3)
{ {
state = (state<<1) | table[p[0]];
state = (state<<1) | table[p[1]]; state = (state<<1) | table[p[1]];
state = (state<<1) | table[p[2]]; state = (state<<1) | table[p[2]];
state = (state<<1) | table[p[3]];
p+=4; p+=4;
} }
else else
{ {
state = (state<<1) | table[p[1]]; state = (state<<1) | table[p[0]];
p++; p++;
} }
if(!(state & limit))
return (p-cast(ubyte*)haystack.ptr)/Char.sizeof
-length;
}
debug(fred_search) writefln("State: %32b", state); debug(fred_search) writefln("State: %32b", state);
} }
} }
else else
{ {
//in this path we have to shift first //normal path, partially unrolled for char/wchar
static if(charSize == 3) static if(charSize == 3)
{ {
const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length); const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length);
@ -4870,8 +4852,6 @@ enum OneShot { Fwd, Bwd };
if(is(Char : dchar)) if(is(Char : dchar))
{ {
alias Stream.DataIndex DataIndex; alias Stream.DataIndex DataIndex;
alias const(Char)[] String;
enum threadAllocSize = 16;
Thread!DataIndex* freelist; Thread!DataIndex* freelist;
ThreadList!DataIndex clist, nlist; ThreadList!DataIndex clist, nlist;
DataIndex[] merge; DataIndex[] merge;
@ -4978,7 +4958,6 @@ enum OneShot { Fwd, Bwd };
writeln("------------------------------------------"); writeln("------------------------------------------");
if(exhausted) if(exhausted)
{ {
return false; return false;
} }
if(re.flags & RegexInfo.oneShot) if(re.flags & RegexInfo.oneShot)
@ -5039,8 +5018,7 @@ enum OneShot { Fwd, Bwd };
break; break;
} }
} }
else
exhausted = true;
genCounter++; //increment also on each end genCounter++; //increment also on each end
debug(fred_matching) writefln("Threaded matching threads at end"); debug(fred_matching) writefln("Threaded matching threads at end");
//try out all zero-width possibilities //try out all zero-width possibilities
@ -5050,8 +5028,17 @@ enum OneShot { Fwd, Bwd };
} }
if(!matched) if(!matched)
eval!false(createStart(index), matches);//new thread starting at end of input eval!false(createStart(index), matches);//new thread starting at end of input
if(matched && !(re.flags & RegexOption.global)) if(matched)
exhausted = true; {//in case NFA found match along the way
//and last possible longer alternative ultimately failed
s.reset(matches[0].end);//reset to last successful match
next();//and reload front character
//--- here the exact state of stream was restored ---
exhausted = atEnd || !(re.flags & RegexOption.global);
//+ empty match advances the input
if(!exhausted && matches[0].begin == matches[0].end)
next();
}
return matched; return matched;
} }
@ -6278,6 +6265,24 @@ public:
@property ref captures(){ return this; } @property ref captures(){ return this; }
} }
unittest // verify example
{
    auto found = match("@abc#", regex(`(\w)(\w)(\w)`));
    auto caps = found.captures;

    assert(caps.pre == "@");   // part of input preceding the match
    assert(caps.post == "#");  // part of input immediately after the match
    assert(caps.hit == caps[0] && caps.hit == "abc"); // the whole match
    assert(caps[2] =="b");

    // captures also behaves as a range over the submatches
    assert(caps.front == "abc");
    caps.popFront();
    assert(caps.front == "a");
    assert(caps.back == "c");
    caps.popBack();
    assert(caps.back == "b");
    popFrontN(caps, 2);
    assert(caps.empty);
}
/++ /++
A regex engine state, as returned by $(D match) family of functions. A regex engine state, as returned by $(D match) family of functions.
@ -6397,9 +6402,19 @@ public:
Throws: $(D RegexException) if there were any errors during compilation. Throws: $(D RegexException) if there were any errors during compilation.
+/ +/
public auto regex(S)(S pattern, const(char)[] flags="") @trusted public auto regex(S)(S pattern, const(char)[] flags="")
if(isSomeString!(S)) if(isSomeString!(S))
{ {
enum cacheSize = 8; //TODO: invent nice interface to control regex caching
if(__ctfe)
return regexImpl(pattern, flags);
return memoize!(regexImpl!S, cacheSize)(pattern, flags);
}
public auto regexImpl(S)(S pattern, const(char)[] flags="")
if(isSomeString!(S))
{
alias Regex!(BasicElementOf!S) Reg;
if(!__ctfe) if(!__ctfe)
{ {
auto parser = Parser!(Unqual!(typeof(pattern)))(pattern, flags); auto parser = Parser!(Unqual!(typeof(pattern)))(pattern, flags);
@ -7228,6 +7243,7 @@ unittest
run_tests!match(); //thompson VM run_tests!match(); //thompson VM
} }
} }
version(fred_ct) version(fred_ct)
{ {
unittest unittest
@ -7424,6 +7440,11 @@ else
if(ch != '-') //'--' is an operator if(ch != '-') //'--' is an operator
assert(match(to!string(ch),regex(`[\`~ch~`-\`~ch~`]`))); assert(match(to!string(ch),regex(`[\`~ch~`-\`~ch~`]`)));
} }
//bugzilla 7718
string strcmd = "./myApp.rb -os OSX -path \"/GIT/Ruby Apps/sec\" -conf 'notimer'";
auto reStrCmd = regex (`(".*")|('.*')`, "g");
assert(equal(map!"a[0]"(matchFn(strcmd, reStrCmd)),
[`"/GIT/Ruby Apps/sec"`, `'notimer'`]));
} }
test_body!bmatch(); test_body!bmatch();
test_body!match(); test_body!match();
@ -7502,7 +7523,11 @@ else
} }
unittest unittest
{//bugzilla 7111 {//bugzilla 7111
assert(!match("", regex("^")).empty); assert(match("", regex("^")));
}
unittest
{//bugzilla 7300
assert(!match("a"d, "aa"d));
} }
unittest unittest
@ -7523,4 +7548,4 @@ else
} }
} }
} }//version(unittest)