Merge pull request #462 from blackwhale/regex-fixes

fix Issue 7300 - std.regex.ShiftOr!dchar.search is broken
2025-05-01 15:40:36 +03:00 · 2012-03-30 06:26:15 -07:00 · 2012-03-30 06:26:15 -07:00 · 84a847cbe1
commit 84a847cbe1
parent 9273c74aa8 16bd8e423a
5 changed files with 162 additions and 98 deletions
--- a/changelog.dd
+++ b/changelog.dd
@ -2,14 +2,17 @@ $(VERSION 059, ddd mm, 2012, =================================================,
    $(LIBBUGSFIXED
        $(LI $(BUGZILLA 4604): A stack overflow with writeln)
        $(LI $(BUGZILLA 5523): std.regex handles "\s" and "\W" (etc.) inside square brackets improperly)
        $(LI $(BUGZILLA 5674): AssertError in std.regex)
        $(LI $(BUGZILLA 5652): Add \p and \P unicode properties to std.regex)
        $(LI $(BUGZILLA 5964): std.stdio.readln can throw a UnicodeException)
        $(LI $(BUGZILLA 6217): [GSOC] result of std.algorithm.map is not movable)
        $(LI $(BUGZILLA 6403): Upgrade std.regex to Unicode UTS #18 Level 1 support)
        $(LI $(BUGZILLA 7111): New regex engine cannot match beginning of empty string)
        $(LI $(BUGZILLA 7138): Can't call array() on dirEntries)
        $(LI $(BUGZILLA 7264): Can't iterate result from 4-arg dirEntries as string)
        $(LI $(BUGZILLA 7299): std.uni missing doc comments)
        $(LI $(BUGZILLA 7300): std.regex.ShiftOr!dchar.search is broken)
        $(LI $(BUGZILLA 7374): stdin.byLine() throws AssertError on empty input)
        $(LI $(BUGZILLA 7628): std.format formatValue incorrect overload)
        $(LI $(BUGZILLA 7674): regex replace requires escaped format)
--- a/std/algorithm.d
+++ b/std/algorithm.d
@ -1406,7 +1406,7 @@ unittest
 /// Ditto
 T move(T)(ref T src)
 {
-    T result;
+    T result=void;
    move(src, result);
    return result;
 }
@ -8771,3 +8771,9 @@ unittest
    //writeln(b[0]);
    assert(b[0] == tuple(4.0, 2u));
 }
 unittest//Issue 6217 
 {
    auto x = map!"a"([1,2,3]);
    x = move(x);
 }
--- a/std/internal/uni.d
+++ b/std/internal/uni.d
@ -58,16 +58,6 @@ body
    }
 }
 //ditto
@trusted void moveAllAlt(T)(T[] src, T[] dest)
 {//moveAll is @system
    if(__ctfe)
        foreach(i,v; src)
            dest[i] = v;
    else
        moveAll(src, dest);
 }
 //$(D Interval)  represents an interval of codepoints: [a,b).
 struct Interval
 {
--- a/std/range.d
+++ b/std/range.d
@ -5932,6 +5932,27 @@ unittest {
    assert(equal(app.data, [1,2,3]));
 }
 /**
  Returns true if $(D fn) accepts variables of type T1 and T2 in any order.
  The following code should compile:
  ---
  T1 t1; 
  T2 t2;
  fn(t1, t2);
  fn(t2, t1);
  ---
 */
 template isTwoWayCompatible(alias fn, T1, T2)
 {
    enum isTwoWayCompatible = is(typeof( (){ 
            T1 e; 
            T2 v;
            return fn(v,e) && fn(e,v); 
        }
    ));
 }
 /**
   Policy used with the searching primitives $(D lowerBound), $(D
   upperBound), and $(D equalRange) of $(LREF SortedRange) below.
@ -6233,10 +6254,9 @@ if (isRandomAccessRange!Range)
   ----
 */
    auto lowerBound(SearchPolicy sp = SearchPolicy.binarySearch, V)(V value)
-    if (is(V : ElementType!Range))
+    if (isTwoWayCompatible!(predFun, ElementType!Range, V))
    {
-        ElementType!Range v = value;
+        return this[0 .. getTransitionIndex!(sp, geq)(value)];
        return this[0 .. getTransitionIndex!(sp, geq)(v)];
    }
 // upperBound
@ -6257,10 +6277,9 @@ if (isRandomAccessRange!Range)
   ----
 */
    auto upperBound(SearchPolicy sp = SearchPolicy.binarySearch, V)(V value)
-    if (is(V : ElementType!Range))
+    if (isTwoWayCompatible!(predFun, ElementType!Range, V))
    {
-        ElementType!Range v = value;
+        return this[getTransitionIndex!(sp, gt)(value) .. length];
        return this[getTransitionIndex!(sp, gt)(v) .. length];
    }
 // equalRange
@ -6284,7 +6303,8 @@ if (isRandomAccessRange!Range)
   assert(equal(r, [ 3, 3, 3 ]));
   ----
 */
-    auto equalRange(V)(V value) if (is(V : ElementType!Range))
+    auto equalRange(V)(V value)
    if (isTwoWayCompatible!(predFun, ElementType!Range, V))
    {
        size_t first = 0, count = _input.length;
        while (count > 0)
@ -6339,7 +6359,8 @@ assert(equal(r[1], [ 3, 3, 3 ]));
 assert(equal(r[2], [ 4, 4, 5, 6 ]));
 ----
 */
-    auto trisect(V)(V value) if (is(V : ElementType!Range))
+    auto trisect(V)(V value)
    if (isTwoWayCompatible!(predFun, ElementType!Range, V))
    {
        size_t first = 0, count = _input.length;
        while (count > 0)
@ -6445,6 +6466,19 @@ unittest
    assert(equal(r[2], [ 40, 40, 50, 60 ]));
 }
 unittest
 {
    auto a = [ "A", "AG", "B", "E", "F" ];
    auto r = assumeSorted!"cmp(a,b) < 0"(a).trisect("B"w);
    assert(equal(r[0], [ "A", "AG" ]));
    assert(equal(r[1], [ "B" ]));
    assert(equal(r[2], [ "E", "F" ]));
    r = assumeSorted!"cmp(a,b) < 0"(a).trisect("A"d);
    assert(r[0].empty);
    assert(equal(r[1], [ "A" ]));
    assert(equal(r[2], [ "AG", "B", "E", "F" ]));
 }
 unittest
 {
    static void test(SearchPolicy pol)()
@ -6536,6 +6570,8 @@ unittest
    assert(equal(p, [0, 1, 2, 3, 4]));
    p = assumeSorted(a).lowerBound(6);
    assert(equal(p, [ 0, 1, 2, 3, 4, 5]));
    p = assumeSorted(a).lowerBound(6.9);
    assert(equal(p, [ 0, 1, 2, 3, 4, 5, 6]));
 }
 unittest
@ -6543,6 +6579,8 @@ unittest
    int[] a = [ 1, 2, 3, 3, 3, 4, 4, 5, 6 ];
    auto p = assumeSorted(a).upperBound(3);
    assert(equal(p, [4, 4, 5, 6 ]));
    p = assumeSorted(a).upperBound(4.2);
    assert(equal(p, [ 5, 6 ]));
 }
 unittest
@ -6558,6 +6596,8 @@ unittest
    assert(p.empty);
    p = assumeSorted(a).equalRange(7);
    assert(p.empty);
    p = assumeSorted(a).equalRange(3.0);
    assert(equal(p, [ 3, 3, 3]));
 }
 unittest
--- a/std/regex.d
+++ b/std/regex.d
@ -774,22 +774,12 @@ auto memoizeExpr(string expr)()
        s.add(Interval(0,0x7f));
    else
    {
-        version(fred_perfect_hashing)
+        auto range = assumeSorted!((x,y) => ucmp(x.name, y.name) < 0)(unicodeProperties);
        {
            uint key = phash(name);
            if(key >= PHASHNKEYS || ucmp(name,unicodeProperties[key].name) != 0)
                enforce(0, "invalid property name");
            s = cast(CodepointSet)unicodeProperties[key].set;
        }
        else
        {
            auto range = assumeSorted!((x,y){ return ucmp(x.name, y.name) < 0; })(unicodeProperties);
        //creating empty Codepointset is a workaround
        auto eq = range.lowerBound(UnicodeProperty(cast(string)name,CodepointSet.init)).length;
        enforce(eq!=range.length && ucmp(name,range[eq].name)==0,"invalid property name");
        s = range[eq].set.dup;
    }
    }
    if(casefold)
        s = caseEnclose(s);
@ -873,15 +863,12 @@ struct Parser(R, bool CTFE=false)
        if(isSomeString!S)
    {
        pat = origin = pattern;
        //reserve slightly more then avg as sampled from unittests
        if(!__ctfe)
-            ir.reserve(pat.length);
+            ir.reserve((pat.length*5+2)/4);
        parseFlags(flags);
        _current = ' ';//a safe default for freeform parsing
        next();
        if(__ctfe)
            parseRegex();
        else
        {
        try
        {
            parseRegex();
@ -890,7 +877,6 @@ struct Parser(R, bool CTFE=false)
        {
            error(e.msg);//also adds pattern location
        }
        }
        put(Bytecode(IR.End, 0));
    }
@ -911,10 +897,8 @@ struct Parser(R, bool CTFE=false)
            empty =  true;
            return false;
        }
-        //for CTFEability
+        _current = pat.front;
-        size_t idx=0;
+        pat.popFront();
        _current = decode(pat, idx);
        pat = pat[idx..$];
        return true;
    }
@ -1250,7 +1234,7 @@ struct Parser(R, bool CTFE=false)
        default:
            if(replace)
            {
-                moveAllAlt(ir[offset+1..$],ir[offset..$-1]);
+                moveAll(ir[offset+1..$],ir[offset..$-1]);
                ir.length -= 1;
            }
            return;
@ -1298,17 +1282,10 @@ struct Parser(R, bool CTFE=false)
                counterDepth = std.algorithm.max(counterDepth, nesting+1);
            }
            else if(replace)
            {
                if(__ctfe)//CTFE workaround: no moveAll and length -= x;
                {
                    ir = ir[0..offset] ~ ir[offset+1..$];
                }
                else
            {
                moveAll(ir[offset+1 .. $],ir[offset .. $-1]);
                ir.length -= 1;
            }
            }
            put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
            enforce(ir.length + len < maxCompiledLength,  "maximum compiled pattern length is exceeded");
            ir ~= ir[offset .. offset+len];
@ -2162,13 +2139,8 @@ private:
 //
@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name)
 {//equal is @system?
-    //@@@BUG@@@ assumeSorted kills "-inline"
+    auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length;
-    //auto fnd = assumeSorted(map!"a.name"(dict)).lowerBound(name).length;
+    enforce(equal(dict[fnd].name, name), text("no submatch named ", name));
    uint fnd;
    for(fnd = 0; fnd<dict.length; fnd++)
        if(equal(dict[fnd].name,name))
            break;
    enforce(fnd < dict.length, text("no submatch named ", name));
    return dict[fnd].group;
 }
@ -2766,7 +2738,7 @@ public:
    // returns only valid UTF indexes
    // (that given the haystack in question is valid UTF string)
    @trusted size_t search(const(Char)[] haystack, size_t idx)
-    {
+    {//@BUG: apparently assumes little endian machines
        assert(!empty);
        auto p = cast(const(ubyte)*)(haystack.ptr+idx);
        uint state = uint.max;
@ -2779,9 +2751,10 @@ public:
            while(p != end)
            {
                if(!~state)
-                {
+                {//speed up seeking first matching place
                    for(;;)
                    {
                        assert(p <= end, text(p," vs ", end));
                        p = cast(ubyte*)memchr(p, fChar, end - p);
                        if(!p)
                            return haystack.length;
@ -2796,31 +2769,40 @@ public:
                    {
                        state = (state<<1) | table[p[1]];
                        state = (state<<1) | table[p[2]];
-                        p += 3;
+                        p += 4;
                    }
-                }
+                    else
-                //first char is already tested, see if that's all
+                        p++;
-                if(!(state & limit))//division rounds down for dchar
+                    //first char is tested, see if that's all
                    if(!(state & limit))
                        return (p-cast(ubyte*)haystack.ptr)/Char.sizeof
-                        -length+1;
+                            -length;
                }
                else
                {//have some bits/states for possible matches,
                 //use the usual shift-or cycle
                    static if(charSize == 3)
                    {
                        state = (state<<1) | table[p[0]];
                        state = (state<<1) | table[p[1]];
                        state = (state<<1) | table[p[2]];
                    state = (state<<1) | table[p[3]];
                        p+=4;
                    }
                    else
                    {
-                    state = (state<<1) | table[p[1]];
+                        state = (state<<1) | table[p[0]];
                        p++;
                    }
                    if(!(state & limit))
                        return (p-cast(ubyte*)haystack.ptr)/Char.sizeof
                            -length;
                }
                debug(fred_search) writefln("State: %32b", state);
            }
        }
        else
        {
-            //in this path we have to shift first
+            //normal path, partially unrolled for char/wchar
            static if(charSize == 3)
            {
                const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length);
@ -4870,8 +4852,6 @@ enum OneShot { Fwd, Bwd };
    if(is(Char : dchar))
 {
    alias Stream.DataIndex DataIndex;
    alias const(Char)[] String;
    enum threadAllocSize = 16;
    Thread!DataIndex* freelist;
    ThreadList!DataIndex clist, nlist;
    DataIndex[] merge;
@ -4978,7 +4958,6 @@ enum OneShot { Fwd, Bwd };
            writeln("------------------------------------------");
        if(exhausted)
        {
            return false;
        }
        if(re.flags & RegexInfo.oneShot)
@ -5039,8 +5018,7 @@ enum OneShot { Fwd, Bwd };
                    break;
                }
            }
-        else
+
            exhausted = true;
        genCounter++; //increment also on each end
        debug(fred_matching) writefln("Threaded matching threads at end");
        //try out all zero-width posibilities
@ -5050,8 +5028,17 @@ enum OneShot { Fwd, Bwd };
        }
        if(!matched)
            eval!false(createStart(index), matches);//new thread starting at end of input
-        if(matched && !(re.flags & RegexOption.global))
+        if(matched)
-           exhausted = true;
+        {//in case NFA found match along the way
         //and last possible longer alternative ultimately failed
            s.reset(matches[0].end);//reset to last successful match
            next();//and reload front character
            //--- here the exact state of stream was restored ---
            exhausted = atEnd || !(re.flags & RegexOption.global);
            //+ empty match advances the input
            if(!exhausted && matches[0].begin == matches[0].end)
                next(); 
        }
        return matched;
    }
@ -6278,6 +6265,24 @@ public:
    @property ref captures(){ return this; }
 }
 unittest//verify example
 {
    auto m = match("@abc#", regex(`(\w)(\w)(\w)`));
    auto c = m.captures;
    assert(c.pre == "@");// part of input preceeding match
    assert(c.post == "#"); // immediately after match
    assert(c.hit == c[0] && c.hit == "abc");// the whole match
    assert(c[2] =="b");
    assert(c.front == "abc");
    c.popFront();
    assert(c.front == "a");
    assert(c.back == "c");
    c.popBack();
    assert(c.back == "b");
    popFrontN(c, 2);
    assert(c.empty);
 }
 /++
    A regex engine state, as returned by $(D match) family of functions.
@ -6397,9 +6402,19 @@ public:
    Throws: $(D RegexException) if there were any errors during compilation.
 +/
-public auto regex(S)(S pattern, const(char)[] flags="")
+@trusted public auto regex(S)(S pattern, const(char)[] flags="")
    if(isSomeString!(S))
 {
    enum cacheSize = 8; //TODO: invent nice interface to control regex caching
    if(__ctfe)
        return regexImpl(pattern, flags);
    return memoize!(regexImpl!S, cacheSize)(pattern, flags);
 }
 public auto regexImpl(S)(S pattern, const(char)[] flags="")
    if(isSomeString!(S))
 {
    alias Regex!(BasicElementOf!S) Reg;
    if(!__ctfe)
    {
        auto parser = Parser!(Unqual!(typeof(pattern)))(pattern, flags);
@ -7228,6 +7243,7 @@ unittest
        run_tests!match(); //thompson VM
    }
 }
 version(fred_ct)
 {
    unittest
@ -7424,6 +7440,11 @@ else
                if(ch != '-') //'--' is an operator
                    assert(match(to!string(ch),regex(`[\`~ch~`-\`~ch~`]`)));
            }
            //bugzilla 7718
            string strcmd = "./myApp.rb -os OSX -path \"/GIT/Ruby Apps/sec\" -conf 'notimer'";
            auto reStrCmd = regex (`(".*")|('.*')`, "g");
            assert(equal(map!"a[0]"(matchFn(strcmd, reStrCmd)),
                         [`"/GIT/Ruby Apps/sec"`, `'notimer'`]));
        }
        test_body!bmatch();
        test_body!match();
@ -7502,7 +7523,11 @@ else
    }
    unittest
    {//bugzilla 7111
-        assert(!match("", regex("^")).empty);
+        assert(match("", regex("^")));
    }
    unittest
    {//bugzilla 7300
        assert(!match("a"d, "aa"d));
    }    
    unittest
@ -7523,4 +7548,4 @@ else
    }
 }
-}
+}//version(unittest)