Merge pull request #462 from blackwhale/regex-fixes

fix Issue 7300 - std.regex.ShiftOr!dchar.search is broken
2025-05-01 15:40:36 +03:00 · 2012-03-30 06:26:15 -07:00 · 2012-03-30 06:26:15 -07:00 · 84a847cbe1
commit 84a847cbe1
parent 9273c74aa8 16bd8e423a
5 changed files with 162 additions and 98 deletions
--- a/changelog.dd
+++ b/changelog.dd
@ -2,14 +2,17 @@ $(VERSION 059, ddd mm, 2012, =================================================,

    $(LIBBUGSFIXED
        $(LI $(BUGZILLA 4604): A stack overflow with writeln)
+        $(LI $(BUGZILLA 5523): std.regex handles "\s" and "\W" (etc.) inside square brackets improperly)
        $(LI $(BUGZILLA 5674): AssertError in std.regex)
        $(LI $(BUGZILLA 5652): Add \p and \P unicode properties to std.regex)
        $(LI $(BUGZILLA 5964): std.stdio.readln can throw a UnicodeException)
+        $(LI $(BUGZILLA 6217): [GSOC] result of std.algorithm.map is not movable)
        $(LI $(BUGZILLA 6403): Upgrade std.regex to Unicode UTS #18 Level 1 support)
        $(LI $(BUGZILLA 7111): New regex engine cannot match beginning of empty string)
        $(LI $(BUGZILLA 7138): Can't call array() on dirEntries)
        $(LI $(BUGZILLA 7264): Can't iterate result from 4-arg dirEntries as string)
        $(LI $(BUGZILLA 7299): std.uni missing doc comments)
+        $(LI $(BUGZILLA 7300): std.regex.ShiftOr!dchar.search is broken)
        $(LI $(BUGZILLA 7374): stdin.byLine() throws AssertError on empty input)
        $(LI $(BUGZILLA 7628): std.format formatValue incorrect overload)
        $(LI $(BUGZILLA 7674): regex replace requires escaped format)
--- a/std/algorithm.d
+++ b/std/algorithm.d
@ -1406,7 +1406,7 @@ unittest
 /// Ditto
 T move(T)(ref T src)
 {
-    T result;
+    T result=void;
    move(src, result);
    return result;
 }
@ -8771,3 +8771,9 @@ unittest
    //writeln(b[0]);
    assert(b[0] == tuple(4.0, 2u));
 }
+
+unittest//Issue 6217 
+{
+    auto x = map!"a"([1,2,3]);
+    x = move(x);
+}
--- a/std/internal/uni.d
+++ b/std/internal/uni.d
@ -58,16 +58,6 @@ body
    }
 }

-//ditto
-@trusted void moveAllAlt(T)(T[] src, T[] dest)
-{//moveAll is @system
-    if(__ctfe)
-        foreach(i,v; src)
-            dest[i] = v;
-    else
-        moveAll(src, dest);
-}
-
 //$(D Interval)  represents an interval of codepoints: [a,b).
 struct Interval
 {
--- a/std/range.d
+++ b/std/range.d
@ -5932,6 +5932,27 @@ unittest {
    assert(equal(app.data, [1,2,3]));
 }

+/**
+  Returns true if $(D fn) accepts variables of type T1 and T2 in any order.
+  The following code should compile:
+  ---
+  T1 t1; 
+  T2 t2;
+  fn(t1, t2);
+  fn(t2, t1);
+  ---
+*/
+template isTwoWayCompatible(alias fn, T1, T2)
+{
+    enum isTwoWayCompatible = is(typeof( (){ 
+            T1 e; 
+            T2 v;
+            return fn(v,e) && fn(e,v); 
+        }
+    ));
+}
+
+
 /**
   Policy used with the searching primitives $(D lowerBound), $(D
   upperBound), and $(D equalRange) of $(LREF SortedRange) below.
@ -6233,10 +6254,9 @@ if (isRandomAccessRange!Range)
   ----
 */
    auto lowerBound(SearchPolicy sp = SearchPolicy.binarySearch, V)(V value)
-    if (is(V : ElementType!Range))
+    if (isTwoWayCompatible!(predFun, ElementType!Range, V))
    {
-        ElementType!Range v = value;
-        return this[0 .. getTransitionIndex!(sp, geq)(v)];
+        return this[0 .. getTransitionIndex!(sp, geq)(value)];
    }

 // upperBound
@ -6257,10 +6277,9 @@ if (isRandomAccessRange!Range)
   ----
 */
    auto upperBound(SearchPolicy sp = SearchPolicy.binarySearch, V)(V value)
-    if (is(V : ElementType!Range))
+    if (isTwoWayCompatible!(predFun, ElementType!Range, V))
    {
-        ElementType!Range v = value;
-        return this[getTransitionIndex!(sp, gt)(v) .. length];
+        return this[getTransitionIndex!(sp, gt)(value) .. length];
    }

 // equalRange
@ -6284,7 +6303,8 @@ if (isRandomAccessRange!Range)
   assert(equal(r, [ 3, 3, 3 ]));
   ----
 */
-    auto equalRange(V)(V value) if (is(V : ElementType!Range))
+    auto equalRange(V)(V value)
+    if (isTwoWayCompatible!(predFun, ElementType!Range, V))
    {
        size_t first = 0, count = _input.length;
        while (count > 0)
@ -6339,7 +6359,8 @@ assert(equal(r[1], [ 3, 3, 3 ]));
 assert(equal(r[2], [ 4, 4, 5, 6 ]));
 ----
 */
-    auto trisect(V)(V value) if (is(V : ElementType!Range))
+    auto trisect(V)(V value)
+    if (isTwoWayCompatible!(predFun, ElementType!Range, V))
    {
        size_t first = 0, count = _input.length;
        while (count > 0)
@ -6445,6 +6466,19 @@ unittest
    assert(equal(r[2], [ 40, 40, 50, 60 ]));
 }

+unittest
+{
+    auto a = [ "A", "AG", "B", "E", "F" ];
+    auto r = assumeSorted!"cmp(a,b) < 0"(a).trisect("B"w);
+    assert(equal(r[0], [ "A", "AG" ]));
+    assert(equal(r[1], [ "B" ]));
+    assert(equal(r[2], [ "E", "F" ]));
+    r = assumeSorted!"cmp(a,b) < 0"(a).trisect("A"d);
+    assert(r[0].empty);
+    assert(equal(r[1], [ "A" ]));
+    assert(equal(r[2], [ "AG", "B", "E", "F" ]));
+}
+
 unittest
 {
    static void test(SearchPolicy pol)()
@ -6536,6 +6570,8 @@ unittest
    assert(equal(p, [0, 1, 2, 3, 4]));
    p = assumeSorted(a).lowerBound(6);
    assert(equal(p, [ 0, 1, 2, 3, 4, 5]));
+    p = assumeSorted(a).lowerBound(6.9);
+    assert(equal(p, [ 0, 1, 2, 3, 4, 5, 6]));
 }

 unittest
@ -6543,6 +6579,8 @@ unittest
    int[] a = [ 1, 2, 3, 3, 3, 4, 4, 5, 6 ];
    auto p = assumeSorted(a).upperBound(3);
    assert(equal(p, [4, 4, 5, 6 ]));
+    p = assumeSorted(a).upperBound(4.2);
+    assert(equal(p, [ 5, 6 ]));
 }

 unittest
@ -6558,6 +6596,8 @@ unittest
    assert(p.empty);
    p = assumeSorted(a).equalRange(7);
    assert(p.empty);
+    p = assumeSorted(a).equalRange(3.0);
+    assert(equal(p, [ 3, 3, 3]));
 }

 unittest
--- a/std/regex.d
+++ b/std/regex.d
@ -774,22 +774,12 @@ auto memoizeExpr(string expr)()
        s.add(Interval(0,0x7f));
    else
    {
-        version(fred_perfect_hashing)
-        {
-            uint key = phash(name);
-            if(key >= PHASHNKEYS || ucmp(name,unicodeProperties[key].name) != 0)
-                enforce(0, "invalid property name");
-            s = cast(CodepointSet)unicodeProperties[key].set;
-        }
-        else
-        {
-            auto range = assumeSorted!((x,y){ return ucmp(x.name, y.name) < 0; })(unicodeProperties);
+        auto range = assumeSorted!((x,y) => ucmp(x.name, y.name) < 0)(unicodeProperties);
        //creating empty Codepointset is a workaround
        auto eq = range.lowerBound(UnicodeProperty(cast(string)name,CodepointSet.init)).length;
        enforce(eq!=range.length && ucmp(name,range[eq].name)==0,"invalid property name");
        s = range[eq].set.dup;
    }
-    }

    if(casefold)
        s = caseEnclose(s);
@ -873,15 +863,12 @@ struct Parser(R, bool CTFE=false)
        if(isSomeString!S)
    {
        pat = origin = pattern;
+        //reserve slightly more than avg as sampled from unittests
        if(!__ctfe)
-            ir.reserve(pat.length);
+            ir.reserve((pat.length*5+2)/4);
        parseFlags(flags);
        _current = ' ';//a safe default for freeform parsing
        next();
-        if(__ctfe)
-            parseRegex();
-        else
-        {
        try
        {
            parseRegex();
@ -890,7 +877,6 @@ struct Parser(R, bool CTFE=false)
        {
            error(e.msg);//also adds pattern location
        }
-        }
        put(Bytecode(IR.End, 0));
    }

@ -911,10 +897,8 @@ struct Parser(R, bool CTFE=false)
            empty =  true;
            return false;
        }
-        //for CTFEability
-        size_t idx=0;
-        _current = decode(pat, idx);
-        pat = pat[idx..$];
+        _current = pat.front;
+        pat.popFront();
        return true;
    }

@ -1250,7 +1234,7 @@ struct Parser(R, bool CTFE=false)
        default:
            if(replace)
            {
-                moveAllAlt(ir[offset+1..$],ir[offset..$-1]);
+                moveAll(ir[offset+1..$],ir[offset..$-1]);
                ir.length -= 1;
            }
            return;
@ -1298,17 +1282,10 @@ struct Parser(R, bool CTFE=false)
                counterDepth = std.algorithm.max(counterDepth, nesting+1);
            }
            else if(replace)
-            {
-                if(__ctfe)//CTFE workaround: no moveAll and length -= x;
-                {
-                    ir = ir[0..offset] ~ ir[offset+1..$];
-                }
-                else
            {
                moveAll(ir[offset+1 .. $],ir[offset .. $-1]);
                ir.length -= 1;
            }
-            }
            put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
            enforce(ir.length + len < maxCompiledLength,  "maximum compiled pattern length is exceeded");
            ir ~= ir[offset .. offset+len];
@ -2162,13 +2139,8 @@ private:
 //
@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name)
 {//equal is @system?
-    //@@@BUG@@@ assumeSorted kills "-inline"
-    //auto fnd = assumeSorted(map!"a.name"(dict)).lowerBound(name).length;
-    uint fnd;
-    for(fnd = 0; fnd<dict.length; fnd++)
-        if(equal(dict[fnd].name,name))
-            break;
-    enforce(fnd < dict.length, text("no submatch named ", name));
+    auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length;
+    enforce(fnd < dict.length && equal(dict[fnd].name, name), text("no submatch named ", name));//check bounds first: fnd == dict.length when name sorts after every entry
     return dict[fnd].group;
 }

@ -2766,7 +2738,7 @@ public:
    // returns only valid UTF indexes
    // (that given the haystack in question is valid UTF string)
    @trusted size_t search(const(Char)[] haystack, size_t idx)
-    {
+    {//@BUG: apparently assumes little endian machines
        assert(!empty);
        auto p = cast(const(ubyte)*)(haystack.ptr+idx);
        uint state = uint.max;
@ -2779,9 +2751,10 @@ public:
            while(p != end)
            {
                if(!~state)
-                {
+                {//speed up seeking first matching place
                    for(;;)
                    {
+                        assert(p <= end, text(p," vs ", end));
                        p = cast(ubyte*)memchr(p, fChar, end - p);
                        if(!p)
                            return haystack.length;
@ -2796,31 +2769,40 @@ public:
                    {
                        state = (state<<1) | table[p[1]];
                        state = (state<<1) | table[p[2]];
-                        p += 3;
+                        p += 4;
                    }
-                }
-                //first char is already tested, see if that's all
-                if(!(state & limit))//division rounds down for dchar
+                    else
+                        p++;
+                    //first char is tested, see if that's all
+                    if(!(state & limit))
                        return (p-cast(ubyte*)haystack.ptr)/Char.sizeof
-                        -length+1;
+                            -length;
+                }
+                else
+                {//have some bits/states for possible matches,
+                 //use the usual shift-or cycle
                    static if(charSize == 3)
                    {
+                        state = (state<<1) | table[p[0]];
                        state = (state<<1) | table[p[1]];
                        state = (state<<1) | table[p[2]];
-                    state = (state<<1) | table[p[3]];
                        p+=4;
                    }
                    else
                    {
-                    state = (state<<1) | table[p[1]];
+                        state = (state<<1) | table[p[0]];
                        p++;
                    }
+                    if(!(state & limit))
+                        return (p-cast(ubyte*)haystack.ptr)/Char.sizeof
+                            -length;
+                }
                debug(fred_search) writefln("State: %32b", state);
            }
        }
        else
        {
-            //in this path we have to shift first
+            //normal path, partially unrolled for char/wchar
            static if(charSize == 3)
            {
                const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length);
@ -4870,8 +4852,6 @@ enum OneShot { Fwd, Bwd };
    if(is(Char : dchar))
 {
    alias Stream.DataIndex DataIndex;
-    alias const(Char)[] String;
-    enum threadAllocSize = 16;
    Thread!DataIndex* freelist;
    ThreadList!DataIndex clist, nlist;
    DataIndex[] merge;
@ -4978,7 +4958,6 @@ enum OneShot { Fwd, Bwd };
            writeln("------------------------------------------");
        if(exhausted)
        {
-
            return false;
        }
        if(re.flags & RegexInfo.oneShot)
@ -5039,8 +5018,7 @@ enum OneShot { Fwd, Bwd };
                    break;
                }
            }
-        else
-            exhausted = true;
+
        genCounter++; //increment also on each end
        debug(fred_matching) writefln("Threaded matching threads at end");
        //try out all zero-width posibilities
@ -5050,8 +5028,17 @@ enum OneShot { Fwd, Bwd };
        }
        if(!matched)
            eval!false(createStart(index), matches);//new thread starting at end of input
-        if(matched && !(re.flags & RegexOption.global))
-           exhausted = true;
+        if(matched)
+        {//in case NFA found match along the way
+         //and last possible longer alternative ultimately failed
+            s.reset(matches[0].end);//reset to last successful match
+            next();//and reload front character
+            //--- here the exact state of stream was restored ---
+            exhausted = atEnd || !(re.flags & RegexOption.global);
+            //+ empty match advances the input
+            if(!exhausted && matches[0].begin == matches[0].end)
+                next(); 
+        }
        return matched;
    }

@ -6278,6 +6265,24 @@ public:
    @property ref captures(){ return this; }
 }

+unittest//verify example
+{
+    auto m = match("@abc#", regex(`(\w)(\w)(\w)`));
+    auto c = m.captures;
+    assert(c.pre == "@");// part of input preceding match
+    assert(c.post == "#"); // immediately after match
+    assert(c.hit == c[0] && c.hit == "abc");// the whole match
+    assert(c[2] =="b");
+    assert(c.front == "abc");
+    c.popFront();
+    assert(c.front == "a");
+    assert(c.back == "c");
+    c.popBack();
+    assert(c.back == "b");
+    popFrontN(c, 2);
+    assert(c.empty);
+}
+
 /++
    A regex engine state, as returned by $(D match) family of functions.

@ -6397,9 +6402,19 @@ public:

    Throws: $(D RegexException) if there were any errors during compilation.
 +/
-public auto regex(S)(S pattern, const(char)[] flags="")
+@trusted public auto regex(S)(S pattern, const(char)[] flags="")
    if(isSomeString!(S))
 {
+    enum cacheSize = 8; //TODO: invent nice interface to control regex caching
+    if(__ctfe)
+        return regexImpl(pattern, flags);
+    return memoize!(regexImpl!S, cacheSize)(pattern, flags);
+}
+
+public auto regexImpl(S)(S pattern, const(char)[] flags="")
+    if(isSomeString!(S))
+{
+    alias Regex!(BasicElementOf!S) Reg;
    if(!__ctfe)
    {
        auto parser = Parser!(Unqual!(typeof(pattern)))(pattern, flags);
@ -7228,6 +7243,7 @@ unittest
        run_tests!match(); //thompson VM
    }
 }
+
 version(fred_ct)
 {
    unittest
@ -7424,6 +7440,11 @@ else
                if(ch != '-') //'--' is an operator
                    assert(match(to!string(ch),regex(`[\`~ch~`-\`~ch~`]`)));
            }
+            //bugzilla 7718
+            string strcmd = "./myApp.rb -os OSX -path \"/GIT/Ruby Apps/sec\" -conf 'notimer'";
+            auto reStrCmd = regex (`(".*")|('.*')`, "g");
+            assert(equal(map!"a[0]"(matchFn(strcmd, reStrCmd)),
+                         [`"/GIT/Ruby Apps/sec"`, `'notimer'`]));
        }
        test_body!bmatch();
        test_body!match();
@ -7502,7 +7523,11 @@ else
    }
    unittest
    {//bugzilla 7111
-        assert(!match("", regex("^")).empty);
+        assert(match("", regex("^")));
+    }
+    unittest
+    {//bugzilla 7300
+        assert(!match("a"d, "aa"d));
    }    

    unittest
@ -7523,4 +7548,4 @@ else
    }
 }

-}
+}//version(unittest)