Merge pull request #3883 from JakobOvrum/std_utf_docs

Consolidate documentation for std.utf.stride[Back]
2025-05-07 19:49:36 +03:00 · 2015-12-25 09:30:01 -05:00 · 2015-12-25 09:30:01 -05:00 · 4799bc6f45
commit 4799bc6f45
parent e828cf7b51 fc5d24566a
1 changed files with 212 additions and 277 deletions
--- a/std/utf.d
+++ b/std/utf.d
@ -189,14 +189,20 @@ package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
 }
 /++
-    Returns whether $(D c) is a valid UTF-32 character.
+    Check whether the given Unicode code point is valid.
    Params:
        c = code point to check
    Returns:
        $(D true) iff $(D c) is a valid Unicode code point
    Note:
    $(D '\uFFFE') and $(D '\uFFFF') are considered valid by $(D isValidDchar),
    as they are permitted for internal use by an application, but they are
    not allowed for interchange by the Unicode standard.
  +/
-@safe
+bool isValidDchar(dchar c) pure nothrow @safe @nogc
 pure nothrow bool isValidDchar(dchar c) @nogc
 {
    /* Note: FFFE and FFFF are specifically permitted by the
     * Unicode standard for application internal use, but are not
@ -208,7 +214,7 @@ pure nothrow bool isValidDchar(dchar c) @nogc
          (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
 }
-unittest
+pure nothrow @safe @nogc unittest
 {
    import std.exception;
    debug(utf) printf("utf.isValidDchar.unittest\n");
@ -232,26 +238,26 @@ unittest
 /++
-    $(D stride) returns the length of the UTF-8 sequence starting at $(D index)
+    Calculate the length of the UTF sequence starting at $(D index)
    in $(D str).
-    $(D stride) works with both UTF-8 strings and ranges of $(D char). If no
+    Params:
-    index is passed, then an input range will work, but if an index is passed,
+        str = input range of UTF code units. Must be random access if
-    then a random-access range is required.
+        $(D index) is passed
-
+        index = starting index of UTF sequence (default: $(D 0))
    $(D index) defaults to $(D 0) if none is passed.
    Returns:
-        The number of bytes in the UTF-8 sequence, a value between 1 and 4
+        The number of code units in the UTF sequence. For UTF-8, this is a
-        (as per $(WEB tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
+        value between 1 and 4 (as per $(WEB tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
        For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
    Throws:
        May throw a $(D UTFException) if $(D str[index]) is not the start of a
-        valid UTF-8 sequence.
+        valid UTF sequence.
-    Notes:
+    Note:
        $(D stride) will only analyze the first $(D str[index]) element. It
-        will not fully verify the validity of UTF-8 sequence, nor even verify
+        will not fully verify the validity of the UTF sequence, nor even verify
        the presence of the sequence: it will not actually guarantee that
        $(D index + stride(str, index) <= str.length).
  +/
@ -377,28 +383,202 @@ unittest // invalid start bytes
        assertThrown!UTFException(stride([c]));
 }
 /// Ditto
 uint stride(S)(auto ref S str, size_t index)
    if (is(S : const wchar[]) ||
        (isRandomAccessRange!S && is(Unqual!(ElementType!S) == wchar)))
 {
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");
    immutable uint u = str[index];
    return 1 + (u >= 0xD800 && u <= 0xDBFF);
 }
 /// Ditto
 uint stride(S)(auto ref S str) @safe pure
    if (is(S : const wchar[]))
 {
    return stride(str, 0);
 }
 /// Ditto
 uint stride(S)(auto ref S str)
    if (isInputRange!S && is(Unqual!(ElementType!S) == wchar))
 {
    assert(!str.empty, "UTF-16 sequence is empty");
    immutable uint u = str.front;
    return 1 + (u >= 0xD800 && u <= 0xDBFF);
 }
 unittest
 {
    import std.conv : to;
    import std.exception;
    import std. string : format;
    import core.exception : AssertError;
    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
        enforce(stride(RandomCU!wchar(s), i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
        if (i == 0)
        {
            enforce(stride(s) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
            enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
            auto refBidir = new RefBidirCU!wchar(s);
            immutable bidirLen = refBidir.length;
            enforce(stride(refBidir) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }
    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 7);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 8);
    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
 }
 /// Ditto
 uint stride(S)(auto ref S str, size_t index = 0)
    if (is(S : const dchar[]) ||
        (isInputRange!S && is(Unqual!(ElementEncodingType!S) == dchar)))
 {
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-32 sequence");
    else
        assert(!str.empty, "UTF-32 sequence is empty.");
    return 1;
 }
 unittest
 {
    import std.conv : to;
    import std.exception;
    import std. string : format;
    import core.exception : AssertError;
    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
        enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
        if (i == 0)
        {
            enforce(stride(s) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
            enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
            auto refBidir = new RefBidirCU!dchar(s);
            immutable bidirLen = refBidir.length;
            enforce(stride(refBidir) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }
    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 6);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 7);
    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
 }
 /++
-    $(D strideBack) returns the length of the UTF-8 sequence ending one code
+    Calculate the length of the UTF sequence ending one code unit before
-    unit before $(D index) in $(D str).
+    $(D index) in $(D str).
-    $(D strideBack) works with both UTF-8 strings and bidirectional ranges of
+    Params:
-    $(D char). If no index is passed, then a bidirectional range will work, but
+        str = bidirectional range of UTF code units. Must be random access if
-    if an index is passed, then a random-access range is required.
+        $(D index) is passed
-
+        index = index one past end of UTF sequence (default: $(D str.length))
    $(D index) defaults to $(D str.length) if none is passed.
    Returns:
-        The number of bytes in the UTF-8 sequence.
+        The number of code units in the UTF sequence. For UTF-8, this is a
        value between 1 and 4 (as per $(WEB tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
        For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
    Throws:
        May throw a $(D UTFException) if $(D str[index]) is not one past the
-        end of a valid UTF-8 sequence.
+        end of a valid UTF sequence.
-    Notes:
+    Note:
-        $(D strideBack) will not fully verify the validity of the UTF-8
+        $(D strideBack) will only analyze the element at $(D str[index - 1])
-        sequence. It will, however, guarantee that
+        element. It will not fully verify the validity of the UTF sequence, nor
-        $(D index - stride(str, index)) is a valid index.
+        even verify the presence of the sequence: it will not actually
        guarantee that $(D strideBack(str, index) <= index).
  +/
 uint strideBack(S)(auto ref S str, size_t index)
    if (is(S : const char[]) ||
@ -438,6 +618,7 @@ uint strideBack(S)(auto ref S str)
    return strideBack(str, str.length);
 }
 /// Ditto
 uint strideBack(S)(auto ref S str)
    if (isBidirectionalRange!S && is(Unqual!(ElementType!S) == char) && !isRandomAccessRange!S)
 {
@ -521,148 +702,9 @@ unittest
    });
 }
 /++
    $(D stride) returns the length of the UTF-16 sequence starting at $(D index)
    in $(D str).
    $(D stride) works with both UTF-16 strings and ranges of $(D wchar). If no
    index is passed, then an input range will work, but if an index is passed,
    then a random-access range is required.
    $(D index) defaults to $(D 0) if none is passed.
    Returns:
        The number of bytes in the UTF-16 sequence.
    Throws:
        May throw a $(D UTFException) if $(D str[index]) is not the start of a
        valid UTF-16 sequence.
    Notes:
        $(D stride) will only analyze the first $(D str[index]) element. It
        will not fully verify the validity of UTF-16 sequence, nor even verify
        the presence of the sequence: it will not actually guarantee that
        $(D index + stride(str, index) <= str.length).
  +/
 uint stride(S)(auto ref S str, size_t index)
    if (is(S : const wchar[]) ||
        (isRandomAccessRange!S && is(Unqual!(ElementType!S) == wchar)))
 {
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");
    immutable uint u = str[index];
    return 1 + (u >= 0xD800 && u <= 0xDBFF);
 }
 /// Ditto
 uint stride(S)(auto ref S str) @safe pure
    if (is(S : const wchar[]))
 {
    return stride(str, 0);
 }
 uint stride(S)(auto ref S str)
    if (isInputRange!S && is(Unqual!(ElementType!S) == wchar))
 {
    assert(!str.empty, "UTF-16 sequence is empty");
    immutable uint u = str.front;
    return 1 + (u >= 0xD800 && u <= 0xDBFF);
 }
@trusted unittest
 {
    import std.conv : to;
    import std.exception;
    import std. string : format;
    import core.exception : AssertError;
    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
        enforce(stride(RandomCU!wchar(s), i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
        if (i == 0)
        {
            enforce(stride(s) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
            enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
            auto refBidir = new RefBidirCU!wchar(s);
            immutable bidirLen = refBidir.length;
            enforce(stride(refBidir) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }
    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 7);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 8);
    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
 }
 /++
    $(D strideBack) returns the length of the UTF-16 sequence ending one code
    unit before $(D index) in $(D str).
    $(D strideBack) works with both UTF-16 strings and ranges of $(D wchar). If
    no index is passed, then a bidirectional range will work, but if an index is
    passed, then a random-access range is required.
    $(D index) defaults to $(D str.length) if none is passed.
    Returns:
        The number of bytes in the UTF-16 sequence.
    Throws:
        May throw a $(D UTFException) if $(D str[index]) is not one past the
        end of a valid UTF-16 sequence.
    Notes:
        $(D stride) will only analyze the element at $(D str[index - 1])
        element. It will not fully verify the validity of UTF-16 sequence, nor
        even verify the presence of the sequence: it will not actually
        guarantee that $(D stride(str, index) <= index).
  +/
 //UTF-16 is self synchronizing: The length of strideBack can be found from
 //the value of a single wchar
 /// Ditto
 uint strideBack(S)(auto ref S str, size_t index)
    if (is(S : const wchar[]) ||
        (isRandomAccessRange!S && is(Unqual!(ElementType!S) == wchar)))
@ -757,114 +799,7 @@ unittest
    });
 }
-
+/// Ditto
 /++
    $(D stride) returns the length of the UTF-32 sequence starting at $(D index)
    in $(D str).
    $(D stride) works with both UTF-32 strings and ranges of $(D dchar).
    Returns:
        The number of bytes in the UTF-32 sequence (always $(D 1)).
    Throws:
        Never.
  +/
 uint stride(S)(auto ref S str, size_t index = 0)
    if (is(S : const dchar[]) ||
        (isInputRange!S && is(Unqual!(ElementEncodingType!S) == dchar)))
 {
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-32 sequence");
    else
        assert(!str.empty, "UTF-32 sequence is empty.");
    return 1;
 }
 unittest
 {
    import std.conv : to;
    import std.exception;
    import std. string : format;
    import core.exception : AssertError;
    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
        enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
        if (i == 0)
        {
            enforce(stride(s) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
            enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
            auto refBidir = new RefBidirCU!dchar(s);
            immutable bidirLen = refBidir.length;
            enforce(stride(refBidir) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }
    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 6);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 7);
    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
 }
 /++
    $(D strideBack) returns the length of the UTF-32 sequence ending one code
    unit before $(D index) in $(D str).
    $(D strideBack) works with both UTF-32 strings and ranges of $(D dchar). If
    no index is passed, then a bidirectional range will work, but if an index is
    passed, then a random-access range is required.
    $(D index) defaults to $(D str.length) if none is passed.
    Returns:
        The number of bytes in the UTF-32 sequence (always $(D 1)).
    Throws:
        Never.
  +/
 uint strideBack(S)(auto ref S str, size_t index)
    if (isRandomAccessRange!S && is(Unqual!(ElementEncodingType!S) == dchar))
 {
@ -1041,7 +976,7 @@ unittest
 /* =================== Decode ======================= */
-/// Whether or not to replace invalid UTF with replacementDchar
+/// Whether or not to replace invalid UTF with $(LREF replacementDchar)
 alias UseReplacementDchar = Flag!"useReplacementDchar";
 /++