Merge pull request #3883 from JakobOvrum/std_utf_docs

Consolidate documentation for std.utf.stride[Back]
2025-05-03 08:30:33 +03:00 · 2015-12-25 09:30:01 -05:00 · 2015-12-25 09:30:01 -05:00 · 4799bc6f45
commit 4799bc6f45
parent e828cf7b51 fc5d24566a
1 changed files with 212 additions and 277 deletions
--- a/std/utf.d
+++ b/std/utf.d
@@ -189,14 +189,20 @@ package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
 }

 /++
-    Returns whether $(D c) is a valid UTF-32 character.
+    Check whether the given Unicode code point is valid.

+    Params:
+        c = code point to check
+
+    Returns:
+        $(D true) iff $(D c) is a valid Unicode code point
+
+    Note:
    $(D '\uFFFE') and $(D '\uFFFF') are considered valid by $(D isValidDchar),
    as they are permitted for internal use by an application, but they are
    not allowed for interchange by the Unicode standard.
  +/
-@safe
-pure nothrow bool isValidDchar(dchar c) @nogc
+bool isValidDchar(dchar c) pure nothrow @safe @nogc
 {
    /* Note: FFFE and FFFF are specifically permitted by the
     * Unicode standard for application internal use, but are not
@@ -208,7 +214,7 @@ pure nothrow bool isValidDchar(dchar c) @nogc
          (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
 }

-unittest
+pure nothrow @safe @nogc unittest
 {
    import std.exception;
    debug(utf) printf("utf.isValidDchar.unittest\n");
@@ -232,26 +238,26 @@ unittest


 /++
-    $(D stride) returns the length of the UTF-8 sequence starting at $(D index)
+    Calculate the length of the UTF sequence starting at $(D index)
    in $(D str).

-    $(D stride) works with both UTF-8 strings and ranges of $(D char). If no
-    index is passed, then an input range will work, but if an index is passed,
-    then a random-access range is required.
-
-    $(D index) defaults to $(D 0) if none is passed.
+    Params:
+        str = input range of UTF code units. Must be random access if
+        $(D index) is passed
+        index = starting index of UTF sequence (default: $(D 0))

    Returns:
-        The number of bytes in the UTF-8 sequence, a value between 1 and 4
-        (as per $(WEB tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
+        The number of code units in the UTF sequence. For UTF-8, this is a
+        value between 1 and 4 (as per $(WEB tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
+        For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.

    Throws:
        May throw a $(D UTFException) if $(D str[index]) is not the start of a
-        valid UTF-8 sequence.
+        valid UTF sequence.

-    Notes:
+    Note:
        $(D stride) will only analyze the first $(D str[index]) element. It
-        will not fully verify the validity of UTF-8 sequence, nor even verify
+        will not fully verify the validity of the UTF sequence, nor even verify
        the presence of the sequence: it will not actually guarantee that
        $(D index + stride(str, index) <= str.length).
  +/
@@ -377,28 +383,202 @@ unittest // invalid start bytes
        assertThrown!UTFException(stride([c]));
 }

+/// Ditto
+uint stride(S)(auto ref S str, size_t index)
+    if (is(S : const wchar[]) ||
+        (isRandomAccessRange!S && is(Unqual!(ElementType!S) == wchar)))
+{
+    static if (is(typeof(str.length) : ulong))
+        assert(index < str.length, "Past the end of the UTF-16 sequence");
+    immutable uint u = str[index];
+    return 1 + (u >= 0xD800 && u <= 0xDBFF);
+}
+
+/// Ditto
+uint stride(S)(auto ref S str) @safe pure
+    if (is(S : const wchar[]))
+{
+    return stride(str, 0);
+}
+
+/// Ditto
+uint stride(S)(auto ref S str)
+    if (isInputRange!S && is(Unqual!(ElementType!S) == wchar))
+{
+    assert(!str.empty, "UTF-16 sequence is empty");
+    immutable uint u = str.front;
+    return 1 + (u >= 0xD800 && u <= 0xDBFF);
+}
+
+unittest
+{
+    import std.conv : to;
+    import std.exception;
+    import std. string : format;
+    import core.exception : AssertError;
+    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
+    {
+        enforce(stride(s, i) == codeLength!wchar(c),
+                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
+
+        enforce(stride(RandomCU!wchar(s), i) == codeLength!wchar(c),
+                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
+
+        auto refRandom = new RefRandomCU!wchar(s);
+        immutable randLen = refRandom.length;
+        enforce(stride(refRandom, i) == codeLength!wchar(c),
+                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
+        enforce(refRandom.length == randLen,
+                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
+
+        if (i == 0)
+        {
+            enforce(stride(s) == codeLength!wchar(c),
+                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
+
+            enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c),
+                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
+
+            auto refBidir = new RefBidirCU!wchar(s);
+            immutable bidirLen = refBidir.length;
+            enforce(stride(refBidir) == codeLength!wchar(c),
+                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
+            enforce(refBidir.length == bidirLen,
+                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
+        }
+    }
+
+    assertCTFEable!(
+    {
+    test("a", 'a');
+    test(" ", ' ');
+    test("\u2029", '\u2029'); //paraSep
+    test("\u0100", '\u0100');
+    test("\u0430", '\u0430');
+    test("\U00010143", '\U00010143');
+    test("abcdefcdef", 'a');
+    test("hello\U00010143\u0100\U00010143", 'h', 0);
+    test("hello\U00010143\u0100\U00010143", 'e', 1);
+    test("hello\U00010143\u0100\U00010143", 'l', 2);
+    test("hello\U00010143\u0100\U00010143", 'l', 3);
+    test("hello\U00010143\u0100\U00010143", 'o', 4);
+    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
+    test("hello\U00010143\u0100\U00010143", '\u0100', 7);
+    test("hello\U00010143\u0100\U00010143", '\U00010143', 8);
+
+    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
+    {
+        enum str = to!S("hello world");
+        static assert(isSafe!(() => stride(str, 0)));
+        static assert(isSafe!(() => stride(str)   ));
+        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
+        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
+    }
+    });
+}
+
+/// Ditto
+uint stride(S)(auto ref S str, size_t index = 0)
+    if (is(S : const dchar[]) ||
+        (isInputRange!S && is(Unqual!(ElementEncodingType!S) == dchar)))
+{
+    static if (is(typeof(str.length) : ulong))
+        assert(index < str.length, "Past the end of the UTF-32 sequence");
+    else
+        assert(!str.empty, "UTF-32 sequence is empty.");
+    return 1;
+}
+
+unittest
+{
+    import std.conv : to;
+    import std.exception;
+    import std. string : format;
+    import core.exception : AssertError;
+    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
+    {
+        enforce(stride(s, i) == codeLength!dchar(c),
+                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
+
+        enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c),
+                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
+
+        auto refRandom = new RefRandomCU!dchar(s);
+        immutable randLen = refRandom.length;
+        enforce(stride(refRandom, i) == codeLength!dchar(c),
+                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
+        enforce(refRandom.length == randLen,
+                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
+
+        if (i == 0)
+        {
+            enforce(stride(s) == codeLength!dchar(c),
+                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
+
+            enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c),
+                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
+
+            auto refBidir = new RefBidirCU!dchar(s);
+            immutable bidirLen = refBidir.length;
+            enforce(stride(refBidir) == codeLength!dchar(c),
+                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
+            enforce(refBidir.length == bidirLen,
+                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
+        }
+    }
+
+    assertCTFEable!(
+    {
+    test("a", 'a');
+    test(" ", ' ');
+    test("\u2029", '\u2029'); //paraSep
+    test("\u0100", '\u0100');
+    test("\u0430", '\u0430');
+    test("\U00010143", '\U00010143');
+    test("abcdefcdef", 'a');
+    test("hello\U00010143\u0100\U00010143", 'h', 0);
+    test("hello\U00010143\u0100\U00010143", 'e', 1);
+    test("hello\U00010143\u0100\U00010143", 'l', 2);
+    test("hello\U00010143\u0100\U00010143", 'l', 3);
+    test("hello\U00010143\u0100\U00010143", 'o', 4);
+    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
+    test("hello\U00010143\u0100\U00010143", '\u0100', 6);
+    test("hello\U00010143\u0100\U00010143", '\U00010143', 7);
+
+    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
+    {
+        enum str = to!S("hello world");
+        static assert(isSafe!(() => stride(str, 0)));
+        static assert(isSafe!(() => stride(str)   ));
+        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
+        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
+    }
+    });
+}

 /++
-    $(D strideBack) returns the length of the UTF-8 sequence ending one code
-    unit before $(D index) in $(D str).
+    Calculate the length of the UTF sequence ending one code unit before
+    $(D index) in $(D str).

-    $(D strideBack) works with both UTF-8 strings and bidirectional ranges of
-    $(D char). If no index is passed, then a bidirectional range will work, but
-    if an index is passed, then a random-access range is required.
-
-    $(D index) defaults to $(D str.length) if none is passed.
+    Params:
+        str = bidirectional range of UTF code units. Must be random access if
+        $(D index) is passed
+        index = index one past end of UTF sequence (default: $(D str.length))

    Returns:
-        The number of bytes in the UTF-8 sequence.
+        The number of code units in the UTF sequence. For UTF-8, this is a
+        value between 1 and 4 (as per $(WEB tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
+        For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.

    Throws:
        May throw a $(D UTFException) if $(D str[index]) is not one past the
-        end of a valid UTF-8 sequence.
+        end of a valid UTF sequence.

-    Notes:
-        $(D strideBack) will not fully verify the validity of the UTF-8
-        sequence. It will, however, guarantee that
-        $(D index - stride(str, index)) is a valid index.
+    Note:
+        $(D strideBack) will only analyze the element at
+        $(D str[index - 1]). It will not fully verify the validity of the UTF
+        sequence, nor even verify the presence of the sequence: it will not
+        actually guarantee that $(D strideBack(str, index) <= index).
  +/
 uint strideBack(S)(auto ref S str, size_t index)
    if (is(S : const char[]) ||
@@ -438,6 +618,7 @@ uint strideBack(S)(auto ref S str)
    return strideBack(str, str.length);
 }

+/// Ditto
 uint strideBack(S)(auto ref S str)
    if (isBidirectionalRange!S && is(Unqual!(ElementType!S) == char) && !isRandomAccessRange!S)
 {
@@ -521,148 +702,9 @@ unittest
    });
 }

-
-/++
-    $(D stride) returns the length of the UTF-16 sequence starting at $(D index)
-    in $(D str).
-
-    $(D stride) works with both UTF-16 strings and ranges of $(D wchar). If no
-    index is passed, then an input range will work, but if an index is passed,
-    then a random-access range is required.
-
-    $(D index) defaults to $(D 0) if none is passed.
-
-    Returns:
-        The number of bytes in the UTF-16 sequence.
-
-    Throws:
-        May throw a $(D UTFException) if $(D str[index]) is not the start of a
-        valid UTF-16 sequence.
-
-    Notes:
-        $(D stride) will only analyze the first $(D str[index]) element. It
-        will not fully verify the validity of UTF-16 sequence, nor even verify
-        the presence of the sequence: it will not actually guarantee that
-        $(D index + stride(str, index) <= str.length).
-  +/
-uint stride(S)(auto ref S str, size_t index)
-    if (is(S : const wchar[]) ||
-        (isRandomAccessRange!S && is(Unqual!(ElementType!S) == wchar)))
-{
-    static if (is(typeof(str.length) : ulong))
-        assert(index < str.length, "Past the end of the UTF-16 sequence");
-    immutable uint u = str[index];
-    return 1 + (u >= 0xD800 && u <= 0xDBFF);
-}
-
-/// Ditto
-uint stride(S)(auto ref S str) @safe pure
-    if (is(S : const wchar[]))
-{
-    return stride(str, 0);
-}
-
-uint stride(S)(auto ref S str)
-    if (isInputRange!S && is(Unqual!(ElementType!S) == wchar))
-{
-    assert(!str.empty, "UTF-16 sequence is empty");
-    immutable uint u = str.front;
-    return 1 + (u >= 0xD800 && u <= 0xDBFF);
-}
-
-@trusted unittest
-{
-    import std.conv : to;
-    import std.exception;
-    import std. string : format;
-    import core.exception : AssertError;
-    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
-    {
-        enforce(stride(s, i) == codeLength!wchar(c),
-                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
-
-        enforce(stride(RandomCU!wchar(s), i) == codeLength!wchar(c),
-                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
-
-        auto refRandom = new RefRandomCU!wchar(s);
-        immutable randLen = refRandom.length;
-        enforce(stride(refRandom, i) == codeLength!wchar(c),
-                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
-        enforce(refRandom.length == randLen,
-                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
-
-        if (i == 0)
-        {
-            enforce(stride(s) == codeLength!wchar(c),
-                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
-
-            enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c),
-                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
-
-            auto refBidir = new RefBidirCU!wchar(s);
-            immutable bidirLen = refBidir.length;
-            enforce(stride(refBidir) == codeLength!wchar(c),
-                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
-            enforce(refBidir.length == bidirLen,
-                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
-        }
-    }
-
-    assertCTFEable!(
-    {
-    test("a", 'a');
-    test(" ", ' ');
-    test("\u2029", '\u2029'); //paraSep
-    test("\u0100", '\u0100');
-    test("\u0430", '\u0430');
-    test("\U00010143", '\U00010143');
-    test("abcdefcdef", 'a');
-    test("hello\U00010143\u0100\U00010143", 'h', 0);
-    test("hello\U00010143\u0100\U00010143", 'e', 1);
-    test("hello\U00010143\u0100\U00010143", 'l', 2);
-    test("hello\U00010143\u0100\U00010143", 'l', 3);
-    test("hello\U00010143\u0100\U00010143", 'o', 4);
-    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
-    test("hello\U00010143\u0100\U00010143", '\u0100', 7);
-    test("hello\U00010143\u0100\U00010143", '\U00010143', 8);
-
-    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
-    {
-        enum str = to!S("hello world");
-        static assert(isSafe!(() => stride(str, 0)));
-        static assert(isSafe!(() => stride(str)   ));
-        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
-        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
-    }
-    });
-}
-
-
-/++
-    $(D strideBack) returns the length of the UTF-16 sequence ending one code
-    unit before $(D index) in $(D str).
-
-    $(D strideBack) works with both UTF-16 strings and ranges of $(D wchar). If
-    no index is passed, then a bidirectional range will work, but if an index is
-    passed, then a random-access range is required.
-
-    $(D index) defaults to $(D str.length) if none is passed.
-
-    Returns:
-        The number of bytes in the UTF-16 sequence.
-
-    Throws:
-        May throw a $(D UTFException) if $(D str[index]) is not one past the
-        end of a valid UTF-16 sequence.
-
-    Notes:
-        $(D stride) will only analyze the element at $(D str[index - 1])
-        element. It will not fully verify the validity of UTF-16 sequence, nor
-        even verify the presence of the sequence: it will not actually
-        guarantee that $(D stride(str, index) <= index).
-  +/
 //UTF-16 is self synchronizing: The length of strideBack can be found from
 //the value of a single wchar
+/// Ditto
 uint strideBack(S)(auto ref S str, size_t index)
    if (is(S : const wchar[]) ||
        (isRandomAccessRange!S && is(Unqual!(ElementType!S) == wchar)))
@@ -757,114 +799,7 @@ unittest
    });
 }

-
-/++
-    $(D stride) returns the length of the UTF-32 sequence starting at $(D index)
-    in $(D str).
-
-    $(D stride) works with both UTF-32 strings and ranges of $(D dchar).
-
-    Returns:
-        The number of bytes in the UTF-32 sequence (always $(D 1)).
-
-    Throws:
-        Never.
-  +/
-uint stride(S)(auto ref S str, size_t index = 0)
-    if (is(S : const dchar[]) ||
-        (isInputRange!S && is(Unqual!(ElementEncodingType!S) == dchar)))
-{
-    static if (is(typeof(str.length) : ulong))
-        assert(index < str.length, "Past the end of the UTF-32 sequence");
-    else
-        assert(!str.empty, "UTF-32 sequence is empty.");
-    return 1;
-}
-
-unittest
-{
-    import std.conv : to;
-    import std.exception;
-    import std. string : format;
-    import core.exception : AssertError;
-    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
-    {
-        enforce(stride(s, i) == codeLength!dchar(c),
-                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
-
-        enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c),
-                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
-
-        auto refRandom = new RefRandomCU!dchar(s);
-        immutable randLen = refRandom.length;
-        enforce(stride(refRandom, i) == codeLength!dchar(c),
-                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
-        enforce(refRandom.length == randLen,
-                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
-
-        if (i == 0)
-        {
-            enforce(stride(s) == codeLength!dchar(c),
-                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
-
-            enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c),
-                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
-
-            auto refBidir = new RefBidirCU!dchar(s);
-            immutable bidirLen = refBidir.length;
-            enforce(stride(refBidir) == codeLength!dchar(c),
-                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
-            enforce(refBidir.length == bidirLen,
-                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
-        }
-    }
-
-    assertCTFEable!(
-    {
-    test("a", 'a');
-    test(" ", ' ');
-    test("\u2029", '\u2029'); //paraSep
-    test("\u0100", '\u0100');
-    test("\u0430", '\u0430');
-    test("\U00010143", '\U00010143');
-    test("abcdefcdef", 'a');
-    test("hello\U00010143\u0100\U00010143", 'h', 0);
-    test("hello\U00010143\u0100\U00010143", 'e', 1);
-    test("hello\U00010143\u0100\U00010143", 'l', 2);
-    test("hello\U00010143\u0100\U00010143", 'l', 3);
-    test("hello\U00010143\u0100\U00010143", 'o', 4);
-    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
-    test("hello\U00010143\u0100\U00010143", '\u0100', 6);
-    test("hello\U00010143\u0100\U00010143", '\U00010143', 7);
-
-    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
-    {
-        enum str = to!S("hello world");
-        static assert(isSafe!(() => stride(str, 0)));
-        static assert(isSafe!(() => stride(str)   ));
-        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
-        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
-    }
-    });
-}
-
-
-/++
-    $(D strideBack) returns the length of the UTF-32 sequence ending one code
-    unit before $(D index) in $(D str).
-
-    $(D strideBack) works with both UTF-32 strings and ranges of $(D dchar). If
-    no index is passed, then a bidirectional range will work, but if an index is
-    passed, then a random-access range is required.
-
-    $(D index) defaults to $(D str.length) if none is passed.
-
-    Returns:
-        The number of bytes in the UTF-32 sequence (always $(D 1)).
-
-    Throws:
-        Never.
-  +/
+/// Ditto
 uint strideBack(S)(auto ref S str, size_t index)
    if (isRandomAccessRange!S && is(Unqual!(ElementEncodingType!S) == dchar))
 {
@@ -1041,7 +976,7 @@ unittest

 /* =================== Decode ======================= */

-/// Whether or not to replace invalid UTF with replacementDchar
+/// Whether or not to replace invalid UTF with $(LREF replacementDchar)
 alias UseReplacementDchar = Flag!"useReplacementDchar";

 /++