3467 lines
100 KiB
D
Executable File
3467 lines
100 KiB
D
Executable File
// Written in the D programming language
|
|
|
|
/**
|
|
* This module contains a range-based _lexer for the D programming language.
|
|
*
|
|
* For performance reasons the _lexer contained in this module operates only on
|
|
* ASCII or UTF-8 encoded source code. If the use of other encodings is
|
|
* desired, the source code must be converted to UTF-8 before passing it to this
|
|
* _lexer.
|
|
*
|
|
* To use the _lexer, create a LexerConfig struct
|
|
* ---
|
|
* LexerConfig config;
|
|
* config.iterStyle = IterationStyle.everything;
|
|
* config.tokenStyle = TokenStyle.source;
|
|
* config.versionNumber = 2064;
|
|
* config.vendorString = "Lexer Example";
|
|
* ---
|
|
* Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your
|
|
* source code, passing in the configuration.
|
|
* ---
|
|
* auto source = "import std.stdio;"c;
|
|
* auto tokens = byToken(source, config);
|
|
* ---
|
|
* The result of byToken$(LPAREN)$(RPAREN) is a forward range of tokens that can
|
|
* be easily used with the algorithms from std.algorithm or iterated over with
|
|
* $(D_KEYWORD foreach)
|
|
* ---
|
|
* assert (tokens.front.type == TokenType.import_);
|
|
* assert (tokens.front.value == "import");
|
|
* assert (tokens.front.line == 1);
|
|
* assert (tokens.front.startIndex == 0);
|
|
* ---
|
|
*
|
|
* Examples:
|
|
*
|
|
* Generate HTML markup of D code.
|
|
* ---
|
|
* module highlighter;
|
|
*
|
|
* import std.stdio;
|
|
* import std.array;
|
|
* import std.d.lexer;
|
|
*
|
|
* void writeSpan(string cssClass, string value)
|
|
* {
|
|
* stdout.write(`<span class="`, cssClass, `">`, value.replace("&", "&amp;").replace("<", "&lt;"), `</span>`);
|
|
* }
|
|
*
|
|
* // http://ethanschoonover.com/solarized
|
|
* void highlight(R)(R tokens)
|
|
* {
|
|
* stdout.writeln(q"[<!DOCTYPE html>
|
|
* <html>
|
|
* <head>
|
|
* <meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
|
|
* </head>
|
|
* <body>
|
|
* <style type="text/css">
|
|
* html { background-color: #fdf6e3; color: #002b36; }
|
|
* .kwrd { color: #b58900; font-weight: bold; }
|
|
* .com { color: #93a1a1; font-style: italic; }
|
|
* .num { color: #dc322f; font-weight: bold; }
|
|
* .str { color: #2aa198; font-style: italic; }
|
|
* .op { color: #586e75; font-weight: bold; }
|
|
* .type { color: #268bd2; font-weight: bold; }
|
|
* .cons { color: #859900; font-weight: bold; }
|
|
* </style>
|
|
* <pre>]");
|
|
*
|
|
* foreach (Token t; tokens)
|
|
* {
|
|
* if (isBuiltType(t.type))
|
|
* writeSpan("type", t.value);
|
|
* else if (isKeyword(t.type))
|
|
* writeSpan("kwrd", t.value);
|
|
* else if (t.type == TokenType.comment)
|
|
* writeSpan("com", t.value);
|
|
* else if (isStringLiteral(t.type))
|
|
* writeSpan("str", t.value);
|
|
* else if (isNumberLiteral(t.type))
|
|
* writeSpan("num", t.value);
|
|
* else if (isOperator(t.type))
|
|
* writeSpan("op", t.value);
|
|
* else
|
|
* stdout.write(t.value.replace("<", "&lt;"));
|
|
* }
|
|
* stdout.writeln("</pre>\n</body></html>");
|
|
* }
|
|
*
|
|
* void main(string[] args)
|
|
* {
|
|
* LexerConfig config;
|
|
* config.tokenStyle = TokenStyle.source;
|
|
* config.iterStyle = IterationStyle.everything;
|
|
* config.fileName = args[1];
|
|
* auto f = File(args[1]);
|
|
* (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight();
|
|
* }
|
|
* ---
|
|
*
|
|
* Copyright: Brian Schott 2013
|
|
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0)
|
|
* Authors: Brian Schott, Dmitry Olshansky
|
|
* Source: $(PHOBOSSRC std/d/_lexer.d)
|
|
*/
|
|
|
|
module std.d.lexer;
|
|
|
|
import std.algorithm;
|
|
import std.ascii;
|
|
import std.conv;
|
|
import std.datetime;
|
|
import std.d.entities;
|
|
import std.exception;
|
|
import std.range;
|
|
import std.regex;
|
|
import std.string;
|
|
import std.traits;
|
|
import std.utf;
|
|
version (unittest) import std.stdio;
|
|
|
|
|
|
public:
|
|
|
|
/**
 * Represents a single D token.
 */
struct Token
{
    /**
     * The token type.
     */
    TokenType type;

    /**
     * The exact text of the token as it appeared in the original source.
     */
    string value;

    /**
     * The number of the line the token is on.
     */
    uint line;

    /**
     * The column number of the start of the token in the original source.
     * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
     */
    uint column;

    /**
     * The index of the start of the token in the original source.
     * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
     */
    size_t startIndex;

    /**
     * Returns: true when the other token has the same type and the same
     * source text as this one. Position fields are ignored.
     */
    bool opEquals(ref const(Token) other) const
    {
        return type == other.type && value == other.value;
    }

    /**
     * Returns: true when this token's source text equals the given string.
     */
    bool opEquals(string value) const
    {
        return this.value == value;
    }

    /**
     * Returns: true when this token has the given type.
     */
    bool opEquals(TokenType type) const
    {
        return this.type == type;
    }

    /**
     * Orders tokens by their start index in the source.
     */
    int opCmp(ref const(Token) other) const
    {
        if (startIndex == other.startIndex)
            return 0;
        return startIndex < other.startIndex ? -1 : 1;
    }
}
|
|
|
|
/**
 * Configure the behavior of the byToken() function. These flags may be
 * combined using a bitwise or.
 */
enum IterationStyle
{
    /// Only include code, not whitespace or comments
    codeOnly = 0,
    /// Includes comments
    includeComments = 0b0001,
    /// Includes whitespace
    includeWhitespace = 0b0010,
    /// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens)
    includeSpecialTokens = 0b0100,
    /// Do not stop iteration on reaching the $(D_KEYWORD ___EOF__) token
    ignoreEOF = 0b1000,
    /// Include _everything
    // NOTE(review): includeSpecialTokens is absent from this combination,
    // so "everything" does not emit special token sequences — confirm
    // this is intentional before relying on it.
    everything = includeComments | includeWhitespace | ignoreEOF
}
|
|
|
|
/**
 * Configuration of the token lexing style. These flags may be combined with a
 * bitwise or.
 */
enum TokenStyle : uint
{
    /**
     * Escape sequences will be replaced with their equivalent characters,
     * enclosing quote characters will not be included. Special tokens such as
     * $(D_KEYWORD ___VENDOR__) will be replaced with their equivalent strings.
     * Useful for creating a compiler or interpreter.
     */
    default_ = 0b0000,

    /**
     * Escape sequences will not be processed. An escaped quote character will
     * not terminate string lexing, but it will not be replaced with the quote
     * character in the token.
     */
    notEscaped = 0b0001,

    /**
     * Strings will include their opening and closing quote characters as well
     * as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will
     * include the $(D_STRING 'w') character as well as the opening and closing
     * quotes$(RPAREN)
     */
    includeQuotes = 0b0010,

    /**
     * Do not replace the value field of the special tokens such as
     * $(D_KEYWORD ___DATE__) with their string equivalents.
     */
    doNotReplaceSpecial = 0b0100,

    /**
     * Strings will be read exactly as they appeared in the source, including
     * their opening and closing quote characters. Useful for syntax
     * highlighting. (Combination of the three flags above.)
     */
    source = notEscaped | includeQuotes | doNotReplaceSpecial
}
|
|
|
|
/**
 * Lexer configuration.
 */
struct LexerConfig
{
    /**
     * Iteration style: which token categories byToken() yields.
     */
    IterationStyle iterStyle = IterationStyle.codeOnly;

    /**
     * Token style: how token values are rendered.
     */
    // Reference the enum member through its type rather than through the
    // field name being declared (the latter only compiled via D's
    // instance-member-access quirk and is confusing to read).
    TokenStyle tokenStyle = TokenStyle.default_;

    /**
     * Replacement for the $(D_KEYWORD ___VERSION__) token. Defaults to 100.
     */
    uint versionNumber = 100;

    /**
     * Replacement for the $(D_KEYWORD ___VENDOR__) token. Defaults to $(D_STRING "std.d.lexer")
     */
    string vendorString = "std.d.lexer";

    /**
     * Name used when creating error messages that are sent to errorFunc. This
     * is needed because the lexer operates on any forward range of ASCII
     * characters or UTF-8 code units and does not know what to call its input
     * source. Defaults to the empty string.
     */
    string fileName = "";

    /**
     * This function is called when an error is encountered during lexing.
     * Parameters are file name, code unit index, line number, column,
     * and error message. May be null, in which case errors are not reported.
     */
    void delegate(string, size_t, uint, uint, string) errorFunc;
}
|
|
|
|
/**
 * Iterates over the given range of characters by D tokens.
 * Params:
 *     range = the range of characters to tokenize
 *     config = the lexer configuration
 *     bufferSize = initial size of the internal circular buffer
 * Returns:
 *     an input range of tokens
 */
auto byToken(R)(R range, LexerConfig config, size_t bufferSize = 4*1024)
    if (isForwardRange!(R) && !isRandomAccessRange!(R)
        && is(ElementType!R : const(ubyte)))
{
    // Non-random-access input goes through a circular buffer
    // (4K by default).
    auto tokens = TokenRange!(typeof(lexerSource(range)))
        (lexerSource(range, bufferSize), config);
    tokens.config = config;
    tokens.lineNumber = 1;
    tokens.popFront(); // prime the range with the first token
    return tokens;
}
|
|
|
|
///ditto
auto byToken(R)(R range, LexerConfig config)
    if (isRandomAccessRange!(R) && is(ElementType!R : const(ubyte)))
{
    // Random-access sources can be sliced directly; no buffer needed.
    auto tokens = TokenRange!(typeof(lexerSource(range)))
        (lexerSource(range), config);
    tokens.config = config;
    tokens.lineNumber = 1;
    tokens.popFront(); // prime the range with the first token
    return tokens;
}
|
|
|
|
/**
|
|
* Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate.
|
|
*/
|
|
struct TokenRange(LexSrc)
|
|
//if ( is(LexSrc : LexSource!(U...), U...)) //check for LexSource
|
|
{
|
|
/**
 * Returns: true if the range is empty (lexing has reached end of input
 * or an $(D_KEYWORD ___EOF__) token when ignoreEOF is not set)
 */
bool empty() const @property
{
    return _empty;
}
|
|
|
|
/**
 * Returns: the current token. The range must not be empty.
 */
ref const(Token) front() const @property
{
    assert(!empty, "trying to get front of an empty token range");
    return current;
}
|
|
|
|
/**
 * Returns the current token and removes it from the range, transferring
 * the token out without copying its contents.
 */
Token moveFront()
{
    Token result = move(current);
    popFront();
    return result;
}
|
|
|
|
/**
 * Removes the current token from the range
 */
void popFront()
{
    // all of the actual lexing work happens in advance()
    advance();
}
|
|
|
|
private:
|
|
|
|
/*
 * Advances the range to the next token, setting _empty when input is
 * exhausted. This is the lexer's central dispatch: it marks a lexing
 * frame, records the token's position, then branches on the first
 * character to the appropriate lexXYZ routine.
 */
void advance()
{
L_advance:
    if (src.empty)
    {
        _empty = true;
        return;
    }
    src.mark(); // mark a start of a lexing "frame"
    current.line = lineNumber;
    current.startIndex = src.index;
    current.column = column;
    current.value = null;
    switch (src.front)
    {
    // handle sentinels for end of input (NUL and SUB)
    case 0:
    case 0x1a:
        // TODO: check config flags, it's cheap
        // since this branch at most is taken once per file
        _empty = true;
        return;
    // operators/punctuation are matched by a compile-time generated
    // trie of nested case statements
    mixin(generateCaseTrie(
        "=", "TokenType.assign",
        "@", "TokenType.at",
        "&", "TokenType.bitAnd",
        "&=", "TokenType.bitAndEqual",
        "|", "TokenType.bitOr",
        "|=", "TokenType.bitOrEqual",
        "~=", "TokenType.catEqual",
        ":", "TokenType.colon",
        ",", "TokenType.comma",
        "--", "TokenType.decrement",
        "$", "TokenType.dollar",
        "==", "TokenType.equal",
        "=>", "TokenType.goesTo",
        ">", "TokenType.greater",
        ">=", "TokenType.greaterEqual",
        "++", "TokenType.increment",
        "{", "TokenType.lBrace",
        "[", "TokenType.lBracket",
        "<", "TokenType.less",
        "<=", "TokenType.lessEqual",
        "<>=", "TokenType.lessEqualGreater",
        "<>", "TokenType.lessOrGreater",
        "&&", "TokenType.logicAnd",
        "||", "TokenType.logicOr",
        "(", "TokenType.lParen",
        "-", "TokenType.minus",
        "-=", "TokenType.minusEqual",
        "%", "TokenType.mod",
        "%=", "TokenType.modEqual",
        "*=", "TokenType.mulEqual",
        "!", "TokenType.not",
        "!=", "TokenType.notEqual",
        "!>", "TokenType.notGreater",
        "!>=", "TokenType.notGreaterEqual",
        "!<", "TokenType.notLess",
        "!<=", "TokenType.notLessEqual",
        "!<>", "TokenType.notLessEqualGreater",
        "+", "TokenType.plus",
        "+=", "TokenType.plusEqual",
        "^^", "TokenType.pow",
        "^^=", "TokenType.powEqual",
        "}", "TokenType.rBrace",
        "]", "TokenType.rBracket",
        ")", "TokenType.rParen",
        ";", "TokenType.semicolon",
        "<<", "TokenType.shiftLeft",
        "<<=", "TokenType.shiftLeftEqual",
        ">>", "TokenType.shiftRight",
        ">>=", "TokenType.shiftRightEqual",
        "*", "TokenType.star",
        "?", "TokenType.ternary",
        "~", "TokenType.tilde",
        "!<>=", "TokenType.unordered",
        ">>>", "TokenType.unsignedShiftRight",
        ">>>=", "TokenType.unsignedShiftRightEqual",
        "^", "TokenType.xor",
        "^=", "TokenType.xorEqual",
    ));
    // '/' may begin a comment, "/=", or plain division
    case '/':
        nextCharNonLF();
        if (isEoF())
        {
            current.type = TokenType.div;
            current.value = "/";
            return;
        }
        switch (src.front)
        {
        case '/':
        case '*':
        case '+':
            if (config.iterStyle & IterationStyle.includeComments)
                return lexComment!true();
            lexComment!false();
            goto L_advance; // tail-recursion

        case '=':
            current.type = TokenType.divEqual;
            current.value = "/=";
            src.popFront();
            return;
        default:
            current.type = TokenType.div;
            current.value = "/";
            return;
        }
    // '.' may begin a float literal (".5"), "..", "...", or be a dot
    case '.':
        if (!src.canPeek())
        {
            current.type = TokenType.dot;
            current.value = tokenValue!(TokenType.dot);
            return;
        }
        switch (src.peek())
        {
        case '0': .. case '9':
            lexNumber();
            return;
        case '.':
            nextCharNonLF();
            nextCharNonLF();
            current.type = TokenType.slice;
            if (src.front == '.')
            {
                current.type = TokenType.vararg;
                nextCharNonLF();
                current.value = tokenValue!(TokenType.vararg);
            }
            else
                current.value = tokenValue!(TokenType.slice);
            return;
        default:
            nextCharNonLF();
            current.type = TokenType.dot;
            current.value = tokenValue!(TokenType.dot);
            return;
        }
    case '0': .. case '9':
        lexNumber();
        return;
    case '\'':
        lexCharacterLiteral();
        return;
    case '"':
    case '`':
        lexString();
        return;
    // q{...} token strings and q"..." delimited strings; a bare 'q'
    // falls through to the identifier path below
    case 'q':
        nextCharNonLF();
        if (isEoF())
            goto default;
        switch (src.front)
        {
        case '{':
            lexTokenString();
            return;
        case '"':
            lexDelimitedString();
            return;
        default:
            break;
        }
        goto default;
    // r"..." wysiwyg strings; a bare 'r' is an identifier start
    case 'r':
        nextCharNonLF();
        if (isEoF())
            goto default;
        else if (src.front == '"')
        {
            lexString();
            return;
        }
        else
            goto default;
    // x"..." hex strings; a bare 'x' is an identifier start
    case 'x':
        nextCharNonLF();
        if (isEoF())
            goto default;
        else if (src.front == '"')
        {
            lexHexString();
            return;
        }
        else
            goto default;
    // '#' begins a special token sequence such as #line
    case '#':
        lexSpecialTokenSequence();
        if(config.iterStyle & IterationStyle.includeSpecialTokens)
            return;
        goto L_advance; // tail-recursion
    // "short" ASCII whites
    case 0x20:
    case 0x09: .. case 0x0d:
        if (config.iterStyle & IterationStyle.includeWhitespace)
            return lexWhitespace!true();
        lexWhitespace!false();
        goto L_advance; // tail-recursion
    default:
        // multi-byte (non-ASCII) whitespace
        if ((src.front & 0x80) && isLongWhite())
        {
            if (config.iterStyle & IterationStyle.includeWhitespace)
                return lexWhitespace!true();
            lexWhitespace!false();
            goto L_advance; // tail-recursion
        }
        // identifier / keyword: consume until a separating character
        for(;;)
        {
            if(isSeparating())
                break;
            nextCharNonLF();
            if(isEoF())
                break;
        }

        current.type = lookupTokenType(src.slice);
        current.value = getTokenValue(current.type);
        if (current.value is null)
            setTokenValue();
        // __EOF__ terminates lexing unless ignoreEOF is requested
        if (!(config.iterStyle & IterationStyle.ignoreEOF) && current.type == TokenType.specialEof)
        {
            _empty = true;
            return;
        }

        if (config.tokenStyle & TokenStyle.doNotReplaceSpecial)
            return;
        // replace __DATE__, __VENDOR__, __VERSION__, etc. in place
        expandSpecialToken();
    }
}
|
|
|
|
// TODO: LexSource could be improved for forward ranges
// to avoid buffering at all (by disabling it for a moment)
// so keep the 'keep' parameter here and elsewhere

/// Consumes a run of whitespace. When `keep` is true the consumed text
/// becomes the current token's value; otherwise it is discarded.
void lexWhitespace(bool keep)()
{
    current.type = TokenType.whitespace;
    // the first character is already known to be whitespace, so consume
    // it unconditionally, then continue while whitespace remains
    for (;;)
    {
        nextChar();
        if (isEoF() || !isWhite())
            break;
    }
    static if (keep) setTokenValue();
}
|
|
|
|
/// Lexes a //, /* */ or /+ +/ comment. When `keep` is true the comment
/// text becomes the token's value. src.front is the character after the
/// initial '/'.
void lexComment(bool keep)()
in
{
    assert (src.front == '/' || src.front == '*' || src.front == '+');
}
body
{
    current.type = TokenType.comment;
    switch(src.front)
    {
    // line comment: runs to (but does not consume) the end of line
    case '/':
        while (!isEoF() && !isNewline(src.front))
        {
            nextCharNonLF();
        }
        break;
    // block comment: /* ... */, not nestable
    case '*':
        while (!isEoF())
        {
            if (src.front == '*')
            {
                // when the text is discarded we can bypass buffering
                static if (keep) nextCharNonLF();
                else src.popFront();
                // NOTE(review): src.front is read here without an EoF
                // check; input ending in a bare '*' may access an empty
                // source — confirm LexSource tolerates that.
                if (src.front == '/')
                {
                    nextCharNonLF();
                    break;
                }
            }
            else
                nextChar();
        }
        break;
    // nesting comment: /+ ... +/, tracked via a depth counter
    case '+':
        int depth = 1;
        while (depth > 0 && !isEoF())
        {
            if (src.front == '+')
            {
                nextCharNonLF();
                if (src.front == '/')
                {
                    nextCharNonLF();
                    --depth;
                }
            }
            else if (src.front == '/')
            {
                nextCharNonLF();
                if (src.front == '+')
                {
                    nextCharNonLF();
                    ++depth;
                }
            }
            else
                nextChar();
        }
        break;
    default:
        assert(false);
    }
    static if (keep)
        setTokenValue();
}
|
|
|
|
/// Lexes a hex string literal (x"0A BC ..."). The leading 'x' has
/// already been consumed; src.front is the opening quote.
void lexHexString()
in
{
    assert (src.front == '"');
}
body
{
    current.type = TokenType.stringLiteral;
    nextChar(); // consume the opening quote
    while (true)
    {
        if (isEoF())
        {
            errorMessage("Unterminated hex string literal");
            return;
        }
        else if (isHexDigit(src.front))
        {
            nextCharNonLF();
        }
        else if (isWhite())
        {
            // whitespace (including newlines) is permitted between the
            // hex digits regardless of token style
            nextChar();
        }
        else if (src.front == '"')
        {
            nextCharNonLF();
            break;
        }
        else
        {
            errorMessage(format("Invalid character '%s' in hex string literal",
                cast(char) src.front));
            return;
        }
    }
    bool hasSuffix = lexStringSuffix();
    if (config.tokenStyle & TokenStyle.notEscaped)
    {
        if (config.tokenStyle & TokenStyle.includeQuotes)
            setTokenValue();
        else
            setTokenValue(2, hasSuffix ? -2 : -1);
    }
    else
    {
        // Decode pairs of hex digits into bytes. The slice excludes the
        // x" prefix, the closing quote, and any string suffix character
        // (previously the suffix was left in and corrupted the decode);
        // embedded whitespace is skipped.
        static ubyte hexValue(ubyte c)
        {
            if (c >= '0' && c <= '9') return cast(ubyte) (c - '0');
            if (c >= 'a' && c <= 'f') return cast(ubyte) (c - 'a' + 10);
            return cast(ubyte) (c - 'A' + 10);
        }
        // TODO: appender is an allocation happy fat pig
        // remove it later
        auto a = appender!(char[])();
        uint acc;
        uint nibbles;
        foreach (b; src.slice[2 .. $ - (hasSuffix ? 2 : 1)])
        {
            if (!isHexDigit(b))
                continue; // embedded whitespace
            acc = (acc << 4) | hexValue(b);
            if (++nibbles == 2)
            {
                a.put(cast(char) acc);
                acc = 0;
                nibbles = 0;
            }
        }
        if (nibbles != 0)
            a.put(cast(char) acc); // odd digit count: emit trailing digit's value
        // can safely assume ownership of data
        current.value = cast(string) a.data;
    }
}
|
|
|
|
/// Dispatches on the literal prefix: 0x/0X is hexadecimal, 0b/0B is
/// binary, everything else (including a leading '.') is decimal.
void lexNumber()
in
{
    assert(isDigit(src.front) || src.front == '.');
}
body
{
    if (src.front != '0')
    {
        lexDecimal();
        return;
    }
    switch (src.peek())
    {
    case 'x':
    case 'X':
        nextCharNonLF(); // '0'
        nextCharNonLF(); // 'x' / 'X'
        lexHex();
        break;
    case 'b':
    case 'B':
        nextCharNonLF(); // '0'
        nextCharNonLF(); // 'b' / 'B'
        lexBinary();
        break;
    default:
        lexDecimal();
        break;
    }
}
|
|
|
|
/// Consumes an optional floating point suffix (L, f/F, and a trailing
/// 'i' for imaginary literals) and adjusts the current token's type.
void lexFloatSuffix()
{
    if (src.front == 'L')
    {
        nextCharNonLF();
        current.type = TokenType.doubleLiteral;
    }
    else if (src.front == 'f' || src.front == 'F')
    {
        nextCharNonLF();
        current.type = TokenType.floatLiteral;
    }
    // 'i' marks an imaginary literal and may follow either suffix
    if (!isEoF() && src.front == 'i')
    {
        nextCharNonLF();
        current.type = current.type == TokenType.floatLiteral
            ? TokenType.ifloatLiteral
            : TokenType.idoubleLiteral;
    }
}
|
|
|
|
/// Consumes integer literal suffixes ('u'/'U' and 'L', in either order,
/// each at most once) and upgrades the current token's type accordingly.
void lexIntSuffix()
{
    bool sawU;
    bool sawL;
    while (!isEoF())
    {
        immutable c = src.front;
        if (c == 'u' || c == 'U')
        {
            if (sawU)
                return;
            if (current.type == TokenType.intLiteral)
                current.type = TokenType.uintLiteral;
            else if (current.type == TokenType.longLiteral)
                current.type = TokenType.ulongLiteral;
            else
                assert (false);
            nextCharNonLF();
            sawU = true;
        }
        else if (c == 'L')
        {
            if (sawL)
                return;
            if (current.type == TokenType.intLiteral)
                current.type = TokenType.longLiteral;
            else if (current.type == TokenType.uintLiteral)
                current.type = TokenType.ulongLiteral;
            else
                assert (false);
            nextCharNonLF();
            sawL = true;
        }
        else
            return;
    }
}
|
|
|
|
/// Lexes an exponent part (e/E for decimal, p/P for hex floats):
/// an optional sign followed by digits, then an optional float suffix.
void lexExponent()
in
{
    assert (src.front == 'e' || src.front == 'E' || src.front == 'p'
        || src.front == 'P');
}
body
{
    nextCharNonLF(); // consume the exponent marker
    bool sawSign;
    bool sawDigit;
    while (!isEoF())
    {
        immutable c = src.front;
        if (c == '-' || c == '+')
        {
            if (sawSign)
            {
                // a second sign terminates the exponent
                if (!sawDigit)
                    errorMessage("Expected an exponent");
                return;
            }
            sawSign = true;
            nextCharNonLF();
        }
        else if ((c >= '0' && c <= '9') || c == '_')
        {
            sawDigit = true;
            nextCharNonLF();
        }
        else if (c == 'L' || c == 'f' || c == 'F' || c == 'i')
        {
            lexFloatSuffix();
            return;
        }
        else
        {
            if (!sawDigit)
                errorMessage("Expected an exponent");
            return;
        }
    }
}
|
|
|
|
/// Lexes a decimal integer or floating point literal, including the
/// fractional part, exponent, and suffixes.
void lexDecimal()
in
{
    assert (isDigit(src.front) || src.front == '.');
}
body
{
    // a leading '.' means this was entered from the ".5"-style path
    bool foundDot = src.front == '.';
    if (foundDot)
        nextCharNonLF();
    current.type = TokenType.intLiteral;
    decimalLoop: while (!isEoF())
    {
        switch (src.front)
        {
        case '0': .. case '9':
        case '_':   // digit separator, kept in the raw slice
            nextCharNonLF();
            break;
        case 'u':
        case 'U':
            // 'u' suffix is only valid on integer literals
            if (!foundDot)
                lexIntSuffix();
            break decimalLoop;
        case 'i':
            lexFloatSuffix();
            break decimalLoop;
        case 'L':
            // 'L' means real for floats, long for integers
            if (foundDot)
                lexFloatSuffix();
            else
                lexIntSuffix();
            break decimalLoop;
        case 'f':
        case 'F':
            lexFloatSuffix();
            break decimalLoop;
        case 'e':
        case 'E':
            lexExponent();
            break decimalLoop;
        case '.':
            // a second '.' or a slice operator ".." ends the literal
            if (foundDot)
                break decimalLoop;
            if (src.canPeek() && src.peek() == '.')
                break decimalLoop;
            nextCharNonLF();
            foundDot = true;
            current.type = TokenType.doubleLiteral;
            break;
        default:
            break decimalLoop;
        }
    }
    setTokenValue();
}
|
|
|
|
/// Lexes the digits of a binary literal (the 0b prefix was already
/// consumed), including any integer suffix.
void lexBinary()
{
    current.type = TokenType.intLiteral;
    while (!isEoF())
    {
        immutable c = src.front;
        if (c == '0' || c == '1' || c == '_')
        {
            nextCharNonLF();
        }
        else
        {
            if (c == 'u' || c == 'U' || c == 'L')
                lexIntSuffix();
            break;
        }
    }
    setTokenValue();
}
|
|
|
|
/// Lexes the digits of a hexadecimal literal (the 0x prefix was already
/// consumed), including fractional part, binary exponent, and suffixes.
void lexHex()
{
    current.type = TokenType.intLiteral;
    bool foundDot;
    hexLoop: while (!isEoF())
    {
        switch (src.front)
        {
        case 'a': .. case 'f':
        case 'A': .. case 'F':
        case '0': .. case '9':
        case '_':
            nextCharNonLF();
            break;
        case 'u':
        case 'U':
            lexIntSuffix();
            break hexLoop;
        case 'i':
            // imaginary suffix is only consumed after a decimal point
            if (foundDot)
                lexFloatSuffix();
            break hexLoop;
        case 'L':
            // 'L' means real for floats, long for integers
            if (foundDot)
            {
                lexFloatSuffix();
                break hexLoop;
            }
            else
            {
                lexIntSuffix();
                break hexLoop;
            }
        case 'p':
        case 'P':
            // binary exponent for hex floats
            lexExponent();
            break hexLoop;
        case '.':
            // a second '.' or a slice operator ".." ends the literal
            if (foundDot)
                break hexLoop;
            if (src.canPeek() && src.peek() == '.')
                break hexLoop;
            nextCharNonLF();
            foundDot = true;
            current.type = TokenType.doubleLiteral;
            break;
        default:
            break hexLoop;
        }
    }
    setTokenValue();
}
|
|
|
|
/// Consumes an optional string suffix character ('c', 'w', or 'd'),
/// setting the token type accordingly ('c' keeps stringLiteral).
/// Returns: true if a suffix character was consumed.
bool lexStringSuffix()
{
    current.type = TokenType.stringLiteral;
    if (isEoF())
        return false;
    immutable c = src.front;
    if (c != 'c' && c != 'w' && c != 'd')
        return false;
    if (c == 'w')
        current.type = TokenType.wstringLiteral;
    else if (c == 'd')
        current.type = TokenType.dstringLiteral;
    nextCharNonLF();
    return true;
}
|
|
|
|
/// Lexes a character literal ('a', '\n', '\u1234', multi-byte UTF-8).
/// src.front is the opening quote.
void lexCharacterLiteral()
in
{
    assert (src.front == '\'');
}
body
{
    current.type = TokenType.characterLiteral;
    nextChar();
    if (isEoF())
    {
        errorMessage("Unterminated character literal");
        return;
    }
    switch (src.front)
    {
    case '\'':
        break;
    case '\\':
        if (config.tokenStyle & TokenStyle.notEscaped)
            skipEscapeSequence();
        else
        {
            // the only special path: decode the escape sequence into a
            // small stack buffer. 40 bytes is enough for 2 quotes
            // and the longest character entity.
            ubyte[40] utf8;
            size_t len;
            if (config.tokenStyle & TokenStyle.includeQuotes)
            {
                utf8[0] = '\'';
                // decodeEscapeSequence returns the number of bytes it
                // wrote starting at index 1, so add 1 for the leading
                // quote; previously the closing quote was written at
                // utf8[len], overwriting the last decoded byte.
                len = decodeEscapeSequence(utf8[1..$]) + 1;
                utf8[len++] = '\'';
            }
            else
                len = decodeEscapeSequence(utf8[]);
            if (src.front != '\'')
            {
                errorMessage("Expected \"'\" to end character literal");
            }
            // skip over last "'"
            nextChar();
            setTokenValue(utf8[0..len]);
            return;
        }
        break;
    default:
        if (src.front & 0x80)
        {
            // multi-byte UTF-8 sequence: consume all bytes with the
            // high bit set
            while (src.front & 0x80)
                nextChar();
            break;
        }
        else
        {
            nextChar();
            break;
        }
    }
    if (src.front != '\'')
        errorMessage("Expected \"'\" to end character literal");
    nextChar();
    if (config.tokenStyle & TokenStyle.includeQuotes)
        setTokenValue();
    else
        setTokenValue(1, -1); // strip the surrounding quotes
}
|
|
|
|
/// Lexes a double-quoted, backquoted (wysiwyg), or r"..." string
/// literal, unescaping its contents unless the token style forbids it.
void lexString()
in
{
    //assert (src.front == '"');
}
body
{
    current.type = TokenType.stringLiteral;
    bool longWysiwg = src.slice.length > 0 && src.slice[0] == 'r'; // 2 chars : r"
    bool isWysiwyg = src.front == '`';
    // in case we need to unescape string
    Appender!(ubyte[]) unescaped;
    auto quote = src.front;
    nextChar();
    while (true)
    {
        if (isEoF())
        {
            errorMessage("Unterminated string literal");
            return;
        }
        else if (src.front == '\\')
        {
            if (isWysiwyg || longWysiwg)
                nextChar(); // backslash has no special meaning here
            else if(config.tokenStyle & TokenStyle.notEscaped)
            {
                skipEscapeSequence();
            }
            else
            {
                // lazily create the unescape buffer on first escape
                if(unescaped == Appender!(ubyte[]).init)
                    unescaped = appender!(ubyte[])();
                unescaped.put(src.slice());
                decodeEscapeSequence(unescaped);
                src.mark(); //start next slice after escape sequence
            }
        }
        else if (src.front == quote)
        {
            nextCharNonLF();
            break;
        }
        else
            nextChar();
    }
    lexStringSuffix();
    // helper to handle quotes
    void setData(R)(R range)
    {
        if (config.tokenStyle & TokenStyle.includeQuotes)
            setTokenValue(range);
        else if (longWysiwg)
            setTokenValue(range[2..$-1]); // strip r" and closing quote
        else
            setTokenValue(range[1..$-1]); // strip surrounding quotes
        // NOTE(review): a string suffix character (c/w/d), if present,
        // is not trimmed here — confirm whether that is intended.
    }
    // (removed a stray leftover debug `import std.stdio;` here)
    if(unescaped != Appender!(ubyte[]).init)
    {
        //stuff in the last slice and use buffered data
        unescaped.put(src.slice);
        setData(unescaped.data);
    }
    else
    {
        setData(src.slice); //slice directly
    }
}
|
|
|
|
/// Lexes a q"..." delimited string, dispatching to the bracket-nesting
/// form for [, {, (, < delimiters and to the heredoc form otherwise.
void lexDelimitedString()
in
{
    assert(src.front == '"');
}
body
{
    current.type = TokenType.stringLiteral;

    nextChar(); // consume the quote following 'q'

    ubyte open;
    ubyte close;
    switch (src.front)
    {
    case '[': open = '['; close = ']'; break;
    case '{': open = '{'; close = '}'; break;
    case '(': open = '('; close = ')'; break;
    case '<': open = '<'; close = '>'; break;
    default:
        // any other character starts an identifier-delimited heredoc
        lexHeredocString();
        return;
    }
    lexNormalDelimitedString(open, close);
}
|
|
|
|
/// Lexes a bracket-delimited q"(...)"-style string, tracking nesting of
/// the open/close delimiter pair.
void lexNormalDelimitedString(ubyte open, ubyte close)
in
{
    assert(src.slice[0 .. 2] == `q"`);
}
body
{
    current.type = TokenType.stringLiteral;
    int depth = 1;
    nextChar();
    while (true)
    {
        if (isEoF())
        {
            errorMessage("Unterminated string literal");
            break;
        }
        if (src.front == open)
        {
            nextChar();
            ++depth;
        }
        else if (src.front == close)
        {
            nextChar();
            --depth;
            if (depth <= 0)
            {
                // look ahead for the trailing quote without consuming it
                auto r = src.save(); //TODO: allocates for Fwd range
                if (r.front == '"')
                {
                    nextChar();
                    break;
                }
                else
                {
                    errorMessage("Expected \" after balanced "
                        ~ cast(char) close ~ " but found "
                        ~ cast(char) r.front ~ " instead.");
                    break;
                }
            }
        }
        else
            nextChar();
    }
    if (config.tokenStyle & TokenStyle.includeQuotes)
        setTokenValue();
    else
        setTokenValue(3, -2); // strip q"X prefix and Y" suffix
}
|
|
|
|
/// Lexes an identifier-delimited q"IDENT ... IDENT" heredoc string.
void lexHeredocString()
in
{
    assert (src.slice.equal("q\""));
}
body
{
    typeof(src.slice) ident;
    uint newlineBytes;
    // first phase: read the delimiter identifier that follows q"
    while (true)
    {
        if (isEoF())
        {
            errorMessage("Unterminated string literal");
            return;
        }
        else if (isNewline(src.front))
        {
            ident = src.slice[2..$];
            nextChar();
            // bytes consumed by the line terminator itself
            newlineBytes = cast(uint) (src.slice.length - 2 - ident.length);
            break;
        }
        else if (isSeparating())
        {
            nextChar();
            ident = src.slice[2..$];
            nextChar();
            newlineBytes = 0;
            break;
        }
        else
        {
            nextChar();
        }
    }
    // second phase: consume until the delimiter identifier reappears
    while (true)
    {
        if (isEoF())
        {
            errorMessage("Unterminated string literal");
            break;
        }
        else if (src.slice.length > ident.length
            && src.slice[$-ident.length .. $].equal(ident))
        {
            if (src.front == '"')
            {
                nextChar();
                lexStringSuffix();
                break;
            }
            else
            {
                errorMessage("Unterminated string literal: " ~ cast(string) src.slice);
                break;
            }
        }
        else
            nextChar();
    }

    // NOTE(review): lexStringSuffix may already have run inside the loop
    // above, so this second call sees the character after the suffix —
    // confirm the double invocation is intended.
    bool hasSuffix = lexStringSuffix();

    if (config.tokenStyle & TokenStyle.includeQuotes)
        setTokenValue();
    else
    {
        // strip q" + identifier + newline at the front, and the
        // identifier + quote (+ suffix) at the back
        setTokenValue(cast(int) (2 + newlineBytes + ident.length),
            cast(int) (-(ident.length + (hasSuffix ? 2 : 1))));
    }
}
|
|
|
|
/// Lexes a q{...} token string by recursively running the lexer over
/// its contents with "source" settings, so nested tokens keep their
/// original spelling. The user's config is restored afterwards.
void lexTokenString()
in
{
    assert (src.front == '{');
}
body
{
    current.type = TokenType.stringLiteral;
    nextChar();
    auto app = appender!(ubyte[])();
    if (config.tokenStyle & TokenStyle.includeQuotes)
    {
        app.put('q');
        app.put('{');
    }
    // temporarily switch to verbatim lexing of the nested tokens
    LexerConfig c = config;
    scope (exit) config = c;
    config.iterStyle = IterationStyle.everything;
    config.tokenStyle = TokenStyle.source;
    int depth = 1;

    while (!isEoF())
    {
        advance();
        if (current.type == TokenType.lBrace)
            ++depth;
        else if (current.type == TokenType.rBrace)
        {
            --depth;
            if (depth <= 0)
                break; // matching '}' is not appended
        }
        app.put(representation(current.value));
    }
    // NOTE(review): redundant with the scope(exit) above — confirm.
    config = c;
    if (config.tokenStyle & TokenStyle.includeQuotes)
    {
        app.put('}');
    }
    if (src.empty)
        current.type = TokenType.stringLiteral;
    else
    {
        // optional string suffix after the closing brace
        switch (src.front)
        {
        case 'd':
            if (config.tokenStyle & TokenStyle.includeQuotes)
                app.put('d');
            current.type = TokenType.dstringLiteral;
            src.popFront();
            break;
        case 'w':
            if (config.tokenStyle & TokenStyle.includeQuotes)
                app.put('w');
            current.type = TokenType.wstringLiteral;
            src.popFront();
            break;
        case 'c':
            if (config.tokenStyle & TokenStyle.includeQuotes)
                app.put('c');
            src.popFront();
            goto default;
        default:
            current.type = TokenType.stringLiteral;
            break;
        }
    }
    current.value = cast(string) app.data;
}
|
|
|
|
/// Lexes a special token sequence such as #line. If the text after '#'
/// does not match a #line directive, a plain hash token is emitted.
void lexSpecialTokenSequence()
in
{
    assert (src.front == '#');
}
body
{
    nextChar();
    // scan ahead on a saved copy so nothing is consumed unless the
    // sequence actually matches
    auto r = src.save();
    auto app = appender!(ubyte[])();
    app.put('#');
    while (true)
    {
        if (r.isRangeEoF())
        {
            errorMessage("Found EOF when interpreting special token sequence");
            return;
        }
        else if (isNewline(r.front))
            break;
        else
        {
            app.put(r.front);
            r.popFront();
        }
    }
    auto m = match((cast(char[]) app.data),
        `#line\s+(?P<line>\d+)\s*(?P<filespec>".+")*?`);
    if (m)
    {
        current.type = TokenType.specialTokenSequence;
        current.value = (cast(char[]) app.data).idup;
        column += app.data.length;
        // NOTE(review): app.data includes the already-consumed '#', so
        // this pops one character beyond the scanned text (presumably
        // the newline) — confirm intended, especially for \r\n input.
        foreach (i; 0 .. app.data.length)
            src.popFront();
        auto c = m.captures;
        if (c["filespec"])
            config.fileName = c["filespec"].idup;
        auto l = c["line"];
        lineNumber = parse!uint(l);
    }
    else
    {
        // not a recognized directive: emit a plain '#' token
        current.type = TokenType.hash;
        current.value = tokenValue!(TokenType.hash);
    }
}
|
|
|
|
//=====================================================================
|
|
// Helpers for lexXYZ functions
|
|
//=====================================================================
|
|
/// Skips over one escape sequence without decoding it (used when the
/// token value is taken verbatim from source). Performs only minimal
/// validity checks; reports malformed sequences via errorMessage.
/// On entry src is positioned on the backslash.
void skipEscapeSequence()
{
    // no decoding, just minor sanity checks
    nextChar(); // consume '\\'
    switch (src.front)
    {
    // Single-character escapes.
    case '\'':
    case '"':
    case '?':
    case '\\':
    case 'a':
    case 'b':
    case 'f':
    case 'n':
    case 'r':
    case 't':
    case 'v':
    case 0x0a: // escaped newline
    case 0x00: // escaped NUL
        nextChar();
        return;
    case '0': .. case '7':
        // Up to three octal digits.
        foreach(i; 0 .. 3)
        {
            nextChar();
            if (src.front < '0' || src.front > '7') return;
        }
        return;
    case 'x':
        // Exactly two hex digits.
        nextChar();
        foreach(i; 0 .. 2)
        {
            if (!isHexDigit(src.front))
            {
                errorMessage("Expected hex digit");
                return;
            }
            nextChar();
        }
        return;
    case 'u':
    case 'U':
        // \u needs 4 hex digits, \U needs 8.
        uint digits = src.front == 'u' ? 4 : 8;
        nextChar();
        foreach (i; 0 .. digits)
        {
            if (!isHexDigit(src.front))
            {
                errorMessage("Expected hex digit instead of %s".format(
                    cast(char) src.front));
                return;
            }
            nextChar();
        }
        return;
    case '&':
        // Named character entity: skip until ';' or EOF.
        while (!isEoF())
        {
            nextChar();
            if (src.front == ';')
                break;
        }
        return;
    default:
        errorMessage("Invalid escape sequence");
        return;
    }
}
|
|
|
|
/// Decodes one escape sequence and writes the decoded bytes to dest.
/// On entry src is positioned on the backslash.
///
/// Params:
///     dest = output range of ubyte receiving the decoded bytes
/// Returns: the number of bytes written to dest (1 after a reported error,
/// 0 when a numeric escape fails to encode as UTF-8).
size_t decodeEscapeSequence(OutputRange)(OutputRange dest)
in
{
    assert (src.front == '\\');
}
body
{
    // Parses the collected digits as a code point in the given radix and
    // writes its UTF-8 encoding to dest; returns the encoded length.
    size_t reencodeNumeric(ubyte[] src, int radix, OutputRange dest)
    {
        char[] chunk = cast(char[])src;
        char[4] utfBuf;
        uint codepoint = parse!uint(chunk, radix);
        size_t len;
        try
            len = encode(utfBuf, codepoint);
        catch (UTFException ex)
        {
            errorMessage(ex.msg);
            return 0;
        }
        dest.put(cast(ubyte[]) utfBuf[0..len]);
        return len;
    }

    ubyte[40] buffer; // scratch space for digits / entity names
    src.popFront(); // consume '\\'
    switch (src.front)
    {
    case '\'':
    case '"':
    case '?':
    case '\\':
        // BUG FIX: the escaped character was previously stored only in the
        // local scratch buffer and never forwarded to dest, so \' \" \? \\
        // were silently dropped from the decoded string.
        dest.put(src.front);
        src.popFront();
        return 1;
    // Standard single-character escapes.
    case 'a': dest.put('\a'); src.popFront(); return 1;
    case 'b': dest.put('\b'); src.popFront(); return 1;
    case 'f': dest.put('\f'); src.popFront(); return 1;
    case 'n': dest.put('\n'); src.popFront(); return 1;
    case 'r': dest.put('\r'); src.popFront(); return 1;
    case 't': dest.put('\t'); src.popFront(); return 1;
    case 'v': dest.put('\v'); src.popFront(); return 1;
    case 0x0a: dest.put(cast(ubyte)0x0a); src.popFront(); return 1;
    case 0x00: dest.put(cast(ubyte)0x00); src.popFront(); return 1;
    case '0': .. case '7':
        // Up to three octal digits, re-encoded as a code point.
        size_t idx = 0;
        while(idx < 3 && !isEoF())
        {
            buffer[idx++] = src.front;
            src.popFront();
            if (src.front < '0' || src.front > '7') break;
        }
        return reencodeNumeric(buffer[0..idx], 8, dest);
    case 'x':
        // Exactly two hex digits.
        src.popFront();
        foreach(i; 0 .. 2)
        {
            if (!isHexDigit(src.front))
            {
                errorMessage("Expected hex digit");
                return 1;
            }
            buffer[i] = src.front;
            src.popFront();
        }
        return reencodeNumeric(buffer[0..2], 16, dest);
    case 'u':
    case 'U':
        // \u needs 4 hex digits, \U needs 8.
        uint digitCount = src.front == 'u' ? 4 : 8;
        src.popFront();
        foreach (i; 0 .. digitCount)
        {
            if (!isHexDigit(src.front))
            {
                errorMessage("Expected hex digit");
                return 1;
            }
            buffer[i] = src.front;
            src.popFront();
        }
        return reencodeNumeric(buffer[0..digitCount], 16, dest);
    case '&':
        // Named character entity: \&name;
        src.popFront();
        size_t idx = 0;
        while (!isEoF())
        {
            if (isAlpha(src.front))
            {
                buffer[idx++] = src.front;
                if(idx == buffer.length) // way over maximum length
                    errorMessage("Invalid character entity");
                src.popFront();
            }
            else if (src.front == ';')
            {
                src.popFront();
                break;
            }
            else
            {
                errorMessage("Invalid character entity");
                return idx;
            }
        }
        //TODO: avoid looking up as UTF string, use raw bytes
        string chunk = cast(string)buffer[0..idx];
        // Binary-search the sorted entity table by name.
        auto names = assumeSorted(map!"a.name"(characterEntities));
        auto place = names.lowerBound(chunk).length;
        if (place == names.length || names[place] != chunk)
        {
            errorMessage("Invalid character entity \"&%s;\""
                .format(cast(string) chunk));
            return 1;
        }
        auto entity = characterEntities[place].value;
        dest.put(cast(ubyte[]) entity);
        return entity.length;
    default:
        errorMessage("Invalid escape sequence");
        return 1;
    }
}
|
|
|
|
// advances underlying mark-slice range and counts lines, cols
|
|
void nextChar()
|
|
{
|
|
bool foundNewline;
|
|
if (src.front == '\r')
|
|
{
|
|
src.popFront();
|
|
foundNewline = true;
|
|
}
|
|
if (src.front == '\n')
|
|
{
|
|
src.popFront();
|
|
foundNewline = true;
|
|
}
|
|
else
|
|
{
|
|
src.popFront();
|
|
}
|
|
if (foundNewline)
|
|
{
|
|
++lineNumber;
|
|
column = 0;
|
|
}
|
|
else
|
|
++column;
|
|
|
|
}
|
|
|
|
//same but don't bother for LF sequences
|
|
// Same as nextChar, but skips the newline bookkeeping; only valid when
// the caller knows the current character is not part of a line ending.
void nextCharNonLF()
{
    src.popFront();
    ++column;
}
|
|
|
|
// Sets current.value to the interned copy of the marked slice
// (mark() .. current position) of the source.
void setTokenValue()()
{
    current.value = cache.get(src.slice);
}
|
|
|
|
// Sets current.value to a trimmed sub-slice of the marked slice:
// startOffset bytes dropped from the front, -endOffset from the back
// (endOffset must be <= 0).
void setTokenValue()(int startOffset, int endOffset)
in
{
    assert(startOffset >= 0);
    assert(endOffset <= 0);
}
body
{
    auto piece = src.slice;
    // avoid unsigned arithmetic as endOffset is negative
    int end = cast(int)piece.length + endOffset;
    current.value = cache.get(src.slice[startOffset .. end]);
}
|
|
|
|
// Sets current.value to the interned copy of an arbitrary byte range
// (used when the value does not come from the marked slice).
void setTokenValue(R)(R range)
    if(isRandomAccessRange!R && is(ElementType!R : const(ubyte)))
{
    current.value = cache.get(range);
}
|
|
|
|
// True at end of input; a NUL byte or SUB (0x1a, Ctrl+Z) also terminates
// the source, matching the other end-of-file markers handled by isRangeEoF.
bool isEoF() const
{
    return src.empty || src.front == 0 || src.front == 0x1a;
}
|
|
|
|
// True if the current character terminates an identifier/number: ASCII
// punctuation, control characters, backtick, or a Unicode line/paragraph
// separator. '_' (0x5f) and alphanumerics fall outside every range below.
bool isSeparating()
{
    immutable c = src.front;
    return c <= 0x2f                       // controls, space, ! .. /
        || (c >= ':' && c <= '@')          // : ; < = > ? @
        || (c >= '[' && c <= '^')          // [ \ ] ^
        || (c >= '{' && c <= '~')          // { | } ~
        || c == '`'
        || ((c & 0x80) && isLongWhite());  // U+2028 / U+2029
}
|
|
|
|
// True if the current character is whitespace: ASCII space or \t \n \v \f \r,
// or (for multi-byte UTF-8) a Unicode line/paragraph separator.
bool isWhite()
{
    immutable b = src.front;
    if (b & 0x80) // multi-byte utf-8 lead byte
        return isLongWhite();
    return b == 0x20 || (b >= 0x09 && b <= 0x0d);
}
|
|
|
|
// True if the bytes at the current position spell U+2028 (LINE SEPARATOR)
// or U+2029 (PARAGRAPH SEPARATOR): UTF-8 E2 80 A8 / E2 80 A9.
bool isLongWhite()
{
    assert(src.front & 0x80); // only called on non-ascii lead bytes
    //TODO: here and elsewhere we'd better have
    // some kind of lookahead in LexSource instead of .save
    auto look = src.save();
    if (look.front != 0xe2)
        return false;
    look.popFront();
    if (look.empty || look.front != 0x80)
        return false;
    look.popFront();
    return !look.empty && (look.front == 0xa8 || look.front == 0xa9);
}
|
|
|
|
/// Replaces special tokens (__DATE__, __TIME__, __TIMESTAMP__, __VENDOR__,
/// __VERSION__, __LINE__, __FILE__) with their expanded literal values,
/// rewriting current.type/current.value in place. Other token types are
/// left untouched.
void expandSpecialToken()
{
    switch (current.type)
    {
    case TokenType.specialDate:
        current.type = TokenType.stringLiteral;
        auto time = Clock.currTime();
        current.value = format("%s %02d %04d", time.month, time.day, time.year);
        return;
    case TokenType.specialTime:
        auto time = Clock.currTime();
        current.type = TokenType.stringLiteral;
        current.value = (cast(TimeOfDay)(time)).toISOExtString();
        return;
    case TokenType.specialTimestamp:
        auto time = Clock.currTime();
        auto dt = cast(DateTime) time;
        current.type = TokenType.stringLiteral;
        current.value = format("%s %s %02d %02d:%02d:%02d %04d",
            dt.dayOfWeek, dt.month, dt.day, dt.hour, dt.minute,
            dt.second, dt.year);
        return;
    case TokenType.specialVendor:
        current.type = TokenType.stringLiteral;
        current.value = config.vendorString;
        return;
    case TokenType.specialVersion:
        current.type = TokenType.stringLiteral;
        current.value = format("%d", config.versionNumber);
        return;
    case TokenType.specialLine:
        // Note: expands to an integer literal, not a string.
        current.type = TokenType.intLiteral;
        current.value = format("%d", current.line);
        return;
    case TokenType.specialFile:
        current.type = TokenType.stringLiteral;
        current.value = config.fileName;
        return;
    default:
        return;
    }
}
|
|
|
|
/// Reports a lexing error at the current token's position: forwards to
/// config.errorFunc when one is set, otherwise throws an Exception with
/// the position baked into the message.
void errorMessage(string s)
{
    import std.string: format;
    if (config.errorFunc !is null)
        config.errorFunc(config.fileName, current.startIndex,
            current.line, current.column, s);
    else
        throw new Exception(format("%s(%d:%d): %s",
            config.fileName, current.line, current.column, s));
}
|
|
|
|
/// Constructs the token range over a lexer source, taking ownership of
/// both arguments (they are moved, not copied).
this(LexSrc lex, LexerConfig cfg)
{
    src = move(lex); // lex is r-value
    lineNumber = 1;  // lines are 1-based
    column = 0;      // columns are 0-based
    _empty = false;
    config = move(cfg); // ditto with cfg
    cache = StringCache(initialTableSize);
}
|
|
enum initialTableSize = 2048; // initial size of the token-value cache

Token current;      // most recently lexed token
uint lineNumber;    // 1-based line of the current position
uint column;        // 0-based column of the current position
LexSrc src;         // wrapped input range (mark-slice source)
bool _empty;        // set once the token range is exhausted
LexerConfig config; // may be mutated by #line special token sequences
StringCache cache;  // interning cache backing token values
|
|
}
|
|
|
|
/**
 * Returns: true if the token is an operator
 * (relies on operators forming a contiguous run at the start of TokenType)
 */
pure nothrow bool isOperator(const TokenType t)
{
    return t >= TokenType.assign && t <= TokenType.xorEqual;
}

/**
 * ditto
 */
pure nothrow bool isOperator(ref const Token t)
{
    return isOperator(t.type);
}
|
|
|
|
/**
 * Returns: true if the token is a keyword
 * (covers the contiguous bool_ .. with_ run of TokenType, which includes
 * the basic types and attributes)
 */
pure nothrow bool isKeyword(const TokenType t)
{
    return t >= TokenType.bool_ && t <= TokenType.with_;
}

/**
 * ditto
 */
pure nothrow bool isKeyword(ref const Token t)
{
    return isKeyword(t.type);
}
|
|
|
|
/**
 * Returns: true if the token is a built-in type
 * (the contiguous bool_ .. wchar_ run of TokenType)
 */
pure nothrow bool isBasicType(const TokenType t)
{
    return t >= TokenType.bool_ && t <= TokenType.wchar_;
}

/**
 * ditto
 */
pure nothrow bool isBasicType(ref const Token t)
{
    return isBasicType(t.type);
}
|
|
|
|
/**
 * Returns: true if the token is an attribute
 * (the contiguous align_ .. static_ run of TokenType, which also spans
 * the protection keywords)
 */
pure nothrow bool isAttribute(const TokenType t)
{
    return t >= TokenType.align_ && t <= TokenType.static_;
}

/**
 * ditto
 */
pure nothrow bool isAttribute(ref const Token t)
{
    return isAttribute(t.type);
}
|
|
|
|
/**
 * Returns: true if the token is a protection attribute
 * (export, package, private, protected, public)
 */
pure nothrow bool isProtection(const TokenType t)
{
    return t >= TokenType.export_ && t <= TokenType.public_;
}

/**
 * ditto
 */
pure nothrow bool isProtection(ref const Token t)
{
    return isProtection(t.type);
}
|
|
|
|
/**
 * Returns: true if the token is a compile-time constant such as ___DATE__
 */
pure nothrow bool isConstant(const TokenType t)
{
    // NOTE(review): this range also spans specialTokenSequence, comment,
    // identifier, and scriptLine (they sit between specialPrettyFunction
    // and traits in the enum), which do not look like compile-time
    // constants -- confirm whether these bounds are intentional.
    return t >= TokenType.specialDate && t <= TokenType.traits;
}

/**
 * ditto
 */
pure nothrow bool isConstant(ref const Token t)
{
    return isConstant(t.type);
}
|
|
|
|
/**
 * Returns: true if the token is a string or number literal
 * (the contiguous doubleLiteral .. wstringLiteral run of TokenType,
 * which includes characterLiteral)
 */
pure nothrow bool isLiteral(const TokenType t)
{
    return t >= TokenType.doubleLiteral && t <= TokenType.wstringLiteral;
}

/**
 * ditto
 */
pure nothrow bool isLiteral(ref const Token t)
{
    return isLiteral(t.type);
}
|
|
|
|
/**
 * Returns: true if the token is a number literal
 * (the contiguous doubleLiteral .. ulongLiteral run of TokenType)
 */
pure nothrow bool isNumberLiteral(const TokenType t)
{
    return t >= TokenType.doubleLiteral && t <= TokenType.ulongLiteral;
}

/**
 * ditto
 */
pure nothrow bool isNumberLiteral(ref const Token t)
{
    return isNumberLiteral(t.type);
}
|
|
|
|
/**
 * Returns: true if the token is a string literal
 * (dstring, string, or wstring; characterLiteral is excluded)
 */
pure nothrow bool isStringLiteral(const TokenType t)
{
    return t >= TokenType.dstringLiteral && t <= TokenType.wstringLiteral;
}

/**
 * ditto
 */
pure nothrow bool isStringLiteral(ref const Token t)
{
    return isStringLiteral(t.type);
}
|
|
|
|
/**
 * Returns: true if the token is whitespace, a comment, a special token
 * sequence, or an identifier
 */
pure nothrow bool isMisc(const TokenType t)
{
    // BUG FIX: the previous range check
    //     t >= TokenType.comment && t <= TokenType.specialTokenSequence
    // was always false, because comment sorts *after* specialTokenSequence
    // in the TokenType enum. The documented categories are not contiguous
    // in the enum, so test them explicitly.
    return t == TokenType.whitespace
        || t == TokenType.comment
        || t == TokenType.specialTokenSequence
        || t == TokenType.identifier;
}

/**
 * ditto
 */
pure nothrow bool isMisc(ref const Token t)
{
    return isMisc(t.type);
}
|
|
|
/**
|
|
* Listing of all the tokens in the D language.
|
|
*/
|
|
enum TokenType: ushort
{
    // -- Operators and punctuation (contiguous; see isOperator) ----------
    assign, /// =
    at, /// @
    bitAnd, /// &
    bitAndEqual, /// &=
    bitOr, /// |
    bitOrEqual, /// |=
    catEqual, /// ~=
    colon, /// :
    comma, /// ,
    decrement, /// --
    div, /// /
    divEqual, /// /=
    dollar, /// $
    dot, /// .
    equal, /// ==
    goesTo, /// =>
    greater, /// >
    greaterEqual, /// >=
    hash, /// #
    increment, /// ++
    lBrace, /// {
    lBracket, /// [
    less, /// <
    lessEqual, /// <=
    lessEqualGreater, /// <>=
    lessOrGreater, /// <>
    logicAnd, /// &&
    logicOr, /// ||
    lParen, /// $(LPAREN)
    minus, /// -
    minusEqual, /// -=
    mod, /// %
    modEqual, /// %=
    mulEqual, /// *=
    not, /// !
    notEqual, /// !=
    notGreater, /// !>
    notGreaterEqual, /// !>=
    notLess, /// !<
    notLessEqual, /// !<=
    notLessEqualGreater, /// !<>
    plus, /// +
    plusEqual, /// +=
    pow, /// ^^
    powEqual, /// ^^=
    rBrace, /// }
    rBracket, /// ]
    rParen, /// $(RPAREN)
    semicolon, /// ;
    shiftLeft, /// <<
    shiftLeftEqual, /// <<=
    shiftRight, /// >>
    shiftRightEqual, /// >>=
    slice, /// ..
    star, /// *
    ternary, /// ?
    tilde, /// ~
    unordered, /// !<>=
    unsignedShiftRight, /// >>>
    unsignedShiftRightEqual, /// >>>=
    vararg, /// ...
    xor, /// ^
    xorEqual, /// ^=

    // -- Built-in types (contiguous; see isBasicType) --------------------
    bool_, /// $(D_KEYWORD bool)
    byte_, /// $(D_KEYWORD byte)
    cdouble_, /// $(D_KEYWORD cdouble)
    cent_, /// $(D_KEYWORD cent)
    cfloat_, /// $(D_KEYWORD cfloat)
    char_, /// $(D_KEYWORD char)
    creal_, /// $(D_KEYWORD creal)
    dchar_, /// $(D_KEYWORD dchar)
    double_, /// $(D_KEYWORD double)
    float_, /// $(D_KEYWORD float)
    function_, /// $(D_KEYWORD function)
    idouble_, /// $(D_KEYWORD idouble)
    ifloat_, /// $(D_KEYWORD ifloat)
    int_, /// $(D_KEYWORD int)
    ireal_, /// $(D_KEYWORD ireal)
    long_, /// $(D_KEYWORD long)
    real_, /// $(D_KEYWORD real)
    short_, /// $(D_KEYWORD short)
    ubyte_, /// $(D_KEYWORD ubyte)
    ucent_, /// $(D_KEYWORD ucent)
    uint_, /// $(D_KEYWORD uint)
    ulong_, /// $(D_KEYWORD ulong)
    ushort_, /// $(D_KEYWORD ushort)
    void_, /// $(D_KEYWORD void)
    wchar_, /// $(D_KEYWORD wchar)

    // -- Attributes (contiguous; see isAttribute / isProtection) ---------
    align_, /// $(D_KEYWORD align)
    deprecated_, /// $(D_KEYWORD deprecated)
    extern_, /// $(D_KEYWORD extern)
    pragma_, /// $(D_KEYWORD pragma)
    export_, /// $(D_KEYWORD export)
    package_, /// $(D_KEYWORD package)
    private_, /// $(D_KEYWORD private)
    protected_, /// $(D_KEYWORD protected)
    public_, /// $(D_KEYWORD public)
    abstract_, /// $(D_KEYWORD abstract)
    auto_, /// $(D_KEYWORD auto)
    const_, /// $(D_KEYWORD const)
    final_, /// $(D_KEYWORD final)
    gshared, /// $(D_KEYWORD __gshared)
    immutable_, /// $(D_KEYWORD immutable)
    inout_, /// $(D_KEYWORD inout)
    scope_, /// $(D_KEYWORD scope)
    shared_, /// $(D_KEYWORD shared)
    static_, /// $(D_KEYWORD static)

    // -- Remaining keywords (through with_; see isKeyword) ---------------
    synchronized_, /// $(D_KEYWORD synchronized)
    alias_, /// $(D_KEYWORD alias)
    asm_, /// $(D_KEYWORD asm)
    assert_, /// $(D_KEYWORD assert)
    body_, /// $(D_KEYWORD body)
    break_, /// $(D_KEYWORD break)
    case_, /// $(D_KEYWORD case)
    cast_, /// $(D_KEYWORD cast)
    catch_, /// $(D_KEYWORD catch)
    class_, /// $(D_KEYWORD class)
    continue_, /// $(D_KEYWORD continue)
    debug_, /// $(D_KEYWORD debug)
    default_, /// $(D_KEYWORD default)
    delegate_, /// $(D_KEYWORD delegate)
    delete_, /// $(D_KEYWORD delete)
    do_, /// $(D_KEYWORD do)
    else_, /// $(D_KEYWORD else)
    enum_, /// $(D_KEYWORD enum)
    false_, /// $(D_KEYWORD false)
    finally_, /// $(D_KEYWORD finally)
    foreach_, /// $(D_KEYWORD foreach)
    foreach_reverse_, /// $(D_KEYWORD foreach_reverse)
    for_, /// $(D_KEYWORD for)
    goto_, /// $(D_KEYWORD goto)
    if_, /// $(D_KEYWORD if)
    import_, /// $(D_KEYWORD import)
    in_, /// $(D_KEYWORD in)
    interface_, /// $(D_KEYWORD interface)
    invariant_, /// $(D_KEYWORD invariant)
    is_, /// $(D_KEYWORD is)
    lazy_, /// $(D_KEYWORD lazy)
    macro_, /// $(D_KEYWORD macro)
    mixin_, /// $(D_KEYWORD mixin)
    module_, /// $(D_KEYWORD module)
    new_, /// $(D_KEYWORD new)
    nothrow_, /// $(D_KEYWORD nothrow)
    null_, /// $(D_KEYWORD null)
    out_, /// $(D_KEYWORD out)
    override_, /// $(D_KEYWORD override)
    pure_, /// $(D_KEYWORD pure)
    ref_, /// $(D_KEYWORD ref)
    return_, /// $(D_KEYWORD return)
    struct_, /// $(D_KEYWORD struct)
    super_, /// $(D_KEYWORD super)
    switch_, /// $(D_KEYWORD switch)
    template_, /// $(D_KEYWORD template)
    this_, /// $(D_KEYWORD this)
    throw_, /// $(D_KEYWORD throw)
    true_, /// $(D_KEYWORD true)
    try_, /// $(D_KEYWORD try)
    typedef_, /// $(D_KEYWORD typedef)
    typeid_, /// $(D_KEYWORD typeid)
    typeof_, /// $(D_KEYWORD typeof)
    union_, /// $(D_KEYWORD union)
    unittest_, /// $(D_KEYWORD unittest)
    version_, /// $(D_KEYWORD version)
    volatile_, /// $(D_KEYWORD volatile)
    while_, /// $(D_KEYWORD while)
    with_, /// $(D_KEYWORD with)

    // -- Special tokens --------------------------------------------------
    specialDate, /// $(D_KEYWORD ___DATE__)
    specialEof, /// $(D_KEYWORD ___EOF__)
    specialTime, /// $(D_KEYWORD ___TIME__)
    specialTimestamp, /// $(D_KEYWORD ___TIMESTAMP__)
    specialVendor, /// $(D_KEYWORD ___VENDOR__)
    specialVersion, /// $(D_KEYWORD ___VERSION__)
    specialFile, /// $(D_KEYWORD ___FILE__)
    specialLine, /// $(D_KEYWORD ___LINE__)
    specialModule, /// $(D_KEYWORD ___MODULE__)
    specialFunction, /// $(D_KEYWORD ___FUNCTION__)
    specialPrettyFunction, /// $(D_KEYWORD ___PRETTY_FUNCTION__)
    specialTokenSequence, /// #line 10 "file.d"

    // -- Everything else -------------------------------------------------
    comment, /// $(D_COMMENT /** comment */) or $(D_COMMENT // comment) or $(D_COMMENT ///comment)
    identifier, /// anything else
    scriptLine, /// Line at the beginning of source file that starts from #!
    traits, /// $(D_KEYWORD ___traits)
    parameters, /// $(D_KEYWORD ___parameters)
    vector, /// $(D_KEYWORD ___vector)
    whitespace, /// whitespace

    // -- Literals (contiguous; see isLiteral and friends) ----------------
    doubleLiteral, /// 123.456
    floatLiteral, /// 123.456f or 0x123_45p-3
    idoubleLiteral, /// 123.456i
    ifloatLiteral, /// 123.456fi
    intLiteral, /// 123 or 0b1101010101
    longLiteral, /// 123L
    realLiteral, /// 123.456L
    irealLiteral, /// 123.456Li
    uintLiteral, /// 123u
    ulongLiteral, /// 123uL
    characterLiteral, /// 'a'
    dstringLiteral, /// $(D_STRING "32-bit string"d)
    stringLiteral, /// $(D_STRING "an 8-bit string")
    wstringLiteral, /// $(D_STRING "16-bit string"w)
}
|
|
|
|
// Implementation details follow
|
|
private:
|
|
|
|
// For now a private helper that is tailored to the way lexer works
|
|
// hides away forwardness of range by buffering
|
|
// RA-version is straightforward thin wrapping
|
|
// ATM it is byte-oriented
|
|
private struct LexSource(R)
    if(isForwardRange!R && !isRandomAccessRange!R)
{
    // Buffers bytes from a forward range into a power-of-two circular
    // buffer so that mark()/slice() can hand back already-consumed bytes.
    bool empty() const { return _empty; }

    // Current byte (already loaded into the accumulator).
    auto ref front() const
    {
        return accum[accumIdx];
    }

    // Next byte without consuming; only valid if it is already buffered.
    auto ref peek() const
    in
    {
        assert (accumIdx + 1 < accum.length);
    }
    body
    {
        return accum[accumIdx + 1];
    }

    void popFront()
    {
        ++_index;
        range.popFront();
        // if that was last byte
        // just advance so that open-righted slice just works
        accumIdx = (accumIdx+1) & mask;
        if(range.empty)
        {
            _empty = true;
            return;
        }
        // Buffer wrapped around onto the mark: grow it (doubling keeps
        // the power-of-two invariant) before overwriting marked bytes.
        if(accumIdx == savedAccumIdx)
        {
            // and move stuff around
            auto oldLen = accum.length;
            auto toCopy = oldLen - accumIdx;
            accum.length *= 2; // keep pow of 2
            // copy starting with last item
            copy(retro(accum[accumIdx..oldLen]),
                retro(accum[$-toCopy..$]));
            savedAccumIdx = accum.length - toCopy;
        }
        accum[accumIdx] = range.front;
    }

    auto save()
    {
        typeof(this) copy = this;
        copy.range = range.save;
        // sadly need to dup circular buffer, as it overwrites items
        copy.accum = copy.accum.dup;
        return copy;
    }

    // mark a position to slice from later on
    size_t mark()
    {
        savedAccumIdx = accumIdx;
        return accumIdx;
    }

    // slice to current position from previously marked position
    auto slice() @property
    {
        // it's an open right range as usual
        return CircularRange(accum, savedAccumIdx, accumIdx);
    }

    // Index of the current element in the original range.
    size_t index() const @property
    {
        return _index;
    }

private:
    this(R src, size_t bufferSize)
    {
        range = src;
        assert(bufferSize > 0);
        assert((bufferSize & (bufferSize-1)) == 0); //is power of 2
        accum = new ubyte[bufferSize];
        if(range.empty)
            _empty = true;
        else
            accum[accumIdx] = range.front; // load front
    }

    // a true RA-range of ubyte
    struct CircularRange
    {
        this(ubyte[] buf, size_t s, size_t e)
        {
            // NOTE(review): this asserts buffer.length *before* buffer is
            // assigned from buf, so it checks the default-empty member
            // (trivially passes) rather than validating buf -- confirm.
            assert((buffer.length & (buffer.length-1)) == 0);
            buffer = buf;
            start = s;
            end = e;
        }
        //Forward range primitives
        @property bool empty() const { return start == end; }
        @property auto ref front() const { return buffer[start]; }
        void popFront() { start = (start + 1) & mask; }
        @property auto save() { return this; }

        //Backwards is a bit slower, but should be rarely used (if at all)
        @property ref back(){ return buffer[(end-1) & mask]; }
        void popBack() { end = (end - 1) & mask; }

        // RA range primitives
        ref opIndex(size_t idx){ return buffer[(start+idx) & mask]; }
        @property size_t length()
        {
            // Accounts for slices that wrap around the end of the buffer.
            return end < start ? end + buffer.length -start : end - start;
        }
        alias length opDollar;

        auto opSlice(size_t newStart, size_t newEnd)
        {
            size_t maskedStart = (start+newStart) & mask;
            size_t maskedEnd = (start+newEnd) & mask;
            return typeof(this)(buffer, maskedStart, maskedEnd);
        }
        // @@@bug fwd-ref in ldc0.10 (if placed above previous one)
        auto opSlice(){ return opSlice(0, length); }
    private:
        @property auto mask(){ return buffer.length-1; } // length is pow of 2
        size_t start, end;
        ubyte[] buffer;
    }

    @property auto mask(){ return accum.length-1; } // accum length is pow of 2

    R range;
    bool _empty;
    ubyte[] accum; // accumulator buffer for non-RA ranges
    size_t savedAccumIdx; // mark() position; slice() starts here
    size_t accumIdx; // current index in accumulator
    size_t _index; // index of current element in original range
}
|
|
|
|
// TODO: make sure it's RandomAccess later
|
|
/*static assert(isRandomAccessRange!(
|
|
LexSource!(typeof(filter!"true"(cast(ubyte[])null)))
|
|
.CircularRange)
|
|
);*/
|
|
|
|
//trivial pass-through for RA ranges
|
|
private struct LexSource(R)
    if(isRandomAccessRange!R)
{
    // Thin mark-slice wrapper over a random-access range: no buffering
    // needed, mark()/slice() map directly onto indices.
    bool empty() const @property { return cur >= range.length; }
    bool canPeek() const { return cur + 1 < range.length; }
    auto ref front() const @property { return range[cur]; }
    void popFront(){ cur++; }

    // Next byte without consuming; only valid when canPeek() is true.
    auto ref peek() const
    in
    {
        assert (canPeek());
    }
    body
    {
        return range[cur + 1];
    }

    auto save()
    {
        typeof(this) copy = this;
        copy.range = range.save;
        return copy;
    }

    // Mark a position to slice from later on.
    auto mark()
    {
        saved = cur;
    }

    // use the underlying range slicing capability
    auto slice() @property
    {
        return range[saved..cur];
    }

    // Index of the current element in the original range.
    size_t index() const @property
    {
        return cur;
    }

private:
    this(R src)
    {
        range = src;
    }
    size_t cur, saved;
    R range;
}
|
|
|
|
/// Wraps a non-random-access forward range of bytes in a buffering
/// LexSource. bufSize is the initial circular-buffer size and must be a
/// power of two (asserted by the LexSource constructor).
auto lexerSource(Range)(Range range, size_t bufSize=8)
    if(isForwardRange!Range && !isRandomAccessRange!Range
    && is(ElementType!Range : const(ubyte)))
{
    return LexSource!(Range)(range, bufSize);
}
|
|
|
|
/// Wraps a random-access range of bytes in the pass-through LexSource;
/// no buffering is required.
auto lexerSource(Range)(Range range)
    if(isRandomAccessRange!Range
    && is(ElementType!Range : const(ubyte)))
{
    return LexSource!(Range)(range);
}
|
|
|
|
unittest
{
    // test the basic functionality of a "mark-slice" range
    import std.string, std.stdio;

    // Drives front/popFront/mark/slice/save against "Hello,world!"
    // (spaces filtered out in the forward-range variants); run twice via
    // save to verify the snapshot is independent of the original.
    static void test_hello(T)(T lexs)
    {
        assert(lexs.front == 'H');
        lexs.popFront();
        assert(lexs.front == 'e');
        foreach(i; 0..2)
        {
            auto saved = lexs.save;
            lexs.mark();
            assert(lexs.slice.equal(""));
            lexs.popFront();
            assert(lexs.slice.equal("e"), text(cast(char)lexs.front));
            lexs.popFrontN(4);
            auto bytes = lexs.slice.map!"cast(char)a".array();
            assert(bytes.equal("ello,"), bytes.to!string);
            lexs.mark();
            assert(lexs.slice.equal(""));
            assert(lexs.front == 'w');
            lexs.popFrontN(6);
            assert(lexs.empty);
            // Exercise the slice's range primitives (RA access, back,
            // sub-slicing) on the final "world!" chunk.
            auto s = lexs.slice();
            auto msg = s.save.map!"cast(char)a".array;
            assert(s[].equal("world!"), msg);
            assert(s[2..$-1].equal("rld"), msg);
            assert(s[0] == 'w' && s[$-1] == '!');
            s.popFront();
            assert(s.front == 'o' && s.back == '!');
            s.popBack();
            assert(s.front == 'o' && s.back == 'd');
            //restore and repeat again
            lexs = saved;
        }
    }

    // An empty source must be empty immediately and yield an empty slice.
    static void test_empty(T)(T lexs)
    {
        assert(lexs.empty);
        lexs.mark();
        assert(lexs.slice().equal(""));
    }

    auto fwdLex = lexerSource(
        "Hello, world!"
        .representation
        .filter!"a != ' '", 16 // a buffer that is more than enough
    );
    test_hello(fwdLex);
    fwdLex = lexerSource(
        "Hello, world!"
        .representation
        .filter!"a != ' '", 1 // try the smallest initial buffer
    );
    test_hello(fwdLex);
    fwdLex = lexerSource("".representation.filter!"a != ' '");
    auto raLex = lexerSource("".representation);
    test_empty(raLex);
    test_empty(fwdLex);
    raLex = lexerSource("Hello,world!".representation);
    test_hello(raLex);
}
|
|
|
|
// uses auto-detection for pure, safe nothrow
|
|
// True at end of input; NUL and SUB (0x1a, Ctrl+Z) bytes are treated as
// end-of-file markers, matching the member isEoF of the lexer.
// uses auto-detection for pure, safe nothrow
bool isRangeEoF(R)(ref R range)
{
    return range.empty || range.front == 0 || range.front == 0x1a;
}
|
|
|
|
// Lookup table for token values, indexed by TokenType. Entries must stay
// in exactly the same order as the TokenType enum; tokens with no fixed
// source text (identifiers, literals, comments, ...) map to null.
immutable(string[TokenType.max + 1]) tokenValues = [
    // Operators and punctuation (assign .. xorEqual)
    "=",
    "@",
    "&",
    "&=",
    "|",
    "|=",
    "~=",
    ":",
    ",",
    "--",
    "/",
    "/=",
    "$",
    ".",
    "==",
    "=>",
    ">",
    ">=",
    "#",
    "++",
    "{",
    "[",
    "<",
    "<=",
    "<>=",
    "<>",
    "&&",
    "||",
    "(",
    "-",
    "-=",
    "%",
    "%=",
    "*=",
    "!",
    "!=",
    "!>",
    "!>=",
    "!<",
    "!<=",
    "!<>",
    "+",
    "+=",
    "^^",
    "^^=",
    "}",
    "]",
    ")",
    ";",
    "<<",
    "<<=",
    ">>",
    ">>=",
    "..",
    "*",
    "?",
    "~",
    "!<>=",
    ">>>",
    ">>>=",
    "...",
    "^",
    "^=",
    // Built-in types (bool_ .. wchar_)
    "bool",
    "byte",
    "cdouble",
    "cent",
    "cfloat",
    "char",
    "creal",
    "dchar",
    "double",
    "float",
    "function",
    "idouble",
    "ifloat",
    "int",
    "ireal",
    "long",
    "real",
    "short",
    "ubyte",
    "ucent",
    "uint",
    "ulong",
    "ushort",
    "void",
    "wchar",
    // Attributes (align_ .. static_)
    "align",
    "deprecated",
    "extern",
    "pragma",
    "export",
    "package",
    "private",
    "protected",
    "public",
    "abstract",
    "auto",
    "const",
    "final",
    "__gshared",
    "immutable",
    "inout",
    "scope",
    "shared",
    "static",
    // Remaining keywords (synchronized_ .. with_)
    "synchronized",
    "alias",
    "asm",
    "assert",
    "body",
    "break",
    "case",
    "cast",
    "catch",
    "class",
    "continue",
    "debug",
    "default",
    "delegate",
    "delete",
    "do",
    "else",
    "enum",
    "false",
    "finally",
    "foreach",
    "foreach_reverse",
    "for",
    "goto",
    "if",
    "import",
    "in",
    "interface",
    "invariant",
    "is",
    "lazy",
    "macro",
    "mixin",
    "module",
    "new",
    "nothrow",
    "null",
    "out",
    "override",
    "pure",
    "ref",
    "return",
    "struct",
    "super",
    "switch",
    "template",
    "this",
    "throw",
    "true",
    "try",
    "typedef",
    "typeid",
    "typeof",
    "union",
    "unittest",
    "version",
    "volatile",
    "while",
    "with",
    // Special tokens (specialDate .. specialPrettyFunction)
    "__DATE__",
    "__EOF__",
    "__TIME__",
    "__TIMESTAMP__",
    "__VENDOR__",
    "__VERSION__",
    "__FILE__",
    "__LINE__",
    "__MODULE__",
    "__FUNCTION__",
    "__PRETTY_FUNCTION__",
    null, // specialTokenSequence -- no single fixed text
    null, // comment
    null, // identifier
    null, // scriptLine
    "__traits",
    "__parameters",
    "__vector",
    null, // whitespace
    null, // doubleLiteral
    null, // floatLiteral
    null, // idoubleLiteral
    null, // ifloatLiteral
    null, // intLiteral
    null, // longLiteral
    null, // realLiteral
    null, // irealLiteral
    null, // uintLiteral
    null, // ulongLiteral
    null, // characterLiteral
    null, // dstringLiteral
    null, // stringLiteral
    null, // wstringLiteral
];
|
|
|
|
/// Returns: the fixed source text for a token type, or null for token
/// types without one (identifiers, literals, comments, whitespace, ...).
pure string getTokenValue(const TokenType type)
{
    return tokenValues[type];
}
|
|
|
|
/// Compile-time constant variant of getTokenValue for a known token type.
template tokenValue(TokenType val)
{
    enum tokenValue = getTokenValue(val);
}
|
|
|
|
/**
 * Returns true when the given byte is an ASCII line terminator
 * (line feed or carriage return).
 */
private pure bool isNewline(ubyte ch)
{
    switch (ch)
    {
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
|
|
|
|
/**
 * Determines whether $(D input) spells a D keyword or special token and
 * returns the corresponding TokenType; returns TokenType.identifier for
 * anything else.
 *
 * The lookup dispatches on length first, then on the first character, and
 * only then compares the remaining characters, so ordinary identifiers are
 * rejected cheaply without hashing.
 *
 * Fix: for lengths 11, 13, 15, and 19 the previous code compared only
 * input[1..$] and never examined input[0], so identifiers such as
 * "x_VERSION__", "x_TIMESTAMP__", "xoreach_reverse", and
 * "x_PRETTY_FUNCTION__" were misclassified as keywords. Those cases now
 * compare the entire input.
 */
pure TokenType lookupTokenType(R)(R input)
{
    switch(input.length)
    {
    case 2:
        switch (input[0])
        {
        case 'd': if (input[1] == 'o') return TokenType.do_; else break;
        case 'i':
            if (input[1] == 'f') return TokenType.if_;
            else if (input[1] == 'n') return TokenType.in_;
            else if (input[1] == 's') return TokenType.is_;
            else break;
        default: break;
        }
        break;
    case 3:
        switch (input[0])
        {
        case 'a': if (input[1..$].equal("sm")) return TokenType.asm_; else break;
        case 'f': if (input[1..$].equal("or")) return TokenType.for_; else break;
        case 'i': if (input[1..$].equal("nt")) return TokenType.int_; else break;
        case 'n': if (input[1..$].equal("ew")) return TokenType.new_; else break;
        case 'o': if (input[1..$].equal("ut")) return TokenType.out_; else break;
        case 'r': if (input[1..$].equal("ef")) return TokenType.ref_; else break;
        case 't': if (input[1..$].equal("ry")) return TokenType.try_; else break;
        default: break;
        }
        break;
    case 4:
        switch (input[0])
        {
        case 'a': if (input[1..$].equal("uto")) return TokenType.auto_; else break;
        case 'b': if (input[1..$].equal("ody")) return TokenType.body_;
            else if (input[1..$].equal("ool")) return TokenType.bool_;
            else if (input[1..$].equal("yte")) return TokenType.byte_;
            else break;
        case 'c': if (input[1..$].equal("ase")) return TokenType.case_;
            else if (input[1..$].equal("ast")) return TokenType.cast_;
            else if (input[1..$].equal("ent")) return TokenType.cent_;
            else if (input[1..$].equal("har")) return TokenType.char_;
            else break;
        case 'e': if (input[1..$].equal("lse")) return TokenType.else_;
            else if (input[1..$].equal("num")) return TokenType.enum_;
            else break;
        case 'g': if (input[1..$].equal("oto")) return TokenType.goto_; else break;
        case 'l': if (input[1..$].equal("azy")) return TokenType.lazy_;
            else if (input[1..$].equal("ong")) return TokenType.long_;
            else break;
        case 'n': if (input[1..$].equal("ull")) return TokenType.null_; else break;
        case 'p': if (input[1..$].equal("ure")) return TokenType.pure_; else break;
        case 'r': if (input[1..$].equal("eal")) return TokenType.real_; else break;
        case 't': if (input[1..$].equal("his")) return TokenType.this_;
            else if (input[1..$].equal("rue")) return TokenType.true_;
            else break;
        case 'u': if (input[1..$].equal("int")) return TokenType.uint_; else break;
        case 'v': if (input[1..$].equal("oid")) return TokenType.void_; else break;
        case 'w': if (input[1..$].equal("ith")) return TokenType.with_; else break;
        default: break;
        }
        break;
    case 5:
        switch (input[0])
        {
        case 'a': if (input[1..$].equal("lias")) return TokenType.alias_;
            else if (input[1..$].equal("lign")) return TokenType.align_; else break;
        case 'b': if (input[1..$].equal("reak")) return TokenType.break_; else break;
        case 'c': if (input[1..$].equal("atch")) return TokenType.catch_;
            else if (input[1..$].equal("lass")) return TokenType.class_;
            else if (input[1..$].equal("onst")) return TokenType.const_;
            else if (input[1..$].equal("real")) return TokenType.creal_;
            else break;
        case 'd': if (input[1..$].equal("char")) return TokenType.dchar_;
            else if (input[1..$].equal("ebug")) return TokenType.debug_; else break;
        case 'f': if (input[1..$].equal("alse")) return TokenType.false_;
            else if (input[1..$].equal("inal")) return TokenType.final_;
            else if (input[1..$].equal("loat")) return TokenType.float_;
            else break;
        case 'i': if (input[1..$].equal("nout")) return TokenType.inout_;
            else if (input[1..$].equal("real")) return TokenType.ireal_; else break;
        case 'm': if (input[1..$].equal("acro")) return TokenType.macro_;
            else if (input[1..$].equal("ixin")) return TokenType.mixin_; else break;
        case 's': if (input[1..$].equal("cope")) return TokenType.scope_;
            else if (input[1..$].equal("hort")) return TokenType.short_;
            else if (input[1..$].equal("uper")) return TokenType.super_; else break;
        case 't': if (input[1..$].equal("hrow")) return TokenType.throw_; else break;
        case 'u': if (input[1..$].equal("byte")) return TokenType.ubyte_;
            else if (input[1..$].equal("cent")) return TokenType.ucent_;
            else if (input[1..$].equal("long")) return TokenType.ulong_;
            else if (input[1..$].equal("nion")) return TokenType.union_;
            else break;
        case 'w': if (input[1..$].equal("char")) return TokenType.wchar_;
            else if (input[1..$].equal("hile")) return TokenType.while_;
            else break;
        default: break;
        }
        break;
    case 6:
        switch (input[0])
        {
        case 'a': if (input[1..$].equal("ssert")) return TokenType.assert_; else break;
        case 'c': if (input[1..$].equal("float")) return TokenType.cfloat_; else break;
        case 'd': if (input[1..$].equal("elete")) return TokenType.delete_;
            else if (input[1..$].equal("ouble")) return TokenType.double_; else break;
        case 'e': if (input[1..$].equal("xport")) return TokenType.export_;
            else if (input[1..$].equal("xtern")) return TokenType.extern_; else break;
        case 'i': if (input[1..$].equal("float")) return TokenType.ifloat_;
            else if (input[1..$].equal("mport")) return TokenType.import_; else break;
        case 'm': if (input[1..$].equal("odule")) return TokenType.module_; else break;
        case 'p': if (input[1..$].equal("ragma")) return TokenType.pragma_;
            else if (input[1..$].equal("ublic")) return TokenType.public_; else break;
        case 'r': if (input[1..$].equal("eturn")) return TokenType.return_; else break;
        case 's': if (input[1..$].equal("hared")) return TokenType.shared_;
            else if (input[1..$].equal("tatic")) return TokenType.static_;
            else if (input[1..$].equal("truct")) return TokenType.struct_;
            else if (input[1..$].equal("witch")) return TokenType.switch_; else break;
        case 't': if (input[1..$].equal("ypeid")) return TokenType.typeid_;
            else if (input[1..$].equal("ypeof")) return TokenType.typeof_; else break;
        case 'u': if (input[1..$].equal("short")) return TokenType.ushort_; else break;
        default: break;
        }
        break;
    case 7:
        switch (input[0])
        {
        case '_': if (input[1..$].equal("_EOF__")) return TokenType.specialEof; else break;
        case 'c': if (input[1..$].equal("double")) return TokenType.cdouble_; else break;
        case 'd': if (input[1..$].equal("efault")) return TokenType.default_; else break;
        case 'f': if (input[1..$].equal("inally")) return TokenType.finally_;
            else if (input[1..$].equal("oreach")) return TokenType.foreach_; else break;
        case 'i': if (input[1..$].equal("double")) return TokenType.idouble_; else break;
        case 'n': if (input[1..$].equal("othrow")) return TokenType.nothrow_; else break;
        case 'p': if (input[1..$].equal("ackage")) return TokenType.package_;
            else if (input[1..$].equal("rivate")) return TokenType.private_; else break;
        case 't': if (input[1..$].equal("ypedef")) return TokenType.typedef_; else break;
        case 'v': if (input[1..$].equal("ersion")) return TokenType.version_; else break;
        default: break;
        }
        break;
    case 8:
        switch (input[0])
        {
        case '_': if (input[1..$].equal("_DATE__")) return TokenType.specialDate;
            else if (input[1..$].equal("_FILE__")) return TokenType.specialFile;
            else if (input[1..$].equal("_LINE__")) return TokenType.specialLine;
            else if (input[1..$].equal("_vector")) return TokenType.vector;
            else if (input[1..$].equal("_TIME__")) return TokenType.specialTime;
            else if (input[1..$].equal("_traits")) return TokenType.traits; else break;
        case 'a': if (input[1..$].equal("bstract")) return TokenType.abstract_; else break;
        case 'c': if (input[1..$].equal("ontinue")) return TokenType.continue_; else break;
        case 'd': if (input[1..$].equal("elegate")) return TokenType.delegate_; else break;
        case 'f': if (input[1..$].equal("unction")) return TokenType.function_; else break;
        case 'o': if (input[1..$].equal("verride")) return TokenType.override_; else break;
        case 't': if (input[1..$].equal("emplate")) return TokenType.template_; else break;
        case 'u': if (input[1..$].equal("nittest")) return TokenType.unittest_; else break;
        case 'v': if (input[1..$].equal("olatile")) return TokenType.volatile_; else break;
        default: break;
        }
        break;
    case 9:
        switch (input[0])
        {
        case '_': if (input[1..$].equal("_gshared")) return TokenType.gshared; else break;
        case 'i': if (input[1..$].equal("mmutable")) return TokenType.immutable_;
            else if (input[1..$].equal("nterface")) return TokenType.interface_;
            else if (input[1..$].equal("nvariant")) return TokenType.invariant_; else break;
        case 'p': if (input[1..$].equal("rotected")) return TokenType.protected_; else break;
        default: break;
        }
        break;
    case 10:
        switch (input[0])
        {
        case 'd': if (input[1..$].equal("eprecated")) return TokenType.deprecated_; else break;
        case '_':
            if (input[1..$].equal("_VENDOR__")) return TokenType.specialVendor;
            else if (input[1..$].equal("_MODULE__")) return TokenType.specialModule; else break;
        default: break;
        }
        break;
    case 11:
        // Compare the whole input: matching only input[1..$] would also
        // accept identifiers like "x_VERSION__".
        if (input.equal("__VERSION__"))
            return TokenType.specialVersion;
        break;
    case 12:
        switch (input[0])
        {
        case 's': if (input[1..$].equal("ynchronized")) return TokenType.synchronized_; else break;
        case '_': if (input[1..$].equal("_FUNCTION__")) return TokenType.specialFunction;
            else if (input[1..$].equal("_parameters")) return TokenType.parameters; else break;
        default: break;
        }
        break;
    case 13:
        // Whole-input comparison (see case 11).
        if (input.equal("__TIMESTAMP__"))
            return TokenType.specialTimestamp;
        break;
    case 15:
        // Whole-input comparison (see case 11).
        if (input.equal("foreach_reverse"))
            return TokenType.foreach_reverse_;
        break;
    case 19:
        // Whole-input comparison (see case 11).
        if (input.equal("__PRETTY_FUNCTION__"))
            return TokenType.specialPrettyFunction;
        break;
    default: break;
    }
    return TokenType.identifier;
}
|
|
|
|
/**
 * A trie (prefix tree) mapping keys — iterated element by element — to
 * values. The root doubles as a TrieNode so lookup code can treat all
 * levels uniformly.
 */
class Trie(K, V) if (isInputRange!K): TrieNode!(K, V)
{
    /**
     * Adds the given value to the trie with the given key
     */
    void add(K key, V value) pure
    {
        TrieNode!(K,V) node = this;
        foreach (part; key)
        {
            // Single AA lookup: `in` yields a pointer to the existing
            // child, or null when the edge must be created.
            auto existing = part in node.children;
            if (existing !is null)
            {
                node = *existing;
                continue;
            }
            auto created = new TrieNode!(K, V);
            node.children[part] = created;
            node = created;
        }
        // The node reached after consuming the whole key carries the value.
        node.value = value;
    }
}
|
|
|
|
/**
 * A single node of a trie: an optional payload plus outgoing edges keyed by
 * the next element of the key range.
 */
class TrieNode(K, V) if (isInputRange!K)
{
    // Payload for the key that ends at this node (V.init when this node is
    // only a prefix of longer keys).
    V value;
    // Child nodes, one per possible next key element.
    TrieNode!(K,V)[ElementType!K] children;
}
|
|
|
|
/**
 * Recursively renders the children of $(D node) as D $(D case) statements,
 * intended for string-mixin into the lexer's dispatch switch.
 * $(D indentString) controls the nesting depth of the emitted code.
 *
 * Leaf children emit code that sets the current token's type/value and
 * returns; interior children additionally emit an EOF check and a nested
 * switch on the next input character.
 */
string printCaseStatements(K, V)(TrieNode!(K,V) node, string indentString)
{
    string caseStatement = "";
    foreach(dchar k, TrieNode!(K,V) v; node.children)
    {
        // Emit "case 'X':" and consume the matched character.
        caseStatement ~= indentString;
        caseStatement ~= "case '";
        caseStatement ~= k;
        caseStatement ~= "':\n";
        caseStatement ~= indentString;
        caseStatement ~= "\tnextCharNonLF();\n";
        if (v.children.length > 0)
        {
            // Interior node: if input ends here, the prefix matched so far is
            // itself a complete token.
            // NOTE(review): this emits tokenValue!(<v.value>), which assumes
            // every interior node also stores a token name (non-null value);
            // verify against how generateCaseTrie is invoked.
            caseStatement ~= indentString;
            caseStatement ~= "\tif (isEoF())\n";
            caseStatement ~= indentString;
            caseStatement ~= "\t{\n";
            caseStatement ~= indentString;
            // node.children[k] is the same node as v here.
            caseStatement ~= "\t\tcurrent.value = tokenValue!("~node.children[k].value~");\n";
            caseStatement ~= indentString;
            caseStatement ~= "\t\tcurrent.type = " ~ node.children[k].value;
            caseStatement ~= ";\n";
            caseStatement ~= indentString;
            caseStatement ~= "\t\treturn;\n";
            caseStatement ~= indentString;
            caseStatement ~= "\t}\n";
            // Otherwise, dispatch on the next character one level deeper.
            caseStatement ~= indentString;
            caseStatement ~= "\tswitch (src.front)\n";
            caseStatement ~= indentString;
            caseStatement ~= "\t{\n";
            caseStatement ~= printCaseStatements(v, indentString ~ "\t");
            caseStatement ~= indentString;
            caseStatement ~= "\tdefault:\n";
            // No longer token matches: the prefix seen so far is the token.
            caseStatement ~= indentString;
            caseStatement ~= "\t\tcurrent.type = ";
            caseStatement ~= v.value;
            caseStatement ~= ";\n";
            caseStatement ~= indentString;
            caseStatement ~= "\t\tcurrent.value = tokenValue!("~v.value~");\n";
            caseStatement ~= indentString;
            caseStatement ~= "\t\treturn;\n";
            caseStatement ~= indentString;
            caseStatement ~= "\t}\n";
        }
        else
        {
            // Leaf node: the match is complete; set type/value and return.
            caseStatement ~= indentString;
            caseStatement ~= "\tcurrent.type = ";
            caseStatement ~= v.value;
            caseStatement ~= ";\n";
            caseStatement ~= indentString;
            caseStatement ~= "\tcurrent.value = tokenValue!("~v.value~");\n";
            caseStatement ~= indentString;
            caseStatement ~= "\treturn;\n";
        }
    }
    return caseStatement;
}
|
|
|
|
/**
 * Builds a trie from alternating (token text, TokenType member name)
 * arguments and renders it as nested case statements suitable for string
 * mixin into the lexer.
 */
string generateCaseTrie(string[] args ...)
{
    auto trie = new Trie!(string, string);
    // Arguments come in pairs: args[i] is the token's source text,
    // args[i + 1] the TokenType member name to emit for it.
    size_t i = 0;
    while (i < args.length)
    {
        trie.add(args[i], args[i + 1]);
        i += 2;
    }
    return printCaseStatements(trie, "");
}
|
|
|
|
/**
 * String interning cache backed by a chained hash table and a bump
 * allocator. Each distinct byte sequence is copied into GC-allocated chunk
 * storage exactly once; later lookups of equal content return the same
 * immutable string, reducing memory use for repeated identifiers/tokens.
 */
struct StringCache
{
    // startSize must be a power of two: bucket selection masks the hash
    // with (length - 1).
    this(size_t startSize)
    {
        assert((startSize & (startSize-1)) == 0);
        index = new Slot*[startSize];
    }

    /**
     * Returns the interned string whose contents equal $(D range), copying
     * the bytes into the cache on first sight.
     */
    string get(R)(R range)
        if(isRandomAccessRange!R
            && is(Unqual!(ElementType!R) : const(ubyte)))
    {
        uint h = hash(range);
        // NOTE(review): bucket is uint while index.length is size_t; this
        // assumes the table never exceeds 2^32 buckets — confirm.
        uint bucket = h & (index.length-1);
        Slot *s = index[bucket];
        // Empty bucket: insert directly.
        if(s == null)
        {
            string str = putIntoCache(range);
            index[bucket] = allocateSlot(str, h);
            uniqueSlots++;
            return str;
        }
        // Walk the chain looking for an existing entry; remember the tail
        // so a miss can append without re-traversing.
        for(;;)
        {
            // Compare hashes first to avoid byte-wise comparison on most
            // mismatches.
            if(s.hash == h && s.value.equal(range))
                return s.value;
            if(s.next == null) break;
            s = s.next;
        }
        string str = putIntoCache(range);
        s.next = allocateSlot(str, h);
        uniqueSlots++;
        // had at least 1 item in this bucket
        // and inserted another one - check load factor
        if(uniqueSlots*loadDenom > index.length*loadQuot)
            rehash();
        return str;
    }

private:

    // Substitution-box hash: each input byte selects a random 32-bit
    // constant from sbox, XORed in and mixed by multiplication.
    static uint hash(R)(R data)
    {
        uint hash = 0;
        foreach (b; data)
        {
            hash ^= sbox[b];
            hash *= 3;
        }
        return hash;
    }

    // One entry in a bucket's singly-linked chain. The full hash is cached
    // to cheapen comparisons and to avoid re-hashing during rehash().
    struct Slot
    {
        string value;
        Slot* next;
        uint hash;
    };

    // Debug helper: prints average and worst-case chain length.
    void printLoadFactor()
    {
        size_t cnt = 0, maxChain = 0;
        foreach(Slot* s; index)
        {
            size_t chain = 0;
            for(Slot* p = s; p; p = p.next)
            {
                chain++;
            }
            maxChain = max(chain, maxChain);
            cnt += chain;
        }
        import std.stdio;
        assert(cnt == uniqueSlots);
        writefln("Load factor: %.3f; max bucket %d",
            cast(double)cnt/index.length,
            maxChain);
    }

    // Doubles the table. Because the size is a power of two, each slot
    // either stays in bucket i or moves to bucket i + oldLen, decided by
    // one extra bit of its cached hash — no re-hashing needed.
    void rehash()
    {
        //writefln("BEFORE (size = %d):", index.length);
        //printLoadFactor();
        size_t oldLen = index.length;
        index.length *= 2;
        for (size_t i = 0; i < oldLen; i++)
        {
            Slot* cur = index[i], prev;
            while(cur)
            {
                //has extra bit set - move it out
                if(cur.hash & oldLen)
                {
                    if(prev == null)
                    {
                        // Unlinking the chain head: advance the bucket
                        // pointer itself.
                        Slot* r = cur;
                        index[i] = cur.next;
                        cur = cur.next;
                        insertIntoBucket(r, i + oldLen);
                    }
                    else
                    {
                        Slot* r = removeLink(cur, prev);
                        insertIntoBucket(r, i + oldLen);
                    }
                }
                else
                {
                    prev = cur;
                    cur = cur.next;
                }
            }
        }
        //writefln("AFTER (size = %d):", index.length);
        //printLoadFactor();
    }

    // Unlinks cur (which follows prev) from its chain, advances cur to the
    // next slot, and returns the removed slot.
    static Slot* removeLink(ref Slot* cur, Slot* prev)
    {
        prev.next = cur.next;
        Slot* r = cur;
        cur = cur.next;
        return r;
    }

    //insert at front of bucket
    void insertIntoBucket(Slot* what, size_t bucket)
    {
        // NOTE(review): the null assignment is immediately overwritten by
        // `what.next = p` below — redundant but harmless.
        what.next = null;
        Slot* p = index[bucket];
        what.next = p;
        index[bucket] = what;
    }

    // Carves a Slot out of chunk storage (not the GC heap per-slot) and
    // initializes it.
    Slot* allocateSlot(string val, uint hash)
    {
        auto slice = allocateInCache(Slot.sizeof);
        auto newSlot = cast(Slot*)slice.ptr;
        *newSlot = Slot(val, null, hash);
        return newSlot;
    }

    Slot*[] index;          // bucket heads; length is always a power of two
    size_t uniqueSlots;     // number of interned strings (load-factor input)
    enum loadQuot = 2, loadDenom = 3;   // rehash when load > 2/3

    // leave some slack for alloctors/GC meta-data
    enum chunkSize = 16*1024 - size_t.sizeof*8;
    ubyte*[] chunkS;        // all chunks ever allocated; last one is active
    size_t next = chunkSize;    // bump offset into the active chunk;
                                // starts "full" to force the first allocation
    //TODO: add aligned variant that allocates at word boundary
    ubyte[] allocateInCache(size_t size)
    {
        import core.memory;
        if(next + size > chunkSize)
        {
            // avoid huge allocations
            if(size> chunkSize/4)
            {
                ubyte* p = cast(ubyte*)GC.malloc(size,
                    GC.BlkAttr.NO_SCAN);
                return p[0..size];
            }
            chunkS ~= cast(ubyte*)GC.malloc(chunkSize,
                GC.BlkAttr.NO_SCAN);
            next = 0;
        }
        auto slice = chunkS[$-1][next..next+size];
        next += size;
        return slice;
    }

    // Copies the bytes of $(D data) into chunk storage and reinterprets the
    // copy as an immutable string (safe: the slice is never written again).
    string putIntoCache(R)(R data)
    {
        auto slice = allocateInCache(data.length);
        slice[] = data[];
        return cast(string)slice;
    }

}
|
|
|
|
// Substitution box for StringCache.hash: 256 fixed pseudo-random 32-bit
// constants, indexed by input byte value. Do not edit individual entries —
// the quality of the hash depends on the table as a whole.
immutable uint[] sbox = [
    0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
    0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
    0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
    0x514F4303, 0x7BE12B83, 0x7192F195, 0x82DC7300,
    0x084380B4, 0x480B55D3, 0x5F430471, 0x13F75991,
    0x3F9CF22C, 0x2FE0907A, 0xFD8E1E69, 0x7B1D5DE8,
    0xD575A85C, 0xAD01C50A, 0x7EE00737, 0x3CE981E8,
    0x0E447EFA, 0x23089DD6, 0xB59F149F, 0x13600EC7,
    0xE802C8E6, 0x670921E4, 0x7207EFF0, 0xE74761B0,
    0x69035234, 0xBFA40F19, 0xF63651A0, 0x29E64C26,
    0x1F98CCA7, 0xD957007E, 0xE71DDC75, 0x3E729595,
    0x7580B7CC, 0xD7FAF60B, 0x92484323, 0xA44113EB,
    0xE4CBDE08, 0x346827C9, 0x3CF32AFA, 0x0B29BCF1,
    0x6E29F7DF, 0xB01E71CB, 0x3BFBC0D1, 0x62EDC5B8,
    0xB7DE789A, 0xA4748EC9, 0xE17A4C4F, 0x67E5BD03,
    0xF3B33D1A, 0x97D8D3E9, 0x09121BC0, 0x347B2D2C,
    0x79A1913C, 0x504172DE, 0x7F1F8483, 0x13AC3CF6,
    0x7A2094DB, 0xC778FA12, 0xADF7469F, 0x21786B7B,
    0x71A445D0, 0xA8896C1B, 0x656F62FB, 0x83A059B3,
    0x972DFE6E, 0x4122000C, 0x97D9DA19, 0x17D5947B,
    0xB1AFFD0C, 0x6EF83B97, 0xAF7F780B, 0x4613138A,
    0x7C3E73A6, 0xCF15E03D, 0x41576322, 0x672DF292,
    0xB658588D, 0x33EBEFA9, 0x938CBF06, 0x06B67381,
    0x07F192C6, 0x2BDA5855, 0x348EE0E8, 0x19DBB6E3,
    0x3222184B, 0xB69D5DBA, 0x7E760B88, 0xAF4D8154,
    0x007A51AD, 0x35112500, 0xC9CD2D7D, 0x4F4FB761,
    0x694772E3, 0x694C8351, 0x4A7E3AF5, 0x67D65CE1,
    0x9287DE92, 0x2518DB3C, 0x8CB4EC06, 0xD154D38F,
    0xE19A26BB, 0x295EE439, 0xC50A1104, 0x2153C6A7,
    0x82366656, 0x0713BC2F, 0x6462215A, 0x21D9BFCE,
    0xBA8EACE6, 0xAE2DF4C1, 0x2A8D5E80, 0x3F7E52D1,
    0x29359399, 0xFEA1D19C, 0x18879313, 0x455AFA81,
    0xFADFE838, 0x62609838, 0xD1028839, 0x0736E92F,
    0x3BCA22A3, 0x1485B08A, 0x2DA7900B, 0x852C156D,
    0xE8F24803, 0x00078472, 0x13F0D332, 0x2ACFD0CF,
    0x5F747F5C, 0x87BB1E2F, 0xA7EFCB63, 0x23F432F0,
    0xE6CE7C5C, 0x1F954EF6, 0xB609C91B, 0x3B4571BF,
    0xEED17DC0, 0xE556CDA0, 0xA7846A8D, 0xFF105F94,
    0x52B7CCDE, 0x0E33E801, 0x664455EA, 0xF2C70414,
    0x73E7B486, 0x8F830661, 0x8B59E826, 0xBB8AEDCA,
    0xF3D70AB9, 0xD739F2B9, 0x4A04C34A, 0x88D0F089,
    0xE02191A2, 0xD89D9C78, 0x192C2749, 0xFC43A78F,
    0x0AAC88CB, 0x9438D42D, 0x9E280F7A, 0x36063802,
    0x38E8D018, 0x1C42A9CB, 0x92AAFF6C, 0xA24820C5,
    0x007F077F, 0xCE5BC543, 0x69668D58, 0x10D6FF74,
    0xBE00F621, 0x21300BBE, 0x2E9E8F46, 0x5ACEA629,
    0xFA1F86C7, 0x52F206B8, 0x3EDF1A75, 0x6DA8D843,
    0xCF719928, 0x73E3891F, 0xB4B95DD6, 0xB2A42D27,
    0xEDA20BBF, 0x1A58DBDF, 0xA449AD03, 0x6DDEF22B,
    0x900531E6, 0x3D3BFF35, 0x5B24ABA2, 0x472B3E4C,
    0x387F2D75, 0x4D8DBA36, 0x71CB5641, 0xE3473F3F,
    0xF6CD4B7F, 0xBF7D1428, 0x344B64D0, 0xC5CDFCB6,
    0xFE2E0182, 0x2C37A673, 0xDE4EB7A3, 0x63FDC933,
    0x01DC4063, 0x611F3571, 0xD167BFAF, 0x4496596F,
    0x3DEE0689, 0xD8704910, 0x7052A114, 0x068C9EC5,
    0x75D0E766, 0x4D54CC20, 0xB44ECDE2, 0x4ABC653E,
    0x2C550A21, 0x1A52C0DB, 0xCFED03D0, 0x119BAFE2,
    0x876A6133, 0xBC232088, 0x435BA1B2, 0xAE99BBFA,
    0xBB4F08E4, 0xA62B5F49, 0x1DA4B695, 0x336B84DE,
    0xDC813D31, 0x00C134FB, 0x397A98E6, 0x151F0E64,
    0xD9EB3E69, 0xD3C7DF60, 0xD2F2C336, 0x2DDD067B,
    0xBD122835, 0xB0B3BD3A, 0xB0D54E46, 0x8641F1E4,
    0xA0B38F96, 0x51D39199, 0x37A6AD75, 0xDF84EE41,
    0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A,
];
|
|
|
|
// Smoke test: basic tokenization of declarations and number literals, and
// IterationStyle.everything (whitespace tokens are included in the output).
unittest
{
    LexerConfig cfg;
    auto tkr = "void main(){ }".representation.byToken(cfg);
    assert(tkr.map!"a.value".equal(["void", "main", "(", ")", "{", "}"]));
    tkr = "1234 54.23232".representation.byToken(cfg);
    assert(tkr.equal(["1234", "54.23232"]));
    auto str = r"0 0. .0 1 0x3 0b102 007";
    cfg.iterStyle = IterationStyle.everything;
    tkr = str.representation.byToken(cfg);
    // "0b102" splits into the binary literal "0b10" plus a trailing "2".
    assert(tkr.map!"a.value".equal(["0", " ", "0.", " ",
        ".0", " ", "1", " ", "0x3", " ", "0b10",
        "2", " ", "007"]
        ), text(tkr.map!"a.value"));
}
|
|
|
|
// Verifies that every keyword and special token is lexed back to its own
// source text (TokenStyle.doNotReplaceSpecial keeps __VENDOR__ etc. literal).
unittest
{
    import std.stdio;
    auto source = cast(ubyte[]) (
        " bool byte cdouble cent cfloat char creal dchar double float function"
        ~ " idouble ifloat int ireal long real short ubyte ucent uint ulong"
        ~ " ushort void wchar align deprecated extern pragma export package private"
        ~ " protected public abstract auto const final __gshared immutable inout"
        ~ " scope shared static synchronized alias asm assert body break case"
        ~ " cast catch class continue debug default delegate delete do else"
        ~ " enum false finally foreach foreach_reverse for goto if import in"
        ~ " interface invariant is lazy macro mixin module new nothrow null"
        ~ " out override pure ref return struct super switch template this"
        ~ " throw true try typedef typeid typeof union unittest version volatile"
        ~ " while with __traits __parameters __vector __VENDOR__ __MODULE__"
        ~ " __VERSION__ __TIMESTAMP__ __PRETTY_FUNCTION__");
    auto expected = ["bool", "byte", "cdouble",
        "cent", "cfloat", "char", "creal",
        "dchar", "double", "float", "function",
        "idouble", "ifloat", "int", "ireal", "long",
        "real", "short", "ubyte", "ucent", "uint",
        "ulong", "ushort", "void", "wchar", "align",
        "deprecated", "extern", "pragma", "export",
        "package", "private", "protected", "public",
        "abstract", "auto", "const", "final", "__gshared",
        "immutable", "inout", "scope", "shared",
        "static", "synchronized", "alias", "asm", "assert",
        "body", "break", "case", "cast", "catch",
        "class", "continue", "debug", "default", "delegate",
        "delete", "do", "else", "enum", "false",
        "finally", "foreach", "foreach_reverse", "for",
        "goto", "if", "import", "in", "interface",
        "invariant", "is", "lazy","macro", "mixin",
        "module", "new", "nothrow", "null", "out",
        "override", "pure", "ref", "return", "struct",
        "super", "switch", "template", "this", "throw",
        "true", "try", "typedef", "typeid", "typeof",
        "union", "unittest", "version", "volatile",
        "while", "with", "__traits", "__parameters", "__vector",
        "__VENDOR__", "__MODULE__", "__VERSION__", "__TIMESTAMP__",
        "__PRETTY_FUNCTION__"];
    LexerConfig config;
    config.tokenStyle = TokenStyle.doNotReplaceSpecial;
    auto tokens = byToken(source, config);
    // writeln(tokens.map!"a.value"().array());
    assert (equal(map!"a.value"(tokens), expected));
}
|
|
|
|
// Verifies maximal-munch lexing of every operator token, including the
// multi-character forms (<<=, >>>=, !<>=, ^^=, ...).
unittest
{
    auto source = cast(ubyte[]) ("=@& &=| |=~=:,--/ /=$.===>> >=++{[< <=<>=<>&&||(- -=%%=*=!!=!>!>=!<!<=!<>+ +=^^^^=}]);<< <<=>> >>=..*?~!<>=>>>>>>=...^ ^=");
    auto expected = ["=", "@", "&", "&=", "|", "|=", "~=",
        ":", ",", "--", "/", "/=", "$", ".", "==",
        "=>", ">", ">=", "++", "{", "[", "<",
        "<=", "<>=", "<>", "&&", "||", "(", "-", "-=", "%",
        "%=", "*=", "!", "!=", "!>", "!>=", "!<",
        "!<=", "!<>", "+", "+=", "^^", "^^=",
        "}", "]", ")", ";", "<<", "<<=", ">>",
        ">>=", "..", "*", "?", "~", "!<>=",
        ">>>", ">>>=", "...", "^", "^="];
    LexerConfig config;
    auto tokens = byToken(source, config);
    //writeln(tokens.map!"a.value"().array());
    assert (equal(map!"a.value"(tokens), expected), map!"a.value"(tokens).text());
}
|
|
|
|
// Exercises integer/float literal lexing, including suffixes, hex and binary
// forms, comment skipping, and malformed literals. The error callback is
// expected to fire exactly twice (for the two invalid literals in the input).
unittest
{
    auto source = cast(ubyte[]) (`
        1 1.2 //comment
        1.2f 1u 1uL 0b011 0b1uu 0b1 /+abc/+def+/+/0x11001uL
        123e1L 123e+1f 123e-1i 15e++ 4ea 1.2u 4i 1337L 4.2L 1..2 4.3.5.8
        0xabc 0xabcp4 0x1P-10 0x40u 0x29L 0x4Lu 0xdeadbeef
        `);
    // Note how malformed input splits: "0b1uu" -> "0b1u" + "u",
    // "15e++" -> "15e+" + "+", "4ea" -> "4e" + "a", "1..2" -> "1" ".." "2".
    auto expected = ["1", "1.2", "1.2f", "1u", "1uL", "0b011", "0b1u", "u", "0b1",
        "0x11001uL", "123e1L", "123e+1f", "123e-1i", "15e+", "+", "4e", "a",
        "1.2", "u", "4i", "1337L", "4.2L", "1", "..", "2", "4.3", ".5", ".8",
        "0xabc", "0xabcp4", "0x1P-10", "0x40u", "0x29L", "0x4Lu", "0xdeadbeef"];
    int errCount = 0;
    // Counts lexing errors instead of aborting.
    void errorFunction(string file, size_t index, uint line, uint col, string msg)
    {
        ++errCount;
    }
    LexerConfig config;
    config.errorFunc = &errorFunction;
    auto tokens = byToken(source, config);
    //writeln(tokens.map!"a.value"());
    assert (equal(map!"a.value"(tokens), expected), map!"a.value"(tokens).text());
    assert (errCount == 2);
}
|
|
|
|
// Exercises the #line directive (line renumbering), token strings q{...},
// escaped character literals, and CRLF-delimited heredoc strings.
unittest
{
    auto source = cast(ubyte[]) ("int #line 4\n double q{abcde (a + b) == 0} '\\u0020' q\"HEREDOC\r\nabcde\r\nHEREDOC\"");
    LexerConfig config;
    auto tokens = byToken(source, config);
    assert (tokens.front.line == 1);
    assert (tokens.moveFront() == TokenType.int_);
    // #line 4 renumbers the following tokens.
    assert (tokens.front.line == 4);
    assert (isBasicType(tokens.front));
    assert (tokens.front.value == "double");
    tokens.popFront();
    // q{...} token string: the delimiters are stripped from the value.
    assert (tokens.front.value == "abcde (a + b) == 0", tokens.front.value);
    assert (isStringLiteral(tokens.front), tokens.front.type.text());
    tokens.popFront();
    // '\u0020' decodes to a single space character literal.
    assert (tokens.front.value == " ");
    assert (tokens.front.type == TokenType.characterLiteral);
    tokens.popFront();
    // Heredoc: the identifier delimiters and quotes are stripped.
    assert (tokens.front.value == "abcde\r\n", "[%s]".format(tokens.front.value));
}
|
|
|
|
// With TokenStyle.includeQuotes, string-literal token values keep their
// surrounding quote/delimiter characters verbatim.
unittest
{
    auto source = cast(ubyte[]) "q{(a & 1) == 0} q\"/foo]/\" q\"HEREDOC\r\nabcde\r\nHEREDOC\"";
    LexerConfig config;
    config.tokenStyle = TokenStyle.includeQuotes;
    auto tokens = byToken(source, config);
    assert (tokens.front.value == "q{(a & 1) == 0}", tokens.front.value);
    tokens.popFront();
    assert (tokens.front.value == "q\"/foo]/\"", tokens.front.value);
    tokens.popFront();
    assert (tokens.front.value == "q\"HEREDOC\r\nabcde\r\nHEREDOC\"", tokens.front.value);
}
|
|
|
|
// An unterminated string literal must invoke the error callback exactly once
// rather than throwing or looping.
unittest
{
    auto source = cast(ubyte[]) (`"string`);
    int errCount = 0;
    void errorFunction(string file, size_t index, uint line, uint col, string msg)
    {
        ++errCount;
    }
    LexerConfig config;
    config.errorFunc = &errorFunction;
    auto tokens = byToken(source, config);
    assert (errCount == 1);
}
|
|
|
|
// Exercises Token's comparison operators: equality/ordering between tokens
// and direct comparison of a token against its string value.
unittest
{
    auto source = cast(ubyte[]) ("import foo");
    LexerConfig config;
    auto tokens = byToken(source, config);
    Token a = tokens.moveFront();
    assert (a.type == TokenType.import_);
    Token b = tokens.moveFront();
    assert (b.type == TokenType.identifier);
    assert (a != b);
    assert (a != "foo");
    assert (a < b);
    assert (b == "foo");
    assert (b > a);
    assert (!(a > a));
    assert (tokens.empty);
}
|
|
|
|
// Sanity check: iterating the token range with foreach yields the expected
// number of tokens for a small program (whitespace excluded by default).
unittest
{
    auto source = cast(ubyte[]) ("import std.stdio; void main(){writeln(\"hello world\");}");
    LexerConfig config;
    auto tokens = byToken(source, config);
    int tokenCount = 0;
    foreach (t; tokens)
    {
        ++tokenCount;
    }
    assert (tokenCount == 16);
}
|
|
|
|
//void main(string[] args){}
|