Updated lexer docs. Implemented delete and fp operator rules. Fixed bug with AST traversal

This commit is contained in:
Hackerpilot 2014-01-26 22:47:21 -08:00
parent 2f78272fed
commit d13d680b74
9 changed files with 995 additions and 85 deletions

29
analysis/del.d Normal file
View File

@ -0,0 +1,29 @@
// Copyright Brian Schott (Sir Alaran) 2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
module analysis.del;
import stdx.d.ast;
import stdx.d.lexer;
import analysis.base;
/**
* Checks for use of the deprecated "delete" keyword
*/
class DeleteCheck : BaseAnalyzer
{
// Bring the base class's other visit overloads into scope so this
// override does not hide them.
alias visit = BaseAnalyzer.visit;
this(string fileName)
{
super(fileName);
}
// Reports each use of a `delete` expression at the expression's own
// line/column, then descends into its children so nested deletes in
// the operand are also caught.
override void visit(DeleteExpression d)
{
addErrorMessage(d.line, d.column, "Avoid using the deprecated delete keyword");
d.accept(this);
}
}

38
analysis/fish.d Normal file
View File

@ -0,0 +1,38 @@
// Copyright Brian Schott (Sir Alaran) 2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
module analysis.fish;
import stdx.d.ast;
import stdx.d.lexer;
import analysis.base;
/**
* Checks for use of the deprecated floating point comparison operators.
*/
class FloatOperatorCheck : BaseAnalyzer
{
// Bring the base class's other visit overloads into scope so this
// override does not hide them.
alias visit = BaseAnalyzer.visit;
this(string fileName)
{
super(fileName);
}
// Flags the deprecated NCEG floating-point comparison operators.
// Fix: "<>=" is also one of the eight deprecated operators (and the
// parser's RelExpression rule accepts tok!"<>="), but it was missing
// from the original list, so uses of it went unreported.
override void visit(RelExpression r)
{
if (r.operator == tok!"<>"
|| r.operator == tok!"<>="
|| r.operator == tok!"!<>"
|| r.operator == tok!"!<>="
|| r.operator == tok!"!>"
|| r.operator == tok!"!>="
|| r.operator == tok!"!<"
|| r.operator == tok!"!<=")
{
addErrorMessage(r.line, r.column, "Avoid using the deprecated floating-point operators");
}
r.accept(this);
}
}

View File

@ -15,6 +15,8 @@ import analysis.base;
import analysis.style;
import analysis.enumarrayliteral;
import analysis.pokemon;
import analysis.del;
import analysis.fish;
void messageFunction(string fileName, size_t line, size_t column, string message,
bool isError)
@ -63,8 +65,14 @@ void analyze(File output, string[] fileNames, bool staticAnalyze = true)
auto pokemon = new PokemonExceptionCheck(fileName);
pokemon.visit(m);
auto del = new DeleteCheck(fileName);
del.visit(m);
auto fish = new FloatOperatorCheck(fileName);
fish.visit(m);
foreach (message; sort(chain(enums.messages, style.messages,
pokemon.messages).array))
pokemon.messages, del.messages, fish.messages).array))
{
writeln(message);
}

5
main.d
View File

@ -109,10 +109,11 @@ int main(string[] args)
}
else if (tokenDump)
{
writeln("text blank\tindex\tline\tcolumn\tcomment");
foreach (token; tokens)
{
writeln(", token.text is null ? str(token.type) : token.text,
"» ", token.text !is null, " ", token.index, " ", token.line, " ", token.column, " ",
writefln("<<%20s>>%b\t%d\t%d\t%d", token.text is null ? str(token.type) : token.text,
token.text !is null, token.index, token.line, token.column,
token.comment);
}
return 0;

View File

@ -31,6 +31,58 @@ import std.string;
abstract class ASTVisitor
{
public:
/**
 * Dispatches a node known only as an ExpressionNode to the visit
 * overload for its concrete runtime type, via a chain of casts.
 * NOTE(review): every ExpressionNode subclass must appear in this
 * chain; a type missing here is silently skipped and traversal stops
 * at that node — verify the list stays in sync when expression types
 * are added.
 */
void visit(ExpressionNode n)
{
if (cast(AddExpression) n) visit(cast(AddExpression) n);
else if (cast(AndAndExpression) n) visit(cast(AndAndExpression) n);
else if (cast(AndExpression) n) visit(cast(AndExpression) n);
else if (cast(AsmAddExp) n) visit(cast(AsmAddExp) n);
else if (cast(AsmAndExp) n) visit(cast(AsmAndExp) n);
else if (cast(AsmEqualExp) n) visit(cast(AsmEqualExp) n);
else if (cast(AsmLogAndExp) n) visit(cast(AsmLogAndExp) n);
else if (cast(AsmLogOrExp) n) visit(cast(AsmLogOrExp) n);
else if (cast(AsmMulExp) n) visit(cast(AsmMulExp) n);
else if (cast(AsmOrExp) n) visit(cast(AsmOrExp) n);
else if (cast(AsmRelExp) n) visit(cast(AsmRelExp) n);
else if (cast(AsmShiftExp) n) visit(cast(AsmShiftExp) n);
else if (cast(AssertExpression) n) visit(cast(AssertExpression) n);
else if (cast(AssignExpression) n) visit(cast(AssignExpression) n);
else if (cast(CmpExpression) n) visit(cast(CmpExpression) n);
else if (cast(DeleteExpression) n) visit(cast(DeleteExpression) n);
else if (cast(EqualExpression) n) visit(cast(EqualExpression) n);
else if (cast(Expression) n) visit(cast(Expression) n);
else if (cast(FunctionCallExpression) n) visit(cast(FunctionCallExpression) n);
else if (cast(FunctionLiteralExpression) n) visit(cast(FunctionLiteralExpression) n);
else if (cast(IdentityExpression) n) visit(cast(IdentityExpression) n);
else if (cast(ImportExpression) n) visit(cast(ImportExpression) n);
else if (cast(IndexExpression) n) visit(cast(IndexExpression) n);
else if (cast(InExpression) n) visit(cast(InExpression) n);
else if (cast(IsExpression) n) visit(cast(IsExpression) n);
else if (cast(LambdaExpression) n) visit(cast(LambdaExpression) n);
else if (cast(MixinExpression) n) visit(cast(MixinExpression) n);
else if (cast(MulExpression) n) visit(cast(MulExpression) n);
else if (cast(NewAnonClassExpression) n) visit(cast(NewAnonClassExpression) n);
else if (cast(NewExpression) n) visit(cast(NewExpression) n);
else if (cast(OrExpression) n) visit(cast(OrExpression) n);
else if (cast(OrOrExpression) n) visit(cast(OrOrExpression) n);
else if (cast(PostIncDecExpression) n) visit(cast(PostIncDecExpression) n);
else if (cast(PowExpression) n) visit(cast(PowExpression) n);
else if (cast(PragmaExpression) n) visit(cast(PragmaExpression) n);
else if (cast(PreIncDecExpression) n) visit(cast(PreIncDecExpression) n);
else if (cast(PrimaryExpression) n) visit(cast(PrimaryExpression) n);
else if (cast(RelExpression) n) visit(cast(RelExpression) n);
else if (cast(ShiftExpression) n) visit(cast(ShiftExpression) n);
else if (cast(SliceExpression) n) visit(cast(SliceExpression) n);
else if (cast(TemplateMixinExpression) n) visit(cast(TemplateMixinExpression) n);
else if (cast(TernaryExpression) n) visit(cast(TernaryExpression) n);
else if (cast(TraitsExpression) n) visit(cast(TraitsExpression) n);
else if (cast(TypeidExpression) n) visit(cast(TypeidExpression) n);
else if (cast(TypeofExpression) n) visit(cast(TypeofExpression) n);
else if (cast(UnaryExpression) n) visit(cast(UnaryExpression) n);
else if (cast(XorExpression) n) visit(cast(XorExpression) n);
}
/** */ void visit(AddExpression addExpression) { addExpression.accept(this); }
/** */ void visit(AliasDeclaration aliasDeclaration) { aliasDeclaration.accept(this); }
/** */ void visit(AliasInitializer aliasInitializer) { aliasInitializer.accept(this); }
@ -104,7 +156,6 @@ public:
/** */ void visit(EponymousTemplateDeclaration eponymousTemplateDeclaration) { eponymousTemplateDeclaration.accept(this); }
/** */ void visit(EqualExpression equalExpression) { equalExpression.accept(this); }
/** */ void visit(Expression expression) { expression.accept(this); }
/** */ void visit(ExpressionNode expressionNode) { expressionNode.accept(this); }
/** */ void visit(ExpressionStatement expressionStatement) { expressionStatement.accept(this); }
/** */ void visit(FinalSwitchStatement finalSwitchStatement) { finalSwitchStatement.accept(this); }
/** */ void visit(Finally finally_) { finally_.accept(this); }
@ -234,10 +285,11 @@ public:
/// Base interface implemented by every node of the syntax tree.
interface ASTNode
{
public:
/** Double-dispatch entry point: implementations pass this node's children to the visitor. */ void accept(ASTVisitor visitor);
}
immutable string DEFAULT_ACCEPT = q{void accept(ASTVisitor visitor) {}};
immutable string DEFAULT_ACCEPT = q{override void accept(ASTVisitor visitor) {}};
template visitIfNotNull(fields ...)
{
@ -259,19 +311,28 @@ template visitIfNotNull(fields ...)
}
}
abstract class ExpressionNode : ASTNode {}
/**
 * Common base class of all expression nodes.
 * accept() has a deliberately failing body: ExpressionNode itself is
 * never visited directly — every concrete subclass overrides accept —
 * so reaching this implementation indicates a missing override.
 */
abstract class ExpressionNode : ASTNode
{
public:
override void accept(ASTVisitor visitor)
{
assert (false);
}
}
/**
 * Mixed into each binary expression class to supply its operand and
 * source-location fields.
 */
mixin template BinaryExpressionBody()
{
// Left-hand operand.
ExpressionNode left;
// Right-hand operand.
ExpressionNode right;
// Line of the operator token (set from current.line in the parser).
size_t line;
// Column of the operator token (set from current.column in the parser).
size_t column;
}
///
class AddExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}
@ -283,7 +344,7 @@ public:
class AliasDeclaration : ASTNode
{
public:
void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(type, name, initializers));
}
@ -332,7 +393,7 @@ public:
class AndAndExpression : ExpressionNode
{
public:
void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}
@ -343,7 +404,7 @@ public:
class AndExpression : ExpressionNode
{
public:
void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}
@ -566,7 +627,7 @@ public:
class AssertExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(assertion, message));
}
@ -578,7 +639,7 @@ public:
class AssignExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(ternaryExpression, assignExpression));
}
@ -816,7 +877,7 @@ public:
class CmpExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(shiftExpression, equalExpression,
identityExpression, relExpression, inExpression));
@ -1031,11 +1092,13 @@ public:
class DeleteExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(unaryExpression));
}
/** */ UnaryExpression unaryExpression;
/** */ size_t line;
/** */ size_t column;
}
///
@ -1151,7 +1214,7 @@ public:
class EqualExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}
@ -1163,7 +1226,7 @@ public:
class Expression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(items));
}
@ -1293,7 +1356,7 @@ public:
class FunctionCallExpression : ExpressionNode
{
public:
void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(unaryExpression, arguments, templateArguments));
}
@ -1306,7 +1369,7 @@ public:
class FunctionCallStatement : ASTNode
{
public:
void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(functionCallExpression));
}
@ -1338,7 +1401,7 @@ public:
class FunctionLiteralExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(type, parameters, functionAttributes,
functionBody));
@ -1413,7 +1476,7 @@ public:
class IdentityExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}
@ -1478,7 +1541,7 @@ public:
class ImportExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(assignExpression));
}
@ -1489,7 +1552,7 @@ public:
class IndexExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(unaryExpression, argumentList));
}
@ -1501,7 +1564,7 @@ public:
class InExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}
@ -1575,7 +1638,7 @@ public:
class IsExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(type, identifier, typeSpecialization,
templateParameterList));
@ -1626,7 +1689,7 @@ public:
class LambdaExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(identifier, parameters, functionAttributes,
assignExpression));
@ -1689,7 +1752,7 @@ public:
class MixinExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(assignExpression));
}
@ -1748,7 +1811,7 @@ public:
class MulExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}
@ -1760,7 +1823,7 @@ public:
class NewAnonClassExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(allocatorArguments, constructorArguments,
baseClassList, structBody));
@ -1775,7 +1838,7 @@ public:
class NewExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(newAnonClassExpression, type, arguments,
assignExpression));
@ -1863,7 +1926,7 @@ public:
class OrExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}
@ -1874,7 +1937,7 @@ public:
class OrOrExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}
@ -1937,7 +2000,7 @@ public:
class PostIncDecExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(unaryExpression));
}
@ -1949,7 +2012,7 @@ public:
class PowExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}
@ -1971,7 +2034,7 @@ public:
class PragmaExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(identifier, argumentList));
}
@ -1983,7 +2046,7 @@ public:
class PreIncDecExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(unaryExpression));
}
@ -1995,7 +2058,7 @@ public:
class PrimaryExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(basicType, primary, typeofExpression,
typeidExpression, arrayLiteral, assocArrayLiteral, expression,
@ -2035,7 +2098,7 @@ public:
class RelExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}
@ -2096,7 +2159,7 @@ public:
class ShiftExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}
@ -2120,7 +2183,7 @@ public:
class SliceExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(unaryExpression, lower, upper));
}
@ -2409,7 +2472,7 @@ public:
class TemplateMixinExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(identifier, templateArguments, mixinTemplateName));
}
@ -2534,7 +2597,7 @@ public:
class TernaryExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(orOrExpression, expression, ternaryExpression));
}
@ -2558,7 +2621,7 @@ public:
class TraitsExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(identifier, templateArgumentList));
}
@ -2647,7 +2710,7 @@ public:
class TypeidExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(type, expression));
}
@ -2659,7 +2722,7 @@ public:
class TypeofExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(expression, return_));
}
@ -2671,7 +2734,7 @@ public:
class UnaryExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
// TODO prefix, postfix, unary
mixin (visitIfNotNull!(primaryExpression, newExpression,
@ -2803,7 +2866,7 @@ public:
class XorExpression : ExpressionNode
{
public:
/+override+/ void accept(ASTVisitor visitor)
override void accept(ASTVisitor visitor)
{
mixin (visitIfNotNull!(left, right));
}

View File

@ -1874,6 +1874,8 @@ class ClassFour(A, B) if (someTest()) : Super {}}c;
{
mixin(traceEnterAndExit!(__FUNCTION__));
auto node = new DeleteExpression;
node.line = current.line;
node.column = current.column;
if (expect(tok!"delete") is null) return null;
node.unaryExpression = parseUnaryExpression();
return node;
@ -3990,6 +3992,7 @@ q{(int a, ...)
*/
PragmaDeclaration parsePragmaDeclaration()
{
mixin (traceEnterAndExit!(__FUNCTION__));
auto node = new PragmaDeclaration;
node.pragmaExpression = parsePragmaExpression();
expect(tok!";");
@ -4005,6 +4008,7 @@ q{(int a, ...)
*/
PragmaExpression parsePragmaExpression()
{
mixin (traceEnterAndExit!(__FUNCTION__));
auto node = new PragmaExpression;
expect(tok!"pragma");
expect(tok!"(");
@ -4264,8 +4268,9 @@ q{(int a, ...)
* | $(LITERAL '!<=')
* ;)
*/
ExpressionNode parseRelExpression(ExpressionNode shift = null)
ExpressionNode parseRelExpression(ExpressionNode shift)
{
mixin (traceEnterAndExit!(__FUNCTION__));
return parseLeftAssocBinaryExpression!(RelExpression, ShiftExpression,
tok!"<", tok!"<=", tok!">", tok!">=", tok!"!<>=", tok!"!<>",
tok!"<>", tok!"<>=", tok!"!>", tok!"!>=", tok!"!>=", tok!"!<",
@ -6238,7 +6243,11 @@ protected:
{
auto n = new ExpressionType;
static if (__traits(hasMember, ExpressionType, "operator"))
{
n.line = current.line;
n.column = current.column;
n.operator = advance().type;
}
else
advance();
n.left = node;

View File

@ -1,8 +1,99 @@
// Written in the D programming language
/**
* $(H2 Summary)
* This module contains a range-based _lexer generator.
*
* $(H2 Overview)
* The _lexer generator consists of a template mixin, $(LREF Lexer), along with
* several helper templates for generating such things as token identifiers.
*
* To write a _lexer using this API:
* $(OL
* $(LI Create the string array constants for your language.
* $(UL
* $(LI $(LINK2 #.StringConstants, String Constants))
* ))
* $(LI Create aliases for the various token and token identifier types
* specific to your language.
* $(UL
* $(LI $(LREF TokenIdType))
* $(LI $(LREF tokenStringRepresentation))
* $(LI $(LREF TokenStructure))
* $(LI $(LREF TokenId))
* ))
* $(LI Create a struct that mixes in the Lexer template mixin and
* implements the necessary functions.
* $(UL
* $(LI $(LREF Lexer))
* ))
* )
* Examples:
* $(UL
* $(LI A _lexer for D is available $(LINK2 https://github.com/Hackerpilot/Dscanner/blob/master/stdx/d/lexer.d, here).)
* $(LI A _lexer for Lua is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/lualexer.d, here).)
* )
* $(DDOC_ANCHOR StringConstants) $(H2 String Constants)
* $(DL
* $(DT $(B staticTokens))
* $(DD A listing of the tokens whose exact value never changes and which cannot
* possibly be a token handled by the default token lexing function. The
* most common example of this kind of token is an operator such as
* $(D_STRING "*"), or $(D_STRING "-") in a programming language.)
* $(DT $(B dynamicTokens))
* $(DD A listing of tokens whose value is variable, such as whitespace,
* identifiers, number literals, and string literals.)
* $(DT $(B possibleDefaultTokens))
* $(DD A listing of tokens that could possibly be one of the tokens handled by
* the default token handling function. A common example of this is
* a keyword such as $(D_STRING "for"), which looks like the beginning of
* the identifier $(D_STRING "fortunate"). isSeparating is called to
* determine if the character after the $(D_STRING 'r') separates the
* identifier, indicating that the token is $(D_STRING "for"), or if lexing
* should be turned over to the defaultTokenFunction.)
* $(DT $(B tokenHandlers))
* $(DD A mapping of prefixes to custom token handling function names. The
* generated _lexer will search for the even-index elements of this array,
* and then call the function whose name is the element immediately after the
* even-indexed element. This is used for lexing complex tokens whose prefix
* is fixed.)
* )
*
* Here are some example constants for a simple calculator _lexer:
* ---
* // There are a near infinite number of valid number literals, so numbers are
* // dynamic tokens.
* enum string[] dynamicTokens = ["numberLiteral", "whitespace"];
*
* // The operators are always the same, and cannot start a numberLiteral, so
* // they are staticTokens
* enum string[] staticTokens = ["-", "+", "*", "/"];
*
* // In this simple example there are no keywords or other tokens that could
* // look like dynamic tokens, so this is blank.
* enum string[] possibleDefaultTokens = [];
*
* // If any whitespace character or digit is encountered, pass lexing over to
* // our custom handler functions. These will be demonstrated in an example
* // later on.
* enum string[] tokenHandlers = [
* "0", "lexNumber",
* "1", "lexNumber",
* "2", "lexNumber",
* "3", "lexNumber",
* "4", "lexNumber",
* "5", "lexNumber",
* "6", "lexNumber",
* "7", "lexNumber",
* "8", "lexNumber",
* "9", "lexNumber",
* " ", "lexWhitespace",
* "\n", "lexWhitespace",
* "\t", "lexWhitespace",
* "\r", "lexWhitespace"
* ];
* ---
*
* Copyright: Brian Schott 2013
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0)
* Authors: Brian Schott, with ideas shamelessly stolen from Andrei Alexandrescu
@ -16,7 +107,12 @@ module stdx.lexer;
* unsigned integral type that is able to hold the value
* staticTokens.length + dynamicTokens.length. For example if there are 20
* static tokens, 30 dynamic tokens, and 10 possible default tokens, this
* template will alias itself to ubyte, as 20 + 30 + 10 < ubyte.max.
* template will alias itself to ubyte, as 20 + 30 + 10 < $(D_KEYWORD ubyte).max.
* Examples:
* ---
* // In our calculator example this means that IdType is an alias for ubyte.
* alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
* ---
*/
template TokenIdType(alias staticTokens, alias dynamicTokens,
alias possibleDefaultTokens)
@ -32,7 +128,15 @@ template TokenIdType(alias staticTokens, alias dynamicTokens,
}
/**
* Looks up the string representation of the given token type.
* Looks up the string representation of the given token type. This is the
* opposite of the function of the TokenId template.
* Params: type = the token type identifier
* Examples:
* ---
* alias str = tokenStringRepresentation(IdType, staticTokens, dynamicTokens, possibleDefaultTokens);
* assert (str(tok!"*") == "*");
* ---
* See_also: $(LREF TokenId)
*/
string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens, alias possibleDefaultTokens)(IdType type) @property
{
@ -57,18 +161,18 @@ string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens
* valid token type identifier)
* )
* In all cases this template will alias itself to a constant of type IdType.
* This template will fail at compile time if $(D_PARAM symbol) is not one of
* the staticTokens, dynamicTokens, or possibleDefaultTokens.
* Examples:
* ---
* enum string[] staticTokens = ["+", "-", "*", "/"];
* enum string[] dynamicTokens = ["number"];
* enum string[] possibleDefaultTokens = [];
* alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
* template tok(string symbol)
* {
* alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
* possibleDefaultTokens, symbol);
* }
* // num and plus are of type ubyte.
* IdType plus = tok!"+";
* IdType num = tok!"numberLiteral";
* ---
*/
template TokenId(IdType, alias staticTokens, alias dynamicTokens,
@ -118,35 +222,49 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
/**
* The token that is returned by the lexer.
* Params:
* IDType = The D type of the "type" token type field.
* IdType = The D type of the "type" token type field.
* extraFields = A string containing D code for any extra fields that should
* be included in the token structure body. This string is passed
* directly to a mixin statement.
* Examples:
* ---
* // No extra struct fields are desired in this example, so leave it blank.
* alias Token = TokenStructure!(IdType, "");
* Token minusToken = Token(tok!"-");
* ---
*/
struct TokenStructure(IDType, string extraFields = "")
struct TokenStructure(IdType, string extraFields = "")
{
public:
/**
* == overload for the token type.
*/
bool opEquals(IDType type) const pure nothrow @safe
bool opEquals(IdType type) const pure nothrow @safe
{
return this.type == type;
}
/**
*
* Constructs a token from a token type.
* Params: type = the token type
*/
this(IDType type)
this(IdType type)
{
this.type = type;
}
/**
*
* Constructs a token.
* Params:
* type = the token type
* text = the text of the token, which may be null
* line = the line number at which this token occurs
* column = the column number at which this token occurs
* index = the byte offset from the beginning of the input at which this
* token occurs
*/
this(IDType type, string text, size_t line, size_t column, size_t index)
this(IdType type, string text, size_t line, size_t column, size_t index)
{
this.text = text;
this.line = line;
@ -156,39 +274,105 @@ public:
}
/**
*
* The _text of the token.
*/
string text;
/**
*
* The line number at which this token occurs.
*/
size_t line;
/**
*
* The column number at which this token occurs.
*/
size_t column;
/**
*
* The byte offset from the beginning of the input at which this token
* occurs.
*/
size_t index;
/**
*
* The token type.
*/
IDType type;
IdType type;
mixin (extraFields);
}
/**
* The implementation of the _lexer is contained within this mixin template.
* To use it, this template should be mixed in to a struct that represents the
* _lexer for your language. This struct should implement the following methods:
* $(UL
* $(LI popFront, which should call this mixin's _popFront() and
* additionally perform any token filtering or shuffling you deem
* necessary. For example, you can implement popFront to skip comment
* tokens.)
* $(LI A function that serves as the default token lexing function. For
* most languages this will be the identifier lexing function.)
* $(LI A function that is able to determine if an identifier/keyword has
* come to an end. This function must return $(D_KEYWORD bool) and take
* a single $(D_KEYWORD size_t) argument representing the number of
* bytes to skip over before looking for a separating character.)
* $(LI Any functions referred to in the tokenHandlers template parameter.
* These functions must be marked $(D_KEYWORD pure nothrow), take no
* arguments, and return a token)
* $(LI A constructor that initializes the range field as well as calls
* popFront() exactly once (to initialize the _front field).)
* )
* Examples:
* ---
* struct CalculatorLexer
* {
* mixin Lexer!(IdType, Token, defaultTokenFunction, isSeparating,
* staticTokens, dynamicTokens, tokenHandlers, possibleDefaultTokens);
*
* this (ubyte[] bytes)
* {
* this.range = LexerRange(bytes);
* popFront();
* }
*
* void popFront() pure
* {
* _popFront();
* }
*
* Token lexNumber() pure nothrow @safe
* {
* ...
* }
*
* Token lexWhitespace() pure nothrow @safe
* {
* ...
* }
*
* Token defaultTokenFunction() pure nothrow @safe
* {
* // There is no default token in the example calculator language, so
* // this is always an error.
* range.popFront();
* return Token(tok!"");
* }
*
* bool isSeparating(size_t offset) pure nothrow @safe
* {
* // For this example language, always return true.
* return true;
* }
* }
* ---
*/
mixin template Lexer(IDType, Token, alias defaultTokenFunction,
alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens,
alias pseudoTokenHandlers, alias possibleDefaultTokens)
alias tokenHandlers, alias possibleDefaultTokens)
{
static assert (pseudoTokenHandlers.length % 2 == 0, "Each pseudo-token must"
static assert (tokenHandlers.length % 2 == 0, "Each pseudo-token must"
~ " have a corresponding handler function name.");
static string generateMask(const ubyte[] arr)
@ -214,7 +398,7 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
import std.string;
import std.range;
string[] pseudoTokens = stupidToArray(pseudoTokenHandlers.stride(2));
string[] pseudoTokens = stupidToArray(tokenHandlers.stride(2));
string[] allTokens = stupidToArray(sort(staticTokens ~ possibleDefaultTokens ~ pseudoTokens).uniq);
string code;
for (size_t i = 0; i < allTokens.length; i++)
@ -240,7 +424,7 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
if (pseudoTokens.countUntil(tokens[0]) >= 0)
{
return " return "
~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(tokens[0]) + 1]
~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1]
~ "();\n";
}
else if (staticTokens.countUntil(tokens[0]) >= 0)
@ -251,7 +435,7 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
else if (pseudoTokens.countUntil(tokens[0]) >= 0)
{
return " return "
~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(tokens[0]) + 1]
~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1]
~ "();\n";
}
}
@ -271,14 +455,14 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
if (token.length <= 8)
{
code ~= " return "
~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(token) + 1]
~ tokenHandlers[tokenHandlers.countUntil(token) + 1]
~ "();\n";
}
else
{
code ~= " if (range.peek(" ~ text(token.length - 1) ~ ") == \"" ~ escape(token) ~"\")\n";
code ~= " return "
~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(token) + 1]
~ tokenHandlers[tokenHandlers.countUntil(token) + 1]
~ "();\n";
}
}
@ -325,16 +509,23 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
return code;
}
/**
* Implements the range primitive front().
*/
ref const(Token) front() pure nothrow const @property
{
return _front;
}
void _popFront() pure
{
_front = advance();
}
/**
* Implements the range primitive empty().
*/
bool empty() pure const nothrow @property
{
return _front.type == tok!"\0";
@ -359,9 +550,7 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
return retVal;
}
/**
* This only exists because the real array() can't be called at compile-time
*/
// This only exists because the real array() can't be called at compile-time
static string[] stupidToArray(R)(R range)
{
string[] retVal;
@ -397,13 +586,30 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
}
}
/**
* The lexer input.
*/
LexerRange range;
/**
* The token that is currently at the front of the range.
*/
Token _front;
}
/**
* Range structure that wraps the _lexer's input.
*/
struct LexerRange
{
/**
* Params:
* bytes = the _lexer input
* index = the initial offset from the beginning of $(D_PARAM bytes)
* column = the initial column number
* line = the initial line number
*/
this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1) pure nothrow @safe
{
this.bytes = bytes;
@ -412,31 +618,52 @@ struct LexerRange
this.line = line;
}
/**
* Returns: a mark at the current position that can then be used with slice.
*/
size_t mark() const nothrow pure @safe
{
return index;
}
/**
* Sets the range to the given position
* Params: m = the position to seek to
*/
void seek(size_t m) nothrow pure @safe
{
index = m;
}
/**
* Returns a slice of the input byte array between the given mark and the
* current position.
* Params: m = the beginning index of the slice to return
*/
const(ubyte)[] slice(size_t m) const nothrow pure @safe
{
return bytes[m .. index];
}
/**
 * Implements the range primitive _empty.
 * Returns: true when the current position has reached the end of the input.
 */
bool empty() const nothrow pure @safe
{
return index >= bytes.length;
}
/**
 * Implements the range primitive _front.
 * Returns: the byte at the current position. Not checked against empty();
 * calling this on an empty range relies on the array's own bounds check.
 */
ubyte front() const nothrow pure @safe
{
return bytes[index];
}
/**
* Returns: the current item as well as the items $(D_PARAM p) items ahead.
*/
const(ubyte)[] peek(size_t p) const nothrow pure @safe
{
return index + p + 1 > bytes.length
@ -444,48 +671,79 @@ struct LexerRange
: bytes[index .. index + p + 1];
}
/**
 * Returns: the byte at the given offset ahead of the current position,
 * without advancing the range. Use canPeek() first to check bounds.
 */
ubyte peekAt(size_t offset) const nothrow pure @safe
{
return bytes[index + offset];
}
/**
 * Returns: true if it is possible to peek $(D_PARAM p) bytes ahead, i.e.
 * index + p is still a valid position within the input.
 */
bool canPeek(size_t p) const nothrow pure @safe
{
return index + p < bytes.length;
}
/**
 * Implements the range primitive _popFront.
 * Advances the position and column by one. Does not detect newlines;
 * callers must invoke incrementLine() when a line break is consumed.
 */
void popFront() pure nothrow @safe
{
index++;
column++;
}
/**
 * Implements the algorithm _popFrontN more efficiently: advances position
 * and column by n in O(1) instead of popping one byte at a time. As with
 * popFront(), line breaks within the skipped bytes are not detected.
 */
void popFrontN(size_t n) pure nothrow @safe
{
index += n;
column += n;
}
/**
 * Increments the range's line number and resets the column counter to 1
 * (columns are 1-based, matching the constructor's default).
 */
void incrementLine() pure nothrow @safe
{
column = 1;
line++;
}
/**
* The input _bytes.
*/
const(ubyte)[] bytes;
/**
* The range's current position.
*/
size_t index;
/**
* The current _column number.
*/
size_t column;
/**
* The current _line number.
*/
size_t line;
}
/**
* The string cache should be used within lexer implementations for several
* reasons:
* $(UL
* $(LI Reducing memory consumption.)
* $(LI Increasing performance in token comparisons)
* $(LI Correctly creating immutable token text if the lexing source is not
* immutable)
* )
* The string cache implements a map/set for strings. Placing a string in the
* cache returns an identifier that can be used to instantly access the stored
* string. It is then possible to simply compare these indexes instead of
* performing full string comparisons when comparing the string content of
* dynamic tokens. The string cache also handles its own memory, so that mutable
* ubyte[] to lexers can still have immutable string fields in their tokens.
* Because the string cache also performs de-duplication it is possible to
* drastically reduce the memory usage of a lexer.
*/
struct StringCache
{
@ -493,7 +751,10 @@ public:
@disable this();
this(size_t bucketCount = defaultBucketCount)
/**
* Params: bucketCount = the initial number of buckets.
*/
this(size_t bucketCount)
{
buckets = new Item*[bucketCount];
}
@ -512,6 +773,9 @@ public:
return get(cache(bytes));
}
/**
* Equivalent to calling cache() and get().
*/
string cacheGet(const(ubyte[]) bytes, uint hash) pure nothrow @safe
{
return get(cache(bytes, hash));
@ -536,6 +800,11 @@ public:
return cache(bytes, hash);
}
/**
 * Caches a string as above, but uses the given hash code instead of
 * calculating one itself. Using this alongside hashStep() can reduce the
 * amount of work necessary when lexing dynamic tokens.
 */
size_t cache(const(ubyte)[] bytes, uint hash) pure nothrow @safe
in
{
@ -583,11 +852,21 @@ public:
writeln("rehashes: ", rehashCount);
}
/**
 * Incremental hashing: folds one more byte into a running hash, so that a
 * string's hash can be built up while lexing instead of in a second pass.
 * Params:
 * b = the byte to add to the hash
 * h = the hash that has been calculated so far
 * Returns: the new hash code for the string.
 */
static uint hashStep(ubyte b, uint h) pure nothrow @safe
{
return (h ^ sbox[b]) * 3;
}
/**
* The default bucket count for the string cache.
*/
static enum defaultBucketCount = 2048;
private:

483
stdx/lexer.html Normal file
View File

@ -0,0 +1,483 @@
<h1>stdx.lexer</h1> <!-- Generated by Ddoc from lexer.d -->
This module contains a range-based lexer generator.
<p></p>
The lexer generator consists of a template mixin, Lexer, along with several
helper templates for generating such things as token identifiers.
<p></p>
To generate a lexer using this API, several constants must be supplied:
<dl><dt>staticTokens</dt>
<dd>A listing of the tokens whose exact value never changes and which cannot
possibly be a token handled by the default token lexing function. The
most common example of this kind of token is an operator such as "*", or
"-" in a programming language.</dd>
<dt>dynamicTokens</dt>
<dd>A listing of tokens whose value is variable, such as whitespace,
identifiers, number literals, and string literals.</dd>
<dt>possibleDefaultTokens</dt>
<dd>A listing of tokens that could possibly be one of the tokens handled by
the default token handling function. A common example of this is
a keyword such as <span class="d_string">"for"</span>, which looks like the beginning of
the identifier <span class="d_string">"fortunate"</span>. isSeparating is called to
determine if the character after the <span class="d_string">'r'</span> separates the
identifier, indicating that the token is <span class="d_string">"for"</span>, or if lexing
should be turned over to the defaultTokenFunction.</dd>
<dt>tokenHandlers</dt>
<dd>A mapping of prefixes to custom token handling function names. The
generated lexer will search for the even-index elements of this array,
and then call the function whose name is the element immediately after the
even-indexed element. This is used for lexing complex tokens whose prefix
is fixed.</dd>
</dl>
<p></p>
Here are some example constants for a simple calculator lexer:
<pre class="d_code"><span class="d_comment">// There are a near infinite number of valid number literals, so numbers are
</span><span class="d_comment">// dynamic tokens.
</span><span class="d_keyword">enum</span> string[] dynamicTokens = [<span class="d_string">"numberLiteral"</span>, <span class="d_string">"whitespace"</span>];
<span class="d_comment">// The operators are always the same, and cannot start a numberLiteral, so
</span><span class="d_comment">// they are staticTokens
</span><span class="d_keyword">enum</span> string[] staticTokens = [<span class="d_string">"-"</span>, <span class="d_string">"+"</span>, <span class="d_string">"*"</span>, <span class="d_string">"/"</span>];
<span class="d_comment">// In this simple example there are no keywords or other tokens that could
</span><span class="d_comment">// look like dynamic tokens, so this is blank.
</span><span class="d_keyword">enum</span> string[] possibleDefaultTokens = [];
<span class="d_comment">// If any whitespace character or digit is encountered, pass lexing over to
</span><span class="d_comment">// our custom handler functions. These will be demonstrated in an example
</span><span class="d_comment">// later on.
</span><span class="d_keyword">enum</span> string[] tokenHandlers = [
<span class="d_string">"0"</span>, <span class="d_string">"lexNumber"</span>,
<span class="d_string">"1"</span>, <span class="d_string">"lexNumber"</span>,
<span class="d_string">"2"</span>, <span class="d_string">"lexNumber"</span>,
<span class="d_string">"3"</span>, <span class="d_string">"lexNumber"</span>,
<span class="d_string">"4"</span>, <span class="d_string">"lexNumber"</span>,
<span class="d_string">"5"</span>, <span class="d_string">"lexNumber"</span>,
<span class="d_string">"6"</span>, <span class="d_string">"lexNumber"</span>,
<span class="d_string">"7"</span>, <span class="d_string">"lexNumber"</span>,
<span class="d_string">"8"</span>, <span class="d_string">"lexNumber"</span>,
<span class="d_string">"9"</span>, <span class="d_string">"lexNumber"</span>,
<span class="d_string">" "</span>, <span class="d_string">"lexWhitespace"</span>,
<span class="d_string">"\n"</span>, <span class="d_string">"lexWhitespace"</span>,
<span class="d_string">"\t"</span>, <span class="d_string">"lexWhitespace"</span>,
<span class="d_string">"\r"</span>, <span class="d_string">"lexWhitespace"</span>
];
</pre>
<p></p>
<b>Examples:</b><br><ul><li>A lexer for D is available <a href="https://github.com/Hackerpilot/Dscanner/blob/master/stdx/d/lexer.d">here</a>.</li>
<li>A lexer for Lua is available <a href="https://github.com/Hackerpilot/lexer-demo/blob/master/lualexer.d">here</a>.</li>
</ul>
<p></p>
<b>License:</b><br><a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>
<p></p>
<b>Authors:</b><br>Brian Schott, with ideas shamelessly stolen from Andrei Alexandrescu
<p></p>
<b>Source:</b><br>
<a href="https://github.com/D-Programming-Language/phobos/blob/master/std/lexer.d">std/lexer.d</a><p></p>
<dl><dt class="d_decl"><a name=".TokenIdType"></a>template <a name="TokenIdType"></a><span class="ddoc_psymbol">TokenIdType</span>(alias staticTokens, alias dynamicTokens, alias possibleDefaultTokens)</dt>
<dd>Template for determining the type used for a token type. Selects the smallest
unsigned integral type that is able to hold the value
staticTokens.length + dynamicTokens.length. For example if there are 20
static tokens, 30 dynamic tokens, and 10 possible default tokens, this
template will alias itself to ubyte, as 20 + 30 + 10 &lt; <span class="d_keyword">ubyte</span>.max.
<p></p>
<b>Examples:</b><br><pre class="d_code"><span class="d_comment">// In our calculator example this means that IdType is an alias for ubyte.
</span><span class="d_keyword">alias</span> IdType = <span class="d_psymbol">TokenIdType</span>!(staticTokens, dynamicTokens, possibleDefaultTokens);
</pre>
<p></p>
</dd>
<dt class="d_decl"><a name=".tokenStringRepresentation"></a>@property string <a name="tokenStringRepresentation"></a><span class="ddoc_psymbol">tokenStringRepresentation</span>(IdType, alias staticTokens, alias dynamicTokens, alias possibleDefaultTokens)(IdType <i>type</i>);
</dt>
<dd>Looks up the string representation of the given token type. This is the
opposite of the function of the TokenId template.
<p></p>
<b>Parameters:</b><table class=parms><tr><td valign=top>IdType type</td>
<td valign=top>the token type identifier</td></tr>
</table><p></p>
<b>Examples:</b><br><pre class="d_code"><span class="d_keyword">alias</span> str = <span class="d_psymbol">tokenStringRepresentation</span>(IdType, staticTokens, dynamicTokens, possibleDefaultTokens);
<span class="d_keyword">assert</span> (str(tok!<span class="d_string">"*"</span>) == <span class="d_string">"*"</span>);
</pre>
<p></p>
<b>See Also:</b><br>TokenId<p></p>
</dd>
<dt class="d_decl"><a name=".TokenId"></a>template <a name="TokenId"></a><span class="ddoc_psymbol">TokenId</span>(IdType, alias staticTokens, alias dynamicTokens, alias possibleDefaultTokens, string symbol)</dt>
<dd>Generates the token type identifier for the given symbol. There are two
special cases:
<ul> <li>If symbol is "", then the token identifier will be 0</li>
<li>If symbol is "\0", then the token identifier will be the maximum
valid token type identifier</li>
</ul>
In all cases this template will alias itself to a constant of type IdType.
This template will fail at compile time if <span class="d_param">symbol</span> is not one of
the staticTokens, dynamicTokens, or possibleDefaultTokens.
<p></p>
<b>Examples:</b><br><pre class="d_code"><span class="d_keyword">template</span> tok(string symbol)
{
<span class="d_keyword">alias</span> tok = <span class="d_psymbol">TokenId</span>!(IdType, staticTokens, dynamicTokens,
possibleDefaultTokens, symbol);
}
<span class="d_comment">// num and plus are of type ubyte.
</span>IdType plus = tok!<span class="d_string">"+"</span>;
IdType num = tok!<span class="d_string">"numberLiteral"</span>;
</pre>
<p></p>
</dd>
<dt class="d_decl"><a name=".TokenStructure"></a>struct <a name="TokenStructure"></a><span class="ddoc_psymbol">TokenStructure</span>(IdType, string extraFields = "");
</dt>
<dd>The token that is returned by the lexer.
<p></p>
<b>Parameters:</b><table class=parms><tr><td valign=top>IdType</td>
<td valign=top>The D type of the "type" token type field.</td></tr>
<tr><td valign=top>extraFields</td>
<td valign=top>A string containing D code for any extra fields that should
be included in the token structure body. This string is passed
directly to a mixin statement.</td></tr>
</table><p></p>
<b>Examples:</b><br><pre class="d_code"><span class="d_comment">// No extra struct fields are desired in this example, so leave it blank.
</span><span class="d_keyword">alias</span> Token = <span class="d_psymbol">TokenStructure</span>!(IdType, <span class="d_string">""</span>);
Token minusToken = Token(tok!<span class="d_string">"-"</span>);
</pre>
<p></p>
<dl><dt class="d_decl"><a name=".opEquals"></a>const pure nothrow @safe bool <a name="opEquals"></a><span class="ddoc_psymbol">opEquals</span>(IdType <i>type</i>);
</dt>
<dd>== overload for the the token <i>type</i>.<p></p>
</dd>
<dt class="d_decl"><a name=".this"></a> this(IdType <i>type</i>);
</dt>
<dd>Constructs a token from a token <i>type</i>.
<p></p>
<b>Parameters:</b><table class=parms><tr><td valign=top>IdType <i>type</i></td>
<td valign=top>the token <i>type</i></td></tr>
</table><p></p>
</dd>
<dt class="d_decl"><a name=".this"></a> this(IdType <i>type</i>, string <i>text</i>, size_t <i>line</i>, size_t <i>column</i>, size_t <i>index</i>);
</dt>
<dd>Constructs a token.
<p></p>
<b>Parameters:</b><table class=parms><tr><td valign=top>IdType <i>type</i></td>
<td valign=top>the token <i>type</i></td></tr>
<tr><td valign=top>string <i>text</i></td>
<td valign=top>the <i>text</i> of the token, which may be <b>null</b></td></tr>
<tr><td valign=top>size_t <i>line</i></td>
<td valign=top>the <i>line</i> number at which this token occurs</td></tr>
<tr><td valign=top>size_t <i>column</i></td>
<td valign=top>the <i>column</i> number at which this token occurs</td></tr>
<tr><td valign=top>size_t <i>index</i></td>
<td valign=top>the byte offset from the beginning of the input at which this
token occurs</td></tr>
</table><p></p>
</dd>
<dt class="d_decl"><a name=".text"></a>string <a name="text"></a><span class="ddoc_psymbol">text</span>;
</dt>
<dd>The <a name="text"></a><span class="ddoc_psymbol">text</span> of the token.<p></p>
</dd>
<dt class="d_decl"><a name=".line"></a>size_t <a name="line"></a><span class="ddoc_psymbol">line</span>;
</dt>
<dd>The <a name="line"></a><span class="ddoc_psymbol">line</span> number at which this token occurs.<p></p>
</dd>
<dt class="d_decl"><a name=".column"></a>size_t <a name="column"></a><span class="ddoc_psymbol">column</span>;
</dt>
<dd>The column number at which this token occurs.<p></p>
</dd>
<dt class="d_decl"><a name=".index"></a>size_t <a name="index"></a><span class="ddoc_psymbol">index</span>;
</dt>
<dd>The byte offset from the beginning of the input at which this token
occurs.<p></p>
</dd>
<dt class="d_decl"><a name=".type"></a>IdType <a name="type"></a><span class="ddoc_psymbol">type</span>;
</dt>
<dd>The token <a name="type"></a><span class="ddoc_psymbol">type</span>.<p></p>
</dd>
</dl>
</dd>
<dt class="d_decl"><a name=".Lexer"></a>template <a name="Lexer"></a><span class="ddoc_psymbol">Lexer</span>(IDType, Token, alias defaultTokenFunction, alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens, alias tokenHandlers, alias possibleDefaultTokens)</dt>
<dd>The implementation of the lexer is contained within this mixin template.
To use it, this template should be mixed in to a struct that represents the
lexer for your language. This struct should implement the following methods:
<ul> <li>popFront, which should call this mixin's popFront() and
additionally perform any token filtering or shuffling you deem
necessary. For example, you can implement popFront to skip comment or
whitespace tokens.</li>
<li>A function that serves as the default token lexing function. For
most languages this will be the identifier lexing function.</li>
<li>A function that is able to determine if an identifier/keyword has
come to an end. This function must return <span class="d_keyword">bool</span> and take
a single <span class="d_keyword">size_t</span> argument representing the number of
bytes to skip over before looking for a separating character.</li>
<li>Any functions referred to in the tokenHandlers template parameter.
These functions must be marked <span class="d_keyword">pure nothrow</span>, take no
arguments, and return a token</li>
<li>A constructor that initializes the range field as well as calls
popFront() exactly once (to initialize the front field).</li>
</ul>
<p></p>
<b>Examples:</b><br><pre class="d_code"><span class="d_keyword">struct</span> CalculatorLexer
{
<span class="d_keyword">mixin</span> <span class="d_psymbol">Lexer</span>!(IdType, Token, defaultTokenFunction, isSeparating,
staticTokens, dynamicTokens, tokenHandlers, possibleDefaultTokens);
<span class="d_keyword">this</span> (<span class="d_keyword">ubyte</span>[] bytes)
{
<span class="d_keyword">this</span>.range = LexerRange(bytes);
popFront();
}
<span class="d_keyword">void</span> popFront() <span class="d_keyword">pure</span>
{
_popFront();
}
Token lexNumber() <span class="d_keyword">pure</span> <span class="d_keyword">nothrow</span> @safe
{
...
}
Token lexWhitespace() <span class="d_keyword">pure</span> <span class="d_keyword">nothrow</span> @safe
{
...
}
Token defaultTokenFunction() <span class="d_keyword">pure</span> <span class="d_keyword">nothrow</span> @safe
{
<span class="d_comment">// There is no default token in the example calculator language, so
</span> <span class="d_comment">// this is always an error.
</span> range.popFront();
<span class="d_keyword">return</span> Token(tok!<span class="d_string">""</span>);
}
<span class="d_keyword">bool</span> isSeparating(size_t offset) <span class="d_keyword">pure</span> <span class="d_keyword">nothrow</span> @safe
{
<span class="d_comment">// For this example language, always return true.
</span> <span class="d_keyword">return</span> <span class="d_keyword">true</span>;
}
}
</pre>
<p></p>
<dl><dt class="d_decl"><a name=".front"></a>const pure nothrow @property const(Token) <a name="front"></a><span class="ddoc_psymbol">front</span>();
</dt>
<dd>Implements the range primitive <a name="front"></a><span class="ddoc_psymbol">front</span>().<p></p>
</dd>
<dt class="d_decl"><a name=".empty"></a>const pure nothrow @property bool <a name="empty"></a><span class="ddoc_psymbol">empty</span>();
</dt>
<dd>Implements the range primitive <a name="empty"></a><span class="ddoc_psymbol">empty</span>().<p></p>
</dd>
<dt class="d_decl"><a name=".range"></a>LexerRange <a name="range"></a><span class="ddoc_psymbol">range</span>;
</dt>
<dd>The lexer input.<p></p>
</dd>
<dt class="d_decl"><a name="._front"></a>Token <a name="_front"></a><span class="ddoc_psymbol">_front</span>;
</dt>
<dd>The token that is currently at the front of the range.<p></p>
</dd>
</dl>
</dd>
<dt class="d_decl"><a name=".LexerRange"></a>struct <a name="LexerRange"></a><span class="ddoc_psymbol">LexerRange</span>;
</dt>
<dd>Range structure that wraps the lexer's input.<p></p>
<dl><dt class="d_decl"><a name=".LexerRange.this"></a>pure nothrow @safe this(const(ubyte)[] <i>bytes</i>, size_t <i>index</i> = 0, size_t <i>column</i> = 1, size_t <i>line</i> = 1);
</dt>
<dd><b>Parameters:</b><table class=parms><tr><td valign=top>const(ubyte)[] <i>bytes</i></td>
<td valign=top>the lexer input</td></tr>
<tr><td valign=top>size_t <i>index</i></td>
<td valign=top>the initial offset from the beginning of <span class="d_param"><i>bytes</i></span></td></tr>
<tr><td valign=top>size_t <i>column</i></td>
<td valign=top>the initial <i>column</i> number</td></tr>
<tr><td valign=top>size_t <i>line</i></td>
<td valign=top>the initial <i>line</i> number</td></tr>
</table><p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.mark"></a>const pure nothrow @safe size_t <a name="mark"></a><span class="ddoc_psymbol">mark</span>();
</dt>
<dd><b>Returns:</b><br>a <a name="mark"></a><span class="ddoc_psymbol">mark</span> at the current position that can then be used with slice.<p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.seek"></a>pure nothrow @safe void <a name="seek"></a><span class="ddoc_psymbol">seek</span>(size_t <i>m</i>);
</dt>
<dd>Sets the range to the given position
<p></p>
<b>Parameters:</b><table class=parms><tr><td valign=top>size_t <i>m</i></td>
<td valign=top>the position to <a name="seek"></a><span class="ddoc_psymbol">seek</span> to</td></tr>
</table><p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.slice"></a>const pure nothrow @safe const(ubyte)[] <a name="slice"></a><span class="ddoc_psymbol">slice</span>(size_t <i>m</i>);
</dt>
<dd>Returns a <a name="slice"></a><span class="ddoc_psymbol">slice</span> of the input byte array between the given mark and the
current position.
Params: <i>m</i> = the beginning index of the <a name="slice"></a><span class="ddoc_psymbol">slice</span> to return<p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.empty"></a>const pure nothrow @safe bool <a name="empty"></a><span class="ddoc_psymbol">empty</span>();
</dt>
<dd>Implements the range primitive empty.<p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.front"></a>const pure nothrow @safe ubyte <a name="front"></a><span class="ddoc_psymbol">front</span>();
</dt>
<dd>Implements the range primitive front.<p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.peek"></a>const pure nothrow @safe const(ubyte)[] <a name="peek"></a><span class="ddoc_psymbol">peek</span>(size_t <i>p</i>);
</dt>
<dd><b>Returns:</b><br>the current item as well as the items <span class="d_param"><i>p</i></span> items ahead.<p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.peekAt"></a>const pure nothrow @safe ubyte <a name="peekAt"></a><span class="ddoc_psymbol">peekAt</span>(size_t <i>offset</i>);
</dt>
<dd><p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.canPeek"></a>const pure nothrow @safe bool <a name="canPeek"></a><span class="ddoc_psymbol">canPeek</span>(size_t <i>p</i>);
</dt>
<dd><b>Returns:</b><br><b>true</b> if it is possible to peek <span class="d_param"><i>p</i></span> bytes ahead.<p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.popFront"></a>pure nothrow @safe void <a name="popFront"></a><span class="ddoc_psymbol">popFront</span>();
</dt>
<dd>Implements the range primitive popFront.<p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.popFrontN"></a>pure nothrow @safe void <a name="popFrontN"></a><span class="ddoc_psymbol">popFrontN</span>(size_t <i>n</i>);
</dt>
<dd>Implements the algorithm popFrontN more efficiently.<p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.incrementLine"></a>pure nothrow @safe void <a name="incrementLine"></a><span class="ddoc_psymbol">incrementLine</span>();
</dt>
<dd>Increments the range's line number and resets the column counter.<p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.bytes"></a>const(ubyte)[] <a name="bytes"></a><span class="ddoc_psymbol">bytes</span>;
</dt>
<dd>The input bytes.<p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.index"></a>size_t <a name="index"></a><span class="ddoc_psymbol">index</span>;
</dt>
<dd>The range's current position.<p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.column"></a>size_t <a name="column"></a><span class="ddoc_psymbol">column</span>;
</dt>
<dd>The current column number.<p></p>
</dd>
<dt class="d_decl"><a name=".LexerRange.line"></a>size_t <a name="line"></a><span class="ddoc_psymbol">line</span>;
</dt>
<dd>The current line number.<p></p>
</dd>
</dl>
</dd>
<dt class="d_decl"><a name=".StringCache"></a>struct <a name="StringCache"></a><span class="ddoc_psymbol">StringCache</span>;
</dt>
<dd>The string cache implements a map/set for strings. Placing a string in the
cache returns an identifier that can be used to instantly access the stored
string. It is then possible to simply compare these indexes instead of
performing full string comparisons when comparing the string content of
dynamic tokens. The string cache also handles its own memory, so that mutable
ubyte[] to lexers can still have immutable string fields in their tokens.
Because the string cache also performs de-duplication it is possible to
drastically reduce the memory usage of a lexer.<p></p>
<dl><dt class="d_decl"><a name=".StringCache.this"></a> this(size_t <i>bucketCount</i>);
</dt>
<dd><b>Parameters:</b><table class=parms><tr><td valign=top>size_t <i>bucketCount</i></td>
<td valign=top>the initial number of buckets.</td></tr>
</table><p></p>
</dd>
<dt class="d_decl"><a name=".StringCache.cacheGet"></a>pure nothrow @safe string <a name="cacheGet"></a><span class="ddoc_psymbol">cacheGet</span>(const(ubyte[]) <i>bytes</i>);
</dt>
<dd>Equivalent to calling cache() and get().
<pre class="d_code">StringCache cache;
<span class="d_keyword">ubyte</span>[] str = ['a', 'b', 'c'];
string s = cache.get(cache.cache(str));
<span class="d_keyword">assert</span>(s == <span class="d_string">"abc"</span>);
</pre>
<p></p>
</dd>
<dt class="d_decl"><a name=".StringCache.cacheGet"></a>pure nothrow @safe string <a name="cacheGet"></a><span class="ddoc_psymbol">cacheGet</span>(const(ubyte[]) <i>bytes</i>, uint <i>hash</i>);
</dt>
<dd>Equivalent to calling cache() and get().<p></p>
</dd>
<dt class="d_decl"><a name=".StringCache.cache"></a>pure nothrow @safe size_t <a name="cache"></a><span class="ddoc_psymbol">cache</span>(const(ubyte)[] <i>bytes</i>);
</dt>
<dd>Caches a string.
<p></p>
<b>Parameters:</b><table class=parms><tr><td valign=top>const(ubyte)[] <i>bytes</i></td>
<td valign=top>the string to <a name="cache"></a><span class="ddoc_psymbol">cache</span></td></tr>
</table><p></p>
<b>Returns:</b><br>A key that can be used to retrieve the cached string
<p></p>
<b>Examples:</b><br><pre class="d_code">StringCache <span class="d_psymbol">cache</span>;
<span class="d_keyword">ubyte</span>[] <span class="d_param">bytes</span> = ['a', 'b', 'c'];
size_t first = <span class="d_psymbol">cache</span>.<span class="d_psymbol">cache</span>(<span class="d_param">bytes</span>);
size_t second = <span class="d_psymbol">cache</span>.<span class="d_psymbol">cache</span>(<span class="d_param">bytes</span>);
<span class="d_keyword">assert</span> (first == second);
</pre>
<p></p>
</dd>
<dt class="d_decl"><a name=".StringCache.cache"></a>pure nothrow @safe size_t <a name="cache"></a><span class="ddoc_psymbol">cache</span>(const(ubyte)[] <i>bytes</i>, uint <i>hash</i>);
</dt>
<dd>Caches a string as above, but uses the given hash code instead of
calculating one itself. Using this alongside hashStep() can reduce the
amount of work necessary when lexing dynamic tokens.<p></p>
</dd>
<dt class="d_decl"><a name=".StringCache.get"></a>const pure nothrow @safe string <a name="get"></a><span class="ddoc_psymbol">get</span>(size_t <i>index</i>);
</dt>
<dd>Gets a cached string based on its key.
<p></p>
<b>Parameters:</b><table class=parms><tr><td valign=top>size_t <i>index</i></td>
<td valign=top>the key</td></tr>
</table><p></p>
<b>Returns:</b><br>the cached string<p></p>
</dd>
<dt class="d_decl"><a name=".StringCache.hashStep"></a>static pure nothrow @safe uint <a name="hashStep"></a><span class="ddoc_psymbol">hashStep</span>(ubyte <i>b</i>, uint <i>h</i>);
</dt>
<dd>Incremental hashing.
<p></p>
<b>Parameters:</b><table class=parms><tr><td valign=top>ubyte <i>b</i></td>
<td valign=top>the byte to add to the hash</td></tr>
<tr><td valign=top>uint <i>h</i></td>
<td valign=top>the hash that has been calculated so far</td></tr>
</table><p></p>
<b>Returns:</b><br>the new hash code for the string.<p></p>
</dd>
<dt class="d_decl"><a name=".StringCache.defaultBucketCount"></a>static int <a name="defaultBucketCount"></a><span class="ddoc_psymbol">defaultBucketCount</span>;
</dt>
<dd>The default bucket count for the string cache.<p></p>
</dd>
</dl>
</dd>
</dl>
<table width=100%><tr><td><hr align="left" size="8" width="100%" color="maroon" /></td><td width=5%><a href=#top>[top]</a></td></tr></table>

BIN
stdx/lexer.o Normal file

Binary file not shown.