Merge branch 'range-based-lexer' of https://github.com/Hackerpilot/Dscanner into range-based-lexer

Hackerpilot committed 2013-02-22 22:24:26 +00:00
commit 9b0d3d78d1
4 changed files with 2240 additions and 2432 deletions


@@ -1,232 +0,0 @@
import std.stdio;
import std.algorithm;
string[] opkwds = [
"=", // Assign
"@", // At
"&", // BitAnd
"&=", // BitAndEquals
"|", // BitOr
"|=", // BitOrEquals
"~=", // CatEquals
":", // Colon
",", // Comma
"--", // Decrement
"/", // Div
"/=", // DivEquals
"$", // Dollar
".", // Dot
"==", // Equals
"=>", // GoesTo
">", // Greater
">=", // GreaterEqual
"#", // Hash
"++", // Increment
"{", // LBrace
"[", // LBracket
"<", // Less
"<=", // LessEqual
"<>=", // LessEqualGreater
"<>", // LessOrGreater
"&&", // LogicAnd
"||", // LogicOr
"(", // LParen
"-", // Minus
"-=", // MinusEquals
"%", // Mod
"%=", // ModEquals
"*=", // MulEquals
"!", // Not
"!=", // NotEquals
"!>", // NotGreater
"!>=", // NotGreaterEqual
"!<", // NotLess
"!<=", // NotLessEqual
"!<>", // NotLessEqualGreater
"+", // Plus
"+=", // PlusEquals
"^^", // Pow
"^^=", // PowEquals
"}", // RBrace
"]", // RBracket
")", // RParen
";", // Semicolon
"<<", // ShiftLeft
"<<=", // ShiftLeftEqual
">>", // ShiftRight
">>=", // ShiftRightEqual
"..", // Slice
"*", // Star
"?", // Ternary
"~", // Tilde
"!<>=", // Unordered
">>>", // UnsignedShiftRight
">>>=", // UnsignedShiftRightEqual
"...", // Vararg
"^", // Xor
"^=", // XorEquals
"bool",
"byte",
"cdouble",
"cent",
"cfloat",
"char",
"creal",
"dchar",
"double",
"dstring",
"float",
"function",
"idouble",
"ifloat",
"int",
"ireal",
"long",
"real",
"short",
"string",
"ubyte",
"ucent",
"uint",
"ulong",
"ushort",
"void",
"wchar",
"wstring",
"align",
"deprecated",
"extern",
"pragma",
"export",
"package",
"private",
"protected",
"public",
"abstract",
"auto",
"const",
"final",
"__gshared",
"immutable",
"inout",
"scope",
"shared",
"static",
"synchronized",
"alias",
"asm",
"assert",
"body",
"break",
"case",
"cast",
"catch",
"class",
"continue",
"debug",
"default",
"delegate",
"delete",
"do",
"else",
"enum",
"false",
"finally",
"foreach",
"foreach_reverse",
"for",
"goto",
"if",
"import",
"in",
"interface",
"invariant",
"is",
"lazy",
"macro",
"mixin",
"module",
"new",
"nothrow",
"null",
"out",
"override",
"pure",
"ref",
"return",
"struct",
"super",
"switch",
"template",
"this",
"throw",
"true",
"try",
"typedef",
"typeid",
"typeof",
"union",
"unittest",
"version",
"volatile",
"while",
"with",
"__DATE__",
"__EOF__",
"__TIME__",
"__TIMESTAMP__",
"__VENDOR__",
"__VERSION__",
"__FILE__",
"__LINE__",
null, // Comment
null, // Identifier
null, // ScriptLine
"__traits",
"__parameters",
"__vector",
null, // Whitespace
null, // SpecialTokenSequence
null, // DoubleLiteral
null, // FloatLiteral
null, // IDoubleLiteral
null, // IFloatLiteral
null, // IntLiteral
null, // LongLiteral
null, // RealLiteral
null, // IRealLiteral
null, // UnsignedIntLiteral
null, // UnsignedLongLiteral
null, // DStringLiteral
null, // StringLiteral
null, // WStringLiteral
];
immutable string opKwdValues =
"#/=*=+=++-=--^^=~=<<=%==>>>=||=&&=,;:!<=!<>=!=!>=?...()[]{}@$"
~ "boolcdoublecentcfloatcrealdchardstringfunctionidoubleifloatirealubyte"
~ "ucentuintulongushortvoidwcharwstringaligndeprecatedexternpragmaexport"
~ "packageprivateprotectedpublicabstractautoconstfinal__gsharedimmutable"
~ "inoutscopesharedstaticsynchronizedaliasasmassertbodybreakcasecastcatch"
~ "classcontinuedebugdefaultdelegatedeleteelseenumfalsefinally"
~ "foreach_reversegotoimportinterfaceinvariantlazymacromixinmodule"
~ "newnothrownulloverridepurerefreturnstructsuperswitchtemplatethistruetry"
~ "typedeftypeidtypeofunionunittestversionvolatilewhilewith__traits"
~ "__vector__parameters__DATE__EOF__TIME__TIMESTAMP__VENDOR__VERSION__"
~ "FILE__LINE__";
void main(string[] args)
{
    writeln("immutable(string[]) tokenValues = [");
    foreach (s; opkwds)
    {
        if (s is null)
        {
            writeln("\tnull,");
            continue;
        }
        auto n = opKwdValues.countUntil(s);
        writeln("\topKwdValues[", n, " .. ", n + s.length, "], // ", s);
    }
    writeln("];");
}
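
Note: the generator deleted above built the tokenValues table by locating each fixed token inside the packed opKwdValues string and emitting a slice for it. A minimal sketch of that lookup, using the real offsets ("/=" starts right after the leading "#", so countUntil returns 1):

import std.algorithm : countUntil;
import std.stdio : writeln;

void demo()
{
    // prefix of opKwdValues; "/=" begins at offset 1
    enum values = "#/=*=+=++-=--^^=~=<<=%==>";
    auto n = values.countUntil("/=");                      // n == 1
    writeln("\topKwdValues[", n, " .. ", n + 2, "], // /=");
    // prints (after a tab): opKwdValues[1 .. 3], // /=
}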

main.d (26 changed lines)

@@ -97,12 +97,13 @@ int main(string[] args)
 {
     string[] importDirs;
     bool sloc;
-    /+bool dotComplete;+/
-    /+bool json;+/
-    /+bool parenComplete;+/
+    bool dotComplete;
+    bool parenComplete;
     bool highlight;
     bool ctags;
-    bool recursiveCtags;
+    bool json;
+    bool declaration;
+    bool recursive;
     bool format;
     bool help;
     bool tokenCount;
@@ -112,8 +113,9 @@ int main(string[] args)
     {
         getopt(args, "I", &importDirs,/+ "dotComplete|d", &dotComplete,+/ "sloc|l", &sloc,
             /+"json|j", &json,+/ /+"parenComplete|p", &parenComplete,+/ "highlight", &highlight,
-            "ctags|c", &ctags, "recursive|r|R", &recursiveCtags, "help|h", &help,
-            "tokenCount", &tokenCount, "frequencyCount", &frequencyCount);
+            "ctags|c", &ctags, "recursive|r|R", &recursive, "help|h", &help,
+            "tokenCount", &tokenCount, "frequencyCount", &frequencyCount,
+            "declaration|e", &declaration);
     }
     catch (Exception e)
     {
@@ -148,7 +150,10 @@ int main(string[] args)
         config.fileName = arg;
         uint count;
         auto f = File(arg);
-        ubyte[] buffer = uninitializedArray!(ubyte[])(f.size);
+        import core.stdc.stdlib;
+        ubyte[] buffer = (cast(ubyte*)malloc(f.size))[0..f.size];
+        scope(exit) free(buffer.ptr);
+        //uninitializedArray!(ubyte[])(f.size);
         foreach (t; byToken(f.rawRead(buffer), config))
         {
             if (tokenCount)
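
The buffer change above trades std.array.uninitializedArray for a C-heap allocation so the raw file bytes never touch the GC. A self-contained sketch of the same pattern; withFileBytes is a made-up helper name, not part of Dscanner:

import core.stdc.stdlib : malloc, free;
import std.stdio : File;

// Read a whole file into a malloc'd buffer, hand the bytes to a callback,
// and free the buffer when the scope exits.
void withFileBytes(string path, scope void delegate(ubyte[]) process)
{
    auto f = File(path);
    auto len = cast(size_t) f.size;
    ubyte[] buffer = (cast(ubyte*) malloc(len))[0 .. len];
    scope(exit) free(buffer.ptr);
    process(f.rawRead(buffer));
}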
@@ -210,9 +215,16 @@ options:
         and methods available in the current scope that begin with the text
         before the cursor position.
 
+    --declaration | -e [sourceFile] cursorPosition
+        Prints the absolute path to the file in which the symbol at the cursor
+        position was declared, as well as its line number.
+
     --highlight [sourceFile] - Syntax-highlight the given source file. The
         resulting HTML will be written to standard output.
 
+    --imports | -i [sourceFiles]
+        Prints modules imported by the given source file.
+
     -I includePath
         Include _includePath_ in the list of paths used to search for imports.
         By default dscanner will search in the current working directory as
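
Going by the option summary above, a --declaration call would look roughly like the line below; the file name and cursor position are invented for illustration.

    dscanner --declaration mymodule.d 1450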

File diff suppressed because it is too large.

(another changed file)

@@ -975,10 +975,10 @@ private:
     void lexWhitespace(bool keep)()
     {
         current.type = TokenType.whitespace;
-        while (!isEoF() && isWhite())
+        do
         {
             nextChar();
-        }
+        }while (!isEoF() && isWhite());
         static if (keep) setTokenValue();
     }
 
@@ -2010,15 +2010,18 @@ private:
                     return idx;
                 }
             }
-            auto chunk = buffer[0..idx];
-            auto entity = cast(string)chunk in characterEntities;
-            if (entity is null)
+            //TODO: avoid looking up as UTF string, use raw bytes
+            string chunk = cast(string)buffer[0..idx];
+            auto names = assumeSorted(map!"a.name"(characterEntities));
+            auto place = names.lowerBound(chunk).length;
+            if (place == names.length || names[place] != chunk)
             {
                 errorMessage("Invalid character entity \"&%s;\""
                     .format(cast(string) chunk));
                 return 1;
             }
-            dest.put(cast(ubyte[]) (*entity)[0..$]);
+            auto entity = characterEntities[place].value;
+            dest.put(cast(ubyte[]) entity);
             return entity.length;
         default:
             errorMessage("Invalid escape sequence");
@@ -2099,7 +2102,7 @@ private:
         if (ch >= '[' && ch <= '^') return true;
         if (ch >= '{' && ch <= '~') return true;
         if (ch == '`') return true;
-        if (isWhite()) return true; //TODO: test only long 'whites'
+        if ((ch & 0x80) && isLongWhite()) return true;
         return false;
     }
 
@@ -2108,24 +2111,30 @@ private:
         auto c = src.front;
         if (c & 0x80) // multi-byte utf-8
         {
-            //TODO: here and elsewhere we'd better have
-            // some kind of lookahead in LexSource instead of .save
-            auto r = src.save();
-            if (r.front != 0xe2)
-                return false;
-            else
-                r.popFront();
-            if (r.empty || r.front != 0x80)
-                return false;
-            else
-                r.popFront();
-            if (r.empty || (r.front != 0xa8 && r.front != 0xa9))
-                return false;
-            return true;
+            return isLongWhite();
         }
         else
             return c == 0x20 || (c >= 0x09 && c <= 0x0d);
     }
 
+    bool isLongWhite()
+    {
+        assert(src.front & 0x80); // only non-ascii
+        //TODO: here and elsewhere we'd better have
+        // some kind of lookahead in LexSource instead of .save
+        auto r = src.save();
+        if (r.front != 0xe2)
+            return false;
+        else
+            r.popFront();
+        if (r.empty || r.front != 0x80)
+            return false;
+        else
+            r.popFront();
+        if (r.empty || (r.front != 0xa8 && r.front != 0xa9))
+            return false;
+        return true;
+    }
+
     void errorMessage(string s)
     {
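
For context on the isLongWhite helper factored out above: the only multi-byte whitespace it accepts is U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR, whose UTF-8 encodings are E2 80 A8 and E2 80 A9. The same check over a plain byte slice (not the lexer's LexSource API) looks like this:

// True when the slice begins with U+2028 (E2 80 A8) or U+2029 (E2 80 A9).
bool startsWithLongWhite(const(ubyte)[] bytes)
{
    return bytes.length >= 3
        && bytes[0] == 0xe2
        && bytes[1] == 0x80
        && (bytes[2] == 0xa8 || bytes[2] == 0xa9);
}

unittest
{
    assert(startsWithLongWhite([0xe2, 0x80, 0xa8]));
    assert(!startsWithLongWhite(cast(const(ubyte)[]) " "));
}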
@@ -3050,40 +3059,33 @@ struct StringCache
     if(isRandomAccessRange!R
         && is(Unqual!(ElementType!R) : const(ubyte)))
     {
-        size_t bucket;
-        hash_t h;
-        string* val = find(range, bucket, h);
-        if (val !is null)
+        uint h = hash(range);
+        uint bucket = h % mapSize;
+        Slot *s = &index[bucket];
+        //1st slot not yet initialized?
+        if(s.value.ptr == null)
+        {
+            *s = Slot(putIntoCache(range), null, h);
+            return s.value;
+        }
+        Slot* insSlot = s;
+        for(;;)
         {
-            return *val;
-        }
-        else
-        {
-            auto s = putIntoCache(range);
-            index[bucket] ~= s;
-            return s;
-        }
+            if(s.hash == h && s.value.equal(range))
+                return s.value;
+            insSlot = s;
+            s = s.next;
+            if(s == null) break;
+        }
+        string str = putIntoCache(range);
+        insertIntoSlot(insSlot, str, h);
+        return str;
     }
 
 private:
-    import core.stdc.string;
-
-    string* find(R)(R data, out size_t bucket, out hash_t h)
-    {
-        h = hash(data);
-        bucket = h % mapSize;
-        foreach (i; 0 .. index[bucket].length)
-        {
-            if (equal(index[bucket][i], data))
-            {
-                return &index[bucket][i];
-            }
-        }
-        return null;
-    }
-
-    static hash_t hash(R)(R data)
+    static uint hash(R)(R data)
     {
         uint hash = 0;
         foreach (b; data)
@@ -3095,28 +3097,54 @@ private:
     }
 
     enum mapSize = 2048;
-    string[][mapSize] index;
+
+    struct Slot
+    {
+        string value;
+        Slot* next;
+        uint hash;
+    };
+
+    void insertIntoSlot(Slot* tgt, string val, uint hash)
+    {
+        auto slice = allocateInCache(Slot.sizeof);
+        auto newSlot = cast(Slot*)slice.ptr;
+        *newSlot = Slot(val, null, hash);
+        tgt.next = newSlot;
+    }
+
+    Slot[mapSize] index;
     // leave some slack for alloctors/GC meta-data
     enum chunkSize = 16*1024 - size_t.sizeof*8;
     ubyte*[] chunkS;
     size_t next = chunkSize;
 
-    string putIntoCache(R)(R data)
+    ubyte[] allocateInCache(size_t size)
    {
         import core.memory;
-
-        if(next + data.length > chunkSize)
+        if(next + size > chunkSize)
         {
-            // avoid huge strings
-            if(data.length > chunkSize/4)
-                return (cast(char[])data).idup;
+            // avoid huge allocations
+            if(size> chunkSize/4)
+            {
+                ubyte* p = cast(ubyte*)GC.malloc(size,
+                    GC.BlkAttr.NO_SCAN | GC.BlkAttr.NO_INTERIOR);
+                return p[0..size];
+            }
             chunkS ~= cast(ubyte*)GC.malloc(chunkSize,
                 GC.BlkAttr.NO_SCAN | GC.BlkAttr.NO_INTERIOR);
             next = 0;
         }
-        auto slice = chunkS[$-1][next..next+data.length];
+        auto slice = chunkS[$-1][next..next+size];
+        next += size;
+        return slice;
+    }
+
+    string putIntoCache(R)(R data)
+    {
+        auto slice = allocateInCache(data.length);
         slice[] = data[];
-        next += data.length;
         return cast(string)slice;
     }
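
Taken together, the last two hunks turn StringCache into a fixed-size table of chained Slots whose nodes and string bytes both live in unscanned GC chunks. Below is a rough sketch of the intern pattern only; it heap-allocates slots, uses a placeholder hash, and prepends rather than appends to the chain, so treat it as an outline of the idea, not the StringCache code itself:

import std.algorithm : equal;

struct Slot { string value; Slot* next; uint hash; }

enum mapSize = 2048;
Slot*[mapSize] buckets;

// Return the cached copy of `bytes`, inserting it on first sight.
string intern(const(ubyte)[] bytes)
{
    uint h = 0;
    foreach (b; bytes)               // placeholder hash, not StringCache's
        h = h * 31 + b;
    auto bucket = h % mapSize;
    for (Slot* s = buckets[bucket]; s !is null; s = s.next)
        if (s.hash == h && equal(s.value, bytes))
            return s.value;          // hit: reuse the interned copy
    auto str = cast(string) bytes.idup;   // miss: copy and chain a new slot
    buckets[bucket] = new Slot(str, buckets[bucket], h);
    return str;
}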