From 18889620b548acc27654af4047e1b73b2a70557b Mon Sep 17 00:00:00 2001 From: "Roman D. Boiko" Date: Thu, 26 Apr 2012 11:57:49 +0300 Subject: [PATCH] Updates according to code review --- codegen.d | 27 ++++++++++++++++++++++++++- tokenizer.d | 49 +++++++++++++++++++++++++------------------------ 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/codegen.d b/codegen.d index 13b744e..8fd54c7 100644 --- a/codegen.d +++ b/codegen.d @@ -60,7 +60,7 @@ string printCaseStatements(K, V)(TrieNode!(K,V) node, string indentString) if (v.children.length > 0) { caseStatement ~= indentString; - caseStatement ~= "\tif (endIndex >= inputString.length)\n"; + caseStatement ~= "\tif (isEoF(inputString, endIndex))\n"; caseStatement ~= indentString; caseStatement ~= "\t{\n"; caseStatement ~= indentString; @@ -110,3 +110,28 @@ string generateCaseTrie(string[] args ...) } return printCaseStatements(t, ""); } + +/** + * Returns: true if index points to end of inputString, false otherwise + */ +pure nothrow bool isEoF(S)(S inputString, size_t index) +{ + // note: EoF is determined according to D specification + return index >= inputString.length + || inputString[index] == Character.NUL + || inputString[index] == Character.SUB; +} + +private: + + // Unicode character literals + enum Character + { + // End of file (EoF) + NUL = '\u0000', // NUL character + SUB = '\u001A', // Substitute character + + // Line feed (EoL) + CR = '\u000D', // CR character + LF = '\u000A', // LF character + } \ No newline at end of file diff --git a/tokenizer.d b/tokenizer.d index efb67a5..3fb9142 100644 --- a/tokenizer.d +++ b/tokenizer.d @@ -432,12 +432,34 @@ pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex, } } - // todo: in some cases loop is interrupted before float literal is parsed, and some invalid inputs are accepted; - // suggested solution is to extract lexing integer into a separate function + // suggest to extract lexing integers into a separate function + // please see 
unittest below token.value = inputString[startIndex .. endIndex]; } +unittest { + dump!lexDecimal("55e-4"); // yields intLiteral, but should be float + dump!lexDecimal("3e+f"); // floatLiteral, but should be considered invalid + dump!lexDecimal("3e++f"); // intLiteral 3e+, but should be considered invalid + // actually, there are lots of bugs. The point is that without decomposition of integer lexing from floating-point lexing + // it is very hard to prove algorithm correctness +} + +// Temporary function to illustrate some problems +// Executes T and dumps results to console +void dump(alias T)(string s) { + size_t start; + size_t end; + Token tok; + T!(string)(s, start, end, tok); + // dump results + writeln(tok.type); + writeln(tok.value); + writeln(start); + writeln(end); +} + nothrow void lexHex(S)(ref S inputString, ref size_t startIndex, ref size_t endIndex, ref Token token) if (isSomeString!S) { @@ -742,30 +764,9 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl // This should never happen. if (endIndex <= prevIndex) { - stderr.writeln("FAIL"); // why not put assert(false)? being here indicates a bug in code, I guess + stderr.writeln("FAIL"); return []; } } return tokenAppender.data; } - -private: - - /** - * Returns: true if index points to end of inputString, false otherwise - */ - pure nothrow bool isEoF(S)(S inputString, size_t index) - { - // note: EoF is determined according to D specification - return index >= inputString.length - || inputString[index] == NUL_CHAR - || inputString[index] == SUB_CHAR; - } - - // End of file (EoF) - const NUL_CHAR = '\u0000'; // NUL character - const SUB_CHAR = '\u001A'; // Substitute character - - // Line feed (EoL) - const CR_CHAR = '\u000D'; // CR character - const LF_CHAR = '\u000A'; // LF character \ No newline at end of file