ldc/ddmd/lexer.d
David Nadlinger 9f998a398d Initial merge of upstream v2.071.0-b2
Notably, the glue layer side of the changed multiple interface
inheritance layout (DMD a54e89d) has not been implemented yet.

This corresponds to DMD commit 3f6a763c0589dd03c1c206eafd434b593702564e.
2016-04-03 15:15:14 +01:00

2516 lines
72 KiB
D

// Compiler implementation of the D programming language
// Copyright (c) 1999-2015 by Digital Mars
// All Rights Reserved
// written by Walter Bright
// http://www.digitalmars.com
// Distributed under the Boost Software License, Version 1.0.
// http://www.boost.org/LICENSE_1_0.txt
module ddmd.lexer;
import core.stdc.ctype;
import core.stdc.errno;
import core.stdc.stdarg;
import core.stdc.stdio;
import core.stdc.string;
import core.stdc.time;
import ddmd.entity;
import ddmd.errors;
import ddmd.globals;
import ddmd.id;
import ddmd.identifier;
import ddmd.root.longdouble;
import ddmd.root.outbuffer;
import ddmd.root.port;
import ddmd.root.rmem;
import ddmd.root.stringtable;
import ddmd.tokens;
import ddmd.utf;
enum LS = 0x2028; // UTF line separator
enum PS = 0x2029; // UTF paragraph separator
/********************************************
* Do our own char maps
*/
immutable ubyte[256] cmtable;
enum CMoctal = 0x1;
enum CMhex = 0x2;
enum CMidchar = 0x4;
enum CMzerosecond = 0x8;
enum CMdigitsecond = 0x10;
enum CMsinglechar = 0x20;
bool isoctal(char c)
{
return (cmtable[c] & CMoctal) != 0;
}
bool ishex(char c)
{
return (cmtable[c] & CMhex) != 0;
}
bool isidchar(char c)
{
return (cmtable[c] & CMidchar) != 0;
}
bool isZeroSecond(char c)
{
return (cmtable[c] & CMzerosecond) != 0;
}
bool isDigitSecond(char c)
{
return (cmtable[c] & CMdigitsecond) != 0;
}
bool issinglechar(char c)
{
return (cmtable[c] & CMsinglechar) != 0;
}
static this()
{
foreach (const c; 0 .. cmtable.length)
{
if ('0' <= c && c <= '7')
cmtable[c] |= CMoctal;
if (isxdigit(c))
cmtable[c] |= CMhex;
if (isalnum(c) || c == '_')
cmtable[c] |= CMidchar;
switch (c)
{
case 'x': case 'X':
case 'b': case 'B':
cmtable[c] |= CMzerosecond;
break;
case '0': .. case '9':
case 'e': case 'E':
case 'f': case 'F':
case 'l': case 'L':
case 'p': case 'P':
case 'u': case 'U':
case 'i':
case '.':
case '_':
cmtable[c] |= CMzerosecond | CMdigitsecond;
break;
default:
break;
}
switch (c)
{
case '\\':
case '\n':
case '\r':
case 0:
case 0x1A:
case '\'':
break;
default:
if (!(c & 0x80))
cmtable[c] |= CMsinglechar;
break;
}
}
}
unittest
{
//printf("lexer.unittest\n");
/* Not much here, just trying things out.
*/
string text = "int";
scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0);
TOK tok;
tok = lex1.nextToken();
//printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOKint32);
assert(tok == TOKint32);
tok = lex1.nextToken();
assert(tok == TOKeof);
tok = lex1.nextToken();
assert(tok == TOKeof);
}
/***********************************************************
*/
class Lexer
{
public:
__gshared OutBuffer stringbuffer;
Loc scanloc; // for error messages
const(char)* base; // pointer to start of buffer
const(char)* end; // past end of buffer
const(char)* p; // current character
const(char)* line; // start of current line
Token token;
bool doDocComment; // collect doc comment information
bool anyToken; // seen at least one token
bool commentToken; // comments are TOKcomment's
bool errors; // errors occurred during lexing or parsing
/*********************
* Creates a Lexer.
* Params:
* filename = used for error messages
* base = source code, ending in a 0 byte
* begoffset = starting offset into base[]
* endoffset = last offset into base[]
* doDocComment = handle documentation comments
* commentToken = comments become TOKcomment's
*/
this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset, bool doDocComment, bool commentToken)
{
scanloc = Loc(filename, 1, 1);
//printf("Lexer::Lexer(%p,%d)\n",base,length);
//printf("lexer.filename = %s\n", filename);
token = Token.init;
this.base = base;
this.end = base + endoffset;
p = base + begoffset;
line = p;
this.doDocComment = doDocComment;
this.commentToken = commentToken;
//initKeywords();
/* If first line starts with '#!', ignore the line
*/
if (p[0] == '#' && p[1] == '!')
{
p += 2;
while (1)
{
char c = *p;
switch (c)
{
case '\n':
p++;
break;
case '\r':
p++;
if (*p == '\n')
p++;
break;
case 0:
case 0x1A:
break;
default:
if (c & 0x80)
{
uint u = decodeUTF();
if (u == PS || u == LS)
break;
}
p++;
continue;
}
break;
}
endOfLine();
}
}
final TOK nextToken()
{
if (token.next)
{
Token* t = token.next;
memcpy(&token, t, Token.sizeof);
t.free();
}
else
{
scan(&token);
}
//token.print();
return token.value;
}
/***********************
* Look ahead at next token's value.
*/
final TOK peekNext()
{
return peek(&token).value;
}
/***********************
* Look 2 tokens ahead at value.
*/
final TOK peekNext2()
{
Token* t = peek(&token);
return peek(t).value;
}
/****************************
* Turn next token in buffer into a token.
*/
final void scan(Token* t)
{
const lastLine = scanloc.linnum;
Loc startLoc;
t.blockComment = null;
t.lineComment = null;
while (1)
{
t.ptr = p;
//printf("p = %p, *p = '%c'\n",p,*p);
t.loc = loc();
switch (*p)
{
case 0:
case 0x1A:
t.value = TOKeof; // end of file
return;
case ' ':
case '\t':
case '\v':
case '\f':
p++;
continue;
// skip white space
case '\r':
p++;
if (*p != '\n') // if CR stands by itself
endOfLine();
continue;
// skip white space
case '\n':
p++;
endOfLine();
continue;
// skip white space
case '0':
if (!isZeroSecond(p[1])) // if numeric literal does not continue
{
++p;
t.uns64value = 0;
t.value = TOKint32v;
return;
}
goto Lnumber;
case '1': .. case '9':
if (!isDigitSecond(p[1])) // if numeric literal does not continue
{
t.uns64value = *p - '0';
++p;
t.value = TOKint32v;
return;
}
Lnumber:
t.value = number(t);
return;
case '\'':
if (issinglechar(p[1]) && p[2] == '\'')
{
t.uns64value = p[1]; // simple one character literal
t.value = TOKcharv;
p += 3;
}
else
t.value = charConstant(t);
return;
case 'r':
if (p[1] != '"')
goto case_ident;
p++;
goto case '`';
case '`':
t.value = wysiwygStringConstant(t, *p);
return;
case 'x':
if (p[1] != '"')
goto case_ident;
p++;
t.value = hexStringConstant(t);
return;
case 'q':
if (p[1] == '"')
{
p++;
t.value = delimitedStringConstant(t);
return;
}
else if (p[1] == '{')
{
p++;
t.value = tokenStringConstant(t);
return;
}
else
goto case_ident;
case '"':
t.value = escapeStringConstant(t, 0);
return;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
/*case 'q': case 'r':*/
case 's':
case 't':
case 'u':
case 'v':
case 'w':
/*case 'x':*/
case 'y':
case 'z':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '_':
case_ident:
{
while (1)
{
const c = *++p;
if (isidchar(c))
continue;
else if (c & 0x80)
{
const s = p;
const u = decodeUTF();
if (isUniAlpha(u))
continue;
error("char 0x%04x not allowed in identifier", u);
p = s;
}
break;
}
Identifier id = Identifier.idPool(cast(char*)t.ptr, p - t.ptr);
t.ident = id;
t.value = cast(TOK)id.value;
anyToken = 1;
if (*t.ptr == '_') // if special identifier token
{
__gshared bool initdone = false;
__gshared char[11 + 1] date;
__gshared char[8 + 1] time;
__gshared char[24 + 1] timestamp;
if (!initdone) // lazy evaluation
{
initdone = true;
time_t ct;
.time(&ct);
const p = ctime(&ct);
assert(p);
sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
sprintf(&time[0], "%.8s", p + 11);
sprintf(&timestamp[0], "%.24s", p);
}
if (id == Id.DATE)
{
t.ustring = date.ptr;
goto Lstr;
}
else if (id == Id.TIME)
{
t.ustring = time.ptr;
goto Lstr;
}
else if (id == Id.VENDOR)
{
t.ustring = global.compiler.vendor;
goto Lstr;
}
else if (id == Id.TIMESTAMP)
{
t.ustring = timestamp.ptr;
Lstr:
t.value = TOKstring;
t.postfix = 0;
t.len = cast(uint)strlen(t.ustring);
}
else if (id == Id.VERSIONX)
{
uint major = 0;
uint minor = 0;
bool point = false;
for (const(char)* p = global._version + 1; 1; p++)
{
const c = *p;
if (isdigit(cast(char)c))
minor = minor * 10 + c - '0';
else if (c == '.')
{
if (point)
break;
// ignore everything after second '.'
point = true;
major = minor;
minor = 0;
}
else
break;
}
t.value = TOKint64v;
t.uns64value = major * 1000 + minor;
}
else if (id == Id.EOFX)
{
t.value = TOKeof;
// Advance scanner to end of file
while (!(*p == 0 || *p == 0x1A))
p++;
}
}
//printf("t->value = %d\n",t->value);
return;
}
case '/':
p++;
switch (*p)
{
case '=':
p++;
t.value = TOKdivass;
return;
case '*':
p++;
startLoc = loc();
while (1)
{
while (1)
{
const c = *p;
switch (c)
{
case '/':
break;
case '\n':
endOfLine();
p++;
continue;
case '\r':
p++;
if (*p != '\n')
endOfLine();
continue;
case 0:
case 0x1A:
error("unterminated /* */ comment");
p = end;
t.loc = loc();
t.value = TOKeof;
return;
default:
if (c & 0x80)
{
const u = decodeUTF();
if (u == PS || u == LS)
endOfLine();
}
p++;
continue;
}
break;
}
p++;
if (p[-2] == '*' && p - 3 != t.ptr)
break;
}
if (commentToken)
{
t.loc = startLoc;
t.value = TOKcomment;
return;
}
else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
{
// if /** but not /**/
getDocComment(t, lastLine == startLoc.linnum);
}
continue;
case '/':
// do // style comments
startLoc = loc();
while (1)
{
const c = *++p;
switch (c)
{
case '\n':
break;
case '\r':
if (p[1] == '\n')
p++;
break;
case 0:
case 0x1A:
if (commentToken)
{
p = end;
t.loc = startLoc;
t.value = TOKcomment;
return;
}
if (doDocComment && t.ptr[2] == '/')
getDocComment(t, lastLine == startLoc.linnum);
p = end;
t.loc = loc();
t.value = TOKeof;
return;
default:
if (c & 0x80)
{
const u = decodeUTF();
if (u == PS || u == LS)
break;
}
continue;
}
break;
}
if (commentToken)
{
p++;
endOfLine();
t.loc = startLoc;
t.value = TOKcomment;
return;
}
if (doDocComment && t.ptr[2] == '/')
getDocComment(t, lastLine == startLoc.linnum);
p++;
endOfLine();
continue;
case '+':
{
int nest;
startLoc = loc();
p++;
nest = 1;
while (1)
{
char c = *p;
switch (c)
{
case '/':
p++;
if (*p == '+')
{
p++;
nest++;
}
continue;
case '+':
p++;
if (*p == '/')
{
p++;
if (--nest == 0)
break;
}
continue;
case '\r':
p++;
if (*p != '\n')
endOfLine();
continue;
case '\n':
endOfLine();
p++;
continue;
case 0:
case 0x1A:
error("unterminated /+ +/ comment");
p = end;
t.loc = loc();
t.value = TOKeof;
return;
default:
if (c & 0x80)
{
uint u = decodeUTF();
if (u == PS || u == LS)
endOfLine();
}
p++;
continue;
}
break;
}
if (commentToken)
{
t.loc = startLoc;
t.value = TOKcomment;
return;
}
if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
{
// if /++ but not /++/
getDocComment(t, lastLine == startLoc.linnum);
}
continue;
}
default:
break;
}
t.value = TOKdiv;
return;
case '.':
p++;
if (isdigit(*p))
{
/* Note that we don't allow ._1 and ._ as being
* valid floating point numbers.
*/
p--;
t.value = inreal(t);
}
else if (p[0] == '.')
{
if (p[1] == '.')
{
p += 2;
t.value = TOKdotdotdot;
}
else
{
p++;
t.value = TOKslice;
}
}
else
t.value = TOKdot;
return;
case '&':
p++;
if (*p == '=')
{
p++;
t.value = TOKandass;
}
else if (*p == '&')
{
p++;
t.value = TOKandand;
}
else
t.value = TOKand;
return;
case '|':
p++;
if (*p == '=')
{
p++;
t.value = TOKorass;
}
else if (*p == '|')
{
p++;
t.value = TOKoror;
}
else
t.value = TOKor;
return;
case '-':
p++;
if (*p == '=')
{
p++;
t.value = TOKminass;
}
else if (*p == '-')
{
p++;
t.value = TOKminusminus;
}
else
t.value = TOKmin;
return;
case '+':
p++;
if (*p == '=')
{
p++;
t.value = TOKaddass;
}
else if (*p == '+')
{
p++;
t.value = TOKplusplus;
}
else
t.value = TOKadd;
return;
case '<':
p++;
if (*p == '=')
{
p++;
t.value = TOKle; // <=
}
else if (*p == '<')
{
p++;
if (*p == '=')
{
p++;
t.value = TOKshlass; // <<=
}
else
t.value = TOKshl; // <<
}
else if (*p == '>')
{
p++;
if (*p == '=')
{
p++;
t.value = TOKleg; // <>=
}
else
t.value = TOKlg; // <>
}
else
t.value = TOKlt; // <
return;
case '>':
p++;
if (*p == '=')
{
p++;
t.value = TOKge; // >=
}
else if (*p == '>')
{
p++;
if (*p == '=')
{
p++;
t.value = TOKshrass; // >>=
}
else if (*p == '>')
{
p++;
if (*p == '=')
{
p++;
t.value = TOKushrass; // >>>=
}
else
t.value = TOKushr; // >>>
}
else
t.value = TOKshr; // >>
}
else
t.value = TOKgt; // >
return;
case '!':
p++;
if (*p == '=')
{
p++;
t.value = TOKnotequal; // !=
}
else if (*p == '<')
{
p++;
if (*p == '>')
{
p++;
if (*p == '=')
{
p++;
t.value = TOKunord; // !<>=
}
else
t.value = TOKue; // !<>
}
else if (*p == '=')
{
p++;
t.value = TOKug; // !<=
}
else
t.value = TOKuge; // !<
}
else if (*p == '>')
{
p++;
if (*p == '=')
{
p++;
t.value = TOKul; // !>=
}
else
t.value = TOKule; // !>
}
else
t.value = TOKnot; // !
return;
case '=':
p++;
if (*p == '=')
{
p++;
t.value = TOKequal; // ==
}
else if (*p == '>')
{
p++;
t.value = TOKgoesto; // =>
}
else
t.value = TOKassign; // =
return;
case '~':
p++;
if (*p == '=')
{
p++;
t.value = TOKcatass; // ~=
}
else
t.value = TOKtilde; // ~
return;
case '^':
p++;
if (*p == '^')
{
p++;
if (*p == '=')
{
p++;
t.value = TOKpowass; // ^^=
}
else
t.value = TOKpow; // ^^
}
else if (*p == '=')
{
p++;
t.value = TOKxorass; // ^=
}
else
t.value = TOKxor; // ^
return;
case '(':
p++;
t.value = TOKlparen;
return;
case ')':
p++;
t.value = TOKrparen;
return;
case '[':
p++;
t.value = TOKlbracket;
return;
case ']':
p++;
t.value = TOKrbracket;
return;
case '{':
p++;
t.value = TOKlcurly;
return;
case '}':
p++;
t.value = TOKrcurly;
return;
case '?':
p++;
t.value = TOKquestion;
return;
case ',':
p++;
t.value = TOKcomma;
return;
case ';':
p++;
t.value = TOKsemicolon;
return;
case ':':
p++;
t.value = TOKcolon;
return;
case '$':
p++;
t.value = TOKdollar;
return;
case '@':
p++;
t.value = TOKat;
return;
case '*':
p++;
if (*p == '=')
{
p++;
t.value = TOKmulass;
}
else
t.value = TOKmul;
return;
case '%':
p++;
if (*p == '=')
{
p++;
t.value = TOKmodass;
}
else
t.value = TOKmod;
return;
case '#':
{
p++;
Token n;
scan(&n);
if (n.value == TOKidentifier && n.ident == Id.line)
{
poundLine();
continue;
}
else
{
t.value = TOKpound;
return;
}
}
default:
{
dchar c = *p;
if (c & 0x80)
{
c = decodeUTF();
// Check for start of unicode identifier
if (isUniAlpha(c))
goto case_ident;
if (c == PS || c == LS)
{
endOfLine();
p++;
continue;
}
}
if (c < 0x80 && isprint(c))
error("character '%c' is not a valid token", c);
else
error("character 0x%02x is not a valid token", c);
p++;
continue;
}
}
}
}
final Token* peek(Token* ct)
{
Token* t;
if (ct.next)
t = ct.next;
else
{
t = Token.alloc();
scan(t);
ct.next = t;
}
return t;
}
/*********************************
* tk is on the opening (.
* Look ahead and return token that is past the closing ).
*/
final Token* peekPastParen(Token* tk)
{
//printf("peekPastParen()\n");
int parens = 1;
int curlynest = 0;
while (1)
{
tk = peek(tk);
//tk->print();
switch (tk.value)
{
case TOKlparen:
parens++;
continue;
case TOKrparen:
--parens;
if (parens)
continue;
tk = peek(tk);
break;
case TOKlcurly:
curlynest++;
continue;
case TOKrcurly:
if (--curlynest >= 0)
continue;
break;
case TOKsemicolon:
if (curlynest)
continue;
break;
case TOKeof:
break;
default:
continue;
}
return tk;
}
}
/*******************************************
* Parse escape sequence.
*/
final uint escapeSequence()
{
uint c = *p;
int ndigits;
switch (c)
{
case '\'':
case '"':
case '?':
case '\\':
Lconsume:
p++;
break;
case 'a':
c = 7;
goto Lconsume;
case 'b':
c = 8;
goto Lconsume;
case 'f':
c = 12;
goto Lconsume;
case 'n':
c = 10;
goto Lconsume;
case 'r':
c = 13;
goto Lconsume;
case 't':
c = 9;
goto Lconsume;
case 'v':
c = 11;
goto Lconsume;
case 'u':
ndigits = 4;
goto Lhex;
case 'U':
ndigits = 8;
goto Lhex;
case 'x':
ndigits = 2;
Lhex:
p++;
c = *p;
if (ishex(cast(char)c))
{
uint v = 0;
int n = 0;
while (1)
{
if (isdigit(cast(char)c))
c -= '0';
else if (islower(c))
c -= 'a' - 10;
else
c -= 'A' - 10;
v = v * 16 + c;
c = *++p;
if (++n == ndigits)
break;
if (!ishex(cast(char)c))
{
error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
break;
}
}
if (ndigits != 2 && !utf_isValidDchar(v))
{
error("invalid UTF character \\U%08x", v);
v = '?'; // recover with valid UTF character
}
c = v;
}
else
error("undefined escape hex sequence \\%c", c);
break;
case '&':
// named character entity
for (const idstart = ++p; 1; p++)
{
switch (*p)
{
case ';':
c = HtmlNamedEntity(idstart, p - idstart);
if (c == ~0)
{
error("unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
c = ' ';
}
p++;
break;
default:
if (isalpha(*p) || (p != idstart && isdigit(*p)))
continue;
error("unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
break;
}
break;
}
break;
case 0:
case 0x1A:
// end of file
c = '\\';
break;
default:
if (isoctal(cast(char)c))
{
uint v = 0;
int n = 0;
do
{
v = v * 8 + (c - '0');
c = *++p;
}
while (++n < 3 && isoctal(cast(char)c));
c = v;
if (c > 0xFF)
error("escape octal sequence \\%03o is larger than \\377", c);
}
else
error("undefined escape sequence \\%c", c);
break;
}
return c;
}
/**************************************
*/
final TOK wysiwygStringConstant(Token* t, int tc)
{
Loc start = loc();
p++;
stringbuffer.reset();
while (1)
{
dchar c = *p++;
switch (c)
{
case '\n':
endOfLine();
break;
case '\r':
if (*p == '\n')
continue;
// ignore
c = '\n'; // treat EndOfLine as \n character
endOfLine();
break;
case 0:
case 0x1A:
error("unterminated string constant starting at %s", start.toChars());
t.setString();
return TOKstring;
case '"':
case '`':
if (c == tc)
{
t.setString(stringbuffer);
stringPostfix(t);
return TOKstring;
}
break;
default:
if (c & 0x80)
{
p--;
const u = decodeUTF();
p++;
if (u == PS || u == LS)
endOfLine();
stringbuffer.writeUTF8(u);
continue;
}
break;
}
stringbuffer.writeByte(c);
}
}
/**************************************
* Lex hex strings:
* x"0A ae 34FE BD"
*/
final TOK hexStringConstant(Token* t)
{
Loc start = loc();
uint n = 0;
uint v = ~0; // dead assignment, needed to suppress warning
p++;
stringbuffer.reset();
while (1)
{
dchar c = *p++;
switch (c)
{
case ' ':
case '\t':
case '\v':
case '\f':
continue; // skip white space
case '\r':
if (*p == '\n')
continue; // ignore '\r' if followed by '\n'
// Treat isolated '\r' as if it were a '\n'
goto case '\n';
case '\n':
endOfLine();
continue;
case 0:
case 0x1A:
error("unterminated string constant starting at %s", start.toChars());
t.setString();
return TOKxstring;
case '"':
if (n & 1)
{
error("odd number (%d) of hex characters in hex string", n);
stringbuffer.writeByte(v);
}
t.setString(stringbuffer);
stringPostfix(t);
return TOKxstring;
default:
if (c >= '0' && c <= '9')
c -= '0';
else if (c >= 'a' && c <= 'f')
c -= 'a' - 10;
else if (c >= 'A' && c <= 'F')
c -= 'A' - 10;
else if (c & 0x80)
{
p--;
const u = decodeUTF();
p++;
if (u == PS || u == LS)
endOfLine();
else
error("non-hex character \\u%04x in hex string", u);
}
else
error("non-hex character '%c' in hex string", c);
if (n & 1)
{
v = (v << 4) | c;
stringbuffer.writeByte(v);
}
else
v = c;
n++;
break;
}
}
assert(0); // see bug 15731
}
/**************************************
* Lex delimited strings:
* q"(foo(xxx))" // "foo(xxx)"
* q"[foo(]" // "foo("
* q"/foo]/" // "foo]"
* q"HERE
* foo
* HERE" // "foo\n"
* Input:
* p is on the "
*/
final TOK delimitedStringConstant(Token* t)
{
Loc start = loc();
dchar delimleft = 0;
dchar delimright = 0;
uint nest = 1;
uint nestcount = ~0; // dead assignment, needed to suppress warning
Identifier hereid = null;
uint blankrol = 0;
uint startline = 0;
p++;
stringbuffer.reset();
while (1)
{
dchar c = *p++;
//printf("c = '%c'\n", c);
switch (c)
{
case '\n':
Lnextline:
endOfLine();
startline = 1;
if (blankrol)
{
blankrol = 0;
continue;
}
if (hereid)
{
stringbuffer.writeUTF8(c);
continue;
}
break;
case '\r':
if (*p == '\n')
continue;
// ignore
c = '\n'; // treat EndOfLine as \n character
goto Lnextline;
case 0:
case 0x1A:
error("unterminated delimited string constant starting at %s", start.toChars());
t.setString();
return TOKstring;
default:
if (c & 0x80)
{
p--;
c = decodeUTF();
p++;
if (c == PS || c == LS)
goto Lnextline;
}
break;
}
if (delimleft == 0)
{
delimleft = c;
nest = 1;
nestcount = 1;
if (c == '(')
delimright = ')';
else if (c == '{')
delimright = '}';
else if (c == '[')
delimright = ']';
else if (c == '<')
delimright = '>';
else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
{
// Start of identifier; must be a heredoc
Token tok;
p--;
scan(&tok); // read in heredoc identifier
if (tok.value != TOKidentifier)
{
error("identifier expected for heredoc, not %s", tok.toChars());
delimright = c;
}
else
{
hereid = tok.ident;
//printf("hereid = '%s'\n", hereid->toChars());
blankrol = 1;
}
nest = 0;
}
else
{
delimright = c;
nest = 0;
if (isspace(c))
error("delimiter cannot be whitespace");
}
}
else
{
if (blankrol)
{
error("heredoc rest of line should be blank");
blankrol = 0;
continue;
}
if (nest == 1)
{
if (c == delimleft)
nestcount++;
else if (c == delimright)
{
nestcount--;
if (nestcount == 0)
goto Ldone;
}
}
else if (c == delimright)
goto Ldone;
if (startline && isalpha(c) && hereid)
{
Token tok;
auto psave = p;
p--;
scan(&tok); // read in possible heredoc identifier
//printf("endid = '%s'\n", tok.ident->toChars());
if (tok.value == TOKidentifier && tok.ident.equals(hereid))
{
/* should check that rest of line is blank
*/
goto Ldone;
}
p = psave;
}
stringbuffer.writeUTF8(c);
startline = 0;
}
}
Ldone:
if (*p == '"')
p++;
else if (hereid)
error("delimited string must end in %s\"", hereid.toChars());
else
error("delimited string must end in %c\"", delimright);
t.setString(stringbuffer);
stringPostfix(t);
return TOKstring;
}
/**************************************
* Lex delimited strings:
* q{ foo(xxx) } // " foo(xxx) "
* q{foo(} // "foo("
* q{{foo}"}"} // "{foo}"}""
* Input:
* p is on the q
*/
final TOK tokenStringConstant(Token* t)
{
uint nest = 1;
const start = loc();
const pstart = ++p;
while (1)
{
Token tok;
scan(&tok);
switch (tok.value)
{
case TOKlcurly:
nest++;
continue;
case TOKrcurly:
if (--nest == 0)
{
t.setString(pstart, p - 1 - pstart);
stringPostfix(t);
return TOKstring;
}
continue;
case TOKeof:
error("unterminated token string constant starting at %s", start.toChars());
t.setString();
return TOKstring;
default:
continue;
}
}
}
/**************************************
*/
final TOK escapeStringConstant(Token* t, int wide)
{
const start = loc();
p++;
stringbuffer.reset();
while (1)
{
dchar c = *p++;
switch (c)
{
case '\\':
switch (*p)
{
case 'u':
case 'U':
case '&':
c = escapeSequence();
stringbuffer.writeUTF8(c);
continue;
default:
c = escapeSequence();
break;
}
break;
case '\n':
endOfLine();
break;
case '\r':
if (*p == '\n')
continue;
// ignore
c = '\n'; // treat EndOfLine as \n character
endOfLine();
break;
case '"':
t.setString(stringbuffer);
stringPostfix(t);
return TOKstring;
case 0:
case 0x1A:
p--;
error("unterminated string constant starting at %s", start.toChars());
t.setString();
return TOKstring;
default:
if (c & 0x80)
{
p--;
c = decodeUTF();
if (c == LS || c == PS)
{
c = '\n';
endOfLine();
}
p++;
stringbuffer.writeUTF8(c);
continue;
}
break;
}
stringbuffer.writeByte(c);
}
}
/**************************************
*/
final TOK charConstant(Token* t)
{
auto tk = TOKcharv;
//printf("Lexer::charConstant\n");
p++;
dchar c = *p++;
switch (c)
{
case '\\':
switch (*p)
{
case 'u':
t.uns64value = escapeSequence();
tk = TOKwcharv;
break;
case 'U':
case '&':
t.uns64value = escapeSequence();
tk = TOKdcharv;
break;
default:
t.uns64value = escapeSequence();
break;
}
break;
case '\n':
L1:
endOfLine();
goto case;
case '\r':
case 0:
case 0x1A:
case '\'':
error("unterminated character constant");
t.uns64value = '?';
return tk;
default:
if (c & 0x80)
{
p--;
c = decodeUTF();
p++;
if (c == LS || c == PS)
goto L1;
if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
tk = TOKwcharv;
else
tk = TOKdcharv;
}
t.uns64value = c;
break;
}
if (*p != '\'')
{
error("unterminated character constant");
t.uns64value = '?';
return tk;
}
p++;
return tk;
}
/***************************************
* Get postfix of string literal.
*/
final void stringPostfix(Token* t)
{
switch (*p)
{
case 'c':
case 'w':
case 'd':
t.postfix = *p;
p++;
break;
default:
t.postfix = 0;
break;
}
}
/**************************************
* Read in a number.
* If it's an integer, store it in tok.TKutok.Vlong.
* integers can be decimal, octal or hex
* Handle the suffixes U, UL, LU, L, etc.
* If it's double, store it in tok.TKutok.Vdouble.
* Returns:
* TKnum
* TKdouble,...
*/
final TOK number(Token* t)
{
int base = 10;
const start = p;
uinteger_t n = 0; // unsigned >=64 bit integer type
int d;
bool err = false;
bool overflow = false;
dchar c = *p;
if (c == '0')
{
++p;
c = *p;
switch (c)
{
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
n = c - '0';
++p;
base = 8;
break;
case 'x':
case 'X':
++p;
base = 16;
break;
case 'b':
case 'B':
++p;
base = 2;
break;
case '.':
if (p[1] == '.')
goto Ldone;
// if ".."
if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
goto Ldone;
// if ".identifier" or ".unicode"
goto Lreal;
// '.' is part of current token
case 'i':
case 'f':
case 'F':
goto Lreal;
case '_':
++p;
base = 8;
break;
case 'L':
if (p[1] == 'i')
goto Lreal;
break;
default:
break;
}
}
while (1)
{
c = *p;
switch (c)
{
case '0':
case '1':
++p;
d = c - '0';
break;
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
if (base == 2 && !err)
{
error("binary digit expected");
err = true;
}
++p;
d = c - '0';
break;
case '8':
case '9':
++p;
if (base < 10 && !err)
{
error("radix %d digit expected, not '%c'", base, c);
err = true;
}
d = c - '0';
break;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
++p;
if (base != 16)
{
if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
goto Lreal;
if (!err)
{
error("radix %d digit expected, not '%c'", base, c);
err = true;
}
}
if (c >= 'a')
d = c + 10 - 'a';
else
d = c + 10 - 'A';
break;
case 'L':
if (p[1] == 'i')
goto Lreal;
goto Ldone;
case '.':
if (p[1] == '.')
goto Ldone;
// if ".."
if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
goto Ldone;
// if ".identifier" or ".unicode"
goto Lreal;
// otherwise as part of a floating point literal
case 'p':
case 'P':
case 'i':
Lreal:
p = start;
return inreal(t);
case '_':
++p;
continue;
default:
goto Ldone;
}
// Avoid expensive overflow check if we aren't at risk of overflow
if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
n = n * base + d;
else
{
import core.checkedint : mulu, addu;
n = mulu(n, base, overflow);
n = addu(n, d, overflow);
}
}
Ldone:
if (overflow && !err)
{
error("integer overflow");
err = true;
}
enum FLAGS : int
{
FLAGS_none = 0,
FLAGS_decimal = 1, // decimal
FLAGS_unsigned = 2, // u or U suffix
FLAGS_long = 4, // L suffix
}
alias FLAGS_none = FLAGS.FLAGS_none;
alias FLAGS_decimal = FLAGS.FLAGS_decimal;
alias FLAGS_unsigned = FLAGS.FLAGS_unsigned;
alias FLAGS_long = FLAGS.FLAGS_long;
FLAGS flags = (base == 10) ? FLAGS_decimal : FLAGS_none;
// Parse trailing 'u', 'U', 'l' or 'L' in any combination
const psuffix = p;
while (1)
{
FLAGS f;
switch (*p)
{
case 'U':
case 'u':
f = FLAGS_unsigned;
goto L1;
case 'l':
f = FLAGS_long;
error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
goto L1;
case 'L':
f = FLAGS_long;
L1:
p++;
if ((flags & f) && !err)
{
error("unrecognized token");
err = true;
}
flags = cast(FLAGS)(flags | f);
continue;
default:
break;
}
break;
}
if (base == 8 && n >= 8)
error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead", n, p - psuffix, psuffix, n, p - psuffix, psuffix);
TOK result;
switch (flags)
{
case FLAGS_none:
/* Octal or Hexadecimal constant.
* First that fits: int, uint, long, ulong
*/
if (n & 0x8000000000000000L)
result = TOKuns64v;
else if (n & 0xFFFFFFFF00000000L)
result = TOKint64v;
else if (n & 0x80000000)
result = TOKuns32v;
else
result = TOKint32v;
break;
case FLAGS_decimal:
/* First that fits: int, long, long long
*/
if (n & 0x8000000000000000L)
{
if (!err)
{
error("signed integer overflow");
err = true;
}
result = TOKuns64v;
}
else if (n & 0xFFFFFFFF80000000L)
result = TOKint64v;
else
result = TOKint32v;
break;
case FLAGS_unsigned:
case FLAGS_decimal | FLAGS_unsigned:
/* First that fits: uint, ulong
*/
if (n & 0xFFFFFFFF00000000L)
result = TOKuns64v;
else
result = TOKuns32v;
break;
case FLAGS_decimal | FLAGS_long:
if (n & 0x8000000000000000L)
{
if (!err)
{
error("signed integer overflow");
err = true;
}
result = TOKuns64v;
}
else
result = TOKint64v;
break;
case FLAGS_long:
if (n & 0x8000000000000000L)
result = TOKuns64v;
else
result = TOKint64v;
break;
case FLAGS_unsigned | FLAGS_long:
case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
result = TOKuns64v;
break;
default:
debug
{
printf("%x\n", flags);
}
assert(0);
}
t.uns64value = n;
return result;
}
/**************************************
* Read in characters, converting them to real.
* Bugs:
* Exponent overflow not detected.
* Too much requested precision is not detected.
*/
final TOK inreal(Token* t)
{
//printf("Lexer::inreal()\n");
debug
{
assert(*p == '.' || isdigit(*p));
}
stringbuffer.reset();
auto pstart = p;
bool hex = false;
dchar c = *p++;
// Leading '0x'
if (c == '0')
{
c = *p++;
if (c == 'x' || c == 'X')
{
hex = true;
c = *p++;
}
}
// Digits to left of '.'
while (1)
{
if (c == '.')
{
c = *p++;
break;
}
if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
{
c = *p++;
continue;
}
break;
}
// Digits to right of '.'
while (1)
{
if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
{
c = *p++;
continue;
}
break;
}
if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
{
c = *p++;
if (c == '-' || c == '+')
{
c = *p++;
}
bool anyexp = false;
while (1)
{
if (isdigit(c))
{
anyexp = true;
c = *p++;
continue;
}
if (c == '_')
{
c = *p++;
continue;
}
if (!anyexp)
error("missing exponent");
break;
}
}
else if (hex)
error("exponent required for hex float");
--p;
while (pstart < p)
{
if (*pstart != '_')
stringbuffer.writeByte(*pstart);
++pstart;
}
stringbuffer.writeByte(0);
auto sbufptr = cast(const(char)*)stringbuffer.data;
TOK result;
t.float80value = Port.strtold(sbufptr, null);
errno = 0;
switch (*p)
{
case 'F':
case 'f':
// Only interested in errno return
cast(void)Port.strtof(sbufptr, null);
result = TOKfloat32v;
p++;
break;
default:
/* Should do our own strtod(), since dmc and linux gcc
* accept 2.22507e-308, while apple gcc will only take
* 2.22508e-308. Not sure who is right.
*/
// Only interested in errno return
cast(void)Port.strtod(sbufptr, null);
result = TOKfloat64v;
break;
case 'l':
error("use 'L' suffix instead of 'l'");
goto case 'L';
case 'L':
result = TOKfloat80v;
p++;
break;
}
if (*p == 'i' || *p == 'I')
{
if (*p == 'I')
error("use 'i' suffix instead of 'I'");
p++;
switch (result)
{
case TOKfloat32v:
result = TOKimaginary32v;
break;
case TOKfloat64v:
result = TOKimaginary64v;
break;
case TOKfloat80v:
result = TOKimaginary80v;
break;
default:
break;
}
}
if (errno == ERANGE)
{
const char* suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : "";
error(scanloc, "number '%s%s' is not representable", sbufptr, suffix);
}
debug
{
switch (result)
{
case TOKfloat32v:
case TOKfloat64v:
case TOKfloat80v:
case TOKimaginary32v:
case TOKimaginary64v:
case TOKimaginary80v:
break;
default:
assert(0);
}
}
return result;
}
final Loc loc()
{
scanloc.charnum = cast(uint)(1 + p - line);
return scanloc;
}
final void error(const(char)* format, ...)
{
va_list ap;
va_start(ap, format);
.verror(token.loc, format, ap);
va_end(ap);
errors = true;
}
final void error(Loc loc, const(char)* format, ...)
{
va_list ap;
va_start(ap, format);
.verror(loc, format, ap);
va_end(ap);
errors = true;
}
final void deprecation(const(char)* format, ...)
{
va_list ap;
va_start(ap, format);
.vdeprecation(token.loc, format, ap);
va_end(ap);
if (global.params.useDeprecated == 0)
errors = true;
}
/*********************************************
* parse:
* #line linnum [filespec]
* also allow __LINE__ for linnum, and __FILE__ for filespec
*/
final void poundLine()
{
auto linnum = this.scanloc.linnum;
const(char)* filespec = null;
const loc = this.loc();
Token tok;
scan(&tok);
if (tok.value == TOKint32v || tok.value == TOKint64v)
{
const lin = cast(int)(tok.uns64value - 1);
if (lin != tok.uns64value - 1)
error("line number %lld out of range", cast(ulong)tok.uns64value);
else
linnum = lin;
}
else if (tok.value == TOKline)
{
}
else
goto Lerr;
while (1)
{
switch (*p)
{
case 0:
case 0x1A:
case '\n':
Lnewline:
this.scanloc.linnum = linnum;
if (filespec)
this.scanloc.filename = filespec;
return;
case '\r':
p++;
if (*p != '\n')
{
p--;
goto Lnewline;
}
continue;
case ' ':
case '\t':
case '\v':
case '\f':
p++;
continue;
// skip white space
case '_':
if (memcmp(p, "__FILE__".ptr, 8) == 0)
{
p += 8;
filespec = mem.xstrdup(scanloc.filename);
continue;
}
goto Lerr;
case '"':
if (filespec)
goto Lerr;
stringbuffer.reset();
p++;
while (1)
{
uint c;
c = *p;
switch (c)
{
case '\n':
case '\r':
case 0:
case 0x1A:
goto Lerr;
case '"':
stringbuffer.writeByte(0);
filespec = mem.xstrdup(cast(const(char)*)stringbuffer.data);
p++;
break;
default:
if (c & 0x80)
{
uint u = decodeUTF();
if (u == PS || u == LS)
goto Lerr;
}
stringbuffer.writeByte(c);
p++;
continue;
}
break;
}
continue;
default:
if (*p & 0x80)
{
uint u = decodeUTF();
if (u == PS || u == LS)
goto Lnewline;
}
goto Lerr;
}
}
Lerr:
error(loc, "#line integer [\"filespec\"]\\n expected");
}
/********************************************
* Decode UTF character.
* Issue error messages for invalid sequences.
* Return decoded character, advance p to last character in UTF sequence.
*/
final uint decodeUTF()
{
const s = p;
assert(*s & 0x80);
// Check length of remaining string up to 6 UTF-8 characters
size_t len;
for (len = 1; len < 6 && s[len]; len++)
{
}
size_t idx = 0;
dchar u;
const msg = utf_decodeChar(s, len, idx, u);
p += idx - 1;
if (msg)
{
error("%s", msg);
}
return u;
}
/***************************************************
* Parse doc comment embedded between t->ptr and p.
* Remove trailing blanks and tabs from lines.
* Replace all newlines with \n.
* Remove leading comment character from each line.
* Decide if it's a lineComment or a blockComment.
* Append to previous one for this token.
*/
final void getDocComment(Token* t, uint lineComment)
{
/* ct tells us which kind of comment it is: '/', '*', or '+'
*/
const ct = t.ptr[2];
/* Start of comment text skips over / * *, / + +, or / / /
*/
const(char)* q = t.ptr + 3; // start of comment text
const(char)* qend = p;
if (ct == '*' || ct == '+')
qend -= 2;
/* Scan over initial row of ****'s or ++++'s or ////'s
*/
for (; q < qend; q++)
{
if (*q != ct)
break;
}
/* Remove leading spaces until start of the comment
*/
int linestart = 0;
if (ct == '/')
{
while (q < qend && (*q == ' ' || *q == '\t'))
++q;
}
else if (q < qend)
{
if (*q == '\r')
{
++q;
if (q < qend && *q == '\n')
++q;
linestart = 1;
}
else if (*q == '\n')
{
++q;
linestart = 1;
}
}
/* Remove trailing row of ****'s or ++++'s
*/
if (ct != '/')
{
for (; q < qend; qend--)
{
if (qend[-1] != ct)
break;
}
}
/* Comment is now [q .. qend].
* Canonicalize it into buf[].
*/
OutBuffer buf;
for (; q < qend; q++)
{
char c = *q;
switch (c)
{
case '*':
case '+':
if (linestart && c == ct)
{
linestart = 0;
/* Trim preceding whitespace up to preceding \n
*/
while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
buf.offset--;
continue;
}
break;
case ' ':
case '\t':
break;
case '\r':
if (q[1] == '\n')
continue;
// skip the \r
goto Lnewline;
default:
if (c == 226)
{
// If LS or PS
if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
{
q += 2;
goto Lnewline;
}
}
linestart = 0;
break;
Lnewline:
c = '\n'; // replace all newlines with \n
goto case;
case '\n':
linestart = 1;
/* Trim trailing whitespace
*/
while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
buf.offset--;
break;
}
buf.writeByte(c);
}
/* Trim trailing whitespace (if the last line does not have newline)
*/
if (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
{
while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
buf.offset--;
}
// Always end with a newline
if (!buf.offset || buf.data[buf.offset - 1] != '\n')
buf.writeByte('\n');
buf.writeByte(0);
// It's a line comment if the start of the doc comment comes
// after other non-whitespace on the same line.
const(char)** dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
// Combine with previous doc comment, if any
if (*dc)
*dc = combineComments(*dc, cast(const(char)*)buf.data);
else
*dc = cast(const(char)*)buf.extractData();
}
/********************************************
* Combine two document comments into one,
* separated by a newline.
*/
final static const(char)* combineComments(const(char)* c1, const(char)* c2)
{
//printf("Lexer::combineComments('%s', '%s')\n", c1, c2);
auto c = c2;
if (c1)
{
c = c1;
if (c2)
{
size_t len1 = strlen(c1);
size_t len2 = strlen(c2);
int insertNewLine = 0;
if (len1 && c1[len1 - 1] != '\n')
{
++len1;
insertNewLine = 1;
}
auto p = cast(char*)mem.xmalloc(len1 + 1 + len2 + 1);
memcpy(p, c1, len1 - insertNewLine);
if (insertNewLine)
p[len1 - 1] = '\n';
p[len1] = '\n';
memcpy(p + len1 + 1, c2, len2);
p[len1 + 1 + len2] = 0;
c = p;
}
}
return c;
}
private:
final void endOfLine()
{
scanloc.linnum++;
line = p;
}
}