dexed/src/ce_dlang.pas

unit ce_dlang;

{$mode objfpc}{$H+}

interface

uses
  Classes, SysUtils, ce_dlangutils;

const

  D2Kw: array[0..109] of string =
  ( 'abstract', 'alias', 'align', 'asm', 'assert', 'auto',
    'body', 'bool', 'break', 'byte',
    'case', 'cast', 'catch', 'cdouble', 'cent', 'cfloat', 'char', 'class',
    'const', 'continue', 'creal',
    'dchar', 'debug', 'default', 'delegate', 'delete', 'deprecated', 'do', 'double',
    'else', 'enum', 'export', 'extern',
    'false', 'final', 'finally', 'float', 'for', 'foreach',
    'foreach_reverse', 'function',
    'goto',
    'idouble', 'if', 'ifloat', 'immutable', 'import', 'in', 'inout', 'int',
    'interface', 'invariant', 'ireal', 'is',
    'lazy', 'long',
    'macro', 'mixin', 'module',
    'new', 'nothrow', 'null',
    'out', 'override',
    'package', 'pragma', 'private', 'protected', 'ptrdiff_t', 'public', 'pure',
    'real', 'ref', 'return',
    'size_t', 'scope', 'shared', 'short', 'static', 'string', 'struct',
    'super', 'switch', 'synchronized',
    'template', 'this', 'throw', 'true', 'try', 'typedef', 'typeid', 'typeof',
    'ubyte', 'ucent', 'uint', 'ulong', 'union', 'unittest', 'ushort',
    'version', 'void', 'volatile',
    'wchar', 'while', 'with',
    '__FILE__', '__MODULE__', '__LINE__', '__FUNCTION__', '__PRETTY_FUNCTION__'
  );

type

  (**
   * sector for an array of Keyword with a common hash.
   *)
  TD2DictionaryEntry = record
    filled: Boolean;
    values: array of string;
  end;

  (**
   * Dictionary for the D2 keywords.
   *)
  TD2Dictionary = object
  private
    fLongest, fShortest: NativeInt;
    fEntries: array[Byte] of TD2DictionaryEntry;
    function toHash(const aValue: string): Byte; {$IFNDEF DEBUG}inline;{$ENDIF}
    procedure addEntry(const aValue: string);
  public
    constructor create;
    destructor destroy; // do not remove even if empty (compat with char-map version)
    function find(const aValue: string): boolean;
  end;

  (**
   * Represents the pointer in a source file.
   * Automatically updates the line and the column.
   *)
  TReaderHead = object
  private
    fLineIndex: Integer;
    fColumnIndex: Integer;
    fAbsoluteIndex: Integer;
    fReaderHead: PChar;
    function getColAndLine: TPoint;
  public
    constructor create(const aText: PChar; const aColAndLine: TPoint);
    procedure setReader(const aText: PChar; const aColAndLine: TPoint);
    //
    function next: PChar;
    function previous: PChar;
    //
    property AbsoluteIndex: Integer read fAbsoluteIndex;
    property LineIndex: Integer read fLineIndex;
    property ColumnIndex: Integer read fColumnIndex;
    property LineAnColumn: TPoint read getColAndLine;
    //
    property head: PChar read fReaderHead;
  end;

  TLexTokenKind = (ltkIllegal, ltkChar, ltkComment, ltkIdentifier, ltkKeyword,
    ltkNumber, ltkOperator, ltkString, ltkSymbol);

const
  LexTokenKindString : array[TLexTokenKind] of string =
    ( 'Illegal', 'Character', 'Comment', 'Identifier', 'Keyword',
      'Number', 'Operator', 'String', 'Symbol');

type

  (*****************************************************************************
   * Lexer token
   *)
  PLexToken = ^TLexToken;
  TLexToken = record
    position: TPoint;
    kind: TLexTokenKind;
    data: string;
  end;

  (*****************************************************************************
   * List of lexer tokens
   *)
  TLexTokenList = class(TList)
  private
    function getToken(index: integer): TLexToken;
  public
    procedure clear; override;
    procedure addToken(aValue: PLexToken);
    property token[index: integer]: TLexToken read getToken;
  end;

  TLexTokenEnumerator = class
    fList: TLexTokenList;
    fIndex: Integer;
    function GetCurrent: TLexToken;
    function MoveNext: Boolean;
    property Current: TLexToken read GetCurrent;
  end;

  (*****************************************************************************
   * Error record
   *)
  PLexError = ^TLexError;
  TLexError = record
    position: TPoint;
    msg: string;
  end;

  (*****************************************************************************
   * Error list
   *)
  TLexErrorList = class(TList)
  private
    function getError(index: integer): TLexError;
  public
    procedure clear; override;
    procedure addError(aValue: PLexError);
    property error[index: integer]: TLexError read getError;
  end;

  TLexErrorEnumerator = class
    fList: TLexErrorList;
    fIndex: Integer;
    function GetCurrent: TLexError;
    function MoveNext: Boolean;
    property Current: TLexError read GetCurrent;
  end;

  operator enumerator(aTokenList: TLexTokenList): TLexTokenEnumerator;
  operator enumerator(anErrorList: TLexErrorList): TLexErrorEnumerator;

  (*****************************************************************************
   * Lexes aText and fills aList with the TLexToken found.
   *)
  procedure lex(const aText: string; const aList: TLexTokenList);

  (*****************************************************************************
   * Detects various syntactic errors in a TLexTokenList
   *)
  procedure checkSyntacticErrors(const aTokenList: TLexTokenList; const anErrorList: TLexErrorList);

  (*****************************************************************************
   * Outputs the module name from a tokenized D source.
   *)
  function getModuleName(const aTokenList: TLexTokenList):string;

  (*****************************************************************************
   * Compares two TPoints.
   *)
  operator = (lhs: TPoint; rhs: TPoint): boolean;

implementation

var
  D2Dictionary: TD2Dictionary;

{$REGION TReaderHead------------------------------------------------------------}
operator = (lhs: TPoint; rhs: TPoint): boolean;
begin
  exit( (lhs.y = rhs.y) and (lhs.x = rhs.x) );
end;

constructor TReaderHead.create(const aText: PChar; const aColAndLine: TPoint);
begin
  setReader(aText,aColAndLine);
end;

procedure TReaderHead.setReader(const aText: PChar; const aColAndLine: TPoint);
begin
  fLineIndex := aColAndLine.y;
  fColumnIndex := aColAndLine.x;
  fReaderHead := aText;
  while (LineAnColumn <> aColAndLine) do next;
  //
  // editor not 0 based ln index
  if fLineIndex = 0 then fLineIndex := 1;
end;

function TReaderHead.getColAndLine: TPoint;
begin
  exit( Point(fColumnIndex, fLineIndex) );
end;

function TReaderHead.next: PChar;
begin
  Inc(fReaderHead);
  Inc(fAbsoluteIndex);
  Inc(fColumnIndex);
  if (fReaderHead^ = #10) then
  begin
    Inc(fLineIndex);
    fColumnIndex := 0;
  end;
  exit(fReaderHead);
end;

function TReaderHead.previous: PChar;
begin
  // note: it breaks the column but not the line count
  Dec(fReaderHead);
  Dec(fColumnIndex);
  Dec(fAbsoluteIndex);
  exit(fReaderHead);
end;
{$ENDREGION}

{$REGION TD2Dictionary----------------------------------------------------------}
constructor TD2Dictionary.create;
var
  value: string;
begin
  for value in D2Kw do
    addEntry(value);
end;

destructor TD2Dictionary.destroy;
begin
end;

{$IFDEF DEBUG}{$R-}{$ENDIF}
function TD2Dictionary.toHash(const aValue: string): Byte;
var
  i: Integer;
begin
  result := 0;
	for i := 1 to length(aValue) do result +=
    (Byte(aValue[i]) shl (4 and (1-i))) xor 25;
end;
{$IFDEF DEBUG}{$R+}{$ENDIF}

procedure TD2Dictionary.addEntry(const aValue: string);
var
  hash: Byte;
begin
  if find(aValue) then exit;
  hash := toHash(aValue);
  fEntries[hash].filled := true;
  setLength(fEntries[hash].values, length(fEntries[hash].values) + 1);
  fEntries[hash].values[high(fEntries[hash].values)] := aValue;
  if fLongest <= length(aValue) then
    fLongest := length(aValue);
  if fShortest >= length(aValue) then
    fShortest := length(aValue);
end;

function TD2Dictionary.find(const aValue: string): boolean;
var
  hash: Byte;
  i: NativeInt;
begin
  result := false;
  if length(aValue) > fLongest then exit;
  if length(aValue) < fShortest then exit;
  hash := toHash(aValue);
  if (not fEntries[hash].filled) then exit(false);
  for i:= 0 to high(fEntries[hash].values) do
    if fEntries[hash].values[i] = aValue then exit(true);
end;
{$ENDREGION}

{$REGION Lexing-----------------------------------------------------------------}
function TLexTokenList.getToken(index: integer): TLexToken;
begin
  result := PLexToken(Items[index])^;
end;

procedure TLexTokenList.clear;
begin
  while Count > 0 do
  begin
    Dispose( PLexToken(Items[Count-1]) );
    Delete(Count-1);
  end;
end;

procedure TLexTokenList.addToken(aValue: PLexToken);
begin
  add(Pointer(aValue));
end;

function TLexTokenEnumerator.GetCurrent: TLexToken;
begin
  exit(fList.token[fIndex]);
end;

function TLexTokenEnumerator.MoveNext: Boolean;
begin
  Inc(fIndex);
  exit(fIndex < fList.Count);
end;

operator enumerator(aTokenList: TLexTokenList): TLexTokenEnumerator;
begin
  result := TLexTokenEnumerator.Create;
  result.fList := aTokenList;
  result.fIndex := -1;
end;

{$BOOLEVAL ON}
procedure lex(const aText: string; const aList: TLexTokenList);
var
  reader: TReaderHead;
  identifier: string;

  function isOutOfBound: boolean;
  begin
    exit(reader.AbsoluteIndex > length(aText))
  end;

  procedure addToken(aTk: TLexTokenKind);
  var
    ptk: PLexToken;
  begin
    ptk := new(PLexToken);
    ptk^.kind := aTk;
    ptk^.position := reader.LineAnColumn;
    ptk^.data := identifier;
    aList.Add(ptk);
  end;

begin

  reader.create(@aText[1], Point(0,0));
  while (true) do
  begin

    if isOutOfBound then
      exit;

    identifier := '';

    // skip blanks
    while isWhite(reader.head^) do
    begin
      reader.next;
      if isOutOfBound then exit;
    end;

    // line comment
    if (reader.head^ = '/') then
    begin
      if (reader.next^ = '/') then
      begin
        if isOutOfBound then exit;
        while (reader.head^ <> #10) do
        begin
          reader.next;
          identifier += reader.head^;
          if isOutOfBound then exit;
        end;
        reader.next;
        addToken(ltkComment);
        continue;
      end
      else
        reader.previous;
    end;

    // block comments 1
    if (reader.head^ = '/') then
    begin
      if (reader.next^ = '*') then
      begin
        if isOutOfBound then exit;
        while (reader.head^ <> '*') or (reader.next^ <> '/') do
          if isOutOfBound then exit;
        reader.next;
        addToken(ltkComment);
        continue;
      end
      else
        reader.previous;
    end;

    // block comments 2
    if (reader.head^ = '/') then
    begin
      if (reader.next^ = '+') then
      begin
        if isOutOfBound then exit;
        while (reader.head^ <> '+') or (reader.next^ <> '/') do
          if isOutOfBound then exit;
        reader.next;
        addToken(ltkComment);
        continue;
      end
      else
        reader.previous;
    end;

    // string 1, note: same escape error as in SynD2Syn
    if (reader.head^ in ['r', 'x']) then
    begin
      if not (reader.next^ = '"') then
        reader.previous;
    end;
    if (reader.head^ = '"') then
    begin
      reader.next;
      if isOutOfBound then exit;
      if (reader.head^ = '"') then
      begin
        reader.next;
        addToken(ltkString);
        continue;
      end;
      while (true) do
      begin
        if reader.head^ = '\' then
        begin
          reader.next;
          if (reader.head^ = '"') then
          begin
            reader.next;
            continue;
          end;
        end;
        if (reader.head^ = '"') then
          break;
        identifier += reader.head^;
        reader.next;
        if isOutOfBound then exit;
      end;
      if isStringPostfix(reader.next^) then
        reader.next;
      addToken(ltkString);
      continue;
    end;

    // string 2
    if (reader.head^ = '`') then
    begin
      reader.next;
      if isOutOfBound then exit;
      while (reader.head^ <> '`') do
      begin
        identifier += reader.head^;
        reader.next;
        if isOutOfBound then exit;
      end;
      if isStringPostfix(reader.next^) then
        reader.next;
      if isOutOfBound then exit;
      addToken(ltkString);
      continue;
    end;

    // token string
    if (reader.head^ = 'q') and (reader.next^ = '{') then
    begin
      reader.next;
      if isOutOfBound then exit;
      while (reader.head^ <> '}') do
      begin
        identifier += reader.head^;
        reader.next;
        if isOutOfBound then exit;
      end;
      reader.next;
      addToken(ltkString);
      continue;
    end else reader.previous;

    //chars, note: same escape error as in SynD2Syn
    if (reader.head^ = #39) then
    begin
      reader.next;
      if isOutOfBound then exit;
      if (reader.head^ = #39) then
      begin
        reader.next;
        addToken(ltkString);
        continue;
      end;
      while (true) do
      begin
        if reader.head^ = '\' then
        begin
          reader.next;
          if (reader.head^ = #39) then
          begin
            reader.next;
            continue;
          end;
        end;
        if (reader.head^ = #39) then
          break;
        identifier += reader.head^;
        reader.next;
        if isOutOfBound then exit;
      end;
      reader.next;
      addToken(ltkChar);
      continue;
    end;

    // check negative float '-0.'
    if (reader.head^ = '-') then
    begin
      identifier += reader.head^;
      if reader.next^ = '0' then
      begin
        if reader.next^ = '.' then
          reader.previous // back to 0, get into "binary/hex numbr/float"
        else
          begin
            reader.previous;
            reader.previous; // back to -
            identifier := '';
          end;
      end
      else
      begin
        reader.previous; // back to -
        identifier := '';
      end;
    end;

    // + suffixes
    // + exponent
    // float .xxxx

    // binary/hex numbr/float
    if (reader.head^ = '0') then
    begin
      identifier += reader.head^;
      if (reader.next^ in ['b','B']) then
      begin
        identifier += reader.head^;
        while isBit(reader.next^) or (reader.head^ = '_') do
        begin
          if isOutOfBound then exit;
          identifier += reader.head^;
        end;
        addToken(ltkNumber);
        continue;
      end
      else reader.previous;
      if (reader.next^ in ['x','X']) then
      begin
        identifier += reader.head^;
        while isHex(reader.next^) or (reader.head^ = '_') do
        begin
          if isOutOfBound then exit;
          identifier += reader.head^;
        end;
        addToken(ltkNumber);
        continue;
      end
      else reader.previous;
      if (reader.next^ = '.') then
      begin
        identifier += reader.head^;
        while isNumber(reader.next^) do
        begin
          if isOutOfBound then exit;
          identifier += reader.head^;
        end;
        addToken(ltkNumber);
        continue;
      end
      else reader.previous;
      identifier := '';
    end;

    // check negative float/int '-xxx'
    if (reader.head^ = '-') then
    begin
      identifier += reader.head^;
      if not isNumber(reader.next^) then
      begin
        reader.previous; // back to '-'
        identifier := '';
      end;
    end;

    // numbers
    if isNumber(reader.head^) then
    begin
      identifier += reader.head^;
      while isNumber(reader.next^) or (reader.head^ = '_')  do
      begin
        if isOutOfBound then exit;
        identifier += reader.head^;
      end;
      addToken(ltkNumber);
      continue;
    end;

    // symbols
    if isSymbol(reader.head^) then
    begin
      identifier += reader.head^;
      reader.next;
      addToken(ltkSymbol);
      if isOutOfBound then exit;
      continue;
    end;

    // operators
    if isOperator1(reader.head^) then
    begin
      identifier += reader.head^;
      while isOperator1(reader.next^) do
      begin
        if isOutOfBound then exit;
        identifier += reader.head^;
      end;
      case length(identifier) of
        4:begin
            if (not isOperator1(reader.head^)) and
              isOperator4(identifier) then
            begin
              addToken(ltkOperator);
              continue;
            end;
          end;
        3:begin
            if (not isOperator1(reader.head^)) and
              isOperator3(identifier) then
            begin
              addToken(ltkOperator);
              continue;
            end;
          end;
        2:begin
            if (not isOperator1(reader.head^)) and
              isOperator2(identifier) then
            begin
              addToken(ltkOperator);
              continue;
            end;
          end;
        1:begin
            if not isOperator1(reader.head^)
            then
              begin
                addToken(ltkOperator);
                continue;
              end;
          end;
      end;
    end;

    // identifier accum
    if isFirstIdentifier(reader.head^) then
    begin
      while isIdentifier(reader.head^) do
      begin
        identifier += reader.head^;
        reader.next;
        if isOutOfBound then exit;
      end;
      if D2Dictionary.find(identifier) then
        addToken(ltkKeyword)
      else
        addToken(ltkIdentifier);
      continue;
    end;

    // error
    identifier += ' (unrecognized lexer input)';
    addToken(ltkIllegal);

  end;
end;
{$BOOLEVAL OFF}
{$ENDREGION}

{$REGION Syntactic errors}
function TLexErrorList.getError(index: integer): TLexError;
begin
  result := PLexError(Items[index])^;
end;

procedure TLexErrorList.clear;
begin
  while Count > 0 do
  begin
    Dispose( PLexError(Items[Count-1]) );
    Delete(Count-1);
  end;
end;

procedure TLexErrorList.addError(aValue: PLexError);
begin
  add(Pointer(aValue));
end;

function TLexErrorEnumerator.GetCurrent: TLexError;
begin
  exit(fList.error[fIndex]);
end;

function TLexErrorEnumerator.MoveNext: Boolean;
begin
  Inc(fIndex);
  exit(fIndex < fList.Count);
end;

operator enumerator(anErrorList: TLexErrorList): TLexErrorEnumerator;
begin
  result := TLexErrorEnumerator.Create;
  result.fList := anErrorList;
  result.fIndex := -1;
end;

procedure checkSyntacticErrors(const aTokenList: TLexTokenList; const anErrorList: TLexErrorList);
const
  errPrefix = 'syntactic error: ';
var
  tk, old1, old2, lastSig: TLexToken;
  err: PLexError;
  tkIndex: NativeInt;
  pareCnt, curlCnt, squaCnt: NativeInt;
  pareLeft, curlLeft, squaLeft: boolean;
procedure addError(const aMsg: string);
begin
  err := new(PLexError);
  err^.msg := errPrefix + aMsg;
  err^.position := aTokenList.token[tkIndex].position;
  anErrorList.addError(err);
end;
label
  _preSeq;
begin

  tkIndex := -1;
  pareCnt := 0;
  curlCnt := 0;
  squaCnt := 0;
  pareLeft:= False;
  curlLeft:= False;
  squaLeft:= False;
  FillByte( old1, sizeOf(TLexToken), 0);
  FillByte( old2, sizeOf(TLexToken), 0);
  FillByte( lastSig, sizeOf(TLexToken), 0);

  for tk in aTokenList do
  begin
    Inc(tkIndex);

    // brackets count
    if tk.kind = ltkSymbol then
    begin
      case tk.data of
      '(': Inc(pareCnt);
      '{': Inc(curlCnt);
      '[': Inc(squaCnt);
      ')': Dec(pareCnt);
      '}': Dec(curlCnt);
      ']': Dec(squaCnt);
      end;

      // only for the first occurence
      if not pareLeft then if pareCnt = -1 then
      begin
        addError('a left parenthesis is missing');
        pareLeft := true;
      end;
      if not curlLeft then if curlCnt = -1 then
      begin
        addError('a left curly bracket is missing');
        curlLeft := true;
      end;
      if not squaLeft then if squaCnt = -1 then
      begin
        addError('a left square bracket is missing');
        squaLeft := true;
      end;

      // at the end
      if (tkIndex = aTokenList.Count-1) then
      begin
        if pareCnt > 0 then
          addError('a right parenthesis is missing');
        if curlCnt > 0 then
          addError('a right curly bracket is missing');
        if squaCnt > 0 then
          addError('a right square bracket is missing');
      end;

      goto _preSeq;
    end;

    // lexer invalid token
    if tk.kind = ltkIllegal then
    begin
      addError(tk.data);
      goto _preSeq;
    end;

_preSeq:

    // invalid sequences
    if tkIndex > 0 then
    begin
      // empty statements:
      if (tk.kind = ltkSymbol) and (tk.data = ';') then
        if (lastSig.kind = ltkSymbol) and (lastSig.data = ';') then
          addError('invalid syntax for empty statement');
      if tk.kind <> ltkComment then lastSig := tk;

      // suspicious double keywords
      if (old1.kind = ltkKeyword) and (tk.kind = ltkKeyword) then
        if old1.data = tk.data then
          addError('keyword is duplicated');

      // suspicious double numbers
      if (old1.kind = ltkNumber) and (tk.kind = ltkNumber) then
        addError('symbol or operator expected after number');
    end;
    if tkIndex > 1 then
    begin
    end;

    old1 := tk;
    old2 := old1;
  end;


end;

function getModuleName(const aTokenList: TLexTokenList): string;
var
  ltk: TLexToken;
  mtok: boolean;
begin
  result := '';
  mtok := false;
  for ltk in aTokenList do
  begin
    if mtok then
    begin
      if ltk.kind = ltkIdentifier then
        result += ltk.data;
      if ltk.kind = ltkSymbol then
      case ltk.data of
        '.': result += ltk.data;
        ';': exit;
      end;
    end
    else
      if ltk.kind = ltkKeyword then
        if ltk.data = 'module' then
          mtok := true;
  end;
end;
{$ENDREGION}

initialization
  D2Dictionary.create;
finalization
  D2Dictionary.destroy;
end.