-- Copyright 2006-2019 Mitchell mitchell.att.foicica.com. See License.txt.
-- AWK LPeg lexer.
-- Modified by Wolfgang Seeberg 2012, 2013.
|
||
|
|
||
|
-- The `lexer` module supplies token(), word_match() and the rule/style/fold
-- API used throughout this file.
local lexer = require('lexer')
local token = lexer.token
local word_match = lexer.word_match

-- Shorthands for the LPeg pattern constructors. `lpeg` is referenced as a
-- global here; this file does not require it itself.
local P = lpeg.P
local R = lpeg.R
local S = lpeg.S

-- The lexer object that all rules below are attached to.
local lex = lexer.new('awk')
|
||
|
|
||
|
-- Single characters the hand-written scanners compare against.
local LEFTBRACKET, RIGHTBRACKET = '[', ']'
local SLASH, BACKSLASH = '/', '\\'
local CARET = '^'
local DQUOTE = '"'

-- Line terminators.
local CR, LF = '\r', '\n'
local CRLF = '\r\n'

-- Closing delimiter for each opening delimiter that may follow a field name.
local DELIMITER_MATCHES = {
  ['('] = ')',
  ['['] = ']'
}

-- For each opening delimiter, the other kind of opening delimiter, whose
-- nesting is tracked alongside it.
local COMPANION = {
  ['('] = '[',
  ['['] = '('
}

-- Names accepted between '[:' and ':]' inside a bracket expression.
local CC = {
  alnum = 1,
  alpha = 1,
  blank = 1,
  cntrl = 1,
  digit = 1,
  graph = 1,
  lower = 1,
  print = 1,
  punct = 1,
  space = 1,
  upper = 1,
  xdigit = 1
}

-- Scanner state shared between the helper functions below.
-- Position of the closing slash of the most recently scanned regex.
local LastRegexEnd = 0
-- Position of a backslash that is the last character of a comment.
local BackslashAtCommentEnd = 0

-- Lowercase keywords after which a slash starts a regex.
local KW_BEFORE_RX = {
  case = 1,
  ['do'] = 1,
  ['else'] = 1,
  exit = 1,
  print = 1,
  printf = 1,
  ['return'] = 1
}
|
||
|
|
||
|
-- Reports whether the run of lowercase letters that ends at position `e` of
-- `input` is one of the words in KW_BEFORE_RX.
-- The run does not count when the character immediately before it is an
-- uppercase letter, a digit or an underscore, i.e. when it is only the tail
-- of a longer identifier.
-- @param input The text being lexed.
-- @param e Position of the last character to consider.
-- @return true or false.
local function findKeyword(input, e)
  -- Walk left over lowercase letters; `before` ends up on the character
  -- preceding the run, or at 0 when the run reaches the start of the text.
  local before = e
  while before > 0 and input:find("^[%l]", before) do
    before = before - 1
  end
  if before ~= 0 and input:find("^[%u%d_]", before) then
    return false
  end
  local word = input:sub(before + 1, e)
  return KW_BEFORE_RX[word] == 1
end
|
||
|
|
||
|
-- Decides, from the text that ends at position `i`, whether a slash that
-- follows it opens a regular expression.
-- Blanks are skipped backwards first. The answer then depends on the last
-- significant character: operators and the KW_BEFORE_RX keywords allow a
-- regex, while ']', ')', '.', '"' and alphanumerics do not. At a line break
-- the answer is true unless the previous line ends in a continuation
-- backslash, in which case the text before that backslash decides.
-- @param input The text being lexed.
-- @param i Position of the last character before the slash.
-- @return true or false.
local function isRegex(input, i)
  -- Ignore spaces and tabs directly in front of the slash.
  while i >= 1 and input:find('^[ \t]', i) do
    i = i - 1
  end
  if i < 1 then return true end

  if input:find("^[-!%%&(*+,:;<=>?[^{|}~\f]", i) or findKeyword(input, i) then
    return true
  end

  local ch = input:sub(i, i)
  if ch == SLASH then
    -- A slash that closed the previous regex makes this one a division
    -- operator, as in /xx/ / /yy/.
    return i ~= LastRegexEnd
  end
  if input:find('^[]%w)."]', i) then
    return false
  end
  if ch ~= LF and ch ~= CR then
    return false
  end

  -- Step over the line break (LF, CR LF, or a lone CR). Reaching the very
  -- start of the text means nothing precedes the slash.
  if i == 1 then return true end
  i = i - 1
  if ch == LF and input:sub(i, i) == CR then
    if i == 1 then return true end
    i = i - 1
  end

  -- A backslash that ends a comment is not a line continuation.
  if input:sub(i, i) ~= BACKSLASH or i == BackslashAtCommentEnd then
    return true
  end
  return isRegex(input, i - 1)
end
|
||
|
|
||
|
-- Scans the name of a character class such as 'alpha' in '[:alpha:]'.
-- @param input The text being lexed.
-- @param s Position of the first character after '[:'.
-- @param e Last position that may be examined.
-- @return The position of the ']' that ends ':]' when the name is listed in
--   CC; false when the name is unknown, a line break is met, or `e` is
--   passed without finding ':]'.
local function eatCharacterClass(input, s, e)
  for pos = s, e do
    local ch = input:sub(pos, pos)
    if ch == CR or ch == LF then
      return false
    end
    if input:sub(pos, pos + 1) == ':]' then
      local name = input:sub(s, pos - 1)
      if CC[name] == 1 then
        return pos + 1
      end
      return false
    end
  end
  return false
end
|
||
|
|
||
|
-- Scans a bracket expression of a regex.
-- A leading '^' and a ']' directly after it (or directly after the opening
-- bracket) are taken literally. '[:name:]' classes are checked with
-- eatCharacterClass(), and a backslash hides the character after it (or a
-- whole CR LF pair).
-- @param input The text being lexed.
-- @param i Position of the first character after the opening '['.
-- @param e Last position that may be examined.
-- @return The position of the closing ']', or false when the expression is
--   not terminated before a line break or before `e` is passed.
local function eatBrackets(input, i, e)
  local pos = i
  if input:sub(pos, pos) == CARET then pos = pos + 1 end
  if input:sub(pos, pos) == RIGHTBRACKET then pos = pos + 1 end

  while pos <= e do
    local ch = input:sub(pos, pos)
    if ch == CR or ch == LF then
      return false
    elseif ch == RIGHTBRACKET then
      return pos
    elseif input:sub(pos, pos + 1) == '[:' then
      local class_end = eatCharacterClass(input, pos + 2, e)
      if not class_end then return false end
      pos = class_end + 1
    elseif ch == BACKSLASH then
      -- Skip the backslash together with what it escapes.
      if input:sub(pos + 1, pos + 2) == CRLF then
        pos = pos + 3
      else
        pos = pos + 2
      end
    else
      pos = pos + 1
    end
  end
  return false
end
|
||
|
|
||
|
-- Scans the body of a slash-delimited regex.
-- Bracket expressions are skipped with eatBrackets(), so a slash inside
-- them does not end the regex. A backslash hides the character after it (or
-- a whole CR LF pair).
-- On success the position of the closing slash is also stored in
-- LastRegexEnd.
-- @param input The text being lexed.
-- @param i Position of the first character after the opening slash.
-- @return The position of the closing slash, or false when the regex is not
--   terminated on the same line.
local function eatRegex(input, i)
  local pos, last = i, #input
  while pos <= last do
    local ch = input:sub(pos, pos)
    if ch == CR or ch == LF then
      return false
    end
    if ch == SLASH then
      LastRegexEnd = pos
      return pos
    end
    if ch == LEFTBRACKET then
      local closing = eatBrackets(input, pos + 1, last)
      if not closing then return false end
      pos = closing + 1
    elseif ch == BACKSLASH then
      -- Skip the backslash together with what it escapes.
      if input:sub(pos + 1, pos + 2) == CRLF then
        pos = pos + 3
      else
        pos = pos + 2
      end
    else
      pos = pos + 1
    end
  end
  return false
end
|
||
|
|
||
|
-- Outcome of the latest scanGawkRegex() call that did not itself match:
-- the position just after a plain regex, or false when there is no regex.
local ScanRegexResult

-- Match-time function for regexes that use Gawk-specific escape sequences.
-- @param input The text being lexed.
-- @param index Position of the first character after the opening slash.
-- @return The position just after the closing slash when the regex contains
--   one of the escapes \B \S \s \W \w \y \< \> \` \'; false otherwise. In
--   the false case ScanRegexResult is set for scanRegex() to pick up.
local function scanGawkRegex(input, index)
  local closing = isRegex(input, index - 2) and eatRegex(input, index)
  if not closing then
    ScanRegexResult = false
    return false
  end

  local after = closing + 1
  local text = input:sub(index - 1, closing)
  for backslashes in text:gmatch("[^\\](\\+)[BSsWwy<>`']") do
    -- An odd number of backslashes leaves the last one escaping the
    -- letter: /\S/ is special, but /\\S/ is not.
    if #backslashes % 2 == 1 then
      return after
    end
  end

  ScanRegexResult = after
  return false
end

-- Match-time function for plain regexes. It reports what scanGawkRegex()
-- recorded, so it must only be called immediately after scanGawkRegex().
-- @return The position just after the regex, or false.
local function scanRegex()
  return ScanRegexResult
end
|
||
|
|
||
|
-- Scans a double-quoted string.
-- A backslash hides the character after it, or a whole CR LF pair
-- (lexer.delimited_range() doesn't handle CRLF).
-- @param input The text being lexed.
-- @param index Position of the first character after the opening quote.
-- @return The position just after the closing quote, or false when the
--   string is not terminated on the same line.
local function scanString(input, index)
  local pos, last = index, #input
  while pos <= last do
    local ch = input:sub(pos, pos)
    if ch == CR or ch == LF then
      return false
    end
    if ch == DQUOTE then
      return pos + 1
    end
    if ch == BACKSLASH then
      -- Skip the backslash together with what it escapes.
      if input:sub(pos + 1, pos + 2) == CRLF then
        pos = pos + 3
      else
        pos = pos + 2
      end
    else
      pos = pos + 1
    end
  end
  return false
end
|
||
|
|
||
|
-- Consumes the rest of a comment line.
-- When the line ends in a backslash, its position is stored in
-- BackslashAtCommentEnd so that isRegex() does not treat it as a line
-- continuation and walk back into the comment.
-- @param input The text being lexed.
-- @param index Position of the first character after '#'.
-- @return The position of the line break that ends the comment, or one past
--   the end of the text.
local function scanComment(input, index)
  -- The pattern can match nothing, in which case `last` is index - 1.
  local last = select(2, input:find('[^\r\n]*', index))
  if input:sub(last, last) == BACKSLASH then
    BackslashAtCommentEnd = last
  end
  return last + 1
end
|
||
|
|
||
|
-- Scans a parenthesised or bracketed field expression, e.g. the '(x)' in
-- '$(x)' or the '[x]' in '$a[x]'.
-- The delimiter just before `index` picks the pair to balance. Nesting of
-- the other pair (see COMPANION) is tracked too and must be balanced by the
-- time the outer delimiter closes. Strings and regexes inside the
-- expression are skipped as a whole, and a backslash followed by a line
-- break continues the expression on the next line.
-- @param input The text being lexed.
-- @param index Position of the first character after the opening delimiter.
-- @return The position just after the matching closing delimiter, or false
--   when the expression is unbalanced or runs into a '#' or an unescaped
--   line break.
local function scanFieldDelimiters(input, index)
  local opener = input:sub(index - 1, index - 1)
  local closer = DELIMITER_MATCHES[opener]
  local other_opener = COMPANION[opener]
  local other_closer = DELIMITER_MATCHES[other_opener]
  local depth, other_depth = 1, 0

  local pos, last = index, #input
  while pos <= last do
    local ch = input:sub(pos, pos)
    if ch == '#' or ch == CR or ch == LF then
      return false
    elseif ch == closer then
      depth = depth - 1
      if depth == 0 then
        if other_depth ~= 0 then return false end
        return pos + 1
      end
    elseif ch == opener then
      depth = depth + 1
    elseif ch == other_closer then
      other_depth = other_depth - 1
      if other_depth < 0 then return false end
    elseif ch == other_opener then
      other_depth = other_depth + 1
    elseif ch == DQUOTE then
      local after_string = scanString(input, pos + 1)
      if not after_string then return false end
      -- Land on the closing quote; the common step below moves past it.
      pos = after_string - 1
    elseif ch == SLASH then
      -- A slash that is not a regex start is just an operator.
      if isRegex(input, pos - 1) then
        local closing = eatRegex(input, pos + 1)
        if not closing then return false end
        pos = closing
      end
    elseif ch == BACKSLASH then
      -- Line continuation: step over the line break as well.
      if input:sub(pos + 1, pos + 2) == CRLF then
        pos = pos + 2
      else
        local following = input:sub(pos + 1, pos + 1)
        if following == CR or following == LF then
          pos = pos + 1
        end
      end
    end
    pos = pos + 1
  end
  return false
end
|
||
|
|
||
|
-- Whitespace.
local whitespace = token(lexer.WHITESPACE, lexer.space^1)
lex:add_rule('whitespace', whitespace)

-- Comments: '#' up to the end of the line.
local comment = token(lexer.COMMENT, '#' * P(scanComment))
lex:add_rule('comment', comment)

-- Strings.
local dq_string = token(lexer.STRING, DQUOTE * P(scanString))
lex:add_rule('string', dq_string)

-- Decimal numbers with an optional fraction and exponent.
-- No leading sign because it might be binary.
local digits = lexer.digit^1
local mantissa = digits * ('.' * lexer.digit^0)^-1 + '.' * digits
local exponent = S('eE') * S('+-')^-1 * digits
local float = mantissa * exponent^-1

-- Fields. E.g. $1, $a, $(x), $a(x), $a[x], $"1", $$a, etc.
local field_prefix = P('$') * S('$+-')^0
local call_field = lexer.word^0 * '(' * P(scanFieldDelimiters)
local name_field = lexer.word^1 * ('[' * P(scanFieldDelimiters))^-1
local string_field = '"' * P(scanString)
local regex_field = '/' * P(eatRegex) * '/'
lex:add_rule('field', token('field', field_prefix *
  (float + call_field + name_field + string_field + regex_field)))
lex:add_style('field', lexer.STYLE_LABEL)
|
||
|
|
||
|
-- Regular expressions.
-- A slash-delimited regex follows most operators or one of a few keywords
-- such as 'print' and 'case', possibly on a preceding line. Inside bracket
-- expressions it may contain unescaped slashes and brackets. Some escape
-- sequences like '\S', '\s' have special meanings with Gawk; tokens that
-- contain them get their own style.
-- The 'regex' rule must directly follow 'gawkRegex': scanRegex() only
-- reports what scanGawkRegex() has just computed.
local gawk_regex = token('gawkRegex', SLASH * P(scanGawkRegex))
lex:add_rule('gawkRegex', gawk_regex)
lex:add_style('gawkRegex', lexer.STYLE_PREPROCESSOR..',underlined')
local plain_regex = token(lexer.REGEX, SLASH * P(scanRegex))
lex:add_rule('regex', plain_regex)

-- Operators. The multi-character Gawk operators are tried first.
local gawk_operator = P("|&") + "@" + "**=" + "**"
lex:add_rule('gawkOperator', token('gawkOperator', gawk_operator))
lex:add_style('gawkOperator', lexer.STYLE_OPERATOR..',underlined')
local operator = S('!%&()*+,-/:;<=>?[\\]^{|}~')
lex:add_rule('operator', token(lexer.OPERATOR, operator))

-- Numbers. Hexadecimal and octal literals get the Gawk style.
local gawk_number = lexer.hex_num + lexer.oct_num
lex:add_rule('gawkNumber', token('gawkNumber', gawk_number))
lex:add_style('gawkNumber', lexer.STYLE_NUMBER..',underlined')
lex:add_rule('number', token(lexer.NUMBER, float))
|
||
|
|
||
|
-- Keywords.
local keywords = word_match[[
  BEGIN END atan2 break close continue cos delete do else exit exp fflush for
  function getline gsub if in index int length log match next nextfile print
  printf rand return sin split sprintf sqrt srand sub substr system tolower
  toupper while
]]
lex:add_rule('keyword', token(lexer.KEYWORD, keywords))

-- Built-in variables.
local builtin_variables = word_match[[
  ARGC ARGV CONVFMT ENVIRON FILENAME FNR FS NF NR OFMT OFS ORS RLENGTH RS RSTART
  SUBSEP
]]
lex:add_rule('builtInVariable', token('builtInVariable', builtin_variables))
lex:add_style('builtInVariable', lexer.STYLE_CONSTANT)

-- Built-in variables styled as Gawk-specific.
local gawk_builtin_variables = word_match[[
  ARGIND BINMODE ERRNO FIELDWIDTHS FPAT FUNCTAB IGNORECASE LINT PREC PROCINFO
  ROUNDMODE RT SYMTAB TEXTDOMAIN
]]
lex:add_rule('gawkBuiltInVariable',
  token('gawkBuiltInVariable', gawk_builtin_variables))
lex:add_style('gawkBuiltInVariable', lexer.STYLE_CONSTANT..',underlined')

-- Functions: a word directly followed by '('; the parenthesis itself is
-- not consumed.
local function_name = lexer.word * #P('(')
lex:add_rule('function', token(lexer.FUNCTION, function_name))

-- Identifiers.
lex:add_rule('identifier', token(lexer.IDENTIFIER, lexer.word))

-- Fold points.
lex:add_fold_point(lexer.OPERATOR, '{', '}')
lex:add_fold_point(lexer.COMMENT, '#', lexer.fold_line_comments('#'))

return lex
|