iup-stack/iup/etc/lexlua/awk.lua

298 lines
8.8 KiB
Lua
Raw Permalink Normal View History

2023-02-20 16:44:45 +00:00
-- Copyright 2006-2019 Mitchell mitchell.att.foicica.com. See License.txt.
-- AWK LPeg lexer.
-- Modified by Wolfgang Seeberg 2012, 2013.
local lexer = require('lexer')
local token, word_match = lexer.token, lexer.word_match
local P, R, S = lpeg.P, lpeg.R, lpeg.S
local lex = lexer.new('awk')
local LEFTBRACKET = '['
local RIGHTBRACKET = ']'
local SLASH = '/'
local BACKSLASH = '\\'
local CARET = '^'
local CR = '\r'
local LF = '\n'
local CRLF = CR .. LF
local DQUOTE = '"'
local DELIMITER_MATCHES = {['('] = ')', ['['] = ']'}
local COMPANION = {['('] = '[', ['['] = '('}
local CC = {
alnum = 1, alpha = 1, blank = 1, cntrl = 1, digit = 1, graph = 1, lower = 1,
print = 1, punct = 1, space = 1, upper = 1, xdigit = 1
}
local LastRegexEnd = 0
local BackslashAtCommentEnd = 0
local KW_BEFORE_RX = {
case = 1, ['do'] = 1, ['else'] = 1, exit = 1, print = 1, printf = 1,
['return'] = 1
}
local function findKeyword(input, e)
local i = e
while i > 0 and input:find("^[%l]", i) do i = i - 1 end
local w = input:sub(i + 1, e)
if i == 0 then
return KW_BEFORE_RX[w] == 1
elseif input:find("^[%u%d_]", i) then
return false
else
return KW_BEFORE_RX[w] == 1
end
end
local function isRegex(input, i)
while i >= 1 and input:find('^[ \t]', i) do i = i - 1 end
if i < 1 then return true end
if input:find("^[-!%%&(*+,:;<=>?[^{|}~\f]", i) or findKeyword(input, i) then
return true
elseif input:sub(i, i) == SLASH then
return i ~= LastRegexEnd -- deals with /xx/ / /yy/.
elseif input:find('^[]%w)."]', i) then
return false
elseif input:sub(i, i) == LF then
if i == 1 then return true end
i = i - 1
if input:sub(i, i) == CR then
if i == 1 then return true end
i = i - 1
end
elseif input:sub(i, i) == CR then
if i == 1 then return true end
i = i - 1
else
return false
end
if input:sub(i, i) == BACKSLASH and i ~= BackslashAtCommentEnd then
return isRegex(input, i - 1)
else
return true
end
end
local function eatCharacterClass(input, s, e)
local i = s
while i <= e do
if input:find('^[\r\n]', i) then
return false
elseif input:sub(i, i + 1) == ':]' then
local str = input:sub(s, i - 1)
return CC[str] == 1 and i + 1
end
i = i + 1
end
return false
end
local function eatBrackets(input, i, e)
if input:sub(i, i) == CARET then i = i + 1 end
if input:sub(i, i) == RIGHTBRACKET then i = i + 1 end
while i <= e do
if input:find('^[\r\n]', i) then
return false
elseif input:sub(i, i) == RIGHTBRACKET then
return i
elseif input:sub(i, i + 1) == '[:' then
i = eatCharacterClass(input, i + 2, e)
if not i then return false end
elseif input:sub(i, i) == BACKSLASH then
i = i + 1
if input:sub(i, i + 1) == CRLF then i = i + 1 end
end
i = i + 1
end
return false
end
local function eatRegex(input, i)
local e = #input
while i <= e do
if input:find('^[\r\n]', i) then
return false
elseif input:sub(i, i) == SLASH then
LastRegexEnd = i
return i
elseif input:sub(i, i) == LEFTBRACKET then
i = eatBrackets(input, i + 1, e)
if not i then return false end
elseif input:sub(i, i) == BACKSLASH then
i = i + 1
if input:sub(i, i + 1) == CRLF then i = i + 1 end
end
i = i + 1
end
return false
end
local ScanRegexResult
local function scanGawkRegex(input, index)
if isRegex(input, index - 2) then
local i = eatRegex(input, index)
if not i then
ScanRegexResult = false
return false
end
local rx = input:sub(index - 1, i)
for bs in rx:gmatch("[^\\](\\+)[BSsWwy<>`']") do
-- /\S/ is special, but /\\S/ is not.
if #bs % 2 == 1 then return i + 1 end
end
ScanRegexResult = i + 1
else
ScanRegexResult = false
end
return false
end
-- Is only called immediately after scanGawkRegex().
local function scanRegex()
return ScanRegexResult
end
local function scanString(input, index)
local i = index
local e = #input
while i <= e do
if input:find('^[\r\n]', i) then
return false
elseif input:sub(i, i) == DQUOTE then
return i + 1
elseif input:sub(i, i) == BACKSLASH then
i = i + 1
-- lexer.delimited_range() doesn't handle CRLF.
if input:sub(i, i + 1) == CRLF then i = i + 1 end
end
i = i + 1
end
return false
end
-- purpose: prevent isRegex() from entering a comment line that ends with a
-- backslash.
local function scanComment(input, index)
local _, i = input:find('[^\r\n]*', index)
if input:sub(i, i) == BACKSLASH then BackslashAtCommentEnd = i end
return i + 1
end
local function scanFieldDelimiters(input, index)
local i = index
local e = #input
local left = input:sub(i - 1, i - 1)
local count = 1
local right = DELIMITER_MATCHES[left]
local left2 = COMPANION[left]
local count2 = 0
local right2 = DELIMITER_MATCHES[left2]
while i <= e do
if input:find('^[#\r\n]', i) then
return false
elseif input:sub(i, i) == right then
count = count - 1
if count == 0 then return count2 == 0 and i + 1 end
elseif input:sub(i, i) == left then
count = count + 1
elseif input:sub(i, i) == right2 then
count2 = count2 - 1
if count2 < 0 then return false end
elseif input:sub(i, i) == left2 then
count2 = count2 + 1
elseif input:sub(i, i) == DQUOTE then
i = scanString(input, i + 1)
if not i then return false end
i = i - 1
elseif input:sub(i, i) == SLASH then
if isRegex(input, i - 1) then
i = eatRegex(input, i + 1)
if not i then return false end
end
elseif input:sub(i, i) == BACKSLASH then
if input:sub(i + 1, i + 2) == CRLF then
i = i + 2
elseif input:find('^[\r\n]', i + 1) then
i = i + 1
end
end
i = i + 1
end
return false
end
-- Whitespace.
lex:add_rule('whitespace', token(lexer.WHITESPACE, lexer.space^1))
-- Comments.
lex:add_rule('comment', token(lexer.COMMENT, '#' * P(scanComment)))
-- Strings.
lex:add_rule('string', token(lexer.STRING, DQUOTE * P(scanString)))
-- No leading sign because it might be binary.
local float = ((lexer.digit^1 * ('.' * lexer.digit^0)^-1) +
('.' * lexer.digit^1)) *
(S('eE') * S('+-')^-1 * lexer.digit^1)^-1
-- Fields. E.g. $1, $a, $(x), $a(x), $a[x], $"1", $$a, etc.
lex:add_rule('field',
token('field', P('$') * S('$+-')^0 *
(float +
lexer.word^0 * '(' * P(scanFieldDelimiters) +
lexer.word^1 * ('[' * P(scanFieldDelimiters))^-1 +
'"' * P(scanString) +
'/' * P(eatRegex) * '/')))
lex:add_style('field', lexer.STYLE_LABEL)
-- Regular expressions.
-- Slash delimited regular expressions are preceded by most operators or
-- the keywords 'print' and 'case', possibly on a preceding line. They
-- can contain unescaped slashes and brackets in brackets. Some escape
-- sequences like '\S', '\s' have special meanings with Gawk. Tokens that
-- contain them are displayed differently.
lex:add_rule('gawkRegex', token('gawkRegex', SLASH * P(scanGawkRegex)))
lex:add_style('gawkRegex', lexer.STYLE_PREPROCESSOR..',underlined')
lex:add_rule('regex', token(lexer.REGEX, SLASH * P(scanRegex)))
-- Operators.
lex:add_rule('gawkOperator', token('gawkOperator', P("|&") + "@" + "**=" +
"**"))
lex:add_style('gawkOperator', lexer.STYLE_OPERATOR..',underlined')
lex:add_rule('operator', token(lexer.OPERATOR, S('!%&()*+,-/:;<=>?[\\]^{|}~')))
-- Numbers.
lex:add_rule('gawkNumber', token('gawkNumber', lexer.hex_num + lexer.oct_num))
lex:add_style('gawkNumber', lexer.STYLE_NUMBER..',underlined')
lex:add_rule('number', token(lexer.NUMBER, float))
-- Keywords.
lex:add_rule('keyword', token(lexer.KEYWORD, word_match[[
BEGIN END atan2 break close continue cos delete do else exit exp fflush for
function getline gsub if in index int length log match next nextfile print
printf rand return sin split sprintf sqrt srand sub substr system tolower
toupper while
]]))
lex:add_rule('builtInVariable', token('builtInVariable', word_match[[
ARGC ARGV CONVFMT ENVIRON FILENAME FNR FS NF NR OFMT OFS ORS RLENGTH RS RSTART
SUBSEP
]]))
lex:add_style('builtInVariable', lexer.STYLE_CONSTANT)
lex:add_rule('gawkBuiltInVariable', token('gawkBuiltInVariable', word_match[[
ARGIND BINMODE ERRNO FIELDWIDTHS FPAT FUNCTAB IGNORECASE LINT PREC PROCINFO
ROUNDMODE RT SYMTAB TEXTDOMAIN
]]))
lex:add_style('gawkBuiltInVariable', lexer.STYLE_CONSTANT..',underlined')
-- Functions.
lex:add_rule('function', token(lexer.FUNCTION, lexer.word * #P('(')))
-- Identifiers.
lex:add_rule('identifier', token(lexer.IDENTIFIER, lexer.word))
-- Fold points.
lex:add_fold_point(lexer.OPERATOR, '{', '}')
lex:add_fold_point(lexer.COMMENT, '#', lexer.fold_line_comments('#'))
return lex