1 files changed, 62 insertions, 112 deletions
diff --git a/lua/lexers/awk.lua b/lua/lexers/awk.lua
index 87e39d9..0b3f9bf 100644
--- a/lua/lexers/awk.lua
+++ b/lua/lexers/awk.lua
@@ -1,12 +1,12 @@
--- Copyright 2006-2017 Mitchell mitchell.att.foicica.com. See LICENSE.
+-- Copyright 2006-2022 Mitchell. See LICENSE.
 -- AWK LPeg lexer.
 -- Modified by Wolfgang Seeberg 2012, 2013.
 
-local l = require('lexer')
-local token, word_match = l.token, l.word_match
-local P, R, S = lpeg.P, lpeg.R, lpeg.S
+local lexer = require('lexer')
+local token, word_match = lexer.token, lexer.word_match
+local P, S = lpeg.P, lpeg.S
 
-local M = {_NAME = 'awk'}
+local lex = lexer.new('awk')
 
 local LEFTBRACKET = '['
 local RIGHTBRACKET = ']'
@@ -20,14 +20,13 @@ local DQUOTE = '"'
 local DELIMITER_MATCHES = {['('] = ')', ['['] = ']'}
 local COMPANION = {['('] = '[', ['['] = '('}
 local CC = {
-  alnum = 1, alpha = 1, blank = 1, cntrl = 1, digit = 1, graph = 1, lower = 1,
-  print = 1, punct = 1, space = 1, upper = 1, xdigit = 1
+  alnum = 1, alpha = 1, blank = 1, cntrl = 1, digit = 1, graph = 1, lower = 1, print = 1, punct = 1,
+  space = 1, upper = 1, xdigit = 1
 }
 local LastRegexEnd = 0
 local BackslashAtCommentEnd = 0
 local KW_BEFORE_RX = {
-  case = 1, ['do'] = 1, ['else'] = 1, exit = 1, print = 1, printf = 1,
-  ['return'] = 1
+  case = 1, ['do'] = 1, ['else'] = 1, exit = 1, print = 1, printf = 1, ['return'] = 1
 }
 
 local function findKeyword(input, e)
@@ -146,9 +145,7 @@ local function scanGawkRegex(input, index)
   return false
 end
 -- Is only called immediately after scanGawkRegex().
-local function scanRegex()
-  return ScanRegexResult
-end
+local function scanRegex() return ScanRegexResult end
 
 local function scanString(input, index)
   local i = index
@@ -160,7 +157,7 @@ local function scanString(input, index)
       return i + 1
     elseif input:sub(i, i) == BACKSLASH then
       i = i + 1
-      -- l.delimited_range() doesn't handle CRLF.
+      -- lexer.range() doesn't handle CRLF.
       if input:sub(i, i + 1) == CRLF then i = i + 1 end
     end
     i = i + 1
@@ -168,8 +165,7 @@ local function scanString(input, index)
   return false
 end
 
--- purpose: prevent isRegex() from entering a comment line that ends with a
--- backslash.
+-- Prevents isRegex() from entering a comment line that ends with a backslash.
 local function scanComment(input, index)
   local _, i = input:find('[^\r\n]*', index)
   if input:sub(i, i) == BACKSLASH then BackslashAtCommentEnd = i end
@@ -220,115 +216,69 @@ local function scanFieldDelimiters(input, index)
 end
 
 -- Whitespace.
-local ws = token(l.WHITESPACE, l.space^1)
+lex:add_rule('whitespace', token(lexer.WHITESPACE, lexer.space^1))
 
 -- Comments.
-local comment = token(l.COMMENT, '#' * P(scanComment))
+lex:add_rule('comment', token(lexer.COMMENT, '#' * P(scanComment)))
 
 -- Strings.
-local string = token(l.STRING, DQUOTE * P(scanString))
-
--- Regular expressions.
--- Slash delimited regular expressions are preceded by most operators or
--- the keywords 'print' and 'case', possibly on a preceding line. They
--- can contain unescaped slashes and brackets in brackets. Some escape
--- sequences like '\S', '\s' have special meanings with Gawk. Tokens that
--- contain them are displayed differently.
-local regex = token(l.REGEX, SLASH * P(scanRegex))
-local gawkRegex = token('gawkRegex', SLASH * P(scanGawkRegex))
+lex:add_rule('string', token(lexer.STRING, DQUOTE * P(scanString)))
 
--- no leading sign because it might be binary.
-local float = ((l.digit ^ 1 * ('.' * l.digit ^ 0) ^ -1) +
-    ('.' * l.digit ^ 1)) * (S('eE') * S('+-') ^ -1 * l.digit ^ 1) ^ -1
--- Numbers.
-local number = token(l.NUMBER, float)
-local gawkNumber = token('gawkNumber', l.hex_num + l.oct_num)
-
--- Operators.
-local operator = token(l.OPERATOR, S('!%&()*+,-/:;<=>?[\\]^{|}~'))
-local gawkOperator = token('gawkOperator', P("|&") + "@" + "**=" + "**")
+-- No leading sign because the sign might be a binary operator.
+local float = ((lexer.digit^1 * ('.' * lexer.digit^0)^-1) + ('.' * lexer.digit^1)) *
+  (S('eE') * S('+-')^-1 * lexer.digit^1)^-1
 
 -- Fields. E.g. $1, $a, $(x), $a(x), $a[x], $"1", $$a, etc.
-local field = token('field', P('$') * S('$+-') ^ 0 *
-                    (float + (l.word ^ 0 * '(' * P(scanFieldDelimiters)) +
-                     (l.word ^ 1 * ('[' * P(scanFieldDelimiters)) ^ -1) +
-                     ('"' * P(scanString)) + ('/' * P(eatRegex) * '/')))
-
--- Functions.
-local func = token(l.FUNCTION, l.word * #P('('))
-
--- Identifiers.
-local identifier = token(l.IDENTIFIER, l.word)
-
--- Keywords.
-local keyword = token(l.KEYWORD, word_match{
-  'BEGIN', 'END', 'atan2', 'break', 'close', 'continue', 'cos', 'delete', 'do',
-  'else', 'exit', 'exp', 'fflush', 'for', 'function', 'getline', 'gsub', 'if',
-  'in', 'index', 'int', 'length', 'log', 'match', 'next', 'nextfile', 'print',
-  'printf', 'rand', 'return', 'sin', 'split', 'sprintf', 'sqrt', 'srand', 'sub',
-  'substr', 'system', 'tolower', 'toupper', 'while'
-})
-
-local gawkKeyword = token('gawkKeyword', word_match{
-  'BEGINFILE', 'ENDFILE', 'adump', 'and', 'asort', 'asorti', 'bindtextdomain',
-  'case', 'compl', 'dcgettext', 'dcngettext', 'default', 'extension', 'func',
-  'gensub', 'include', 'isarray', 'load', 'lshift', 'mktime', 'or', 'patsplit',
-  'rshift', 'stopme', 'strftime', 'strtonum', 'switch', 'systime', 'xor'
-})
+lex:add_rule('field', token('field', '$' * S('$+-')^0 *
+  (float + lexer.word^0 * '(' * P(scanFieldDelimiters) + lexer.word^1 *
+    ('[' * P(scanFieldDelimiters))^-1 + '"' * P(scanString) + '/' * P(eatRegex) * '/')))
+lex:add_style('field', lexer.styles.label)
 
-local builtInVariable = token('builtInVariable', word_match{
-  'ARGC', 'ARGV', 'CONVFMT', 'ENVIRON', 'FILENAME', 'FNR', 'FS', 'NF', 'NR',
-  'OFMT', 'OFS', 'ORS', 'RLENGTH', 'RS', 'RSTART', 'SUBSEP'
-})
-
-local gawkBuiltInVariable = token('gawkBuiltInVariable', word_match {
-  'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'FPAT', 'FUNCTAB', 'IGNORECASE',
-  'LINT', 'PREC', 'PROCINFO', 'ROUNDMODE', 'RT', 'SYMTAB', 'TEXTDOMAIN'
-})
-
--- Within each group order matters, but the groups themselves (except the
--- last) can be in any order.
-M._rules = {
-  {'whitespace', ws},
-
-  {'comment', comment},
-
-  {'string', string},
+-- Regular expressions.
+-- Slash delimited regular expressions are preceded by most operators or the keywords 'print'
+-- and 'case', possibly on a preceding line. They can contain unescaped slashes and brackets
+-- inside bracket expressions. Some escape sequences, like '\S' and '\s', have special meanings
+-- in Gawk. Tokens that contain them are displayed differently.
+lex:add_rule('gawkRegex', token('gawkRegex', SLASH * P(scanGawkRegex)))
+lex:add_style('gawkRegex', lexer.styles.preprocessor .. {underlined = true})
+lex:add_rule('regex', token(lexer.REGEX, SLASH * P(scanRegex)))
 
-  {'field', field},
+-- Operators.
+lex:add_rule('gawkOperator', token('gawkOperator', P("|&") + "@" + "**=" + "**"))
+lex:add_style('gawkOperator', lexer.styles.operator .. {underlined = true})
+lex:add_rule('operator', token(lexer.OPERATOR, S('!%&()*+,-/:;<=>?[\\]^{|}~')))
 
-  {'gawkRegex', gawkRegex},
-  {'regex', regex},
-  {'gawkOperator', gawkOperator},
-  {'operator', operator},
+-- Numbers.
+lex:add_rule('gawkNumber', token('gawkNumber', lexer.hex_num + lexer.oct_num))
+lex:add_style('gawkNumber', lexer.styles.number .. {underlined = true})
+lex:add_rule('number', token(lexer.NUMBER, float))
 
-  {'gawkNumber', gawkNumber},
-  {'number', number},
+-- Keywords.
+lex:add_rule('keyword', token(lexer.KEYWORD, word_match{
+  'BEGIN', 'END', 'atan2', 'break', 'close', 'continue', 'cos', 'delete', 'do', 'else', 'exit',
+  'exp', 'fflush', 'for', 'function', 'getline', 'gsub', 'if', 'in', 'index', 'int', 'length',
+  'log', 'match', 'next', 'nextfile', 'print', 'printf', 'rand', 'return', 'sin', 'split',
+  'sprintf', 'sqrt', 'srand', 'sub', 'substr', 'system', 'tolower', 'toupper', 'while'
+}))
+
+lex:add_rule('builtInVariable', token('builtInVariable', word_match(
+  'ARGC ARGV CONVFMT ENVIRON FILENAME FNR FS NF NR OFMT OFS ORS RLENGTH RS RSTART SUBSEP')))
+lex:add_style('builtInVariable', lexer.styles.constant)
+
+lex:add_rule('gawkBuiltInVariable', token('gawkBuiltInVariable', word_match{
+  'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'FPAT', 'FUNCTAB', 'IGNORECASE', 'LINT', 'PREC',
+  'PROCINFO', 'ROUNDMODE', 'RT', 'SYMTAB', 'TEXTDOMAIN'
+}))
+lex:add_style('gawkBuiltInVariable', lexer.styles.constant .. {underlined = true})
 
-  {'keyword', keyword},
-  {'builtInVariable', builtInVariable},
-  {'gawkKeyword', gawkKeyword},
-  {'gawkBuiltInVariable', gawkBuiltInVariable},
-  {'function', func},
-  {'identifier', identifier},
-}
+-- Functions.
+lex:add_rule('function', token(lexer.FUNCTION, lexer.word * #P('(')))
 
-M._tokenstyles = {
-  builtInVariable = l.STYLE_CONSTANT,
-  default = l.STYLE_ERROR,
-  field = l.STYLE_LABEL,
-  gawkBuiltInVariable = l.STYLE_CONSTANT..',underlined',
-  gawkKeyword = l.STYLE_KEYWORD..',underlined',
-  gawkNumber = l.STYLE_NUMBER..',underlined',
-  gawkOperator = l.STYLE_OPERATOR..',underlined',
-  gawkRegex = l.STYLE_PREPROCESSOR..',underlined',
-  regex = l.STYLE_PREPROCESSOR
-}
+-- Identifiers.
+lex:add_rule('identifier', token(lexer.IDENTIFIER, lexer.word))
 
-M._foldsymbols = {
-  _patterns = {'[{}]', '#'},
-  [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
-  [l.COMMENT] = {['#'] = l.fold_line_comments('#')}
-}
+-- Fold points.
+lex:add_fold_point(lexer.OPERATOR, '{', '}')
+lex:add_fold_point(lexer.COMMENT, lexer.fold_consecutive_lines('#'))
 
-return M
+return lex