diff options
| author | Marc André Tanner <mat@brain-dump.org> | 2015-10-16 12:36:47 +0200 |
|---|---|---|
| committer | Marc André Tanner <mat@brain-dump.org> | 2015-11-08 13:35:36 +0100 |
| commit | 039042f2e323c1f982f1de61b702c88fb33d6ccb (patch) | |
| tree | 67dea69de9462e0c27ea2a743b4c5d1798eaa057 /lexers/awk.lua | |
| parent | b1ec60061623601ca6185a16d77c6c6c62135e95 (diff) | |
| download | vis-039042f2e323c1f982f1de61b702c88fb33d6ccb.tar.gz vis-039042f2e323c1f982f1de61b702c88fb33d6ccb.tar.xz | |
Import LPeg based lexers from Scintillua 3.6.1-1
These are Copyright (c) 2007-2015 Mitchell and released under the
MIT license.
Diffstat (limited to 'lexers/awk.lua')
| -rw-r--r-- | lexers/awk.lua | 334 |
1 files changed, 334 insertions, 0 deletions
-- Extracted from: diff --git a/lexers/awk.lua b/lexers/awk.lua
-- (new file mode 100644, index 0000000..7948d21, hunk @@ -0,0 +1,334 @@).
-- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
-- AWK LPeg lexer.
-- Modified by Wolfgang Seeberg 2012, 2013.

local l = require('lexer')
local token, word_match = l.token, l.word_match
local P, R, S = lpeg.P, lpeg.R, lpeg.S

local M = {_NAME = 'awk'}

-- Single-character constants used by the hand-written scanners below.
local LEFTBRACKET = '['
local RIGHTBRACKET = ']'
local SLASH = '/'
local BACKSLASH = '\\'
local CARET = '^'
local CR = '\r'
local LF = '\n'
local CRLF = CR .. LF
local DQUOTE = '"'
-- Closing delimiter for each opening delimiter handled by
-- scanFieldDelimiters().
local DELIMITER_MATCHES = {['('] = ')', ['['] = ']'}
-- For each opening delimiter, the *other* opening delimiter whose nesting
-- scanFieldDelimiters() tracks alongside it.
local COMPANION = {['('] = '[', ['['] = '('}
-- Names accepted between '[:' and ':]' by eatCharacterClass().
local CC = {
  alnum = 1, alpha = 1, blank = 1, cntrl = 1, digit = 1, graph = 1, lower = 1,
  print = 1, punct = 1, space = 1, upper = 1, xdigit = 1
}
-- Index of the closing slash of the regex most recently accepted by
-- eatRegex(); read by isRegex().
-- NOTE(review): LastRegexEnd and BackslashAtCommentEnd are module-level and
-- are never reset in this file, so positions recorded while lexing one input
-- are still present when another input is lexed -- confirm this is harmless.
local LastRegexEnd = 0
-- Index of the backslash that ended the most recently scanned comment, if
-- that comment ended with one; written by scanComment(), read by isRegex().
local BackslashAtCommentEnd = 0
-- Keywords after which a slash starts a regex rather than a division.
local KW_BEFORE_RX = {
  case = 1, ['do'] = 1, ['else'] = 1, exit = 1, print = 1, printf = 1,
  ['return'] = 1
}

-- Tells whether the run of lowercase letters ending at index `e` of `input`
-- is one of the KW_BEFORE_RX keywords.
-- Returns false when the character just before that run is an uppercase
-- letter, a digit or an underscore, i.e. when the run is only the tail of a
-- longer identifier.
local function findKeyword(input, e)
  local i = e
  -- Walk backwards over lowercase letters; afterwards `i` is the index just
  -- before the run (0 if the run starts at the beginning of the input).
  while i > 0 and input:find("^[%l]", i) do i = i - 1 end
  local w = input:sub(i + 1, e)
  if i == 0 then
    return KW_BEFORE_RX[w] == 1
  elseif input:find("^[%u%d_]", i) then
    return false
  else
    return KW_BEFORE_RX[w] == 1
  end
end

-- Decides, from what precedes it, whether a slash starts a regex.
-- `i` is the index of the character just before the slash. Returns true if
-- the slash should be treated as a regex opener, false if it should be
-- treated as an operator.
local function isRegex(input, i)
  -- Skip blanks and tabs backwards to the last significant character.
  while i >= 1 and input:find('^[ \t]', i) do i = i - 1 end
  if i < 1 then return true end
  if input:find("^[-!%%&(*+,:;<=>?[^{|}~\f]", i) or findKeyword(input, i) then
    -- Preceded by one of the listed operator/punctuation characters, a form
    -- feed, or a KW_BEFORE_RX keyword.
    return true
  elseif input:sub(i, i) == SLASH then
    return i ~= LastRegexEnd -- deals with /xx/ / /yy/.
  elseif input:find('^[]%w)."]', i) then
    -- Preceded by ']', an alphanumeric, ')', '.' or '"'.
    return false
  elseif input:sub(i, i) == LF then
    -- Preceded by a line break: step to the last character of the previous
    -- line (also stepping over the CR of a CRLF pair).
    if i == 1 then return true end
    i = i - 1
    if input:sub(i, i) == CR then
      if i == 1 then return true end
      i = i - 1
    end
  elseif input:sub(i, i) == CR then
    if i == 1 then return true end
    i = i - 1
  else
    return false
  end
  -- Only reached after stepping back over a line break. A trailing backslash
  -- that is not the recorded end of a comment continues the line, so the
  -- decision is made from what precedes that backslash.
  if input:sub(i, i) == BACKSLASH and i ~= BackslashAtCommentEnd then
    return isRegex(input, i - 1)
  else
    return true
  end
end

-- Scans a character class name inside a bracket expression.
-- `s` is the index just after '[:' and `e` the last index to examine.
-- Returns the index of the ']' of the terminating ':]' when the name is
-- listed in CC; returns false for an unlisted name, a line break, or a
-- missing ':]'.
local function eatCharacterClass(input, s, e)
  local i = s
  while i <= e do
    if input:find('^[\r\n]', i) then
      return false
    elseif input:sub(i, i + 1) == ':]' then
      local str = input:sub(s, i - 1)
      return CC[str] == 1 and i + 1
    end
    i = i + 1
  end
  return false
end

-- Scans a bracket expression inside a regex.
-- `i` is the index just after the opening '[' and `e` the last index to
-- examine. Returns the index of the closing ']', or false if a line break or
-- index `e` is reached first or a '[:...:]' class is rejected.
local function eatBrackets(input, i, e)
  -- A leading '^' and a ']' in first position do not close the expression.
  if input:sub(i, i) == CARET then i = i + 1 end
  if input:sub(i, i) == RIGHTBRACKET then i = i + 1 end
  while i <= e do
    if input:find('^[\r\n]', i) then
      return false
    elseif input:sub(i, i) == RIGHTBRACKET then
      return i
    elseif input:sub(i, i + 1) == '[:' then
      i = eatCharacterClass(input, i + 2, e)
      if not i then return false end
    elseif input:sub(i, i) == BACKSLASH then
      -- Skip the escaped character; an escaped CRLF counts as one character.
      i = i + 1
      if input:sub(i, i + 1) == CRLF then i = i + 1 end
    end
    i = i + 1
  end
  return false
end

-- Scans the body of a slash-delimited regex.
-- `i` is the index just after the opening slash. Returns the index of the
-- closing slash (and records it in LastRegexEnd), or false if a line break or
-- the end of input is reached first or a bracket expression is rejected.
local function eatRegex(input, i)
  local e = #input
  while i <= e do
    if input:find('^[\r\n]', i) then
      return false
    elseif input:sub(i, i) == SLASH then
      LastRegexEnd = i
      return i
    elseif input:sub(i, i) == LEFTBRACKET then
      -- Slashes inside a bracket expression do not end the regex.
      i = eatBrackets(input, i + 1, e)
      if not i then return false end
    elseif input:sub(i, i) == BACKSLASH then
      -- Skip the escaped character; an escaped CRLF counts as one character.
      i = i + 1
      if input:sub(i, i + 1) == CRLF then i = i + 1 end
    end
    i = i + 1
  end
  return false
end

-- Result handed from scanGawkRegex() to scanRegex(): the index just after
-- the closing slash, or false.
local ScanRegexResult
-- Function pattern used after SLASH in the 'gawkRegex' token; `index` is the
-- position just after the opening slash.
-- Returns the index just after the closing slash only when the regex contains
-- one of the escapes \B \S \s \W \w \y \< \> \` \' preceded by an odd number
-- of backslashes. In every other case it returns false and stores the outcome
-- of the scan in ScanRegexResult for scanRegex().
local function scanGawkRegex(input, index)
  -- index - 2 is the character before the opening slash.
  if isRegex(input, index - 2) then
    local i = eatRegex(input, index)
    if not i then
      ScanRegexResult = false
      return false
    end
    -- The slice includes the opening slash, so the pattern's leading
    -- non-backslash character can always be matched.
    local rx = input:sub(index - 1, i)
    for bs in rx:gmatch("[^\\](\\+)[BSsWwy<>`']") do
      -- /\S/ is special, but /\\S/ is not.
      if #bs % 2 == 1 then return i + 1 end
    end
    ScanRegexResult = i + 1
  else
    ScanRegexResult = false
  end
  return false
end
-- Is only called immediately after scanGawkRegex().
-- Returns the result scanGawkRegex() stored, so the regex is not scanned
-- a second time.
local function scanRegex()
  return ScanRegexResult
end

-- Scans the body of a double-quoted string.
-- `index` is the position just after the opening quote. Returns the index
-- just after the closing quote, or false if a line break or the end of input
-- is reached first.
local function scanString(input, index)
  local i = index
  local e = #input
  while i <= e do
    if input:find('^[\r\n]', i) then
      return false
    elseif input:sub(i, i) == DQUOTE then
      return i + 1
    elseif input:sub(i, i) == BACKSLASH then
      i = i + 1
      -- l.delimited_range() doesn't handle CRLF.
      if input:sub(i, i + 1) == CRLF then i = i + 1 end
    end
    i = i + 1
  end
  return false
end

-- purpose: prevent isRegex() from entering a comment line that ends with a
-- backslash.
-- `index` is the position just after '#'. Consumes up to the next CR or LF
-- (or the end of input) and returns the index just after the consumed text.
local function scanComment(input, index)
  -- The pattern may match the empty string, in which case i == index - 1.
  local _, i = input:find('[^\r\n]*', index)
  if input:sub(i, i) == BACKSLASH then BackslashAtCommentEnd = i end
  return i + 1
end

-- Scans a parenthesised or bracketed expression that is part of a field.
-- `index` is the position just after the opening '(' or '['; the opener
-- itself is read from index - 1. Returns the index just after the matching
-- closer, or false if the expression is unbalanced or not closed before a
-- '#', a line break that is not escaped by a backslash, or the end of input.
local function scanFieldDelimiters(input, index)
  local i = index
  local e = #input
  local left = input:sub(i - 1, i - 1)
  local count = 1 -- nesting depth of the opening delimiter kind
  local right = DELIMITER_MATCHES[left]
  local left2 = COMPANION[left]
  local count2 = 0 -- nesting depth of the companion delimiter kind
  local right2 = DELIMITER_MATCHES[left2]
  while i <= e do
    if input:find('^[#\r\n]', i) then
      return false
    elseif input:sub(i, i) == right then
      count = count - 1
      -- The outermost closer only matches if the companion kind is balanced.
      if count == 0 then return count2 == 0 and i + 1 end
    elseif input:sub(i, i) == left then
      count = count + 1
    elseif input:sub(i, i) == right2 then
      count2 = count2 - 1
      if count2 < 0 then return false end
    elseif input:sub(i, i) == left2 then
      count2 = count2 + 1
    elseif input:sub(i, i) == DQUOTE then
      -- Skip a whole string; scanString() returns the index after the
      -- closing quote, so step back one to offset the increment below.
      i = scanString(input, i + 1)
      if not i then return false end
      i = i - 1
    elseif input:sub(i, i) == SLASH then
      -- Skip a whole regex; a slash that is not a regex opener is skipped as
      -- a single character.
      if isRegex(input, i - 1) then
        i = eatRegex(input, i + 1)
        if not i then return false end
      end
    elseif input:sub(i, i) == BACKSLASH then
      -- Step over a backslash-escaped line break so that the line-break
      -- check at the top of the loop does not reject it.
      if input:sub(i + 1, i + 2) == CRLF then
        i = i + 2
      elseif input:find('^[\r\n]', i + 1) then
        i = i + 1
      end
    end
    i = i + 1
  end
  return false
end

-- Whitespace.
local ws = token(l.WHITESPACE, l.space^1)

-- Comments.
local comment = token(l.COMMENT, '#' * P(scanComment))

-- Strings.
-- NOTE(review): this local is named `string` and so shadows the global
-- `string` library table for the remainder of the file; method-style calls
-- such as input:find() are not affected.
local string = token(l.STRING, DQUOTE * P(scanString))

-- Regular expressions.
-- Slash delimited regular expressions are preceded by most operators or
-- one of the keywords listed in KW_BEFORE_RX (e.g. 'print' and 'case'),
-- possibly on a preceding line. They can contain unescaped slashes and
-- brackets in brackets. Some escape sequences like '\S', '\s' have special
-- meanings with Gawk. Tokens that contain them are displayed differently.
local regex = token(l.REGEX, SLASH * P(scanRegex))
local gawkRegex = token('gawkRegex', SLASH * P(scanGawkRegex))

-- no leading sign because it might be binary.
local float = ((l.digit ^ 1 * ('.' * l.digit ^ 0) ^ -1) +
    ('.' * l.digit ^ 1)) * (S('eE') * S('+-') ^ -1 * l.digit ^ 1) ^ -1
-- Numbers.
local number = token(l.NUMBER, float)
local gawkNumber = token('gawkNumber', l.hex_num + l.oct_num)

-- Operators.
local operator = token(l.OPERATOR, S('!%&()*+,-/:;<=>?[\\]^{|}~'))
local gawkOperator = token('gawkOperator', P("|&") + "@" + "**=" + "**")

-- Fields. E.g. $1, $a, $(x), $a(x), $a[x], $"1", $$a, etc.
-- eatRegex() returns the index of the closing slash itself, which is why the
-- last alternative matches a literal '/' after it.
local field = token('field', P('$') * S('$+-') ^ 0 *
                    (float + (l.word ^ 0 * '(' * P(scanFieldDelimiters)) +
                     (l.word ^ 1 * ('[' * P(scanFieldDelimiters)) ^ -1) +
                     ('"' * P(scanString)) + ('/' * P(eatRegex) * '/')))

-- Functions.
local func = token(l.FUNCTION, l.word * #P('('))

-- Identifiers.
local identifier = token(l.IDENTIFIER, l.word)

-- Keywords.
-- Word lists, kept separate from the token definitions that use them.

-- Keywords and built-in functions styled as plain AWK keywords.
local AWK_KEYWORDS = {
  'BEGIN', 'END', 'atan2', 'break', 'close', 'continue', 'cos', 'delete',
  'do', 'else', 'exit', 'exp', 'fflush', 'for', 'function', 'getline',
  'gsub', 'if', 'in', 'index', 'int', 'length', 'log', 'match', 'next',
  'nextfile', 'print', 'printf', 'rand', 'return', 'sin', 'split',
  'sprintf', 'sqrt', 'srand', 'sub', 'substr', 'system', 'tolower',
  'toupper', 'while'
}

-- Keywords and built-in functions styled as Gawk-specific.
local GAWK_KEYWORDS = {
  'BEGINFILE', 'ENDFILE', 'adump', 'and', 'asort', 'asorti',
  'bindtextdomain', 'case', 'compl', 'dcgettext', 'dcngettext', 'default',
  'extension', 'func', 'gensub', 'include', 'isarray', 'load', 'lshift',
  'mktime', 'or', 'patsplit', 'rshift', 'stopme', 'strftime', 'strtonum',
  'switch', 'systime', 'xor'
}

-- Built-in variables styled as plain AWK variables.
local AWK_VARIABLES = {
  'ARGC', 'ARGV', 'CONVFMT', 'ENVIRON', 'FILENAME', 'FNR', 'FS', 'NF', 'NR',
  'OFMT', 'OFS', 'ORS', 'RLENGTH', 'RS', 'RSTART', 'SUBSEP'
}

-- Built-in variables styled as Gawk-specific.
local GAWK_VARIABLES = {
  'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'FPAT', 'FUNCTAB',
  'IGNORECASE', 'LINT', 'PREC', 'PROCINFO', 'ROUNDMODE', 'RT', 'SYMTAB',
  'TEXTDOMAIN'
}

local keyword = token(l.KEYWORD, word_match(AWK_KEYWORDS))

local gawkKeyword = token('gawkKeyword', word_match(GAWK_KEYWORDS))

local builtInVariable = token('builtInVariable', word_match(AWK_VARIABLES))

local gawkBuiltInVariable = token('gawkBuiltInVariable',
                                  word_match(GAWK_VARIABLES))

-- Within each group order matters, but the groups themselves (except the
-- last) can be in any order.
-- Rule list: {name, pattern} pairs appended in the order listed here.
do
  local rules = {}
  local function rule(name, pattern)
    rules[#rules + 1] = {name, pattern}
  end

  rule('whitespace', ws)

  rule('comment', comment)

  rule('string', string)

  rule('field', field)

  -- Regexes and operators both start with characters such as '/'.
  rule('gawkRegex', gawkRegex)
  rule('regex', regex)
  rule('gawkOperator', gawkOperator)
  rule('operator', operator)

  rule('gawkNumber', gawkNumber)
  rule('number', number)

  -- Word-like tokens; the generic identifier rule comes last.
  rule('keyword', keyword)
  rule('builtInVariable', builtInVariable)
  rule('gawkKeyword', gawkKeyword)
  rule('gawkBuiltInVariable', gawkBuiltInVariable)
  rule('function', func)
  rule('identifier', identifier)

  M._rules = rules
end

-- Styles for the token names this lexer defines itself. Gawk-specific tokens
-- use the style of their plain counterpart with ',underlined' appended.
do
  local function underlined(style)
    return style .. ',underlined'
  end

  local styles = {}
  styles.builtInVariable = l.STYLE_CONSTANT
  styles.default = l.STYLE_ERROR
  styles.field = l.STYLE_LABEL
  styles.gawkBuiltInVariable = underlined(l.STYLE_CONSTANT)
  styles.gawkKeyword = underlined(l.STYLE_KEYWORD)
  styles.gawkNumber = underlined(l.STYLE_NUMBER)
  styles.gawkOperator = underlined(l.STYLE_OPERATOR)
  styles.gawkRegex = underlined(l.STYLE_PREPROCESSOR)
  styles.regex = l.STYLE_PREPROCESSOR

  M._tokenstyles = styles
end

-- Fold points: braces in operator tokens and '#' in comment tokens.
do
  local folds = {_patterns = {'[{}]', '#'}}
  folds[l.OPERATOR] = {['{'] = 1, ['}'] = -1}
  folds[l.COMMENT] = {['#'] = l.fold_line_comments('#')}

  M._foldsymbols = folds
end

return M
