From 64554981c786c7a95e64c9f4336d3624d7239672 Mon Sep 17 00:00:00 2001
From: Murray Calavera
Date: Sat, 27 Jan 2018 18:51:23 +0000
Subject: lexers: improve scheme lexer

* fix character literals (#\" no longer quotes the entire file etc.)
* properly nest block comments and support simplified datum comment
* add r7rs keywords, functions and directives
* fix identifiers
  - pipes were not recognized as delimiters
  - some valid identifiers were not recognized
  - some were partially parsed as keywords
  - quoting only worked on plain alphanumeric identifiers
* fix numbers (some valid numbers were not recognised)
* dont parse boolean constants as functions
---
 lua/lexers/scheme.lua | 249 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 190 insertions(+), 59 deletions(-)

(limited to 'lua')

diff --git a/lua/lexers/scheme.lua b/lua/lexers/scheme.lua
index 37aba56..681f2fd 100644
--- a/lua/lexers/scheme.lua
+++ b/lua/lexers/scheme.lua
@@ -12,87 +12,218 @@ local ws = token(l.WHITESPACE, l.space^1)
 
 -- Comments.
 local line_comment = ';' * l.nonnewline^0
-local block_comment = '#|' * (l.any - '|#')^0 * P('|#')^-1
-local comment = token(l.COMMENT, line_comment + block_comment)
+-- Block comments "#| ... |#" are matched as a nested pair.
+local block_comment = l.nested_pair('#|', '|#')
+-- Datum comment: "#;", optional whitespace, then either a parenthesised range
+-- or a run of non-whitespace characters.
+-- TODO: this should handle any datum and take into account "#\)", ";" etc.
+local datum_comment
+  = P'#;' * l.space^0
+  * (l.delimited_range("()", false, true, true) + (l.any - l.space)^1)
+-- datum_comment is tried first so that the ";" of "#;" is not taken as the
+-- start of a line comment.
+local comment = token(l.COMMENT, datum_comment + line_comment + block_comment)
 
 -- Strings.
-local literal = (P("'") + '#' * S('\\bdox')) * l.word
+-- Character literals: "#\" followed by a character name, "#\x" followed by
+-- hex digits, or "#\" followed by any single character.
+local character
+  = P'#\\' * ( P'alarm' + P'backspace' + P'delete' + P'escape'
+             + P'newline' + P'null' + P'return' + P'space' + P'tab')
+  + P'#\\x' * l.xdigit^1
+  -- LPeg matches bytes: take a whole multi-byte UTF-8 sequence (lead byte plus
+  -- one to three continuation bytes) when one follows, so the literal is not
+  -- split in the middle of a character; otherwise fall back to a single byte.
+  + P'#\\' * (R'\194\244' * R'\128\191' * R'\128\191'^-2 + P(1))
 local dq_str = l.delimited_range('"')
-local string = token(l.STRING, literal + dq_str)
-
--- Numbers.
-local number = token(l.NUMBER, P('-')^-1 * l.digit^1 * (S('./') * l.digit^1)^-1)
+local string = token(l.STRING, character + dq_str)
 
 -- Keywords.
 local keyword = token(l.KEYWORD, word_match({
-  'and', 'begin', 'case', 'cond', 'cond-expand', 'define', 'define-macro',
-  'delay', 'do', 'else', 'fluid-let', 'if', 'lambda', 'let', 'let*', 'letrec',
-  'or', 'quasiquote', 'quote', 'set!',
-}, '-*!'))
+  -- NOTE(review): the group labels in this list are a reviewer's reading of
+  -- the grouping; confirm against the R7RS report.
+
+  -- Logical connectives and "else".
+  "and", "or", "not", "else",
+
+  -- Library declarations and import sets.
+  "library", "define-library", "export", "include-library-declarations",
+  "cond-expand", "import", "rename", "only", "except", "prefix", "include",
+  "include-ci",
+
+  -- Expression and definition syntax.
+  "begin", "case", "case-lambda", "cond", "define", "define-record-type",
+  "define-syntax", "define-values", "delay", "delay-force", "do", "if",
+  "guard", "lambda", "let", "let*", "let*-values", "let-syntax", "let-values",
+  "letrec", "letrec*", "letrec-syntax", "parameterize", "quasiquote", "quote",
+  "set!", "unless", "unquote", "unquote-splicing", "when",
+
+  -- Forms kept from the previous keyword list.
+  "define-macro", "fluid-let"
+  -- The extra word characters cover every identifier character used below, so
+  -- a keyword only matches when it is the whole identifier.
+}, '.-+!$%&*/:<=>?@^_~'))
 
 -- Functions.
 local func = token(l.FUNCTION, word_match({
-  'abs', 'acos', 'angle', 'append', 'apply', 'asin', 'assoc', 'assq', 'assv',
-  'atan', 'car', 'cdr', 'caar', 'cadr', 'cdar', 'cddr', 'caaar', 'caadr',
-  'cadar', 'caddr', 'cdaar', 'cdadr', 'cddar', 'cdddr',
-  'call-with-current-continuation', 'call-with-input-file',
-  'call-with-output-file', 'call-with-values', 'call/cc', 'catch', 'ceiling',
-  'char->integer', 'char-downcase', 'char-upcase', 'close-input-port',
-  'close-output-port', 'cons', 'cos', 'current-input-port',
-  'current-output-port', 'delete-file', 'display', 'dynamic-wind', 'eval',
-  'exit', 'exact->inexact', 'exp', 'expt', 'file-or-directory-modify-seconds',
-  'floor', 'force', 'for-each', 'gcd', 'gensym', 'get-output-string', 'getenv',
-  'imag-part', 'integer->char', 'lcm', 'length', 'list', 'list->string',
-  'list->vector', 'list-ref', 'list-tail', 'load', 'log', 'magnitude',
-  'make-polar', 'make-rectangular', 'make-string', 'make-vector', 'map', 'max',
-  'member', 'memq', 'memv', 'min', 'modulo', 'newline', 'nil', 'not',
-  'number->string', 'open-input-file', 'open-input-string', 'open-output-file',
-  'open-output-string', 'peek-char', 'quotient', 'read', 'read-char',
-  'read-line', 'real-part', 'remainder', 'reverse', 'reverse!', 'round',
-  'set-car!', 'set-cdr!', 'sin', 'sqrt', 'string', 'string->list',
-  'string->number', 'string->symbol', 'string-append', 'string-copy',
-  'string-fill!', 'string-length', 'string-ref', 'string-set!', 'substring',
-  'symbol->string', 'system', 'tan', 'truncate', 'values', 'vector',
-  'vector->list', 'vector-fill!', 'vector-length', 'vector-ref', 'vector-set!',
-  'with-input-from-file', 'with-output-to-file', 'write', 'write-char',
-  'boolean?', 'char-alphabetic?', 'char-ci<=?', 'char-ci=?', 'char-ci>?', 'char-lower-case?', 'char-numeric?', 'char-ready?',
-  'char-upper-case?', 'char-whitespace?', 'char<=?', 'char=?', 'char>?', 'char?', 'complex?', 'eof-object?', 'eq?', 'equal?',
-  'eqv?', 'even?', 'exact?', 'file-exists?', 'inexact?', 'input-port?',
-  'integer?', 'list?', 'negative?', 'null?', 'number?', 'odd?', 'output-port?',
-  'pair?', 'port?', 'positive?', 'procedure?', 'rational?', 'real?',
-  'string-ci<=?', 'string-ci=?', 'string-ci>?',
-  'string<=?', 'string=?', 'string>?', 'string?',
-  'symbol?', 'vector?', 'zero?',
-  '#t', '#f'
-}, '-/<>!?=#'))
+  -- NOTE(review): the groups below appear to mirror the R7RS standard
+  -- libraries; the labels are a reviewer's reading, confirm against the report.
+
+  -- (scheme base)
+  "*", "+", "-", "/", "<", "<=", "=", "=>", ">", ">=", "abs", "append",
+  "apply", "assoc", "assq", "assv", "binary-port?", "boolean=?", "boolean?",
+  "bytevector", "bytevector-append", "bytevector-copy", "bytevector-copy!",
+  "bytevector-length", "bytevector-u8-ref", "bytevector-u8-set!",
+  "bytevector?", "caar", "cadr", "call-with-current-continuation",
+  "call-with-port", "call-with-values", "call/cc", "car", "cdar", "cddr",
+  "cdr", "ceiling", "char->integer", "char-ready?", "char<=?", "char<?",
+  "char=?", "char>=?", "char>?", "char?", "close-input-port",
+  "close-output-port", "close-port", "complex?", "cons", "current-error-port",
+  "current-input-port", "current-output-port", "denominator", "dynamic-wind",
+  "eof-object", "eof-object?", "eq?", "equal?", "eqv?", "error",
+  "error-object-irritants", "error-object-message", "error-object?", "even?",
+  "exact", "exact-integer-sqrt", "exact-integer?", "exact?", "expt",
+  "features", "file-error?", "floor", "floor-quotient", "floor-remainder",
+  "floor/", "flush-output-port", "for-each", "gcd", "get-output-bytevector",
+  "get-output-string", "inexact", "inexact?", "input-port-open?",
+  "input-port?", "integer->char", "integer?", "lcm", "length", "list",
+  "list->string", "list->vector", "list-copy", "list-ref", "list-set!",
+  "list-tail", "list?", "make-bytevector", "make-list", "make-parameter",
+  "make-string", "make-vector", "map", "max", "member", "memq", "memv", "min",
+  "modulo", "negative?", "newline", "null?", "number->string", "number?",
+  "numerator", "odd?", "open-input-bytevector", "open-input-string",
+  "open-output-bytevector", "open-output-string", "output-port-open?",
+  "output-port?", "pair?", "peek-char", "peek-u8", "port?", "positive?",
+  "procedure?", "quotient", "raise", "raise-continuable", "rational?",
+  "rationalize", "read-bytevector", "read-bytevector!", "read-char",
+  "read-error?", "read-line", "read-string", "read-u8", "real?", "remainder",
+  "reverse", "round", "set-car!", "set-cdr!", "square", "string",
+  "string->list", "string->number", "string->symbol", "string->utf8",
+  "string->vector", "string-append", "string-copy", "string-copy!",
+  "string-fill!", "string-for-each", "string-length", "string-map",
+  "string-ref", "string-set!", "string<=?", "string<?", "string=?",
+  "string>=?", "string>?", "string?", "substring", "symbol->string",
+  "symbol=?", "symbol?", "syntax-error", "syntax-rules", "textual-port?",
+  "truncate", "truncate-quotient", "truncate-remainder", "truncate/",
+  "u8-ready?", "utf8->string", "values", "vector", "vector->list",
+  "vector->string", "vector-append", "vector-copy", "vector-copy!",
+  "vector-fill!", "vector-for-each", "vector-length", "vector-map",
+  "vector-ref", "vector-set!", "vector?", "with-exception-handler",
+  "write-bytevector", "write-char", "write-string", "write-u8", "zero?",
+
+  -- (scheme char)
+  "char-alphabetic?", "char-ci<=?", "char-ci<?", "char-ci=?", "char-ci>=?",
+  "char-ci>?", "char-downcase", "char-foldcase", "char-lower-case?",
+  "char-numeric?", "char-upcase", "char-upper-case?", "char-whitespace?",
+  "digit-value", "string-ci<=?", "string-ci<?", "string-ci=?", "string-ci>=?",
+  "string-ci>?", "string-downcase", "string-foldcase", "string-upcase",
+
+  -- (scheme complex)
+  "angle", "imag-part", "magnitude", "make-polar", "make-rectangular",
+  "real-part",
+
+  -- (scheme cxr)
+  "caaaar", "caaadr", "caaar", "caadar", "caaddr", "caadr", "cadaar", "cadadr",
+  "cadar", "caddar", "cadddr", "caddr", "cdaaar", "cdaadr", "cdaar", "cdadar",
+  "cdaddr", "cdadr", "cddaar", "cddadr", "cddar", "cdddar", "cddddr", "cdddr",
+
+  -- (scheme eval)
+  "environment", "eval",
+
+  -- (scheme file)
+  "call-with-input-file", "call-with-output-file", "delete-file",
+  "file-exists?", "open-binary-input-file", "open-binary-output-file",
+  "open-input-file", "open-output-file", "with-input-from-file",
+  "with-output-to-file",
+
+  -- (scheme inexact)
+  "acos", "asin", "atan", "cos", "exp", "finite?", "infinite?", "log", "nan?",
+  "sin", "sqrt", "tan",
+
+  -- (scheme lazy)
+  "force", "make-promise", "promise?",
+
+  -- (scheme load)
+  "load",
+
+  -- (scheme process-context)
+  "command-line", "emergency-exit", "exit", "get-environment-variable",
+  "get-environment-variables",
+
+  -- (scheme read)
+  "read",
+
+  -- (scheme repl)
+  "interaction-environment",
+
+  -- (scheme time)
+  "current-jiffy", "current-second", "jiffies-per-second",
+
+  -- (scheme write)
+  "display", "write", "write-shared", "write-simple",
+
+  -- Macro-system names outside the groups above.
+  "syntax-case", "er-macro-transformer", "sc-macro-transformer",
+  "rsc-macro-transformer"
+}, '.-+!$%&*/:<=>?@^_~'))
+
+local directive = token(l.PREPROCESSOR, P'#!fold-case' + P'#!no-fold-case')
+-- Booleans get their own token so they are no longer listed as functions.
+local boolean = token(l.CONSTANT,
+  word_match({'#t', '#f', '#true', '#false'}, '#'))
 
 -- Identifiers.
-local word = (l.alpha + S('-!?')) * (l.alnum + S('-!?'))^0
-local identifier = token(l.IDENTIFIER, word)
+local explicit_sign = S('+-')
 
--- Operators.
-local operator = token(l.OPERATOR, S('<>=*/+-`@%:()'))
+-- Characters that may start an ordinary identifier, and those that may follow.
+local initial = l.alpha + S('!$%&*/:<=>?@^_~')
+local subsequent = initial + l.digit + explicit_sign + P'.'
+
+local sign_subsequent = initial + explicit_sign
+local dot_subsequent = sign_subsequent + P'.'
 
--- Entity.
-local entity = token('entity', '&' * word)
+-- Identifiers that begin with a sign or a dot. The bare sign comes last
+-- because `+` is an ordered choice and it would otherwise win over the longer
+-- forms.
+local peculiar_identifier
+  = explicit_sign * P'.' * dot_subsequent * subsequent^0
+  + explicit_sign * sign_subsequent * subsequent^0
+  + P'.' * dot_subsequent * subsequent^0
+  + explicit_sign
+
+-- An identifier is a |pipe-delimited| range, an ordinary identifier, or a
+-- peculiar one.
+local ident
+  = l.delimited_range('|')
+  + initial * subsequent^0
+  + peculiar_identifier
+
+local identifier = token(l.IDENTIFIER, ident)
+-- A quoted identifier ('name) is given its own token.
+local symbol = token(l.CLASS, P"'" * ident)
+
+-- Numbers.
+-- Builds the pattern for a numeric literal written in radix `r`.
+-- @param r radix of the literal: 2, 8, 10 or 16.
+-- @return an LPeg pattern matching an optional prefix followed by a real or
+--   complex number in that radix.
+-- LPeg's `+` is an ordered choice that commits to the first alternative that
+-- matches, so within every choice below the longer forms are listed before
+-- the shorter forms they start with.
+local function num(r)
+  local exactness = (P'#i' + P'#e')^-1
+
+  local radix = ({
+    [2] = P'#b',
+    [8] = P'#o',
+    [10] = P'#d',
+    [16] = P'#x'
+  })[r]
+
+  local digit = ({
+    [2] = S'01',
+    [8] = R'07',
+    [10] = l.digit,
+    [16] = l.xdigit
+  })[r]
+
+  -- The radix and exactness markers may come in either order.
+  local prefix = radix * exactness + exactness * radix
+  if r == 10 then
+    -- Only decimal literals may omit the radix marker. This alternative is
+    -- tried last so that "#e#d1" and "#d#e1" are both consumed in full.
+    prefix = prefix + exactness
+  end
+  local suffix = (P'e' * S('+-')^-1 * l.digit^1)^-1
+
+  local infnan = P'+inf.0' + P'-inf.0' + P'+nan.0' + P'-nan.0'
+
+  -- "1.5" and "1." must be tried before the plain integer form, which would
+  -- otherwise match just the "1" and end the literal there.
+  local decimal
+    = l.digit^1 * P'.' * l.digit^0 * suffix
+    + P'.' * l.digit^1 * suffix
+    + l.digit^1 * suffix
+
+  local ureal
+    = digit^1 * P'/' * digit^1
+    + (r == 10 and decimal or P(false))
+    + digit^1
+  local real
+    = S('+-')^-1 * ureal
+    + infnan
+
+  local i = P'i'
+  local complex
+     = real * P'@' * real
+     -- Tried before the next alternative, whose optional magnitude would
+     -- otherwise accept the "i" of "inf.0" as the imaginary unit.
+     + real * infnan * i
+     + real * S'+-' * ureal^-1 * i
+     + infnan * i
+     -- Pure imaginary with a magnitude ("+2i"); must precede the bare real.
+     + S'+-' * ureal * i
+     + real
+     -- "+i" / "-i"; kept after `real` so "+inf.0" is not cut down to "+i".
+     + S'+-' * i
+
+  return prefix * complex
+end
+
+local number = token(l.NUMBER, num(2) + num(8) + num(10) + num(16))
+
+-- Operators.
+local operator = token(l.OPERATOR, P'#u8' + P',@' + S(".`'#(),"))
 
 M._rules = {
   {'whitespace', ws},
+  {'directive', directive},
+  {'boolean', boolean},
+  {'comment', comment},
+  {'string', string},
+  {'number', number},
   {'keyword', keyword},
   {'func', func},
   {'identifier', identifier},
-  {'string', string},
-  {'comment', comment},
-  {'number', number},
+  {'symbol', symbol},
   {'operator', operator},
-  {'entity', entity},
 }
 
-M._tokenstyles = {
-  entity = l.STYLE_VARIABLE
-}
 
 M._foldsymbols = {
   _patterns = {'[%(%)%[%]{}]', '#|', '|#', ';'},
-- 
cgit v1.2.3