diff options
Diffstat (limited to 'lua/lexers/sml.lua')
| -rw-r--r-- | lua/lexers/sml.lua | 180 |
1 files changed, 80 insertions, 100 deletions
diff --git a/lua/lexers/sml.lua b/lua/lexers/sml.lua index 093e67c..ba2015e 100644 --- a/lua/lexers/sml.lua +++ b/lua/lexers/sml.lua @@ -1,111 +1,91 @@ --- Copyright 2017 Murray Calavera. See LICENSE. +-- Copyright 2017-2022 Murray Calavera. See LICENSE. -- Standard ML LPeg lexer. -local l = require('lexer') -local token = l.token - -local function mlword(words) - return l.word_match(words, "'") -end +local lexer = require('lexer') +local token, word_match = lexer.token, lexer.word_match +local P, S = lpeg.P, lpeg.S + +local lex = lexer.new('sml') + +-- Whitespace. +local ws = token(lexer.WHITESPACE, lexer.space^1) +lex:add_rule('whitespace', ws) + +-- Structures. +local id = (lexer.alnum + "'" + '_')^0 +local aid = lexer.alpha * id +local longid = (aid * '.')^0 * aid +local struct_dec = token(lexer.KEYWORD, 'structure') * ws * token(lexer.CLASS, aid) * ws * + token(lexer.OPERATOR, '=') * ws +lex:add_rule('struct_new', struct_dec * token(lexer.KEYWORD, 'struct')) +lex:add_rule('struct_alias', struct_dec * token(lexer.CLASS, longid)) +lex:add_rule('structure', token(lexer.CLASS, aid * '.')) + +-- Open. +lex:add_rule('open', token(lexer.KEYWORD, word_match('open structure functor')) * ws * + token(lexer.CLASS, longid)) + +-- Keywords. +lex:add_rule('keyword', token(lexer.KEYWORD, word_match{ + 'abstype', 'and', 'andalso', 'as', 'case', 'do', 'datatype', 'else', 'end', 'exception', 'fn', + 'fun', 'handle', 'if', 'in', 'infix', 'infixr', 'let', 'local', 'nonfix', 'of', 'op', 'orelse', + 'raise', 'rec', 'then', 'type', 'val', 'with', 'withtype', 'while', -- + 'eqtype', 'functor', 'include', 'sharing', 'sig', 'signature', 'struct', 'structure' +})) + +-- Types. +lex:add_rule('type', token(lexer.TYPE, word_match{ + 'int', 'real', 'word', 'bool', 'char', 'string', 'unit', 'array', 'exn', 'list', 'option', + 'order', 'ref', 'substring', 'vector' +})) + +-- Functions. +-- `real`, `vector` and `substring` are a problem. +lex:add_rule('function', token(lexer.FUNCTION, word_match{ + 'app', 'before', 'ceil', 'chr', 'concat', 'exnMessage', 'exnName', 'explode', 'floor', 'foldl', + 'foldr', 'getOpt', 'hd', 'ignore', 'implode', 'isSome', 'length', 'map', 'not', 'null', 'ord', + 'print', 'real', 'rev', 'round', 'size', 'str', 'substring', 'tl', 'trunc', 'valOf', 'vector', + 'o', 'abs', 'mod', 'div' +})) -local ws = token(l.WHITESPACE, l.space^1) +-- Constants. +lex:add_rule('constant', token(lexer.CONSTANT, word_match('true false nil') + lexer.upper * id)) --- single line comments are valid in successor ml -local cl = '(*)' * l.nonnewline^0 -local comment = token(l.COMMENT, cl + l.nested_pair('(*', '*)')) +-- Indentifiers (non-symbolic). +lex:add_rule('identifier', token(lexer.IDENTIFIER, lexer.lower * id)) -local string = token(l.STRING, lpeg.P('#')^-1 * l.delimited_range('"', true)) +-- Strings. +lex:add_rule('string', token(lexer.STRING, P('#')^-1 * lexer.range('"', true))) -local function num(digit) - return digit * (digit^0 * lpeg.P('_'))^0 * digit^1 + digit -end +-- Comments. +local line_comment = lexer.to_eol('(*)') +local block_comment = lexer.range('(*', '*)', false, false, true) +lex:add_rule('comment', token(lexer.COMMENT, line_comment + block_comment)) -local int = num(l.digit) -local frac = lpeg.P('.') * int +-- Numbers. +local function num(digit) return digit * (digit^0 * '_')^0 * digit^1 + digit end +local int = num(lexer.digit) +local frac = '.' * int local minus = lpeg.P('~')^-1 local exp = lpeg.S('eE') * minus * int local real = int * frac^-1 * exp + int * frac * exp^-1 -local hex = num(l.xdigit) +local hex = num(lexer.xdigit) local bin = num(lpeg.S('01')) - -local number = token(l.NUMBER, - lpeg.P('0w') * int - + (lpeg.P('0wx') + lpeg.P('0xw')) * hex - + (lpeg.P('0wb') + lpeg.P('0bw')) * bin - + minus * lpeg.P('0x') * hex - + minus * lpeg.P('0b') * bin - + minus * real - + minus * int -) - -local keyword = token(l.KEYWORD, mlword{ - 'abstype', 'and', 'andalso', 'as', 'case', 'do', 'datatype', 'else', 'end', - 'exception', 'fn', 'fun', 'handle', 'if', 'in', 'infix', 'infixr', 'let', - 'local', 'nonfix', 'of', 'op', 'orelse', 'raise', 'rec', 'then', - 'type', 'val', 'with', 'withtype', 'while', - - 'eqtype', 'functor', 'include', 'sharing', 'sig', 'signature', - 'struct', 'structure' -}) - --- includes valid symbols for identifiers -local operator = token(l.OPERATOR, lpeg.S('!*/+-^:@=<>()[]{},;._|#%&$?~`\\')) - -local type = token(l.TYPE, mlword{ - 'int', 'real', 'word', 'bool', 'char', 'string', 'unit', - 'array', 'exn', 'list', 'option', 'order', 'ref', 'substring', 'vector' -}) - --- `real`, `vector` and `substring` are a problem -local func = token(l.FUNCTION, mlword{ - 'app', 'before', 'ceil', 'chr', 'concat', 'exnMessage', 'exnName', - 'explode', 'floor', 'foldl', 'foldr', 'getOpt', 'hd', 'ignore', - 'implode', 'isSome', 'length', 'map', 'not', 'null', 'ord', 'print', - 'real', 'rev', 'round', 'size', 'str', 'substring', 'tl', 'trunc', - 'valOf', 'vector', - 'o', 'abs', 'mod', 'div' -}) - --- non-symbolic identifiers only -local id = (l.alnum + "'" + '_')^0 -local aid = l.alpha * id -local longid = (aid * lpeg.P('.'))^0 * aid -local identifier = token(l.IDENTIFIER, l.lower * id) -local typevar = token(l.VARIABLE, "'" * id) -local c = mlword{'true', 'false', 'nil'} -local const = token(l.CONSTANT, l.upper * id + c) -local structure = token(l.CLASS, aid * lpeg.P('.')) - -local open - = token(l.KEYWORD, mlword{'open', 'structure', 'functor'}) - * ws * token(l.CLASS, longid) - -local struct_dec - = token(l.KEYWORD, lpeg.P('structure')) * ws - * token(l.CLASS, aid) * ws - * token(l.OPERATOR, lpeg.P('=')) * ws - -local struct_new = struct_dec * token(l.KEYWORD, lpeg.P('struct')) -local struct_alias = struct_dec * token(l.CLASS, longid) - -local M = {_NAME = 'sml'} - -M._rules = { - {'whitespace', ws}, - {'comment', comment}, - {'number', number}, - {'struct_new', struct_new}, - {'struct_alias', struct_alias}, - {'structure', structure}, - {'open', open}, - {'type', type}, - {'keyword', keyword}, - {'function', func}, - {'string', string}, - {'operator', operator}, - {'typevar', typevar}, - {'constant', const}, - {'identifier', identifier}, -} - -return M +-- LuaFormatter off +lex:add_rule('number', token(lexer.NUMBER, + '0w' * int + + (P('0wx') + '0xw') * hex + + (P('0wb') + '0bw') * bin + + minus * '0x' * hex + + minus * '0b' * bin + + minus * real + + minus * int)) +-- LuaFormatter on + +-- Type variables. +lex:add_rule('typevar', token(lexer.VARIABLE, "'" * id)) + +-- Operators. +lex:add_rule('operator', token(lexer.OPERATOR, S('!*/+-^:@=<>()[]{},;._|#%&$?~`\\'))) + +return lex |
