diff options
Diffstat (limited to 'lua/lexers/xml.lua')
| -rw-r--r-- | lua/lexers/xml.lua | 133 |
1 file changed, 59 insertions, 74 deletions
diff --git a/lua/lexers/xml.lua b/lua/lexers/xml.lua index e0098e5..640e924 100644 --- a/lua/lexers/xml.lua +++ b/lua/lexers/xml.lua @@ -1,93 +1,78 @@ --- Copyright 2006-2017 Mitchell mitchell.att.foicica.com. See LICENSE. +-- Copyright 2006-2022 Mitchell. See LICENSE. -- XML LPeg lexer. -local l = require('lexer') -local token, word_match = l.token, l.word_match -local P, R, S, V = lpeg.P, lpeg.R, lpeg.S, lpeg.V +local lexer = require('lexer') +local token, word_match = lexer.token, lexer.word_match +local P, S = lpeg.P, lpeg.S -local M = {_NAME = 'xml'} +local lex = lexer.new('xml') -- Whitespace. -local ws = token(l.WHITESPACE, l.space^1) +local ws = token(lexer.WHITESPACE, lexer.space^1) +lex:add_rule('whitespace', ws) -- Comments and CDATA. -local comment = token(l.COMMENT, '<!--' * (l.any - '-->')^0 * P('-->')^-1) -local cdata = token('cdata', '<![CDATA[' * (l.any - ']]>')^0 * P(']]>')^-1) +lex:add_rule('comment', token(lexer.COMMENT, lexer.range('<!--', '-->'))) +lex:add_rule('cdata', token('cdata', lexer.range('<![CDATA[', ']]>'))) +lex:add_style('cdata', lexer.styles.comment) --- Strings. -local sq_str = l.delimited_range("'", false, true) -local dq_str = l.delimited_range('"', false, true) -local string = #S('\'"') * l.last_char_includes('=') * - token(l.STRING, sq_str + dq_str) - -local in_tag = #P((1 - S'><')^0 * '>') - --- Numbers. -local number = #l.digit * l.last_char_includes('=') * - token(l.NUMBER, l.digit^1 * P('%')^-1) * in_tag +-- Doctypes and other markup tags. 
+local alpha = lpeg.R('az', 'AZ', '\127\255') +local word_char = lexer.alnum + S('_-:.??') +local identifier = (alpha + S('_-:.?')) * word_char^0 +local doctype = token('doctype', '<!DOCTYPE') * ws * token('doctype', identifier) * + (ws * identifier)^-1 * (1 - P('>'))^0 * token('doctype', '>') +lex:add_rule('doctype', doctype) +lex:add_style('doctype', lexer.styles.comment) -local alpha = R('az', 'AZ', '\127\255') -local word_char = l.alnum + S('_-:.??') -local identifier = (l.alpha + S('_-:.??')) * word_char^0 -local namespace = token(l.OPERATOR, ':') * token('namespace', identifier) +-- Processing instructions. +lex:add_rule('proc_insn', token('proc_insn', '<?' * (1 - P('?>'))^0 * P('?>')^-1)) +lex:add_style('proc_insn', lexer.styles.comment) -- Elements. -local element = token('element', '<' * P('/')^-1 * identifier) * namespace^-1 - --- Attributes. -local attribute = token('attribute', identifier) * namespace^-1 * - #(l.space^0 * '=') +local namespace = token(lexer.OPERATOR, ':') * token('namespace', identifier) +lex:add_rule('element', token('element', '<' * P('/')^-1 * identifier) * namespace^-1) +lex:add_style('element', lexer.styles.keyword) +lex:add_style('namespace', lexer.styles.class) -- Closing tags. -local close_tag = token('element', P('/')^-1 * '>') +lex:add_rule('close_tag', token('element', P('/')^-1 * '>')) + +-- Attributes. +lex:add_rule('attribute', token('attribute', identifier) * namespace^-1 * #(lexer.space^0 * '=')) +lex:add_style('attribute', lexer.styles.type) -- Equals. -local equals = token(l.OPERATOR, '=') * in_tag +-- TODO: performance is terrible on large files. +local in_tag = P(function(input, index) + local before = input:sub(1, index - 1) + local s, e = before:find('<[^>]-$'), before:find('>[^<]-$') + if s and e then return s > e and index or nil end + if s then return index end + return input:find('^[^<]->', index) and index or nil +end) + +-- lex:add_rule('equal', token(lexer.OPERATOR, '=')) -- * in_tag + +-- Strings. 
+local sq_str = lexer.range("'", false, false) +local dq_str = lexer.range('"', false, false) +lex:add_rule('string', + #S('\'"') * lexer.last_char_includes('=') * token(lexer.STRING, sq_str + dq_str)) + +-- Numbers. +local number = token(lexer.NUMBER, lexer.dec_num * P('%')^-1) +lex:add_rule('number', #lexer.digit * lexer.last_char_includes('=') * number) -- *in_tag) -- Entities. -local entity = token('entity', '&' * word_match{ - 'lt', 'gt', 'amp', 'apos', 'quot' -} * ';') +lex:add_rule('entity', token('entity', '&' * word_match('lt gt amp apos quot') * ';')) +lex:add_style('entity', lexer.styles.operator) --- Doctypes and other markup tags. -local doctype = token('doctype', P('<!DOCTYPE')) * ws * - token('doctype', identifier) * (ws * identifier)^-1 * - (1 - P('>'))^0 * token('doctype', '>') +-- Fold Points. +local function disambiguate_lt(text, pos, line, s) return not line:find('^</', s) and 1 or -1 end +lex:add_fold_point('element', '<', disambiguate_lt) +lex:add_fold_point('element', '/>', -1) +lex:add_fold_point(lexer.COMMENT, '<!--', '-->') +lex:add_fold_point('cdata', '<![CDATA[', ']]>') --- Processing instructions. 
-local proc_insn = token('proc_insn', P('<?') * (1 - P('?>'))^0 * P('?>')^-1) - -M._rules = { - {'whitespace', ws}, - {'comment', comment}, - {'cdata', cdata}, - {'doctype', doctype}, - {'proc_insn', proc_insn}, - {'element', element}, - {'close_tag', close_tag}, - {'attribute', attribute}, - {'equals', equals}, - {'string', string}, - {'number', number}, - {'entity', entity}, -} - -M._tokenstyles = { - element = l.STYLE_KEYWORD, - namespace = l.STYLE_CLASS, - attribute = l.STYLE_TYPE, - cdata = l.STYLE_COMMENT, - entity = l.STYLE_OPERATOR, - doctype = l.STYLE_COMMENT, - proc_insn = l.STYLE_COMMENT, - --markup = l.STYLE_COMMENT -} - -M._foldsymbols = { - _patterns = {'</?', '/>', '<!%-%-', '%-%->', '<!%[CDATA%[', '%]%]>'}, - element = {['<'] = 1, ['/>'] = -1, ['</'] = -1}, - [l.COMMENT] = {['<!--'] = 1, ['-->'] = -1}, - cdata = {['<![CDATA['] = 1, [']]>'] = -1} -} - -return M +return lex |
