aboutsummaryrefslogtreecommitdiff
path: root/lua/lexers/xml.lua
blob: 2692b2a22700072c7695c348c724146951ad30ec (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
-- Copyright 2006-2016 Mitchell mitchell.att.foicica.com. See LICENSE.
-- XML LPeg lexer.

local l = require('lexer')
local token, word_match = l.token, l.word_match
local P, R, S, V = lpeg.P, lpeg.R, lpeg.S, lpeg.V

-- Module table returned at the end of the file; `_NAME` labels it 'xml'.
local M = {}
M._NAME = 'xml'

-- Whitespace.
-- One or more consecutive characters matched by `l.space` form one token.
local ws = token(l.WHITESPACE, l.space * l.space^0)

-- Comments and CDATA.
-- Each section starts at its opening delimiter and runs up to the first
-- occurrence of its terminator. The terminator is optional, so a section that
-- is never closed still matches as far as `l.any` keeps matching.
local comment_end, cdata_end = P('-->'), P(']]>')
local comment = token(l.COMMENT,
                      P('<!--') * (l.any - comment_end)^0 * comment_end^-1)
local cdata = token('cdata',
                    P('<![CDATA[') * (l.any - cdata_end)^0 * cdata_end^-1)

-- Strings.
-- A quoted attribute value: the next character must be a quote, and
-- `l.last_char_includes('=')` must accept the position (presumably it checks
-- that the preceding non-blank character is '=' -- confirm in the lexer
-- module).
-- NOTE(review): the (false, true) arguments to `l.delimited_range` presumably
-- select multi-line ranges without escape characters -- confirm.
-- NOTE: the local `string` below shadows Lua's standard `string` library name
-- for the rest of this file; the name is kept because the rule list uses it.
local single_quoted = l.delimited_range("'", false, true)
local double_quoted = l.delimited_range('"', false, true)
local quoted_value = token(l.STRING, single_quoted + double_quoted)
local string = #S('\'"') * l.last_char_includes('=') * quoted_value

-- Match-time check that succeeds without consuming input when the current
-- position appears to lie inside a tag, i.e. between a '<' and its '>'.
--   input: the subject string being matched.
--   index: the current (1-based) match position.
-- Returns `index` when the position is judged to be inside a tag, or nil so
-- that the enclosing pattern fails.
local in_tag = P(function(input, index)
  -- Look backwards first: a '<' with no '>' anywhere between it and the
  -- current position means a tag has been opened and not closed yet.
  -- A '>' followed by no '<' can never coexist with such a '<' in the same
  -- prefix, so a second backward search for it would add nothing.
  local before = input:sub(1, index - 1)
  if before:find('<[^>]-$') then return index end
  -- No unclosed '<' is visible before this position (presumably the case when
  -- lexing starts part-way through a tag): fall back to looking ahead for a
  -- '>' that is reached before any '<'.
  return input:find('^[^<]->', index) and index or nil
end)

-- Numbers.
-- An unquoted numeric attribute value: one or more digits with an optional
-- trailing '%'. The token is gated by a digit lookahead and by
-- `l.last_char_includes('=')`, and `in_tag` must accept the position after it.
local digits_with_percent = l.digit^1 * P('%')^-1
local number = #l.digit * l.last_char_includes('=') *
               token(l.NUMBER, digits_with_percent) * in_tag

-- Names (used for elements, attributes and namespaces).
-- `alpha` extends the ASCII letters with the bytes \127-\255 so that names
-- written with non-ASCII (e.g. UTF-8 encoded) characters are accepted. It is
-- used both for the first character and for the following ones, so a
-- multi-byte sequence inside a name is never split between two tokens.
local alpha = R('az', 'AZ', '\127\255')
local name_punct = S('_-:.??')
local word_char = l.alnum + alpha + name_punct
local identifier = (alpha + name_punct) * word_char^0
-- NOTE(review): `identifier` itself accepts ':', so a prefixed name such as
-- `ns:tag` is consumed as a single identifier; this pattern therefore looks
-- unlikely to match directly after an identifier -- confirm the intent.
local namespace = token(l.OPERATOR, ':') * token('namespace', identifier)

-- Elements.
-- The start of a tag: '<', an optional '/', then a name.
local tag_open = P('<') * P('/')^-1
local element = token('element', tag_open * identifier) * namespace^-1

-- Attributes.
-- A name is an attribute only when an '=' follows it, optionally after
-- whitespace; the lookahead leaves the '=' unconsumed.
local followed_by_equals = #(l.space^0 * P('='))
local attribute = token('attribute', identifier) * namespace^-1 *
                  followed_by_equals

-- Closing tags.
-- The end of a tag, '>' or '/>', reuses the 'element' token name.
local close_tag = token('element', P('/')^-1 * P('>'))

-- Equals.
-- An '=' is an operator token only where `in_tag` accepts the position
-- right after it.
local equals = token(l.OPERATOR, '=') * in_tag

-- Entities.
-- Recognizes the five predefined entity references (&lt; &gt; &amp; &apos;
-- &quot;) as well as numeric character references, in decimal (&#169;) or
-- hexadecimal (&#xA9;) form. All are emitted as 'entity' tokens.
local char_ref = P('#') * (P('x') * R('09', 'af', 'AF')^1 + R('09')^1)
local entity = token('entity', '&' * (word_match{
  'lt', 'gt', 'amp', 'apos', 'quot'
} + char_ref) * ';')

-- Doctypes and other markup tags.
-- '<!DOCTYPE', whitespace, a name, optionally one more whitespace-separated
-- name, then everything up to and including the first '>'.
local doctype_start = token('doctype', P('<!DOCTYPE'))
local doctype_name = token('doctype', identifier)
local doctype_rest = (ws * identifier)^-1 * (P(1) - P('>'))^0
local doctype = doctype_start * ws * doctype_name * doctype_rest *
                token('doctype', '>')

-- Processing instructions.
-- '<?' followed by everything up to and including the first '?>'; the
-- terminator is optional, so an unterminated instruction still matches.
local pi_end = P('?>')
local proc_insn = token('proc_insn', P('<?') * (P(1) - pi_end)^0 * pi_end^-1)

-- Rule list: each entry pairs a rule name with one of the patterns defined
-- above.
-- NOTE(review): presumably the lexer framework tries the rules in the order
-- listed -- confirm. If so, 'proc_insn' has to stay ahead of 'element':
-- `identifier` accepts '?', so `element` would otherwise also match '<?'.
M._rules = {
  {'whitespace', ws},
  {'comment', comment},
  {'cdata', cdata},
  {'doctype', doctype},
  {'proc_insn', proc_insn},
  {'element', element},
  {'close_tag', close_tag},
  {'attribute', attribute},
  {'equals', equals},
  {'string', string},
  {'number', number},
  {'entity', entity},
}

-- Styles for the token names this lexer introduces ('element', 'namespace',
-- 'attribute', 'cdata', 'entity', 'doctype', 'proc_insn'), each mapped onto a
-- STYLE_* value taken from the `lexer` module.
M._tokenstyles = {
  element = l.STYLE_KEYWORD,
  namespace = l.STYLE_CLASS,
  attribute = l.STYLE_TYPE,
  cdata = l.STYLE_COMMENT,
  entity = l.STYLE_OPERATOR,
  doctype = l.STYLE_COMMENT,
  proc_insn = l.STYLE_COMMENT,
  -- No pattern in this file emits a 'markup' token, so its style is disabled.
  --markup = l.STYLE_COMMENT
}

-- Fold symbols.
-- `_patterns` lists Lua patterns (with magic characters escaped by '%') for
-- the delimiters of tags, comments and CDATA sections. The remaining entries
-- are keyed by token name and map a matched delimiter to +1 or -1.
-- NOTE(review): presumably +1 opens a fold level and -1 closes one, as
-- interpreted by the lexer framework -- confirm.
M._foldsymbols = {
  _patterns = {'</?', '/>', '<!%-%-', '%-%->', '<!%[CDATA%[', '%]%]>'},
  -- Both the tag start and the tag end are emitted as 'element' tokens, so
  -- '<', '</' and '/>' are all looked up under this one key.
  element = {['<'] = 1, ['/>'] = -1, ['</'] = -1},
  [l.COMMENT] = {['<!--'] = 1, ['-->'] = -1},
  cdata = {['<![CDATA['] = 1, [']]>'] = -1}
}

-- Hand the lexer definition back to whoever loaded this file.
return M