aboutsummaryrefslogtreecommitdiff
path: root/lua/lexers/html.lua
diff options
context:
space:
mode:
authorqiu-x <alex@alexslomka.xyz>2022-06-29 07:56:51 +0200
committerFelix Van der Jeugt <felix.vanderjeugt@posteo.net>2022-11-29 21:57:18 +0100
commit8a420ecc4c1ed50111464ec66901bd983eaf2dbd (patch)
treef31d2186cafaee6e7f18d32fe99144c3e8148c00 /lua/lexers/html.lua
parent981b90a203484182feace48471fe2b53dae7676f (diff)
downloadvis-8a420ecc4c1ed50111464ec66901bd983eaf2dbd.tar.gz
vis-8a420ecc4c1ed50111464ec66901bd983eaf2dbd.tar.xz
Resync the lexers with Scintillua
- Resync the lexers with Scintillua - Update the lexer readme - Update `zenburn` theme to fix some highlighting issues - lexers: redirect print function to vis:info() - Fix support for custom style names - As per error message "lexer.delimited_range() is deprecated, use lexer.range()". - Remove remaining `lexer.delimited_range()` call - Set syntax to `nil` if the file type has no matching lexer - Updated Go lexer for Go 1.18. - lexers/dsv: convert to new lexer format (cherry picked from commit 9edbc3cd9ea1d7142b1305840432a3d2739e755a) - lexers/gemini: disable legacy gemini lexer This reverts commit 468f9ee1b027a7ce98b1a249fa1af5888feeb989. It is in legacy format and of questionable quality. Ideally it should be contributed upstream from where it will eventually trickle down to us. - lexers/git-rebase: convert to new lexer format (cherry picked from commit 4000a4cc9ac4a4c2869dfae772b977a82aee8d8c) - lexers/strace: convert to new lexer format (cherry picked from commit e420451320d97eb164f5629c1bcfab0b595be29d) - lexers/typescript: add new upstream lexer revision 28e2b60 (cherry picked from commit 7326e6deecdaa75fa94ae9ebdb653f9f907b33f2) - use `package.searchpath` instead of a local `searchpath` function - Restore `filetype: support filetype detection via hashbang` - Remove redundant comment - Restore gemini lexer
Diffstat (limited to 'lua/lexers/html.lua')
-rw-r--r--lua/lexers/html.lua270
1 file changed, 128 insertions, 142 deletions
diff --git a/lua/lexers/html.lua b/lua/lexers/html.lua
index ba6e3e2..0cb3c2f 100644
--- a/lua/lexers/html.lua
+++ b/lua/lexers/html.lua
@@ -1,162 +1,148 @@
--- Copyright 2006-2017 Mitchell mitchell.att.foicica.com. See LICENSE.
+-- Copyright 2006-2022 Mitchell. See LICENSE.
-- HTML LPeg lexer.
-local l = require('lexer')
-local token, word_match = l.token, l.word_match
-local P, R, S, V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
+local lexer = require('lexer')
+local token, word_match = lexer.token, lexer.word_match
+local P, S = lpeg.P, lpeg.S
-local M = {_NAME = 'html'}
-
-case_insensitive_tags = true
+local lex = lexer.new('html')
-- Whitespace.
-local ws = token(l.WHITESPACE, l.space^1)
+local ws = token(lexer.WHITESPACE, lexer.space^1)
+lex:add_rule('whitespace', ws)
-- Comments.
-local comment = token(l.COMMENT, '<!--' * (l.any - '-->')^0 * P('-->')^-1)
-
--- Strings.
-local sq_str = l.delimited_range("'")
-local dq_str = l.delimited_range('"')
-local string = #S('\'"') * l.last_char_includes('=') *
- token(l.STRING, sq_str + dq_str)
+lex:add_rule('comment', token(lexer.COMMENT, lexer.range('<!--', '-->')))
-local in_tag = #P((1 - S'><')^0 * '>')
-
--- Numbers.
-local number = #l.digit * l.last_char_includes('=') *
- token(l.NUMBER, l.digit^1 * P('%')^-1) * in_tag
+-- Doctype.
+lex:add_rule('doctype', token('doctype', lexer.range('<!' * word_match('doctype', true), '>')))
+lex:add_style('doctype', lexer.styles.comment)
-- Elements.
-local known_element = token('element', '<' * P('/')^-1 * word_match({
- 'a', 'abbr', 'address', 'area', 'article', 'aside', 'audio', 'b', 'base',
- 'bdi', 'bdo', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption',
- 'cite', 'code', 'col', 'colgroup', 'content', 'data', 'datalist', 'dd',
- 'decorator', 'del', 'details', 'dfn', 'div', 'dl', 'dt', 'element', 'em',
- 'embed', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2',
- 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hr', 'html', 'i', 'iframe', 'img',
- 'input', 'ins', 'kbd', 'keygen', 'label', 'legend', 'li', 'link', 'main',
- 'map', 'mark', 'menu', 'menuitem', 'meta', 'meter', 'nav', 'noscript',
- 'object', 'ol', 'optgroup', 'option', 'output', 'p', 'param', 'pre',
- 'progress', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'script', 'section',
- 'select', 'shadow', 'small', 'source', 'spacer', 'spacer', 'span', 'strong',
- 'style', 'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'template',
- 'textarea', 'tfoot', 'th', 'thead', 'time', 'title', 'tr', 'track', 'u', 'ul',
- 'var', 'video', 'wbr'
-}, nil, case_insensitive_tags))
-local unknown_element = token('unknown_element', '<' * P('/')^-1 * l.word)
-local element = known_element + unknown_element
-
--- Attributes.
-local known_attribute = token('attribute', word_match({
- 'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'async',
- 'autocomplete', 'autofocus', 'autoplay', 'bgcolor', 'border', 'buffered',
- 'challenge', 'charset', 'checked', 'cite', 'class', 'code', 'codebase',
- 'color', 'cols', 'colspan', 'content', 'contenteditable', 'contextmenu',
- 'controls', 'coords', 'data', 'data-', 'datetime', 'default', 'defer', 'dir',
- 'dirname', 'disabled', 'download', 'draggable', 'dropzone', 'enctype', 'for',
- 'form', 'headers', 'height', 'hidden', 'high', 'href', 'hreflang',
- 'http-equiv', 'icon', 'id', 'ismap', 'itemprop', 'keytype', 'kind', 'label',
- 'lang', 'language', 'list', 'loop', 'low', 'manifest', 'max', 'maxlength',
- 'media', 'method', 'min', 'multiple', 'name', 'novalidate', 'open', 'optimum',
- 'pattern', 'ping', 'placeholder', 'poster', 'preload', 'pubdate',
- 'radiogroup', 'readonly', 'rel', 'required', 'reversed', 'role', 'rows',
- 'rowspan', 'sandbox', 'spellcheck', 'scope', 'scoped', 'seamless', 'selected',
- 'shape', 'size', 'sizes', 'span', 'src', 'srcdoc', 'srclang', 'start',
- 'step', 'style', 'summary', 'tabindex', 'target', 'title', 'type', 'usemap',
- 'value', 'width', 'wrap'
-}, '-', case_insensitive_tags) + ((P('data-') + 'aria-') * (l.alnum + '-')^1))
-local unknown_attribute = token('unknown_attribute', l.word)
-local attribute = (known_attribute + unknown_attribute) * #(l.space^0 * '=')
+local single_element = token('single_element', '<' * P('/')^-1 * word_match(
+ {
+ 'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta',
+ 'param', 'source', 'track', 'wbr'
+ }, true))
+local paired_element = token('element', '<' * P('/')^-1 * word_match({
+ 'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b', 'bdi', 'bdo', 'blockquote', 'body',
+ 'button', 'canvas', 'caption', 'cite', 'code', 'colgroup', 'content', 'data', 'datalist', 'dd',
+ 'decorator', 'del', 'details', 'dfn', 'div', 'dl', 'dt', 'element', 'em', 'fieldset',
+ 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header',
+ 'html', 'i', 'iframe', 'ins', 'kbd', 'label', 'legend', 'li', 'main', 'map', 'mark', 'menu',
+ 'menuitem', 'meter', 'nav', 'noscript', 'object', 'ol', 'optgroup', 'option', 'output', 'p',
+ 'pre', 'progress', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'script', 'section', 'select', 'shadow',
+ 'small', 'spacer', 'span', 'strong', 'style', 'sub', 'summary', 'sup', 'table', 'tbody', 'td',
+ 'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'title', 'tr', 'u', 'ul', 'var', 'video'
+}, true))
+local known_element = single_element + paired_element
+local unknown_element = token('unknown_element', '<' * P('/')^-1 * (lexer.alnum + '-')^1)
+local element = (known_element + unknown_element) * -P(':')
+lex:add_rule('element', element)
+lex:add_style('single_element', lexer.styles.keyword)
+lex:add_style('element', lexer.styles.keyword)
+lex:add_style('unknown_element', lexer.styles.keyword .. {italics = true})
-- Closing tags.
local tag_close = token('element', P('/')^-1 * '>')
+lex:add_rule('tag_close', tag_close)
+
+-- Attributes.
+local known_attribute = token('attribute', word_match({
+ 'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'async', 'autocomplete',
+ 'autofocus', 'autoplay', 'bgcolor', 'border', 'buffered', 'challenge', 'charset', 'checked',
+ 'cite', 'class', 'code', 'codebase', 'color', 'cols', 'colspan', 'content', 'contenteditable',
+ 'contextmenu', 'controls', 'coords', 'data', 'data-', 'datetime', 'default', 'defer', 'dir',
+ 'dirname', 'disabled', 'download', 'draggable', 'dropzone', 'enctype', 'for', 'form', 'headers',
+ 'height', 'hidden', 'high', 'href', 'hreflang', 'http-equiv', 'icon', 'id', 'ismap', 'itemprop',
+ 'keytype', 'kind', 'label', 'lang', 'language', 'list', 'loop', 'low', 'manifest', 'max',
+ 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'novalidate', 'open', 'optimum',
+ 'pattern', 'ping', 'placeholder', 'poster', 'preload', 'pubdate', 'radiogroup', 'readonly', 'rel',
+ 'required', 'reversed', 'role', 'rows', 'rowspan', 'sandbox', 'scope', 'scoped', 'seamless',
+ 'selected', 'shape', 'size', 'sizes', 'span', 'spellcheck', 'src', 'srcdoc', 'srclang', 'start',
+  'step', 'style', 'summary', 'tabindex', 'target', 'title', 'type', 'usemap', 'value', 'width',
+ 'wrap'
+}, true) + ((P('data-') + 'aria-') * (lexer.alnum + '-')^1))
+local unknown_attribute = token('unknown_attribute', (lexer.alnum + '-')^1)
+local attribute = (known_attribute + unknown_attribute) * #(lexer.space^0 * '=')
+lex:add_rule('attribute', attribute)
+lex:add_style('attribute', lexer.styles.type)
+lex:add_style('unknown_attribute', lexer.styles.type .. {italics = true})
-- Equals.
-local equals = token(l.OPERATOR, '=') * in_tag
+-- TODO: performance is terrible on large files.
+local in_tag = P(function(input, index)
+ local before = input:sub(1, index - 1)
+ local s, e = before:find('<[^>]-$'), before:find('>[^<]-$')
+ if s and e then return s > e and index or nil end
+ if s then return index end
+ return input:find('^[^<]->', index) and index or nil
+end)
+
+local equals = token(lexer.OPERATOR, '=') -- * in_tag
+-- lex:add_rule('equals', equals)
--- Entities.
-local entity = token('entity', '&' * (l.any - l.space - ';')^1 * ';')
+-- Strings.
+local string = #S('\'"') * lexer.last_char_includes('=') *
+ token(lexer.STRING, lexer.range("'") + lexer.range('"'))
+lex:add_rule('string', string)
--- Doctype.
-local doctype = token('doctype', '<!' *
- word_match({'doctype'}, nil, case_insensitive_tags) *
- (l.any - '>')^1 * '>')
-
-M._rules = {
- {'whitespace', ws},
- {'comment', comment},
- {'doctype', doctype},
- {'element', element},
- {'tag_close', tag_close},
- {'attribute', attribute},
--- {'equals', equals},
- {'string', string},
- {'number', number},
- {'entity', entity},
-}
-
-M._tokenstyles = {
- element = l.STYLE_KEYWORD,
- unknown_element = l.STYLE_KEYWORD..',italics',
- attribute = l.STYLE_TYPE,
- unknown_attribute = l.STYLE_TYPE..',italics',
- entity = l.STYLE_OPERATOR,
- doctype = l.STYLE_COMMENT
-}
-
-M._foldsymbols = {
- _patterns = {'</?', '/>', '<!%-%-', '%-%->'},
- element = {['<'] = 1, ['/>'] = -1, ['</'] = -1},
- unknown_element = {['<'] = 1, ['/>'] = -1, ['</'] = -1},
- [l.COMMENT] = {['<!--'] = 1, ['-->'] = -1}
-}
+-- Numbers.
+local number = token(lexer.NUMBER, lexer.dec_num * P('%')^-1)
+lex:add_rule('number', #lexer.digit * lexer.last_char_includes('=') * number) -- *in_tag)
--- Tags that start embedded languages.
-M.embed_start_tag = element *
- (ws^1 * attribute * ws^0 * equals * ws^0 * string)^0 *
- ws^0 * tag_close
-M.embed_end_tag = element * tag_close
-
--- Embedded CSS.
-local css = l.load('css')
-local style_element = word_match({'style'}, nil, case_insensitive_tags)
-local css_start_rule = #(P('<') * style_element *
- ('>' + P(function(input, index)
- if input:find('^%s+type%s*=%s*(["\'])text/css%1', index) then
- return index
- end
-end))) * M.embed_start_tag -- <style type="text/css">
-local css_end_rule = #('</' * style_element * ws^0 * '>') *
- M.embed_end_tag -- </style>
-l.embed_lexer(M, css, css_start_rule, css_end_rule)
-
--- Embedded JavaScript.
-local js = l.load('javascript')
-local script_element = word_match({'script'}, nil, case_insensitive_tags)
-local js_start_rule = #(P('<') * script_element *
- ('>' + P(function(input, index)
- if input:find('^%s+type%s*=%s*(["\'])text/javascript%1', index) then
- return index
- end
-end))) * M.embed_start_tag -- <script type="text/javascript">
-local js_end_rule = #('</' * script_element * ws^0 * '>') *
- M.embed_end_tag -- </script>
-local js_line_comment = '//' * (l.nonnewline_esc - js_end_rule)^0
-local js_block_comment = '/*' * (l.any - '*/' - js_end_rule)^0 * P('*/')^-1
-js._RULES['comment'] = token(l.COMMENT, js_line_comment + js_block_comment)
-l.embed_lexer(M, js, js_start_rule, js_end_rule)
-
--- Embedded CoffeeScript.
-local cs = l.load('coffeescript')
-local script_element = word_match({'script'}, nil, case_insensitive_tags)
-local cs_start_rule = #(P('<') * script_element * P(function(input, index)
- if input:find('^[^>]+type%s*=%s*(["\'])text/coffeescript%1', index) then
- return index
+-- Entities.
+lex:add_rule('entity', token('entity', '&' * (lexer.any - lexer.space - ';')^1 * ';'))
+lex:add_style('entity', lexer.styles.comment)
+
+-- Fold points.
+local function disambiguate_lt(text, pos, line, s)
+ if line:find('/>', s) then
+ return 0
+ elseif line:find('^</', s) then
+ return -1
+ else
+ return 1
end
-end)) * M.embed_start_tag -- <script type="text/coffeescript">
-local cs_end_rule = #('</' * script_element * ws^0 * '>') *
- M.embed_end_tag -- </script>
-l.embed_lexer(M, cs, cs_start_rule, cs_end_rule)
+end
+lex:add_fold_point('element', '<', disambiguate_lt)
+lex:add_fold_point('unknown_element', '<', disambiguate_lt)
+lex:add_fold_point(lexer.COMMENT, '<!--', '-->')
-return M
+-- Tags that start embedded languages.
+-- Export these patterns for proxy lexers (e.g. ASP) that need them.
+lex.embed_start_tag = element * (ws * attribute * ws^-1 * equals * ws^-1 * string)^0 * ws^-1 *
+ tag_close
+lex.embed_end_tag = element * tag_close
+
+-- Embedded CSS (<style type="text/css"> ... </style>).
+local css = lexer.load('css')
+local style_element = word_match('style', true)
+local css_start_rule = #('<' * style_element * ('>' + P(function(input, index)
+ if input:find('^%s+type%s*=%s*(["\'])text/css%1', index) then return index end
+end))) * lex.embed_start_tag
+local css_end_rule = #('</' * style_element * ws^-1 * '>') * lex.embed_end_tag
+lex:embed(css, css_start_rule, css_end_rule)
+
+-- Embedded JavaScript (<script type="text/javascript"> ... </script>).
+local js = lexer.load('javascript')
+local script_element = word_match('script', true)
+local js_start_rule = #('<' * script_element * ('>' + P(function(input, index)
+ if input:find('^%s+type%s*=%s*(["\'])text/javascript%1', index) then return index end
+end))) * lex.embed_start_tag
+local js_end_rule = #('</' * script_element * ws^-1 * '>') * lex.embed_end_tag
+local js_line_comment = '//' * (lexer.nonnewline - js_end_rule)^0
+local js_block_comment = '/*' * (lexer.any - '*/' - js_end_rule)^0 * P('*/')^-1
+js:modify_rule('comment', token(lexer.COMMENT, js_line_comment + js_block_comment))
+lex:embed(js, js_start_rule, js_end_rule)
+
+-- Embedded CoffeeScript (<script type="text/coffeescript"> ... </script>).
+local cs = lexer.load('coffeescript')
+script_element = word_match('script', true)
+local cs_start_rule = #('<' * script_element * P(function(input, index)
+ if input:find('^[^>]+type%s*=%s*(["\'])text/coffeescript%1', index) then return index end
+end)) * lex.embed_start_tag
+local cs_end_rule = #('</' * script_element * ws^-1 * '>') * lex.embed_end_tag
+lex:embed(cs, cs_start_rule, cs_end_rule)
+
+return lex