diff options
| author | Marc André Tanner <mat@brain-dump.org> | 2016-10-05 10:41:41 +0200 |
|---|---|---|
| committer | Marc André Tanner <mat@brain-dump.org> | 2016-10-05 10:49:32 +0200 |
| commit | 6c38a7f19107552c58a9e168a7e114216c5b53c8 (patch) | |
| tree | b31911dbaff41fa0661b2fad291a3efa271c11c2 | |
| parent | 6cf931355b264618879b5bb3ada36067bf246882 (diff) | |
| download | vis-6c38a7f19107552c58a9e168a7e114216c5b53c8.tar.gz vis-6c38a7f19107552c58a9e168a7e114216c5b53c8.tar.xz | |
lexers: reduce changes to scintilla core lexing code
Based upon scintillua rev 568 id 55b15760cd31.
| -rw-r--r-- | lexers/lexer.lua | 121 |
1 file changed, 82 insertions, 39 deletions
diff --git a/lexers/lexer.lua b/lexers/lexer.lua index 2b2b6e1..bce9c29 100644 --- a/lexers/lexer.lua +++ b/lexers/lexer.lua @@ -58,8 +58,8 @@ local M = {} -- lower case followed by a *.lua* extension. For example, a new Lua lexer has -- the name *lua.lua*. -- --- Note: Try to refrain from using one-character language names like "b", "c", --- or "d". For example, Scintillua uses "b_lang", "cpp", and "dmd", +-- Note: Try to refrain from using one-character language names like "c", "d", +-- or "r". For example, Scintillua uses "ansi_c", "dmd", and "rstats", -- respectively. -- -- ### New Lexer Template @@ -89,7 +89,7 @@ local M = {} -- -- return M -- --- The first 4 lines of code simply define often used convenience variables. The +-- The first 3 lines of code simply define often used convenience variables. The -- 5th and last lines define and return the lexer object Scintilla uses; they -- are very important and must be part of every lexer. The sixth line defines -- something called a "token", an essential building block of lexers. You will @@ -338,6 +338,7 @@ local M = {} -- font:_name_ | The name of the font the style uses. -- size:_int_ | The size of the font the style uses. -- [not]bold | Whether or not the font face is bold. +-- weight:_int_ | The weight or boldness of a font, between 1 and 999. -- [not]italics | Whether or not the font face is italic. -- [not]underlined| Whether or not the font face is underlined. -- fore:_color_ | The foreground color of the font face. @@ -516,6 +517,20 @@ local M = {} -- local php_end_rule = token('php_tag', '?>') -- l.embed_lexer(html, M, php_start_rule, php_end_rule) -- +-- ### Lexers with Complex State +-- +-- A vast majority of lexers are not stateful and can operate on any chunk of +-- text in a document. However, there may be rare cases where a lexer does need +-- to keep track of some sort of persistent state. 
Rather than using `lpeg.P` +-- function patterns that set state variables, it is recommended to make use of +-- Scintilla's built-in, per-line state integers via [`lexer.line_state`](). It +-- was designed to accommodate up to 32 bit flags for tracking state. +-- [`lexer.line_from_position()`]() will return the line for any position given +-- to an `lpeg.P` function pattern. (Any positions derived from that position +-- argument will also work.) +-- +-- Writing stateful lexers is beyond the scope of this document. +-- -- ## Code Folding -- -- When reading source code, it is occasionally helpful to temporarily hide @@ -841,6 +856,9 @@ local M = {} -- @field indent_amount (table, Read-only) -- Table of indentation amounts in character columns, for line numbers -- starting from zero. +-- @field line_state (table) +-- Table of integer line states for line numbers starting from zero. +-- Line states can be used by lexers for keeping track of persistent states. -- @field property (table) -- Map of key-value string pairs. -- @field property_expanded (table, Read-only) @@ -850,7 +868,7 @@ local M = {} -- Map of key-value pairs with values interpreted as numbers, or `0` if not -- found. -- @field style_at (table, Read-only) --- Table of style names at positions in the buffer starting from zero. +-- Table of style names at positions in the buffer starting from 1. 
module('lexer')]=] lpeg = require('lpeg') @@ -931,7 +949,8 @@ end local function add_lexer(grammar, lexer, token_rule) local token_rule = join_tokens(lexer) local lexer_name = lexer._NAME - for _, child in ipairs(lexer._CHILDREN) do + for i = 1, #lexer._CHILDREN do + local child = lexer._CHILDREN[i] if child._CHILDREN then add_lexer(grammar, child) end local child_name = child._NAME local rules = child._EMBEDDEDRULES[lexer_name] @@ -973,10 +992,11 @@ local default = { 'identifier', 'operator', 'error', 'preprocessor', 'constant', 'variable', 'function', 'class', 'type', 'label', 'regex', 'embedded' } -for _, v in ipairs(default) do - M[string_upper(v)] = v - if not M['STYLE_'..string_upper(v)] then - M['STYLE_'..string_upper(v)] = '' +for i = 1, #default do + local name, upper_name = default[i], string_upper(default[i]) + M[upper_name] = name + if not M['STYLE_'..upper_name] then + M['STYLE_'..upper_name] = '' end end -- Predefined styles. @@ -984,18 +1004,20 @@ local predefined = { 'default', 'linenumber', 'bracelight', 'bracebad', 'controlchar', 'indentguide', 'calltip' } -for _, v in ipairs(predefined) do - M[string_upper(v)] = v - if not M['STYLE_'..string_upper(v)] then - M['STYLE_'..string_upper(v)] = '' +for i = 1, #predefined do + local name, upper_name = predefined[i], string_upper(predefined[i]) + M[upper_name] = name + if not M['STYLE_'..upper_name] then + M['STYLE_'..upper_name] = '' end end --- -- Initializes or loads and returns the lexer of string name *name*. --- Scintilla calls this function to load a lexer. Parent lexers also call this --- function to load child lexers and vice-versa. The user calls this function --- to load a lexer when using Scintillua as a Lua library. +-- Scintilla calls this function in order to load a lexer. Parent lexers also +-- call this function in order to load child lexers and vice-versa. The user +-- calls this function in order to load a lexer when using Scintillua as a Lua +-- library. 
-- @param name The name of the lexing language. -- @param alt_name The alternate name of the lexing language. This is useful for -- embedding the same child lexer with multiple sets of start and end tokens. @@ -1042,20 +1064,28 @@ function M.load(name, alt_name) if lexer._lexer then local l, _r, _s = lexer._lexer, lexer._rules, lexer._tokenstyles if not l._tokenstyles then l._tokenstyles = {} end - for _, r in ipairs(_r or {}) do - -- Prevent rule id clashes. - l._rules[#l._rules + 1] = {lexer._NAME..'_'..r[1], r[2]} + if _r then + for i = 1, #_r do + -- Prevent rule id clashes. + l._rules[#l._rules + 1] = {lexer._NAME..'_'.._r[i][1], _r[i][2]} + end + end + if _s then + for token, style in pairs(_s) do l._tokenstyles[token] = style end end - for token, style in pairs(_s or {}) do l._tokenstyles[token] = style end lexer = l end -- Add the lexer's styles and build its grammar. if lexer._rules then - for token, style in pairs(lexer._tokenstyles or {}) do - add_style(lexer, token, style) + if lexer._tokenstyles then + for token, style in pairs(lexer._tokenstyles) do + add_style(lexer, token, style) + end + end + for i = 1, #lexer._rules do + add_rule(lexer, lexer._rules[i][1], lexer._rules[i][2]) end - for _, r in ipairs(lexer._rules) do add_rule(lexer, r[1], r[2]) end build_grammar(lexer) end -- Add the lexer's unique whitespace style. @@ -1123,15 +1153,16 @@ function M.lex(lexer, text, init_style) end --- --- Folds a chunk of text *text* with lexer *lexer*. --- Folds *text* starting at position *start_pos* on line number *start_line* --- with a beginning fold level of *start_level* in the buffer. If *lexer* has a --- a `_fold` function or a `_foldsymbols` table, that field is used to perform --- folding. Otherwise, if *lexer* has a `_FOLDBYINDENTATION` field set, or if a +-- Determines fold points in a chunk of text *text* with lexer *lexer*. 
+-- *text* starts at position *start_pos* on line number *start_line* with a +-- beginning fold level of *start_level* in the buffer. If *lexer* has a `_fold` +-- function or a `_foldsymbols` table, that field is used to perform folding. +-- Otherwise, if *lexer* has a `_FOLDBYINDENTATION` field set, or if a -- `fold.by.indentation` property is set, folding by indentation is done. -- @param lexer The lexer object to fold with. -- @param text The text in the buffer to fold. --- @param start_pos The position in the buffer *text* starts at. +-- @param start_pos The position in the buffer *text* starts at, starting at +-- zero. -- @param start_line The line number *text* starts on. -- @param start_level The fold level *text* starts on. -- @return table of fold levels. @@ -1282,9 +1313,10 @@ M.hex_num = '0' * lpeg_S('xX') * M.xdigit^1 M.oct_num = '0' * lpeg_R('07')^1 M.integer = lpeg_S('+-')^-1 * (M.hex_num + M.oct_num + M.dec_num) M.float = lpeg_S('+-')^-1 * - (M.digit^0 * '.' * M.digit^1 + M.digit^1 * '.' * M.digit^0 + - M.digit^1) * - lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1 + ((M.digit^0 * '.' * M.digit^1 + M.digit^1 * '.' * M.digit^0) * + (lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1)^-1 + + (M.digit^1 * lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1)) + M.word = (M.alpha + '_') * (M.alnum + '_')^0 --- @@ -1410,7 +1442,7 @@ end -- @param word_chars Optional string of additional characters considered to be -- part of a word. By default, word characters are alphanumerics and -- underscores ("%w_" in Lua). This parameter may be `nil` or the empty string --- to indicate no additional word characters. +-- in order to indicate no additional word characters. -- @param case_insensitive Optional boolean flag indicating whether or not the -- word match is case-insensitive. The default is `false`. 
-- @return pattern @@ -1420,8 +1452,8 @@ end -- @name word_match function M.word_match(words, word_chars, case_insensitive) local word_list = {} - for _, word in ipairs(words) do - word_list[case_insensitive and word:lower() or word] = true + for i = 1, #words do + word_list[case_insensitive and words[i]:lower() or words[i]] = true end local chars = M.alnum + '_' if word_chars then chars = chars + lpeg_S(word_chars) end @@ -1449,7 +1481,9 @@ function M.embed_lexer(parent, child, start_rule, end_rule) if not child._EMBEDDEDRULES then child._EMBEDDEDRULES = {} end if not child._RULES then -- creating a child lexer to be embedded if not child._rules then error('Cannot embed language with no rules') end - for _, r in ipairs(child._rules) do add_rule(child, r[1], r[2]) end + for i = 1, #child._rules do + add_rule(child, child._rules[i][1], child._rules[i][2]) + end end child._EMBEDDEDRULES[parent._NAME] = { ['start_rule'] = start_rule, @@ -1463,8 +1497,10 @@ function M.embed_lexer(parent, child, start_rule, end_rule) if not parent._tokenstyles then parent._tokenstyles = {} end local tokenstyles = parent._tokenstyles tokenstyles[child._NAME..'_whitespace'] = M.STYLE_WHITESPACE - for token, style in pairs(child._tokenstyles or {}) do - tokenstyles[token] = style + if child._tokenstyles then + for token, style in pairs(child._tokenstyles) do + tokenstyles[token] = style + end end child._lexer = parent -- use parent's tokens if child is embedding itself parent_lexer = parent -- use parent's tokens if the calling lexer is a proxy @@ -1544,11 +1580,18 @@ M.property_expanded = setmetatable({}, { --[[ The functions and fields below were defined in C. --- +-- Returns the line number of the line that contains position *pos*, which +-- starts from 1. +-- @param pos The position to get the line number of. +-- @return number +local function line_from_position(pos) end + +--- -- Individual fields for a lexer instance. -- @field _NAME The string name of the lexer. 
-- @field _rules An ordered list of rules for a lexer grammar. -- Each rule is a table containing an arbitrary rule name and the LPeg pattern --- associated with the rule. The order of rules is important as rules are +-- associated with the rule. The order of rules is important, as rules are -- matched sequentially. -- Child lexers should not use this table to access and/or modify their -- parent's rules and vice-versa. Use the `_RULES` table instead. |
