lexers: reduce changes to scintilla core lexing code

Based upon scintillua rev 568 id 55b15760cd31.
author: Marc André Tanner <mat@brain-dump.org> 2016-10-05 10:41:41 +0200
committer: Marc André Tanner <mat@brain-dump.org> 2016-10-05 10:49:32 +0200
commit: 6c38a7f19107552c58a9e168a7e114216c5b53c8 (patch)
tree: b31911dbaff41fa0661b2fad291a3efa271c11c2
parent: 6cf931355b264618879b5bb3ada36067bf246882 (diff)
download: vis-6c38a7f19107552c58a9e168a7e114216c5b53c8.tar.gz
vis-6c38a7f19107552c58a9e168a7e114216c5b53c8.tar.xz
1 files changed, 82 insertions, 39 deletions
diff --git a/lexers/lexer.lua b/lexers/lexer.lua
index 2b2b6e1..bce9c29 100644
--- a/lexers/lexer.lua
+++ b/lexers/lexer.lua
@@ -58,8 +58,8 @@ local M = {}
 -- lower case followed by a *.lua* extension. For example, a new Lua lexer has
 -- the name *lua.lua*.
 --
--- Note: Try to refrain from using one-character language names like "b", "c",
--- or "d". For example, Scintillua uses "b_lang", "cpp", and "dmd",
+-- Note: Try to refrain from using one-character language names like "c", "d",
+-- or "r". For example, Scintillua uses "ansi_c", "dmd", and "rstats",
 -- respectively.
 --
 -- ### New Lexer Template
@@ -89,7 +89,7 @@ local M = {}
 --
 --     return M
 --
--- The first 4 lines of code simply define often used convenience variables. The
+-- The first 3 lines of code simply define often used convenience variables. The
 -- 5th and last lines define and return the lexer object Scintilla uses; they
 -- are very important and must be part of every lexer. The sixth line defines
 -- something called a "token", an essential building block of lexers. You will
@@ -338,6 +338,7 @@ local M = {}
 -- font:_name_    | The name of the font the style uses.
 -- size:_int_     | The size of the font the style uses.
 -- [not]bold      | Whether or not the font face is bold.
+-- weight:_int_   | The weight or boldness of a font, between 1 and 999.
 -- [not]italics   | Whether or not the font face is italic.
 -- [not]underlined| Whether or not the font face is underlined.
 -- fore:_color_   | The foreground color of the font face.
@@ -516,6 +517,20 @@ local M = {}
 --     local php_end_rule = token('php_tag', '?>')
 --     l.embed_lexer(html, M, php_start_rule, php_end_rule)
 --
+-- ### Lexers with Complex State
+--
+-- A vast majority of lexers are not stateful and can operate on any chunk of
+-- text in a document. However, there may be rare cases where a lexer does need
+-- to keep track of some sort of persistent state. Rather than using `lpeg.P`
+-- function patterns that set state variables, it is recommended to make use of
+-- Scintilla's built-in, per-line state integers via [`lexer.line_state`](). It
+-- was designed to accommodate up to 32 bit flags for tracking state.
+-- [`lexer.line_from_position()`]() will return the line for any position given
+-- to an `lpeg.P` function pattern. (Any positions derived from that position
+-- argument will also work.)
+--
+-- Writing stateful lexers is beyond the scope of this document.
+--
 -- ## Code Folding
 --
 -- When reading source code, it is occasionally helpful to temporarily hide
@@ -841,6 +856,9 @@ local M = {}
 -- @field indent_amount (table, Read-only)
 --   Table of indentation amounts in character columns, for line numbers
 --   starting from zero.
+-- @field line_state (table)
+--   Table of integer line states for line numbers starting from zero.
+--   Line states can be used by lexers for keeping track of persistent states.
 -- @field property (table)
 --   Map of key-value string pairs.
 -- @field property_expanded (table, Read-only)
@@ -850,7 +868,7 @@ local M = {}
 --   Map of key-value pairs with values interpreted as numbers, or `0` if not
 --   found.
 -- @field style_at (table, Read-only)
---   Table of style names at positions in the buffer starting from zero.
+--   Table of style names at positions in the buffer starting from 1.
 module('lexer')]=]
 
 lpeg = require('lpeg')
@@ -931,7 +949,8 @@ end
 local function add_lexer(grammar, lexer, token_rule)
   local token_rule = join_tokens(lexer)
   local lexer_name = lexer._NAME
-  for _, child in ipairs(lexer._CHILDREN) do
+  for i = 1, #lexer._CHILDREN do
+    local child = lexer._CHILDREN[i]
     if child._CHILDREN then add_lexer(grammar, child) end
     local child_name = child._NAME
     local rules = child._EMBEDDEDRULES[lexer_name]
@@ -973,10 +992,11 @@ local default = {
   'identifier', 'operator', 'error', 'preprocessor', 'constant', 'variable',
   'function', 'class', 'type', 'label', 'regex', 'embedded'
 }
-for _, v in ipairs(default) do
-  M[string_upper(v)] = v
-  if not M['STYLE_'..string_upper(v)] then
-    M['STYLE_'..string_upper(v)] = ''
+for i = 1, #default do
+  local name, upper_name = default[i], string_upper(default[i])
+  M[upper_name] = name
+  if not M['STYLE_'..upper_name] then
+    M['STYLE_'..upper_name] = ''
   end
 end
 -- Predefined styles.
@@ -984,18 +1004,20 @@ local predefined = {
   'default', 'linenumber', 'bracelight', 'bracebad', 'controlchar',
   'indentguide', 'calltip'
 }
-for _, v in ipairs(predefined) do
-  M[string_upper(v)] = v
-  if not M['STYLE_'..string_upper(v)] then
-    M['STYLE_'..string_upper(v)] = ''
+for i = 1, #predefined do
+  local name, upper_name = predefined[i], string_upper(predefined[i])
+  M[upper_name] = name
+  if not M['STYLE_'..upper_name] then
+    M['STYLE_'..upper_name] = ''
   end
 end
 
 ---
 -- Initializes or loads and returns the lexer of string name *name*.
--- Scintilla calls this function to load a lexer. Parent lexers also call this
--- function to load child lexers and vice-versa. The user calls this function
--- to load a lexer when using Scintillua as a Lua library.
+-- Scintilla calls this function in order to load a lexer. Parent lexers also
+-- call this function in order to load child lexers and vice-versa. The user
+-- calls this function in order to load a lexer when using Scintillua as a Lua
+-- library.
 -- @param name The name of the lexing language.
 -- @param alt_name The alternate name of the lexing language. This is useful for
 --   embedding the same child lexer with multiple sets of start and end tokens.
@@ -1042,20 +1064,28 @@ function M.load(name, alt_name)
   if lexer._lexer then
     local l, _r, _s = lexer._lexer, lexer._rules, lexer._tokenstyles
     if not l._tokenstyles then l._tokenstyles = {} end
-    for _, r in ipairs(_r or {}) do
-      -- Prevent rule id clashes.
-      l._rules[#l._rules + 1] = {lexer._NAME..'_'..r[1], r[2]}
+    if _r then
+      for i = 1, #_r do
+        -- Prevent rule id clashes.
+        l._rules[#l._rules + 1] = {lexer._NAME..'_'.._r[i][1], _r[i][2]}
+      end
+    end
+    if _s then
+      for token, style in pairs(_s) do l._tokenstyles[token] = style end
     end
-    for token, style in pairs(_s or {}) do l._tokenstyles[token] = style end
     lexer = l
   end
 
   -- Add the lexer's styles and build its grammar.
   if lexer._rules then
-    for token, style in pairs(lexer._tokenstyles or {}) do
-      add_style(lexer, token, style)
+    if lexer._tokenstyles then
+      for token, style in pairs(lexer._tokenstyles) do
+        add_style(lexer, token, style)
+      end
+    end
+    for i = 1, #lexer._rules do
+      add_rule(lexer, lexer._rules[i][1], lexer._rules[i][2])
     end
-    for _, r in ipairs(lexer._rules) do add_rule(lexer, r[1], r[2]) end
     build_grammar(lexer)
   end
   -- Add the lexer's unique whitespace style.
@@ -1123,15 +1153,16 @@ function M.lex(lexer, text, init_style)
 end
 
 ---
--- Folds a chunk of text *text* with lexer *lexer*.
--- Folds *text* starting at position *start_pos* on line number *start_line*
--- with a beginning fold level of *start_level* in the buffer. If *lexer* has a
--- a `_fold` function or a `_foldsymbols` table, that field is used to perform
--- folding. Otherwise, if *lexer* has a `_FOLDBYINDENTATION` field set, or if a
+-- Determines fold points in a chunk of text *text* with lexer *lexer*.
+-- *text* starts at position *start_pos* on line number *start_line* with a
+-- beginning fold level of *start_level* in the buffer. If *lexer* has a `_fold`
+-- function or a `_foldsymbols` table, that field is used to perform folding.
+-- Otherwise, if *lexer* has a `_FOLDBYINDENTATION` field set, or if a
 -- `fold.by.indentation` property is set, folding by indentation is done.
 -- @param lexer The lexer object to fold with.
 -- @param text The text in the buffer to fold.
--- @param start_pos The position in the buffer *text* starts at.
+-- @param start_pos The position in the buffer *text* starts at, counting from
+--   zero.
 -- @param start_line The line number *text* starts on.
 -- @param start_level The fold level *text* starts on.
 -- @return table of fold levels.
@@ -1282,9 +1313,10 @@ M.hex_num = '0' * lpeg_S('xX') * M.xdigit^1
 M.oct_num = '0' * lpeg_R('07')^1
 M.integer = lpeg_S('+-')^-1 * (M.hex_num + M.oct_num + M.dec_num)
 M.float = lpeg_S('+-')^-1 *
-          (M.digit^0 * '.' * M.digit^1 + M.digit^1 * '.' * M.digit^0 +
-           M.digit^1) *
-          lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1
+          ((M.digit^0 * '.' * M.digit^1 + M.digit^1 * '.' * M.digit^0) *
+           (lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1)^-1 +
+           (M.digit^1 * lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1))
+
 M.word = (M.alpha + '_') * (M.alnum + '_')^0
 
 ---
@@ -1410,7 +1442,7 @@ end
 -- @param word_chars Optional string of additional characters considered to be
 --   part of a word. By default, word characters are alphanumerics and
 --   underscores ("%w_" in Lua). This parameter may be `nil` or the empty string
---   to indicate no additional word characters.
+--   in order to indicate no additional word characters.
 -- @param case_insensitive Optional boolean flag indicating whether or not the
 --   word match is case-insensitive. The default is `false`.
 -- @return pattern
@@ -1420,8 +1452,8 @@ end
 -- @name word_match
 function M.word_match(words, word_chars, case_insensitive)
   local word_list = {}
-  for _, word in ipairs(words) do
-    word_list[case_insensitive and word:lower() or word] = true
+  for i = 1, #words do
+    word_list[case_insensitive and words[i]:lower() or words[i]] = true
   end
   local chars = M.alnum + '_'
   if word_chars then chars = chars + lpeg_S(word_chars) end
@@ -1449,7 +1481,9 @@ function M.embed_lexer(parent, child, start_rule, end_rule)
   if not child._EMBEDDEDRULES then child._EMBEDDEDRULES = {} end
   if not child._RULES then -- creating a child lexer to be embedded
     if not child._rules then error('Cannot embed language with no rules') end
-    for _, r in ipairs(child._rules) do add_rule(child, r[1], r[2]) end
+    for i = 1, #child._rules do
+      add_rule(child, child._rules[i][1], child._rules[i][2])
+    end
   end
   child._EMBEDDEDRULES[parent._NAME] = {
     ['start_rule'] = start_rule,
@@ -1463,8 +1497,10 @@ function M.embed_lexer(parent, child, start_rule, end_rule)
   if not parent._tokenstyles then parent._tokenstyles = {} end
   local tokenstyles = parent._tokenstyles
   tokenstyles[child._NAME..'_whitespace'] = M.STYLE_WHITESPACE
-  for token, style in pairs(child._tokenstyles or {}) do
-    tokenstyles[token] = style
+  if child._tokenstyles then
+    for token, style in pairs(child._tokenstyles) do
+      tokenstyles[token] = style
+    end
   end
   child._lexer = parent -- use parent's tokens if child is embedding itself
   parent_lexer = parent -- use parent's tokens if the calling lexer is a proxy
@@ -1544,11 +1580,18 @@ M.property_expanded = setmetatable({}, {
 --[[ The functions and fields below were defined in C.
 
 ---
+-- Returns the line number of the line that contains position *pos*. Positions
+-- start from 1.
+-- @param pos The position to get the line number of.
+-- @return number
+local function line_from_position(pos) end
+
+---
 -- Individual fields for a lexer instance.
 -- @field _NAME The string name of the lexer.
 -- @field _rules An ordered list of rules for a lexer grammar.
 --   Each rule is a table containing an arbitrary rule name and the LPeg pattern
---   associated with the rule. The order of rules is important as rules are
+--   associated with the rule. The order of rules is important, as rules are
 --   matched sequentially.
 --   Child lexers should not use this table to access and/or modify their
 --   parent's rules and vice-versa. Use the `_RULES` table instead.
author	Marc André Tanner <mat@brain-dump.org>	2016-10-05 10:41:41 +0200
committer	Marc André Tanner <mat@brain-dump.org>	2016-10-05 10:49:32 +0200
commit	6c38a7f19107552c58a9e168a7e114216c5b53c8 (patch)
tree	b31911dbaff41fa0661b2fad291a3efa271c11c2
parent	6cf931355b264618879b5bb3ada36067bf246882 (diff)
download	vis-6c38a7f19107552c58a9e168a7e114216c5b53c8.tar.gz vis-6c38a7f19107552c58a9e168a7e114216c5b53c8.tar.xz