lua/lexers/markdown.lua


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151

-- Copyright 2006-2025 Mitchell. See LICENSE.
-- Markdown LPeg lexer.

local lexer = lexer
local P, S, B = lpeg.P, lpeg.S, lpeg.B

local lex = lexer.new(..., {no_user_word_lists = true})

-- Tag runs of horizontal space and runs of line-break characters as separate whitespace
-- tokens so that the html start rule has a chance to match.
local inline_blanks = S(' \t')^1
local line_breaks = S('\r\n')^1
lex:modify_rule('whitespace', lex:tag(lexer.WHITESPACE, inline_blanks + line_breaks))

-- Block elements.
-- Builds the tagged pattern for a heading of the given level: `level` '#' characters at the
-- start of a line, passed through `lexer.to_eol`. The tag name is the heading tag suffixed
-- with '.h' and the level.
local function heading_rule(level)
	local marker = string.rep('#', level)
	local tag_name = string.format('%s.h%s', lexer.HEADING, level)
	return lex:tag(tag_name, lexer.to_eol(lexer.starts_line(marker)))
end
-- Longest marker first: the ordered choice must try '######' before any shorter prefix of it.
local headings = heading_rule(6)
for level = 5, 1, -1 do
	headings = headings + heading_rule(level)
end
lex:add_rule('header', headings)

-- Match-time check for a horizontal rule.
-- `subject` is the whole input, `pos` is the position just after the first rule character,
-- and `first` is that captured character ('*', '-', or '_').
-- Returns nil to reject the match, or the position to continue from to accept it.
local function hr_extent(subject, pos, first)
	-- Remainder of the current line with spaces and tabs dropped (only gsub's first result is kept).
	local rest = subject:match('[^\r\n]*', pos)
	local squeezed = rest:gsub('[ \t]', '')
	-- At least two more rule characters must follow, and nothing but that character may appear.
	if #squeezed < 2 or squeezed:find('[^' .. first .. ']') then return nil end
	-- Consume through the line ending (or the end of input); include \n for eolfilled styles.
	local _, eol = subject:find('\r?\n', pos)
	return (eol or #subject) + 1
end
lex:add_rule('hr', lex:tag('hr', lpeg.Cmt(lexer.starts_line(lpeg.C(S('*-_')), true), hr_extent)))

-- A list marker is either digits followed by '.', or one of '*', '+', '-'; it must start its
-- line and be followed by a space or tab.
local ordered_marker = lexer.digit^1 * '.'
local bullet_marker = S('*+-')
local list_marker = lexer.starts_line(ordered_marker + bullet_marker, true) * S(' \t')
lex:add_rule('list', lex:tag(lexer.LIST, list_marker))

-- Any `lexer.space` character other than a newline.
local hspace = lexer.space - P('\n')
-- Succeeds only when no input is left.
local end_of_input = -P(1)
-- A newline, then only `hspace`, then either another newline or the end of input.
local blank_line = P('\n') * hspace^0 * (P('\n') + end_of_input)

-- Indented code line: the position must be preceded by four spaces or a tab, the function
-- below must accept the position, and `lexer.to_eol()` then takes the rest of the line.
-- The function is given the subject and the current position; it returns false to reject the
-- line and true to accept it, deciding by inspecting the lines above via the lexer's line
-- tables (`line_start`, `line_end`, `indent_amount`) and `text_range`.
local code_line = lexer.starts_line((B('    ') + B('\t')) * lpeg.P(function(input, index)
	-- Backtrack to the start of the current paragraph, which is either after a blank line,
	-- at the start of a higher level of indentation, or at the start of the buffer.
	-- Note: the local `blank_line` below shadows the file-level `blank_line` pattern inside
	-- this function; here it is only a flag for "this line is empty or all whitespace".
	local line, blank_line = lexer.line_from_position(index), false
	while line > 0 do
		local s, e = lexer.line_start[line], lexer.line_end[line]
		blank_line = s == e or lexer.text_range(s, e - s + 1):find('^%s+$')
		if blank_line then break end
		local indent_amount = lexer.indent_amount[line]
		line = line - 1
		-- Stop when the line above is indented more than the one just examined.
		if line > 0 and lexer.indent_amount[line] > indent_amount then break end
	end

	-- At this point `line` is the line just above the paragraph (or 0), so `line + 1` is the
	-- paragraph's first line.
	-- If the start of the paragraph does not begin with a '    ' or '\t', then this line
	-- is a continuation of the current paragraph, not a code block.
	local text = lexer.text_range(lexer.line_start[line + 1], 4)
	if not text:find('^\t') and text ~= '    ' then return false end

	-- If the current paragraph is a code block, then so is this line.
	-- (With `line` at 0 or 1 the list-item search below would not run at all.)
	if line <= 1 then return true end

	-- Backtrack to see if this line is in a list item. If so, it is not a code block.
	-- The search stops at the nearest non-blank line above with an indent amount of 0, or at
	-- line 1.
	while line > 1 do
		line = line - 1
		local s, e = lexer.line_start[line], lexer.line_end[line]
		local blank = s == e or lexer.text_range(s, e - s + 1):find('^%s+$')
		if not blank and lexer.indent_amount[line] == 0 then break end
	end
	text = lexer.text_range(lexer.line_start[line], 8) -- note: only 2 is needed for unordered lists
	-- Unordered list marker ('*', '+', or '-') followed by a space or tab.
	if text:find('^[*+-][ \t]') then return false end
	-- Ordered list marker (digits and a '.') followed by a space or tab.
	if text:find('^%d+%.[ \t]') then return false end

	return true -- if all else fails, it is probably a code block
end) * lexer.to_eol(), true)

-- Builds a fenced code block pattern for `fence`: it opens with the fence at the start of a
-- line and closes at a later line holding only the fence surrounded by optional horizontal
-- space, ended by a newline or the end of input.
local function fenced_block(fence)
	local opening = lexer.starts_line(fence, true)
	local closing = P('\n') * hspace^0 * fence * hspace^0 * (P('\n') + P(-1))
	return lexer.range(opening, closing)
end
local code_block = fenced_block('```') + fenced_block('~~~')

-- Match-time handler for inline code. `ticks` is the captured opening run of backticks and
-- `pos` is the position just after it. The span closes at the first later occurrence of the
-- same run that follows a non-backtick and is not itself followed by another backtick, so
-- `foo`, ``foo``, ``foo`bar``, and `foo``bar` are all allowed.
-- Without a closing run, the span extends to the end of the input.
local function close_code_span(subject, pos, ticks)
	local _, last = subject:find('[^`]' .. ticks .. '%f[^`]', pos)
	if last then return last + 1 end
	return #subject + 1
end
local code_inline = lpeg.Cmt(lpeg.C(P('`')^1), close_code_span)

-- Indented lines, fenced blocks, and inline spans all share the code tag.
local any_code = code_line + code_block + code_inline
lex:add_rule('block_code', lex:tag(lexer.CODE, any_code))

-- Only the '>' marker at the start of a line is tagged, not the quoted text.
local quote_marker = lexer.starts_line('>', true)
lex:add_rule('blockquote', lex:tag(lexer.STRING, quote_marker))

-- Span elements.
-- A backslash together with the single character that follows it.
local backslash_escape = P('\\') * P(1)
lex:add_rule('escape', lex:tag(lexer.DEFAULT, backslash_escape))

-- Bracketed link text: [text].
local link_text = lexer.range('[', ']', true)
-- Parenthesized target: a destination without ')' or blanks, optionally followed by blanks
-- and a double-quoted title.
local link_dest = (lexer.any - S(') \t'))^0
local link_title = S(' \t')^1 * lexer.range('"', false, false)
local link_target = '(' * link_dest * link_title^-1 * ')'
-- URLs: a bare http(s):// URL running up to whitespace, or a <scheme:...> form in angle
-- brackets.
local bare_url = 'http' * P('s')^-1 * '://' * (lexer.any - lexer.space)^1
local bracketed_url = '<' * lexer.alpha^2 * ':' * (lexer.any - lexer.space - '>')^1 * '>'
local link_url = bare_url + bracketed_url
-- Inline links (optionally prefixed with '!') and URLs share the link tag.
local inline_link = P('!')^-1 * link_text * link_target
lex:add_rule('link', lex:tag(lexer.LINK, inline_link + link_url))

-- Reference-style link: [text] [label], with optional spaces or tabs between the two parts.
local ref_brackets = lexer.range('[', ']', true)
local link_ref = lex:tag(lexer.REFERENCE, link_text * S(' \t')^0 * ref_brackets)
-- Reference definition: [label]: url, optionally followed by a title.
local ref_link_label = lex:tag(lexer.REFERENCE, ref_brackets * ':')
local ws = lex:get_rule('whitespace')
local ref_link_url = lex:tag(lexer.LINK, (lexer.any - lexer.space)^1)
-- The title may be double-quoted, single-quoted, or parenthesized.
local dq_title = lexer.range('"', true, false)
local sq_title = lexer.range("'", true, false)
local paren_title = lexer.range('(', ')', true)
local ref_link_title = lex:tag(lexer.STRING, dq_title + sq_title + paren_title)
local ref_definition = ref_link_label * ws * ref_link_url * (ws * ref_link_title)^-1
lex:add_rule('link_ref', link_ref + ref_definition)

local punct_space = lexer.punct + lexer.space

-- Builds an emphasis range that follows the left- and right-flanking delimiter rules of
-- https://github.github.com/gfm/#emphasis-and-strong-emphasis, for the cases that simple
-- delimited ranges cannot express.
-- `s` is the delimiter. When `not_inword` is set, a delimiter inside the range only stops the
-- content if punctuation or whitespace follows it; otherwise any delimiter stops it.
-- The content never extends across a blank line.
local function flanked_range(s, not_inword)
	local delim = P(s)
	-- Any character that neither starts the delimiter nor is whitespace.
	local inner = lexer.any - delim - lexer.space

	-- Opening delimiter: either preceded by punctuation/space and followed by an inner
	-- character, or followed by an inner character that is not punctuation.
	local opens_after_sep = B(punct_space - delim) * delim * #inner
	local opens_before_word = delim * #(inner - lexer.punct)
	local left_fl = opens_after_sep + opens_before_word

	-- Closing delimiter: either preceded by punctuation and followed by punctuation/space, or
	-- preceded by an inner character.
	local closes_after_punct = B(lexer.punct) * delim * #(punct_space - delim)
	local closes_after_inner = B(inner) * delim
	local right_fl = closes_after_punct + closes_after_inner

	local stop = delim
	if not_inword then stop = delim * #punct_space end
	local content = (lexer.any - blank_line - stop)^0

	return left_fl * content * right_fl
end

-- Wraps an underscore-delimited flanked range so that it only matches when it is preceded by
-- punctuation/space (or its '_' starts the line) and is followed by punctuation/space or the
-- end of input.
local function underscore_range(delim)
	local before = B(punct_space) + #lexer.starts_line('_')
	local after = #(punct_space + P(-1))
	return before * flanked_range(delim, true) * after
end

local asterisk_strong = flanked_range('**')
local underscore_strong = underscore_range('__')
lex:add_rule('strong', lex:tag(lexer.BOLD, asterisk_strong + underscore_strong))

local asterisk_em = flanked_range('*')
local underscore_em = underscore_range('_')
lex:add_rule('em', lex:tag(lexer.ITALIC, asterisk_em + underscore_em))

-- Embedded HTML.
local html = lexer.load('html')
-- Embedded HTML starts with an HTML tag indented by at most three spaces at the start of a
-- line; P(' ')^4 starts code_line instead.
local shallow_indent = lexer.starts_line(P(' ')^-3)
local html_tag = html:get_rule('tag')
local start_rule = shallow_indent * #P('<') * html_tag
-- Embedded HTML ends with the whitespace found where a blank line begins.
local end_rule = #blank_line * ws
lex:embed(html, start_rule, end_rule)

local FOLD_HEADER, FOLD_BASE = lexer.FOLD_HEADER, lexer.FOLD_BASE
-- Fold '#' headers.
-- Walks `text` line by line, numbering lines from `start_line`, and returns a table mapping
-- line numbers to fold levels. `start_level` seeds the level of lines seen before any header.
function lex:fold(text, start_line, start_level)
	-- Drops the header flag from a level that carries it.
	local function without_flag(level)
		if level > FOLD_HEADER then return level - FOLD_HEADER end
		return level
	end

	local fold_levels, current = {}, start_line
	local base_level = without_flag(start_level)
	for content in (text .. '\n'):gmatch('(.-)\r?\n') do
		-- Number of '#' characters after any leading whitespace; 0 when the line is not a header.
		local depth = #(content:match('^%s*(#*)'))
		if depth > 0 then
			-- A header sits one level above its contents (so it can be a fold point) and is
			-- flagged as a fold point; the level of the line after it is pre-defined here.
			fold_levels[current] = FOLD_BASE + depth - 1 + FOLD_HEADER
			fold_levels[current + 1] = FOLD_BASE + depth
		else
			-- Use the level pre-defined by a header on the previous line if there is one,
			-- otherwise the previous line's level, otherwise the starting level.
			local inherited = fold_levels[current] or fold_levels[current - 1] or base_level
			fold_levels[current] = without_flag(inherited)
		end
		current = current + 1
	end
	return fold_levels
end

return lex