perf(treesitter)!: incremental invalidation

Problem:
After an edit that changes the number of injection regions, the
LanguageTree drops all the existing trees. This is inefficient because the
injections must then be parsed from scratch.

Solution:
When setting included regions, match them with the existing regions so
that they can be reparsed incrementally. This uses a table that maps
region values to their indices. Regions are matched by "similarity",
because some changes of regions cannot be precisely tracked by
`_edit()`.

Breaking change:
The indices of `parser:trees()` now behave differently because existing
regions are reused. So `parser:parse(true)` does not ensure that the
tree table is list-like. Also, when new regions are added manually, they
are first added and then the stale regions are discarded. So the
existing uses of `trees[1]` may break. Use `next(trees())` instead.
This commit is contained in:
Jaehwang Jung 2024-04-13 19:34:40 +09:00
parent b40ec083ae
commit 60cdbf3f92
3 changed files with 412 additions and 92 deletions

View File

@ -1343,7 +1343,7 @@ LanguageTree:invalidate({reload}) *LanguageTree:invalidate()*
Should only be called when the tracked state of the LanguageTree is not
valid against the parse tree in treesitter. Doesn't clear filesystem
cache. Called often, so needs to be fast.
cache.
Parameters: ~
• {reload} (`boolean?`)
@ -1478,10 +1478,7 @@ LanguageTree:tree_for_range({range}, {opts})
LanguageTree:trees() *LanguageTree:trees()*
Returns all trees of the regions parsed by this parser. Does not include
child languages. The result is list-like if
• this LanguageTree is the root, in which case the result is empty or a
singleton list; or
• the root LanguageTree is fully parsed.
child languages.
Return: ~
(`table<integer, TSTree>`)

View File

@ -77,14 +77,26 @@ local TSCallbackNames = {
---@field private _opts table Options
---@field private _parser TSParser Parser for language
---@field private _has_regions boolean
---@field private _regions table<integer, Range6[]>?
---
---List of regions this tree should manage and parse. If nil then regions are
---taken from _trees. This is mostly a short-lived cache for included_regions()
---@field private _regions table<integer, Range6[]>?
---
---Inverse region table, i.e., a (chaining) hash table from regions to their index in `_regions`.
---Used for checking if an added region is already managed by this parser, so that it can reuse
---the existing tree for incremental parsing.
---The hash function is simply `region[1][3]` (the start byte of its first range).
---Each bucket has the shape of { region1, index of region1, region2, index of region2, ... }.
---@field private _regions_inv table<integer, (Range6[]|integer)[]>?
---
---@field private _lang string Language name
---@field private _parent? vim.treesitter.LanguageTree Parent LanguageTree
---@field private _source (integer|string) Buffer or string to parse
---@field private _trees table<integer, TSTree> Reference to parsed tree (one for each language).
---
---Reference to parsed tree (one for each language).
---Each key is the index of region, which is synced with _regions and _valid.
---@field private _trees table<integer, TSTree>
---
---@field private _valid boolean|table<integer,boolean> If the parsed tree is valid
---@field private _logger? fun(logtype: string, msg: string)
---@field private _logfile? file*
@ -228,7 +240,7 @@ end
--- Invalidates this parser and its children.
---
--- Should only be called when the tracked state of the LanguageTree is not valid against the parse
--- tree in treesitter. Doesn't clear filesystem cache. Called often, so needs to be fast.
--- tree in treesitter. Doesn't clear filesystem cache.
---@param reload boolean|nil
function LanguageTree:invalidate(reload)
self._valid = false
@ -248,9 +260,6 @@ end
--- Returns all trees of the regions parsed by this parser.
--- Does not include child languages.
--- The result is list-like if
--- * this LanguageTree is the root, in which case the result is empty or a singleton list; or
--- * the root LanguageTree is fully parsed.
---
---@return table<integer, TSTree>
function LanguageTree:trees()
@ -409,6 +418,14 @@ function LanguageTree:_add_injections()
return query_time
end
---Computes the span covered by a region: from the start of its first range to the end of its
---last range. Ranges in between are not inspected.
---@param region (Range)[]
---@return Range4
local function region_range(region)
  local first, last = region[1], region[#region]
  -- Only the start position of the first range and the end position of the last range matter.
  local start_row, start_col = Range.unpack4(first)
  local _, _, end_row, end_col = Range.unpack4(last)
  return { start_row, start_col, end_row, end_col }
end
--- Recursively parse all regions in the language tree using |treesitter-parsers|
--- for the corresponding languages and run injection queries on the parsed trees
--- to determine whether child trees should be created and parsed.
@ -585,6 +602,125 @@ function LanguageTree:_iter_regions(fn)
end
end
---Registers `region` under index `i` in the inverse region table.
---The bucket key is the start byte of the region's first range; a bucket is a flat list of
---alternating `region, index` entries.
---@param regions_inv table<integer, (Range6[]|integer)[]>
---@param i integer
---@param region Range6[]
local function regions_inv_insert(regions_inv, i, region)
  local key = region[1][3]
  local entries = regions_inv[key]
  if entries then
    -- Append the pair to the existing bucket: region first, then its index.
    entries[#entries + 1] = region
    entries[#entries + 1] = i
    return
  end
  regions_inv[key] = { region, i }
end
---Deletes the entry for `region` from the inverse region table.
---The region is located by deep equality inside the bucket keyed by the start byte of its first
---range. Raises an error if the bucket does not exist or holds no equal region.
---@param regions_inv table<integer, (Range6[]|integer)[]>
---@param region Range6[]
local function regions_inv_remove(regions_inv, region)
  local key = region[1][3]
  local entries = assert(regions_inv[key])

  -- Entries are stored as (region, index) pairs, so regions sit at the odd slots.
  local found ---@type integer?
  for slot = 1, #entries, 2 do
    if vim.deep_equal(entries[slot], region) then
      found = slot
      break
    end
  end
  if not found then
    error('region not found')
  end

  -- Remove the index before the region so that `found` still points at the region.
  table.remove(entries, found + 1)
  table.remove(entries, found)
  if #entries == 0 then
    -- Drop emptied buckets so that lookups see "no bucket" rather than an empty one.
    regions_inv[key] = nil
  end
end
---Tells whether two regions are approximately equal. Equal regions are always similar.
---See the comment in `set_included_regions` on why similarity is used.
---Currently two regions are similar if the end bytes of their first ranges are equal, or the end
---bytes of their last ranges are equal.
---@param region1 Range6[]
---@param region2 Range6[]
---@return boolean
local function region_similar(region1, region2)
  local first1, first2 = region1[1], region2[1]
  if first1[6] == first2[6] then
    return true
  end
  local last1, last2 = region1[#region1], region2[#region2]
  return last1[6] == last2[6]
end
---Looks up `region` in the inverse region table.
---An exactly equal region wins; otherwise the index of the last similar region in the bucket (as
---judged by `region_similar`) is returned. Returns nothing if there is no bucket for the start
---byte of the region's first range.
---@param regions_inv table<integer, (Range6[]|integer)[]>
---@param region Range6[]
---@return integer? index of the matched region, nil if nothing matched
---@return boolean? exact
local function regions_inv_lookup(regions_inv, region)
  local entries = regions_inv[region[1][3]]
  if not entries then
    return
  end

  local similar_index ---@type integer?
  -- Entries are stored as (region, index) pairs, so step over two slots at a time.
  for slot = 1, #entries, 2 do
    local candidate = entries[slot] --[[@as Range6[] ]]
    if region_similar(candidate, region) then
      local index = entries[slot + 1] --[[@as integer]]
      if vim.deep_equal(candidate, region) then
        return index, true
      end
      -- Remember the inexact match, but keep scanning in case an exact one follows.
      similar_index = index
    end
  end
  return similar_index, false
end
---Marks region `i` as invalid.
---If the parser is currently fully valid (`_valid == true`), the flag is first expanded into a
---per-region table in which every region in `_regions` is valid. If the parser is fully invalid
---(`_valid == false`), nothing changes.
---@param i integer
function LanguageTree:_invalidate_region(i)
  local valid = self._valid
  if valid == true then
    valid = {}
    self._valid = valid
    for j in pairs(self._regions) do
      valid[j] = true
    end
  elseif type(valid) ~= 'table' then
    return
  end
  valid[i] = false
end
---Drops region `i` from this parser: its cached region entry, its tree and its validity entry.
---Does nothing if this parser has no regions set. When a tree is dropped, the `changedtree`
---callback is invoked for it, and regions of child parsers that lie within the dropped region are
---discarded recursively.
---@param i integer
function LanguageTree:_discard_region(i)
  if not self._has_regions then
    return
  end

  -- Keep the region cache and its inverse table in sync, if they are currently populated.
  local cached_regions = self._regions
  if cached_regions then
    regions_inv_remove(self._regions_inv, cached_regions[i])
    cached_regions[i] = nil
  end

  local tree = self._trees[i]
  if tree then
    local region = tree:included_ranges(true)
    self:_log(function()
      return 'discarding region', i, region_tostr(region)
    end)
    self:_do_callback('changedtree', region, tree)
    local removed_span = region_range(region)
    self._trees[i] = nil

    -- Children's regions that lie inside the discarded region must go as well. This is needed
    -- because changes that only remove trees in this parser keep the children parsers untouched.
    for _, child in pairs(self._children) do
      for child_i, child_region in pairs(child:included_regions()) do
        if Range.contains(removed_span, region_range(child_region)) then
          child:_discard_region(child_i)
        end
      end
    end
  end

  -- A boolean `_valid` (fully valid/invalid) is unaffected by deleting a region.
  local valid = self._valid
  if type(valid) == 'table' then
    valid[i] = nil
  end
end
--- Sets the included regions that should be parsed by this |LanguageTree|.
--- A region is a set of nodes and/or ranges that will be parsed in the same context.
---
@ -604,7 +740,23 @@ end
function LanguageTree:set_included_regions(new_regions)
self._has_regions = true
-- Transform the tables from 4 element long to 6 element long (with byte offset)
-- Refresh self._regions and self._regions_inv
self:included_regions()
local touched = {} ---@type table<integer, true>
-- Check if the parser already has each region so that they can be parsed incrementally from an
-- existing tree. We find the existing regions by "similarity" instead of the exact equality,
-- because the values of an existing region and the matching region in `new_regions` may not be
-- equal, in which case the existing tree can't be reused.
--
-- Inequality of matching regions happens because `_edit` does not accurately track changes in the
-- existing regions. One (probably the only?) case is when a multi-range region created from a
-- non-`include-children` injection or a combined injection is edited in a way that adds a range
-- to the region, e.g., when adding a line in markdown fenced code block (with language).
--
-- Matching the regions doesn't need to be precise: the consequence of a false match or a false
-- non-match is just a minor loss in efficiency due to reparsing a region from scratch.
for _, region in ipairs(new_regions) do
for i, range in ipairs(region) do
if type(range) == 'table' and #range == 4 then
@ -613,26 +765,50 @@ function LanguageTree:set_included_regions(new_regions)
region[i] = { range:range(true) }
end
end
end
---@cast region Range6[]
-- included_regions is not guaranteed to be list-like, but this is still sound, i.e. if
-- new_regions is different from included_regions, then outdated regions in included_regions are
-- invalidated. For example, if included_regions = new_regions ++ hole ++ outdated_regions, then
-- outdated_regions is invalidated by _iter_regions in else branch.
if #self:included_regions() ~= #new_regions then
-- TODO(lewis6991): inefficient; invalidate trees incrementally
for _, t in pairs(self._trees) do
self:_do_callback('changedtree', t:included_ranges(true), t)
local i, exact = regions_inv_lookup(self._regions_inv, region)
if not exact then
if i then
self:_log(function()
return 'invalidating inexactly matched region', i, region_tostr(self._regions[i])
end)
regions_inv_remove(self._regions_inv, self._regions[i])
else
i = #self._regions + 1 -- this always gives an unoccupied index even if there are holes
end
self._regions[i] = region
regions_inv_insert(self._regions_inv, i, region)
self:_invalidate_region(i)
end
self._trees = {}
self:invalidate()
else
self:_iter_regions(function(i, region)
return vim.deep_equal(new_regions[i], region)
end)
---@cast i integer
touched[i] = true
end
self._regions = new_regions
-- Discard stale regions.
for i, _ in pairs(self._regions) do
if not touched[i] then
self:_discard_region(i)
end
end
end
--- Removes, in place, every range whose start byte equals its end byte (i.e. an empty range).
--- The remaining ranges keep their relative order and are packed to the front of the list.
--- @param region Range6[]
local function prune_empty_ranges(region)
  local total = #region
  local kept = 0
  for idx = 1, total do
    local range = region[idx]
    if range[3] ~= range[6] then
      kept = kept + 1
      -- Shift the range down only if an empty range was skipped before it.
      if kept ~= idx then
        region[kept] = range
      end
    end
  end
  -- Clear the leftover tail, from the end so that the table stays a sequence throughout.
  for idx = total, kept + 1, -1 do
    region[idx] = nil
  end
end
---Gets the set of included regions managed by this LanguageTree. This can be different from the
@ -651,12 +827,27 @@ function LanguageTree:included_regions()
return { {} }
end
local regions = {} ---@type Range6[][]
local regions = {} ---@type table<integer, Range6[]>
local regions_inv = {} ---@type table<integer, (Range6[]|integer)[]>
for i, _ in pairs(self._trees) do
regions[i] = self._trees[i]:included_ranges(true)
local region = self._trees[i]:included_ranges(true)
-- If user deletes a range in a region, `tree:edit()` leaves an empty range instead of deleting
-- it. This could be a bug in treesitter.
prune_empty_ranges(region)
if #region > 0 then
regions[i] = region
regions_inv_insert(regions_inv, i, region)
else
self._trees[i] = nil
-- If it's boolean (fully valid/invalid), deleting a region doesn't change its value.
if type(self._valid) == 'table' then
self._valid[i] = nil
end
end
end
self._regions = regions
self._regions_inv = regions_inv
return regions
end
@ -908,6 +1099,7 @@ function LanguageTree:_edit(
end
self._regions = nil
self._regions_inv = nil
local changed_range = {
start_row,
@ -1071,14 +1263,7 @@ end
---@param range Range
---@return boolean
local function tree_contains(tree, range)
local tree_ranges = tree:included_ranges(false)
return Range.contains({
tree_ranges[1][1],
tree_ranges[1][2],
tree_ranges[#tree_ranges][3],
tree_ranges[#tree_ranges][4],
}, range)
return Range.contains(region_range(tree:included_ranges(false)), range)
end
--- Determines whether {range} is contained in the |LanguageTree|.

View File

@ -253,20 +253,25 @@ end]]
local root = _G.parser:parse()[1]:root()
_G.parser:set_included_regions({ { root:child(0) } })
_G.parser:invalidate()
return { _G.parser:parse(true)[1]:root():range() }
local _, tree = next(_G.parser:parse(true))
return { tree:root():range() }
end)
eq({ 0, 0, 18, 1 }, res2)
eq({ { { 0, 0, 0, 18, 1, 512 } } }, exec_lua [[ return parser:included_regions() ]])
eq(
{ { { 0, 0, 0, 18, 1, 512 } } },
exec_lua [[return vim.tbl_values(_G.parser:included_regions())]]
)
local range_tbl = exec_lua(function()
_G.parser:set_included_regions { { { 0, 0, 17, 1 } } }
_G.parser:parse()
return _G.parser:included_regions()
end)
eq({ { { 0, 0, 0, 17, 1, 508 } } }, range_tbl)
eq(
{ { { 0, 0, 0, 17, 1, 508 } } },
exec_lua(function()
_G.parser:set_included_regions { { { 0, 0, 17, 1 } } }
_G.parser:parse()
return vim.tbl_values(_G.parser:included_regions())
end)
)
end)
it('allows to set complex ranges', function()
@ -283,7 +288,8 @@ end]]
parser:set_included_regions({ nodes })
local root = parser:parse(true)[1]:root()
local _, tree = next(parser:parse(true))
local root = tree:root()
local res = {}
for i = 0, (root:named_child_count() - 1) do
@ -826,7 +832,7 @@ print()
1,
exec_lua(function()
_G.parser:parse({ 0, 2 })
return #_G.parser:children().lua:trees()
return vim.tbl_count(_G.parser:children().lua:trees())
end)
)
@ -834,7 +840,7 @@ print()
2,
exec_lua(function()
_G.parser:parse({ 2, 6 })
return #_G.parser:children().lua:trees()
return vim.tbl_count(_G.parser:children().lua:trees())
end)
)
@ -842,11 +848,106 @@ print()
7,
exec_lua(function()
_G.parser:parse(true)
return #_G.parser:children().lua:trees()
return vim.tbl_count(_G.parser:children().lua:trees())
end)
)
end)
it('reuses similar existing regions', function()
insert(dedent [[
* line1
line2]])
exec_lua(function()
_G.parser = vim.treesitter.get_parser(0, 'markdown', {
injections = {
markdown = '((inline) @injection.content (#set! injection.language "markdown_inline"))',
},
})
end)
local function get_regions()
return exec_lua(function()
_G.parser:parse(true)
local result = {}
for i, tree in pairs(_G.parser:children().markdown_inline:trees()) do
result[i] = tree:included_ranges()
end
return result
end)
end
eq({
[1] = { { 0, 2, 1, 0 }, { 1, 2, 1, 7 } },
}, get_regions())
feed('2ggyyp')
-- region index does not change
eq({
[1] = { { 0, 2, 1, 0 }, { 1, 2, 2, 0 }, { 2, 2, 2, 7 } },
}, get_regions())
feed('2ggdd')
eq({
[1] = { { 0, 2, 1, 0 }, { 1, 2, 1, 7 } },
}, get_regions())
feed('ggyGP')
-- the old region moves while maintaining its index
eq({
[1] = { { 2, 2, 3, 0 }, { 3, 2, 3, 7 } },
[2] = { { 0, 2, 1, 0 }, { 1, 2, 1, 7 } },
}, get_regions())
end)
it("recursively discards children's regions contained in a parent's discarded region", function()
insert(dedent [[
`return`
```
line 4
```
line 6 `return`
```]])
exec_lua(function()
_G.parser = vim.treesitter.get_parser(0, 'markdown', {
injections = {
-- inject code span to lua
markdown_inline = '((code_span) @injection.content (#offset! @injection.content 0 1 0 -1) (#set! injection.language "lua"))',
},
})
end)
local function get_regions()
return exec_lua(function()
_G.parser:parse(true)
local result = {}
for i, tree in pairs(_G.parser:children().markdown_inline:children().lua:trees()) do
result[i] = tree:included_ranges()
end
return result
end)
end
-- Initially, "line 4" is in the fenced code block, and "line 6 `return`" is a normal paragraph
-- with an inline code span.
eq({
[1] = { { 0, 1, 0, 7 } },
[2] = { { 5, 8, 5, 14 } },
}, get_regions())
-- Extend the code block to "line 6 `return`". Note that the only effect to markdown_inline
-- parser is removing a region, so it does not parse anything in markdown_inline parser.
feed('5ggD')
-- Despite not parsing at the parent (markdown_inline) parser, the regions in children (lua)
-- parser that are included in the parent's removed region should be removed as well.
-- The "`return`" at the first line is just for preventing the lua parser from being removed.
eq({
[1] = { { 0, 1, 0, 7 } },
}, get_regions())
end)
describe('languagetree is_valid()', function()
before_each(function()
insert(dedent [[
@ -857,10 +958,8 @@ print()
]])
feed(':set ft=help<cr>')
exec_lua(function()
vim.treesitter.get_parser(0, 'vimdoc', {
_G.parser = vim.treesitter.get_parser(0, 'vimdoc', {
injections = {
vimdoc = '((codeblock (language) @injection.language (code) @injection.content) (#set! injection.include-children))',
},
@ -868,21 +967,34 @@ print()
end)
end)
local function get_regions()
return exec_lua(function()
if not _G.parser:children().lua then
return nil
end
local result = {}
for i, tree in pairs(_G.parser:children().lua:trees()) do
result[i] = tree:included_ranges()
end
return result
end)
end
it('is valid excluding, invalid including children initially', function()
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid(true)'))
eq(false, exec_lua('return vim.treesitter.get_parser():is_valid()'))
eq(true, exec_lua('return parser:is_valid(true)'))
eq(false, exec_lua('return parser:is_valid()'))
end)
it('is fully valid after a full parse', function()
exec_lua('vim.treesitter.get_parser():parse(true)')
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid(true)'))
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid()'))
exec_lua('parser:parse(true)')
eq(true, exec_lua('return parser:is_valid(true)'))
eq(true, exec_lua('return parser:is_valid()'))
end)
it('is fully valid after a parsing a range on parsed tree', function()
exec_lua('vim.treesitter.get_parser():parse({5, 7})')
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid(true)'))
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid()'))
eq(true, exec_lua('return parser:is_valid(true)'))
eq(true, exec_lua('return parser:is_valid()'))
end)
describe('when adding content with injections', function()
@ -897,36 +1009,36 @@ print()
end)
it('is fully invalid after changes', function()
eq(false, exec_lua('return vim.treesitter.get_parser():is_valid(true)'))
eq(false, exec_lua('return vim.treesitter.get_parser():is_valid()'))
eq(false, exec_lua('return parser:is_valid(true)'))
eq(false, exec_lua('return parser:is_valid()'))
end)
it('is valid excluding, invalid including children after a rangeless parse', function()
exec_lua('vim.treesitter.get_parser():parse()')
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid(true)'))
eq(false, exec_lua('return vim.treesitter.get_parser():is_valid()'))
exec_lua('parser:parse()')
eq(true, exec_lua('return parser:is_valid(true)'))
eq(false, exec_lua('return parser:is_valid()'))
end)
it(
'is fully valid after a range parse that leads to parsing not parsed injections',
function()
exec_lua('vim.treesitter.get_parser():parse({5, 7})')
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid(true)'))
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid()'))
exec_lua('parser:parse({5, 7})')
eq(true, exec_lua('return parser:is_valid(true)'))
eq(true, exec_lua('return parser:is_valid()'))
end
)
it(
'is valid excluding, invalid including children after a range parse that does not lead to parsing not parsed injections',
function()
exec_lua('vim.treesitter.get_parser():parse({2, 4})')
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid(true)'))
eq(false, exec_lua('return vim.treesitter.get_parser():is_valid()'))
exec_lua('parser:parse({2, 4})')
eq(true, exec_lua('return parser:is_valid(true)'))
eq(false, exec_lua('return parser:is_valid()'))
end
)
end)
describe('when removing content with injections', function()
describe('when removing an injection region', function()
before_each(function()
feed('G')
insert(dedent [[
@ -935,41 +1047,67 @@ print()
<
>lua
local a = {}
local b = {}
<
]])
exec_lua('vim.treesitter.get_parser():parse(true)')
exec_lua('parser:parse(true)')
eq({ [1] = { { 6, 0, 7, 0 } }, [2] = { { 10, 0, 11, 0 } } }, get_regions())
feed('Gd3k')
-- the empty region is pruned
eq({ [1] = { { 6, 0, 7, 0 } } }, get_regions())
end)
it('is fully invalid after changes', function()
eq(false, exec_lua('return vim.treesitter.get_parser():is_valid(true)'))
eq(false, exec_lua('return vim.treesitter.get_parser():is_valid()'))
eq(false, exec_lua('return parser:is_valid(true)'))
eq(false, exec_lua('return parser:is_valid()'))
end)
it('is valid excluding, invalid including children after a rangeless parse', function()
exec_lua('vim.treesitter.get_parser():parse()')
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid(true)'))
eq(false, exec_lua('return vim.treesitter.get_parser():is_valid()'))
exec_lua('parser:parse()')
eq(true, exec_lua('return parser:is_valid(true)'))
eq(false, exec_lua('return parser:is_valid()'))
end)
it('is fully valid after a range parse that leads to parsing modified child tree', function()
exec_lua('vim.treesitter.get_parser():parse({5, 7})')
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid(true)'))
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid()'))
it('is fully valid after a range parse that includes injection region', function()
exec_lua('parser:parse({5, 7})')
eq(true, exec_lua('return parser:is_valid(true)'))
eq(true, exec_lua('return parser:is_valid()'))
end)
end)
describe('when editing an injection region', function()
before_each(function()
feed('G')
insert(dedent [[
>lua
local a = 1
<
]])
exec_lua('parser:parse(true)')
feed('G2kA<BS>2<ESC>') -- 1 → 2
end)
it(
'is valid excluding, invalid including children after a range parse that does not lead to parsing modified child tree',
function()
exec_lua('vim.treesitter.get_parser():parse({2, 4})')
eq(true, exec_lua('return vim.treesitter.get_parser():is_valid(true)'))
eq(false, exec_lua('return vim.treesitter.get_parser():is_valid()'))
end
)
it('is fully invalid after changes', function()
eq(false, exec_lua('return parser:is_valid(true)'))
eq(false, exec_lua('return parser:is_valid()'))
end)
it('is valid excluding, invalid including children after a rangeless parse', function()
exec_lua('parser:parse()')
eq(true, exec_lua('return parser:is_valid(true)'))
eq(false, exec_lua('return parser:is_valid()'))
end)
it('is fully valid after a range parse that includes modified region', function()
exec_lua('parser:parse({5, 7})')
eq(true, exec_lua('return parser:is_valid(true)'))
eq(true, exec_lua('return parser:is_valid()'))
end)
end)
end)
end)