Skip to content

Commit a83369a

Browse files
committed
Implement annot-preserving replace for annot strs
Implement `replace` function for `AnnotatedString` that properly handles annotation regions during pattern replacement operations. The function tracks which bytes are replaced versus preserved, maintaining annotations only on original content and adding new annotations from replacement text.

- Supports AnnotatedChar, AnnotatedString, and SubString replacements
- Drops, shifts, and splits existing annotations appropriately
- Refactored `_insert_annotations!` to work with annotation vectors directly
- Adjacent replacements with identical annotations are merged into single regions
- Lots of tests (thanks Claude!)

Performance is strangely poor. For the test case mentioned in the REVIEW comment within `_insert_annotations!`, we should be able to perform the replacement in ~200ns (compared to ~70ns for the equivalent unannotated case). However, for two reasons that are beyond me, it instead takes ~4400ns. See the REVIEW comments for more details; help would be much appreciated.
1 parent 75d1897 commit a83369a

File tree

2 files changed

+660
-15
lines changed

2 files changed

+660
-15
lines changed

base/strings/annotated_io.jl

Lines changed: 154 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -163,18 +163,18 @@ This is implemented so that one can say write an `AnnotatedString` to an
163163
`AnnotatedIOBuffer` one character at a time without needlessly producing a
164164
new annotation for each character.
165165
"""
166-
function _insert_annotations!(io::AnnotatedIOBuffer, annotations::Vector{RegionAnnotation}, offset::Int = position(io))
166+
function _insert_annotations!(annots::Vector{RegionAnnotation}, newannots::Vector{RegionAnnotation}, offset::Int = 0)
167167
run = 0
168-
if !isempty(io.annotations) && last(last(io.annotations).region) == offset
169-
for i in reverse(axes(annotations, 1))
170-
annot = annotations[i]
168+
if !isempty(annots) && last(last(annots).region) == offset
169+
for i in reverse(axes(newannots, 1))
170+
annot = newannots[i]
171171
first(annot.region) == 1 || continue
172-
i <= length(io.annotations) || continue
173-
if annot.label == last(io.annotations).label && annot.value == last(io.annotations).value
172+
i <= length(annots) || continue
173+
if annot.label == last(annots).label && annot.value == last(annots).value
174174
valid_run = true
175175
for runlen in 1:i
176-
new = annotations[begin+runlen-1]
177-
old = io.annotations[end-i+runlen]
176+
new = newannots[begin+runlen-1]
177+
old = annots[end-i+runlen]
178178
if last(old.region) != offset || first(new.region) != 1 || old.label != new.label || old.value != new.value
179179
valid_run = false
180180
break
@@ -188,18 +188,157 @@ function _insert_annotations!(io::AnnotatedIOBuffer, annotations::Vector{RegionA
188188
end
189189
end
190190
for runindex in 0:run-1
191-
old_index = lastindex(io.annotations) - run + 1 + runindex
192-
old = io.annotations[old_index]
193-
new = annotations[begin+runindex]
194-
io.annotations[old_index] = setindex(old, first(old.region):last(new.region)+offset, :region)
191+
old_index = lastindex(annots) - run + 1 + runindex
192+
old = annots[old_index]
193+
new = newannots[begin+runindex]
194+
extannot = (region = first(old.region):last(new.region)+offset,
195+
label = old.label,
196+
value = old.value)
197+
annots[old_index] = extannot
195198
end
196-
for index in run+1:lastindex(annotations)
197-
annot = annotations[index]
199+
for index in run+1:lastindex(newannots)
200+
annot = newannots[index]
198201
start, stop = first(annot.region), last(annot.region)
199-
push!(io.annotations, setindex(annotations[index], start+offset:stop+offset, :region))
202+
# REVIEW: For some reason, construction of `newannot`
203+
# can be a significant contributor to the overall runtime
204+
# of this function. For instance, executing:
205+
#
206+
# replace(AnnotatedIOBuffer(), S"apple",
207+
# 'e' => S"{red:x}", 'p' => S"{green:y}")
208+
#
209+
# results in 3 calls to `_insert_annotations!`. It takes
210+
# ~570ns in total, compared to ~200ns if we push `annot`
211+
# instead of `newannot`. Commenting out the `_insert_annotations!`
212+
# line reduces the runtime to ~170ns, from which we can infer
213+
# that constructing `newannot` is somehow responsible for
214+
# a ~30ns -> ~400ns (~13x) increase in runtime!!
215+
# This also comes with a marginal increase in allocations
216+
# (compared to the commented out version) of 2 -> 14 (250b -> 720b).
217+
#
218+
# This seems quite strange, but I haven't dug into the generated
219+
# LLVM or ASM code. If anybody reading this is interested in checking
220+
# this out, that would be brilliant 🙏.
221+
#
222+
# What I have done is found that "direct tuple reconstruction"
223+
# (as below) is several times faster than using `setindex`.
224+
newannot = (region = start+offset:stop+offset,
225+
label = annot.label,
226+
value = annot.value)
227+
push!(annots, newannot)
200228
end
201229
end
202230

231+
# Convenience method for `AnnotatedIOBuffer`: forwards to the vector-based
# `_insert_annotations!`, operating on `io.annotations` and defaulting
# `offset` to the current position of `io`.
function _insert_annotations!(io::AnnotatedIOBuffer, newannots::Vector{RegionAnnotation}, offset::Int = position(io))
    return _insert_annotations!(io.annotations, newannots, offset)
end
233+
234+
# String replacement
235+
236+
# REVIEW: For some reason the `Core.kwcall` indirection seems to cause a
237+
# substantial slowdown here. If we remove `; count` from the signature
238+
# and run the sample code above in `_insert_annotations!`, the runtime
239+
# drops from ~4400ns to ~580ns (~7x faster). I cannot guess why this is.
240+
"""
    replace(out::AnnotatedIOBuffer, str::AnnotatedString, pat_f::Pair...; count=typemax(Int))

Write `str` to `out` with at most `count` matches of the patterns in `pat_f`
replaced, carrying annotations across the replacement, and return `out`.

Annotations of `str` are handled according to how they overlap the replaced
spans:
- an annotation lying entirely inside one replaced span is dropped,
- an annotation lying entirely between two replaced spans is shifted,
- any other annotation is split into the pieces that fall between replaced
  spans, each piece shifted individually.

Annotations carried by annotated replacement values are gathered separately
and appended to `out.annotations` after the (shifted) annotations of `str`.

If `count == 0`, `pat_f` is empty, or `_replace_init` reports that nothing was
found, `str` is written to `out` unchanged.
"""
function replace(out::AnnotatedIOBuffer, str::AnnotatedString, pat_f::Pair...; count = typemax(Int))
    if count == 0 || isempty(pat_f)
        write(out, str)
        return out
    end
    e1, patterns, replacers, repspans, notfound = _replace_init(str.string, pat_f, count)
    if notfound
        foreach(_free_pat_replacer, patterns)
        write(out, str)
        return out
    end
    # Modelled after `Base.annotated_chartransform`, but needing
    # to handle a bit more complexity.
    isappending = eof(out)
    newannots = empty(out.annotations)
    bytepos = bytestart = firstindex(str.string)
    # Each entry records a replaced span of `str.string` (`region`) together
    # with the `offset` to add to source indices that come after that span.
    # A leading sentinel (just before the first index) seeds the initial
    # offset with the current position of `out`.
    replacements = [(region = (bytestart - 1):(bytestart - 1), offset = position(out))]
    nrep = 1
    while nrep <= count
        repspans, ridx, xspan, newbytes, bytepos = @inline _replace_once(
            out.io, str.string, bytestart, e1, patterns, replacers, repspans, count, nrep, bytepos)
        first(xspan) >= e1 && break
        nrep += 1
        # NOTE: When the replaced pattern ends with a multi-codeunit character,
        # `xspan` only covers up to the start of that character. However,
        # for us to correctly account for the changes to the string we need
        # the /entire/ span of codeunits that were replaced.
        if !isempty(xspan) && codeunit(str.string, last(xspan)) > 0x80
            xspan = first(xspan):nextind(str.string, last(xspan))-1
        end
        drift = last(replacements).offset
        thisrep = (region = xspan, offset = drift + newbytes - length(xspan))
        destoff = first(xspan) - 1 + drift
        push!(replacements, thisrep)
        replacement = replacers[ridx]
        _isannotated(replacement) || continue
        annots = annotations(replacement)
        annots′ = if eltype(annots) == Annotation # When it's a char not a string
            region = 1:newbytes
            # A typed comprehension guarantees a `Vector{RegionAnnotation}`
            # regardless of the platform's `Int` width (a hard-coded
            # `UnitRange{Int64}` would fail the type assertion below wherever
            # `Int !== Int64`), and also when `annots` is empty.
            RegionAnnotation[(; region, label, value) for (; label, value) in annots]
        else
            annots
        end::Vector{RegionAnnotation}
        _insert_annotations!(newannots, annots′, destoff)
    end
    # Trailing sentinel: an empty span starting at `e1`, so that every
    # annotation of `str` has a replacement entry at or after its end.
    push!(replacements, (region = e1:(e1-1), offset = last(replacements).offset))
    foreach(_free_pat_replacer, patterns)
    write(out.io, SubString(str.string, bytepos))
    # NOTE: To enable more efficient annotation clearing,
    # we make use of the fact that `_replace_once` picks
    # replacements ordered by their match start position.
    # This means that the start of `.region`s in
    # `replacements` is monotonically increasing.
    isappending || _clear_annotations_in_region!(out.annotations, first(replacements).offset:position(out))
    for (; region, label, value) in str.annotations
        start, stop = first(region), last(region)
        # Last replacement starting at or before `start`, and first
        # replacement starting at or after `stop`.
        prioridx = searchsortedlast(
            replacements, (region = start:start, offset = 0),
            by = r -> first(r.region))
        postidx = searchsortedfirst(
            replacements, (region = stop:stop, offset = 0),
            by = r -> first(r.region))
        priorrep, postrep = replacements[prioridx], replacements[postidx]
        if prioridx == postidx && start >= first(priorrep.region) && stop <= last(priorrep.region)
            # Region contained within a replacement
            continue
        elseif postidx - prioridx <= 1 && start > last(priorrep.region) && stop < first(postrep.region)
            # Lies between replacements
            shiftregion = (start + priorrep.offset):(stop + priorrep.offset)
            shiftann = (; region = shiftregion, label, value)
            push!(out.annotations, shiftann)
        else
            # Split between replacements
            prevrep = replacements[max(begin, prioridx - 1)]
            for rep in @view replacements[max(begin, prioridx - 1):min(end, postidx + 1)]
                # The part of the annotation lying strictly between the
                # previous replaced span and this one (if any) survives.
                gap = max(start, last(prevrep.region)+1):min(stop, first(rep.region)-1)
                if !isempty(gap)
                    shiftregion = (first(gap) + prevrep.offset):(last(gap) + prevrep.offset)
                    shiftann = (; region = shiftregion, label, value)
                    push!(out.annotations, shiftann)
                end
                prevrep = rep
            end
        end
    end
    append!(out.annotations, newannots)
    return out
end
329+
330+
# Fallback for non-annotated output streams: only the underlying plain
# string of `str` is passed on to the `replace` method for `out`.
function replace(out::IO, str::AnnotatedString, pat_f::Pair...; count=typemax(Int))
    return replace(out, str.string, pat_f...; count = count)
end
332+
333+
# Return an `AnnotatedString` with up to `count` matches of the patterns in
# `pat_f` replaced, by running the `AnnotatedIOBuffer` method of `replace`
# on a fresh buffer and reading the result back.
function replace(str::AnnotatedString, pat_f::Pair...; count=typemax(Int))
    # `&&` binds tighter than `||`, so the disjunction must be parenthesised
    # for the early return to apply when *either* condition holds.
    (isempty(pat_f) || iszero(count)) && return str
    out = AnnotatedIOBuffer()
    replace(out, str, pat_f...; count)
    return read(seekstart(out), AnnotatedString)
end
339+
340+
# Printing
341+
203342
function printstyled end
204343

205344
# NOTE: This is an interim solution to the invalidations caused

0 commit comments

Comments
 (0)