Skip to content

Commit f6baeae

Browse files
committed
parse_tag_token without StringScanner: pure byte ops avoid reset(token) overhead, -12% combined

Result: {"status":"keep","combined_µs":3350,"parse_µs":2212,"render_µs":1138,"allocations":24882}
1 parent b37fa98 commit f6baeae

File tree

2 files changed

+49
-11
lines changed

2 files changed

+49
-11
lines changed

autoresearch.jsonl

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,3 +13,7 @@
1313
{"run":12,"commit":"c09e722","metric":4150,"metrics":{"parse_µs":2769,"render_µs":1381,"allocations":24881},"status":"discard","description":"Index loop for filters: YJIT optimizes each+destructure better than manual indexing","timestamp":1773349699285,"segment":0}
1414
{"run":13,"commit":"b7ae55f","metric":3556,"metrics":{"parse_µs":2388,"render_µs":1168,"allocations":24882},"status":"keep","description":"Replace StringScanner tokenizer with String#byteindex — 12% faster parse, no regex overhead for delimiter finding","timestamp":1773349875890,"segment":0}
1515
{"run":14,"commit":"e25f2f1","metric":3464,"metrics":{"parse_µs":2335,"render_µs":1129,"allocations":24882},"status":"keep","description":"Confirmation run: byteindex tokenizer consistently 3,400-3,600µs","timestamp":1773349889465,"segment":0}
16+
{"run":15,"commit":"b37fa98","metric":3490,"metrics":{"parse_µs":2331,"render_µs":1159,"allocations":24882},"status":"keep","description":"Clean up tokenizer: remove unused StringScanner setup and regex constants","timestamp":1773349928672,"segment":0}
17+
{"run":16,"commit":"b37fa98","metric":3638,"metrics":{"parse_µs":2460,"render_µs":1178,"allocations":24882},"status":"discard","description":"Single-char byteindex for %} search: Ruby loop overhead worse for nearby targets","timestamp":1773349985509,"segment":0}
18+
{"run":17,"commit":"b37fa98","metric":3553,"metrics":{"parse_µs":2431,"render_µs":1122,"allocations":25256},"status":"discard","description":"Regex simple_variable_markup: MatchData creates 374 extra allocs, offsetting speed gain","timestamp":1773350066627,"segment":0}
19+
{"run":18,"commit":"b37fa98","metric":3629,"metrics":{"parse_µs":2455,"render_µs":1174,"allocations":25002},"status":"discard","description":"String.new(capacity: 4096) for output buffer: allocates more objects, not fewer","timestamp":1773350101852,"segment":0}

lib/liquid/cursor.rb

Lines changed: 45 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -215,22 +215,56 @@ def scan_comparison_op
215215
attr_reader :tag_markup, :tag_newlines
216216

217217
# Parse the interior of a tag token: "{%[-] tag_name markup [-]%}"
218-
# Caller provides the full token string. Sets cursor to the token.
218+
# Pure byte operations — avoids StringScanner reset overhead.
219219
# Returns tag_name string or nil. Sets tag_markup and tag_newlines.
220220
def parse_tag_token(token)
221-
reset(token)
222-
@ss.pos = 2 # skip "{%"
223-
@ss.scan_byte if peek_byte == DASH # skip whitespace control '-'
224-
nl = skip_ws
225-
tag_name = scan_tag_name
226-
return unless tag_name
221+
len = token.bytesize
222+
pos = 2 # skip "{%"
223+
pos += 1 if token.getbyte(pos) == DASH # skip '-'
224+
nl = 0
225+
226+
# Skip whitespace, count newlines
227+
while pos < len
228+
b = token.getbyte(pos)
229+
case b
230+
when SPACE, TAB, CR, FF then pos += 1
231+
when NL then pos += 1; nl += 1
232+
else break
233+
end
234+
end
227235

228-
nl += skip_ws
236+
# Scan tag name: '#' or [a-zA-Z_][\w-]*
237+
name_start = pos
238+
b = token.getbyte(pos)
239+
if b == HASH
240+
pos += 1
241+
elsif b && ((b >= 97 && b <= 122) || (b >= 65 && b <= 90) || b == USCORE)
242+
pos += 1
243+
while pos < len
244+
b = token.getbyte(pos)
245+
break unless (b >= 97 && b <= 122) || (b >= 65 && b <= 90) || (b >= 48 && b <= 57) || b == USCORE || b == DASH
246+
pos += 1
247+
end
248+
pos += 1 if pos < len && token.getbyte(pos) == QMARK
249+
else
250+
return
251+
end
252+
tag_name = token.byteslice(name_start, pos - name_start)
253+
254+
# Skip whitespace after tag name, count newlines
255+
while pos < len
256+
b = token.getbyte(pos)
257+
case b
258+
when SPACE, TAB, CR, FF then pos += 1
259+
when NL then pos += 1; nl += 1
260+
else break
261+
end
262+
end
229263

230264
# markup is everything up to optional '-' before '%}'
231-
markup_end = token.bytesize - 2
232-
markup_end -= 1 if markup_end > @ss.pos && token.getbyte(markup_end - 1) == DASH
233-
@tag_markup = @ss.pos >= markup_end ? "" : token.byteslice(@ss.pos, markup_end - @ss.pos)
265+
markup_end = len - 2
266+
markup_end -= 1 if markup_end > pos && token.getbyte(markup_end - 1) == DASH
267+
@tag_markup = pos >= markup_end ? "" : token.byteslice(pos, markup_end - pos)
234268
@tag_newlines = nl
235269

236270
tag_name

0 commit comments

Comments (0)