Skip to content

Commit f6baeae

Browse files
committed
parse_tag_token without StringScanner: pure byte ops avoid reset(token) overhead, -12% combined

Result: {"status":"keep","combined_µs":3350,"parse_µs":2212,"render_µs":1138,"allocations":24882}
1 parent b37fa98 commit f6baeae

File tree

2 files changed

+49
-11
lines changed

2 files changed

+49
-11
lines changed

autoresearch.jsonl

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,3 +13,7 @@
1313
{"run":12,"commit":"c09e722","metric":4150,"metrics":{"parse_µs":2769,"render_µs":1381,"allocations":24881},"status":"discard","description":"Index loop for filters: YJIT optimizes each+destructure better than manual indexing","timestamp":1773349699285,"segment":0}
1414
{"run":13,"commit":"b7ae55f","metric":3556,"metrics":{"parse_µs":2388,"render_µs":1168,"allocations":24882},"status":"keep","description":"Replace StringScanner tokenizer with String#byteindex — 12% faster parse, no regex overhead for delimiter finding","timestamp":1773349875890,"segment":0}
1515
{"run":14,"commit":"e25f2f1","metric":3464,"metrics":{"parse_µs":2335,"render_µs":1129,"allocations":24882},"status":"keep","description":"Confirmation run: byteindex tokenizer consistently 3,400-3,600µs","timestamp":1773349889465,"segment":0}
16+
{"run":15,"commit":"b37fa98","metric":3490,"metrics":{"parse_µs":2331,"render_µs":1159,"allocations":24882},"status":"keep","description":"Clean up tokenizer: remove unused StringScanner setup and regex constants","timestamp":1773349928672,"segment":0}
17+
{"run":16,"commit":"b37fa98","metric":3638,"metrics":{"parse_µs":2460,"render_µs":1178,"allocations":24882},"status":"discard","description":"Single-char byteindex for %} search: Ruby loop overhead worse for nearby targets","timestamp":1773349985509,"segment":0}
18+
{"run":17,"commit":"b37fa98","metric":3553,"metrics":{"parse_µs":2431,"render_µs":1122,"allocations":25256},"status":"discard","description":"Regex simple_variable_markup: MatchData creates 374 extra allocs, offsetting speed gain","timestamp":1773350066627,"segment":0}
19+
{"run":18,"commit":"b37fa98","metric":3629,"metrics":{"parse_µs":2455,"render_µs":1174,"allocations":25002},"status":"discard","description":"String.new(capacity: 4096) for output buffer: allocates more objects, not fewer","timestamp":1773350101852,"segment":0}

lib/liquid/cursor.rb

Lines changed: 45 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -215,22 +215,56 @@ def scan_comparison_op
215215
attr_reader :tag_markup, :tag_newlines
216216

217217
# Parse the interior of a tag token: "{%[-] tag_name markup [-]%}"
218-
# Caller provides the full token string. Sets cursor to the token.
218+
# Pure byte operations — avoids StringScanner reset overhead.
219219
# Returns tag_name string or nil. Sets tag_markup and tag_newlines.
220220
def parse_tag_token(token)
221-
reset(token)
222-
@ss.pos = 2 # skip "{%"
223-
@ss.scan_byte if peek_byte == DASH # skip whitespace control '-'
224-
nl = skip_ws
225-
tag_name = scan_tag_name
226-
return unless tag_name
221+
len = token.bytesize
222+
pos = 2 # skip "{%"
223+
pos += 1 if token.getbyte(pos) == DASH # skip '-'
224+
nl = 0
225+
226+
# Skip whitespace, count newlines
227+
while pos < len
228+
b = token.getbyte(pos)
229+
case b
230+
when SPACE, TAB, CR, FF then pos += 1
231+
when NL then pos += 1; nl += 1
232+
else break
233+
end
234+
end
227235

228-
nl += skip_ws
236+
# Scan tag name: '#' or [a-zA-Z_][\w-]*
237+
name_start = pos
238+
b = token.getbyte(pos)
239+
if b == HASH
240+
pos += 1
241+
elsif b && ((b >= 97 && b <= 122) || (b >= 65 && b <= 90) || b == USCORE)
242+
pos += 1
243+
while pos < len
244+
b = token.getbyte(pos)
245+
break unless (b >= 97 && b <= 122) || (b >= 65 && b <= 90) || (b >= 48 && b <= 57) || b == USCORE || b == DASH
246+
pos += 1
247+
end
248+
pos += 1 if pos < len && token.getbyte(pos) == QMARK
249+
else
250+
return
251+
end
252+
tag_name = token.byteslice(name_start, pos - name_start)
253+
254+
# Skip whitespace after tag name, count newlines
255+
while pos < len
256+
b = token.getbyte(pos)
257+
case b
258+
when SPACE, TAB, CR, FF then pos += 1
259+
when NL then pos += 1; nl += 1
260+
else break
261+
end
262+
end
229263

230264
# markup is everything up to optional '-' before '%}'
231-
markup_end = token.bytesize - 2
232-
markup_end -= 1 if markup_end > @ss.pos && token.getbyte(markup_end - 1) == DASH
233-
@tag_markup = @ss.pos >= markup_end ? "" : token.byteslice(@ss.pos, markup_end - @ss.pos)
265+
markup_end = len - 2
266+
markup_end -= 1 if markup_end > pos && token.getbyte(markup_end - 1) == DASH
267+
@tag_markup = pos >= markup_end ? "" : token.byteslice(pos, markup_end - pos)
234268
@tag_newlines = nl
235269

236270
tag_name

0 commit comments

Comments (0)