From 710d0eefb697b4287789c7adc5157ba33de86b48 Mon Sep 17 00:00:00 2001
From: Connor Shea <2977353+connorshea@users.noreply.github.com>
Date: Fri, 5 Jun 2026 16:01:40 -0600
Subject: [PATCH 1/2] Optimize Sheet#rows_generator hot path

Benchmarked on a generated 20,000-row x 40-col xlsx (mixed shared-string,
numeric, and empty cells). Median full-parse time dropped ~1.68x
(1.85s -> 1.11s) and allocations dropped ~3x (28.6M -> ~9M), with
byte-identical output verified on both default- and prefixed-namespace
files across rows, simple_rows, and rows_with_meta_data.

Changes, in order of impact:

- Resolve the namespace prefix once via a `namespace_resolved` flag
  instead of re-checking `node.namespaces` on every node. Worksheets use
  a default namespace, so the old `prefix.empty?` guard never latched and
  allocated a namespaces hash for every node in the stream. This is the
  bulk of the wall-clock win.
- Hoist `node.name`/`node.node_type` into locals (each was read up to 4x
  per node in the if/elsif chain) and reorder branches so the hottest
  nodes (<v>/<c>) are tested first.
- Use `node.attribute_hash` instead of `node.attributes` for cell and row
  nodes. `Reader#attributes` is `attribute_hash.merge(namespaces)`, so it
  built and merged a namespaces hash on every call; we only need the
  element's own attributes. This is an allocation/GC win (drops Hash#merge
  and the per-node namespaces hash) with negligible wall-clock change.
  Safe because xlsx row/cell elements never declare their own namespaces
  (the namespace hash is empty for them), so the result is identical --
  confirmed byte-for-byte including the self-closing-row + meta-data path.
- In fill_in_empty_cells, drop the redundant `.to_a` on the column range
  and use `delete_suffix` instead of `gsub` to strip the row number.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 lib/creek/sheet.rb | 59 ++++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 25 deletions(-)
diff --git a/lib/creek/sheet.rb b/lib/creek/sheet.rb
index 69d8497..8d250ff 100644
--- a/lib/creek/sheet.rb
+++ b/lib/creek/sheet.rb
@@ -101,30 +101,48 @@ def rows_generator(include_meta_data = false, use_simple_rows_format = false)
         cell_type = nil
         cell_style_idx = nil
         @book.files.file.open(path) do |xml|
-          prefix = ''
+          namespace_resolved = false
           name_row = 'row'
           name_c = 'c'
           name_v = 'v'
           name_t = 't'
           Nokogiri::XML::Reader.from_io(xml).each do |node|
-            if prefix.empty? && node.namespaces.any?
+            # Resolve the namespace prefix once, from the first element that
+            # declares the spreadsheetml namespace (the worksheet root). Caching
+            # this avoids allocating a namespaces hash for every node in the stream.
+            if !namespace_resolved && node.namespaces.any?
               namespace = node.namespaces.detect { |_key, uri| uri == SPREADSHEETML_URI }
-              prefix = if namespace && namespace[0].start_with?('xmlns:')
-                         namespace[0].delete_prefix('xmlns:') + ':'
-                       else
-                         ''
-                       end
-              name_row = "#{prefix}row"
-              name_c = "#{prefix}c"
-              name_v = "#{prefix}v"
-              name_t = "#{prefix}t"
+              if namespace
+                prefix = namespace[0].start_with?('xmlns:') ? namespace[0].delete_prefix('xmlns:') + ':' : ''
+                name_row = "#{prefix}row"
+                name_c = "#{prefix}c"
+                name_v = "#{prefix}v"
+                name_t = "#{prefix}t"
+                namespace_resolved = true
+              end
             end
-            if node.name == name_row && node.node_type == opener
-              row = node.attributes
+
+            node_name = node.name
+            node_type = node.node_type
+
+            if node_type == opener && (node_name == name_v || node_name == name_t)
+              unless cell.nil?
+                node.read
+                cells[cell] = convert(node.value, cell_type, cell_style_idx)
+              end
+            elsif node_name == name_c && node_type == opener
+              # attribute_hash avoids the namespaces lookup + merge that
+              # Reader#attributes performs on every call; we only need t/s/r.
+              attributes     = node.attribute_hash
+              cell_type      = attributes['t']
+              cell_style_idx = attributes['s']
+              cell           = attributes['r']
+            elsif node_name == name_row && node_type == opener
+              row = node.attribute_hash
               row['cells'] = {}
               cells = {}
               y << (include_meta_data ? row : cells) if node.self_closing?
-            elsif node.name == name_row && node.node_type == closer
+            elsif node_name == name_row && node_type == closer
               processed_cells = fill_in_empty_cells(cells, row['r'], cell, use_simple_rows_format)
               @headers = processed_cells if with_headers && row['r'] == HEADERS_ROW_NUMBER
 
@@ -138,15 +156,6 @@ def rows_generator(include_meta_data = false, use_simple_rows_format = false)
 
               row['cells'] = processed_cells
               y << (include_meta_data ? row : processed_cells)
-            elsif node.name == name_c && node.node_type == opener
-              cell_type      = node.attributes['t']
-              cell_style_idx = node.attributes['s']
-              cell           = node.attributes['r']
-            elsif (node.name == name_v || node.name == name_t) && node.node_type == opener
-              unless cell.nil?
-                node.read
-                cells[cell] = convert(node.value, cell_type, cell_style_idx)
-              end
             end
           end
         end
@@ -172,8 +181,8 @@ def fill_in_empty_cells(cells, row_number, last_col, use_simple_rows_format)
       new_cells = {}
       return new_cells if cells.empty?
 
-      last_col = last_col.gsub(row_number, '')
-      ('A'..last_col).to_a.each do |column|
+      last_col = last_col.delete_suffix(row_number)
+      ('A'..last_col).each do |column|
         id = cell_id(column, use_simple_rows_format, row_number)
         new_cells[id] = cells["#{column}#{row_number}"]
       end

From 9b4703b83abc506441ce356b26c235cfc403c94d Mon Sep 17 00:00:00 2001
From: Connor Shea <2977353+connorshea@users.noreply.github.com>
Date: Fri, 5 Jun 2026 16:11:48 -0600
Subject: [PATCH 2/2] Use per-attribute node.attribute() for cell t/s/r

Follow-up to the rows_generator optimization. Fetch the three cell
attributes with individual node.attribute('t'/'s'/'r') C calls instead of
building a per-cell attribute_hash and indexing it. With hundreds of
thousands of cells the per-cell Hash allocation dominates, so three cheap
lookups are both faster and leaner.

Same 20,000-row x 40-col benchmark, vs the attribute_hash version:

  | Cell-attr approach     | Median  | Allocations |
  |------------------------|---------|-------------|
  | node.attribute_hash    | ~1.10s  | 9,226,711   |
  | node.attribute() x3    | ~1.08s  | 7,626,711   |

  => ~3-4% faster, 1.6M (~17%) fewer allocations (deterministic: ~640K
     cells x the per-cell hash no longer built).

Output byte-identical across rows, simple_rows, and rows_with_meta_data on
both default- and prefixed-namespace files. Suite green (47 examples).
The row-opener node keeps attribute_hash (only ~one per row, and it
preserves the exact meta-data row hash).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 lib/creek/sheet.rb | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/lib/creek/sheet.rb b/lib/creek/sheet.rb
index 8d250ff..51c3776 100644
--- a/lib/creek/sheet.rb
+++ b/lib/creek/sheet.rb
@@ -131,12 +131,13 @@ def rows_generator(include_meta_data = false, use_simple_rows_format = false)
                 cells[cell] = convert(node.value, cell_type, cell_style_idx)
               end
             elsif node_name == name_c && node_type == opener
-              # attribute_hash avoids the namespaces lookup + merge that
-              # Reader#attributes performs on every call; we only need t/s/r.
-              attributes     = node.attribute_hash
-              cell_type      = attributes['t']
-              cell_style_idx = attributes['s']
-              cell           = attributes['r']
+              # Fetch the three attributes individually rather than via
+              # attribute_hash/attributes: with hundreds of thousands of cells
+              # the per-cell Hash allocation dominates, so three cheap C lookups
+              # are both faster and leaner than building and indexing a hash.
+              cell_type      = node.attribute('t')
+              cell_style_idx = node.attribute('s')
+              cell           = node.attribute('r')
             elsif node_name == name_row && node_type == opener
               row = node.attribute_hash
               row['cells'] = {}