From 710d0eefb697b4287789c7adc5157ba33de86b48 Mon Sep 17 00:00:00 2001 From: Connor Shea <2977353+connorshea@users.noreply.github.com> Date: Fri, 5 Jun 2026 16:01:40 -0600 Subject: [PATCH 1/2] Optimize Sheet#rows_generator hot path Benchmarked on a generated 20,000-row x 40-col xlsx (mixed shared-string, numeric, and empty cells). Median full-parse time dropped ~1.68x (1.85s -> 1.11s) and allocations dropped ~3x (28.6M -> ~9M), with byte-identical output verified on both default- and prefixed-namespace files across rows, simple_rows, and rows_with_meta_data. Changes, in order of impact: - Resolve the namespace prefix once via a `namespace_resolved` flag instead of re-checking `node.namespaces` on every node. Worksheets use a default namespace, so the old `prefix.empty?` guard never latched and allocated a namespaces hash for every node in the stream. This is the bulk of the wall-clock win. - Hoist `node.name`/`node.node_type` into locals (each was read up to 4x per node in the if/elsif chain) and reorder branches so the hottest nodes (/) are tested first. - Use `node.attribute_hash` instead of `node.attributes` for cell and row nodes. `Reader#attributes` is `attribute_hash.merge(namespaces)`, so it built and merged a namespaces hash on every call; we only need the element's own attributes. This is an allocation/GC win (drops Hash#merge and the per-node namespaces hash) with negligible wall-clock change. Safe because xlsx row/cell elements never declare their own namespaces (the namespace hash is empty for them), so the result is identical -- confirmed byte-for-byte including the self-closing-row + meta-data path. - In fill_in_empty_cells, drop the redundant `.to_a` on the column range and use `delete_suffix` instead of `gsub` to strip the row number. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/creek/sheet.rb | 59 ++++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/lib/creek/sheet.rb b/lib/creek/sheet.rb index 69d8497..8d250ff 100644 --- a/lib/creek/sheet.rb +++ b/lib/creek/sheet.rb @@ -101,30 +101,48 @@ def rows_generator(include_meta_data = false, use_simple_rows_format = false) cell_type = nil cell_style_idx = nil @book.files.file.open(path) do |xml| - prefix = '' + namespace_resolved = false name_row = 'row' name_c = 'c' name_v = 'v' name_t = 't' Nokogiri::XML::Reader.from_io(xml).each do |node| - if prefix.empty? && node.namespaces.any? + # Resolve the namespace prefix once, from the first element that + # declares the spreadsheetml namespace (the worksheet root). Caching + # this avoids allocating a namespaces hash for every node in the stream. + if !namespace_resolved && node.namespaces.any? namespace = node.namespaces.detect { |_key, uri| uri == SPREADSHEETML_URI } - prefix = if namespace && namespace[0].start_with?('xmlns:') - namespace[0].delete_prefix('xmlns:') + ':' - else - '' - end - name_row = "#{prefix}row" - name_c = "#{prefix}c" - name_v = "#{prefix}v" - name_t = "#{prefix}t" + if namespace + prefix = namespace[0].start_with?('xmlns:') ? namespace[0].delete_prefix('xmlns:') + ':' : '' + name_row = "#{prefix}row" + name_c = "#{prefix}c" + name_v = "#{prefix}v" + name_t = "#{prefix}t" + namespace_resolved = true + end end - if node.name == name_row && node.node_type == opener - row = node.attributes + + node_name = node.name + node_type = node.node_type + + if node_type == opener && (node_name == name_v || node_name == name_t) + unless cell.nil? + node.read + cells[cell] = convert(node.value, cell_type, cell_style_idx) + end + elsif node_name == name_c && node_type == opener + # attribute_hash avoids the namespaces lookup + merge that + # Reader#attributes performs on every call; we only need t/s/r. + attributes = node.attribute_hash + cell_type = attributes['t'] + cell_style_idx = attributes['s'] + cell = attributes['r'] + elsif node_name == name_row && node_type == opener + row = node.attribute_hash row['cells'] = {} cells = {} y << (include_meta_data ? row : cells) if node.self_closing? - elsif node.name == name_row && node.node_type == closer + elsif node_name == name_row && node_type == closer processed_cells = fill_in_empty_cells(cells, row['r'], cell, use_simple_rows_format) @headers = processed_cells if with_headers && row['r'] == HEADERS_ROW_NUMBER @@ -138,15 +156,6 @@ def rows_generator(include_meta_data = false, use_simple_rows_format = false) row['cells'] = processed_cells y << (include_meta_data ? row : processed_cells) - elsif node.name == name_c && node.node_type == opener - cell_type = node.attributes['t'] - cell_style_idx = node.attributes['s'] - cell = node.attributes['r'] - elsif (node.name == name_v || node.name == name_t) && node.node_type == opener - unless cell.nil? - node.read - cells[cell] = convert(node.value, cell_type, cell_style_idx) - end end end end @@ -172,8 +181,8 @@ def fill_in_empty_cells(cells, row_number, last_col, use_simple_rows_format) new_cells = {} return new_cells if cells.empty? - last_col = last_col.gsub(row_number, '') - ('A'..last_col).to_a.each do |column| + last_col = last_col.delete_suffix(row_number) + ('A'..last_col).each do |column| id = cell_id(column, use_simple_rows_format, row_number) new_cells[id] = cells["#{column}#{row_number}"] end From 9b4703b83abc506441ce356b26c235cfc403c94d Mon Sep 17 00:00:00 2001 From: Connor Shea <2977353+connorshea@users.noreply.github.com> Date: Fri, 5 Jun 2026 16:11:48 -0600 Subject: [PATCH 2/2] Use per-attribute node.attribute() for cell t/s/r Follow-up to the rows_generator optimization. Fetch the three cell attributes with individual node.attribute('t'/'s'/'r') C calls instead of building a per-cell attribute_hash and indexing it. With hundreds of thousands of cells the per-cell Hash allocation dominates, so three cheap lookups are both faster and leaner. Same 20,000-row x 40-col benchmark, vs the attribute_hash version: | Cell-attr approach | Median | Allocations | |------------------------|---------|-------------| | node.attribute_hash | ~1.10s | 9,226,711 | | node.attribute() x3 | ~1.08s | 7,626,711 | => ~3-4% faster, 1.6M (~17%) fewer allocations (deterministic: ~640K cells x the per-cell hash no longer built). Output byte-identical across rows, simple_rows, and rows_with_meta_data on both default- and prefixed-namespace files. Suite green (47 examples). The row-opener node keeps attribute_hash (only ~one per row, and it preserves the exact meta-data row hash). Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/creek/sheet.rb | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/creek/sheet.rb b/lib/creek/sheet.rb index 8d250ff..51c3776 100644 --- a/lib/creek/sheet.rb +++ b/lib/creek/sheet.rb @@ -131,12 +131,13 @@ def rows_generator(include_meta_data = false, use_simple_rows_format = false) cells[cell] = convert(node.value, cell_type, cell_style_idx) end elsif node_name == name_c && node_type == opener - # attribute_hash avoids the namespaces lookup + merge that - # Reader#attributes performs on every call; we only need t/s/r. - attributes = node.attribute_hash - cell_type = attributes['t'] - cell_style_idx = attributes['s'] - cell = attributes['r'] + # Fetch the three attributes individually rather than via + # attribute_hash/attributes: with hundreds of thousands of cells + # the per-cell Hash allocation dominates, so three cheap C lookups + # are both faster and leaner than building and indexing a hash. + cell_type = node.attribute('t') + cell_style_idx = node.attribute('s') + cell = node.attribute('r') elsif node_name == name_row && node_type == opener row = node.attribute_hash row['cells'] = {}