Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
b7b6733
pdf: add PdfTextMode config option to HtmlConfig
andiwand Jul 1, 2026
74f51ee
pdf: dual-layer and single-layer text rendering (#577 #578)
andiwand Jul 1, 2026
430df42
pdf: deduplicate helpers and fix two visual-layer bugs
andiwand Jul 1, 2026
ca65d58
test: verify both PdfTextMode values render a PDF without crashing
andiwand Jul 1, 2026
d1b4527
revert test
andiwand Jul 1, 2026
f02ed9a
pdf: deduplicate single/dual-layer text rendering helpers
andiwand Jul 2, 2026
1cdce2a
pdf: fix selection-layer run width on merge and propagate font-size
andiwand Jul 2, 2026
26d31f2
pdf: fix selection-layer space y-offset and sort lines into reading o…
andiwand Jul 2, 2026
c8a1458
font: overflow PUA re-encode into Supplementary PUA-A with format-12 …
andiwand Jul 3, 2026
532f94a
update refs
andiwand Jul 3, 2026
f4e2fb5
pdf: address PR #579 review — text-rendering correctness fixes
andiwand Jul 3, 2026
88d736c
pdf: dedup dual/single-layer text rendering into a shared toolbox
andiwand Jul 3, 2026
b097f6c
pdf: compress and dedup comments in html pdf_file
andiwand Jul 3, 2026
263cbb4
checkout lfs; cleanup
andiwand Jul 3, 2026
39c3045
generalize tests
andiwand Jul 3, 2026
e5960c4
cleanup
andiwand Jul 3, 2026
aa22f26
Apply suggestions from code review
andiwand Jul 3, 2026
34fa3f5
Merge branch 'pdf-text-selection' of github.com:opendocument-app/Open…
andiwand Jul 3, 2026
c3e9e0b
format
andiwand Jul 3, 2026
aa79226
Emit text layer in pt instead of px
andiwand Jul 3, 2026
2b6d136
pdf: keep selection layer in stream order; document XY-cut plan
andiwand Jul 3, 2026
682ce0c
minor cleanup
andiwand Jul 3, 2026
4b28314
pdf: share line-grouping state across the three text layers
andiwand Jul 3, 2026
7cd9c2f
pdf: document writing-mode, non-Manhattan limits, and bottom-up clust…
andiwand Jul 3, 2026
93e4adb
pdf: inline per-layer line-grouping state again
andiwand Jul 3, 2026
6fee5ff
update refs
andiwand Jul 3, 2026
212a73a
pdf: don't force a whole run onto the PUA path for a recovered word-b…
andiwand Jul 3, 2026
39b0623
update ref
andiwand Jul 3, 2026
9927b3f
pdf single-layer: width-bearing selectable spaces for recovered word …
andiwand Jul 4, 2026
4a2530e
pdf single-layer: align recovered word-break space to the text baseline
andiwand Jul 4, 2026
5b1d682
Merge remote-tracking branch 'origin/main' into pdf-text-selection
andiwand Jul 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/build_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ jobs:
with:
token: ${{ secrets.PAT_ANDIWAND }}
submodules: true
lfs: true

- name: ubuntu install tidy
if: runner.os == 'Linux'
Expand Down
32 changes: 32 additions & 0 deletions src/odr/html.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,21 @@ enum class HtmlTableGridlines {
hard,
};

/// @brief Strategy for emitting text in PDF→HTML output.
///
/// Neither mode requires JavaScript.
///
/// - `dual_layer`: two stacked layers. The visual layer follows paint order
///   and draws embedded PUA glyphs; a separate transparent layer follows
///   reading order and carries real Unicode for selection and search.
///   Comparable to pdf.js.
/// - `single_layer`: one combined layer in which every glyph is mapped to
///   Unicode via frequency analysis. Comparable to pdf2htmlEX.
enum class PdfTextMode {
  dual_layer,
  single_layer,
};

/// @brief HTML configuration.
struct HtmlConfig {
// document output file names
Expand Down Expand Up @@ -106,6 +121,23 @@ struct HtmlConfig {
std::string background_image_format{"png"};
double background_image_dpi{144.0};

// PDF text mode
PdfTextMode pdf_text_mode{PdfTextMode::dual_layer};
// `dual_layer`'s invisible selection-layer text is rendered in a local
// system font (tried in order; the first that resolves wins) rather than
// the embedded PDF font, so its natural width rarely matches the
// PDF-derived box width CSS `text-justify` is asked to fill (justify can
// only add spacing, never compress).
// `pdf_dual_layer_fallback_font_size_adjust` is applied as the fallback
// font's @font-face `size-adjust` (0-1, written out as a percent) to shrink
// its metrics toward the PDF's, so the text's natural width exceeds the box
// less often (ideally never) and justify only has to stretch, not compress.
// Safe to underestimate (justify then just spreads characters further;
// harmless on an invisible layer) but not to overestimate (the excess is
// clipped, not shrunk).
std::vector<std::string> pdf_dual_layer_fallback_fonts{
"Arial", "Helvetica", "Liberation Sans", "DejaVu Sans", "Nimbus Sans"};
double pdf_dual_layer_fallback_font_size_adjust{0.5};

// drm options
bool no_drm{false};

Expand Down
18 changes: 11 additions & 7 deletions src/odr/internal/font/cff_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,9 @@ std::string cff::wrap_to_otf(const CffFont &font,
const std::map<char32_t, std::uint16_t> &extra) {
const std::uint16_t glyphs = font.glyph_count();

// The uniform PUA re-encode: pua_code_point(glyph) -> glyph over
// every glyph. serialize_cmap throws if a code point is beyond the BMP, which
// also bounds the glyph count to the PUA capacity.
// The uniform PUA re-encode: pua_code_point(glyph) -> glyph over every glyph.
// Glyphs past the 6400-slot BMP PUA overflow into Supplementary PUA-A, and
// serialize_cmap emits a format-12 subtable to cover them.
std::map<char32_t, std::uint16_t> pua;
for (std::uint16_t glyph = 0; glyph < glyphs; ++glyph) {
pua[pua_code_point(glyph)] = glyph;
Expand Down Expand Up @@ -183,10 +183,14 @@ std::string cff::wrap_to_otf(const CffFont &font,
tables.emplace_back("cmap", serialize_cmap(pua));
tables.emplace_back("name", serialize_name(font.name()));
tables.emplace_back("post", serialize_post());
tables.emplace_back("OS/2",
serialize_os2(font.units_per_em(), bbox.y_min, bbox.y_max,
static_cast<std::uint16_t>(first),
static_cast<std::uint16_t>(last)));
// OS/2 usFirst/usLastCharIndex are u16; a beyond-BMP PUA code point (large
// glyph counts overflow into Supplementary PUA-A) is clamped to 0xFFFF.
tables.emplace_back(
"OS/2",
serialize_os2(
font.units_per_em(), bbox.y_min, bbox.y_max,
static_cast<std::uint16_t>(std::min<char32_t>(first, 0xffff)),
static_cast<std::uint16_t>(std::min<char32_t>(last, 0xffff))));

return build_sfnt(0x4f54544f /* 'OTTO' */, std::move(tables));
}
Expand Down
86 changes: 73 additions & 13 deletions src/odr/internal/font/sfnt_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
#include <algorithm>
#include <bit>
#include <cstdint>
#include <limits>
#include <map>
#include <ranges>
#include <stdexcept>
#include <utility>

namespace odr::internal::font {
Expand All @@ -17,8 +17,15 @@ namespace {

namespace bs = util::byte_string;

// Glyphs are re-encoded to Private Use Area code points, filling the BMP PUA
// first and overflowing into Supplementary PUA-A (Plane 15). A uint16 glyph id
// (max 65535) offset past the 6400-slot BMP PUA tops out at U+FE6FF, well
// inside PUA-A's 65534 slots, so Supplementary PUA-B is never needed.
//
// Each constant is defined exactly once: the earlier BMP-only `pua_base` /
// 16-bit `pua_capacity` definitions are superseded by the ones below (the
// combined capacity no longer fits in 16 bits).
constexpr char32_t pua_base = 0xe000;                           // BMP PUA start
constexpr std::uint16_t pua_bmp_capacity = 0xf8ff - 0xe000 + 1; // 6400
constexpr char32_t pua_supp_a_base = 0xf0000; // PUA-A (Plane 15)
constexpr std::uint32_t pua_capacity =
    pua_bmp_capacity + (0xffffd - 0xf0000 + 1); // 6400 + 65534 = 71934

void pad4(std::string &s) {
while (s.size() % 4 != 0) {
Expand Down Expand Up @@ -66,8 +73,14 @@ SearchHints search_hints(const std::uint16_t count, const std::uint16_t unit) {

namespace odr::internal {

namespace bs = util::byte_string;

/// Maps a glyph id to the Private Use Area code point it is re-encoded to.
///
/// Glyph ids below `pua_bmp_capacity` fill the BMP PUA starting at `pua_base`;
/// every remaining id continues in Supplementary PUA-A starting at
/// `pua_supp_a_base`.
///
/// @param glyph glyph id to re-encode.
/// @return the PUA code point assigned to `glyph`.
char32_t font::pua_code_point(const std::uint16_t glyph) noexcept {
  // No unconditional `pua_base + glyph` return here: it would make the
  // overflow branch unreachable and push large glyph ids past the BMP PUA.
  if (glyph < pua_bmp_capacity) {
    return pua_base + glyph;
  }
  // Overflow past the BMP PUA into Supplementary PUA-A (U+F0000..U+FFFFD).
  return pua_supp_a_base + (glyph - pua_bmp_capacity);
}

std::string
Expand Down Expand Up @@ -137,7 +150,59 @@ font::build_sfnt(const std::uint32_t sfnt_version,
return out;
}

/// Builds a complete `cmap` table whose only subtable is format 12
/// (segmented coverage).
///
/// The map is folded into sequential map groups: each group covers a code
/// range `[first_code, last_code]` and maps a code to
/// `first_glyph + (code - first_code)`. Format 12 is used when the map reaches
/// beyond the BMP (glyphs overflowing into Supplementary PUA-A), which format 4
/// cannot express. The subtable is referenced by a single (Windows, Unicode
/// full repertoire) encoding record.
///
/// @param map code point -> glyph id, iterated in ascending code order.
/// @return the serialized table: header, encoding record, then the subtable.
static std::string
serialize_cmap_format12(const std::map<char32_t, std::uint16_t> &map) {
  struct Run {
    std::uint32_t first_code;
    std::uint32_t last_code;
    std::uint32_t first_glyph;
  };

  // Coalesce entries whose code and glyph both advance by exactly one.
  std::vector<Run> runs;
  for (const auto &[code, glyph] : map) {
    if (!runs.empty()) {
      Run &tail = runs.back();
      const std::uint32_t expected_glyph =
          tail.first_glyph + (tail.last_code - tail.first_code) + 1;
      if (code == tail.last_code + 1 && glyph == expected_glyph) {
        tail.last_code = code; // still in lockstep: grow the current run
        continue;
      }
    }
    runs.push_back({code, code, glyph});
  }

  // Table header plus the one encoding record; together they occupy 12
  // bytes, which is therefore the subtable's offset.
  std::string table;
  bs::put_u16_be(table, 0);  // version
  bs::put_u16_be(table, 1);  // numTables
  bs::put_u16_be(table, 3);  // platformID (Windows)
  bs::put_u16_be(table, 10); // encodingID (Unicode full repertoire)
  bs::put_u32_be(table, 12); // offset to the subtable

  // Format-12 subtable: 16-byte fixed header followed by 12 bytes per group.
  const auto subtable_length =
      static_cast<std::uint32_t>(16 + 12 * runs.size());
  const auto run_count = static_cast<std::uint32_t>(runs.size());

  std::string subtable;
  bs::put_u16_be(subtable, 12);              // format
  bs::put_u16_be(subtable, 0);               // reserved
  bs::put_u32_be(subtable, subtable_length); // length
  bs::put_u32_be(subtable, 0);               // language
  bs::put_u32_be(subtable, run_count);       // numGroups
  for (const Run &run : runs) {
    bs::put_u32_be(subtable, run.first_code);
    bs::put_u32_be(subtable, run.last_code);
    bs::put_u32_be(subtable, run.first_glyph);
  }

  table += subtable;
  return table;
}

std::string font::serialize_cmap(const std::map<char32_t, std::uint16_t> &map) {
// Format 4 tops out at the BMP; a map that overflows into the Supplementary
// PUA needs format 12's 32-bit code ranges instead.
if (!map.empty() && map.rbegin()->first > 0xffff) {
return serialize_cmap_format12(map);
}

// A format-4 segment: a contiguous code range [start, end] whose glyph is
// `code + delta` (mod 2^16), i.e. idRangeOffset = 0.
struct Segment {
Expand All @@ -147,11 +212,6 @@ std::string font::serialize_cmap(const std::map<char32_t, std::uint16_t> &map) {
};
std::vector<Segment> segments;
for (const auto &[code, glyph] : map) {
if (code > 0xffff) {
throw std::runtime_error(
"sfnt: serialize_cmap supports only BMP code points (format 4); "
"beyond-BMP coverage (format 12) is a follow-up");
}
const auto c = static_cast<std::uint16_t>(code);
const auto delta = static_cast<std::uint16_t>(glyph - c);
if (!segments.empty() && c == segments.back().end + 1 &&
Expand Down Expand Up @@ -275,10 +335,10 @@ std::string font::serialize_os2(const std::uint16_t units_per_em,

void font::reencode_to_pua(sfnt::SfntFont &font,
const std::map<char32_t, std::uint16_t> &extra) {
if (font.glyph_count() > pua_capacity) {
throw std::runtime_error(
"sfnt_transform: glyph count exceeds BMP PUA capacity");
}
// A uint16 glyph id always fits: `pua_code_point` maps the BMP PUA first
// (6400 slots) then overflows into Supplementary PUA-A, whose combined
// `pua_capacity` (71934) exceeds any 16-bit glyph count.
static_assert(std::numeric_limits<std::uint16_t>::max() < pua_capacity);

std::map<char32_t, std::uint16_t> map;
for (std::uint16_t glyph = 0; glyph < font.glyph_count(); ++glyph) {
Expand Down
Loading
Loading