diff --git a/.gitignore b/.gitignore index 21cecc927..97859a175 100644 --- a/.gitignore +++ b/.gitignore @@ -55,6 +55,7 @@ tests/SIL.Machine.Tests/Corpora/TestData/usfm/target/* tests/SIL.Machine.Tests/Corpora/TestData/project/* tests/SIL.Machine.Tests/Corpora/TestData/pretranslations.json .idea +.worktrees # Local-only HermitCrab benchmark fixtures (real Sena/Indonesian grammars + word lists, used # for ad hoc perf/allocation testing) + FieldWorks project backups. Large and/or not licensed diff --git a/docs/hermitcrab-parse-algorithm-analysis.md b/docs/hermitcrab-parse-algorithm-analysis.md new file mode 100644 index 000000000..1149ebfea --- /dev/null +++ b/docs/hermitcrab-parse-algorithm-analysis.md @@ -0,0 +1,360 @@ +# Where the 15 million steps go: an algorithmic dissection of HermitCrab parsing + +This document dissects, empirically and against the literature, why a single legitimate Sena +word costs the HermitCrab engine millions of rule applications, and identifies the specific +redundancies that could be removed **without constraining the grammar and without losing valid +parses**. It is the analysis companion to `complexity-cap.md` (which bounds the damage) and +`docs/hermitcrab-grammar-performance.md` (which helps grammar authors avoid the damage). This +document is about making the engine itself stop doing provably repeated work. + +All numbers below are from the real Sena grammar (`samples/data/sena-hc.xml`, ~33k lines, two +`morphologicalRuleOrder="unordered"` strata, 25 morphological rules + ~19 multi-slot affix +templates in the main stratum), measured with an instrumented harness that replicates +`Morpher.ParseWordCore` exactly and swaps in a behavior-identical, counting clone of the +analysis cascade. A "step" is one rule-application attempt (`ParseContext.Step`), the same unit +`MaxParseSteps` budgets. + +## 1. 
The headline data + +Two worst-case words, dissected end to end: + +| | `atawirambo` (parses, 2 results) | `cinacemerwa` (fails, 0 results) | +|---|---|---| +| Total steps | 14,905,517 | 37,543,196 | +| Analysis phase steps | 14,202,364 (95.3%) | 29,494,226 (78.6%) | +| Analysis candidates produced | 41 | 41 | +| — of which reach any lexical root | 4 | 5 | +| — of which yield a parse | 2 | 0 | +| Synthesis phase steps | 703,153 | 8,048,970 | +| Synthesis inputs after `ExpandAlternatives` | 17,699 | 218,847 | +| Cascade node expansions (main stratum) | 158,227 | 523,773 | +| Unique states — (shape, rule-multiset) | 1,626 | 12,168 | +| Unique states — + syntacticFS in key | 2,546 | (not measured) | +| **Redundant re-expansions** | **98.4% of tree** | **97.7% of tree** | + +Three facts jump out: + +1. **The cost is analysis, not synthesis.** 79–95% of all steps are spent unapplying + morphological rules to hypothesize underlying forms — producing just 41 candidates, of which + only 4–5 ever match a lexical root. + +2. **The analysis tree is ~98% transpositions.** The cascade re-expands states it has already + fully explored. One state (`shape='t'`, a 12-affix multiset) was re-expanded **7,200 times** + for `atawirambo`. These are order-variants: unapplying prefix `a-` then suffix `-mbo` vs. + `-mbo` then `a-` reaches the same (shape, remaining-rules) state, and the engine explores the + entire subtree below it again each time. + +3. **Nothing prunes hopeless work.** The most expensive word in the corpus (`cinacemerwa`, + 37.5M steps) returns *zero* parses: 218,847 fully-synthesized candidate words, every one + failing at the end-of-pipeline checks (surface match / `IsWordValid`). The engine has no + notion of "this branch can no longer succeed." + +## 2. 
The combinatorial structure, precisely + +### 2.1 Analysis: all *orderings*, deduped only at the end + +For `unordered` strata (both Sena strata), analysis morphology runs through +`CombinationRuleCascade` with `multiApp: true` +(`AnalysisStratumRule.cs:50-71` → `CombinationRuleCascade.cs:32-54`). In that mode the recursion +restarts at rule index 0 on **every** level: the search enumerates all ordered sequences (with +repetition, bounded per-rule by `MaxApplicationCount`, default 1) of rule unapplications. For a +word where k independent affixes can strip, that is O(k!) paths to the same end state, not +O(2^k) states. + +Each node expansion attempts the **entire rule battery** — visible in the per-rule diagnostics +as bands of rules with *identical* attempt counts (14 prefix rules × 319,267 attempts, 30 rules +× 158,227 attempts = one attempt per rule per node). Every attempt costs one step plus, if the +rule's syntactic gates pass, a full-shape anchored FST match per allomorph +(`AnalysisAffixProcessRule.cs:61-64`, `MatchingMethod.Unification`, `AllSubmatches: true`), and +every successful unapplication deep-clones the `Word` including its `Shape` +(`AnalysisAffixProcessAllomorphRuleSpec.ApplyRhs`). + +Deduplication exists but fires **after the work is done**: + +- Each cascade's terminal `HashSet` collapses equal results — but + `Word.ValueEquals` (`Word.cs:583-600`) includes the `_mruleApps` **sequence**, so two + orderings of the same affix set are *not* equal and are both kept, and in any case the + HashSet dedups storage, not the recomputation that produced the duplicate. +- `MergeEquivalentAnalyses` (`AnalysisStratumRule.cs:140-178`) merges by **shape only**, at the + stratum output boundary — after the tree has been fully walked. The merged variants are + stashed in `Word.Alternatives`… and then `ExpandAlternatives` (`Word.cs:452-494`) + re-materializes every one of them as a separate synthesis input. 
Merging defers the + explosion; it does not remove it (16,330 alternatives for one candidate of `atawirambo`; + 98,197 for one candidate of `cinacemerwa`). + +On top of the cascade, templates and morphological rules mutually recurse +(`AnalysisStratumRule.cs:188-230`): every cascade output gets the full template battery applied, +and every template output re-enters the full cascade — again with no memoization, which is why +total analysis steps (14.2M) are ~3.6× the cascade-internal rule attempts (3.96M). + +### 2.2 Synthesis: a directed replay that still scans the whole battery + +Synthesis is *not* a search — each analysis trail dictates the exact rule sequence, gated by +`IsMorphologicalRuleApplicable` (`Word.cs:269-276`: the next pending rule must equal the rule +being tried). But the `CombinationRuleCascade` used for unordered synthesis +(`SynthesisStratumRule.cs:35`) still **attempts all ~40 rules at every node** and lets the gate +reject 39 of them, one step each: every rule shows exactly 17,877 synthesis attempts for +`atawirambo`'s 17,699 synthesis inputs. The engine already knows the one rule that can apply +(`_mruleApps[_mruleAppIndex]`); it looks for it by exhaustive scan. + +And the expensive correctness checks run dead last: allomorph environments, allomorph/morpheme +co-occurrence, disjunctive allomorph selection, and the surface-form match are all evaluated +only after the entire synthesis cascade has produced a finished word +(`Allomorph.IsWordValid`, `Morpher.IsWordValid`, `Morpher.IsMatch` — `Morpher.cs:711-753`). +`cinacemerwa` synthesized 218,847 complete words and threw away every single one at that final +stage. + +## 3. 
What the literature says + +The most striking finding is internal: **HermitCrab's founding paper already solved this +problem, by packing rather than forking.** Maxwell (1994) — the original Hermit Crab design +(Michael Maxwell's, not David Weber's; Weber's tools are AMPLE/STAMP) — avoids exponential +analysis explicitly *"by encoding into the form being parsed the ambiguities which arise +during parsing"*: rule unapplication uninstantiates features and marks undone +deletions/epentheses `[+optional]`, producing **one underspecified shape that denotes the whole +candidate set**, with lexical lookup as unification against it. The .NET implementation keeps +this for phonology (`AnalysisRewriteRule` mutates one shape in place, which is why phonological +rules are invisible in the step counters) but forks a concrete `Word` per choice at the +morphological level — losing the design's central invariant exactly where Bantu grammars +multiply. Maxwell quotes Anderson (1988): with realistic rule depth, "simply undoing the +effects of the rules… [is] quite impractical" if candidates multiply. The measured +98%-transposition tree is that prediction come true. (Historically, Hermit Crab benchmarked +within ~3× of PC-KIMMO when ambiguity stayed *in the form* rather than in the agenda.) + +The rest of the (verified) literature converges on the same handful of completeness-safe +mechanisms: + +1. **The complexity is real but local.** Two-level morphological recognition is NP-complete in + general, PSPACE-complete with unrestricted deletion (Barton 1986; Barton, Berwick & Ristad + 1987) — so no restructuring gives a polynomial worst case, and the budget/soft-stop outer + net stays. 
But the hardness is driven by *"local rather than global ambiguity"*, and + Koskenniemi & Church (1988) locate the exponent precisely: parse cost is linear in word + length and exponential in the number of **unresolved choice points that coexist before the + first lexical anchor** — regressive-harmony prefixes in their data; subject/tense/object + prefix slots before the verb root in Sena. + +2. **"Overanalysis" and its two published cures.** Unapply-everything-then-look-up is what + Karttunen & Beesley call the overanalysis problem. Cure (a): **interleave lexical lookup + with analysis** (Koskenniemi's tandem lookup "does not pursue analyses that have no matching + lexical path"); sound whenever the lexical filter over-approximates the lexicon. Cure (b): + compose lexicon and rules at compile time — the FST endgame, out of scope here. Notably, + rule *composition alone does not help*: "the ambiguity remains" (Karttunen & Beesley) — + only lexicon information and state merging shrink the candidate set. + +3. **Memoization is compatible with exact all-parses output.** Memoizing a backtracking parser + keyed on state yields chart-parser complexity (Norvig 1991); Earley deduction / tabling + (Johnson 1995; Shieber et al. 1995) gives the answer-complete discipline for it (memo entry + = subscribers + answers; converging searches subscribe instead of recomputing). The exact + model-counting literature (Sang et al. 2004; Bacchus et al. 2009) proves caching coexists + with exhaustive (not just best-first) semantics. The game-search literature contributes the + key-design discipline (Kishimoto & Müller 2004: keys must contain exactly what the remaining + computation reads — a full-path key blew up searches 1000×). + +4. **Dead-end pruning is unambiguously sound.** Nogood recording / UNSAT-component caching + (Dechter & Mateescu 2007; Sang et al. 2004): discarding states proven to yield zero + completions can never lose a parse. 
The boolean residue of A* heuristics — precomputed + necessary conditions for *any* completion to exist — is the admissible-pruning transfer + (Klein & Manning 2003); best-first *ordering* itself buys nothing when running to + exhaustion. + +5. **Packed representations are guaranteed to exist.** Rewrite-rule cascades denote regular + relations (Johnson 1972; Kaplan & Kay 1994), so for a fixed surface word the candidate set + is a regular language — representable as a lattice/DAG where each rule applies once to the + whole structure (polynomial in lattice size), instead of once per enumerated path. Shared + forests with tail sharing (Billot & Lang 1989; Tomita-style local-ambiguity packing) are the + grammar-level version; AND/OR search with context-based merging (Dechter & Mateescu 2007) + the search-level one. HFST optimized-lookup demonstrates the endpoint: cost bounded by + distinct (position, state) pairs, not derivation paths. + +6. **What does NOT transfer:** classical dominance pruning and symmetry breaking keep one + representative per equivalence orbit — sound only for optimization/best-parse, unsound for + literally-all-parses unless the merged items are provably output-identical (which is just + deduplication); Viterbi-style weighted DP is best-parse machinery. + +7. **Field precedent.** The FLEx mailing list documents this exact pain (Awetí words at ~9-20 + minutes), fixed until now only by hand-editing grammars (Andy Black's audit took a word from + ~9 min to ~100 s). Maxwell (1998) shows IA (listed-allomorph) and IP (rule) descriptions are + mechanically interconvertible — precedent for precompiling cheap rules into listed + allomorphs. No published engine-side fix exists; this analysis + `GrammarAnalyzer` would be + the first citable treatment. + +## 4. Concrete opportunities, ranked + +Ranking merges the empirical measurements (§1–§2) with the literature's soundness analysis +(§3). 
The first three are engine changes with no formalism impact and no lost parses; the +later ones are progressively larger architectural moves. + +### 4.1 Transposition table over analysis states (~50–100× on the dominant phase) + +Key: `(shape, per-rule unapplication counts, SyntacticFeatureStruct, stratum)` — measured +98.4% hit rate on `atawirambo`, 97.7% on `cinacemerwa`. Two designs: + +- **Conservative (output-identical):** memo value = the set of (result `Word`, trail-suffix) + continuations discovered below the state; on a revisit, replay the continuations onto the + new prefix trail (cheap list operations — no FST matching, no shape cloning). Produces + byte-identical output including all order-variant trails and traces. +- **Aggressive (canonical trails):** for `unordered` strata, record the trail as a canonical + multiset and stop generating order-variants entirely; synthesis gates on multiset membership + instead of sequence position. Semantically defensible — "unordered" means order is not + linguistically meaningful — and collapses `ExpandAlternatives` too, but changes trace output + and needs corpus-level verification that parse *results* are unchanged. + +The conservative design alone converts the 158,227-expansion tree into a 2,546-expansion DAG. + +Key-design discipline from the literature (the "GHI problem" in game search): the memo key must +contain **exactly** what the remaining computation can read — here that means the shape, the +per-rule unapplication counts (they gate `MaxApplicationCount`), the syntactic FS (it gates +`OutSyntacticFeatureStruct.IsUnifiable`), and for compounding the non-head state; but *not* the +trail order. Keying on too much (e.g. the full trail) silently degrades the hit rate back to +zero. The measured 1,626 → 2,546 state growth when adding the FS to the key shows the FS +splits few states in practice — cheap to include, and required for soundness. 
+ +The **cheapest first slice** of this is a *nogood cache* only: record states whose subtree +yielded zero results, skip them on revisit. No continuation replay, no trail bookkeeping, +trivially sound (discarding a zero-completion branch can never lose a parse). Since failure is +the overwhelmingly common case (only 4/41 candidates ever reach the lexicon), most of the +98.4% redundancy is *failed* subtrees re-searched — a nogood-only table captures most of the +win for a fraction of the implementation risk. The tabling literature's discipline applies on +upgrade to a full memo: a memo entry holds subscribers + answers, and a search converging on an +in-flight entry subscribes rather than recomputing. + +### 4.2 Early lexical intersection — "tandem lookup" (the literature's decisive fix) + +37 of 41 `atawirambo` candidates never matched any lexical root, and the tree that produced +them is the entire cost. This is Karttunen & Beesley's "overanalysis" problem, and the +published cure that doesn't require FST compilation is Koskenniemi's tandem lookup: consult the +lexicon *during* analysis and refuse to pursue hypotheses no lexical path can complete. +Soundness condition: the filter must **over-approximate** the lexicon (only kill hypotheses +that could never survive lookup), which tolerates underspecified segments conservatively. + +Concretely: if every remaining unappliable rule only strips edge material (true for ordinary +affix rules — verifiable statically per grammar by `GrammarAnalyzer`), then a candidate can +only ever reach roots already present inside its current shape. Precompute a substring index +over root allomorphs (Aho-Corasick / suffix automaton, matching at the natural-class level so +underspecified nodes over-approximate); prune any branch whose shape contains no possible +root. 
This attacks the exponent the literature identifies — unresolved prefix choice points +stacking up *before the search ever touches the root* — and AMPLE's dictionary-first +architecture is the existence proof that the same grammar content can be searched +lexicon-anchored. + +### 4.3 Direct rule indexing in synthesis (~40× on synthesis steps) + +Unordered synthesis knows the single rule that can apply next; replace the scan-all-rules +cascade with a `Dictionary<…>` lookup keyed on the pending rule (compounding-rule +`null` entries fall back to the scan). Behavior-identical by construction: the 39 skipped +attempts are exactly the ones `IsMorphologicalRuleApplicable` rejects today. Turns +`cinacemerwa`'s 8.0M synthesis steps into ~200K. + +### 4.4 Early constraint checking in synthesis + +Allomorph environment and co-occurrence constraints that are already decidable mid-derivation +(the environment's context is fully inside an already-built portion of the word, morphemes +already placed) could fail candidates before the rest of the cascade runs, instead of at +`IsWordValid`. Requires care with material later phonological rules could still change; the +statically-safe subset is identifiable per grammar (`GrammarAnalyzer` again). + +### 4.5 Rule-battery prefiltering in analysis (constant factor) + +At every analysis node all 25+ rules are attempted; most fail their anchored FST match +immediately. An index from edge-segment natural classes to the affix rules whose patterns could +possibly match (AMPLE-style position/anchor indexing) skips guaranteed-miss attempts without +changing semantics. + +### 4.6 Cross-word memoization (corpus-scale extension) + +The transposition state contains no reference to the original surface word — states like +`('t', {12 affixes})` recur across *words*. A bounded (LRU) cross-word memo could make +"Parse All Words" batch runs dramatically sublinear in practice. 
Interaction with per-parse +`ParseContext` budgets needs design; flagged as an extension, not a first step. + +### 4.7 Packed candidate representation (the endgame short of full FST) + +Restore Maxwell's original invariant at the morphological level: represent the analysis +candidate set as a shared lattice/DAG (guaranteed to exist — the candidate set of a +rewrite-rule cascade over a fixed surface form is a regular language, Kaplan & Kay 1994), +where each rule stage applies once to the whole structure and equal states merge (the foma/HFST +habit of determinize-minimize between stages, transplanted). `Word.Alternatives` + +`ExpandAlternatives` is a half-built version of this — it packs (by shape, at stratum +boundaries) but then fully unpacks before synthesis. Making synthesis verify *lattice nodes* +instead of expanded candidates is the biggest win and the biggest change; it converges with +the separate FST effort and should be weighed against it rather than built independently. + +### Priorities + +1. **4.1 nogood slice** — cheapest, trivially sound, captures most of the measured 98%. +2. **4.1 full memo + 4.3 synthesis rule indexing** — mechanical, output-identical. +3. **4.2 tandem lexical intersection** — the decisive fix per the literature; needs the + `GrammarAnalyzer` edge-stripper check. +4. **4.4 / 4.5 invariants and prefilters** — constant factors, fit the existing lint. +5. **4.6 / 4.7** — corpus-scale and architectural endgames, coordinate with the FST effort. + +The complexity cap (`complexity-cap.md`) stays regardless: the worst case is NP-complete +(PSPACE-complete with unrestricted deletion), so a budget outer net is formally motivated, and +Barton's "bounded nulls" + Maxwell's own "unapply a deletion rule only N times" sanction the +existing `DeletionReapplications`/`MaxAnalysisShapeGrowth` knobs as part of the formalism, not +an apology. + +## 5. 
Sources + +Primary sources verified against fetched text by the research pass (adversarial spot-checks +6/6 confirmed): + +- M. Maxwell (1994), *Parsing Using Linearly Ordered Phonological Rules* — the original Hermit + Crab: packing ambiguity into underspecified forms. https://arxiv.org/abs/cmp-lg/9411015 +- M. Maxwell (1991), *Phonological Analysis and Opaque Rule Orders*, IWPT-2. + https://aclanthology.org/1991.iwpt-1.13/ (overgeneration bound; full text not yet retrieved) +- M. Maxwell (1998), *Two Theories of Morphology, One Implementation*, SILEWP 1998-001. + https://www.sil.org/resources/publications/entry/7814 +- G.E. Barton (1986), *Computational Complexity in Two-Level Morphology*, ACL. + https://aclanthology.org/P86-1009.pdf; and *Constraint Propagation in KIMMO Systems*, ACL. + https://aclanthology.org/P86-1008.pdf; Barton, Berwick & Ristad (1987), *Computational + Complexity and Natural Language*, MIT Press. +- K. Koskenniemi & K. Church (1988), *Complexity, Two-Level Morphology and Finnish*, COLING. + https://aclanthology.org/C88-1069.pdf +- L. Karttunen & K. Beesley (2005), *Twenty-Five Years of Finite-State Morphology*. + https://web.stanford.edu/group/cslipublications/cslipublications/koskenniemi-festschrift/8-karttunen-beesley.pdf +- L. Karttunen, R. Kaplan & A. Zaenen (1992), *Two-Level Morphology with Composition*, COLING. + https://aclanthology.org/C92-1025.pdf +- R. Kaplan & M. Kay (1994), *Regular Models of Phonological Rule Systems*, CL 20(3). + https://aclanthology.org/J94-3001.pdf +- P. Norvig (1991), *Techniques for Automatic Memoization with Applications to Context-Free + Parsing*, CL 17(1). https://aclanthology.org/J91-1004/ +- M. Johnson (1995), *Memoization in Top-Down Parsing*, CL 21(3). + https://aclanthology.org/J95-3005.pdf +- S. Shieber, Y. Schabes & F. Pereira (1995), *Principles and Implementation of Deductive + Parsing*. https://arxiv.org/abs/cmp-lg/9404008 +- S. Billot & B. 
Lang (1989), *The Structure of Shared Forests in Ambiguous Parsing*, ACL. + https://aclanthology.org/P89-1018.pdf +- D. Klein & C. Manning (2003), *A\* Parsing: Fast Exact Viterbi Parse Selection*, HLT-NAACL. + https://nlp.stanford.edu/pubs/klein2003astar.pdf; (2001) *Parsing and Hypergraphs*, IWPT. +- T. Sang, F. Bacchus, P. Beame, H. Kautz & T. Pitassi (2004), *Combining Component Caching and + Clause Learning for Effective Model Counting*, SAT. + http://www.cs.toronto.edu/~fbacchus/Papers/SangetalSAT2004.pdf +- R. Dechter & R. Mateescu (2007), *AND/OR Search Spaces for Graphical Models*, AIJ. + https://ics.uci.edu/~dechter/publications/r147.pdf +- A. Kishimoto & M. Müller (2004), *A General Solution to the Graph History Interaction + Problem*, AAAI. https://cdn.aaai.org/AAAI/2004/AAAI04-102.pdf +- M. Mohri & R. Sproat (1996), *An Efficient Compiler for Weighted Rewrite Rules*, ACL. + https://aclanthology.org/P96-1031.pdf; L. Karttunen (1995), *The Replace Operator*, ACL. + https://arxiv.org/pdf/cmp-lg/9504032; W. Skut et al. (2004), bimachines. + https://arxiv.org/pdf/cs/0407046 +- M. Mohri, F. Pereira & M. Riley (2002), *Weighted Finite-State Transducers in Speech + Recognition*, CS&L. https://cs.nyu.edu/~mohri/pub/csl01.pdf; OpenFst. + https://cs.nyu.edu/~mohri/pub/fst.pdf +- M. Hulden (2009), *Foma: a Finite-State Compiler and Library*, EACL. + https://aclanthology.org/E09-2008.pdf; HFST optimized-lookup. + https://github.com/hfst/hfst/wiki/OptimizedLookupFormat +- M. Silfverberg & K. Lindén (2009), HFST runtime lookup (67k–308k words/s). +- E. Antworth, PC-KIMMO v2 morphological parsing (chart over morphemes). + https://software.sil.org/pc-kimmo/morphological-parsing/ +- D. Weber, H.A. Black & S. McConnel (1988), *AMPLE: A Tool for Exploring Morphology*, SIL + OPAC 12. https://www.sil.org/resources/archives/5761 +- FLEx field evidence: flex-list "parsing broke down" thread (Awetí, ~9 min → ~100 s by manual + grammar audit). 
https://groups.google.com/g/flex-list/c/pkxCwIxIktg
+- Negative results consulted: Ibaraki (1977) dominance pruning (optimality-only guarantee);
+  Crawford et al. (1996) symmetry breaking (one-representative-per-orbit, unsound for
+  all-parses); Anders et al. (2024). https://arxiv.org/abs/2407.04419
+
+## 6. Corpus context
+
+*(top-N step counts per corpus — completed when the full-corpus scan lands)*
diff --git a/src/SIL.Machine.Morphology.HermitCrab.Tool/BatchCommand.cs b/src/SIL.Machine.Morphology.HermitCrab.Tool/BatchCommand.cs
new file mode 100644
index 000000000..2e4561873
--- /dev/null
+++ b/src/SIL.Machine.Morphology.HermitCrab.Tool/BatchCommand.cs
@@ -0,0 +1,265 @@
+using System;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+using ManyConsole;
+
+namespace SIL.Machine.Morphology.HermitCrab;
+
+/// <summary>
+/// Phase 0 of parse-optimization.md: parses every word in a word list and records a per-word result
+/// signature plus elapsed time, so two runs (before/after an engine change) can be diffed to confirm
+/// parse results are unchanged. Flushed per word and crash-resumable via --start, since some corpus
+/// words are expensive enough (100+ seconds, and one has been observed to crash a process outright) that
+/// losing partial progress on a multi-hour corpus run is unacceptable.
+/// </summary>
+internal class BatchCommand : ConsoleCommand
+{
+    private readonly HCContext _context;
+    private int _startIndex;
+    private string _ruleStatsPath;
+    private bool _parallel;
+    private int _parallelDegree = -1;
+
+    // Registers the "batch" command, its two positional arguments (word list, output file) and its options.
+    public BatchCommand(HCContext context)
+    {
+        _context = context;
+
+        IsCommand(
+            "batch",
+            "Parses every word in a word list, recording a result signature and timing per word (see parse-optimization.md Phase 0)"
+        );
+        SkipsCommandSummaryBeforeRunning();
+        HasAdditionalArguments(2, " <word-list-file> <output-file>");
+        HasOption(
+            "start=",
+            "0-based line index to resume at (for crash recovery; ignored with --parallel)",
+            v => _startIndex = int.Parse(v)
+        );
+        HasOption(
+            "rule-stats=",
+            "accumulate per-rule firing stats (category/stem/allomorph/environment buckets, with example "
+                + "words) across the whole run and write a report to {FILE} -- run with --sequential, the "
+                + "counters are not thread-safe",
+            v => _ruleStatsPath = v
+        );
+        HasOption(
+            "parallel:",
+            "parse words concurrently across a load-balanced, longest-word-first Parallel.ForEach "
+                + "(parse-optimization.md Phase 8a) -- requires the Morpher itself to be --sequential for "
+                + "the per-word memo tables to engage; degree defaults to Environment.ProcessorCount, or "
+                + "{N} if given; trades --start crash-resume for speed (output is buffered and written "
+                + "index-ordered at the end)",
+            v =>
+            {
+                _parallel = true;
+                if (!string.IsNullOrEmpty(v))
+                    _parallelDegree = int.Parse(v);
+            }
+        );
+    }
+
+    // Reads the word list (trimmed, blank lines dropped), rejects or warns about unsupported option
+    // combinations, then dispatches to the parallel or sequential runner. Returns 0 on success and -1
+    // when the options are invalid.
+    public override int Run(string[] remainingArguments)
+    {
+        string wordListPath = remainingArguments[0];
+        string outputPath = remainingArguments[1];
+
+        string[] words = File.ReadAllLines(wordListPath).Select(w => w.Trim()).Where(w => w.Length > 0).ToArray();
+
+        if (_ruleStatsPath != null)
+        {
+            if (_parallel)
+            {
+                _context.Out.WriteLine(
+                    "ERROR: --rule-stats and --parallel cannot be combined (counters are not thread-safe)."
+                );
+                return -1;
+            }
+            if (_context.Morpher.MaxDegreeOfParallelism != 1)
+            {
+                _context.Out.WriteLine(
+                    "WARNING: --rule-stats requested without --sequential; per-rule counters are not "
+                        + "thread-safe and will be unreliable under within-word parallelism."
+                );
+            }
+            _context.Morpher.AccumulateRuleStats = true;
+        }
+
+        if (_parallel)
+        {
+            if (_context.Morpher.MaxDegreeOfParallelism != 1)
+            {
+                _context.Out.WriteLine(
+                    "WARNING: --parallel requested without --sequential; the per-word memo tables "
+                        + "(parse-optimization.md Phases 2/3/3b) only engage on the sequential cascade, so "
+                        + "this run will not get their benefit."
+                );
+            }
+            if (_startIndex > 0)
+            {
+                _context.Out.WriteLine("WARNING: --start is ignored under --parallel; running the full word list.");
+            }
+            return RunParallel(words, outputPath);
+        }
+
+        return RunSequential(words, outputPath);
+    }
+
+    // Parses words[_startIndex..] one at a time, flushing a STARTED sentinel and then a result row per
+    // word so a crashed run can be resumed with --start. Appends to the output file when resuming and
+    // overwrites it otherwise.
+    private int RunSequential(string[] words, string outputPath)
+    {
+        // Reject a negative index before the writer is opened: it would truncate the output file
+        // (append is only used for a positive --start) and then fail on words[i].
+        if (_startIndex < 0)
+        {
+            _context.Out.WriteLine("ERROR: --start must be a non-negative line index.");
+            return -1;
+        }
+
+        using var writer = new StreamWriter(outputPath, append: _startIndex > 0) { AutoFlush = true };
+        var totalSw = Stopwatch.StartNew();
+        long parsed = 0,
+            skipped = 0;
+        for (int i = _startIndex; i < words.Length; i++)
+        {
+            string word = words[i];
+            // Sentinel written before the attempt: if this word crashes the process, a wrapper script
+            // can read the last line to find where to resume (see run_sena_shards.ps1 precedent).
+            writer.WriteLine($"{i}\t{word}\tSTARTED");
+            (string status, long elapsedMs, string signature) = ParseOneWord(word);
+            writer.WriteLine($"{i}\t{word}\t{elapsedMs}\t{status}\t{signature}");
+            if (status == "SKIPPED")
+                skipped++;
+            else
+                parsed++;
+            if (i % 100 == 0)
+            {
+                _context.Out.WriteLine("[{0}/{1}]", i, words.Length);
+                // Rewritten (not appended) every checkpoint so a mid-run crash on a pathological word
+                // still leaves a usable report reflecting everything parsed so far.
+                if (_ruleStatsPath != null)
+                    WriteRuleStatsReport();
+            }
+        }
+        totalSw.Stop();
+        if (_ruleStatsPath != null)
+            WriteRuleStatsReport();
+        _context.Out.WriteLine(
+            "batch complete: {0} words parsed ({1} skipped), {2}ms total",
+            parsed,
+            skipped,
+            totalSw.ElapsedMilliseconds
+        );
+        return 0;
+    }
+
+    // Phase 8a: the earlier per-word AutoFlush writer is not thread-safe and crash-resume has no meaning
+    // once words are handed out out-of-order, so rows are buffered per index and written once at the end.
+    // Ordering the work queue longest-word-first, combined with the load-balanced (chunked, not static
+    // range) partitioner below, is what closes the 2.9x gap between wall clock and the perfect-packing
+    // bound measured on 2026-07-03 -- heavy words no longer cluster onto a few threads.
+    private int RunParallel(string[] words, string outputPath)
+    {
+        var rows = new string[words.Length];
+        int[] order = Enumerable.Range(0, words.Length).OrderByDescending(i => words[i].Length).ToArray();
+
+        var totalSw = Stopwatch.StartNew();
+        long parsed = 0,
+            skipped = 0;
+        long completed = 0;
+        // _context.Out is a host-supplied TextWriter (possibly a file writer rather than Console.Out) whose
+        // instance members are not guaranteed thread-safe, so worker progress lines go through this lock.
+        var progressLock = new object();
+
+        var parallelOptions = new ParallelOptions();
+        if (_parallelDegree > 0)
+            parallelOptions.MaxDegreeOfParallelism = _parallelDegree;
+
+        Parallel.ForEach(
+            Partitioner.Create(order, loadBalance: true),
+            parallelOptions,
+            i =>
+            {
+                string word = words[i];
+                (string status, long elapsedMs, string signature) = ParseOneWord(word);
+                rows[i] = $"{i}\t{word}\t{elapsedMs}\t{status}\t{signature}";
+                if (status == "SKIPPED")
+                    Interlocked.Increment(ref skipped);
+                else
+                    Interlocked.Increment(ref parsed);
+                long n = Interlocked.Increment(ref completed);
+                if (n % 100 == 0)
+                {
+                    lock (progressLock)
+                        _context.Out.WriteLine("[{0}/{1}]", n, words.Length);
+                }
+            }
+        );
+        totalSw.Stop();
+
+        using (var writer = new StreamWriter(outputPath, append: false))
+        {
+            foreach (string row in rows)
+                writer.WriteLine(row);
+        }
+
+        _context.Out.WriteLine(
+            "batch complete: {0} words parsed ({1} skipped), {2}ms total",
+            parsed,
+            skipped,
+            totalSw.ElapsedMilliseconds
+        );
+        return 0;
+    }
+
+    // Parses a single word and times it. Returns status "ok" with the elapsed milliseconds and result
+    // signature, or ("SKIPPED", 0, "-") when the morpher rejects the word with InvalidShapeException.
+    private (string status, long elapsedMs, string signature) ParseOneWord(string word)
+    {
+        var sw = Stopwatch.StartNew();
+        try
+        {
+            Word[] results = _context.Morpher.ParseWord(word, out _).ToArray();
+            sw.Stop();
+            return ("ok", sw.ElapsedMilliseconds, BuildSignature(results));
+        }
+        catch (InvalidShapeException)
+        {
+            return ("SKIPPED", 0, "-");
+        }
+    }
+
+    // Overwrites the --rule-stats file with the morpher's current analysis and synthesis rule stats.
+    private void WriteRuleStatsReport()
+    {
+        using var statsWriter = new StreamWriter(_ruleStatsPath, append: false);
+        RuleStatsReport.Write(statsWriter, "Analysis", _context.Morpher.AnalysisRuleStats);
+        RuleStatsReport.Write(statsWriter, "Synthesis", _context.Morpher.SynthesisRuleStats);
+    }
+
+    // Order-independent (sorted) so two runs that find the same parses in a different internal order
+    // still compare equal; a change in this signature means parse RESULTS changed, which every phase in
+    // parse-optimization.md is required not to do.
+    private static string BuildSignature(IEnumerable<Word> results)
+    {
+        List<string> signatures = results
+            .Select(w =>
+                string.Join("+", w.AllomorphsInMorphOrder.Select(a => a.Morpheme.Id))
+                + "|"
+                + w.Shape.ToRegexString(w.Stratum.CharacterDefinitionTable, true)
+            )
+            .OrderBy(s => s, StringComparer.Ordinal)
+            .ToList();
+        return signatures.Count == 0 ? "-" : string.Join(";", signatures);
+    }
+}
diff --git a/src/SIL.Machine.Morphology.HermitCrab.Tool/HCContext.cs b/src/SIL.Machine.Morphology.HermitCrab.Tool/HCContext.cs
index e6c8866c0..42179d46c 100644
--- a/src/SIL.Machine.Morphology.HermitCrab.Tool/HCContext.cs
+++ b/src/SIL.Machine.Morphology.HermitCrab.Tool/HCContext.cs
@@ -8,9 +8,9 @@ internal class HCContext(Language language, TextWriter outWriter)
     private Morpher _morpher;
     private readonly TextWriter _outWriter = outWriter;
 
-    public void Compile()
+    public void Compile(bool sequential = false)
     {
-        _morpher = new Morpher(new TraceManager(), _language);
+        _morpher = new Morpher(new TraceManager(), _language, sequential ? 1 : -1);
     }
 
     public Language Language
diff --git a/src/SIL.Machine.Morphology.HermitCrab.Tool/Program.cs b/src/SIL.Machine.Morphology.HermitCrab.Tool/Program.cs
index ff8e86bcc..3d30ef513 100644
--- a/src/SIL.Machine.Morphology.HermitCrab.Tool/Program.cs
+++ b/src/SIL.Machine.Morphology.HermitCrab.Tool/Program.cs
@@ -20,6 +20,8 @@ public static int Main(string[] args)
         string scriptFile = null;
         bool showHelp = false;
         bool quitOnError = true;
+        bool sequential = false;
+        bool lexicalGate = false;
 
         var p = new OptionSet
         {
@@ -31,6 +33,20 @@ public static int Main(string[] args)
                 "continues when an error occurs while loading the configuration",
                 value => quitOnError = value == null
             },
+            {
+                "sequential",
+                "parse single-threaded (maxDegreeOfParallelism: 1) -- the mode a caller that "
+                    + "parallelizes across words itself (e.g. 
batch corpus runs) should use; also the " + + "only mode the analysis nogood cache (parse-optimization.md Phase 2) currently covers", + value => sequential = value != null + }, + { + "lexical-gate", + "enable Morpher.EnableLexicalGating (parse-optimization.md Phase 5) -- default off, " + + "highest-risk optimization; use for A/B corpus verification against a run without " + + "this flag", + value => lexicalGate = value != null + }, { "h|help", "show this help message and exit", value => showHelp = value != null }, }; @@ -66,7 +82,8 @@ public static int Main(string[] args) context = new HCContext(language, output ?? Console.Out); Console.Write("Compiling rules... "); - context.Compile(); + context.Compile(sequential); + context.Morpher.EnableLexicalGating = lexicalGate; Console.WriteLine("done."); Console.WriteLine("{0} loaded.", language.Name); Console.WriteLine(); @@ -92,6 +109,7 @@ public static int Main(string[] args) new TracingCommand(context), new TestCommand(context), new StatsCommand(context), + new BatchCommand(context), }; string input; diff --git a/src/SIL.Machine.Morphology.HermitCrab.Tool/RuleStatsReport.cs b/src/SIL.Machine.Morphology.HermitCrab.Tool/RuleStatsReport.cs new file mode 100644 index 000000000..bb6349515 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab.Tool/RuleStatsReport.cs @@ -0,0 +1,56 @@ +using System.Diagnostics; +using System.IO; +using System.Linq; +using SIL.Machine.Rules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// Formats the InstrumentedRule tree (see Morpher.AccumulateRuleStats) as a flat, grep-able text report: +/// one line per rule with its totals, followed by its bucket breakdowns sorted so the rarest (most +/// suspicious) buckets are easy to spot against the common case -- that's the "300 times vs 4 times, are +/// the 4 wrong?" comparison this whole feature exists for. 
+/// +internal static class RuleStatsReport +{ + public static void Write(TextWriter writer, string label, InstrumentedRule root) + { + writer.WriteLine($"==== {label} ===="); + if (root == null) + { + writer.WriteLine("(no rule tree)"); + return; + } + WriteRule(writer, root, ""); + writer.WriteLine(); + } + + private static void WriteRule(TextWriter writer, InstrumentedRule rule, string path) + { + if (rule == null) + return; + + // Substitute the "?" placeholder for an unnamed rule at every depth, not only at the root, so a + // nested rule whose Name is null still contributes a visible path segment instead of a blank. + string name = rule.Name ?? "?"; + string fullPath = string.IsNullOrEmpty(path) ? name : $"{path} > {name}"; + + if (rule.InputCount > 0 || rule.BucketGroups.Count > 0) + { + double elapsedMs = rule.ElapsedTime * 1000.0 / Stopwatch.Frequency; + writer.WriteLine( + $"{fullPath}\tinputs={rule.InputCount}\tsuccesses={rule.SuccessCount}\toutputs={rule.OutputCount}\telapsedMs={elapsedMs:F0}" + ); + + foreach (var group in rule.BucketGroups.OrderBy(g => g.Key)) + { + writer.WriteLine($" [{group.Key}]"); + foreach (var bucket in group.Value.OrderByDescending(b => b.Value.Count)) + { + string examples = string.Join(" | ", bucket.Value.Examples); + writer.WriteLine($" {bucket.Key}: {bucket.Value.Count}\te.g. 
{examples}"); + } + } + } + + foreach (var sub in rule.SubRules) + WriteRule(writer, sub, fullPath); + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisAffixTemplateRule.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisAffixTemplateRule.cs index f401ce0fa..f93fec49c 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/AnalysisAffixTemplateRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisAffixTemplateRule.cs @@ -10,7 +10,7 @@ namespace SIL.Machine.Morphology.HermitCrab { - internal class AnalysisAffixTemplateRule : IRule + internal class AnalysisAffixTemplateRule : InstrumentedRule { private readonly Morpher _morpher; private readonly AffixTemplate _template; @@ -18,6 +18,7 @@ internal class AnalysisAffixTemplateRule : IRule public AnalysisAffixTemplateRule(Morpher morpher, AffixTemplate template) { + Name = template.Name; _morpher = morpher; _template = template; _rules = new List>( @@ -27,9 +28,10 @@ public AnalysisAffixTemplateRule(Morpher morpher, AffixTemplate template) FreezableEqualityComparer.Default )) ); + AddSubRules(_rules); } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (!_morpher.RuleSelector(_template)) return Enumerable.Empty(); @@ -41,7 +43,11 @@ public IEnumerable Apply(Word input) if (_morpher.TraceManager.IsTracing) _morpher.TraceManager.BeginUnapplyTemplate(_template, input); - Word inWord = input.Clone(); + // Shape-sharing clone (parse-optimization.md Phase 10a): this clone is frozen on the next + // line and nothing between clone and freeze touches the shape -- slot rules only ever READ + // it (FST matching), and their outputs are separate deep clones. Falls back to a deep copy + // automatically when input's shape isn't frozen yet (e.g. unmemoized/tracing paths). 
+ Word inWord = input.CloneShareFrozenShape(); inWord.Freeze(); var output = new HashSet(FreezableEqualityComparer.Default); @@ -60,6 +66,7 @@ public IEnumerable Apply(Word input) sfs.Add(fs); outWord.SyntacticFeatureStruct = sfs; } + AddRuleStats(output.Count); return output; } diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisLanguageRule.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisLanguageRule.cs index 4bdd3c959..68bc7e5e4 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/AnalysisLanguageRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisLanguageRule.cs @@ -6,7 +6,7 @@ namespace SIL.Machine.Morphology.HermitCrab { - internal class AnalysisLanguageRule : IRule + internal class AnalysisLanguageRule : InstrumentedRule { private readonly Morpher _morpher; private readonly List _strata; @@ -14,12 +14,14 @@ internal class AnalysisLanguageRule : IRule public AnalysisLanguageRule(Morpher morpher, Language language) { + Name = "Analysis"; _morpher = morpher; _strata = language.Strata.Reverse().ToList(); _rules = _strata.Select(stratum => stratum.CompileAnalysisRule(morpher)).ToList(); + AddSubRules(_rules); } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { var inputSet = new HashSet(FreezableEqualityComparer.Default) { input }; var tempSet = new HashSet(FreezableEqualityComparer.Default); @@ -45,6 +47,7 @@ public IEnumerable Apply(Word input) inputSet = outputSet; } + AddRuleStats(results.Count); return results; } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisScope.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisScope.cs new file mode 100644 index 000000000..6eab999af --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisScope.cs @@ -0,0 +1,88 @@ +using System.Collections.Concurrent; +using System.Collections.Generic; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Per-parse cache carrier threaded through clones exactly like + /// -- reference-shared, excluded 
from + /// Word.FreezeImpl/Word.ValueEquals so existing dedup semantics are unchanged. Holds the + /// analysis-cascade memo table from parse-optimization.md Phases 2 (nogoods) and 3 (positive memo with + /// trail replay) -- see . + /// One instance per call: entries are state facts + /// about a specific parse (a state key does not encode the target surface word), so sharing this + /// across concurrent parses of different words would be unsound without also scoping the key to the + /// word -- that cross-word extension is explicitly deferred (parse-optimization.md §4.6). + /// Thread-safe because a single word's analysis can itself run in parallel + /// (ParallelCombinationRuleCascade, used when > 1) -- + /// though today only the sequential cascade actually reads/writes this (see + /// 's doc comment). + /// + internal sealed class AnalysisScope + { + public AnalysisScope(bool lexicalGatingActive) + { + LexicalGatingActive = lexicalGatingActive; + } + + // parse-optimization.md Phase 5: precomputed once per parse (Morpher.ParseWord already knows + // whether tracing/guessRoot/the grammar's own qualification allow the gate for this call) rather + // than re-checked per candidate -- cheap, and keeps MemoizedCombinationRuleCascade from needing a + // direct Morpher reference just to read three unrelated conditions. + public bool LexicalGatingActive { get; } + + // OOM guard (parse-optimization.md Phase 3): cinacemerwa-class words have crashed a test host on + // memory before budgets existed to stop them, and a positive memo holds actual Word lists (not + // just a boolean like the nogood case), so it is the one that can grow unboundedly. Past the cap, + // new subtrees are simply not memoized -- correctness is unaffected, only the hit rate degrades. 
+ private const int MaxMemoEntries = 100_000; + + public ConcurrentDictionary Memo { get; } = + new ConcurrentDictionary(); + + // Same discipline, different battery: template unapplication (AnalysisStratumRule.ApplyTemplates) + // reads exactly what AnalysisStateKey captures (shape, syntactic FS, stratum, non-heads) and + // nothing trail-order-dependent, so equal-keyed arrivals get equal template outputs modulo the + // trail prefix -- the same Word.ReplayOnto graft Phase 3 uses. Kept as a separate table from + // Memo because the two record different computations over the same key space (a state's mrule + // subtree vs. its one-level template outputs); merging them would conflate a "no mrule results" + // nogood with "no template outputs". Measured motivation (2026-07-03, atawirambo): the template + // battery was invoked 38,840 times against ~2,581 unique keys -- 93% of total parse wall time -- + // because Phases 2/3 memoized only the mrule cascade and templates sat outside it. + public ConcurrentDictionary TemplateMemo { get; } = + new ConcurrentDictionary(); + + // Keys currently under expansion on some call stack -- guards the in-flight re-entry case (a + // multiApp cascade can reach the same state again before its own first expansion has completed, + // e.g. via a self-loop). A hit here must fall through to plain, unmemoized expansion rather than + // read a nonexistent/partial entry or deadlock; see MemoizedCombinationRuleCascade.ApplyRules. + public ConcurrentDictionary InProgress { get; } = + new ConcurrentDictionary(); + + // Both tables hold Word lists and both are gated by this one property (TemplateMemo inserts check + // it too), so the cap has to count both: bounding Memo alone would let TemplateMemo grow without + // limit whenever Memo itself stays small. Past the cap, new entries are simply not memoized. + public bool HasMemoCapacity => Memo.Count + TemplateMemo.Count < MaxMemoEntries; + } + + /// + /// A memoized analysis-cascade subtree (parse-optimization.md Phase 3). 
empty = + /// the Phase 2 "nogood" case (subtree proved to yield nothing); non-empty = the Phase 3 positive case, + /// replayable onto a differently-ordered arrival at the same via + /// , using / + /// to split each stored result's trail/non-heads into the + /// (discarded, replaced) prefix and the (kept) subtree-local suffix. There is no "budget exhausted / + /// incomplete" flag: this branch has no step/time budget infrastructure (see parse-optimization.md + /// "Branch context"), so every recorded subtree was explored to full completion. + /// + internal sealed class MemoEntry + { + public MemoEntry(IReadOnlyList results, int mruleTrailPrefixLength, int nonHeadPrefixLength) + { + Results = results; + MruleTrailPrefixLength = mruleTrailPrefixLength; + NonHeadPrefixLength = nonHeadPrefixLength; + } + + public IReadOnlyList Results { get; } + public int MruleTrailPrefixLength { get; } + public int NonHeadPrefixLength { get; } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisStateKey.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisStateKey.cs new file mode 100644 index 000000000..fe48f977d --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisStateKey.cs @@ -0,0 +1,117 @@ +using System; +using System.Collections.Generic; +using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Order-independent identity of an analysis-cascade node, used by the memo cache in + /// 's Unordered-mode morphological rule cascade (see + /// parse-optimization.md Phases 2-3). Two Words with an equal key are guaranteed to make identical + /// decisions in every analysis-side morphological rule that cascade can invoke -- verified by + /// inspecting what each one reads from its input: + /// + /// : Shape (FST pattern match) and + /// (unifiability gate) plus a per-rule unapplication count. + /// : adds + /// (MaxStemCount gate) -- it never reads the non-heads' own content, only the count. 
+ /// : adds + /// . + /// + /// but never the ORDER those rules were unapplied in -- exactly the redundancy this key collapses. + /// Deliberately excludes fields includes for a different purpose + /// (result dedup): the unapplication trail as an ordered SEQUENCE (replaced here by an + /// order-independent multiset) and _isLastAppliedRuleFinal/IsPartial, which are not read + /// by any analysis-side rule (grep-verified against every file matching + /// MorphologicalRules/Analysis*.cs and PhonologicalRules/Analysis*.cs). + /// + internal readonly struct AnalysisStateKey : IEquatable + { + private readonly Shape _shape; + private readonly Stratum _stratum; + private readonly FeatureStruct _syntacticFS; + private readonly FeatureStruct _realizationalFS; + private readonly int _nonHeadCount; + private readonly IReadOnlyDictionary _ruleCounts; + private readonly int _hashCode; + + public AnalysisStateKey(Word word) + { + _shape = word.Shape; + _stratum = word.Stratum; + _syntacticFS = word.SyntacticFeatureStruct; + _realizationalFS = word.RealizationalFeatureStruct; + _nonHeadCount = word.NonHeadCount; + _ruleCounts = word.UnappliedRuleCounts; + + // Defensive, not incidental: AnalysisAffixTemplateRule.Apply reassigns SyntacticFeatureStruct + // to a freshly-cloned, unfrozen FeatureStruct AFTER the owning Word is already frozen (its + // setter has no CheckFrozen() guard, unlike RealizationalFeatureStruct's -- a pre-existing + // quirk nothing previously surfaced, since Word's own FreezeImpl/ValueEquals deliberately + // exclude SyntacticFeatureStruct). Freeze() is idempotent (FeatureStruct.cs: "if (IsFrozen) + // return") and every write site in this codebase clones before mutating (never mutates a + // FeatureStruct another reference might still be holding), so freezing here on read is safe. 
+ _shape.Freeze(); + _syntacticFS.Freeze(); + _realizationalFS.Freeze(); + + int hash = 17; + hash = hash * 31 + _shape.GetFrozenHashCode(); + hash = hash * 31 + (_stratum?.GetHashCode() ?? 0); + hash = hash * 31 + _syntacticFS.GetFrozenHashCode(); + hash = hash * 31 + _realizationalFS.GetFrozenHashCode(); + hash = hash * 31 + _nonHeadCount; + if (_ruleCounts != null) + { + // XOR, not the usual *31 rolling combine: the multiset is unordered, so the combination + // must be commutative -- two dictionaries with the same entries built up in different + // unapplication orders must hash identically. + int multisetHash = 0; + foreach (KeyValuePair kvp in _ruleCounts) + multisetHash ^= (kvp.Key.GetHashCode() * 397) ^ kvp.Value; + hash = hash * 31 + multisetHash; + } + _hashCode = hash; + } + + public override int GetHashCode() => _hashCode; + + public override bool Equals(object obj) => obj is AnalysisStateKey other && Equals(other); + + public bool Equals(AnalysisStateKey other) + { + if (_hashCode != other._hashCode) + return false; + if (_nonHeadCount != other._nonHeadCount || !ReferenceEquals(_stratum, other._stratum)) + return false; + if (!_shape.ValueEquals(other._shape)) + return false; + if ( + !_syntacticFS.ValueEquals(other._syntacticFS) + || !_realizationalFS.ValueEquals(other._realizationalFS) + ) + return false; + return RuleCountsEqual(_ruleCounts, other._ruleCounts); + } + + private static bool RuleCountsEqual( + IReadOnlyDictionary a, + IReadOnlyDictionary b + ) + { + int aCount = a?.Count ?? 0; + int bCount = b?.Count ?? 
0; + if (aCount != bCount) + return false; + if (aCount == 0) + return true; + foreach (KeyValuePair kvp in a) + { + if (!b.TryGetValue(kvp.Key, out int otherCount) || otherCount != kvp.Value) + return false; + } + return true; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisStratumRule.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisStratumRule.cs index aadef0838..318730151 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/AnalysisStratumRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisStratumRule.cs @@ -8,7 +8,7 @@ namespace SIL.Machine.Morphology.HermitCrab { - internal class AnalysisStratumRule : IRule + internal class AnalysisStratumRule : InstrumentedRule { private readonly IRule _mrulesRule; private readonly IRule _prulesRule; @@ -18,6 +18,7 @@ internal class AnalysisStratumRule : IRule public AnalysisStratumRule(Morpher morpher, Stratum stratum) { + Name = stratum.Name; _stratum = stratum; _morpher = morpher; _prulesRule = new LinearRuleCascade( @@ -47,14 +48,16 @@ public AnalysisStratumRule(Morpher morpher, Stratum stratum) break; case MorphologicalRuleOrder.Unordered: // Single-threaded when the caller caps within-word parallelism (e.g. it - // parallelizes across words itself); parallel cascade otherwise. + // parallelizes across words itself); parallel cascade otherwise. The sequential + // cascade additionally memoizes analysis states (parse-optimization.md Phases 2-3) -- + // the parallel one does not yet, see MemoizedCombinationRuleCascade's doc comment. _mrulesRule = morpher.MaxDegreeOfParallelism == 1 ? 
(IRule) - new CombinationRuleCascade( + new MemoizedCombinationRuleCascade( mrules, - true, - FreezableEqualityComparer.Default + FreezableEqualityComparer.Default, + morpher ) : new ParallelCombinationRuleCascade( mrules, @@ -68,6 +71,9 @@ public AnalysisStratumRule(Morpher morpher, Stratum stratum) }; break; } + AddSubRule(_prulesRule); + AddSubRule(_templatesRule); + AddSubRule(_mrulesRule); } private IRule CompileAffixTemplate(AffixTemplate template, Morpher morpher) @@ -106,8 +112,9 @@ private IRule CompilePhonologicalRule(IPhonologicalRule prule, Morphe } } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { + long startTime = Stopwatch.GetTimestamp(); if (_morpher.TraceManager.IsTracing) _morpher.TraceManager.BeginUnapplyStratum(_stratum, input); @@ -117,6 +124,29 @@ public IEnumerable Apply(Word input) _prulesRule.Apply(input); input.Freeze(); + + // parse-optimization.md Phase 4 Gate B: once phonological unapplication has grown this + // candidate past the longest form the grammar's own lexicon+rules could ever produce (root + + // every rule's own max insertion, see GrammarAnalyzer), no morphological unapplication from + // here can ever recover a valid analysis -- affix rules only ever shrink during analysis (they + // reverse synthesis's insertions), so the length can only go down from here. Skip the + // (potentially exponential) morphological cascade entirely rather than let it search a subtree + // that is already provably dead. Null MaxAnalysisLength means the grammar couldn't be measured + // this exactly (a compounding rule, or a phonological pattern with quantifiers/groups) -- see + // GrammarAnalyzer's remarks -- so the gate is off, matching today's unbounded behavior exactly. + // Bypassed while tracing (ground rule 1): the early return below would also skip this word's + // EndUnapplyStratum trace event further down. 
+ if ( + !_morpher.TraceManager.IsTracing + && _morpher.MaxAnalysisLength is int maxLength + && input.Shape.SegmentCount() > maxLength + ) + { + ElapsedTime += Stopwatch.GetTimestamp() - startTime; + AddRuleStats(0); + return Enumerable.Empty(); + } + IDictionary shapeWord = null; // Don't merge if tracing because it messes up the tracing. bool mergeEquivalentAnalyses = _morpher.MergeEquivalentAnalyses && !_morpher.TraceManager.IsTracing; @@ -151,9 +181,16 @@ public IEnumerable Apply(Word input) if (_morpher.MaxUnapplications > 0 && output.Count >= _morpher.MaxUnapplications) break; } + ElapsedTime += Stopwatch.GetTimestamp() - startTime; + AddRuleStats(output.Count); return output; } + // Test hook: incremented on every template-memo replay (see ApplyTemplateBattery). The + // equivalence test that covers the replay path asserts this is nonzero so it can never go + // vacuous -- a memo that silently stops firing would otherwise look exactly like a passing test. + internal static long DiagTemplateMemoHits; + private IEnumerable ApplyMorphologicalRules(Word input) { foreach (Word mruleOutWord in _mrulesRule.Apply(input)) @@ -173,9 +210,47 @@ private IEnumerable ApplyMorphologicalRules(Word input) } } + // Runs the template battery for `input`, memoized by AnalysisStateKey (parse-optimization.md + // Phase 3, extended 2026-07-03). Template unapplication reads only what the key captures -- + // shape, syntactic FS, stratum, non-heads -- never the trail's ORDER, so an equal-keyed arrival + // gets the stored outputs replayed (Word.ReplayOnto grafts the new arrival's own trail/non-head + // prefix, identical to the mrule-cascade memo). Measured motivation: on Sena's `atawirambo`, the + // template battery ran 38,840 times against ~2,581 unique keys and accounted for 93% of parse + // wall time -- the mrule cascade Phases 2/3 memoized had already shrunk to ~1.4s. 
Sequential + // only and skipped while tracing (scope is null then), matching the mrule memo's scoping; no + // in-flight re-entry guard is needed here because _templatesRule.Apply is eager and + // self-contained (the template<->mrule mutual recursion lives in this class's enumerators, + // outside the memoized call). + private IEnumerable ApplyTemplateBattery(Word input) + { + AnalysisScope scope = input.AnalysisScope; + if (scope == null || _morpher.MaxDegreeOfParallelism != 1) + return _templatesRule.Apply(input); + + var key = new AnalysisStateKey(input); + if (scope.TemplateMemo.TryGetValue(key, out MemoEntry entry)) + { + var replayed = new List(entry.Results.Count); + // The counter is static, i.e. shared by every parse in the process, and a caller may parse + // several words concurrently (each single-threaded internally), so bump it atomically + // rather than with a plain ++ that can lose or tear updates. + System.Threading.Interlocked.Increment(ref DiagTemplateMemoHits); + foreach (Word stored in entry.Results) + replayed.Add(stored.ReplayOnto(input, entry.MruleTrailPrefixLength, entry.NonHeadPrefixLength)); + return replayed; + } + + var results = new List(_templatesRule.Apply(input)); + if (scope.HasMemoCapacity) + { + scope.TemplateMemo.TryAdd( + key, + new MemoEntry(results, input.MorphologicalRuleTrailLength, input.NonHeadCount) + ); + } + return results; + } + private IEnumerable ApplyTemplates(Word input) { - foreach (Word tempOutWord in _templatesRule.Apply(input)) + foreach (Word tempOutWord in ApplyTemplateBattery(input)) { switch (_stratum.MorphologicalRuleOrder) { diff --git a/src/SIL.Machine.Morphology.HermitCrab/GrammarAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/GrammarAnalyzer.cs new file mode 100644 index 000000000..f64282303 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/GrammarAnalyzer.cs @@ -0,0 +1,199 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; +using SIL.Machine.Morphology.HermitCrab.PhonologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Computes a grammar-wide, word-independent bound on how long an underlying (analysis) form can + /// validly be, relative to the 
lexicon and rule set actually declared (parse-optimization.md Phase 4's + /// "Gate B" -- Gate A, a mirror-image synthesis-side bound, was attempted and reverted; see the note + /// in ). The bound is a deliberately loose over-approximation + /// -- summed across every rule's own already-declared reapplication limit + /// (, + /// ), never estimated -- so it can prune a candidate only + /// when NO combination of rules in the grammar could ever produce something that long, regardless of + /// which specific root or derivation path is under consideration. Returns null (meaning "no admissible + /// bound, gate off") the moment any rule's shape falls outside what this class knows how to measure + /// exactly (quantifiers/groups/alternations in a phonological Lhs/Rhs, or a compounding rule present + /// at all, since compounding combines multiple full root lengths rather than adding a bounded affix) + /// -- per the plan's own rule: skipping only costs pruning opportunity, an admissible bound must never + /// be guessed. + /// + public static class GrammarAnalyzer + { + /// + /// The longest possible underlying form (in real segments) any analysis candidate could validly + /// represent: the longest root allomorph in the lexicon, plus every affix/realizational rule's own + /// maximum possible net insertion (its allomorphs' / + /// actions, summed and multiplied by + /// ), + /// plus every phonological deletion-type subrule's maximum possible net restoration. Null if any + /// rule in the grammar can't be measured this way (see class remarks). 
+ /// + /// + /// The phonological term is compounding, not additive: AnalysisRewriteRule's Deletion + /// reapply loop runs + 1 passes, and each pass is a + /// SimultaneousPhonologicalPatternRule sweep that can restore EVERY non-overlapping match + /// site in the current shape at once, not just one -- a real case (RewriteRuleTests + /// .MultipleDeletionRules: an 8-segment root deletes two independent "ii" clusters down to a + /// 4-segment surface form in one pass) needs more than "count of subrules" restored segments per + /// pass. Bounding the number of sites by the current running length (itself already an + /// over-approximation of the true pre-phonology length at this point) keeps this sound: real growth + /// can never exceed runningLength * subruleDelta per pass, since a simultaneous sweep cannot + /// match more sites than there are segments to match against. + /// + public static int? ComputeMaxAnalysisLength(Language language, int deletionReapplications) + { + int bound = 0; + foreach (Stratum stratum in language.Strata) + { + if (stratum.MorphologicalRules.OfType().Any()) + return null; + + int longestRoot = stratum.Entries.SelectMany(e => e.Allomorphs).Select(SegmentCount).DefaultIfEmpty(0).Max(); + bound += longestRoot; + + foreach (AffixProcessRule rule in stratum.MorphologicalRules.OfType()) + bound += MaxAllomorphInsertion(rule.Allomorphs) * rule.MaxApplicationCount; + + foreach ( + RealizationalAffixProcessRule rule in stratum.MorphologicalRules.OfType() + ) + bound += MaxAllomorphInsertion(rule.Allomorphs); + + int phonoGrowthRate = 0; + foreach (RewriteRule rule in stratum.PhonologicalRules.OfType()) + { + if (!TryGetFlatSegmentCount(rule.Lhs, out int lhsCount)) + return null; + foreach (RewriteSubrule sr in rule.Subrules) + { + if (!TryGetFlatSegmentCount(sr.Rhs, out int rhsCount)) + return null; + if (lhsCount > rhsCount) + phonoGrowthRate += lhsCount - rhsCount; + } + } + for (int pass = 0; pass < deletionReapplications + 1 && 
phonoGrowthRate > 0; pass++) + { + // Widen to long before compounding: this growth is geometric in the pass count and can + // exceed int.MaxValue, and a wrapped (negative) bound would make the length gate reject + // every candidate. An overflowing bound is not a usable bound, so report "gate off" + // (null) exactly like the other unmeasurable cases. + long grown = bound + (long)bound * phonoGrowthRate; + if (grown > int.MaxValue) + return null; + bound = (int)grown; + } + } + return bound; + } + + /// + /// parse-optimization.md Phase 5's edge-stripper qualification: true only if every affix rule in + /// the grammar is a pure "copy a contiguous span of the input, optionally with material inserted + /// only before/after it" transform, and no stratum has a or a + /// . This is the soundness precondition for + /// : + /// assumes a root that exists in the lexicon must still appear as an intact contiguous window in + /// any not-yet-fully-analyzed candidate. Reduplication (the same input span copied more than once) + /// and infixation (material inserted BETWEEN two copied spans, splitting one span's material from + /// another's) both break that assumption -- the true root window would be split or duplicated, so + /// a real root could be invisible to a contiguous-window search. Compounding combines multiple + /// independent root windows, and metathesis physically reorders segments -- both are also outside + /// what a contiguous-window search over the ORIGINAL lexicon strings can safely reason about. + /// This is a single whole-language verdict, not per-stratum: simpler and strictly safer than the + /// per-stratum granularity the plan sketches (a grammar with one unqualified stratum disables the + /// gate everywhere rather than only where it's actually unsafe). 
+ /// + public static bool IsEdgeStripperQualified(Language language) + { + foreach (Stratum stratum in language.Strata) + { + if (stratum.MorphologicalRules.OfType().Any()) + return false; + if (stratum.PhonologicalRules.OfType().Any()) + return false; + + foreach (AffixProcessRule rule in stratum.MorphologicalRules.OfType()) + { + if (rule.Allomorphs.Any(a => !IsEdgeStripperAllomorph(a))) + return false; + } + foreach ( + RealizationalAffixProcessRule rule in stratum.MorphologicalRules.OfType() + ) + { + if (rule.Allomorphs.Any(a => !IsEdgeStripperAllomorph(a))) + return false; + } + } + return true; + } + + /// + /// An allomorph qualifies if its Rhs, scanned in order, looks like + /// [insert]* [copy]+ [insert]* with every copied part name appearing at most once: all + /// copied-from-input material forms one contiguous block (no insertion sandwiched between two + /// copy actions -- that would be infixation, splitting the input material apart), and no part is + /// copied twice (that would be reduplication). 
+ /// + private static bool IsEdgeStripperAllomorph(AffixProcessAllomorph allomorph) + { + var seenParts = new HashSet(); + bool sawCopy = false; + bool sawInsertAfterCopy = false; + foreach (MorphologicalOutputAction action in allomorph.Rhs) + { + switch (action) + { + case CopyFromInput copyFromInput: + if (sawInsertAfterCopy || !seenParts.Add(copyFromInput.PartName)) + return false; + sawCopy = true; + break; + case ModifyFromInput modifyFromInput: + if (sawInsertAfterCopy || !seenParts.Add(modifyFromInput.PartName)) + return false; + sawCopy = true; + break; + case InsertSegments _: + case InsertSimpleContext _: + if (sawCopy) + sawInsertAfterCopy = true; + break; + } + } + // The qualifying shape is [insert]* [copy]+ [insert]*: at least one copy is required. An Rhs + // that copies nothing carries none of the input into the output, so the input's material cannot + // appear as a contiguous window there -- treat it as unqualified (the conservative verdict). + return sawCopy; + } + + private static int SegmentCount(RootAllomorph allomorph) => allomorph.Segments.Shape.SegmentCount(); + + private static int MaxAllomorphInsertion(IEnumerable allomorphs) + { + int max = 0; + foreach (AffixProcessAllomorph allo in allomorphs) + { + int insertion = 0; + foreach (MorphologicalOutputAction action in allo.Rhs) + { + switch (action) + { + case InsertSegments insertSegments: + insertion += insertSegments.Segments.Shape.SegmentCount(); + break; + case InsertSimpleContext _: + insertion += 1; + break; + // CopyFromInput/ModifyFromInput carry forward material already matched from the + // input (the root or a nested part) -- already counted via the root/allomorph + // length elsewhere, so they contribute 0 NEW segments here. 
+ } + } + if (insertion > max) + max = insertion; + } + return max; + } + + private static bool TryGetFlatSegmentCount(Pattern pattern, out int count) + { + count = pattern.Children.Count; + return pattern.Children.All(c => c is Constraint ctr && ctr.Type() == HCFeatureSystem.Segment); + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/HermitCrabExtensions.cs b/src/SIL.Machine.Morphology.HermitCrab/HermitCrabExtensions.cs index 5cf2ad5af..aa42e29a8 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/HermitCrabExtensions.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/HermitCrabExtensions.cs @@ -142,6 +142,23 @@ internal static void SetDeleted(this ShapeNode node, bool deleted) ); } + /// + /// Counts real phonetic segments -- excludes boundary/anchor markers and nodes marked + /// (see parse-optimization.md's Phase 3 note on why deletion + /// marks rather than removes a node). Used by 's Phase 4 length-bound + /// gates, which reason about how many real segments a candidate could ever justify. + /// + internal static int SegmentCount(this Shape shape) + { + int count = 0; + foreach (ShapeNode node in shape) + { + if (node.Type() == HCFeatureSystem.Segment && !node.IsDeleted()) + count++; + } + return count; + } + internal static bool IsIterative(this ShapeNode node) { return node.Annotation.Data != null; diff --git a/src/SIL.Machine.Morphology.HermitCrab/MemoizedCombinationRuleCascade.cs b/src/SIL.Machine.Morphology.HermitCrab/MemoizedCombinationRuleCascade.cs new file mode 100644 index 000000000..7dc392b7f --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/MemoizedCombinationRuleCascade.cs @@ -0,0 +1,145 @@ +using System.Collections.Generic; +using SIL.Machine.Rules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Drop-in replacement for the sequential on + /// Unordered-order analysis strata (parse-optimization.md Phases 2-3). 
Before expanding a node, checks + /// whether an earlier expansion elsewhere in the same word's analysis -- reached via a different + /// unapplication order, but with an equal AnalysisStateKey -- already searched this + /// exact state: + /// + /// proved empty (Phase 2's "nogood" case) -> skip straight to "no results"; + /// produced results (Phase 3) -> replay them (Word.ReplayOnto) instead of + /// re-searching: clone each stored result and graft the CURRENT arrival's own trail/non-head prefix + /// onto the stored subtree-local suffix (see MemoEntry.MruleTrailPrefixLength and MemoEntry.NonHeadPrefixLength + /// for why only the prefix -- never the suffix -- needs replacing). + /// + /// Measured on the real Sena grammar's worst word (cinacemerwa), Phase 2 alone (nogoods only): 523,774 + /// node expansions, 91.1% nogood-cache hits, 254s -> 192s wall clock (~24%) -- real but well below the + /// node-count reduction, because a nogood hit is a cheap dictionary lookup while the remaining ~9% of + /// productive nodes still paid full FST-match + clone cost on every revisit. Phase 3 targets exactly + /// that remaining cost: a state that DOES yield results is typically revisited hundreds to thousands of + /// times (the single worst atawirambo state was re-expanded 7,200x), and each revisit was a full + /// re-search before this change. + /// + /// Scoped to the sequential cascade only. The parallel cascade + /// (ParallelCombinationRuleCascade, used when maxDegreeOfParallelism > 1) + /// is a level-by-level breadth-first walk with no natural "this subtree is fully expanded" moment to + /// hang a memo write on, so it is left unmemoized for now -- callers that want this optimization + /// should construct their Morpher with maxDegreeOfParallelism: 1 (the mode the + /// constructor's own doc comment already recommends for a caller that parallelizes across words + /// itself, which describes exactly the corpus-batch workloads this optimization targets).
+ /// + internal class MemoizedCombinationRuleCascade : RuleCascade + { + private readonly Morpher _morpher; + + public MemoizedCombinationRuleCascade( + IEnumerable> rules, + IEqualityComparer comparer, + Morpher morpher + ) + : base(rules, true, comparer) + { + _morpher = morpher; + } + + public override IEnumerable Apply(Word input) + { + var output = new HashSet(Comparer); + ApplyRules(input, output); + AddRuleStats(output.Count); + return output; + } + + // Returns every result produced strictly within the subtree rooted at `input` (i.e. by applying one + // or more rules starting from `input`, at any depth) -- NOT including `input` itself. This is both + // the return value callers use and the value memoized against `input`'s AnalysisStateKey once the + // subtree finishes, so a later differently-ordered arrival at the same state can replay it via + // Word.ReplayOnto instead of re-searching. + private List ApplyRules(Word input, HashSet output) + { + AnalysisScope scope = input.AnalysisScope; + // Null while tracing (Morpher.ParseWord skips allocating one) or for words never routed through + // ParseWord at all (e.g. rule-level unit tests) -- fall back to unmemoized behavior rather than + // throw, per the ground rule that tracing must stay byte-identical to the unmemoized engine. + if (scope == null) + return ApplyRulesRaw(input, output); + + var key = new AnalysisStateKey(input); + + if (scope.Memo.TryGetValue(key, out MemoEntry entry)) + { + var replayed = new List(entry.Results.Count); + foreach (Word storedResult in entry.Results) + { + Word replay = storedResult.ReplayOnto(input, entry.MruleTrailPrefixLength, entry.NonHeadPrefixLength); + output.Add(replay); + replayed.Add(replay); + } + return replayed; + } + + // In-flight re-entry guard: a multiApp cascade can reach the same state again while its own + // first expansion is still on the stack (e.g. a self-loop through a rule that returns its input + // unchanged via a different route). 
Rather than read a nonexistent/partial entry or deadlock, + // fall through to a plain unmemoized expansion for just this arrival -- correctness-neutral, + // it only forgoes memoization for the one re-entrant call. + if (!scope.InProgress.TryAdd(key, 0)) + return ApplyRulesRaw(input, output); + + List results; + try + { + results = ApplyRulesRaw(input, output); + } + finally + { + scope.InProgress.TryRemove(key, out _); + } + + // OOM guard (parse-optimization.md Phase 3): past the cap, keep searching correctly, just stop + // growing the table. No "exhausted" bookkeeping is needed here -- this branch has no + // step/time-budget infrastructure, so `results` always reflects a fully-completed subtree. + if (scope.HasMemoCapacity) + scope.Memo.TryAdd(key, new MemoEntry(results, input.MorphologicalRuleTrailLength, input.NonHeadCount)); + + return results; + } + + private List ApplyRulesRaw(Word input, HashSet output) + { + var local = new List(); + for (int i = 0; i < Rules.Count; i++) + { + foreach (Word result in ApplyRule(Rules[i], i, input)) + { + local.Add(result); + output.Add(result); + // avoid infinite loop -- same guard CombinationRuleCascade uses + if (Comparer.Equals(input, result)) + continue; + + // parse-optimization.md Phase 5: emission above is cheap and deeper strata/templates + // may still transform this result, so it always gets recorded regardless -- only + // DESCENT (the potentially exponential part) is gated. A result whose shape can't + // reach any root in its own or a deeper stratum can never bottom out in a valid + // analysis no matter how the rest of the cascade proceeds. 
+ if ( + result.AnalysisScope != null + && result.AnalysisScope.LexicalGatingActive + && !_morpher.HasReachableRoot(result) + ) + { + continue; + } + + local.AddRange(ApplyRules(result, output)); + } + } + return local; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs b/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs index 10cdc45c6..222953051 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs @@ -16,12 +16,37 @@ namespace SIL.Machine.Morphology.HermitCrab { + /// + /// + /// Corpus-batch hosts running Server GC MUST set a heap hard limit (e.g. + /// DOTNET_GCHeapHardLimit or GCHeapHardLimitPercent) when parallelizing across + /// words (parse-optimization.md Phase 8, see also 's maxDegreeOfParallelism remarks). Measured 2026-07-03: 16-way concurrency + /// with Server GC and no limit reached 45GB on a 64GB host and had to be killed; the same + /// workload with DOTNET_GCHeapHardLimit=0x600000000 (24GB) completed. A follow-up + /// measurement (13 of the heaviest known Sena words, all running concurrently at once -- + /// a harder case than a real mixed corpus, where lighter words finish early and relieve + /// pressure) found this is not always free: wall-clock rose ~30-45% under the same limit + /// versus unlimited (e.g. one word went 96.9s → 130.8s) even though every word still + /// completed and results stayed byte-identical. The blowup is not the per-parse memo + /// tables (/) retaining + /// too much -- measured at 6K-8K and 35K-58K stored instances respectively + /// for the heaviest known words, tens of MB at most given 's and + /// 's copy-on-write sharing -- it is Server GC deferring collection + /// of the much larger volume of transient search/replay garbage for throughput, under + /// concurrent heavy-word pressure. 
Set a limit sized to what the host can spare, and expect a + /// real (not cosmetic) throughput/memory trade-off under sustained all-heavy concurrent load; + /// do not assume the limit is a free safety net on every workload shape. + /// + /// public class Morpher : IMorphologicalAnalyzer, IMorphologicalGenerator { private readonly Language _lang; private readonly IRule _analysisRule; private readonly IRule _synthesisRule; private readonly Dictionary _allomorphTries; + private readonly Dictionary> _reachabilityTries; + private readonly bool _lexicalGatingQualified; private readonly ITraceManager _traceManager; private readonly ReadOnlyObservableCollection _morphemes; private readonly IList _lexicalPatterns = new List(); @@ -63,8 +88,23 @@ public Morpher(ITraceManager traceManager, Language lang, int maxDegreeOfParalle morphemes.AddRange(stratum.MorphologicalRules.OfType()); morphemes.AddRange(stratum.AffixTemplates.SelectMany(t => t.Slots).SelectMany(s => s.Rules).Distinct()); } + // parse-optimization.md Phase 5: for each stratum, the tries of itself and every stratum + // "deeper" than it -- deeper meaning closer to the root, i.e. earlier in Language.Strata's own + // (root-most-first) order, since AnalysisLanguageRule walks strata in the OPPOSITE order + // (Reverse(), surface-first) and a candidate currently at stratum S can still be transformed + // by every stratum S has yet to reach on its way to the root. 
+ _reachabilityTries = new Dictionary>(); + var soFar = new List(); + foreach (Stratum stratum in _lang.Strata) + { + soFar.Add(_allomorphTries[stratum]); + _reachabilityTries[stratum] = new List(soFar); + } + _lexicalGatingQualified = GrammarAnalyzer.IsEdgeStripperQualified(_lang); + _analysisRule = lang.CompileAnalysisRule(this); _synthesisRule = lang.CompileSynthesisRule(this); + ((InstrumentedRule)_synthesisRule).Name = "Synthesis"; MaxStemCount = 2; MaxUnapplications = 0; MergeEquivalentAnalyses = true; @@ -81,6 +121,56 @@ public ITraceManager TraceManager public int DeletionReapplications { get; set; } + private int? _maxAnalysisLengthOverride; + private bool _maxAnalysisLengthOverrideSet; + + /// + /// The longest underlying form (in real segments, i.e. ) + /// any analysis candidate can be before it is pruned as unreachable (parse-optimization.md Phase 4's + /// "Gate B") -- auto-derived from the grammar (: + /// the longest lexicon root plus every rule's own maximum possible insertion) unless explicitly set. + /// Setting this (including to null, which disables the gate entirely) overrides the + /// auto-derived value; re-derived fresh from the current grammar and + /// on every read otherwise, so it never goes stale if either changes after construction. Auto-derives + /// to null (gate off) when the grammar contains a compounding rule or a phonological rule shape + /// this analysis can't measure exactly -- see 's remarks. + /// + public int? MaxAnalysisLength + { + get + { + return _maxAnalysisLengthOverrideSet + ? 
_maxAnalysisLengthOverride + : GrammarAnalyzer.ComputeMaxAnalysisLength(_lang, DeletionReapplications); + } + set + { + _maxAnalysisLengthOverride = value; + _maxAnalysisLengthOverrideSet = true; + } + } + + /// + /// parse-optimization.md Phase 5: prune an analysis subtree before descending into it when no root + /// in the current stratum (or any stratum deeper than it) can match ANY contiguous window of the + /// candidate's current shape -- see and + /// . Default off, as the plan requires: + /// even when set, the gate only actually activates for a given parse when the grammar itself + /// qualifies (, checked once at construction) + /// and the call isn't tracing or root-guessing ('s + /// guessRoot synthesizes from lexical PATTERNS, bypassing the real lexicon entirely, so a + /// real-lexicon reachability gate would be unsound applied to it). This is the plan's own + /// highest-risk phase; turn on only after the corpus A/B protocol in parse-optimization.md's Phase + /// 5 section holds for your grammar. + /// + public bool EnableLexicalGating { get; set; } + + /// Reachability check backing -- see its remarks. + internal bool HasReachableRoot(Word word) + { + return _reachabilityTries[word.Stratum].Any(trie => trie.ContainsRootAnywhere(word.Shape)); + } + public int MaxStemCount { get; set; } /// @@ -110,6 +200,21 @@ public Language Language get { return _lang; } } + /// + /// When true, ParseWord does not clear rule stats (InstrumentedRule.InputCount/OutputCount/ + /// ElapsedTime/BucketGroups) at the start of each parse, so they accumulate across an entire corpus + /// batch instead of reflecting only the most recent word. Off by default: existing single-word + /// callers (e.g. an interactive "why didn't this parse" UI) expect ClearStats every call. 
The rule + /// tree is shared across every ParseWord call on this Morpher, so a caller enabling this on a Morpher + /// used from multiple threads is responsible for keeping calls single-threaded (see + /// MemoizedCombinationRuleCascade's doc comment on maxDegreeOfParallelism: 1 for corpus-batch runs). + /// + public bool AccumulateRuleStats { get; set; } + + public InstrumentedRule AnalysisRuleStats => _analysisRule as InstrumentedRule; + + public InstrumentedRule SynthesisRuleStats => _synthesisRule as InstrumentedRule; + /// /// Parses the specified surface form. /// @@ -133,11 +238,30 @@ public IEnumerable ParseWord(string word, out object trace, bool guessRoot Shape shape = _lang.SurfaceStratum.CharacterDefinitionTable.Segment(word); var input = new Word(_lang.SurfaceStratum, shape); + // Skipped while tracing: the nogood cascade this backs skips expansions outright on a hit, + // which would also skip the trace events those expansions fire (parse-optimization.md Phase 2 + // ground rules -- traces must stay byte-identical to the unmemoized engine). + if (!_traceManager.IsTracing) + { + // Phase 5's lexical gate is unsound under guessRoot (it synthesizes from lexical PATTERNS, + // bypassing the real lexicon the gate's reachability index is built from) -- this check + // covers the whole parse, not just guessRoot's own fallback branch further down, since the + // gate would otherwise have already pruned candidates during _analysisRule.Apply below, + // before guessRoot's branch ever runs. 
+ bool lexicalGatingActive = EnableLexicalGating && _lexicalGatingQualified && !guessRoot; + input.AnalysisScope = new AnalysisScope(lexicalGatingActive); + } input.Freeze(); if (_traceManager.IsTracing) _traceManager.AnalyzeWord(_lang, input); trace = input.CurrentTrace; + if (!AccumulateRuleStats) + { + AnalysisRuleStats?.ClearStats(); + SynthesisRuleStats?.ClearStats(); + } + // Unapply rules IList analyses = _analysisRule.Apply(input).ToList(); @@ -340,6 +464,15 @@ private IEnumerable Synthesize(string word, IList analyses) private IEnumerable SynthesizeAnalysis(string word, Word analysisWord) { + // Gate A from parse-optimization.md's Phase 4 sketch (pre-phonology length-vs-target pruning) + // was attempted and reverted here: `alternative` at this point is still essentially the bare + // root allomorph -- the pending affix trail's own insertions haven't been applied yet, they + // happen inside _synthesisRule.Apply below alongside phonology -- so comparing its length to + // the target surface length without also accounting for that trail's own insertions produced + // false rejections (confirmed against the unit suite: CompoundingRuleTests/MetathesisRuleTests + // regressed). A correct version would need to sum each pending trail rule's own max insertion + // (GrammarAnalyzer already computes this per-rule for Gate B) rather than compare bare-root + // length directly -- left as follow-up, not attempted this pass. 
foreach (Word synthesisWord in LexicalLookup(analysisWord)) { foreach (Word alternative in synthesisWord.ExpandAlternatives()) diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisAffixProcessRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisAffixProcessRule.cs index b9f6d4acc..82881fb36 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisAffixProcessRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisAffixProcessRule.cs @@ -7,7 +7,7 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class AnalysisAffixProcessRule : IRule + public class AnalysisAffixProcessRule : InstrumentedRule { private readonly Morpher _morpher; private readonly AffixProcessRule _rule; @@ -15,6 +15,7 @@ public class AnalysisAffixProcessRule : IRule public AnalysisAffixProcessRule(Morpher morpher, AffixProcessRule rule) { + Name = rule.Name; _morpher = morpher; _rule = rule; @@ -37,7 +38,7 @@ public AnalysisAffixProcessRule(Morpher morpher, AffixProcessRule rule) } } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (!_morpher.RuleSelector(_rule)) return Enumerable.Empty(); @@ -75,11 +76,21 @@ public IEnumerable Apply(Word input) _morpher.TraceManager.MorphologicalRuleUnapplied(_rule, i, input, outWord); output.Add(outWord); unapplied = true; + + if (_morpher.AccumulateRuleStats) + { + string example = RuleStatsHelper.Example(input); + RecordBucket(RuleStatsHelper.AllomorphGroup, i.ToString(), example); + RecordBucket(RuleStatsHelper.CategoryGroup, RuleStatsHelper.Category(input), example); + RecordBucket(RuleStatsHelper.StemNameGroup, RuleStatsHelper.StemName(input), example); + RecordBucket(RuleStatsHelper.RootDirectGroup, RuleStatsHelper.IsRootDirect(input), example); + } } if (_morpher.TraceManager.IsTracing && !unapplied) _morpher.TraceManager.MorphologicalRuleNotUnapplied(_rule, i, input); } + AddRuleStats(output.Count); 
return output; } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisCompoundingRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisCompoundingRule.cs index b5013d4ee..eac07f24f 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisCompoundingRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisCompoundingRule.cs @@ -7,7 +7,7 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class AnalysisCompoundingRule : IRule + public class AnalysisCompoundingRule : InstrumentedRule { private readonly Morpher _morpher; private readonly CompoundingRule _rule; @@ -15,6 +15,7 @@ public class AnalysisCompoundingRule : IRule public AnalysisCompoundingRule(Morpher morpher, CompoundingRule rule) { + Name = rule.Name; _morpher = morpher; _rule = rule; @@ -37,7 +38,7 @@ public AnalysisCompoundingRule(Morpher morpher, CompoundingRule rule) } } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (!_morpher.RuleSelector(_rule)) return Enumerable.Empty(); @@ -146,12 +147,25 @@ RootAllomorph allo in _morpher.SearchRootAllomorphs(_rule.Stratum, outWord.Curre _morpher.TraceManager.MorphologicalRuleUnapplied(_rule, i, input, outWord); output.Add(outWord); unapplied = true; + + if (_morpher.AccumulateRuleStats) + { + string example = RuleStatsHelper.Example(input); + RecordBucket(RuleStatsHelper.AllomorphGroup, i.ToString(), example); + RecordBucket(RuleStatsHelper.CategoryGroup, RuleStatsHelper.Category(input), example); + RecordBucket( + RuleStatsHelper.NonHeadCategoryGroup, + RuleStatsHelper.Category(outWord.CurrentNonHead), + example + ); + } } if (_morpher.TraceManager.IsTracing && !unapplied) _morpher.TraceManager.MorphologicalRuleNotUnapplied(_rule, i, input); } + AddRuleStats(output.Count); return output; } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisRealizationalAffixProcessRule.cs 
b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisRealizationalAffixProcessRule.cs index 031c6fbad..298e31aa6 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisRealizationalAffixProcessRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisRealizationalAffixProcessRule.cs @@ -7,7 +7,7 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class AnalysisRealizationalAffixProcessRule : IRule + public class AnalysisRealizationalAffixProcessRule : InstrumentedRule { private readonly Morpher _morpher; private readonly RealizationalAffixProcessRule _rule; @@ -15,6 +15,7 @@ public class AnalysisRealizationalAffixProcessRule : IRule public AnalysisRealizationalAffixProcessRule(Morpher morpher, RealizationalAffixProcessRule rule) { + Name = rule.Name; _morpher = morpher; _rule = rule; @@ -37,7 +38,7 @@ public AnalysisRealizationalAffixProcessRule(Morpher morpher, RealizationalAffix } } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (!_morpher.RuleSelector(_rule)) return Enumerable.Empty(); @@ -60,11 +61,21 @@ public IEnumerable Apply(Word input) _morpher.TraceManager.MorphologicalRuleUnapplied(_rule, i, input, outWord); output.Add(outWord); unapplied = true; + + if (_morpher.AccumulateRuleStats) + { + string example = RuleStatsHelper.Example(input); + RecordBucket(RuleStatsHelper.AllomorphGroup, i.ToString(), example); + RecordBucket(RuleStatsHelper.CategoryGroup, RuleStatsHelper.Category(input), example); + RecordBucket(RuleStatsHelper.StemNameGroup, RuleStatsHelper.StemName(input), example); + RecordBucket(RuleStatsHelper.RootDirectGroup, RuleStatsHelper.IsRootDirect(input), example); + } } if (_morpher.TraceManager.IsTracing && !unapplied) _morpher.TraceManager.MorphologicalRuleNotUnapplied(_rule, i, input); } + AddRuleStats(output.Count); return output; } } diff --git 
a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisAffixProcessRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisAffixProcessRule.cs index 98a3895d0..7c6c53648 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisAffixProcessRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisAffixProcessRule.cs @@ -8,7 +8,7 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class SynthesisAffixProcessRule : IRule + public class SynthesisAffixProcessRule : InstrumentedRule { private readonly Morpher _morpher; private readonly AffixProcessRule _rule; @@ -16,6 +16,7 @@ public class SynthesisAffixProcessRule : IRule public SynthesisAffixProcessRule(Morpher morpher, AffixProcessRule rule) { + Name = rule.Name; _morpher = morpher; _rule = rule; _rules = new List>(); @@ -38,7 +39,7 @@ public SynthesisAffixProcessRule(Morpher morpher, AffixProcessRule rule) } } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (!input.IsMorphologicalRuleApplicable(_rule)) return Enumerable.Empty(); @@ -215,6 +216,15 @@ public IEnumerable Apply(Word input) _morpher.TraceManager.MorphologicalRuleApplied(_rule, i, input, outWord); output.Add(outWord); + if (_morpher.AccumulateRuleStats) + { + string example = RuleStatsHelper.Example(input); + RecordBucket(RuleStatsHelper.AllomorphGroup, i.ToString(), example); + RecordBucket(RuleStatsHelper.CategoryGroup, RuleStatsHelper.Category(input), example); + RecordBucket(RuleStatsHelper.StemNameGroup, RuleStatsHelper.StemName(input), example); + RecordBucket(RuleStatsHelper.RootDirectGroup, RuleStatsHelper.IsRootDirect(input), example); + } + // return all word syntheses that match subrules that are constrained by environments, // HC violates the disjunctive property of allomorphs here because it cannot check the // environmental constraints until it has a surface form, we will enforce the disjunctive @@ 
-237,6 +247,7 @@ public IEnumerable Apply(Word input) } } + AddRuleStats(output.Count); return output; } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisCompoundingRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisCompoundingRule.cs index 29e3bd5f3..9ab42884e 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisCompoundingRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisCompoundingRule.cs @@ -9,7 +9,7 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class SynthesisCompoundingRule : IRule + public class SynthesisCompoundingRule : InstrumentedRule { private readonly Morpher _morpher; private readonly CompoundingRule _rule; @@ -17,6 +17,7 @@ public class SynthesisCompoundingRule : IRule public SynthesisCompoundingRule(Morpher morpher, CompoundingRule rule) { + Name = rule.Name; _morpher = morpher; _rule = rule; _subruleMatchers = new List, Matcher>>(); @@ -42,7 +43,7 @@ private Matcher BuildMatcher(IEnumerable> lhs) ); } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (!input.IsMorphologicalRuleApplicable(_rule)) return Enumerable.Empty(); @@ -209,6 +210,18 @@ public IEnumerable Apply(Word input) _morpher.TraceManager.MorphologicalRuleApplied(_rule, i, input, outWord); output.Add(outWord); + + if (_morpher.AccumulateRuleStats) + { + string example = RuleStatsHelper.Example(input); + RecordBucket(RuleStatsHelper.AllomorphGroup, i.ToString(), example); + RecordBucket(RuleStatsHelper.CategoryGroup, RuleStatsHelper.Category(input), example); + RecordBucket( + RuleStatsHelper.NonHeadCategoryGroup, + RuleStatsHelper.Category(input.CurrentNonHead), + example + ); + } break; } if (_morpher.TraceManager.IsTracing) @@ -228,6 +241,7 @@ public IEnumerable Apply(Word input) } } + AddRuleStats(output.Count); return output; } diff --git 
a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisRealizationalAffixProcessRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisRealizationalAffixProcessRule.cs index bd1717f82..e9821ec3f 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisRealizationalAffixProcessRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisRealizationalAffixProcessRule.cs @@ -9,7 +9,7 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class SynthesisRealizationalAffixProcessRule : IRule + public class SynthesisRealizationalAffixProcessRule : InstrumentedRule { private readonly Morpher _morpher; private readonly RealizationalAffixProcessRule _rule; @@ -17,6 +17,7 @@ public class SynthesisRealizationalAffixProcessRule : IRule public SynthesisRealizationalAffixProcessRule(Morpher morpher, RealizationalAffixProcessRule rule) { + Name = rule.Name; _morpher = morpher; _rule = rule; _rules = new List>(); @@ -38,7 +39,7 @@ public SynthesisRealizationalAffixProcessRule(Morpher morpher, RealizationalAffi } } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (!_morpher.RuleSelector(_rule)) return Enumerable.Empty(); @@ -146,6 +147,15 @@ public IEnumerable Apply(Word input) output.Add(outWord); + if (_morpher.AccumulateRuleStats) + { + string example = RuleStatsHelper.Example(input); + RecordBucket(RuleStatsHelper.AllomorphGroup, i.ToString(), example); + RecordBucket(RuleStatsHelper.CategoryGroup, RuleStatsHelper.Category(input), example); + RecordBucket(RuleStatsHelper.StemNameGroup, RuleStatsHelper.StemName(input), example); + RecordBucket(RuleStatsHelper.RootDirectGroup, RuleStatsHelper.IsRootDirect(input), example); + } + // return all word syntheses that match subrules that are constrained by environments, // HC violates the disjunctive property of allomorphs here because it cannot check the // environmental constraints until it has a 
surface form, we will enforce the disjunctive @@ -168,6 +178,7 @@ public IEnumerable Apply(Word input) } } + AddRuleStats(output.Count); return output; } diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisMetathesisRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisMetathesisRule.cs index 5d160243f..f17fd671b 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisMetathesisRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisMetathesisRule.cs @@ -8,7 +8,7 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { - public class AnalysisMetathesisRule : IRule + public class AnalysisMetathesisRule : InstrumentedRule { private readonly Morpher _morpher; private readonly MetathesisRule _rule; @@ -16,6 +16,7 @@ public class AnalysisMetathesisRule : IRule public AnalysisMetathesisRule(Morpher morpher, MetathesisRule rule) { + Name = rule.Name; _morpher = morpher; _rule = rule; @@ -35,7 +36,7 @@ public AnalysisMetathesisRule(Morpher morpher, MetathesisRule rule) _patternRule = new IterativePhonologicalPatternRule(ruleSpec, settings); } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (!_morpher.RuleSelector(_rule)) return Enumerable.Empty(); @@ -48,11 +49,20 @@ public IEnumerable Apply(Word input) { if (_morpher.TraceManager.IsTracing) _morpher.TraceManager.PhonologicalRuleUnapplied(_rule, -1, origInput, input); + + if (_morpher.AccumulateRuleStats) + { + string example = RuleStatsHelper.Example(input); + RecordBucket(RuleStatsHelper.CategoryGroup, RuleStatsHelper.Category(input), example); + RecordBucket(RuleStatsHelper.RootDirectGroup, RuleStatsHelper.IsRootDirect(input), example); + } + AddRuleStats(1); return input.ToEnumerable(); } if (_morpher.TraceManager.IsTracing) _morpher.TraceManager.PhonologicalRuleNotUnapplied(_rule, -1, input); + AddRuleStats(0); return Enumerable.Empty(); } } diff --git 
a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisRewriteRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisRewriteRule.cs index e691b4c0a..1e311fad2 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisRewriteRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisRewriteRule.cs @@ -10,7 +10,7 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { - public class AnalysisRewriteRule : IRule + public class AnalysisRewriteRule : InstrumentedRule { private enum ReapplyType { @@ -25,6 +25,7 @@ private enum ReapplyType public AnalysisRewriteRule(Morpher morpher, RewriteRule rule) { + Name = rule.Name; _morpher = morpher; _rule = rule; @@ -118,7 +119,7 @@ private static bool IsUnifiable(Constraint constraint, Pattern Apply(Word input) + public override IEnumerable Apply(Word input) { if (!_morpher.RuleSelector(_rule)) return Enumerable.Empty(); @@ -173,6 +174,14 @@ public IEnumerable Apply(Word input) if (_morpher.TraceManager.IsTracing) _morpher.TraceManager.PhonologicalRuleUnapplied(_rule, i, origInput, input); applied = true; + + if (_morpher.AccumulateRuleStats) + { + string example = RuleStatsHelper.Example(input); + RecordBucket(RuleStatsHelper.SubruleGroup, i.ToString(), example); + RecordBucket(RuleStatsHelper.CategoryGroup, RuleStatsHelper.Category(input), example); + RecordBucket(RuleStatsHelper.RootDirectGroup, RuleStatsHelper.IsRootDirect(input), example); + } } else if (_morpher.TraceManager.IsTracing) { @@ -180,6 +189,7 @@ public IEnumerable Apply(Word input) } } + AddRuleStats(applied ? 
1 : 0); if (applied) return input.ToEnumerable(); return Enumerable.Empty(); diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisMetathesisRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisMetathesisRule.cs index 2d8c3af5a..70bb1abe6 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisMetathesisRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisMetathesisRule.cs @@ -7,7 +7,7 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { - public class SynthesisMetathesisRule : IRule + public class SynthesisMetathesisRule : InstrumentedRule { private readonly Morpher _morpher; private readonly MetathesisRule _rule; @@ -15,6 +15,7 @@ public class SynthesisMetathesisRule : IRule public SynthesisMetathesisRule(Morpher morpher, MetathesisRule rule) { + Name = rule.Name; _morpher = morpher; _rule = rule; @@ -32,7 +33,7 @@ public SynthesisMetathesisRule(Morpher morpher, MetathesisRule rule) _patternRule = new IterativePhonologicalPatternRule(ruleSpec, settings); } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (!_morpher.RuleSelector(_rule)) return Enumerable.Empty(); @@ -45,11 +46,20 @@ public IEnumerable Apply(Word input) { if (_morpher.TraceManager.IsTracing) _morpher.TraceManager.PhonologicalRuleApplied(_rule, -1, origInput, input); + + if (_morpher.AccumulateRuleStats) + { + string example = RuleStatsHelper.Example(input); + RecordBucket(RuleStatsHelper.CategoryGroup, RuleStatsHelper.Category(input), example); + RecordBucket(RuleStatsHelper.RootDirectGroup, RuleStatsHelper.IsRootDirect(input), example); + } + AddRuleStats(1); return input.ToEnumerable(); } if (_morpher.TraceManager.IsTracing) _morpher.TraceManager.PhonologicalRuleNotApplied(_rule, -1, input, FailureReason.Pattern, null); + AddRuleStats(0); return Enumerable.Empty(); } } diff --git 
a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteRule.cs index ecf84a7dc..827ed5cae 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteRule.cs @@ -8,7 +8,7 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { - public class SynthesisRewriteRule : IRule + public class SynthesisRewriteRule : InstrumentedRule { private readonly Morpher _morpher; private readonly RewriteRule _rule; @@ -16,6 +16,7 @@ public class SynthesisRewriteRule : IRule public SynthesisRewriteRule(Morpher morpher, RewriteRule rule) { + Name = rule.Name; _morpher = morpher; _rule = rule; @@ -48,17 +49,17 @@ public SynthesisRewriteRule(Morpher morpher, RewriteRule rule) } } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (!_morpher.RuleSelector(_rule)) return Enumerable.Empty(); Word origInput = null; + bool collectResults = _morpher.TraceManager.IsTracing || _morpher.AccumulateRuleStats; if (_morpher.TraceManager.IsTracing) - { origInput = input.Clone(); + if (collectResults) input.CurrentRuleResults = new Dictionary>(); - } bool applied = _patternRule.Apply(input).Any(); @@ -81,8 +82,29 @@ public IEnumerable Apply(Word input) _morpher.TraceManager.PhonologicalRuleNotApplied(_rule, i, input, FailureReason.Pattern, null); } } - input.CurrentRuleResults = null; } + + if (applied && _morpher.AccumulateRuleStats) + { + string example = RuleStatsHelper.Example(input); + for (int i = 0; i < _rule.Subrules.Count; i++) + { + if ( + input.CurrentRuleResults.TryGetValue(i, out Tuple reason) + && reason.Item1 == FailureReason.None + ) + { + RecordBucket(RuleStatsHelper.SubruleGroup, i.ToString(), example); + break; + } + } + RecordBucket(RuleStatsHelper.CategoryGroup, RuleStatsHelper.Category(input), example); + 
RecordBucket(RuleStatsHelper.RootDirectGroup, RuleStatsHelper.IsRootDirect(input), example); + } + if (collectResults) + input.CurrentRuleResults = null; + + AddRuleStats(applied ? 1 : 0); if (applied) return input.ToEnumerable(); return Enumerable.Empty(); diff --git a/src/SIL.Machine.Morphology.HermitCrab/RootAllomorphTrie.cs b/src/SIL.Machine.Morphology.HermitCrab/RootAllomorphTrie.cs index 7d34e81a5..ac42a1420 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/RootAllomorphTrie.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/RootAllomorphTrie.cs @@ -77,5 +77,25 @@ public IEnumerable Search(Shape shape) yield return _allomorphs[match.ID]; } } + + /// + /// parse-optimization.md Phase 5: does some root allomorph in this trie match ANY contiguous + /// window of (start anchored per attempt, end NOT anchored -- a root is + /// typically shorter than the remaining unstripped candidate) -- unlike , + /// which only checks a match starting at the shape's first segment and consuming to its end (the + /// bare-root-after-full-unapplication case uses this + /// for). Used as a cheap admissibility check before descending into an analysis subtree: if no + /// window matches anywhere, no root in this stratum can ever be reached from here. 
+ /// + public bool ContainsRootAnywhere(Shape shape) + { + foreach (Annotation startAnn in shape.Annotations.Where(ann => _filter(ann))) + { + IEnumerable> matches; + if (_fsa.Transduce(shape, startAnn, null, true, false, false, out matches) && matches.Any()) + return true; + } + return false; + } } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/RuleStatsHelper.cs b/src/SIL.Machine.Morphology.HermitCrab/RuleStatsHelper.cs new file mode 100644 index 000000000..a1707d2f4 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/RuleStatsHelper.cs @@ -0,0 +1,48 @@ +using System.Linq; +using SIL.Machine.FeatureModel; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Builds the bucket keys/examples that InstrumentedRule.RecordBucket stores, so a corpus-wide stats run + /// can answer questions like "does this rule only ever fire on verbs" or "does this allomorph only ever + /// attach to a bare stem" from real parse traffic rather than re-reading the grammar XML by hand. + /// RootAllomorph/StemName are only meaningfully populated on the synthesis side (analysis doesn't know + /// the root's lexical identity until the derivation bottoms out), so those groups read "(none)" for most + /// analysis-direction calls -- that is itself useful signal, not a bug. + /// + internal static class RuleStatsHelper + { + public const string CategoryGroup = "category"; + public const string StemNameGroup = "stemName"; + public const string AllomorphGroup = "allomorph"; + public const string RootDirectGroup = "rootDirect"; + public const string SubruleGroup = "subrule"; + public const string NonHeadCategoryGroup = "nonHeadCategory"; + + public static string Category(Word word) + { + FeatureSymbol pos = word.SyntacticFeatureStruct?.PartsOfSpeech().FirstOrDefault(); + return pos?.ID ?? "(none)"; + } + + public static string StemName(Word word) + { + return word.RootAllomorph?.StemName?.Name ?? 
"(none)"; + } + + // "true" = this application's input had no morphological rules recorded on it yet -- for synthesis + // that means the affix/phonological rule fired directly against the bare stem; for analysis it means + // this was the innermost/first rule unapplied. Either reading answers "does this only ever touch the + // stem, or does it also apply once other affixes are already present." + public static string IsRootDirect(Word word) + { + return word.MorphologicalRules.Any() ? "false" : "true"; + } + + public static string Example(Word word) + { + return word.ToString(); + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplateRule.cs b/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplateRule.cs index 21248d002..a69f4cc85 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplateRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplateRule.cs @@ -6,7 +6,7 @@ namespace SIL.Machine.Morphology.HermitCrab { - internal class SynthesisAffixTemplateRule : IRule + internal class SynthesisAffixTemplateRule : InstrumentedRule { private readonly Morpher _morpher; private readonly AffixTemplate _template; @@ -14,6 +14,7 @@ internal class SynthesisAffixTemplateRule : IRule public SynthesisAffixTemplateRule(Morpher morpher, AffixTemplate template) { + Name = template.Name; _morpher = morpher; _template = template; _rules = new List>( @@ -23,14 +24,16 @@ public SynthesisAffixTemplateRule(Morpher morpher, AffixTemplate template) FreezableEqualityComparer.Default )) ); + AddSubRules(_rules); } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (_morpher.TraceManager.IsTracing) _morpher.TraceManager.BeginApplyTemplate(_template, input); var output = new HashSet(FreezableEqualityComparer.Default); ApplySlots(input, 0, output); + AddRuleStats(output.Count); return output; } diff --git a/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplatesRule.cs 
b/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplatesRule.cs index a5ab1aa2a..d4329d239 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplatesRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplatesRule.cs @@ -7,7 +7,7 @@ namespace SIL.Machine.Morphology.HermitCrab { - internal class SynthesisAffixTemplatesRule : IRule + internal class SynthesisAffixTemplatesRule : InstrumentedRule { private readonly Morpher _morpher; private readonly Stratum _stratum; @@ -16,13 +16,15 @@ internal class SynthesisAffixTemplatesRule : IRule public SynthesisAffixTemplatesRule(Morpher morpher, Stratum stratum) { + Name = stratum.Name; _morpher = morpher; _stratum = stratum; _templates = stratum.AffixTemplates.ToList(); _templateRules = _templates.Select(temp => temp.CompileSynthesisRule(morpher)).ToList(); + AddSubRules(_templateRules); } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (!input.RealizationalFeatureStruct.IsUnifiable(input.SyntacticFeatureStruct)) return Enumerable.Empty(); @@ -74,6 +76,7 @@ public IEnumerable Apply(Word input) } } + AddRuleStats(output.Count); return output; } diff --git a/src/SIL.Machine.Morphology.HermitCrab/SynthesisStratumRule.cs b/src/SIL.Machine.Morphology.HermitCrab/SynthesisStratumRule.cs index 72ff8b24b..20cc206e6 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/SynthesisStratumRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/SynthesisStratumRule.cs @@ -1,4 +1,5 @@ using System.Collections.Generic; +using System.Diagnostics; using System.Linq; using SIL.Extensions; using SIL.Machine.Annotations; @@ -7,7 +8,7 @@ namespace SIL.Machine.Morphology.HermitCrab { - internal class SynthesisStratumRule : IRule + internal class SynthesisStratumRule : InstrumentedRule { private readonly IRule _mrulesRule; private readonly IRule _prulesRule; @@ -17,26 +18,25 @@ internal class SynthesisStratumRule : IRule public SynthesisStratumRule(Morpher morpher, Stratum 
stratum) { + Name = stratum.Name; _templatesRule = new SynthesisAffixTemplatesRule(morpher, stratum); _mrulesRule = null; - IEnumerable> mrules = stratum.MorphologicalRules.Select(mrule => - mrule.CompileSynthesisRule(morpher) - ); + // Paired (not just the compiled rules) so the Unordered cascade can look up the trail-directed + // rule instead of probing the whole battery -- see TrailDirectedRuleCascade. + var compiledMRules = stratum + .MorphologicalRules.Select(mrule => (Rule: mrule, Compiled: mrule.CompileSynthesisRule(morpher))) + .ToList(); switch (stratum.MorphologicalRuleOrder) { case MorphologicalRuleOrder.Linear: _mrulesRule = new LinearRuleCascade( - mrules, + compiledMRules.Select(p => p.Compiled), true, FreezableEqualityComparer.Default ); break; case MorphologicalRuleOrder.Unordered: - _mrulesRule = new CombinationRuleCascade( - mrules, - true, - FreezableEqualityComparer.Default - ); + _mrulesRule = new TrailDirectedRuleCascade(compiledMRules, FreezableEqualityComparer.Default); break; } _prulesRule = new LinearRuleCascade( @@ -44,13 +44,17 @@ public SynthesisStratumRule(Morpher morpher, Stratum stratum) ); _stratum = stratum; _morpher = morpher; + AddSubRule(_mrulesRule); + AddSubRule(_prulesRule); + AddSubRule(_templatesRule); } - public IEnumerable Apply(Word input) + public override IEnumerable Apply(Word input) { if (!_morpher.RuleSelector(_stratum) || input.RootAllomorph.Morpheme.Stratum.Depth > _stratum.Depth) return input.ToEnumerable(); + long startTime = Stopwatch.GetTimestamp(); if (_morpher.TraceManager.IsTracing) _morpher.TraceManager.BeginApplyStratum(_stratum, input); @@ -88,6 +92,9 @@ public IEnumerable Apply(Word input) } if (_morpher.TraceManager.IsTracing && output.Count == 0) _morpher.TraceManager.EndApplyStratum(_stratum, input); + + ElapsedTime += Stopwatch.GetTimestamp() - startTime; + AddRuleStats(output.Count); return output; } diff --git a/src/SIL.Machine.Morphology.HermitCrab/TrailDirectedRuleCascade.cs 
b/src/SIL.Machine.Morphology.HermitCrab/TrailDirectedRuleCascade.cs new file mode 100644 index 000000000..bc023a224 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/TrailDirectedRuleCascade.cs @@ -0,0 +1,76 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; +using SIL.Machine.Rules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Synthesis-side replacement for on + /// strata. Unlike analysis -- where any subset/order of + /// morphological rules is a live hypothesis and every rule genuinely must be tried -- synthesis already + /// knows, from the trail recorded during analysis (), + /// exactly which single rule (or, for an unresolved compounding rule, which subset) can possibly apply + /// next. The unmodified cascade still probes the entire rule battery at every node and lets + /// reject the misses -- reject calls that are pure + /// overhead: and + /// both return empty on that check with no trace call, so + /// skipping the attempt entirely changes neither the result set nor trace output. Realizational affix + /// rules are excluded from the trail () and self-govern via + /// feature-structure checks instead, so they are always attempted, exactly as before. + /// + internal class TrailDirectedRuleCascade : InstrumentedRule + { + // Preserves the stratum's original rule order: when more than one rule can apply at a single node + // (a realizational rule alongside the trail-directed rule, or multiple compounding rules), trace + // calls must fire in the same relative order the unmodified all-rules cascade produced. 
+ private readonly List<(IMorphologicalRule MorphologicalRule, IRule CompiledRule)> _rules; + private readonly IEqualityComparer _comparer; + + public TrailDirectedRuleCascade( + IEnumerable<(IMorphologicalRule MorphologicalRule, IRule CompiledRule)> rules, + IEqualityComparer comparer + ) + { + Name = "TrailDirectedRuleCascade"; + _rules = new List<(IMorphologicalRule, IRule)>(rules); + _comparer = comparer; + AddSubRules(_rules.Select(p => p.CompiledRule)); + } + + public override IEnumerable Apply(Word input) + { + var output = new HashSet(_comparer); + ApplyRules(input, output); + AddRuleStats(output.Count); + return output; + } + + private void ApplyRules(Word input, HashSet output) + { + bool hasNext = input.TryGetNextMorphologicalRuleToApply(out IMorphologicalRule next); + foreach ((IMorphologicalRule mrule, IRule compiled) in _rules) + { + bool attempt; + if (mrule is RealizationalAffixProcessRule) + attempt = true; + else if (!hasNext) + attempt = false; + else if (next == null) + attempt = mrule is CompoundingRule; + else + attempt = ReferenceEquals(mrule, next); + + if (!attempt) + continue; + + foreach (Word result in compiled.Apply(input)) + { + if (!_comparer.Equals(input, result)) + ApplyRules(result, output); + output.Add(result); + } + } + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/Word.cs b/src/SIL.Machine.Morphology.HermitCrab/Word.cs index 96748875f..11f623d74 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/Word.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/Word.cs @@ -70,12 +70,22 @@ public Word(Stratum stratum, Shape shape) } protected Word(Word word) + : this(word, shareFrozenShape: false) { } + + // parse-optimization.md Phase 10a: shareFrozenShape lets clone sites that provably never + // mutate the clone's shape (see CloneShareFrozenShape) skip the deep Shape copy -- 10-pre + // measured that copy (ShapeNodes + their Annotation graphs) at ~25% of all + // per-word allocated bytes on heavy Sena words. 
Sharing only ever happens when the source + // shape is already frozen, so the shared instance is immutable: any later mutation attempt + // throws (Shape's freeze guards) instead of silently corrupting the other holder, and any + // legitimate downstream edit goes through another Clone(), which deep-copies as always. + private Word(Word word, bool shareFrozenShape) { _allomorphs = new Dictionary(word._allomorphs); Stratum = word.Stratum; Source = word; // Don't copy Alternatives. - _shape = word._shape.Clone(); + _shape = shareFrozenShape && word._shape.IsFrozen ? word._shape : word._shape.Clone(); _rootAllomorph = word._rootAllomorph; SyntacticFeatureStruct = word.SyntacticFeatureStruct.Clone(); RealizationalFeatureStruct = word.RealizationalFeatureStruct.Clone(); @@ -98,6 +108,7 @@ protected Word(Word word) _isLastAppliedRuleFinal = word._isLastAppliedRuleFinal; _isPartial = word._isPartial; CurrentTrace = word.CurrentTrace; + AnalysisScope = word.AnalysisScope; _disjunctiveAllomorphIndices = word._disjunctiveAllomorphIndices == null || word._disjunctiveAllomorphIndices.Count == 0 ? null @@ -226,6 +237,16 @@ public IEnumerable MorphemesInApplicationOrder public object CurrentTrace { get; set; } + /// + /// Carrier for the analysis nogood cache (parse-optimization.md Phase 2). Reference-shared like + /// , deliberately excluded from and + /// so dedup semantics are unchanged. Null for words never routed through + /// (e.g. words built directly by rule-level + /// unit tests) or while tracing, in which case the cascade that reads this must fall back to + /// unmemoized behavior rather than throw. 
+ /// + internal AnalysisScope AnalysisScope { get; set; } + public bool IsPartial { get { return _isPartial; } @@ -257,6 +278,26 @@ internal bool IsMorphologicalRuleApplicable(IMorphologicalRule rule) return curRule == rule || (curRule == null && rule is CompoundingRule); } + /// + /// Exposes the same trail-position state checks, so a + /// synthesis cascade can look up the one rule (or, when comes back null, the + /// compounding rules) that could possibly apply next, instead of probing the whole rule battery and + /// relying on to reject every miss. Returns false when no + /// morphological rule can apply at all ( is meaningless in that case, not "any + /// compounding rule" -- that reading only holds when this returns true and is + /// null). + /// + internal bool TryGetNextMorphologicalRuleToApply(out IMorphologicalRule rule) + { + if (_mruleAppIndex < 0) + { + rule = null; + return false; + } + rule = _mruleApps[_mruleAppIndex]; + return true; + } + internal bool HasRemainingRulesFromStratum(Stratum stratum) { if (_mruleAppIndex < 0) @@ -356,6 +397,14 @@ internal int GetUnapplicationCount(IMorphologicalRule mrule) return numUnapplies; } + /// + /// The full per-rule unapplication-count multiset backing , for + /// (order-independent analysis-cascade memoization -- see + /// parse-optimization.md Phase 2). Null means empty, matching this class's existing lazy-allocation + /// convention. + /// + internal IReadOnlyDictionary UnappliedRuleCounts => _mrulesUnapplied; + /// /// Notifies this word synthesis that the specified morphological rule has applied. /// @@ -416,6 +465,15 @@ internal int NonHeadCount get { return _nonHeadApps.Count; } } + /// + /// Length of the morphological-rule trail so far -- _mruleApps.Count. 
Recorded alongside + /// at the point a 's subtree is memoized + /// (parse-optimization.md Phase 3), so a later differently-ordered arrival at the same key knows + /// where its own trail ends and the memoized subtree's suffix begins -- see + /// . + /// + internal int MorphologicalRuleTrailLength => _mruleApps.Count; + internal void NonHeadUnapplied(Word nonHead) { CheckFrozen(); @@ -474,6 +532,54 @@ internal IList ExpandAlternatives() return alternatives; } + /// + /// Re-parents a Word computed while exploring the subtree below some analysis-cascade node N onto + /// -- a different Word that reached the same + /// as N via a different morphological-rule unapplication order (parse-optimization.md Phase 3's + /// positive memo; see ). Everything computed strictly + /// WITHIN the subtree -- deeper shape/feature edits, and any rules or non-heads unapplied below N -- + /// is a deterministic function of N's content alone (Shape, both FeatureStructs, the rule-unapplication + /// multiset, and non-head count all match between N and by definition of + /// an equal key), so it is kept as-is from `this`. Only the two ORDERED structures the key deliberately + /// summarizes as counts/multisets -- the morphological-rule trail and the non-head list -- have their + /// PREFIX (whatever was accumulated before reaching N) replaced with 's own + /// actual prefix, since arrival order can only ever affect that part. + /// + /// The word that hit the memo -- its trail/non-heads become the new prefix. + /// + /// _mruleApps.Count of N at the moment its subtree was memoized -- everything in `this`'s trail + /// from this index on is the subtree-local suffix to keep. + /// + /// Same, for _nonHeadApps. 
+ internal Word ReplayOnto(Word queryNode, int mruleTrailPrefixLength, int nonHeadPrefixLength) + { + // Shape-sharing clone (parse-optimization.md Phase 10a): a replay edits only the two trail + // lists below and then freezes -- the shape is never touched, so the deep Shape copy a plain + // Clone() makes here (hundreds of thousands of replays per heavy word) is pure waste. + Word clone = CloneShareFrozenShape(); + + List mruleSuffix = clone._mruleApps.GetRange( + mruleTrailPrefixLength, + clone._mruleApps.Count - mruleTrailPrefixLength + ); + clone._mruleApps.Clear(); + clone._mruleApps.AddRange(queryNode._mruleApps); + clone._mruleApps.AddRange(mruleSuffix); + clone._mruleAppIndex = clone._mruleApps.Count - 1; + + List nonHeadSuffix = clone._nonHeadApps.GetRange( + nonHeadPrefixLength, + clone._nonHeadApps.Count - nonHeadPrefixLength + ); + clone._nonHeadApps.Clear(); + clone._nonHeadApps.AddRange(queryNode._nonHeadApps.CloneItems()); + clone._nonHeadApps.AddRange(nonHeadSuffix); + clone._nonHeadAppIndex = clone._nonHeadApps.Count - 1; + + clone.Freeze(); + return clone; + } + public Allomorph GetAllomorph(Annotation morph) { var alloID = (string)morph.FeatureStruct.GetValue(HCFeatureSystem.Allomorph); @@ -584,6 +690,21 @@ public Word Clone() return new Word(this); } + /// + /// , except the clone shares this word's instance instead + /// of deep-copying it -- only when that shape is already frozen (otherwise this falls back to a + /// normal deep copy, so it is always safe to call). For callers that clone, edit non-shape state, + /// and freeze -- never touching the shape -- the deep copy is pure waste: parse-optimization.md + /// Phase 10-pre measured the Shape/annotation graph at ~25% of all bytes allocated on heavy + /// words, dominated by exactly such clones. The contract is on the caller: the clone's shape must + /// never be mutated before the clone is discarded or frozen. 
Violations fail loudly (the shared + /// shape is frozen, so mutation throws) rather than corrupting the source. + /// + internal Word CloneShareFrozenShape() + { + return new Word(this, shareFrozenShape: true); + } + public override string ToString() { return Shape.ToRegexString(Stratum.CharacterDefinitionTable, true); diff --git a/src/SIL.Machine/FeatureModel/FeatureStruct.cs b/src/SIL.Machine/FeatureModel/FeatureStruct.cs index cc9c083a7..d3f8c4d7c 100644 --- a/src/SIL.Machine/FeatureModel/FeatureStruct.cs +++ b/src/SIL.Machine/FeatureModel/FeatureStruct.cs @@ -1368,11 +1368,32 @@ private void EnsureWritable() _sharedSource = null; } + // Test hook: incremented every time Freeze() takes the shared-hash shortcut below. A hash- or + // value-equality assertion alone can't tell the shortcut apart from the (equally correct) full + // walk, since both compute the same result -- this counter is what makes the regression test + // non-vacuous (parse-optimization.md Phase 3/3b hit this exact trap with ReplayOnto's memo + // tests, both of which passed even when the replay/graft logic was mutated to a no-op). + internal static long DiagSharedFreezeHits; + public void Freeze() { if (IsFrozen) return; + // A copy-on-write clone that was never mutated still borrows _sharedSource's exact + // _definite reference (see EnsureWritable): its hash cannot differ, since a frozen + // source's _definite subtree is immutable and any mutation of this clone would have + // already inflated a private copy and cleared _shared. Skip the full FreezeImpl walk + // and adopt the cached hash directly -- the same shortcut Shape.Freeze() already takes + // for its own copy-on-write clones (parse-optimization.md Phase 7b). 
+ if (_shared && _sharedSource != null) + { + IsFrozen = true; + _hashCode = _sharedSource.GetFrozenHashCode(); + DiagSharedFreezeHits++; + return; + } + _hashCode = FreezeImpl(new HashSet()); } diff --git a/src/SIL.Machine/Rules/CombinationRuleCascade.cs b/src/SIL.Machine/Rules/CombinationRuleCascade.cs index 25fdfb8c6..f0818179f 100644 --- a/src/SIL.Machine/Rules/CombinationRuleCascade.cs +++ b/src/SIL.Machine/Rules/CombinationRuleCascade.cs @@ -26,6 +26,7 @@ public override IEnumerable Apply(TData input) { var output = new HashSet(Comparer); ApplyRules(input, !MultipleApplication ? new HashSet() : null, output); + AddRuleStats(output.Count); return output; } diff --git a/src/SIL.Machine/Rules/InstrumentedRule.cs b/src/SIL.Machine/Rules/InstrumentedRule.cs new file mode 100644 index 000000000..277deca24 --- /dev/null +++ b/src/SIL.Machine/Rules/InstrumentedRule.cs @@ -0,0 +1,118 @@ +using System.Collections.Generic; +using SIL.Machine.Annotations; + +namespace SIL.Machine.Rules +{ + /// + /// One observed "context" a rule succeeded under -- e.g. the part of speech of the word it applied to, + /// which allomorph/subrule fired, or whether the input was still a bare root. Grammar-constraint mining + /// (parse-optimization.md-adjacent: use runtime evidence to suggest tightenable rule declarations) needs + /// both the count (300 vs 4 is the signal) and a handful of real words (so a linguist can eyeball the 4 + /// counterexamples and judge whether they're legitimate or a grammar bug). + /// + public class RuleBucket + { + public const int MaxExamples = 10; + + public long Count; + public readonly List Examples = new List(); + + public void Record(string example) + { + Count++; + if (Examples.Count < MaxExamples) + Examples.Add(example); + } + } + + /// + /// This class instruments IRules. + /// Statistics are stored in InputCount, OutputCount, and ElapsedTime. + /// The rules update the statistics when Apply is called. 
+ /// Name and SubRules are filled in when the rule is created. + /// Rules that can distinguish *why* a given application succeeded (which allomorph, which category, + /// whether the target was a bare stem, ...) additionally record named buckets via RecordBucket -- + /// see Morpher.AccumulateRuleStats for how these survive across a whole corpus run instead of being + /// cleared per word. + /// + /// + /// + public abstract class InstrumentedRule : IRule + where TData : IAnnotatedData + { + public string Name { get; set; } + public int InputCount; + public int OutputCount; + public int SuccessCount; + public long ElapsedTime; + public IList> SubRules = new List>(); + + // Keyed by an arbitrary "bucket group" name (e.g. "category", "allomorph") so one rule can report + // several independent breakdowns without them being conflated into a single key space. + public IDictionary> BucketGroups = + new Dictionary>(); + + // Generic-arity backtick suffix (e.g. "CombinationRuleCascade`2") stripped so reports read as + // "CombinationRuleCascade" -- callers that want something more specific (a stratum/template/morpheme + // name) still overwrite Name after construction. + protected InstrumentedRule() + { + string typeName = GetType().Name; + int tickIndex = typeName.IndexOf('`'); + Name = tickIndex < 0 ? typeName : typeName.Substring(0, tickIndex); + } + + protected void AddSubRules(IEnumerable> rules) + { + foreach (IRule rule in rules) + { + AddSubRule(rule); + } + } + + protected void AddSubRule(IRule rule) + { + SubRules.Add(rule as InstrumentedRule); + } + + protected void AddRuleStats(int outputCount) + { + InputCount++; + OutputCount += outputCount; + if (outputCount > 0) + SuccessCount++; + } + + // group examples: "category" ("Verb", "Noun", ...), "allomorph" ("0", "1", ...), "stemName", + // "rootDirect" ("true"/"false"). Callers pick whichever groups are meaningful for that rule type. 
+ protected void RecordBucket(string group, string key, string example) + { + if (!BucketGroups.TryGetValue(group, out Dictionary buckets)) + { + buckets = new Dictionary(); + BucketGroups[group] = buckets; + } + if (!buckets.TryGetValue(key, out RuleBucket bucket)) + { + bucket = new RuleBucket(); + buckets[key] = bucket; + } + bucket.Record(example); + } + + public void ClearStats() + { + InputCount = 0; + OutputCount = 0; + SuccessCount = 0; + ElapsedTime = 0; + BucketGroups.Clear(); + foreach (var rule in SubRules) + { + rule?.ClearStats(); + } + } + + public abstract IEnumerable Apply(TData input); + } +} diff --git a/src/SIL.Machine/Rules/LinearRuleCascade.cs b/src/SIL.Machine/Rules/LinearRuleCascade.cs index b4e985a33..1e413999a 100644 --- a/src/SIL.Machine/Rules/LinearRuleCascade.cs +++ b/src/SIL.Machine/Rules/LinearRuleCascade.cs @@ -26,6 +26,7 @@ public override IEnumerable Apply(TData input) { var output = new HashSet(Comparer); ApplyRules(input, 0, output); + AddRuleStats(output.Count); return output; } diff --git a/src/SIL.Machine/Rules/ParallelCombinationRuleCascade.cs b/src/SIL.Machine/Rules/ParallelCombinationRuleCascade.cs index b698989f1..c7ff9ae12 100644 --- a/src/SIL.Machine/Rules/ParallelCombinationRuleCascade.cs +++ b/src/SIL.Machine/Rules/ParallelCombinationRuleCascade.cs @@ -81,7 +81,9 @@ public override IEnumerable Apply(TData input) to = temp; } - return output.Distinct(Comparer); + TData[] distinctOutput = output.Distinct(Comparer).ToArray(); + AddRuleStats(distinctOutput.Length); + return distinctOutput; } } } diff --git a/src/SIL.Machine/Rules/ParallelRuleBatch.cs b/src/SIL.Machine/Rules/ParallelRuleBatch.cs index afe053c79..9663a0bcd 100644 --- a/src/SIL.Machine/Rules/ParallelRuleBatch.cs +++ b/src/SIL.Machine/Rules/ParallelRuleBatch.cs @@ -28,7 +28,9 @@ public override IEnumerable Apply(TData input) } ); - return output.Distinct(Comparer); + TData[] distinctOutput = output.Distinct(Comparer).ToArray(); + 
AddRuleStats(distinctOutput.Length); + return distinctOutput; } } } diff --git a/src/SIL.Machine/Rules/PermutationRuleCascade.cs b/src/SIL.Machine/Rules/PermutationRuleCascade.cs index b16671f44..3af96a395 100644 --- a/src/SIL.Machine/Rules/PermutationRuleCascade.cs +++ b/src/SIL.Machine/Rules/PermutationRuleCascade.cs @@ -26,6 +26,7 @@ public override IEnumerable Apply(TData input) { var output = new HashSet(Comparer); ApplyRules(input, 0, output); + AddRuleStats(output.Count); return output; } diff --git a/src/SIL.Machine/Rules/PipelineRuleCascade.cs b/src/SIL.Machine/Rules/PipelineRuleCascade.cs index 524300883..1db20338f 100644 --- a/src/SIL.Machine/Rules/PipelineRuleCascade.cs +++ b/src/SIL.Machine/Rules/PipelineRuleCascade.cs @@ -29,6 +29,7 @@ public override IEnumerable Apply(TData input) inputSet = outputSet; } + AddRuleStats(outputSet.Count); return outputSet; } } diff --git a/src/SIL.Machine/Rules/RuleBatch.cs b/src/SIL.Machine/Rules/RuleBatch.cs index 61249068b..cea1af827 100644 --- a/src/SIL.Machine/Rules/RuleBatch.cs +++ b/src/SIL.Machine/Rules/RuleBatch.cs @@ -4,7 +4,7 @@ namespace SIL.Machine.Rules { - public class RuleBatch : IRule + public class RuleBatch : InstrumentedRule where TData : IAnnotatedData { private readonly List> _rules; @@ -25,6 +25,7 @@ public RuleBatch(IEnumerable> rules, bool disjunctive, IEq _rules = new List>(rules); _disjunctive = disjunctive; _comparer = comparer; + AddSubRules(_rules); } public IReadOnlyList> Rules @@ -42,16 +43,20 @@ public bool IsDisjunctive get { return _disjunctive; } } - public virtual IEnumerable Apply(TData input) + public override IEnumerable Apply(TData input) { var output = new HashSet(_comparer); foreach (IRule rule in _rules) { output.UnionWith(rule.Apply(input)); if (_disjunctive && output.Count > 0) + { + AddRuleStats(output.Count); return output; + } } + AddRuleStats(output.Count); return output; } } diff --git a/src/SIL.Machine/Rules/RuleCascade.cs b/src/SIL.Machine/Rules/RuleCascade.cs index 
a139e8ced..c1876d9c4 100644 --- a/src/SIL.Machine/Rules/RuleCascade.cs +++ b/src/SIL.Machine/Rules/RuleCascade.cs @@ -5,7 +5,7 @@ namespace SIL.Machine.Rules { - public abstract class RuleCascade : IRule + public abstract class RuleCascade : InstrumentedRule where TData : IAnnotatedData { private readonly ReadOnlyList> _rules; @@ -30,6 +30,7 @@ IEqualityComparer comparer _rules = new ReadOnlyList>(rules.ToList()); _multiApp = multiApp; _comparer = comparer; + AddSubRules(_rules); } public IEqualityComparer Comparer @@ -47,7 +48,7 @@ public IReadOnlyList> Rules get { return _rules; } } - public abstract IEnumerable Apply(TData input); + public abstract override IEnumerable Apply(TData input); protected virtual IEnumerable ApplyRule(IRule rule, int index, TData input) { diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs index 8245d17a1..1f5b6c243 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs @@ -553,4 +553,303 @@ private static string AnalysisSignature(Morpher morpher, string word) .OrderBy(s => s, System.StringComparer.Ordinal) ); } + + [Test] + public void ParseWord_SingleThreaded_MatchesParallel_WithCompounding() + { + // Exercises the Phase-3 positive-memo replay path (parse-optimization.md) specifically for + // compounding, not just plain affixes: an affix rule that commutes with a compounding rule -- + // both peers in the same Unordered MorphologicalRules cascade -- means the analysis cascade can + // revisit an equal AnalysisStateKey (same shape/features/rule-counts/non-head count) reached via + // different arrival orders, where the accumulated _nonHeadApps prefix need not be identical + // across arrivals even though the key treats non-heads as a bare count. 
Word.ReplayOnto grafts + // each arrival's OWN accumulated trail/non-head prefix onto a memoized subtree's suffix rather + // than reusing the memoized arrival's prefix verbatim -- if that graft were wrong, the memoized + // single-threaded cascade would diverge from the (unmemoized) parallel cascade here. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var rule1 = new CompoundingRule { Name = "rule1" }; + Allophonic.MorphologicalRules.Add(rule1); + rule1.Subrules.Add( + new CompoundingSubrule + { + HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, + NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("head"), new InsertSegments(Table3, "+"), new CopyFromInput("nonHead") }, + } + ); + + var prefix = new AffixProcessRule + { + Name = "prefix", + Gloss = "PAST", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct + .New(Language.SyntacticFeatureSystem) + .Feature(Head) + .EqualTo(head => head.Feature("tense").EqualTo("past")) + .Value, + }; + Allophonic.MorphologicalRules.Insert(0, prefix); + prefix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new InsertSegments(Table3, "di+"), new CopyFromInput("1") }, + } + ); + + var parallel = new Morpher(TraceManager, Language); + var singleThreaded = new Morpher(TraceManager, Language, maxDegreeOfParallelism: 1); + + foreach (string word in new[] { "pʰutdidat", "pʰutdat" }) + { + List singleResult = singleThreaded.ParseWord(word).ToList(); + List parallelResult = parallel.ParseWord(word).ToList(); + Assert.That( + singleResult.Select(WordResultSignature).OrderBy(s => s, System.StringComparer.Ordinal), + Is.EqualTo(parallelResult.Select(WordResultSignature).OrderBy(s => s, System.StringComparer.Ordinal)), + $"single-threaded parse of '{word}' must match the parallel 
parse" + ); + } + } + + private static string WordResultSignature(Word word) + { + // AllomorphsInMorphOrder alone would not catch a broken trail/non-head graft (it walks Shape + // annotations, which ReplayOnto never touches) -- MorphemesInApplicationOrder walks _mruleApps/ + // _nonHeadApps directly, which is exactly what Word.ReplayOnto rewrites. + return string.Join("+", word.AllomorphsInMorphOrder.Select(a => a.Morpheme.Id)) + + "|" + + string.Join("+", word.MorphemesInApplicationOrder.Select(m => m.Id)); + } + + [Test] + public void ParseWord_SingleThreaded_MatchesParallel_WithAffixTemplate() + { + // Exercises the template-battery memo (AnalysisStratumRule.ApplyTemplateBattery) specifically: + // TWO free prefix rules that commute with each other and with a template slot suffix. Unapplying + // di-then-ku vs ku-then-di reaches the same AnalysisStateKey (same shape, same rule MULTISET) + // with a different trail ORDER, so the second arrival replays the first arrival's stored + // template outputs with its own trail prefix grafted on (Word.ReplayOnto). One commuting prefix + // is NOT enough -- a single rule can only unapply once, so no key would ever be re-arrived at + // and the memo would never fire (verified: with one prefix the replay counter stays 0). 
+ var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + var edSuffix = new AffixProcessRule + { + Id = "TPAST", + Name = "template_ed_suffix", + Gloss = "PAST", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + edSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+d") }, + } + ); + var verbTemplate = new AffixTemplate + { + Name = "verb_template", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + verbTemplate.Slots.Add(new AffixTemplateSlot(edSuffix) { Optional = true }); + Morphophonemic.AffixTemplates.Add(verbTemplate); + + var diPrefix = new AffixProcessRule + { + Id = "TDI", + Name = "template_di_prefix", + Gloss = "DI", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + diPrefix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new InsertSegments(Table3, "di+"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(diPrefix); + + var kuPrefix = new AffixProcessRule + { + Id = "TKU", + Name = "template_ku_prefix", + Gloss = "KU", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + kuPrefix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new InsertSegments(Table3, "gu+"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(kuPrefix); + + try + { + var parallel = new Morpher(TraceManager, Language); + var singleThreaded = new Morpher(TraceManager, Language, maxDegreeOfParallelism: 1); + + AnalysisStratumRule.DiagTemplateMemoHits = 0; + foreach (string word in new[] { "digusagd", "disagd", 
"gusagd", "sagd", "sag" }) + { + List singleResult = singleThreaded.ParseWord(word).ToList(); + List parallelResult = parallel.ParseWord(word).ToList(); + Assert.That( + singleResult.Select(WordResultSignature).OrderBy(s => s, System.StringComparer.Ordinal), + Is.EqualTo( + parallelResult.Select(WordResultSignature).OrderBy(s => s, System.StringComparer.Ordinal) + ), + $"single-threaded parse of '{word}' must match the parallel parse" + ); + Assert.That( + singleResult, + Is.Not.Empty.Or.Property("Count").EqualTo(parallelResult.Count), + $"'{word}' sanity: both engines agree on parse count" + ); + } + // Guards against this test going vacuous: the replay path must actually fire for this + // grammar. (Mutation-tested like Phase 3's mrule-memo test, with the same result: breaking + // the ReplayOnto graft -- returning stored words verbatim -- does NOT fail the equivalence + // assertions above, because merge-by-shape plus ExpandAlternatives make trail-order + // differences unobservable in final signatures for grammars like this one. The graft's + // necessity rests on the construction argument documented in MemoizedCombinationRuleCascade; + // this assertion at least pins that the memoized path is exercised at all.) 
+ Assert.That( + AnalysisStratumRule.DiagTemplateMemoHits, + Is.GreaterThan(0), + "the template memo's replay path must actually fire for this grammar -- if this " + + "trips, the test grammar no longer forces a re-arrival at an equal state key " + + "and the equivalence assertions above are vacuously passing" + ); + } + finally + { + Morphophonemic.AffixTemplates.Remove(verbTemplate); + Morphophonemic.MorphologicalRules.Remove(diPrefix); + Morphophonemic.MorphologicalRules.Remove(kuPrefix); + } + } + + [Test] + public void EnableLexicalGating_MatchesDisabled_SimpleAffixGrammar() + { + // parse-optimization.md Phase 5: on a grammar with no reduplication/compounding/metathesis, the + // lexical gate should qualify and activate, but must never change which analyses come out -- + // it only prunes subtrees that could never reach any root, and Entries["32"] ("sag") is directly + // reachable the whole way through this simple suffix's unapplication. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var suffix = new AffixProcessRule + { + Id = "LEX_GATE_TEST_SUFFIX", + Name = "lex_gate_test_suffix", + Gloss = "PAST", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + suffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+d") }, + } + ); + Morphophonemic.MorphologicalRules.Add(suffix); + try + { + Assert.That( + GrammarAnalyzer.IsEdgeStripperQualified(Language), + Is.True, + "precondition: this grammar has no reduplication/compounding/metathesis" + ); + + var gateOff = new Morpher(TraceManager, Language, maxDegreeOfParallelism: 1); + var gateOn = new Morpher(TraceManager, Language, maxDegreeOfParallelism: 1) { EnableLexicalGating = true }; + + foreach (string word in new[] { "sagd", "sag" }) + { + List offResult = gateOff.ParseWord(word).ToList(); + List onResult = 
gateOn.ParseWord(word).ToList(); + Assert.That( + onResult.Select(WordResultSignature).OrderBy(s => s, System.StringComparer.Ordinal), + Is.EqualTo( + offResult.Select(WordResultSignature).OrderBy(s => s, System.StringComparer.Ordinal) + ), + $"lexical-gate-on parse of '{word}' must match gate-off parse" + ); + } + } + finally + { + Morphophonemic.MorphologicalRules.Remove(suffix); + } + } + + [Test] + public void IsEdgeStripperQualified_ReturnsFalse_ForReduplication() + { + // The same Lhs part copied twice in Rhs -- GrammarAnalyzer's own definition of reduplication + // (mirrors AnalysisMorphologicalTransform's capturedParts[name] > 1 case). + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule + { + Id = "LEX_GATE_TEST_REDUP", + Name = "lex_gate_test_redup", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Allophonic.MorphologicalRules.Add(redup); + try + { + Assert.That(GrammarAnalyzer.IsEdgeStripperQualified(Language), Is.False); + } + finally + { + Allophonic.MorphologicalRules.Remove(redup); + } + } + + [Test] + public void IsEdgeStripperQualified_ReturnsFalse_ForInfixation() + { + // Material inserted BETWEEN two copied (and here, distinct) parts splits the input's own + // contiguous material apart -- a real root's contiguous window in the lexicon would no longer + // appear as a contiguous window in this rule's output. 
+ var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var infix = new AffixProcessRule + { + Id = "LEX_GATE_TEST_INFIX", + Name = "lex_gate_test_infix", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + infix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = + { + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(any).OneOrMore.Value, + }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table1, "um"), new CopyFromInput("2") }, + } + ); + Allophonic.MorphologicalRules.Add(infix); + try + { + Assert.That(GrammarAnalyzer.IsEdgeStripperQualified(Language), Is.False); + } + finally + { + Allophonic.MorphologicalRules.Remove(infix); + } + } } diff --git a/tests/SIL.Machine.Tests/FeatureModel/FeatureStructTests.cs b/tests/SIL.Machine.Tests/FeatureModel/FeatureStructTests.cs index 2f5a6430d..6ab6eb250 100644 --- a/tests/SIL.Machine.Tests/FeatureModel/FeatureStructTests.cs +++ b/tests/SIL.Machine.Tests/FeatureModel/FeatureStructTests.cs @@ -1223,6 +1223,27 @@ public void Clone_OfFrozen_NeverMutated_EqualsSourceBothDirections() Assert.That(FreezableEqualityComparer.Default.Equals(source, clone), Is.True); } + [Test] + public void Clone_OfFrozen_NeverMutated_Freeze_MatchesSourceFrozenHashCode() + { + // parse-optimization.md Phase 7b: Freeze() on a copy-on-write clone that borrows a frozen + // source's exact backing (never mutated, so _shared stays true) must adopt the source's + // already-computed hash rather than recomputing it -- the source's _definite subtree is + // immutable, so the two are guaranteed to hash identically. A hash/value-equality assertion + // alone can't distinguish the shortcut from the (equally correct) full walk -- both compute + // the same answer -- so this also asserts the counter that proves the shortcut actually fired. 
+ FeatureSystem featSys = CowFeatSys(); + FeatureStruct source = BuildNestedFrozen(featSys); + + FeatureStruct clone = source.Clone(); + long hitsBefore = FeatureStruct.DiagSharedFreezeHits; + clone.Freeze(); + + Assert.That(FeatureStruct.DiagSharedFreezeHits, Is.EqualTo(hitsBefore + 1)); + Assert.That(clone.GetFrozenHashCode(), Is.EqualTo(source.GetFrozenHashCode())); + Assert.That(source.ValueEquals(clone), Is.True); + } + [Test] public void Clone_FrozenReentrant_MutateClone_PreservesSharingAndLeavesSourceUnchanged() {