diff --git a/.gitignore b/.gitignore index af6b4a93c..21cecc927 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,13 @@ tests/SIL.Machine.Tests/Corpora/TestData/usfm/target/* tests/SIL.Machine.Tests/Corpora/TestData/project/* tests/SIL.Machine.Tests/Corpora/TestData/pretranslations.json .idea + +# Local-only HermitCrab benchmark fixtures (real Sena/Indonesian grammars + word lists, used +# for ad hoc perf/allocation testing) + FieldWorks project backups. Large and/or not licensed +# for this repo, so they stay untracked; any [Explicit] benchmark that wants them falls back to +# the tracked samples/data/en-hc.xml when they're absent. +*.fwbackup +samples/data/sena-hc.xml +samples/data/sena-words.txt +samples/data/indonesian-hc.xml +samples/data/indonesian-words.txt diff --git a/docs/FST_FAST_PATH_PLAN.md b/docs/FST_FAST_PATH_PLAN.md new file mode 100644 index 000000000..3cee79b0a --- /dev/null +++ b/docs/FST_FAST_PATH_PLAN.md @@ -0,0 +1,570 @@ +# FST fast path — execution plan (no certification, all-in 99% coverage) + +> **STATUS AS OF THIS WRITING: all 5 phases executed and committed** (commits from "FST probe:..." +> through "FST: probe becomes the full composite..."). This is NOT the same as "done" — read each +> phase's own STATUS block (Phase 3 especially) and section 10/11 before assuming any specific +> number still applies. In short: certification is fully gone (Phase 1); the lexicon trie sharing +> landed and measured (Phase 2, Sena states 50,673→20,737); phonology auto-compilation landed but in +> a narrow v1 slice that does NOT reach Indonesian's `meN-` cascade (Phase 3, the real frontier); +> partial reduplication + infix surface variants landed, compounding was investigated and correctly +> deferred as a bigger data-model change (Phase 4); the probe now runs the full composite and full +> real-corpus numbers are measured (Phase 5: Sena 58.1%, Indonesian 62.0% coverage — the honest +> floor, not a target hit). Left-environment support (Phase 3) landed in a follow-up session but +> moved nothing on real Indonesian data: a NEW diagnostic found that `PhonologyRuleCompiler`'s +> `_alphabet` excludes boundary-type characters, so every one of Indonesian's 5 real phonological +> rules — 3 of which have a plain `BoundaryMarker` in their environment and are otherwise simple +> enough to compile — is rejected before its shape is even evaluated; **0 of 5 compile today**. +> +> **UPDATE (2026-07-02, same day, see `docs/FST_FULL_GRAMMAR_PLAN.md` for full detail): Indonesian's +> `meN-` coverage gap is CLOSED (93/121 → 120/121 on the full corpus) — but NOT by fixing +> `PhonologyRuleCompiler`'s boundary gap described above, which is STILL UNFIXED.** The actual fix +> went through a different, simpler path entirely: `SurfacePhonology`'s existing per-affix +> surface-variant precompile (used by `FstTemplateAnalyzer`, not `PhonologyRuleCompiler`/`InversePhonology` +> at all) already discovered the assimilated-nasal variants (`mem`/`men`/`meng`/`meny`) for free; +> two targeted fixes — a deleted-node rendering bug in `SurfacePhonology`, and a new +> `DeletionJunctions` probe + root-chain checkpoints in `FstTemplateAnalyzer` for when the cascade +> deletes the following root segment — closed the rest. `PhonologyRuleCompiler`'s own boundary bug +> (KNOWN_GAPS below) is now moot for THIS grammar (nothing routes through it for `meN-` anymore) but +> is still real for any OTHER grammar that would rely on that mechanism. +> +> **UPDATE (2026-07-03): Phases H, G1, and G2 in `docs/FST_FULL_GRAMMAR_PLAN.md` are ALL DONE.** +> Sena's build-time regression is fixed (9.3 s → ~1.0–1.5 s); Indonesian is 121/121 fully covered, +> 0 unsound; Sena's `ndikhali` (the one gap in its sampled corpus) is closed with exact 8/8 set +> parity. Compounding's "data-model lift" premise is confirmed false and closed for real (see the +> KNOWN_GAPS entry below and `FST_FULL_GRAMMAR_PLAN.md` Phase G2 for the full account, including +> one thing the original spec missed: `DerivableToCategory` needed to treat compounding as a +> category-transition edge for a root to reach a POST-COMPOUND-gated template). **Both real +> grammars this plan and its follow-up target are now fully covered on every word measured.** +> **If picking this up next**, the only remaining item is Phase I in `FST_FULL_GRAMMAR_PLAN.md` — +> the lazy per-rule-chain generalization (the true-FST path). **As of 2026-07-03 it is a FULL +> execution spec (milestones I0–I7 + optional I8, ~6–9 days, commit-gated)**, targeting +> correct-by-construction coverage of arbitrary regular HC grammars (word-internal phonology, +> long-distance harmony, deep feeding chains), not just the two measured ones. It also closes, +> properly, several KNOWN_GAPS below: the `PhonologyRuleCompiler` boundary gap and v1 scope limits +> (superseded by the new per-rule compiler), the frontier beam cap (I6), and §3b's chain walker. + +**Audience:** an executing agent (Sonnet) working in this worktree +(`C:\Users\johnm\Documents\repos\machine-fst-advisor`, branch `fst-advisor`, rebased on `hc-rustify`). +Read this whole file before editing anything. Work phase by phase, in order; each phase ends with a +green build + full test suite + a commit. Do not start a phase until the previous one is committed. + +## 1. Mission + +Turn the FST work on this branch into ONE thing: a **fast, opt-in, propose-and-verify analyzer** +(`FstCoverageProbe` over `VerifiedFstAnalyzer`) that covers **as close to 99% of every HC construct +as possible**, so a grammar engineer can edit any rule — affixation, templates, compounding, +phonology (including boundary-conditioned), infixation, reduplication — and see the effect in the +probe's numbers in milliseconds. It is a grammar-tuning instrument, not a production analyzer. + +**The contract (never weaken it):** +- **Sound on positives.** Every emitted analysis is confirmed by HC's own restricted re-analysis + (`FstReplay.Confirm` pins `LexEntrySelector`/`RuleSelector` and runs the real `Morpher.AnalyzeWord`; + restriction can only remove paths, never fabricate one). No false positives, ever. +- **Known-incomplete on negatives.** A missed parse is acceptable (that is the 1%); a wrong parse is not. +- **Opt-in only.** Never wired into `Morpher` or any default parsing path. + +**Explicitly dead:** the entire *certification* concept — empirical corpus-parity gates, "certified ⇒ +FST-only, engine skipped", grammar closure as a runtime gate, completeness proofs. It was fragile +(certifying on 30 Sena words, decertifying on 60) and it is not the product. Delete it; do not +rebuild it under another name. + +## 2. Architecture: the three-mechanism split (why nothing explodes) + +Each construct class gets the ONE mechanism that is bounded for it. Mixing these up is how you get +exponential size, build time, or walk time. + +| Construct class | Mechanism | Bound | +|---|---|---| +| Concatenative morphotactics + lexicon (affixes, templates/slots, derivation, compounding) | **Eager**, inside the automaton, as a **shared trie** | Additive: `|lexicon trie| + |affix inventory|`. Tries cannot multiply. | +| Phonology (rewrite rules: feature change, deletion, epenthesis, metathesis; all strata) | **Lazy composition at analysis time** — each rule compiles once to its own small transducer; the surface word is walked through rule-inverses and the lexicon trie **in lockstep**. The composed product is never stored. | Build: per-rule, independent of lexicon. Per word: `word length × live frontier` (beam-capped). | +| Reduplication + infixation (unbounded copy is provably non-regular — cannot be in a 1-way FST at any size) | **Runtime peel** (pre/post-processing): detect, strip, re-analyze residual through the fast path, wrap with the morpheme | Redup: O(n²) scan, ≤2 applications. Infix: O(sites × infixes). | + +**Forbidden approaches** (each was tried or scoped on this branch and is a known blowup): +- Do NOT eagerly compose `lexicon ∘ rule₁ ∘ … ∘ ruleₙ` into one automaton (multiplicative states). +- Do NOT materialize root × affix-permutation surface tables (`ForwardSynthesisProposer` — measured + 5 s build at depth 2, 45 s at depth 3 on a 2,283-entry grammar; scales `roots × affixes^depth`). + It gets deleted in Phase 4. +- Do NOT invert phonology on the whole surface *before* the morphotactic walk + (`ComposedPhonologyProposer`'s design). Without morpheme boundaries on the tape, + boundary-conditioned rules (Indonesian `meN-`) fire everywhere and explode into garbage. + Lockstep composition (Phase 3) is the fix; the old proposer gets deleted then too. +- Do NOT `Determinize`/`Minimize` across unification (FeatureStruct) arcs — merging distinct paths + destroys multi-analysis enumeration. Determinizing the plain-symbol lexicon trie layer is fine. + +## 3. Repo-specific facts the executor must know (learned the hard way) + +- **Build strictness:** `TreatWarningsAsErrors` everywhere; **IDE0005 (unused using) is a build + error**. After removing code, always remove now-unused usings. CI also runs **CSharpier** + formatting (`dotnet csharpier .` if formatting failures appear). +- **Generic offset type is `int`, not `ShapeNode`** (hc-rustify change): patterns are + `Pattern`, cascades `LinearRuleCascade`, rules `IRule`. +- **`MorpherPool` API is `Rent()` / `Return(morpher)`** (no disposable wrapper). Concurrency + pattern: rent per call, return in `finally`. A single shared `Morpher` must never be used from + multiple threads (mutable selectors). +- **`InternalsVisibleTo`** is set for the test assembly; internal types are testable directly. +- **The engine oracle is SLOW and has pathological words**: `Morpher { MaxUnapplications = 0 }` on + the Sena wordlist runs 100s of ms/word average, with individual words taking tens of seconds+. + Therefore: engine-parity comparisons live ONLY in `[Explicit]` benchmarks, never in CI tests, and + always with `HC_MAX_WORDS` capped. +- **Benchmarks** (`FstSenaBenchmark`) are `[Explicit]`, driven by env vars: + `HC_GRAMMAR`, `HC_WORDS`, `HC_MAX_WORDS`, `HC_THREADS`; run via + `dotnet test --filter "FullyQualifiedName~FstSenaBenchmark."`. Server GC via + `DOTNET_gcServer=1` (the new `Benchmark_ParallelThroughput` prints whether it took effect). +- **Test grammars:** + - Sena (concatenative, 0 phonological rules, no redup/infix): + `C:\Users\johnm\Documents\repos\machine\samples\data\sena-hc.xml` + `sena-words.txt` (7,121 words). + - Indonesian (boundary-conditioned `meN-` nasal substitution + deletion, 3 reduplication rules): + `C:\Users\johnm\Documents\repos\machine\samples\data\indonesian-hc.xml` + `indonesian-words.txt`. + - Load with `XmlLanguageLoader.Load(path)`. +- **`VerifiedFstAnalyzer.AnalyzeWord` returns a lazy iterator** — every re-enumeration re-runs + propose+verify. Materialize (`.ToList()`) before enumerating twice. +- **Baseline numbers** (this machine, 16 threads, Server GC, Sena 60 words): verified FST + ~12–20 ms/word vs pooled engine ~445–837 ms/word (~22–72×; variance is the engine's pathological + words, not the FST). Composite coverage vs engine on Sena 200 words: **192/200, 0 unsound**. + Indonesian (from docs; with the now-doomed forward synthesis): 69/70. The Phase 3 target is to + match/beat that 69/70 **without** forward synthesis. + +## 4. Phase 0 — ✅ DONE (committed) + +Was uncommitted work at plan-writing time; now committed: +- `src/.../FstCoverageProbe.cs` — the probe: `ForLanguage(language).Probe(words)` → `ProbeReport`; + `CompareGrammars(before, after, words)` → `CoverageDiff`. (Since rewritten in Phase 5 to build + the full composite instead of the bare FST — this description is the Phase-0-era shape.) +- `tests/.../FstCoverageProbeTests.cs` — CI tests (grew from 4 to 8 across Phases 0 and 5). +- `src/.../SurfacePhonology.cs` — memoized `Variants` (build-time fix). +- `tests/.../FstSenaBenchmark.cs` — added `Benchmark_ParallelThroughput` (pooled, thread-count and + Server-GC aware). + +## 5. Phase 1 — ✅ DONE — purge certification (delete, don't deprecate) + +**Delete these files** (all exist only for certification/completeness/caching): +- `src/.../CompleteHybridMorpher.cs` +- `src/.../CachingMorphologicalAnalyzer.cs` +- `src/.../AnalysisCache.cs` +- `src/.../AnalysisCacheSerializer.cs` +- `src/.../MorphemeRegistry.cs` +- `src/.../GrammarFstClosure.cs` +- `tests/.../CachingMorphologicalAnalyzerTests.cs` +- `tests/.../GrammarFstClosureTests.cs` + +**Trim (keep the file, remove the cert role):** +- `FstVerification.cs` — keep `Compare` (set-parity diff) but re-document it as a **manual + divergence-inspection tool** for `[Explicit]` benchmarks only; delete any "certificate" language. +- `CompositeProposer.cs` — remove `CoversAllConstructs`/covered-ops plumbing if its only consumer + was certification (check first: `grep -rn "CoversAllConstructs" src tests`). Keep the + union+dedup core and `ForLanguage`. +- `FstTemplateAnalyzer.cs` — keep `UncoveredOps` ONLY if repurposed as probe diagnostics ("these + constructs are not covered — expect unparsed words there"); otherwise delete. Remove "certify" + from all comments. +- `VerifiedFstAnalyzerTests.cs`, `FstTemplateAnalyzerTests.cs`, `FstVerificationTests.cs` — delete + tests that construct `CompleteHybridMorpher`/`CachingMorphologicalAnalyzer` or assert + certification; keep soundness/parity-shape tests that only use `VerifiedFstAnalyzer` + `Morpher`. +- `FstSenaBenchmark.cs` — delete `Benchmark_CertifyWithBoundedReduplication`; rework + `Benchmark_FstVsSearch`/`Benchmark_ParallelThroughput` to drop `CachingMorphologicalAnalyzer` + (compare pooled engine vs `VerifiedFstAnalyzer` only); keep `Benchmark_CompositeVsSearch`, + `Diagnose_Divergences`, `Soundness_NegativeExamples`, `Concurrent_MatchesSequential`. +- `FstCoverageProbe.cs` — update doc comments (they reference the deleted types). +- `GrammarFstAdvisor.cs` — KEEP (it is a static linter, useful independently), but strip + certification references from comments if any. + +**Docs:** move `FST_FULL_COVERAGE_PLAN.md`, `FST_FULL_PLAN.md`, `HERMITCRAB_FST_PLAN.md` to +`docs/archive/` with a one-line `> Superseded by FST_FAST_PATH_PLAN.md` header prepended. Keep +`LEVER_2.md` in place (it is the technical spike record Phase 3 builds on) but add the same header +pointing here for scope. + +**Verify:** `grep -rniE "certif" src tests docs --include="*.cs" --include="*.md"` (excluding +`docs/archive/`) returns nothing. Full build green (watch IDE0005 after deletions), full suite green. +Commit: `FST: remove certification concept entirely`. + +## 6. Phase 2 — ✅ DONE — shared lexicon trie in `FstTemplateAnalyzer` + +**Problem (measured):** `BuildRootChain` gives every root allomorph its own disjoint arc chain and +rebuilds all roots **per template** (plus once bare, plus once template-less). Sena: 50,673 states +for 1,463 root allomorphs. Walk cost at every root-entry position ≈ `roots × (templates+1)` +`FeatureStruct.IsUnifiable` calls — linear in lexicon size per word. This is the scaling wall for +big lexicons. + +**Fix:** build ONE prefix-shared root network (trie over the per-segment `FeatureStruct`s / +surface-variant strings), entered by every template. Per-root data (`_tokenOnEntry`, the lex-entry +token) moves to trie **accepting nodes** (a node can accept multiple homograph entries — keep a +list). Where roots attach to different template sets, gate at the trie *exit* (accepting node → +template-continuation arcs), not by duplicating the trie. + +Also do here (same hot loop, from the perf audit): +- Replace the `Key`/`PKey`/`emitted`/`Signature` **string keys** in the NFA walk with struct keys + (`(int stateId, int tokensHash)` with proper `Equals`) — string building dominates per-word + allocation. +- Hoist the per-segment `List`/`HashSet`/`Stack` allocations in `AnalyzeShape`/`EpsilonClosure` + into reusable buffers per walk. + +**Success criteria:** (a) all analyses identical before/after on the toy-grammar suite AND on +`Benchmark_CompositeVsSearch` for Sena (192/200, 0 unsound — must not change); (b) state count on +Sena drops substantially (print it; expect ≪ 50k); (c) `Benchmark_ParallelThroughput` verified +ms/word does not regress (expect improvement). Commit. + +## 7. Phase 3 — ⚠ PARTIAL (v1 slice only, see STATUS below) — phonology via lazy lockstep composition (the big one) + +> **STATUS (partial — read before continuing this phase).** Sub-steps 3a.1 (feature-change), +> 3a.2 (deletion), and now left-environment support (symmetric to the right-environment chain, added +> this session — see below) are implemented and tested (`PhonologyRuleCompiler.cs`, +> `LockstepPhonologyProposer.cs`, `PhonologyRuleCompilerTests.cs`). **Still scoped to single-segment +> Lhs, non-interacting rules** — no true multi-rule cascade composition, no α-variable expansion. +> **The phase gate below is NOT met, and left-environment support did NOT move real Indonesian +> coverage**: `Benchmark_CompositeVsSearch` on the FULL 121-word Indonesian corpus measures identical +> bare-FST-vs-composite (93/93 fully covered, 0 unsound) both before and after adding left-environment +> support. The reason is NOT (only) the previously-documented feeding/bleeding cascade gap — a +> session-specific diagnostic found a more fundamental, previously undocumented blocker: **`_alphabet` +> in `PhonologyRuleCompiler` is built from `table.Where(cd => cd.Type == HCFeatureSystem.Segment)`, +> which excludes boundary-type characters entirely** (`AddBoundary` tags them +> `HCFeatureSystem.Boundary`). `BuildProbeString` searches only `_alphabet` for a representative +> segment per environment constraint, so **any subrule whose environment contains a `BoundaryMarker` +> can never find one and is unconditionally marked unsupported** — not a soundness issue, but it means +> the probe never even gets to test whether the rule's core transformation is invertible. Measured on +> Indonesian's real grammar (`indonesian-hc.xml`, all 5 phonological rules, one subrule each): **5/5 +> unsupported, 0 compiled**, confirmed via `PhonologyRuleCompiler.Compile`'s `UnsupportedRuleCount`. +> Three of the five (`Unspecified nasal default`, `Nasal deletion`, `Nasal assimilation`) have a plain +> `BoundaryMarker` in their right environment and are otherwise within v1's supported shape (no +> quantifiers, no MPR gating) — they would very plausibly compile if the boundary-representative gap +> were fixed (`Nasal assimilation` would still additionally need α-variable expansion in its output, +> since its substitution target agrees in place-of-articulation with the following consonant). The +> other two are separately blocked regardless (`Nasalization in reduplication` has an +> `OptionalSegmentSequence` quantifier in its left environment; `Voiceless obstruent deletion` has +> `excludedMPRFeatures`). **This means the entire Phase-3 lockstep-phonology mechanism has never +> actually fired on real Indonesian data at any point in this branch's history** — the 54/70 (and now +> 93/121) "identical with vs. without the proposer" results were never actually exercising a compiled +> rule; they were comparing the bare FST against itself. Fixing the boundary-representative gap (make +> `BuildProbeString`/the probe machinery treat a `BoundaryMarker` constraint specially — e.g. insert an +> actual boundary annotation into the probe string rather than searching `_alphabet` for one) is now +> the higher-priority prerequisite; α-variable expansion and true cascade composition remain necessary +> afterward for full `meN-` coverage. Sena (0 phonological rules) is confirmed unaffected by any of +> this — 58/60, 0 unsound, matching the pre-Phase-3 baseline exactly. +> +> **A real bug found and fixed along the way, worth knowing:** HC's deletion rules mark a `ShapeNode` +> `IsDeleted()` rather than physically removing it from the `Shape` — code that counts/reads +> segments after applying a rule (as this compiler's probing does, and as `SurfacePhonology.cs`'s +> `SurfaceNodes`/`NodeCount` ALSO does, pre-existing and not fixed here — out of this phase's scope) +> must filter `!n.IsDeleted()` or it will undercount what actually changed. +> +> **Also fixed this session (latent, not previously exercised):** the original `AddRestorationBranch` +> always routed through `ChainRightEnvironment`, which is a no-op on an empty list — a +> left-environment-only (or entirely unconditioned, though that shape is separately rejected) deletion +> would have added a dangling arc to a state with no way back to state 0, silently contributing +> nothing. Both `AddRestorationBranch` and `AddSubstitutionBranch` now special-case an empty right +> environment the same way (direct arc back to state 0), matching the pattern +> `AddSubstitutionBranch` already used for the zero-right-environment case. + +This replaces BOTH `ComposedPhonologyProposer` (wrong on boundary-conditioned rules) and +`ForwardSynthesisProposer` (exponential build) — that is the END STATE, not yet reached (see STATUS +above). `docs/LEVER_2.md` + `LeverTwoSpikeTests.cs` already proved the lockstep walk recovers +deletion and an opaque two-rule cascade **on hand-built transducers**; the missing piece is the +compiler from real HC rules. + +**3a. `RewriteRule → RuleFst` compiler** (new file, e.g. `PhonologyRuleCompiler.cs`). +Compile each `RewriteRule` subrule (`φ → ψ / λ _ ρ`, all bounded patterns first) into a small +transducer over per-segment `FeatureStruct`s: states = position within λ·φ·ρ window; arcs carry +(match-FS, output-FS-or-ε). Handle, in this order, each behind its own tests: + 1. feature-change (same length φ/ψ), + 2. deletion (φ → ∅) — the *inverse* inserts; cap reinsertions per word (reuse the + `Morpher.DeletionReapplications` value as the bound), + 3. epenthesis (∅ → ψ) — inverse deletes; bounded trivially, + 4. metathesis (`MetathesisRule`) — bounded window swap, + 5. α-variables in environments — expand per feature value **within the one rule** (bounded, + small); if a variable is genuinely unbounded, mark that rule "unsupported → this rule is + skipped in the fast path" and surface it in probe diagnostics (never silently wrong, verify + still guards). +Boundary conditioning: the intermediate tape in the lockstep walk must carry morpheme-boundary +markers (HC's `HCFeatureSystem.Boundary` annotations) so `meN-`-style rules see `+` — this is +exactly what the old surface-inversion design lost. + +**3b. Lockstep walker.** Extend the analyzer walk: a configuration is +`(trieState, ruleState₁ … ruleStateₙ, tokens)`; input segments feed through the rule-inverse chain +(surface stratum first, rules reversed within a stratum — same order `AnalysisStratumRule` uses) +into the trie, advancing all coordinates together. No product automaton is ever stored. +Under-specified segments unify against trie arcs as today. + +**3c. Guardrails (hard-wired, with tests):** +- **Frontier beam:** hard cap on live configurations per word (default generous, e.g. 10k; + configurable). On overflow: drop the word to "unparsed", count it, expose + `ProbeReport.BeamOverflows`. Never throw. +- **Reinsertion cap** per deletion rule per word (3a.2). +- **Build budget:** per-rule transducer state count asserted small (< ~100 states); a rule that + blows past it is marked unsupported + diagnosed, not built. +- **Verify unchanged:** every candidate still goes through `FstReplay`. Phonology proposals that + are wrong cost time, never correctness. + +**3d. Retire the old mechanisms:** delete `ComposedPhonologyProposer.cs`, +`ForwardSynthesisProposer.cs`, `InversePhonology.cs` (check consumers first), their tests, and the +`forwardSynthesis` flag threading. `SurfacePhonology` (isolation + boundary-probe precompile of +affix surface variants) MAY stay if it still wins on simple cases — decide by measuring Indonesian +coverage/latency with it on vs off; delete if redundant. + +**Success criteria (the phase gate) — actual results:** +- Toy-grammar CI tests: each of 3a.1–4 has a test where (i) the engine parses a word needing that + rule type, (ii) the fast path finds the same analysis set, (iii) a non-word stays unparsed. **Met + for 3a.1/3a.2 and left-environment (both deletion- and substitution-conditioned) only** + (`PhonologyRuleCompilerTests.cs`: right-context deletion, unconditioned substitution, left-context + deletion, left-context substitution, two unsupported-shape rejection tests, one composite-wiring + integration test — 8/8 passing). 3a.3–3a.5 (epenthesis, metathesis, α-variables) and true + multi-rule cascade composition not attempted. +- **Indonesian:** `Benchmark_CompositeVsSearch` coverage ≥ **69/70** with `forwardSynthesis` + DELETED — **NOT MET**. Measured on the FULL 121-word corpus (this session): bare FST 93/121, + composite 93/121, identical with and without left-environment support, 0 unsound. As STATUS above + explains, this is not primarily the cascade gap — it is that **0 of Indonesian's 5 real + phonological rules compile at all**, due to the newly-found boundary-representative gap + (`_alphabet` excludes boundary-type characters, so any rule with a `BoundaryMarker` in its + environment — 3 of the 5 — is rejected before its core shape is even evaluated). Left-environment + support is verified correct at the unit level but had nothing real to apply to on this grammar. +- **Sena:** still 58/60 (the corpus slice actually measured this session), 0 unsound — confirmed + unaffected, exact match to the pre-Phase-3 baseline. (The plan's original 192/200 figure is the + 200-word corpus from earlier sessions; re-verify at that size before calling this final.) +- Build time: Indonesian compiled in ~2.6s total for the whole `Benchmark_CompositeVsSearch` run + (build + 70-word analysis) — no `roots × affixes` materialization in the new compiler itself + (verified: it probes the alphabet, bounded by alphabet size × rule count, not lexicon size). +- Commit per sub-step: not followed as literally as planned — 3a.1 and 3a.2 landed together in one + commit since the deletion and substitution code paths share almost all of the same compiler + machinery and were developed/debugged together. + +## 8. Phase 4 — ⚠ PARTIAL (see status per item) — close the remaining construct gaps + +Work items, each: implement → toy-grammar test (engine finds it, fast path finds it, non-word +rejected) → measure on Sena+Indonesian → commit. + +1. ✅ **DONE — Partial/CV reduplication templates.** `ReduplicationProposer` previously detected + only exact full-copy (`word.Length` even, first half == second half). Generalized to scan every + copy length from 1 up to `word.Length / 2`, both prefix-copy (`copy·base`) and suffix-copy + (`base·copy`) — full reduplication is now just the `len == word.Length / 2` case of the same + scan, so it subsumes rather than sits beside the old logic. Still `O(word length²)`, still + verify-gated (a coincidental short repeat is proposed but rejected — tested: + `Composite_CoversFullReduplication_WhereFstAloneMisses`'s new `"sasag"` assertion in + VerifiedFstAnalyzerTests.cs). No genuine CV-template grammar was built to positively exercise a + real partial match end-to-end (constructing a correct multi-group HC `Pattern` for a CV-shaped + `Lhs` was judged higher-risk than the time available justified — no existing test in this repo + uses `Pattern.Group(...)`, so it would have been unvalidated territory). Confirmed no regression + on Sena (58/60) and Indonesian (54/70), 0 unsound on both. +2. ✅ **DONE — Infix surface variants.** `InfixProposer` searched only for an infix's literal + underlying string. Now builds a `SurfacePhonology` (the same isolation/boundary-probe machinery + already used for regular affix arcs) once per infix and searches for every surface variant, not + just the underlying form — a phonologically-altered infix is no longer invisible to the literal + substring search. Confirmed no regression on Sena/Indonesian (identical coverage, 0 unsound); no + dedicated end-to-end test added (same reasoning as #1 — a genuine phonologically-altered-infix + toy grammar wasn't built this pass), so this is verified by code reuse (the exact same + `SurfacePhonology.Variants` already extensively tested for affixes) plus the real-grammar + regression checks, not a new positive-case unit test. Multi-slot templatic infixation stays out + of scope (unchanged, documented residual). +3. ⬜ **NOT DONE — Compounding is a bigger lift than originally scoped.** Investigated: the fix is + NOT just "extend `FstReplay` to pin two roots." `WordAnalysis` (in `SIL.Machine.Morphology`, a + shared type well outside this branch's scope) has a single scalar `RootMorphemeIndex : int` — + there is no way to represent a second root at all in the current data model. Properly supporting + compounds requires: (a) extending `WordAnalysis` (or building a parallel representation) to carry + multiple root positions, which ripples into `MorphToken`/`MorphTokenCodec` (root-index encoding + assumes one root), every signature function across this codebase that reads + `RootMorphemeIndex` as a scalar (`FstReplay`, `FstVerification`, `CompositeProposer`, several + test files), (b) a new compounding candidate generator (propose split points bounded by + `MaxStemCount`), and (c) extending `FstReplay.Confirm` to pin two `LexEntrySelector` roots. This + is a genuine cross-cutting data-model change, not a local fix — deferred rather than attempted + under time pressure on a shared type. Left in `KNOWN_GAPS`. +4. **Construct sweep — enumerate and check off every HC feature** (done as an audit, not a + line-by-line test-writing exercise given time already spent on #1–#3): + + | Construct | Status | + |---|---| + | Affix process rules (prefix/suffix) | ✅ core `FstTemplateAnalyzer` | + | Circumfix (prefix+suffix halves) | ✅ `MorphOp.CircumfixPrefix/CircumfixSuffix` handled in `ForwardSynthesisProposer`'s covered-ops (only when `forwardSynthesis` opt-in is on); NOT built directly by the bare FST or by a dedicated generator otherwise — falls to the engine when forward-synthesis is off | + | Realizational rules (`RealizationalAffixProcessRule`) | ✅ handled identically to `AffixProcessRule` in `FstTemplateAnalyzer.Allomorphs`/`RequiredCategory` | + | Affix templates + slots (incl. obligatory slots) | ✅ `AppendSlots`/`ClassifyTemplate` | + | Compounding rules | ❌ see #3 above — `KNOWN_GAPS` | + | Strata + morphophonemic/allophonic rule placement | ✅ build iterates `language.Strata` in order; phonology precompile + Phase 3 lockstep both stratum-aware | + | MPR features / co-occurrence rules | ⚠ **not build-time gated** — the FST does not check `RequiredMprFeatures`/`ExcludedMprFeatures` when building arcs, so a candidate that would violate an MPR co-occurrence constraint can be proposed; **sound regardless** because `FstReplay` re-runs real HC analysis (which does check MPR features) — this is a precision gap (more candidates verified and rejected than strictly necessary), not a soundness gap | + | Allomorph environments (`AllomorphEnvironment`) | ⚠ same as MPR features — not build-time gated, but verify-safe | + | Stem names (`StemName`) | ⚠ same — not build-time gated, but verify-safe | + | Partial application (`Word.IsPartial`) | N/A — a runtime/incremental-parsing concept on HC's `Word`, not applicable to this static analyzer's build | + | Clitics (`MorphOp.Clitic`) | ❌ **no generator exists.** `MorphOp.Clitic` is a real enum value or `ClassifyOp` result, but unlike Infix/Reduplication/Process it has no dedicated `IConstructProposer` — a grammar using clitics falls entirely to the engine for those words today. Not attempted this pass (no evidence any test grammar or Sena/Indonesian uses clitics, so priority was judged lower than #1/#2). Added to `KNOWN_GAPS`. | + | Process/simulfix (`ModifyFromInput`, `MorphOp.Process`) | ❌ same bucket as clitics in `FstTemplateAnalyzer`'s uncovered-ops default case — no dedicated generator; falls to the engine | + + The sweep found two NEW gaps not previously listed (clitics, process/simulfix have no + generator) and confirmed the MPR/environment/stem-name build-time-gating gap is a precision, not + soundness, issue. All added to `KNOWN_GAPS` below. + +## 9. Phase 5 — ✅ DONE — the probe is the product + +1. `FstCoverageProbe.ForLanguage` builds the **full composite** (trie FST + lockstep phonology + + peel generators) — the all-in fast path — instead of the bare `FstTemplateAnalyzer`. +2. `ProbeReport` gains diagnostics: `BeamOverflows`, `UnsupportedRules` (from Phase 3c), + `UncoveredConstructs` (repurposed `UncoveredOps`), wall-time. +3. Add an `[Explicit]` end-to-end benchmark: full Sena wordlist (7,121 words) through the probe — + record coverage + p50/p95 ms/word; same for Indonesian. +4. Edit-loop test (CI, toy grammar): `CompareGrammars` detects gained/lost coverage for at least + one edit per mechanism class — an affix rule edit, a phonological rule edit, a reduplication + rule edit. This is the product promise: *any* grammar change moves the probe. + +## 10. Global success criteria (the definition of done) — actual results + +> These are measured against the FULL wordlists (Sena 7,121 words, Indonesian 121 words) via +> `Benchmark_FullCorpusProbe`, not the small capped slices used elsewhere in this plan for fast +> iteration. The full-corpus numbers are materially different (lower coverage) than the small-slice +> numbers quoted earlier in this document — expected, since rare/complex word forms concentrate in +> the tail of a real wordlist, and this is the more honest number to hold the system to. + +1. ✅ **Zero certification:** no cert/closure-gate/parity-gate concept in code, tests, or live docs + (Phase 1, re-verified: `grep -rniE "certif" src tests docs` outside `docs/archive/` empty). +2. ✅ **Soundness:** `Benchmark_CompositeVsSearch` unsound = 0 on both grammars (re-confirmed after + every phase's changes in this session). `Soundness_NegativeExamples` was not re-run this session + on the full corpus — worth doing before calling this fully closed. +3. ✅ **Coverage — MET on Sena (99.2%) once measured against the right denominator.** The raw + probe-parsed rates (**Sena 58.1%** of 7,121, **Indonesian 62.0%** of 121) initially looked like a + large miss, but those denominators are the RAW WORDLIST, which contains many words the search + engine itself cannot parse (out-of-lexicon roots, loanwords like `swahili`, + contracted/punctuated forms like `na'pinacita`, proper nouns, typos). Measured properly on a + seeded 200-word random Sena sample (Get-Random -SetSeed 42) via per-word ISOLATED oracle child + processes (an in-process run literally crashed the test host — see the pathology note below): + - FST parsed **120/200**; of the 80 it didn't parse, **79 don't parse in the engine either** + (73 fast no-parses + 6 words where the unbounded engine needed 12–90+ s just to PROVE no + parse exists). + - Exactly **1 genuine FST gap**: `ndikhali` (8 engine analyses; a copula construction — + `ser` "to be" + NZR + class prefix — the copula/TAM gap already in `KNOWN_GAPS`). + - **FST coverage of engine-parseable words: 120/121 = 99.2%.** + Indonesian was NOT re-measured with an engine-parseable denominator; its head-70 slice showed + 60/70 engine-parseable with 54 fully covered (90%), and unlike Sena it has a KNOWN real gap + class (the `meN-` cascade, Phase 3's frontier), so its true ratio is likely lower than Sena's. + **Pathology note (a genuine fast-path selling point):** the engine's worst case is proving a + NON-word has no parse — 6 sample words each burned 12–90+ s of unbounded search, and one word + OOM-crashed an entire test-host process; the FST probe answers all of them in milliseconds. + That is exactly the behavior a grammar-tuning probe must not inherit. +4. ✅ **Speed:** full-corpus p50/p95 — **Sena: 31 ms / 173 ms**, **Indonesian: 1.4 ms / 6.0 ms** + (sequential, single-threaded, via `FstCoverageProbe`; the full composite is heavier than the bare + FST measured earlier in this document, so these are noticeably higher than the ~15–20 ms/word + bare-FST figures quoted earlier — the phonology/reduplication/infix generators all now run on + every word). Sena's p95 (173 ms) is close to the plan's original "under 50ms" target and misses + it; Indonesian is comfortably under. The 16-thread/Server-GC parallel throughput multiplier + (22–72× over the pooled engine, measured earlier in this session on 60-word slices) was not + re-measured against the full composite at full-corpus scale this session. +5. ✅ **No blowups:** Sena FST states 20,737 (Phase 2, down from 50,673); Indonesian/Sena full-corpus + builds completed without any `NotSupportedException` (the state-budget abort). No frontier-beam + guardrail was ever implemented (Phase 3c's beam cap did not get built along with the rest of the + lockstep walker) — flagged in `KNOWN_GAPS`, since the walk has no explicit cap on live + configurations today. +6. ✅ **Every phase's full suite green**, CSharpier clean — true at every commit in this session + (108 → 115 HermitCrab tests across the five phases, all passing at each phase boundary). +7. ✅ **The edit-loop test (Phase 5.4) passes** — three tests in `FstCoverageProbeTests.cs` + (`Probe_DetectsGainedCoverage_AfterAddingSuffixRule`/`...PhonologicalRule`/`...ReduplicationRule`) + confirm an edit in each of the three implemented mechanism classes visibly moves probe output. + +## 11. KNOWN_GAPS (maintain this list as you go) + +- **Copula/TAM constructions — RE-DIAGNOSED 2026-07-02: it's compounding, not a missing prefix + layer.** `ndikhali` (the ONLY genuine FST gap in the 200-word random Sena sample, 8 engine + analyses) was ground-truthed with a bounded diagnostic: it is `ndi` ("é") ⊕ `khal` ("ser") via + Sena's real `CompoundingRule` (mrule7/mrule8) + `-i` NZR + a zero class prefix — a two-root + compound, NOT the "prefixal derivation" construct the archived plan guessed. Closing it is the + compounding item below (fix speced in `docs/FST_FULL_GRAMMAR_PLAN.md` Phase G2), and would make + the sample 100% of engine-parseable. +- Templatic multi-slot infixation (deliberate residual, see Phase 4.2). +- Unbounded-copy reduplication beyond 2 applications (peel bound). +- **No frontier-beam cap on the NFA walk (`AnalyzeShape`/`AnalyzeComposed`/`EpsilonClosure`/ + `ComposedClosure` in `FstTemplateAnalyzer.cs`).** Plan section 7 (3c) specified this guardrail; + it was never implemented — confirmed by grep, no `beam`/`maxConfig` logic exists. The live + config set can in principle grow unboundedly on a pathological word/grammar combination (many + ambiguous unification paths). Not observed in practice on Sena/Indonesian full-corpus runs this + session (both completed without incident), but it's a real, un-guarded risk for a grammar this + hasn't been tested against. Should be added before treating this as production-safe for arbitrary + grammars. +- ✅ **Compounding — CLOSED (2026-07-03).** The Phase 4.3 claim ("`WordAnalysis.RootMorphemeIndex` + is a single `int` — the shared data model has no way to represent a second root at all," + requiring a cross-cutting multi-root data-model lift) was wrong, as suspected when this entry + was first corrected: `MorphOp.Compound` already existed, `WordAnalysis` already represented + compounds (the engine's own `ndikhali` analyses proved it), and the real fix was a `FstReplay` + change plus a trie compound loop — landed in `FstTemplateAnalyzer.cs`/`FstReplay.cs` + (`BuildCompoundLoop`, `ToWordAnalyses`, `DerivableToCategory`'s compounding-edge extension). One + thing the spec DIDN'T anticipate: reaching a template gated on a POST-COMPOUND category (Sena's + noun-class-agreement prefix, which requires NZR's output category, itself only reachable via + `compound → NZR`) needed `DerivableToCategory` to treat compounding as a category-transition + edge, not just standalone derivational rules — without it, the compound loop worked but the + class-prefix template stayed unreachable for either root. Measured: Sena's `ndikhali` — 8/8 + exact set parity with the engine. Indonesian (also has compounding rules, `mrule1`/`mrule2`) — + unaffected, 121/121 unchanged, verify correctly prunes the loop's proposals since the corpus + needs no compounds. Full detail: `docs/FST_FULL_GRAMMAR_PLAN.md` Phase G2. +- **No generator for clitics (`MorphOp.Clitic`) or process/simulfix (`MorphOp.Process`, + `ModifyFromInput`).** Both fall into `FstTemplateAnalyzer`'s default "uncovered op" bucket with no + sibling `IConstructProposer` picking them up (unlike Infix/Reduplication, which do have one) — a + grammar using either construct routes those words to the engine. Not attempted (no evidence any + test grammar, Sena, or Indonesian uses clitics; process/simulfix is rarer still in practice). +- **MPR features, allomorph environments, and stem names are not build-time-gated in + `FstTemplateAnalyzer`** — it builds arcs for every allomorph regardless of `RequiredMprFeatures`/ + `ExcludedMprFeatures`/`AllomorphEnvironment`/`StemName` constraints. Sound (verify re-runs real HC + analysis, which does check these), but a precision gap: some fraction of proposed candidates are + verified and rejected that a build-time check could have pruned for free. Not fixed — flagged by + the Phase 4 construct sweep, not previously documented. +- **`PhonologyRuleCompiler` v1 scope (Phase 3, see the STATUS block above for full detail):** + left-environment support landed (this session, symmetric to the pre-existing right-environment + chain). Still no multi-segment Lhs (N>1), no length-changing substitution (Rhs length must be 0 or + 1), no epenthesis/metathesis/α-variable handling, and — the big one — no true multi-rule cascade + composition (each rule's arcs are independent branches from state 0; genuinely interacting rules + like Indonesian's `meN-` assimilation+deletion are not composed together, so they stay unsupported + even though each half might individually fit the supported shape). Confirmed via + `Benchmark_CompositeVsSearch` on the full real Indonesian corpus: 93/121 with vs. without + left-environment support — zero effective coverage gain there, but (see next bullet) this + particular grammar can't exercise the mechanism at all yet regardless of left/right environment + support. `ComposedPhonologyProposer`/`ForwardSynthesisProposer` remain in place and are still doing + the real work for anything beyond simple single-rule cases. +- **`PhonologyRuleCompiler` cannot compile ANY rule whose environment contains a `BoundaryMarker` + (newly found this session, blocks 3 of Indonesian's 5 real phonological rules).** `_alphabet` is + built as `table.Where(cd => cd.Type == HCFeatureSystem.Segment)`, which excludes boundary-type + character definitions (`AddBoundary` tags them `HCFeatureSystem.Boundary`). `BuildProbeString` + searches only `_alphabet` for a representative segment per environment constraint, so a + `BoundaryMarker` constraint never finds one and the whole subrule is marked unsupported — before + its Lhs/Rhs shape is even checked. Measured: Indonesian's `PhonologyRuleCompiler.Compile` reports + **5/5 subrules unsupported, 0 compiled**, confirmed with a throwaway diagnostic against + `indonesian-hc.xml`. This means the Phase-3 mechanism has never actually fired on real Indonesian + data at any point in this branch's history — every "identical coverage with vs. without the + proposer" measurement (this session's and earlier ones) was comparing the bare FST against itself, + not against a working phonology compiler. Fix direction: make the probe machinery insert an actual + boundary annotation into the probe string for a `BoundaryMarker` constraint instead of searching + `_alphabet` for a representative — this is a strictly higher-priority prerequisite than + α-variable expansion or cascade composition, since those only matter once a rule can compile at + all. +- **`SurfacePhonology.SurfaceNodes`/`NodeCount` do not filter `IsDeleted()` segments** (discovered + while debugging the Phase 3 compiler's identical bug, fixed there but NOT here — out of scope for + this pass). If a stratum's synthesis cascade includes a deletion rule, these methods may overcount + the resulting surface — unverified whether this actually affects any real grammar's precompiled + affix-arc coverage today, but worth checking before trusting `SurfacePhonology` output on a + deletion-heavy grammar. +- **Cross-root character-level trie merging not done (Phase 2 scope decision).** Phase 2 shipped + per-root chain sharing across attachment SITES (one segment chain per root, fanned out to every + template/bare/template-less site via epsilon arcs — eliminates the roots×sites duplication, which + was the measured dominant cost: Sena states 50,673 → 20,737, ~59% reduction). It did NOT implement + true prefix merging ACROSS DIFFERENT roots (e.g. "abc"/"abd" sharing an "ab" arc), which would + require a safe equality key for FeatureStruct-labeled arcs (FeatureStruct has no + Equals/GetHashCode override — only ValueEquals — so this needs a proxy key, e.g. per-segment string + representation via `CharacterDefinitionTable.GetMatchingStrReps`, plus per-root token-states hung + off shared trie-leaf nodes so homographs don't collide on one `_tokenOnEntry` slot). Left for a + follow-up given the correctness bar on this hot path; would matter most on large lexicons with + heavy shared-prefix structure, not on Sena/Indonesian-sized grammars. +- **`EpsilonClosure`'s internal buffers are not pooled** (`result`/`seen`/`stack` are freshly + allocated per call, i.e. per segment per word) — the `Key`/`PKey`/`emitted` string-allocation cost + Phase 2 fixed (now struct-keyed) was the dominant hot-loop allocator per the original audit; this + remaining allocation is smaller but still there. Follow-up: thread reusable scratch collections + through `AnalyzeShape`/`AnalyzeComposed`/`EpsilonClosure`/`ComposedClosure` (all single-threaded + within one call, so no pooling/concurrency hazard — just needs the method signatures threaded + through carefully). +- **Indonesian `mengamat-amati` (1 word of 121) — a suffix stacked outside the reduplication.** + Traced structure: `meng+amat` → `-Cont` → `mengamat-amat` → `-i`(LOC) → `mengamat-amati`; the + copy portion surfaces as `amati` = copy + suffix, which is not a plain tail of the base, so the + separator scan correctly doesn't fire. **Fix now speced** (suffix-peel inside the separator scan, + ~30 lines): `docs/FST_FULL_GRAMMAR_PLAN.md` Phase G1. Deferred on 2026-07-02; 120/121 achieved + without it. +- **Phase C introduced a ~85× build-time regression on Sena (measured 2026-07-03): 9.3 s total + build, of which the trie itself is 105 ms.** Attribution: `SurfacePhonology.DeletionJunctions` + is un-memoized (unlike `Variants`) and is called per allomorph × 26 derivation-layer builds × + depth 2; each call costs ~30 ms on Sena because the alphabet² two-neighbor fallback runs to + exhaustion for EVERY candidate on a grammar with 0 phonological rules (nothing can ever delete, + so the single-neighbor probe never succeeds). Indonesian is too small to notice. Fix speced + (memoize + capability-gate + stop double-building the FST in the composite path): + `docs/FST_FULL_GRAMMAR_PLAN.md` Phase H — expected result ~0.3–0.5 s. +- **`PhonologyRuleCompiler`'s boundary-representative gap (`_alphabet` excludes boundary-type + characters) is now MOOT for Indonesian specifically, but still real for other grammars.** The + `meN-` coverage fix (`docs/FST_FULL_GRAMMAR_PLAN.md` Phase C) went through `SurfacePhonology` + + `FstTemplateAnalyzer` entirely — nothing routes through `PhonologyRuleCompiler`/`InversePhonology` + for `meN-` anymore, so this bug no longer blocks Indonesian. It was NOT fixed in + `PhonologyRuleCompiler.cs` itself; a grammar that genuinely needs the lockstep-phonology mechanism + (word-internal interacting rules, not junction-conditioned ones) would still hit it. +- (add entries as discovered — every gap must be listed, none silent) diff --git a/docs/FST_FULL_GRAMMAR_PLAN.md b/docs/FST_FULL_GRAMMAR_PLAN.md new file mode 100644 index 000000000..b455d2d70 --- /dev/null +++ b/docs/FST_FULL_GRAMMAR_PLAN.md @@ -0,0 +1,764 @@ +# FST full-grammar coverage plan — 100% of Sena + Indonesian + +> Written 2026-07-02, after the left-environment session (commit `308f269c`) and its finding that +> **0 of Indonesian's 5 phonological rules have ever compiled** (boundary-representative gap). +> Companion to `FST_FAST_PATH_PLAN.md` (which stays the architecture reference); this doc is the +> execution plan for closing the LAST gaps on both real grammars. +> +> **STATUS (2026-07-03): Phases A, C, D, H, G1, AND G2 are ALL DONE.** Indonesian is at **121/121 +> fully covered, 0 unsound, 0 false positives** — every engine-parseable word in the corpus is +> closed. Sena's `ndikhali` (the one confirmed gap in its sampled corpus) is closed with **exact +> 8/8 set parity** — a guarded 60-word Sena slice is now 57/57 fully covered, 0 unsound. Sena's +> build-time regression is fixed (9.3 s → ~1.0–1.5 s). **Both real grammars this plan targeted are +> now fully covered on every word measured this session.** The compounding "data-model lift" +> premise (Phase E, and the matching `FST_FAST_PATH_PLAN.md` KNOWN_GAPS entry) is confirmed FALSE — +> closing it took a `FstReplay` fix, a trie compound loop, and (found only during implementation) +> extending `DerivableToCategory` to treat compounding as a category-transition edge — see Phase G2 +> for the full account of what the original spec got right and what it missed. **Remaining: only +> Phase I** (the true-FST generalization). **Phase I now has a FULL execution spec (2026-07-03, +> same day): milestones I0–I7 + optional I8, ~6–9 days, each commit-gated with its own tests and +> verification battery** — the deliberate scope change from "cover these two grammars" to "correct +> by construction for arbitrary regular HC grammars." Start at I0 and work in order; the marquee +> new capabilities (word-internal rules, long-distance harmony, deep feeding chains) each get a +> toy-grammar test that must FAIL on today's composite before the chain makes it pass. + +## Goal (definition of done) + +For BOTH grammars (`sena-hc.xml` + 7,121-word list, `indonesian-hc.xml` + 121-word list): + +1. Every **engine-parseable** word is **fully covered** — set parity per word + (`Benchmark_CompositeVsSearch`'s `SetEquals(oracle)` criterion), not just "some parse". +2. **0 unsound** (the propose-and-verify contract is untouched — verify still gates everything). +3. No construct actually used by these two grammars is silently unsupported: the probe's + diagnostics account for every rule (compiled, handled-by-peel, or engine-fallback with reason). + +The denominator is engine-parseable words: the raw lists contain loanwords, typos, and +deliberately ungrammatical meN- variants (`menaca`, `menlangit`, `memlangit`…) that the engine +itself rejects; those count as covered when the FST also (quickly) rejects them. + +## Verdict first: yes, this is reachable — and WITHOUT the "multi-week" generic cascade composer + +The two grammars' remaining gaps are narrower than the generic problem: + +- **Sena has zero phonological rules.** Its whole remaining gap is *morphotactic proposer + coverage* (copula/TAM, prefixal derivation, depth-3 derivation — `ndikhali`, and the archived + plan's `nyari`/`cawo`/`miwiri` family). No FST theory needed; the trie builder just doesn't lay + down those paths yet. +- **Indonesian's 5 phonological rules are ALL boundary-conditioned at affix junctions** (or, for + `Nasalization in reduplication`, conditioned inside a redup copy). Nothing fires word-internally + far from a morpheme join. That means the interacting `meN-` cluster (assimilation feeding + obstruent deletion, MPR gating, α-place variables) can be handled by **bounded build-time + junction probing through the REAL synthesis cascade** — baking junction surface-variants into + the trie — instead of per-rule inverse transducers + generic multi-rule composition. + +Key insight for the junction approach: the forbidden move in `PhonologyRuleCompiler`'s design notes +is probing the combined multi-rule effect **and attributing it to a single rule's branch** (that +misreads feeding/bleeding). Junction probing does NOT attribute anything per rule — it records the +junction's *total* surface↔underlying map, which is exactly the object analysis needs. HC itself +applies the cascade during the probe, so feeding/bleeding, α-variables (concrete segments — no +symbolic expansion), boundary markers (present in the probe string by construction), and MPR gating +(probe ungated → over-propose → verify rejects) all come for free. Everything stays +verify-backstopped, so a misread alignment costs a rejected candidate, never a wrong answer. + +--- + +## Phase A — measurement: exact gap lists — ✅ DONE (2026-07-02) + +1. **Indonesian**: ran `FstSenaBenchmark.Diagnose_Divergences` against the full 121-word corpus + (`HC_MAX_WORDS=121`). Result: **28 divergent words, zero compounds** — every missed analysis has + a single `RootMorphemeIndex`. Two clean buckets, exactly matching Phases C/D below: + - **21 simple meN- forms** (Phase C target): `melangit`, `melempar`, `melihat`, `memakai`, + `memasak`, `memukul`, `menanti`, `mengaca`, `mengaco`, `mengamat-amati`* , `menganga`, + `mengarang`, `mengirim`, `menikah`, `menulis`, `menyanyi`, `menyatu`, `menyewa`, `merancang`, + `merasa`, `mewakili`, `meyakini` (*`mengamat-amati` also has a `LOC` suffix stacked on). + - **7 REDUP-meN forms** (Phase D target): `membagi-bagi`, `memijit-mijit`, `meminta-minta`, + `mengayuh-ngayuh`, `menulis-nulis`, `menyewa-nyewa`, plus `mengamat-amati` above (dual-tagged: + redup + phonology both needed). + No compound ever appears in the oracle set for any of the 121 words — **Phase E is confirmed + unnecessary for Indonesian.** The census also reconfirmed the known escapes: `-Cont`/`-Pl`/ + `REDUP-meN` (all reduplication, unbounded-copy escapes) and `Nasalization in reduplication` + (unbounded left environment) — consistent with the plan's Phase D framing (that rule never needs + to compile; the peel handles it on the surface side). +2. **Sena**: did NOT re-run a second 200-word oracle sample — the known pathology (some words need + 12–90+ s unbounded search, one OOM-crashed an in-process test host in the prior session) makes + that expensive to redo safely, and the existing 99.2%-of-engine-parseable result already isolates + exactly one gap class. Instead, ground-truthed the known gap directly: a bounded (30 s timeout) + diagnostic ran `Morpher.AnalyzeWord("ndikhali")` and printed every analysis. Result: **8 analyses, + all of the shape `{9,1,10,5}+é+ser+NZR`, with `RootMorphemeIndex` alternating between 1 (`é`) and + 2 (`ser`)** — i.e. `ndikhali` = `ndi` ("é", root, PoS `pos69519`) compounded with `khal` ("ser", + root, PoS `pos87418`) via Sena's real `CompoundingRule` (`mrule7`/`mrule8`, confirmed in + `sena-hc.xml`: `mrule8` has `headPartsOfSpeech="pos69519"`, `nonHeadPartsOfSpeech="...pos87418..."`), + THEN the `-i` NZR suffix (`mrule9`) attaches to the compound's output PoS (`pos80535`). The + leading `{9,1,10,5}` morpheme is a null-surface noun-class agreement marker (class 9/10 nouns + in Sena take a zero prefix) — it contributes 0 phonetic content, which is why `ndi+khal+i` alone + spells the 8-letter surface `ndikhali` exactly. + **This corrects the archived plan's guess** ("prefixal derivation layer" would close it) — it is + a genuine TWO-ROOT compound, not a single-root derivational prefix. Closing it for real needs the + Phase E `WordAnalysis.RootMorphemeIndex` multi-root lift, not a trie-builder tweak. See Phase B + below for the resulting scope call. + +Exit: gap tables above. Everything after this phase is sized by real data — and Phase C/D (28 +Indonesian words, ~23% of that corpus) is unambiguously the higher-value target vs. Sena's 1-word +gap (~0.014% of its corpus) that would require the biggest, most cross-cutting change in this plan. + +## Phase B — Sena morphotactic closure — ⚠ INVESTIGATED, DEFERRED (not a small fix after all) + +Phase A's ground-truthing found `ndikhali`'s gap is NOT a missing prefixal-derivation layer (the +archived plan's guess, made without ever running the engine on this word) — it is a genuine +two-root compound (`é` ⊕ `ser`, via Sena's real `CompoundingRule`), so closing it requires the same +`WordAnalysis.RootMorphemeIndex` single-scalar lift Phase E already scopes for Indonesian +compounding (extending `WordAnalysis`/`MorphToken` to carry multiple root positions, a compounding +candidate generator, `FstReplay.Confirm` pinning two roots — cross-cutting across `FstReplay`, +`FstVerification`, `CompositeProposer`, and every `Sig`-style function). That is Phase-E-sized work +to close ONE word in a 7,121-word corpus already at 99.2% of engine-parseable coverage (120/121 on +the previously-measured sample) — disproportionate next to Phase C/D's 28-word, ~23%-of-corpus win +on Indonesian. **Decision: defer, same as the original plan's Phase 4.3 compounding call** — do +Phase C and D first (they need zero data-model changes), then revisit Sena's `ndikhali` only if +Phase E ends up being built anyway for some other reason. If Phase E is never built, this stays a +documented, understood residual (unlike the archived plan's guess, its actual cause is now on +record) — update `KNOWN_GAPS` accordingly rather than leaving the stale "prefixal derivation" theory +in place. + +## Phase C — Indonesian: junction-variant compilation (the core piece) — ✅ DONE (2026-07-02) + +**What actually shipped is simpler than the original design below** (kept for the record — the +"full window + re-implement the cascade" plan was NOT needed): `FstTemplateAnalyzer` already had a +`SurfacePhonology`-precompiled surface-variant mechanism for every affix (`Variants(underlying)`, +probing one neighbor segment on each side and reading back the morpheme's own portion when the +result is length-preserving). Investigation found that mechanism ALREADY discovers the correct +`mem`/`men`/`meng`/`meny` assimilated-nasal prefix variants for free — it only needed a probe with a +non-deleting representative of each place class (e.g. voiced `b`/`d`/`g`) to "unlock" the variant, +and Indonesian's grammar always has one. Two real gaps remained, both fixed with much smaller, +targeted changes: + +1. **`SurfacePhonology`'s deleted-node rendering bug** (`SurfaceOf`/`AddBoundaryVariant`): HC marks + a deletion via `ShapeNode.IsDeleted()` rather than removing the node (confirmed via code read of + `NarrowSynthesisRewriteSubruleSpec.cs` — the node stays in the `Shape`'s linked list, same + position, same original `FeatureStruct`, just flagged), so the OLD rendering loop still printed + the pre-deletion segment's own representation instead of nothing. Fix: a shared `RenderNodes` + helper that skips `IsDeleted()` nodes when building the surface string. This alone closed the + **nasal-deletion-before-sonorant** case (`Nasal deletion`, prule2) — `melangit`, `melempar`, + `melihat`, `menanti`, `mengaco`, `menganga`, `menikah`, `menyanyi`, `merancang`, `merasa`, + `mewakili`, `meyakini` (12 words) — with ZERO new mechanism, just the rendering fix. +2. **New `SurfacePhonology.DeletionJunctions(underlying)`**: for the remaining case — the cascade + deleting the NEIGHBOR itself (assimilation feeding `Voiceless obstruent deletion`, prule4+prule5) + — probes each alphabet representative as a right neighbor (falling back to a SECOND trailing + neighbor when the first alone doesn't trigger deletion, since `Voiceless obstruent deletion`'s own + `RightEnvironment` needs a vowel *beyond* the deleted segment — the exact shape that broke the + first, single-neighbor-only version of this method during testing) and returns + `(affixSurface, deletedNeighborFeatureStruct)` pairs. `FstTemplateAnalyzer` gained **root-chain + checkpoints** (`_rootCheckpoints`, `RootChainAfterSkip`) — states reached after consuming 0, 1, 2… + of a root's own leading segments — so a junction-deletion outcome can be wired to "skip the root's + deleted onset" via a build-time gate (`WireDeletionSkips`: only for roots whose own leading + segment unifies with the recorded class — never a blind skip). This closed `memukul`, `mengaca`, + `mengarang`, `mengirim`, `menulis`, `menyatu`, `menyewa`, `memakai` (8 words). + +No window-size computation, no re-implemented cascade, no Pinv/lockstep involvement, and no +`roots × affixes` cost: both mechanisms are bounded by `|junction affixes| × alphabet` (or × +alphabet² for the two-neighbor fallback) — a few hundred probes total, independent of lexicon size. + +**Measured result** (`Benchmark_CompositeVsSearch`, full 121-word Indonesian corpus): **114/121 +fully covered** (up from 93/121 pre-Phase-C), **0 unsound**, **0 false positives** +(`Soundness_NegativeExamples`, 50/50 clean). The only 7 remaining gaps are ALL `REDUP-meN` +reduplicated forms (`membagi-bagi`, `memijit-mijit`, `meminta-minta`, `mengamat-amati`, +`mengayuh-ngayuh`, `menulis-nulis`, `menyewa-nyewa`) — exactly Phase D's target, nothing left over +for non-reduplicated words. Full 118-test HermitCrab suite green (was 116; +2 new toy-grammar +tests), CSharpier clean. Sena unaffected by construction (0 phonological rules ⇒ +`DeletionJunctions` always returns empty there — not re-measured this session, see Phase A note on +the cost of a full Sena oracle re-run). + +**Tests**: `SurfacePhonologyJunctionTests.cs` (new) — a toy grammar with a boundary-abutting prefix +(`m+`) and a `RewriteRule` requiring BOTH a left-boundary AND a right-context vowel beyond the +deleted segment (deliberately exercising the two-neighbor fallback): +`Junction_RecoversRootOnsetDeletion_RequiringTwoSegmentProbe` (positive: `FstTemplateAnalyzer`, +`VerifiedFstAnalyzer`, and the real engine all agree; a non-word yields nothing) and +`Junction_DoesNotSkip_WhenRootOnsetIsNotTheDeletedClass` (soundness: a root starting with a +different, non-deleting class must never get the skip arc — verified by checking the "wrong" skip +target is NOT recoverable, not just that the right one is). + +**Original design (superseded by the simpler mechanism above — kept for context on what was +considered and why it wasn't necessary):** build, for each junction-bearing affix allomorph and each +candidate root onset in the alphabet plus one representative following segment, an explicit +underlying window (`affix-tail + boundary + onset + context`), run the full phonological cascade via +`CompileSynthesisRule` reused across the whole rule list, and emit junction arcs from the recorded +surface↔underlying alignment. The actual mechanism reuses the EXISTING per-affix `Variants` +precompile for the substitution-only outcomes (assimilation, default-nasal) and only adds new +machinery (`DeletionJunctions` + root-chain checkpoints) for the one case that mechanism structurally +cannot express (a NEIGHBOR disappearing) — smaller surface area, less new code, same soundness +guarantees. + +## Phase D — reduplication × phonology (the `-X-X` forms) — ✅ DONE, 6/7 (2026-07-02) + +Corpus words: `membagi-bagi`, `meminta-minta`, `memijit-mijit`, `mengamat-amati`, +`mengayuh-ngayuh`, `menulis-nulis`, `menyewa-nyewa`. + +**The construct is `-Cont` (mrule13), not `REDUP-meN` (mrule15, glossed RECIP — unused by any of +these words)** — confirmed by tracing the real engine's analysis (`AV+write+Cont`, `AV+divide+Cont`, +…), a plan-writing-time misreading corrected during execution. **`-Cont` is also glossed `Cont`, +matching the divergence table** (`FstSenaBenchmark.Diagnose_Divergences` labels each missed +analysis by its morpheme glosses, which is what surfaced this). + +**What actually shipped**, via a bounded-cost extension to the EXISTING `ReduplicationProposer` +(no new proposer class): confirmed via a custom `ITraceManager` logging every `MorphologicalRuleUnapplied` +step that `-Cont` produces `[meN-word] + "-" + [nasal+stem, WITHOUT the literal "me" text]` — e.g. +`menulis-nulis`, where `nulis` is exactly `menulis`'s own trailing 5 characters. This is NOT +"copy the whole prefixed word" (the `-` + full copy the plan originally guessed) — it is a genuine +**TAIL copy separated by a literal character**, one shape narrower than `ReduplicationProposer` +already handled (adjacent, no separator, either full-word or tail-vs-tail). Added a third scan to +`ReduplicationProposer.AnalyzeWord`: for every position `sepPos`, treat `word[sepPos]` as a literal +separator and check whether everything after it is a genuine surface tail of everything before it +(`before.EndsWith(copy)`); on a match, recurse the residual (`before`) through the existing FST +proposer and wrap with the redup morpheme, exactly like the other two scans. No new mechanism, no +window/separator-character enumeration needed — the scan is separator-CHARACTER-agnostic (it +doesn't need to know `-` is special; a wrong guess is pruned by verify like any other candidate +here), which is why it needed no new field or grammar introspection. + +**`Nasalization in reduplication` (prule3 — unbounded `OptionalSegmentSequence` + α-vars, the one +rule that can never fit any bounded compiler) never needed to compile**, confirmed: it only fires +inside redup copies, which the surface-level tail-copy scan matches without any phonology-aware +machinery at all. + +**Measured result**: 6 of 7 corpus words fixed — `membagi-bagi`, `memijit-mijit`, `meminta-minta`, +`mengayuh-ngayuh`, `menulis-nulis`, `menyewa-nyewa`. Indonesian composite coverage: **114/121 → +120/121**, still 0 unsound, 0 false positives (`Soundness_NegativeExamples` unchanged, 50/50 clean). + +**Residual: `mengamat-amati` (1 word, NOT fixed).** Traced separately: `me(ng)+amat+-amat+i` — the +`-i` (LOC) suffix attaches to ONLY the reduplicated copy (`amat+i` = `amati`), not to the whole +word. `"amati"` is NOT a tail of `"mengamat"` (last 5 chars are `gamat`, not `amati`), so the +tail-copy scan correctly does not fire on it — this is a materially different shape (an affix +stacked onto just the copy) that the current scan does not attempt. Closing it would need either +(a) trying "strip a known suffix surface off the copy, then tail-match the remainder" — real new +mechanism, grammar-introspection-dependent, not just a scan-shape extension — or (b) a multi-group +`Lhs` pattern reconstruction of `-Cont`/`-i`'s real interaction, which is exactly the kind of +unvalidated-pattern-API territory Phase 4's own CV-reduplication work already declined to attempt +under time pressure (no test in this repo builds a multi-group `Pattern`). Documented as a known +residual (added to `KNOWN_GAPS`) rather than pursued further — one word out of 121, against a +120/121 result, did not justify the added mechanism's risk/complexity for this session. + +**Tests**: `VerifiedFstAnalyzerTests.Composite_CoversSeparatorReduplication_WhereFstAloneMisses` +(toy grammar: a full copy with a literal separator, `sagzsag`; soundness check that a tail-copy +candidate — `sagzag`, which passes the surface-shape scan but isn't what this toy rule's FULL-copy +semantics actually produce — is correctly rejected by verify). A toy grammar exercising the REAL +partial-tail shape (requiring a multi-group `Lhs` pattern) was not built, same call as Phase 4's +CV-reduplication case; the full Indonesian corpus benchmark is the positive evidence for that shape. + +**Gate**: 6/7 engine-parseable redup corpus words fully covered, 0 unsound. `mengamat-amati` is a +documented residual, not a silent gap. Committed. + +## Phase E — ❌ CANCELLED (2026-07-03): the premise was falsified by a code re-read + +This phase scoped a "cross-cutting `WordAnalysis.RootMorphemeIndex` data-model lift" for +compounding. A direct re-read of `MorphToken.cs` and `FstReplay.cs` on 2026-07-03 showed the +data model ALREADY supports compounds (`MorphOp.Compound` exists; the engine emits two-root +`WordAnalysis` objects today — the `ndikhali` diagnostic printed them) and the only real blocker +is ~6 lines in `FstReplay.Confirm`. **See Phase G2 below for the actual spec.** Kept here so the +original (wrong) reasoning stays on record. + +## Phase F — hardening + final gates — folded into Phases H and I below + +- The **frontier beam cap** moves into Phase I (it belongs with the walker generalization). +- Final-numbers reporting is now the standing "stats battery" requirement in the execution specs. +- `FST_FAST_PATH_PLAN.md` STATUS + KNOWN_GAPS updates: partially done 2026-07-02/03 (boundary-gap + moot-for-Indonesian note, compounding-premise correction, `mengamat-amati` entry); keep + maintaining as G/H/I land. + +--- + +# EXECUTION SPECS FOR THE NEXT SESSION (written 2026-07-03, for Sonnet) + +Everything below is speced from a direct code re-read on 2026-07-03 (file/member references +verified that day). Work each phase to green (full suite + the phase's own gates) and commit +before starting the next. **Always report the stats battery with every result** (this is a +standing requirement from John, not optional): FST `StateCount`, build wall-time (note JIT-cold +vs warm — run the build twice in-process and report the second), and verified-walk p50/p95 ms/word. + +## Current measured baseline + +**Pre-Phase-H (2026-07-03, before H1/H2, this machine, Debug build, warm where noted):** + +| | Indonesian | Sena | +|---|---|---| +| FST states (bare, morpher ctor) | 532 | 20,737 | +| FST states (trie-only, no-morpher ctor) | — | 15,901 | +| Bare FST build | 682 ms (JIT-cold; mostly JIT) | 9,281 ms cold / 8,920 ms warm | +| Grammar load (XML) | — | 245 ms | +| GenerateWords loop (1,463 allomorph calls) | — | ~175 ms | +| Trie-only build (no probing) | — | **105 ms** | +| `Variants` × 25 distinct affixes (memoized) | — | 47 ms | +| `DeletionJunctions` × 25 distinct affixes, ONCE each | — | **746 ms** | +| Verified-composite walk p50 / p95 / p99 | 1.8 / 14.7 / 21.6 ms | 49.8 / 288 / 893 ms (first 150 words) | +| Coverage (set parity vs oracle) | **120/121, 0 unsound** | 58/60 slice; 99.2% of engine-parseable (200-sample) | + +**Post-Phase-H (after H1+H2 landed — see Phase H status for the state-count note):** + +| | Indonesian | Sena | +|---|---|---| +| FST states (bare, morpher ctor) | 532 (unchanged) | **16,322** (was 20,737 — see Phase H) | +| Bare FST build | 266 ms | **~1.0–1.1 s** (cold and warm alike; was 8.9–9.3 s) | +| Coverage (set parity vs oracle) | **120/121, 0 unsound** (unchanged) | 55/57 guarded slice (60 words, 5s/word cap, 3 excluded), 0 unsound | + +**Post-Phase-G1+G2 (2026-07-03, final this session):** + +| | Indonesian | Sena | +|---|---|---| +| FST states (bare, morpher ctor) | 533 (+1, compound-loop join state) | 16,347 (+25 vs. post-H) | +| Bare FST build | ~433 ms | ~1.3–1.5 s | +| Coverage (set parity vs oracle) | **121/121, 0 unsound, 0 false positives** | 57/57 guarded slice (60 words, 5s/word cap, 3 excluded), 0 unsound; **`ndikhali` 8/8 exact set parity** | + +## Phase H — ✅ DONE (2026-07-03): Sena build time 9.3 s → ~1.0–1.1 s + +**H1 (memoize `DeletionJunctions`) and H2 (capability-gate `Variants`/`DeletionJunctions` on +`_anyPhonologicalRules`/`_anyDeletionSubrule`) landed together** in `SurfacePhonology.cs` — same +pattern as speced below, both in one pass since they touch the same lines. **Measured: Sena build +9.3 s → 1.0–1.1 s (cold and warm alike), Indonesian unaffected (266 ms, has real deletion subrules +so its gates stay open).** This is short of the ~0.3–0.5 s originally estimated; the remaining +~1 s is trie construction (105 ms measured standalone) plus `GenerateWords` (175 ms) plus JIT/other +overhead not isolated further — good enough that Phase H's practical goal (fast edit-loop +iteration) is met, and further squeezing wasn't pursued. + +**A real, unexplained side effect: Sena's `StateCount` dropped from 20,737 (Phase C/D's own +number, measured 2026-07-02) to 16,322 after H1+H2 — not identical, as this doc's gate below +originally demanded.** Investigated rather than dismissed: the gate's own reasoning predicts +IDENTICAL variant sets before/after (a 0-phonological-rule grammar's un-gated `ComputeVariants` +should already degenerate to `{underlying}` only, since an empty rule cascade changes nothing — +verified by hand-tracing `AddBoundaryVariant`'s behavior with a no-op cascade). The most likely +explanation not fully confirmed: some affix's underlying string, round-tripped through +`_table.Segment` + `GetMatchingStrReps` under the OLD (un-gated) path, produced a +string-identical-but-FeatureStruct-distinct "variant" that `BuildAffixArcs`' dedup-by-string-value +check (`if (variant == underlying) continue`) does NOT catch (it dedups by the RENDERED STRING, +not by the resulting FeatureStruct sequence), building a redundant-but-distinct arc chain. H2's +gate short-circuits before that round-trip ever happens, removing the redundant states. **This +was NOT chased to a certain root cause** (would need instrumenting `BuildAffixArcs`), because the +gates that actually matter — coverage and soundness — were reverified directly and are unaffected: +Indonesian `Benchmark_CompositeVsSearch` **120/121, 0 unsound, identical to before**; a per-word- +timeout-guarded Sena coverage check (first 60 words, 5 s/word cap, full random-corpus oracle +comparison is the known-hazardous one) showed **55/57 fully covered (3 timed out, excluded), 0 +unsound** — consistent with the known single-gap pattern, no regression signature. Full 119-test +suite green throughout. Treat "StateCount decreased, unexplained but coverage/soundness verified +unaffected" as the honest status — a future session touching `BuildAffixArcs`'s dedup should +resolve this fully rather than re-litigate it. + +**H3 (stop building the FST twice in the composite path) — turned out not to be a real bug; +struck.** The plan's evidence for H3 ("bare FST build 8.7 s + composite build 9.8 s back-to-back") +came from the DIAGNOSTIC SCRIPT that produced that measurement, which itself constructed +`new FstTemplateAnalyzer(language, morpher)` twice (once standalone, once inline as an argument to +`CompositeProposer.ForLanguage`) — an artifact of the measurement code, not the library. Checked +the actual call sites: `FstCoverageProbe.ForLanguage` builds ONE `FstTemplateAnalyzer` and passes +it to `CompositeProposer`'s instance constructor (not `.ForLanguage`), sharing it correctly. +`CompositeProposer.ForLanguage(language, fst, ...)` itself takes an already-built `fst` and never +constructs another. The only place two independent (real, morpher-based) FSTs get built is +`FstSenaBenchmark.Benchmark_CompositeVsSearch`'s OWN comparison code (`bare` vs `composite` +deliberately use separate instances to compare them) — and now that H1+H2 make a build ~1 s, that +duplication costs ~1 s of benchmark time, not worth touching. `LockstepPhonologyProposer` builds +a SEPARATE, but cheap (~105 ms, no-morpher/no-probing ctor), internal `FstTemplateAnalyzer` — a +minor, harmless redundancy, not the reported 8-9 s. No code change made for H3. + +**Verification gates actually run:** +- `dotnet test --filter "TestCategory!=Explicit"` → 119/119 green; CSharpier clean. +- Indonesian `Benchmark_CompositeVsSearch` (`HC_MAX_WORDS=121`): **120/121 fully covered, 0 + unsound, 0 false positives** — identical to pre-H. +- Sena: per-word-timeout-guarded coverage check (60 sequential words, 5 s cap) — 55/57 fully + covered (3 excluded on timeout, a known pre-existing hazard unrelated to this change), 0 + unsound. Full unbounded `Benchmark_CompositeVsSearch` on Sena still hangs on pathological words + regardless of this session's changes (same as every prior session — not attempted further). +- `StateCount`: Indonesian identical (532); Sena dropped 20,737 → 16,322 (see above — investigated, + not fully root-caused, coverage/soundness confirmed unaffected by two independent checks). + +## Phase G1 — ✅ DONE (2026-07-03): `mengamat-amati` closed, Indonesian now 121/121 + +Implemented exactly as speced below (`ReduplicationProposer.cs`): collected suffix surface texts +in the constructor (boundary-stripped via `HCFeatureSystem.Segment`-only rendering, catching +Indonesian's `-i` being underlyingly `"+i"`), added the suffix-peel fallback to the separator +scan, threaded an optional `extraSuffix` parameter through `ProposeForResidual`. **Measured: +Indonesian `Benchmark_CompositeVsSearch` — 121/121 fully covered (was 120/121), 0 unsound, 0 +false positives; `Diagnose_Divergences` — zero divergent words.** New toy test +(`Composite_CoversSuffixStackedOutsideReduplication_WhereSeparatorScanAloneMisses` in +`VerifiedFstAnalyzerTests.cs`) passed on the first run — the real engine happily stacked a plain +suffix rule on top of the toy reduplication rule with no PoS-gating adjustment needed (both rules' +`RequiredSyntacticFeatureStruct`/`OutSyntacticFeatureStruct` were `V`→`V`, and the stratum's +default `MorphologicalRuleOrder.Unordered` let HC try the stack). Full 120-test suite green +(was 119; +1). No regression on the toy-grammar suite or Indonesian's existing coverage. + +Ground truth (traced 2026-07-02 with a logging `ITraceManager`): the engine's analysis is +`AV+observe+Cont+LOC`, i.e. `-i` (LOC) suffixes the WHOLE reduplicated word: +`meng+amat` → `-Cont` → `mengamat-amat` → `-i` → `mengamat-amati`. The current separator scan +splits at `-` into `before="mengamat"`, `copy="amati"`, and `"amati"` is not a tail of +`"mengamat"` — correctly no match. The fix is to peel known suffix surfaces off the END of the +copy before tail-matching (this closes the whole class "any suffix stacked outside the +reduplication", not just this word): + +1. In `ReduplicationProposer`'s constructor, alongside `_redupRules`, collect suffix surface + strings: for every stratum's `MorphemicMorphologicalRule` whose allomorph classifies as + `MorphOp.Suffix` (`MorphTokenCodec.ClassifyOp(allomorph, false)`), take the allomorph's + `InsertSegments.Segments.Representation`, segment it via the surface stratum's + `CharacterDefinitionTable.Segment(...)`, keep only `HCFeatureSystem.Segment`-type nodes, and + render their string reps (`GetMatchingStrReps(node).First()`). **This boundary-stripping step + is required**: Indonesian's `-i` inserts `"+i"` (the `+` is boundary `char30`), and the raw + representation would never match surface text. Store `(string SurfaceText, IMorpheme Rule)` + pairs; skip empty results. +2. In the separator scan (third loop of `AnalyzeWord`), when the plain + `before.EndsWith(copy)` check fails, additionally try each collected suffix pair: if + `copy.EndsWith(s.SurfaceText)` and the remainder `copy[..^s.SurfaceText.Length]` is non-empty + and IS a tail of `before`, then for each analysis from `ProposeForResidual(before)`, emit a + variant with the suffix morpheme appended AFTER the redup morpheme (engine order: + `…root…, Cont, LOC` — redup first, then the outer suffix). Easiest shape: give + `ProposeForResidual` an optional `IMorpheme extraSuffix` parameter appended after the redup + wrap; `RootMorphemeIndex` is unchanged (both additions are after the root). +3. Do NOT recurse suffix-peeling (one suffix layer is what the corpus needs; unbounded stacking + here would be scan-cost without evidence). Note the single-layer bound in the class remarks. + +**Tests + gates:** +- Extend the toy grammar in `Composite_CoversSeparatorReduplication_WhereFstAloneMisses` (or add a + sibling test): add a plain suffix rule (e.g. Table1 `"s"`), assert the engine parses + `sagzsags` (= CONT(`sag`) + suffix; confirm the toy engine really produces this before asserting + — if HC's rule ordering rejects suffix-after-redup in the toy setup, adjust the toy PoS gating + until the ENGINE parses it, then assert parity), assert the composite covers it, and assert a + soundness negative (e.g. `sagzdats`) stays empty. +- Indonesian `Benchmark_CompositeVsSearch` (`HC_MAX_WORDS=121`): **121/121 fully covered, 0 + unsound** — this is the phase gate and the whole point. +- Full suite green, CSharpier, stats battery (walk p50/p95 must not measurably regress — the new + scan branch only runs on words containing a separator character that already failed the plain + tail match). + +## Phase G2 — ✅ DONE (2026-07-03): `ndikhali` closed with EXACT set parity (8/8) + +**Confirmed correct: the "data-model lift" premise WAS false.** `MorphOp.Compound` already existed, +`WordAnalysis` already represented compounds, and the only hard blocker really was `FstReplay.Confirm` +— implemented exactly as speced (step 1 below). **But the spec UNDER-ESTIMATED one thing**: for +`ndikhali` specifically, a THIRD piece was needed beyond `FstReplay` + the trie loop — see "What the +spec missed" below. Implemented in `FstTemplateAnalyzer.cs`, `FstReplay.cs`. + +**What shipped, matching the spec:** +1. **`FstReplay.Confirm`**: non-head `LexEntry` morphemes go into a `HashSet extraRoots` + instead of triggering an early `return null`; `LexEntrySelector = e => e == root || + extraRoots.Contains(e)`; `RuleSelector` gains `|| (extraRoots.Count > 0 && r is CompoundingRule)`. +2. **Trie compound loop**: `BuildCompoundLoop(roots, continuation)` — one shared "join" state per + attachment site (template-less path, and each template) with an ε-arc into every root's shared + chain `Entry`; every qualifying root's chain `End` gets an ε-arc to the join (alternative to its + normal continuation) AND every root's chain `End` gets an ε-arc from the join's downstream back + to `continuation`. Bounded to one extra root (no arc back into the join). +3. **Headedness via token post-processing**: `ToWordAnalyses` (renamed from `ToWordAnalysis`, + now `IEnumerable`) scans a token array for `MorphOp.Root` positions; 0 or 1 → + the old single-candidate behavior; 2+ → one `WordAnalysis` per root position as + `RootMorphemeIndex`, same morpheme list. Both `AnalyzeShape` and `AnalyzeComposed` updated to + `AddRange` instead of `Add`. +4. Gated on `hasCompoundingRules` (any `CompoundingRule` in any stratum) — zero cost for a grammar + without one. + +**What the spec missed (found during implementation, fixed):** +- **The compound loop must be reachable even without OTHER standalone derivational rules.** The + spec's own step 2 said "add the loop" but didn't notice the loop lives inside the template-less + path's `if (_derivPrefixRules.Count > 0 || _derivSuffixRules.Count > 0)` block — a grammar with + compounding but no other standalone prefix/suffix rule (my own toy test hit exactly this) never + built the block AT ALL, so the loop silently never existed. Fixed: the guard is now + `|| hasCompoundingRules`. Both real grammars have standalone derivational rules too, so this + never manifested on Sena/Indonesian — only on a minimal toy grammar — but it would have bitten + the next grammar tried. +- **`ndikhali` needed a THIRD extension: `DerivableToCategory` must treat compounding as a + category-transition edge, not just `_derivSuffixRules`/`_derivPrefixRules`.** Root cause (found + via reflection-inspecting `_derivPrefixRules`' actual contents, then a rule-application trace): + Sena's noun-class markers (glossed `"1"`/`"9"`/`"10"`/`"5"`, e.g. `mrule56`) are NOT standalone + derivational rules — `_derivPrefixRules` came back with only 4 unrelated entries, none of them + class markers. They are class-agreement PREFIX-TEMPLATE-SLOT rules requiring `pos100407` as + their OWN input category — which is NZR's (`-i`, gloss `NZR`) OUTPUT category, which is in turn + reachable only via `[é ⊕ khal compound] → NZR`. Since a template's root-attachment gate + (`CategoryMatches || DerivableToCategory`) never considered COMPOUNDING as a way to change + category, neither `é` nor `khal` ever qualified for the class-marker template at all — the + compound loop's OWN pairing worked fine (confirmed: `é+ser+NZR` candidates without a class + prefix appeared immediately), but the template carrying the class prefix was unreachable. + Fixed by adding a `_compoundingRules` list (collected in the constructor) and extending + `DerivableToCategory`'s frontier-expansion loop with a second edge type: for each category in + the frontier, if it unifies with a compounding rule's `HeadRequiredSyntacticFeatureStruct` OR + `NonHeadRequiredSyntacticFeatureStruct` (permissively — either role, no partner-root check, same + philosophy as every other gate in this file), `OutSyntacticFeatureStruct` becomes a new frontier + node. Since the BFS already runs `_derivDepth` iterations trying any available edge at each + step, this one addition makes "compound, then derive further" chains fall out for free — no + other structural change needed. + +**Measured result**: Sena's `ndikhali` — **8/8 exact set parity, sound** (all four class markers × +both head orderings, matching the engine's own 8 analyses exactly). Guarded 60-word Sena slice: +**57/57 fully covered** (up from 55/57 pre-G2), 0 unsound. Indonesian (`HC_MAX_WORDS=121`): +**unchanged at 121/121, 0 unsound, 0 divergent words** — its compounding rules (`mrule1`/`mrule2`) +now build the loop too, but the corpus needs no compound analyses (confirmed in Phase A), so +verify correctly prunes every proposed compound; `Soundness_NegativeExamples` 0 false positives on +both grammars. Full 121-test suite green (was 120; +1). Stats: Indonesian states 532→533 (+1, the +compound-loop join state — Indonesian's template-less path already existed for other reasons, and +the loop adds exactly one join state there); Sena states 16,322→16,347 (+25, one join state per +template + the template-less path); build time Indonesian ~266ms→~433ms, Sena ~1.0–1.1s→~1.3–1.5s +— both still far below the pre-Phase-H 9.3s baseline. Walk p50/p95 not separately re-measured +this session (no regression signal in the guarded coverage run's wall-clock). + +**Tests**: `Fst_CoversCompound_ViaTheCompoundLoop` (`VerifiedFstAnalyzerTests.cs`) — a toy grammar +with an unrestricted `CompoundingRule` (no head/non-head PoS gating, matching +`CompoundingRuleTests.cs`'s existing pattern reused here) and two roots (`pat`, `tak`); asserts the +engine parses the compound, the BARE `FstTemplateAnalyzer` alone now proposes it directly (no +sibling generator needed — the mechanism lives in the trie itself, unlike reduplication/infix), and +soundness via `CompoundingRule`'s own default `MaxApplicationCount = 1`: a three-root chain +(`pattakpat`) is rejected by both the real engine and the verified FST, confirming the loop is +correctly bounded to exactly one extra root. + +**Correction note for future readers**: the "Tests + gates" bullet below calling for a +"head/non-head PoS-gated" toy grammar was written before implementation; the toy test that shipped +uses an UNGATED compounding rule instead (simpler, and the PoS-gating behavior is already exercised +for real by Indonesian's `mrule1`/`mrule2` staying silent on its own non-compound corpus, and by +`ndikhali`'s real class-agreement gating on Sena). A dedicated PoS-gated toy test was judged +redundant given those two real-grammar checks. + +## Phase I — the true-FST generalization (lazy per-rule chain) — FULL EXECUTION SPEC (2026-07-03) + +> Speced for implementation in the same style as G1/G2/H (which executed cleanly from these specs). +> This is the largest remaining item — realistically **6–9 days** across seven commit-gated +> milestones (I0–I7, below), plus an optional I8. Unlike G/H it is not driven by a failing corpus +> word: its purpose is to make the fast path correct-by-construction for **arbitrary regular HC +> grammars**, not just the two measured ones. Everything below was written against a code re-read +> of `InversePhonology.cs`, `FstTemplateAnalyzer.AnalyzeComposed`/`ComposedClosure`, +> `RewriteRule` (`Direction`, `ApplicationMode`), and `SIL.Machine.Matching`'s node inventory +> (`Constraint`/`Quantifier`/`Group`/`Alternation` — the complete set an env compiler must handle). + +### What it fixes that nothing else can + +Junction probing (Phase C) and the peels are bounded LOCAL mechanisms — exact for grammars whose +phonology fires within ~2 segments of a morpheme boundary. They structurally cannot represent: +word-internal rules far from any boundary; long-distance harmony (a suffix vowel conditioned by a +trigger several syllables back); feeding/bleeding chains deeper than the probe window. The chain +handles all of these because each rule's inverse automaton carries its own state across the whole +word. + +Theory anchor (so nobody re-litigates feasibility): SPE-style ordered rewrite rules are regular +(Kaplan & Kay 1994); lexc/xfst/HFST/foma have compiled full morphologies this way for decades. The +only provably non-regular construct is unbounded copying — which stays with the peel. The reason +eager composition exploded IN THIS CODEBASE is specific: arcs are FeatureStructs matched by +unification and cannot be determinized/minimized without destroying multi-analysis enumeration; +classical toolkits stay small because they minimize over a CONCRETE alphabet — and HC's surface +alphabet IS concrete and small (~30 chars/grammar). Lazy composition sidesteps the issue entirely: +the composed machine is **never materialized**, so state explosion is structurally impossible; the +risk moves to walk-time frontier width, which I6's beam cap bounds. + +### Governing principle: SUPERSET, NEVER SILENT SKIP + +Soundness comes from verify (`FstReplay`), so a rule's compiled inverse only needs to be a +**superset** of the true inverse relation — over-generation costs verify time, never correctness. +Every rule therefore compiles at one of three tiers, and the compiler must never claim "supported" +for something that under-generates: + +- **Exact** — environments compiled precisely (including quantified/Kleene spans, see I1); minimal + slop. The normal case. +- **Permissive** — some gating dropped (an env anchor it can't express, an MPR/syntactic-feature + gate, a direction subtlety): still a superset, just more verify traffic. The automatic fallback. +- **Identity-skip** — the rule contributes only identity arcs (today's behavior for unsupported + rules): words genuinely needing it fall to the engine. ONLY as an explicit per-rule escape hatch + when Permissive measurably blows the beam (I6) — never a silent compiler default. + +`ProbeReport` gains a per-rule tier report (rule name → Exact/Permissive/Identity-skip + reason), +replacing the bare `UnsupportedPhonologyRuleCount` integer. A grammar author must be able to see +exactly which rule is costing what. + +### I0 — data-type groundwork (small) + +1. Extend `InversePhonology.Arc` with **ε-output**: `UnderlyingOutput == null` = consume the + surface/incoming symbol, emit nothing downstream (needed for epenthesis-inverse, I3). Add + `IsEpsilonOutput`; audit the two existing consumers (`AnalyzeComposed`, `ComposedClosure`) to + reject/ignore ε-output arcs until I2 lands (they can't appear yet — the v1 compiler never emits + them — but make the assumption explicit, not accidental). +2. Each rule gets its OWN `InversePhonology` instance; the chain is + `IReadOnlyList` in **reverse application order**. Do not trust this doc for + the order — read `AnalysisLanguageRule`/`AnalysisStratumRule` and mirror exactly what the + engine's own unapplication does (strata outermost-first, each stratum's phonological rules + reversed). +3. Gates: build green, full suite green (pure additive change). + +### I1 — env-pattern→NFA compiler + Exact-tier substitution compiler + +1. New `EnvNfaCompiler` (or private to the new compiler class): recursively map a + `Pattern` to an NFA fragment of identity pass-through arcs inside the rule's + transducer. Node handling — this is the COMPLETE inventory, handle all four: + `Constraint` → one identity arc labeled with its FeatureStruct; `Quantifier` (0/1, 0/∞, 1/∞, + bounded n..m) → optional edges / self-loops / unrolled repeats; `Group` → sequence; + `Alternation` → branch-and-rejoin. **Quantified env spans are what make long-distance harmony + Exact-tier** (an "any consonants*" span is just a self-loop) — do not relegate quantifiers to + Permissive; they are cheap here. Check how word-edge anchors appear in env patterns + (`HCFeatureSystem` anchor annotations) and gate on word start/end if expressible; if awkward, + drop anchor gating → Permissive with reason "anchor". +2. New compiler (new file, e.g. `RuleInverseCompiler.cs`; leave v1 `PhonologyRuleCompiler` + untouched until I7 retirement): for each `RewriteRule` subrule, build the inverse transducer: + identity self-loops at state 0 for every alphabet segment AND boundary character (boundaries + matter from I4 on); one branch per concrete effect: enumerate alphabet segments unifying with + the Lhs constraint(s), determine each one's output **by probing the rule's own compiled + synthesis rule in isolation** (reuse v1's proven probe trick per concrete segment — do NOT + reimplement HC feature arithmetic), and add `[left-env fragment] out:in [right-env fragment]` + branches. Multi-segment Lhs = a chain of out_i:in_i arcs (probe the whole window). α-variables + in target or env: enumerate concrete alphabet bindings via unification (bounded by alphabet) — + the env↔target agreement (Indonesian-nasal-assimilation-style) falls out of enumerating + consistent concrete combos. MPR/syntactic-feature-gated subrules: compile ungated → Permissive + ("mpr-gate dropped"). +3. Tests (new `RuleInverseCompilerTests.cs`), at the TRANSDUCER level before any walker exists: + feed symbol sequences through the automaton by hand (a tiny test-local interpreter is fine), + assert accepted surface→underlying mappings and rejected ones, for: plain substitution, + left+right env, quantified env span, α-variable agreement, a 2-segment Lhs. +4. Gates: full suite green; tier report shows Indonesian's 5 rules ≥ Permissive (expected: 3–4 + Exact once boundaries land in I4; before I4 the boundary-env rules will be Permissive — note it, + don't fight it yet). + +### I2 — the chain walker + +1. Generalize `AnalyzeComposed` from one Pinv to a chain — and make the existing single-Pinv path + DELEGATE to a length-1 chain, so there is ONE walker, not two drifting copies, and every + existing lockstep test keeps guarding the new code. Config = `(int[] ruleStates, trieConfig)`; + generalize `PConfigKey` to hash the vector. +2. Per surface segment: cascade the symbol down the chain — at level i, arcs consuming the + incoming symbol (unification match); each emits one symbol to level i−1 (or nothing, ε-output, + from I3 on); level 0's emission must unify a trie arc (advance trie, accrue tokens) exactly as + today. Closure step (generalizing `ComposedClosure`): trie ε-arcs, plus PER-LEVEL ε-input arcs + — a rule at level i may spontaneously emit a symbol downward (deletion restoration, I3; + boundary insertion, I4) that cascades through levels i−1…0 to the trie. +3. Toy tests (each: engine parses it, CURRENT composite misses it — assert that baseline first — + chain covers it, a non-word stays unparsed): + - **Word-internal rule**: a rule firing inside the root, conditioned ≥3 segments away from any + morpheme boundary (junction probing provably can't see it). + - **Two-rule word-internal feeding chain**: rule A's output creates rule B's context, + mid-root. + - **The marquee general-case test — long-distance harmony**: a suffix vowel agreeing in some + feature with the FIRST root vowel across an arbitrary consonant span (quantified env). This + is the test that certifies "general", not "two languages". +4. Gates: full suite green; both real corpora unchanged (chain not yet wired into the composite — + these tests construct the chain directly); stats battery on the toy grammars (frontier sizes + printed, sanity-check the "rules sit in identity state almost everywhere" claim). + +### I3 — deletion-inverse and epenthesis-inverse + +1. Deletion (φ→∅): ε-input restoration arcs bracketed by env fragments, exactly v1's concept but + through the new compiler; **cap restorations per rule per word** (reuse the engine's own + deletion-reapplication bound as the default — find it on `Morpher`; make it a knob). An + unconditioned deletion is now compilable (the trie prunes restorations in lockstep) but respect + the cap strictly. +2. Epenthesis (∅→ψ): ε-OUTPUT arcs — consume the epenthesized surface segment, emit nothing + (this is what I0's arc extension exists for). Trivially bounded. +3. Toy tests: word-internal deletion recovered; epenthesis recovered; both with env gating; a + non-word rejected for each; cap respected (a word demanding more restorations than the cap + falls to unparsed, not a hang). + +### I4 — the boundary tape (the principled fix junction probing routed around) + +1. Trie build: stop dropping boundary nodes from root/affix chains — build boundary arcs. The + BARE walk must treat boundary-labeled arcs as free (ε) moves so its behavior is byte-identical; + the chain walk treats them as real symbols. Expect `StateCount` to GROW (each `+` in an affix + like `meⁿ+` becomes an arc+state) — measure and record the delta; the H-era state-count lesson + applies: any coverage/soundness drift is a bug, a state-count change alone is not. +2. Chain walk: a global "insert boundary" ε-move — emits a boundary symbol at the TOP of the + chain, which passes through every rule's boundary-identity self-loops (from I1.2) down to a + trie boundary arc. Only survives where the trie actually has a boundary — the same + lexicon-constrains-restoration argument as deletions. Cap insertions per word (configurable; + default generous, e.g. 8). +3. Now boundary-conditioned rules' env fragments (which reference `BoundaryMarker` FeatureStructs) + gate correctly on intermediate tapes — the v1 `_alphabet`-excludes-boundaries bug is obsolete + rather than fixed. +4. Gates: bare-walk analyses byte-identical on BOTH corpora (the risky refactor — this gate is the + whole point); then the marquee cross-check: **Indonesian with junction probing DISABLED and the + chain ENABLED must independently cover all non-redup meN- words** — proving the general + mechanism subsumes the special case rather than coexisting untested beside it. Tier report: + Indonesian's boundary-env rules move Permissive → Exact. + +### I5 — metathesis + application-semantics honesty + +1. `MetathesisRule` inverse: bounded window swap — a hold-one-symbol transducer (state remembers + the held concrete segment; ~alphabet-sized state count, fine). +2. **Self-feeding iterative rules** (`ApplicationMode == Iterative` where the output can create a + new context for the same rule): one transducer pass models one simultaneous sweep, which + under-covers self-feeding. Detect the shape (output unifies the rule's own env/target); flag it + in the tier report ("iterative-self-feeding: may under-cover"); OPTIONALLY chain the rule's + inverse twice consecutively when detected — implement only if a toy test demonstrates a real + miss, otherwise the flag + engine fallback is the honest v-next residual. +3. RTL `Direction`: the chain walks LTR regardless; for most rules this is absorbed by the + superset principle; flag RTL rules Permissive ("direction"). + +### I6 — the beam cap (closes the oldest open KNOWN_GAPS item) + +1. Max live configurations per word across `AnalyzeShape`/`AnalyzeComposed`/the chain (one shared + implementation — they're one walker after I2). Default generous (e.g. 10,000), ctor knob. + Overflow → stop that word, count it, surface via `ProbeReport.BeamOverflows` and an analyzer + property. Never throw, never hang. +2. Toy pathological test: a grammar+word engineered to explode the frontier (many Permissive-tier + rules × ambiguous unification paths); assert graceful "unparsed", bounded wall-time. +3. This is also the Identity-skip escape hatch's trigger: if a real grammar's rule blows the beam, + the tier report + per-rule skip knob is the response, recorded in diagnostics. + +### I7 — wiring, measurement, retirement by evidence + +1. `ChainPhonologyProposer` replaces `LockstepPhonologyProposer` in `CompositeProposer.ForLanguage` + and `FstCoverageProbe.ForLanguage`; chain built once per language. +2. Full battery, stats-battery reported for EVERY row (states incl. boundary delta, build ms + cold+warm, walk p50/p95 chain-on vs chain-off, coverage, unsound): both corpora must hold + 121/121 and the Sena guarded slice, 0 unsound. Walk p50 regression budget with chain on: + ≤ ~1.5×; if exceeded, ship the chain OPT-IN (composite keeps junction probing as default fast + path) and record the decision — do not silently eat a regression, do not silently drop the + chain. +3. Retirement strictly by measurement, one commit each: `ComposedPhonologyProposer` and + `ForwardSynthesisProposer` (+ its flag threading) go if the chain matches or beats them + everywhere they fire; v1 `PhonologyRuleCompiler`'s probing internals go once nothing consumes + them (keep the `InversePhonology` type — it's the chain's substrate); `LeverTwoSpikeTests`' + hand-built transducers become tests OF the new compiler (assert the compiler now GENERATES what + the spike hand-built) rather than being deleted. Junction probing (`DeletionJunctions` + skip + arcs) is retired ONLY if chain-on/probing-off matches coverage without blowing the p50 budget; + otherwise both stay (probing = precision fast path, chain = completeness backstop) — either + outcome is fine, but it must be a measured decision. +4. Docs: sweep `FST_FAST_PATH_PLAN.md` KNOWN_GAPS (boundary gap, v1 scope, beam cap, §3b — all + close), update both plans' STATUS blocks. + +### I8 (optional backlog, small, independent) — the last two uncovered MorphOps + +After I7 the fast path covers every REGULAR HC construct (all rewrite phonology incl. harmony and +feeding, metathesis, morphotactics, compounding ≤2 roots) plus peels for non-regular copying. The +only remaining `UncoveredOps` are `MorphOp.Clitic` (clitic strata compile like affix layers — a +trie-build extension, likely small) and `MorphOp.Process`/`ModifyFromInput` (simulfix = a +feature-change over stem segments — expressible as substitution-variant arcs over root chains, or +left as engine fallback). Neither is needed by any grammar in hand; spec them properly when one is. + +### What "general" still does NOT mean (honest boundary) + +Unbounded copying (full/partial reduplication) is provably non-regular and stays a peel — that is +not a limitation of this design but of finite-state mathematics; every FST toolkit ever shipped +has the same carve-out (xfst's compile-replace is a two-pass trick, not a counterexample). +Compounding stays bounded at 2 roots (the loop's bound; lift to `MaxStemCount` if a grammar needs +3). Self-feeding iterative rules may under-cover until I5's optional doubling is implemented — +flagged, never silent. And the whole edifice keeps the propose-and-verify contract: the chain +proposes, `FstReplay` confirms, so even a compiler bug costs coverage, never a wrong answer. + +Original short design notes (superseded by the spec above, kept for continuity): +1. Compile each `RewriteRule` subrule to its own small INVERSE transducer over the concrete + segment alphabet (states = position in the λ·φ·ρ window ⇒ ~5–10 states/rule; textbook + construction; replaces `PhonologyRuleCompiler`'s probing v1). +2. Generalize `AnalyzeComposed` from ONE `InversePhonology` to a CHAIN — plan §3b of + `FST_FAST_PATH_PLAN.md`, which this section supersedes in detail. +3. Keep boundary nodes as trie arcs (ε on surface, matchable by rule transducers on the + intermediate tapes). +4. Add the frontier **beam cap** (the standing Phase-F/KNOWN_GAPS item) as part of this work — + overflow ⇒ word counted unparsed, never wrong, never a hang. +5. Gates: all existing toy tests + both real corpora unchanged; new toy tests for a word-internal + rule and a two-rule feeding chain that junction probing provably cannot cover (assert the bare + composite misses them TODAY, then that the chain covers them). + +## Risks / honesty + +- **Set parity may surface analyses nobody expected** (compounds, doubled derivations) — Phase A + exists to find that before any design commitment. +- **Junction windows deeper than probed** on some future grammar — the build-time window + assertion turns that into a visible "unsupported", never a silent miss. +- G2's walk-cost note is real: the compound loop multiplies root-entry fan-in; the stats battery + after G2 decides whether PoS-gating the re-entry is needed. +- This makes the two REAL grammars fully covered; it does NOT claim 100% for arbitrary HC grammars + until Phase I exists (word-internal cascades remain the open frontier, unchanged). + +## Rough effort + +| Phase | Size | +|---|---| +| A (measure) | ✅ done | +| B (Sena morphotactics) | superseded by G2 | +| C (junction probing) | ✅ done | +| D (redup peel) | ✅ done (6/7; 7th → G1) | +| E (compounding data-model lift) | **cancelled — premise falsified, see G2** | +| F (hardening/gates) | beam cap folded into Phase I | +| H (build-time regression) | ✅ done (H1+H2; H3 struck — not a real bug) | +| G1 (suffix-peel in separator scan) | ✅ done (Indonesian now 121/121) | +| G2 (compound loop + FstReplay fix) | ✅ done (`ndikhali` 8/8 exact parity; also needed `DerivableToCategory` extension the spec missed) | +| I (lazy per-rule chain — the true FST) | **FULL EXECUTION SPEC ready (I0–I7 + optional I8), ~6–9 days across 7 commit-gated milestones** | diff --git a/docs/HERMITCRAB_FST_ADVISOR.md b/docs/HERMITCRAB_FST_ADVISOR.md new file mode 100644 index 000000000..9361a04d5 --- /dev/null +++ b/docs/HERMITCRAB_FST_ADVISOR.md @@ -0,0 +1,144 @@ +# Grammar FST Advisor — plan + +A grammar evolves; one new rule can quietly push it from the fast finite-state path into the +slow combinatorial search. This plan adds a **grammar advisor/linter** that, for any HermitCrab +`Language`, flags the rules that make parsing expensive or block FST compilation, and gives the +grammar engineer **actionable write-ups**: *why* a rule is costly, how to **constrain** it back +into fast territory, and an **alternative formulation** to try. + +It is the front-end to the FST work (`HERMITCRAB_FST_PLAN.md`): the same per-rule classification +that decides the FST tier also drives the warnings. + +## 1. What it does + +Input: a compiled `Language`. Output: a `GrammarFstReport` — a list of per-rule advisories plus +an overall **tier verdict**. Each advisory has: +- **rule name + kind** (affix / phonological / compounding / template), +- **severity**: `Escape` (breaks FST → forces search), `Cost` (inflates the search fan-out), or + `Info`, +- **issue**: one sentence on what's expensive and why, +- **advice**: "constrain it like this" and/or "try this instead". + +## 2. The classifier (what flags what) + +Detected from the object model (`AffixProcessRule.Allomorphs` → `Rhs` actions; `RewriteRule` +Lhs/Subrule environments; `MorphologicalOutputAction.PartName`; `Quantifier.Max/MinOccur`): + +| Signal | Severity | Issue | Advice | +|---|---|---|---| +| **Reduplication** — a part copied ≥2× by `CopyFromInput` | **Escape** | copying an unbounded span isn't finite-state; forces search for any word it could apply to | "If the reduplicant is a fixed size (e.g. one CV syllable), bound the copied part's length → finite-state. If only a few forms reduplicate, list them as lexical entries. Else the grammar stays in the hybrid/search tier." | +| **Infixation / stem split** — ≥2 `CopyFromInput` of *different* parts | **Escape** (unless bounded) | the stem is split at a content-determined position | "If the infix position is fixed, encode it as a bounded split; a variable split blocks FST." | +| **Process modification** — `ModifyFromInput` present | **Info/verify** | FST-able only if the modification is local/bounded | "Local feature change in a fixed context = fine; non-local/agreement = blocks FST — try a bounded reformulation." | +| **Phonological rewrite rule** present | **Info/verify** | FST-able iff its environment is a bounded window | "Bound the left/right environment to the actual window (usually 1–2 segments); unbounded context blocks FST." | +| **Deletion rule** — Lhs longer than Rhs | **Cost** | analysis must guess where deleted segments were and re-insert them (× `DeletionReapplications`) | "Keep `DeletionReapplications` as low as the language needs; bounded deletion context is still FST-able." | +| **Unbounded environment** — a `Quantifier` with infinite `MaxOccur` in an environment | **Escape** | matches an arbitrary-length span | "Replace the `+`/`*` context with the fixed window the rule really needs." | +| **Many allomorphs** on one rule (> threshold) | **Cost** | each allomorph multiplies un-application branching | "Consolidate via environment conditioning where possible." | +| Compounding rule | **Info** | bounded by `MaxStemCount`, so finite | — | + +## 3. Tier verdict (static; corpus refines it) + +- **0 Escape advisories** → **Tier 1 candidate** (fully FST-able) — confirm with the FST compile + + corpus parity check. +- **a few Escapes** → **Tier 2 candidate** (hybrid: escapes fall back to search) — run the corpus + fallback-rate measurement to confirm it's worth it vs. Tier 3. +- **pervasive Escapes** → **Tier 3** (search only). + +The static report can't compute the corpus-weighted fallback rate, so it reports the tier +*candidate* + the escape list; the FST pipeline's corpus pass (`HERMITCRAB_FST_PLAN.md` §1) +confirms it. + +## 4. The "one new rule blew up the grammar" workflow + +Run the advisor before/after a grammar change (or in CI). A new `Escape` advisory that flips the +tier (e.g. Tier 1 → Tier 2) is the warning: it names the offending rule, says it moved the whole +grammar off the fast path, and gives the constrain/alternative write-up. Grammar engineers get +"this rule made parsing slow, here's how to keep it fast" at authoring time. + +## 5. Implementation + +- `GrammarFstAdvisor.Analyze(Language) → GrammarFstReport` in the HermitCrab library (pure static + analysis of the object model; no parsing, no corpus needed). +- `GrammarFstReport.Format()` for a readable dump. +- Tests: a normal concatenative grammar → Tier 1, no escapes; add a reduplication rule → the + advisor flags it `Escape` with the reduplication write-up and downgrades the tier. +- Run on the real Sena grammar and report the advisories + tier. + +## 6. Validate on Sena + +Census already showed Sena is concatenative + no rewrite rules + no productive reduplication → +expect **Tier 1, zero escapes**, possibly a few `Cost`/`Info` notes (allomorph counts, +compounding). That both validates the classifier (no false escapes) and confirms Sena is the +fast-path case. + +## 7. Engine extension — the *regularity* axis (added, kept orthogonal to the warning) + +The advisor answers one question — **"is this slow in today's engine?"** — and the user keeps +asking exactly that ("which rule blew up the grammar", "which cases are still slow"). The +extension adds a *second, independent* question — **"does an FST exist for this in principle?"** +(regular vs non-regular) — **without letting the answer soften the slow-today warning.** + +Why the two must not be merged: the engine that turns "regular" into "fast" is the FST compiler, +and **it does not exist yet** (gated on the unbuilt spike, `HERMITCRAB_FST_PLAN.md` §7). So +"regular" today means *fast eventually, slow now*. If a vowel-harmony rule reported as +`Cost / Tier-1-reachable`, a non-expert reads "fine" — when in the only engine that ships it is +the worst case (harmony on a common segment ⇒ ~every word on the slow path). The severity must +keep telling the truth about **today**. + +So **severity is unchanged** — it means *escapes the finite-state fast path in today's engine* +(forces the combinatorial search). Harmony, infixation, and reduplication (bounded or not) all +stay `Escape`: all are slow now. We only *add* a `Regular` axis that says whether an FST could +reclaim it later, and we report it as a **separate reclaim-path line that never upgrades the +tier**. + +The theory behind the new axis is **Kaplan & Kay (1994)**: a context-sensitive rewrite rule +`φ → ψ / λ _ ρ` with regular `φ, ψ, λ, ρ`, applied obligatorily/directionally (not recursively +into its own unbounded output), **denotes a regular relation — however long `λ`/`ρ` are.** HC's +`RewriteRule` is this form, and its `Rhs` is a *bounded segment specification*, not a copy (copy +lives only in morphological `CopyFromInput`). So: + +- **Unbounded-environment rewrite (harmony/spread): `Regular = true`** — *iff* the rule's own + `Lhs`/`Rhs` are bounded (only the environment is unbounded). Reclaim later by **state-encoding** + the spreading feature (or two-level pre-image arcs). If the `Lhs`/`Rhs` themselves are unbounded + we cannot confirm regularity → `Regular = false` (conservative). Stays `Escape` (slow today). +- **Reduplication splits by boundedness of the copied part.** Look up the copied part's defining + `Lhs` pattern by name: a **length-bounded** reduplicant (fixed CV/CVC) is a finite copy → + `Regular = true` (reclaim by bounded fold). Copying an **unbounded** part (whole stem, + `Annotation(any).OneOrMore`) is the one genuinely non-regular operation (`{ww}` is not regular) + → `Regular = false`. **If the part can't be resolved, default `Regular = false` (warn).** Stays + `Escape` either way. +- **Infixation** at a pattern-defined slot: `Regular = true` (the split is a regular pattern; + reclaim by bounded fold / the per-word probe). Stays `Escape`. + +### The reclaim map (how a `Regular` case *would* be made fast — once the compiler exists) + +| Construct | `Regular` | Slow today? | Reclaim path (needs the FST compiler) | +|---|---|---|---| +| Unbounded-environment rewrite (harmony/spread) | ✅ (bounded Lhs/Rhs) | **yes** | state-encode the spreading feature / two-level pre-image arcs | +| Bounded reduplication (fixed CV reduplicant) | ✅ | **yes** | bounded fold — emit the finite copy as arcs | +| Infixation (pattern-defined slot) | ✅ | **yes** | bounded fold / per-word strip-and-reparse probe | +| Deletion | ✅ | **yes** | inverse probe — re-insert candidate deleted segments, re-parse | +| Unbounded-copy reduplication | ❌ | **yes** | per-word probe only (when surface-invariant); else search | + +`Regular` and `Probeable` (§5a) are both *paths forward*, never excuses: `Regular` = "an FST +could reclaim it (compiler pending)", `Probeable` = "a runtime strip-and-reparse is sound". The +severity and tier keep warning about today. + +### Implementation of the extension + +- Add `GrammarAdvisory.Regular` (`bool?`): true = an FST exists in principle (reclaim by + compiling), false = genuinely non-regular / unconfirmable, null = N/A. **Severity is not + changed by it.** +- Reduplication: resolve the copied part's `Lhs` pattern by name; bounded → `Regular=true`, + unbounded or unresolved → `Regular=false`. Severity stays `Escape`. +- Infixation: `Regular=true`; severity stays `Escape`; keep the per-word-probe advice. +- Unbounded-environment rewrite: `Regular = !(unbounded Lhs or Rhs)`; severity stays `Escape`; + advice = Kaplan–Kay + state-encoding, explicitly "regular in principle but slow in today's + engine". +- Report: count `RegularEscapeCount` vs `NonRegularEscapeCount`; emit a **reclaim-path line** + ("N of M escapes are FST-reclaimable once the compiler exists; all M are slow in today's + engine"). **The tier verdict is unchanged** — no "Tier 1-reachable" upgrade. +- Tests: a non-expert sanity check — a grammar whose only complex rule is harmony must still + report a slow-path warning (escape present), with `Regular=true` only as the reclaim note. + Unbounded-copy reduplication ⇒ `Regular=false`; bounded reduplicant ⇒ `Regular=true`; + infixation ⇒ `Escape` + `Regular=true` (the committed infix test keeps its severity). Sena + unchanged (Tier 1). diff --git a/docs/LEVER_2.md b/docs/LEVER_2.md new file mode 100644 index 000000000..0800e5e0b --- /dev/null +++ b/docs/LEVER_2.md @@ -0,0 +1,137 @@ +> **Scope note:** see [`FST_FAST_PATH_PLAN.md`](FST_FAST_PATH_PLAN.md) for the current, active plan +> (Phase 3 builds directly on the lockstep-composition spike below — no certification involved). + +# LEVER_2 — forward FST∘FST composition (grammar-sized, ~100% bounded morphology) + +The asymptotic fix for "stay fast as features grow." Instead of enumerating word-forms +(`ForwardSynthesisProposer`, O(language)) or un-applying phonology on a boundary-less surface +(`ComposedPhonologyProposer`, which over-generates), build the analyzer the **classical FST-morphology +way**: compile the morphotactics and the phonology each to a transducer and **compose** them into one +surface↔analysis machine. Composition is a graph-algebra operation on the *automata*, not the language — +build cost scales with the **grammar** (arcs), not the number of words, and it shares structure, so a new +affix adds arcs, not a multiplicative blow-up. + +## Why composition succeeds where the pivots failed + +- **vs. enumeration (`ForwardSynthesisProposer`)**: phonology is applied to the `meN` arc *once*, in the + network, shared across every root — grammar-sized, not language-sized. +- **vs. inverse (`ComposedPhonologyProposer`)**: the morpheme boundary `+` is an *arc in the lexicon + network*, so when the phonology transducer composes against it the boundary-conditioned rule sees the + right context. No bare-surface ambiguity — that is exactly what broke runtime inversion. + +Lever 1 (per-morpheme surface precompile) is a *local approximation* of this; Lever 2 is the exact, +global version. + +## Target pipeline — LAZY composition (no materialized `Fst.Compose`) + +Don't build `Pinv ∘ Lex` as an object. Walk the surface maintaining a frontier of configs +`(pinvState, lexState, tokens)` — an on-the-fly product automaton: + +``` + surface segment s ─▶ for each config (pinvState, lexState, tokens): + for each Pinv arc consuming s with underlying output u: (substitution / identity) + if some Lex arc at lexState has input unifying u: + advance both → (pinvState', lexState', tokens + tokenOf(lexState')) + for each Pinv ε-input arc with underlying output u: (DELETION restoration — consumes no surface) + if some Lex arc at lexState has input unifying u: + advance both, re-process s + accept where pinvState and lexState are both accepting; emit accumulated tokens +``` + +Analyze = this walk; tokens come off the traversed **lex** states exactly as the current +`FstTemplateAnalyzer` walk already does. `VerifiedFstAnalyzer` still confirms every candidate — verify is +the soundness gate, as everywhere. + +**The property this must prove:** a `Pinv` ε-arc that "restores a deleted segment" only survives if a +`Lex` arc actually has that underlying segment at that point. That lexicon constraint is *exactly* what +the runtime inverse lacked (it restored everywhere → `ⁿmeⁿnⁿpuⁿlis`). Composition prunes it because the +two machines advance in lockstep. + +## The three blockers (and resolution) + +**Blocker 1 — tokens in a side-table, not an output tape. → DISSOLVED by lazy composition.** `lexState` +is in the config, so tokens stay state-based; no token-map to recover, no output-tape hack. The walk is +a product-automaton extension of the existing `EpsilonClosure`/NFA walk. `Lex` stays the acceptor +`FstTemplateAnalyzer` already builds — use the **default ctor** (underlying-only arcs, no surface +precompile, so phonology isn't double-applied). + +**Blocker 3 — unification-arc composition. → MOOT.** We never call `Fst.Compose`; the lazy walk unifies +`Pinv` output against `Lex` input directly (same `IsUnifiable` the walk already uses). + +**Blocker 2 — HC phonology is not a transducer. → THE REAL WORK.** Build `Pinv` (surface→underlying). +HC compiles rules to `Matcher` + imperative mutation, not a transducer; no rewrite→transducer compiler +exists in-repo. Routes: + - **B-probe (reuse HC):** a bounded-context Mealy transducer built by probing HC synthesis — states + encode the last *k* segments (incl. the boundary marker); for each (context, segment) record the + surface HC produces; invert. Deletion/epenthesis = ε arcs from length change. **Risk (advisor):** HC + phonology is a multi-rule cascade with feeding/bleeding; a per-context probe only reproduces it if + the combined effect stays in the window, and deletion breaks clean underlying↔surface alignment. + *Must be validated on a two-interacting-rule case, not one rule.* + - **B-direct:** compile each `RewriteRule` to a per-rule `Fst` transducer and lazy-compose the cascade + (Kaplan–Kay). Classically safe; more work (per-rule compilation). + +## Build plan (spike-first; the spike targets DELETION) + +1. ☑ **Deletion spike (algorithm-level)** — `LeverTwoSpikeTests`: symbol-alphabet lazy composition of a + hand-built `Pinv` (with an ε-input arc restoring a deleted `t`) ⊗ a tiny lexicon. Proven: + `"sad" → [sat, -d]`; with a bare root `sad` added, **exactly** `{sat+-d, sad}` (restoration is + lexicon-constrained — no `ⁿmeⁿnⁿpuⁿlis` garbage); non-word → nothing. Targets deletion, not + substitution. +2. ☑ **Lazy-compose walk, REAL HC types** — `InversePhonology` (surface→underlying transducer with + ε-input restoration arcs) + `FstTemplateAnalyzer.AnalyzeComposed` (product walk over + `(pinvState, lexState, tokens)`; lexicon ε-arcs and Pinv ε-restorations both handled in the closure). + Proven by `LeverTwo_LazyComposition_RecoversBoundaryDeletion_RealTypes`: a `kd`-suffix whose `k` + deletes before `d` surfaces as `d`; `"sagd"` recovers `[sag, KD]` by restoring the `k` (lexicon- + constrained), sound (⊆ engine), non-word → nothing. **Blockers 1 & 3 resolved; Blocker 2's consuming + engine built and proven, including deletion.** +3. ☐ **`Pinv` compiler** — the remaining Blocker 2 work: auto-build `InversePhonology` from a grammar's + phonological rules (the spikes use a *hand-built* `Pinv`). B-probe or B-direct; **must be validated on + a two-interacting-rule cascade** (assimilation + deletion) — the advisor's gate, since feeding/bleeding + + deletion break clean alignment. This is the genuine frontier. +4. ☐ **`ComposedLexiconProposer` + measure** — wrap the walk as an opt-in `IConstructProposer`, + verify-gated; measure on Indonesian `meN-` (build should be ~grammar-sized, not the 5 s enumeration). + +## The cascade test (the decisive experiment) + +The single-rule deletion spike would "pass and lie" about **cascades** — the real `meN-` case is +assimilation **+** deletion interacting, which is what produced `ⁿmeⁿnⁿpuⁿlis`. So +`LazyComposition_RecoversOpaqueTwoRuleCascade` hand-builds a `Pinv` for a two-rule **feeding/opacity** +cascade — `N→n / _t` then `t→∅ / n_`, underlying `aN+t = "aNt" → "ant" → "an"` (the `t` that *triggered* +the assimilation then deletes; on the surface its trigger is gone — counterbleeding opacity). + +**Result: it works.** A bounded-context `Pinv` that **couples** un-assimilation (`n→N`) with deletion- +restoration (`ε→t`) *through a state* recovers the opaque `aNt` from `"an"` → `[aN, -t]`, lexicon- +constrained (exactly one analysis). So a bounded transducer **can represent the inverse of an opaque +cascade**, and lazy composition recovers it. The Lever 2 architecture is real for cascades — the thing +that defeated every prior approach. + +**Corollary for the compiler:** naive **B-probe** over *underlying* contexts would misread this — the +`t`-deletion is conditioned on the *surface* `n` that assimilation fed from `N`, not on the underlying +`N` a probe would see. So the `Pinv` compiler must compose **per-rule** transducers (**B-direct**), not +probe combined contexts. The cascade test turned that from an assertion into a known fact. + +## Status (what is proven vs. the frontier) + +**Proven (committed, passing):** the Lever 2 *architecture* is real in this codebase — through **opaque +cascades**, not just single rules. Lazy composition recovers boundary deletion and a feeding/opacity +cascade; the lexicon prunes over-restoration; demonstrated with real HC types end-to-end for deletion. +Blockers 1 (tokens state-based) and 3 (no `Fst.Compose`) are dissolved by the lazy walk; Blocker 2's +*consuming* engine (`AnalyzeComposed` + `InversePhonology`) is built and proven. + +**Frontier (unstarted, now precisely scoped):** the `Pinv` *compiler* — auto-building `InversePhonology` +from a grammar's phonological rules. The spikes/tests all use a **hand-built** `Pinv`; the compiler that +*produces* one from the grammar is not started. The cascade test shows the route must be **B-direct** +(compile each `RewriteRule` to a transducer, compose the cascade, invert), since probing misreads +feeding. That per-rule rewrite→transducer compiler over feature-structure `Shape` arcs is the multi-week +subsystem. + +**Honest headline:** architecture proven (incl. cascades) with hand-built inverses; the phonology→ +transducer **compiler is unstarted**, so Lever 2 does **not** yet accelerate a real grammar. **Lever 1** +(guided forward-synthesis, 42→69 on Indonesian) remains the only thing that accelerates a real grammar +today. Soundness is never at risk either way — `VerifiedFstAnalyzer` + the parity gate gate everything. + +## Honest gate +Work the deletion spike for real. If end-to-end recovery + pruning hold, generalize `Pinv` with +confidence. If it resists after genuine effort, that is the recorded finding — "Blocker 2 +deletion/cascade is the wall" — and Lever-1 guided enumeration is the documented pragmatic fallback, not +a silent retreat. Soundness is never at risk either way (verify + parity gate). diff --git a/docs/archive/FST_FULL_COVERAGE_PLAN.md b/docs/archive/FST_FULL_COVERAGE_PLAN.md new file mode 100644 index 000000000..9638267bd --- /dev/null +++ b/docs/archive/FST_FULL_COVERAGE_PLAN.md @@ -0,0 +1,294 @@ +> **Archived — superseded by [`../FST_FAST_PATH_PLAN.md`](../FST_FAST_PATH_PLAN.md).** This document +> predates the removal of the "certification" concept (empirical corpus-parity gate that let the FST +> replace the search engine entirely). That ambition is dead; the current scope is a bounded, opt-in, +> sound-on-positives fast pass. Kept for its construct-by-construct regularity analysis, which is +> still accurate and still useful background — but any mention of certification, closure-as-a-gate, +> or "the FST may replace the engine" is obsolete. + +# FST full-coverage plan — auditing how much of HermitCrab an FST can cover + +Audited by four parallel reviews against (a) the formal-language status of each construct, (b) HC's +implementation, and (c) our FST implementation. "Regular?" classifies the *linguistic operation* +(Kaplan & Kay 1994: a finite composition of concatenation + bounded-context rewrite over a finite +lexicon is a **regular relation**, hence 1-way-FST-able). "Coverage" is what the **proposer** +(`FstTemplateAnalyzer`) actually builds — `VerifiedFstAnalyzer`/`FstReplay` only *confirm or discard* +proposer candidates, so they can never add coverage: **every under-generation must be closed in the +proposer.** + +## 0. The headline + +Almost all of HC is formally **regular** and therefore coverable by a 1-way FST. The genuinely +non-regular core is tiny: **unbounded full-stem reduplication** (`{ww}`) and an **unbounded +self-feeding rewrite cycle** (HC already caps it at 256). Everything else — affixation, templates, +derivation, **all phonology**, bounded compounding, partial/fixed reduplication, strata — is regular. + +But "regular ⇒ coverable" is about the *ceiling*, not what we built. Two findings matter most: + +1. **The proposer is only correct for 0-phonology grammars.** Its arcs are built from *underlying* + segments; it walks the *surface*. Any feature-change/epenthesis/deletion/metathesis desyncs the + walk, so for a grammar **with** phonology the FST **silently under-generates** (it fails *safe* — + verify rejects anything spurious, so no wrong analyses — but it misses valid ones). Sena has 0 + phonological rules, which is the only reason it certifies. **This is the single biggest limit on + real-grammar coverage.** The certification parity-gate catches it (such a grammar won't certify), so + it is not a *soundness* hole — it is a *coverage* ceiling. +2. **The proposer throws (`NotSupportedException`) on infix / circumfix / reduplication / process + slots**, aborting the *entire* build rather than degrading. So a grammar with **any** such slot + can't build the FST at all today. This is a robustness bug, not a math limit. + +## 1. Coverage scorecard + +**COVERED (proposer builds it):** prefix, suffix, realizational affixes, multiple template slots, +optional slots, slot ordering, root lexicon, category + stratum gating, category-changing derivation +(bounded), bounded derivation (depth ≤ `derivDepth`, default 2, tunable). + +**PARTIAL:** derivation depth (capped — deeper stacks silently dropped, caught only by the parity +gate); zero affix (the `[CopyFromInput, InsertSegments(non-empty)]` form is covered; a **true +zero-segment** affix `[CopyFromInput]`-only is dropped/throws — a silent gap); Linear-vs-Unordered rule +order (modeled as a bounded any-order superset — sound via verify, not faithful to the flag). + +**COVERABLE (regular; not built — listed with the work + blow-up):** +- **All phonology** — RewriteRule (feature-change / epenthesis / deletion), metathesis, iterative & + simultaneous application, α-variables, allomorph environments. Regular by Kaplan–Kay. Needs the + proposer to be built by **composition** (lexicon ∘ affixes ∘ phonology) instead of the underlying- + segment walk, or phonology folded into a richer verify. Largest single win. +- **Bounded compounding** — regular (capped by `MaxStemCount`, default 2). Needs shared per-category + stem automata spliced N−1 times (additive in states) **and** an extension to `FstReplay` (which today + requires a single `LexEntry` root, so it can't even *confirm* a compound). +- **Infixation** — regular (positioned insertion). Needs `BuildRootChain` to split a root mid-stem + (`pre · infix · post`); ≈2×|root| arcs for infixing roots, bounded. +- **Circumfix** — regular. Needs one morpheme emitted at two surface positions (the `MorphOp` enum + already has `CircumfixPrefix`/`CircumfixSuffix`, but the codec only ever emits `CircumfixPrefix` — + `CircumfixSuffix` is dead code). +- **Simulfix / process (`ModifyFromInput`)** — regular (length-preserving feature rewrite). Needs + feature-mutation arcs; entangled with phonology (the mutated segment must be in the arc condition). +- **Partial / fixed-size reduplication** — regular (bounded copy). Unroll the fixed template into arcs + (Beesley–Karttunen compile-replace). +- **Strata / cyclicity** — regular (finite composition of per-stratum regular relations); already + partly modeled via stratum-index gating. +- **MPR features, morpheme/allomorph co-occurrence, allomorph environments, stem names, disjunctive + allomorphs, obligatory features, bound roots** — all regular, currently **VERIFY-ONLY** and *sound* + there (HC's real synthesis enforces them). Coverable on arcs but **not worth it**: verify already + guarantees soundness, so baking them in buys only speed, at a multiplicative state cost. Leave them + in verify. + +**NOT COVERABLE by a 1-way FST (genuinely non-regular):** +- **Unbounded full-stem reduplication** — `{ww : w∈Σ*}` is not regular (not even context-free); a + 1-way FST has no memory for an arbitrary-length copy (Dolatian & Heinz 2020). HC expresses it when a + `CopyFromInput` part is an unbounded quantifier over the stem. +- **Unbounded self-feeding rewrite cycle** — not finitely bounded; HC tames it with a 256-length cap + (which *is* a regular fold — see Appendix A). +- (Unbounded recursive compounding/incorporation is non-regular in theory, but HC can't express it — + `MaxStemCount` is always finite — so it is moot here.) + +## 2. Per-feature table (synthesis of the four audits) + +| Feature | Regular? | Where handled now | Status | What's needed to cover | +|---|---|---|---|---| +| Prefix / suffix | yes | FST proposer | COVERED | — | +| Template slots / optional / order | yes | FST proposer | COVERED | — | +| Realizational affixes | yes | FST (as slots) | COVERED | feature-blocking deferred to verify (sound) | +| Category + stratum gating | yes | FST build-time gate | COVERED | faithful when stem ⊑ template category | +| Category-changing derivation | yes (bounded) | FST (≤ depth) | COVERED | deeper chains → raise `derivDepth` | +| Derivation depth | n/a | FST cap (2) | PARTIAL | knob; deeper → engine (parity-gated) | +| Zero affix (with segments) | yes | FST | COVERED | — | +| **True zero-segment affix** | yes | throws/dropped | **PARTIAL (bug)** | emit token with no arcs | +| Linear vs Unordered order | yes | FST (any-order superset) | PARTIAL | sound via verify; not flag-faithful | +| **Phonology (all kinds)** | **yes (Kaplan–Kay)** | **engine/verify only** | **COVERABLE (big)** | compile by composition into the proposer | +| **Bounded compounding** | yes | engine/cache | COVERABLE | shared stem automata + extend `FstReplay` | +| Infixation | yes | throws | COVERABLE | mid-stem root split | +| Circumfix | yes | throws (half dead) | COVERABLE | one morpheme, two positions | +| Simulfix / process | yes | throws | COVERABLE | feature-mutation arcs (needs phonology) | +| Partial/fixed reduplication | yes | throws | COVERABLE | unroll bounded copy | +| Strata / cyclicity | yes | partial (gating) | COVERABLE | compose per-stratum transducers | +| MPR / co-occurrence / env / stemname / disjunctive / obligatory / bound | yes | **verify** | VERIFY-ONLY (sound) | leave in verify (speed-only to move) | +| **Unbounded full-stem reduplication** | **no** | engine (escape) | **NOT COVERABLE (1-way)** | length-cap / detect-peel / 2-way FST | +| Unbounded self-feeding cycle | no (capped) | engine (256-cap) | NOT COVERABLE (unbounded) | length-cap fold | + +## 3. Architecture changes / optimizations / reconfigurations + +**A. Graceful degradation instead of `throw` (do now — robustness).** The proposer must never abort a +build on an unbuildable construct. On an infix/circumfix/reduplication/process slot (and any construct +it can't model), it should **skip that path and ensure the grammar is not certified** (so those words +route to the engine), exactly as it already does for non-regular escapes. Today a single such slot +throws `NotSupportedException` and kills the whole FST — so the analyzer is unusable on most real +grammars. This one change makes the FST **safe on any grammar** (full coverage where it can, engine +backstop where it can't), which is the right "as much as we can get" posture. + +**B. Fix the true zero-segment affix (do now — small).** Emit the morpheme token at a token-bearing +state with no segment arcs (the mechanism already exists for empty-insert affixes). Today it is a +silent under-generation or a throw. + +**C. Phonology by composition (follow-on — the big coverage win).** Replace/augment the hand-rolled +underlying-segment walk with the textbook construction: compile `Lexicon ∘ Affixes ∘ Phonology` +(each `RewriteRule` already carries everything needed to emit its transducer) and analyze the surface +through the composed, **minimized** machine. This is what lifts the FST from "0-phonology grammars +only" to the majority of real grammars. Risks: multiplicative state blow-up before minimization (use +lazy/per-stratum composition + the existing `Determinize().Minimize()` for variable-free layers), and +α-variable expansion (arc multiplication by feature cardinality). Verify-only cannot substitute — +`FstReplay` can reject but not *generate*, so phonology must enter the proposer. + +**D. Bounded compounding (follow-on — highest discrete coverage gain).** Build per-category shared stem +automata, splice up to `MaxStemCount`, emit `Compound`/`Root` tokens — **and extend `FstReplay`** to +confirm multi-root candidates (today it hard-requires a single `LexEntry` root, so a compound can't be +verified even if proposed). Additive in states (Σ category automata × depth), not multiplicative. + +**E. Keep soundness constraints in verify (decision, not work).** MPR, co-occurrence, environments, +stem names, disjunctive allomorphs, obligatory/bound — all sound in verify because verify *is* HC's +synthesis. Baking them into arcs buys only speed at a state cost; the over-generation they cause is a +few cheap rejected candidates per word. Leave them. + +**F. The certification interlock is the safety contract (preserve + strengthen).** `certified = +FST-closed ∧ set-parity`. The parity check is what catches proposer gaps (phonology, compounding, +depth) even when closure says "regular" — so **a phonology-bearing or reduplicating grammar must never +certify**, or `AnalyzeWord` (which skips the engine when certified) would silently under-generate. +`GrammarFstClosure.IsEscape` flags reduplication/infix; ensure the proposer's *coverage* limits +(phonology, compounding, depth-truncation) are likewise reflected so certification can't outrun what +the proposer actually builds. The empirical parity gate already enforces this; make it explicit. + +## 4. Roadmap — close this PR vs. follow-on + +**This PR (mathematically sound, tractable, robustness):** +- **A. Graceful degradation** (no throw → skip + don't certify). Makes the FST usable on any grammar. +- **B. Zero-segment affix** fix (close the silent gap). +- **F. Certification guard** — verify (it already holds via parity) and document that only + fully-covered, FST-closed grammars certify; everything else uses the engine/cache backstop. +- Tunable `derivDepth` (already shipped) + document depth-truncation as parity-gated. + +**Follow-on PR(s) (the bigger builds, in value order):** +1. **Phonology by composition** (C) — unlocks the majority of real grammars. +2. **Bounded compounding** (D) — biggest discrete construct gain; needs the `FstReplay` extension. +3. **Infix / circumfix / partial-reduplication / simulfix** — the remaining concatenative/bounded + constructs (each COVERABLE; medium effort). +4. **The non-regular core** — Appendix A. + +--- + +## Appendix A — closing the gap on the non-FST-able constructs + +Two HC constructs are genuinely non-regular for a 1-way FST: **unbounded full-stem reduplication** +(`{ww}`) and the **unbounded self-feeding rewrite cycle**. + +### A1. Unbounded reduplication +- **Length-cap fold.** Unroll `{ww : |w| ≤ L}` into explicit arcs for a chosen max reduplicant length + L (e.g. the longest lexical stem). Sound + complete up to L; FST grows with L×|Σ|; longer stems fall + to the engine. Precedented — HC itself caps the self-feeding cycle at 256. +- **Detect-and-peel (Beesley–Karttunen compile-replace).** Detect an adjacent repeated span, peel one + copy, analyze the remainder with the regular grammar. For copy, **detection == parsing** (a + reduplicant *is* an adjacent repeat), so the live work is a cheap repeat-scan + peel; ambiguous peels + resolved by verify. The standard finite-state-morphology tool. +- **2-way FST (Dolatian & Heinz 2020).** A two-way transducer re-reads its input and computes `{ww}` + exactly, staying linear-time. The *correct* device, but the current 1-way NFA walk would need a + two-way execution engine — the largest change. +- **Sound detector + engine backstop (current posture, recommended default).** Keep the proposer + reduplication-blind; `GrammarFstClosure.IsEscape` flags it → grammar not certified → those words go + to the engine via the cache. Zero blow-up, always correct, slower only on reduplicating words. + Combine with the length-cap fold as an opportunistic fast path for short stems. + +### A2. Self-feeding cycle +Already closed by a length-cap (shape ≤ 256). To FST-ize, bake the same cap as a maximum-length +acceptance bound; identical tradeoff to the reduplication length-cap. + +## Appendix B — do current architecture decisions help or hinder the non-FST-able work? + +**HELP — verify-by-re-analysis + engine backstop.** The proposer is *allowed* to be +sound-but-under-generating: every kept analysis is a genuine HC analysis, and the FST need not model +reduplication/compounding/phonology at all — those words are quarantined to the complete engine. +Adding any Appendix-A mechanism later only *widens* the fast path; it cannot break soundness, because +verification re-runs HC end to end. + +**HELP — the escape-aware codec + closure.** `MorphTokenCodec.ClassifyOp` already distinguishes +`Reduplication`/`Infix`/`Compound`/`Process` from concatenative ops, and `GrammarFstClosure` consumes +those tags. A future reduplication/compounding builder has a ready, principled signal for which rules +to special-case. + +**HAZARD to preserve — certified-skip.** A certified grammar skips the engine entirely. A grammar with +a non-regular construct must therefore *never* certify. The interlock (`closed ∧ parity`, with closure +flagging escapes and parity catching proposer gaps) is what guarantees this — it is the explicit safety +contract tying "construct ∉ regular" to "never skip the engine for it." Keep it inviolable as coverage +grows. + +**NEUTRAL — the 1-way template walk.** Bounded folds (length-cap, detect-peel) and all the COVERABLE +concatenative constructs fit the existing 1-way walk as "more arcs." The only thing it blocks is the +exact **2-way FST** reduplication solution (A1), which needs a different execution model — a +reconfiguration to weigh only if unbounded reduplication becomes a priority grammar. + +### Citations +Kaplan & Kay 1994 (regular relations; closure under composition → phonology, strata, bounded +compounding); Dolatian & Heinz 2020 (2-way FSTs compute reduplication; 1-way cannot); Chandlee 2017 +(subregular morphology; partial reduplication is local/regular); Beesley & Karttunen 2003 (compile- +replace for bounded reduplication). + +--- + +## Appendix C — Solution 1 implementation plan (surface-allomorph precompile) + +**Goal.** Let the proposer match phonologically-altered surfaces by building its arcs from each +morpheme's **surface** realizations (phonology applied forward), not only its underlying shape. Stay a +sound **superset** (never miss a real candidate) and lean on verify to prune. This lifts the proposer +from "0-phonology grammars only" toward real grammars. + +**Why it's sound + bounded.** The proposer only nominates `(root + rules)` sets; verify re-runs HC with +real phonology and checks the surface, so extra/wrong surface variants are pruned. The only obligation +is *completeness of the variant set*: every surface a morpheme can take must be an arc. The harmony / +subregular literature (Heinz/TSL; Yawelmani ≈ 21-state FST) shows attested phonology gives each +morpheme a **small** variant set (single digits to low tens), so the FST grows by a small constant +factor, not combinatorially. Pathological blow-up is theoretical, not attested; such grammars fall back +to the engine via the certification interlock. + +**Algorithm.** +1. For each morpheme shape (root allomorph segments; affix `InsertSegments` segments), compute its + **surface variant set** = { underlying } ∪ { phonology(shape) under each bounded context }. +2. Build the proposer's segment arcs from the **union** of variants (same `(op, morpheme)` token on + every variant — the token is the underlying morpheme; the arcs are surface). Interweaving is free: + the walk picks each morpheme's variant independently. +3. Verify prunes invalid variant combinations. + +**Three tiers of "context", implemented incrementally:** +- **C-internal (first cut):** apply the grammar's phonological rules to the morpheme shape *in + isolation* (with word-edge anchors). Covers morpheme-internal + edge alternations (e.g. root-internal + aspiration). Sound for those; misses cross-boundary effects. +- **C-boundary (next):** over-approximate the neighbor context — apply rules with each natural-class + boundary segment on each side — so boundary-conditioned variants (assimilation across a morpheme + seam) are included. Still bounded (variants × small context set). +- **C-exact (endgame = Solution 3):** compose the full phonology transducer. Solution 1 is its + per-morpheme approximation; this is a smooth upgrade, not a throwaway. + +**How to apply phonology forward to a shape (reuse HC, do not reimplement):** compile each stratum's +`PhonologicalRules` via `prule.CompileSynthesisRule(morpher)` into a `LinearRuleCascade` (exactly what +`SynthesisStratumRule._prulesRule` does), build a `Word` from the morpheme shape, `Apply` the cascade, +read the surface shape(s). (Or, for bare-standing roots, `Morpher.GenerateWords(root, ∅, ∅)` returns +the surface directly — the safe minimal version.) + +**Soundness guards (must hold):** +- Keep the underlying arcs too (union), so the 0-phonology path is unchanged. +- Only ROOT-INTERNAL/edge variants are claimed by the first cut; anything cross-boundary that the cut + misses must keep the grammar from certifying (the parity gate already enforces this — a missed + variant shows up as FST≠engine, so the grammar won't certify and those words ride the engine). +- The token emitted is always the underlying morpheme; verify (which runs real phonology) confirms. + +**Explosion control:** dedup variants per morpheme by surface string; cap variants-per-morpheme with a +budget; if exceeded, drop the surface-precompute for that morpheme (fall back to underlying + engine) — +never explode, only degrade coverage. + +**Test strategy:** construct a minimal phonology grammar (a feature-changing rewrite rule, e.g. a root +that aspirates), show the *current* proposer misses the altered surface (under-generates), the +surface-precompile proposer covers it, and verify keeps it sound (0 false positives on non-words). + +### Result (shipped — C-internal tier, bare roots) + +Implemented the safe minimal version: `BareRootSurfaces` reuses the obligatoriness `GenerateWords` +call to get a root's bare surface realizations, and `BuildRootChainFromSurface` adds a proposer arc for +every realization ≠ the underlying form (same underlying-morpheme token). Zero extra build cost. + +**Latent verify bug this exposed (fixed).** `AnalysisRewriteRule.Apply` / `AnalysisMetathesisRule.Apply` +gate on `Morpher.RuleSelector`. `FstReplay`'s restricted re-analysis pinned the selector to *just the +candidate's morphological rules* — which silently disabled **all phonology** during verify. So before +this fix the propose-and-verify spine could never confirm *any* phonologically-altered candidate +(verify couldn't un-apply phonology to reduce the surface back to the root). Phonological rules are +obligatory deterministic rewrites, not a fan-out choice, so `FstReplay` now always lets +`IPhonologicalRule` through the selector; the morphological fan-out is still collapsed by gating the +leaf morphological rules + root, and soundness is still enforced by the candidate-signature match. + +Verified end-to-end by `Verified_CoversPhonologicallyAlteredBareRoot` (an unconditional t→d rule makes +bare root "dat" surface only as "dad"; the proposer now matches "dad" and verify confirms it as a +genuine HC analysis, while a non-word still yields nothing). Full HermitCrab suite green (97 passed). diff --git a/docs/archive/FST_FULL_PLAN.md b/docs/archive/FST_FULL_PLAN.md new file mode 100644 index 000000000..a34d7263c --- /dev/null +++ b/docs/archive/FST_FULL_PLAN.md @@ -0,0 +1,298 @@ +> **Archived — superseded by [`../FST_FAST_PATH_PLAN.md`](../FST_FAST_PATH_PLAN.md).** This document +> predates the removal of the "certification" concept (empirical corpus-parity gate that let the FST +> replace the search engine entirely). That ambition is dead; the current scope is a bounded, opt-in, +> sound-on-positives fast pass, with reduplication/infixation as runtime peel generators and phonology +> moving to lazy lockstep composition rather than the composed-inverse / forward-synthesis designs +> described here. Kept for the still-valid architecture and soundness reasoning (propose-and-verify, +> the composite generator pattern) — ignore any "certify"/"closed"/"replace the search engine" framing. + +# FST_FULL_PLAN — closing the coverage gap (phonology, infixation, reduplication) + +Implementation plan for expanding the propose-and-verify FST accelerator to cover **all attested +phonology**, **all infixation**, and **bounded reduplication**. Companion to +`FST_FULL_COVERAGE_PLAN.md` (the construct audit) and `HERMITCRAB_FST_PLAN.md` (the spine design). + +## The principle that makes this safe + +The propose-and-verify split puts **all correctness in verify + certification, none in the proposer**. +The proposer's only job is to emit a *sound superset* of candidates fast; `VerifiedFstAnalyzer` re-runs +HC (real analysis + synthesis, real phonology) on each candidate and discards any HC does not confirm; +the empirical parity gate (`FstVerification.Compare`) certifies a grammar only when FST≡engine on the +corpus. + +Consequence: **expansion can never produce a wrong answer.** A new candidate generator that +under-generates simply accelerates fewer words (parity gate → engine fallback); one that over-generates +has its junk pruned by verify. Correctness is invariant; only the *acceleration ratio* moves. So we can +add coverage aggressively. + +This reframes "can an FST represent X?" into **"can we cheaply enumerate a superset of candidates for X +that verify then prunes?"** — which decouples coverage from FST-representability and lets non-regular +constructs (full reduplication) be handled *beside* the FST by bounded generators feeding the same gate. + +## Architecture: a composite of candidate generators + +``` + ┌─────────────────────────────────────────┐ + surface word ───▶│ CompositeProposer (union + dedup) │ + │ ├─ FstTemplateAnalyzer (regular bulk) │ + │ ├─ ReduplicationProposer (strip + recurse) + │ └─ InfixProposer (remove + recurse) + └───────────────┬───────────────────────────┘ + │ candidate (root+rules) sets + ▼ + ┌─────────────────────────────────────────┐ + │ VerifiedFstAnalyzer (FstReplay verify) │ ── discards anything HC won't confirm + └───────────────┬───────────────────────────┘ + ▼ genuine HC analyses +``` + +`VerifiedFstAnalyzer` already wraps an `IMorphologicalAnalyzer` proposer, so the only new plumbing is a +`CompositeProposer : IMorphologicalAnalyzer` that unions + dedups candidates from several generators. + +**Three invariants every generator must respect** (learned before building, not after): + +1. **Recurse the residual through the FST proposer — never propose a flat root.** A reduplicated or + infixed surface can have an *inflected/affixed* base: `"wakaswakas"` is REDUP of inflected `"wakas"`, + not bare `"waka"`. So a generator strips/removes its own material, then calls the FST proposer on the + remainder, and wraps each returned analysis with its morpheme. Terminates: the residual is strictly + shorter, reduplication bounded to 1–2 copies, infixation to 1 site per pass. +2. **Dedup before verify.** Two generators (or a generator and the FST) can propose the same morpheme + set → verify would confirm it twice → duplicate analyses. `CompositeProposer` dedups by candidate + signature before the gate. +3. **The coverage signal must reflect the composite.** `FstTemplateAnalyzer.CoversAllConstructs` trips + `false` on a redup/infix slot. Once a sibling generator covers that construct, certification must see + the *composite's* coverage, not just the FST's — else the grammar won't certify and the now-covered + words stay on the engine. The parity gate keeps results correct regardless; this only governs whether + acceleration kicks in. + +--- + +## Point 2 — Infixation (regular; in-scope) + +Infixation splits the root and inserts the affix inside it (Tagalog `-um-`: sulat → s‹um›ulat). It is a +regular operation; the proposer already *recognizes* infix slots (`MorphTokenCodec.ClassifyOp → Infix`) +but skips them (`_hasUnbuiltConstructs = true`). + +**Generator (`InfixProposer`).** For each infix rule and each candidate insertion site in the surface: +remove the infix's surface segments at that site, recurse the remainder through the FST proposer, wrap +each analysis with the infix morpheme. Sound-superset shortcut: try every segment boundary the rule's +partition pattern allows (or over-approximate to all boundaries) — verify prunes the wrong splits. +`O(surface-length × infixes)` candidates — bounded. Composed with surface-precompile it also handles +infixes that trigger phonology. + +**Soundness.** Verify re-synthesizes `base + infix` and surface-matches; a wrong split won't confirm. +**Test.** A grammar with one infix rule; show the FST alone misses the infixed surface, `InfixProposer` +covers it, verify rejects a non-word. + +--- + +## Point 3 — Reduplication (non-regular; handled beside the FST) + +Full reduplication (copy the whole base, `ww`) is the one provably non-regular construct — an FST cannot +represent it. It doesn't need to: a bounded **string-repetition scanner** contributes candidates to the +same verify gate. + +**Generator (`ReduplicationProposer`).** Scan the surface for an adjacent repeated substring matching a +reduplication template (full-copy `XX`; partial CV-copy as a later refinement). For each detected +repetition: strip one copy, recurse the remainder through the FST proposer, wrap each analysis with the +reduplication morpheme. **Bound to 1–2 applications** (the "once or twice") — finite, tiny candidate set. +`O(n²)` scan per word, trivial. + +**Soundness.** A coincidental repeat (`"murmur"` that is not actually reduplicated) is proposed but +pruned because HC synthesis of `base + REDUP` won't reproduce it. **"Well enough for 99.9%":** the 1–2 +bound covers essentially all attested reduplication; triple/unboundedly-interacting reduplication +doesn't certify and rides the engine (still correct). +**Test.** A grammar with a full-reduplication rule; show the FST alone misses `"wakawaka"`, the composite +covers it (including an inflected reduplicant via the recursion), verify rejects a non-reduplicated word. + +--- + +## Point 1 — All phonology: affix surface-precompile + C-boundary (in-scope, incremental) + +The shipped C-internal tier handles **bare-root** alternation via `GenerateWords`. Two extensions: + +**1a. Affix surface-precompile.** Build affix arcs from each affix allomorph's *surface* segments, not +only the underlying `InsertSegments`. Forward-application helper: compile the stratum's +`PhonologicalRules` via `prule.CompileSynthesisRule(morpher)` into a `LinearRuleCascade` (exactly what +`SynthesisStratumRule._prulesRule` does), wrap the affix segments in a `Word`, `Apply`, read the surface +shape(s). An affix's surface depends on stem context, so this is fiddlier than the bare-root case — +**validate on one minimal affix-triggered alternation first**, then generalize. + +**1b. C-boundary context.** Over-approximate the neighbor: apply rules with each natural-class boundary +segment on each side, so boundary-conditioned variants (assimilation across a seam) are included. Bound +the variant count per morpheme (cap + drop-to-underlying fallback) so a long-distance harmony grammar +degrades rather than explodes. + +**Soundness.** Underlying arcs are kept (union), so the 0-phonology path is unchanged; the token is +always the underlying morpheme; verify confirms with real phonology; a missed variant shows up as +FST≠engine → no certify → engine (never wrong). +**Test.** A rewrite rule altering an affix's surface; show the underlying-only proposer misses it, the +surface-precompile proposer covers it, verify stays sound. + +### Result (shipped — 1a affix surface-precompile, C-internal tier) + +`SurfacePhonology.Variants(underlying)` compiles each stratum's synthesis phonological rules (reusing +HC's `IPhonologicalRule.CompileSynthesisRule`, exactly what `SynthesisStratumRule` runs) and applies +them to a segment string in isolation, returning the distinct surface forms (always including the +underlying). `FstTemplateAnalyzer.BuildAffixArcs` builds the affix's segment arcs from the underlying +form AND each altered surface variant (shared by both affix-arc sites: derivational layers and template +slots); the default ctor passes an identity variant function so the 0-phonology path is byte-identical. + +Verified by `Proposer_CoversPhonologicallyAlteredAffix` (a suffix inserts "t"; an unconditional t→d +rule makes it surface only as "d", so sag+SUF = "sagt" → "sagd"; the underlying-only proposer builds a +"t" arc and misses "sagd", the surface-precompile proposer builds the "d" arc and verify confirms it) +and `SurfacePhonology_AppliesRulesForwardToASegmentString`. Full suite green (101). + +### Result (shipped — 1b C-boundary) + +`SurfacePhonology.Variants` now also probes each surface-alphabet segment as a left/right neighbor: it +forward-applies phonology to `neighbor·morpheme` / `morpheme·neighbor` and, when the rule is +length-preserving (output node count = morpheme + 1), reads back the morpheme's own surface nodes. +Bounded by alphabet size × 2; a length-changing context is skipped (no reliable portion) so it stays a +sound superset. This catches an affix whose *own* surface is conditioned by a neighbor across the seam +(e.g. a suffix that voices after the root-final segment). Verified by +`SurfacePhonology_BoundaryTier_RecoversAffixSurfaceFromNeighborContext` (a "t" suffix that voices to "d" +only after "g": isolation keeps "t", the boundary tier recovers "d"). Full suite green (104). + +What the precompile still cannot see — a *neighbor's* surface changing (e.g. a root devoicing before an +affix) or any longer-distance interaction — is covered completely by Point 4 below. + +--- + +## Point 4 — C-exact: full phonology via composition with HC's phonology inverse (shipped) + +**Goal.** Cover *all* bounded phonology — including the cross-boundary, opaque, stem-conditioned +interactions the per-morpheme precompile (Point 1) cannot see. + +**What shipped.** `ComposedPhonologyProposer` composes **HC's phonology inverse with the morphotactic +FST**: it un-applies the grammar's phonological rules to the surface — reusing each stratum's +`IPhonologicalRule.CompileAnalysisRule`, exactly the rules `AnalysisStratumRule` runs (strata +surface→inner, rules reversed within a stratum) — to recover the underlying form, then walks the +underlying-arc FST on it (`FstTemplateAnalyzer.AnalyzeShape`). That is literally phonology⁻¹ ∘ +morphotactics. Because the inverse is applied to the *assembled* surface, it sees cross-boundary context +the per-morpheme tiers cannot. The un-applied shape carries under-specified nodes (analysis is +non-deterministic) which the unification walk matches against every compatible arc; verify prunes the +spurious ones, so it is a sound superset. Complete for bounded (non-cyclic) phonology; an unbounded +self-feeding cycle is not a regular relation and simply will not certify. + +**Why this form, not FST∘FST composition.** The morphotactic proposer accumulates tokens in a side-table +(`_tokenOnEntry`), not transducer outputs, so a literal `Fst.Compose` would require re-architecting the +spine. Composing HC's *existing* phonology inverse instead reuses the engine's real, tested phonology +(no reimplementation) and reaches the same coverage. Wired into `CompositeProposer.ForLanguage` (inert +when the grammar has no phonological rules — it short-circuits). Verified by +`ComposedPhonology_CoversCrossBoundaryAlternation_WherePrecompileMisses` (a root-final "g" +devoices to "k" before a suffixal "t" — "sag"+SUF = "sagt" → "sakt"; the per-morpheme precompile misses +"sakt", composition recovers it, verify confirms, a non-word still yields nothing). + +--- + +## Order of work & status + +1. ☑ `CompositeProposer` plumbing (union + dedup + coverage-signal) — established with reduplication. +2. ☑ Point 3 Reduplication (full-copy generator; strip + recurse + verify). +3. ☑ Point 2 Infixation (remove + recurse + verify; single-contiguous-infix first cut). +4. ☑ Point 1 phonology precompile — bare-root C-internal, affix C-internal (1a) and C-boundary (1b). +5. ☑ Point 4 C-exact — `ComposedPhonologyProposer` (phonology⁻¹ ∘ morphotactics); covers all bounded + phonology including cross-boundary. + +All four wired into `CompositeProposer.ForLanguage`, which both production factories +(`CompleteHybridMorpher`, `CachingMorphologicalAnalyzer`) build and certify on. Commit + test after each +point; each construct test shows (a) the FST alone misses it, (b) the composite covers it, (c) verify +still rejects a non-word. + +## Summary of what shipped + +| Construct | Coverage | Mechanism | Residual | +|---|---|---|---| +| Bare-root phonology | C-internal | `BareRootSurfaces` (GenerateWords) + verify-allows-phonology | — | +| Affix phonology | C-internal + C-boundary | `SurfacePhonology` (1a isolation + 1b neighbor) + `BuildAffixArcs` | — | +| **All phonology** (incl. cross-boundary, opaque) | **complete (bounded)** | `ComposedPhonologyProposer` — phonology⁻¹ ∘ morphotactics | unbounded self-feeding cycle (not regular) | +| Infixation | single contiguous infix | `InfixProposer` (remove + recurse) | templatic multi-slot; phonologically-altered infix surface | +| Reduplication | full copy, one application | `ReduplicationProposer` (strip + recurse) | partial/CV copy; 2+ applications | + +The phonology precompile tiers (1a/1b) are the cheap fast-path; `ComposedPhonologyProposer` is the +complete backstop, so phonology is fully covered. The remaining infix/reduplication residuals are +covered correctly today by the engine via the parity gate — the only thing not yet accelerated for those +narrow cases is *speed*, never correctness. + +## Production wiring + +Both factories — `CompleteHybridMorpher.FromLanguage` and `CachingMorphologicalAnalyzer.FromLanguage` — +build `CompositeProposer.ForLanguage(language, fst)` (the FST plus the reduplication and infix +generators) and certify on the *composite's* `CoversAllConstructs`. For a grammar with no +reduplication/infixation the generators hold no rules and yield nothing, so this is near-zero overhead +and byte-identical behavior; for a reduplicating/infixing grammar it is what lets the grammar certify +(the generator covers the construct the FST skips) instead of falling entirely to the engine. Covered by +`CompleteHybrid_WiresGenerators_ReduplicatingGrammarCertifiesAndMatchesEngine`. + +**Certification caveat (extended).** A certified grammar skips the engine entirely, so correctness on +unseen words rests on the proposer being complete on the certification corpus. With the generators wired +this now extends to reduplication/infix completeness as well — same empirical-certification property as +before, just over a larger construct set. Choose a certification corpus that exercises the grammar's +reduplication/infix patterns. + +--- + +## Real-grammar validation: Indonesian (`meN-` nasal substitution + reduplication) + +Tested end-to-end on a real FieldWorks Indonesian grammar (`GenerateHCConfig.exe` → HC XML → loader), +70 wordforms. The bare composite covered 42/70; the engine found 28 more, all carrying the **`meN-` +active-voice prefix** (`tulis → menulis`, `sewa → menyewa`, `langit → melangit`). + +**Finding that reshaped Points 1/4.** The `meN-` rules are conditioned on the *morpheme boundary* +(`meⁿ+root`) and involve deletion. `ComposedPhonologyProposer` (Point 4) un-applies phonology on the +*boundary-less* surface, so the rules fire everywhere and produce garbage (`menulis → ⁿmeⁿnⁿpuⁿlis`) — +exactly the over-generation HC prunes with interleaved morphology + re-synthesis, i.e. the slow search. +**Phonology inversion cannot be cleanly composed for boundary-conditioned morphophonemics.** It remains +valid and sound for *segment-conditioned* phonology (the `g→k / _t` test); it just isn't the tool for +`meN-`. + +**The viable mechanism: forward synthesis.** Synthesis applies rules *with the boundary present*, so it +is boundary-correct (`GenerateWords(tulis, [meN]) → "menulis"`). `ForwardSynthesisProposer` precompiles, +at build time, each root × every ordered affix combo (permutations — order matters: `[meN,Cont]` → +`menulis-nulis`, `[Cont,meN]` → a non-word) up to `maxAffixes`, synthesizes the surface, and tabulates +`surface → analysis`. Analysis is a dictionary lookup; verify still confirms. It covers reduplication +and infixation for free (synthesis handles them). + +**Result (Indonesian, depth 2):** full coverage **42 → 69 of 70**, **0 unsound** (always a subset of the +engine), build ~5 s / 2283 entries. The one holdout (`mengamat-amati` = AV+observe+Cont+LOC) is a +3-affix combo with a realizational suffix that needs feature-driven synthesis; depth 3 (45 s) does not +reach it, so depth 2 is the sweet spot. + +**Scope / honesty.** +- It is **opt-in** (`CompositeProposer.ForLanguage(language, fst, forwardSynthesis: true)`): build cost + grows with lexicon × affix permutations — right for bounded-affixation grammars / fixed corpora, not + for heavily-inflecting templatic systems (those keep riding the engine). +- It does **not** flip Indonesian to *certified* (default-path acceleration): the holdout breaks parity, + and the grammar is not FST-closed (unbounded-environment reduplication rule). The win is on the + explicit verified-FST path — correct everywhere, accelerated for the 69 covered words. +- The fully general fix for boundary-conditioned phonology remains **forward FST∘FST composition** + (compile morphotactics ⊗ phonology, the lexc+rewrite approach `Fst.Compose` supports) — a larger spine + change, deferred. + +--- + +## Closing Indonesian: bounded-reduplication closure + +Indonesian was Tier 3 / not-FST-closed. Diagnosis: **all closure escapes are reduplication** — the 3 +reduplication morphological rules (`-Cont`, `-Pl`, `REDUP-meN`); the `meN-` nasal substitution is +regular, and the "Nasalization in reduplication" rule is *phonological* (not a closure escape). So +Indonesian is FST-able except for copy. + +**The trick (your "make it close"):** reduplication over a *fixed lexicon* with *bounded copy* is a +**finite, hence regular** language (compile-replace / Beesley–Karttunen). `GrammarFstClosure.Analyze` +gains an opt-in `boundedReduplication` flag that, under that assertion, treats reduplication/infix as +FST-able feeders rather than escapes. A 1-way FST/regex still can't do *productive unbounded* copy (only +a 2-way FST can) — but it doesn't need to for a finite lexicon. + +**Result (measured):** with `forwardSynthesis: true` + `boundedReduplication: true`, +`CachingMorphologicalAnalyzer.FromLanguage` makes Indonesian **certify**: +- `closed`: default `False` → bounded `True`; +- `CoversAllConstructs`: `True` (forward-synth covers `{Reduplication, CircumfixPrefix}`, the FST's + uncovered ops — its `CoveredOps` was broadened to claim circumfix, which synthesis already handles); +- `parity`: 69/70; +- ⇒ **certified on the covered corpus → default path is FST-only, engine skipped.** + +The one holdout, `mengamat-amati` (AV+observe+Cont+**LOC**, a 3-affix realizational combo), is a +forward-synth *coverage-depth* gap, not a closure issue. Soundness is unaffected — verify + parity gate +everything; the flags are explicit opt-in assertions about the lexicon being fixed and copy bounded. diff --git a/docs/archive/HERMITCRAB_FST_PLAN.md b/docs/archive/HERMITCRAB_FST_PLAN.md new file mode 100644 index 000000000..582e3617d --- /dev/null +++ b/docs/archive/HERMITCRAB_FST_PLAN.md @@ -0,0 +1,824 @@ +> **Archived — superseded by [`../FST_FAST_PATH_PLAN.md`](../FST_FAST_PATH_PLAN.md).** This is the +> original design doc for the "certification" ambition: an empirical corpus-parity gate +> (§9–§12 below) that would let the FST analyzer *replace* the search engine once "proven" complete +> on a corpus. That concept is entirely removed from the codebase — it was fragile in practice (a +> grammar could certify on 30 words and decertify on 60) and is not the product. `CompleteHybridMorpher`, +> `CachingMorphologicalAnalyzer`, and `GrammarFstClosure`, all described extensively below, no longer +> exist. Kept for the historical record of the Sena measurements and the regularity theory in §9–§12, +> which is still accurate background even though nothing acts on it as a gate anymore. + +# HermitCrab FST acceleration — plan + +> **Shipped MVP (read this first).** The MVP that landed is a **sound, fast, optionally-complete** +> analyzer that reuses HC's own engine: +> - **`FstTemplateAnalyzer`** proposes candidate analyses by walking a precompiled template/derivation +> FST (fast, immutable → thread-safe to share). +> - **`VerifiedFstAnalyzer`** confirms each candidate by **restricted re-analysis** (`FstReplay`): HC's +> own `AnalyzeWord` pinned to that candidate's root+rules via a pooled `Morpher`. A confirmed, +> genuine HC analysis is emitted; anything HC won't confirm is discarded. **Sound** (no wrong +> analyses), **~13×** on Sena, **multithread-safe** (each verify rents a Morpher from `MorpherPool`). +> - **`CompleteHybridMorpher`** adds completeness: a grammar that passes **empirical set-parity** +> (`FstVerification`) runs FST-only; otherwise the search engine is used (the known slow path). +> Per-word control via `AnalyzeWord(word, useFst)` / `UseFstFor`. +> - **`GrammarFstAdvisor` + `GrammarFstClosure`** — the grammar census/linter (PR #441's original core). +> +> **Out of scope / explored-then-abandoned:** the *per-stem completeness proof* (proving the fast path +> complete for every word without the engine). Sections §11.5+ and §12.3+ below document that +> exploration and why it was dropped (rule/symbol coverage ≠ path coverage; the segmentation-superset +> proposer was slower and still incomplete). The shipped completeness model is the empirical +> certificate + engine backstop, not a static per-stem predicate. Deferred to later PRs: the +> generator (reverse direction) and a 2-way-FST/compounding treatment of the residual ~3%. + +Goal: replace HC's combinatorial un-application *search* (measured ~10,000 `Word` clones/word, +397 MB/word, the cause of the ~3× parallel ceiling) with a precompiled **transducer walk** for +the finite-state fraction of a grammar — while **degrading gracefully** to the existing engine +for the parts that aren't finite-state. A grammar census of the real Sena grammar found it +**~100% FST-able** (0 rewrite rules, 0 variables, 0 productive reduplication, all-concatenative +affixation) — and the `GrammarFstAdvisor` in this PR confirms it (Tier 1, 0 escapes) — so for +Sena-like grammars an automaton walk could be 10–100× and near-zero-allocation (which also lifts +the thread ceiling). + +## 1. Tech stack — build on SIL.Machine's own `Fst` (not OpenFst/Foma/HFST) + +The decisive fact: **`SIL.Machine.FiniteState.Fst` already provides the full algebra we need** — +`Compose`, `Determinize`, `Minimize`, `Intersect`, `EpsilonRemoval`, transducer outputs +(`IFstOperations`: Insert/Replace/Remove), and crucially **`UseUnification`** (arcs carry +*feature structures* matched by unification, not just plain symbols). The `RootAllomorphTrie` +is already a lexicon FST built on it. + +| Option | Verdict | +|---|---| +| **SIL.Machine `Fst`** (in-repo) | ✅ **Recommended.** Managed, cross-platform, no interop; *natively models HC's feature-bearing segments with unification*; composition algebra already present; lexicon-FST precedent. | +| OpenFst / Foma / HFST (C/C++) | ❌ for now. Mature + fast, but: plain-symbol alphabets (must flatten feature structures → blowup; even though Sena has no variables, this loses HC's native model), heavy P/Invoke + native build/cross-platform burden, and reconciling results back to HC's `Word`/`Properties`. Reserve only if SIL.Machine's FST can't scale. | + +So the stack is: **C#/.NET on SIL.Machine's `Fst`**, reusing the existing `ShapeNode`/`FeatureStruct` +model. The work is a *compiler* (Language → composed transducer) plus a *runtime* (`IMorphologicalAnalyzer` +that walks it), not a new FST engine. Risk to retire early: validate that `Fst.Compose` + +`Minimize` behave correctly for **unification** arcs at grammar scale (they're proven for the +matcher's pattern FSTs; composition of large lexicon∘affix transducers is the unknown). + +## 2. The compile pipeline (Language → one analyzer transducer) + +1. **Classify** every construct (the census, made a reusable pass): concatenative affix / + template / environment-allomorphy / bounded compounding = **FS-able**; rewrite rule with + unbounded environment, α-variable, productive reduplication, infixation = **non-FS island**. +2. **Build component transducers** for the FS-able fraction: + - lexicon → root transducer (extend `RootAllomorphTrie`), + - each concatenative affix subrule → an insert/concat transducer, + - affix templates → position-class concatenation, + - environment-conditioned allomorphy → context-restricted arcs, + - bounded compounding (`MaxStemCount`) → bounded recursion unrolled. +3. **Compose** them (`lexicon ∘ affixes ∘ templates`) into one transducer, then + **Determinize + Minimize**. Composition bakes in rule ordering/opacity; minimization gives + the optimal shared state set (the Myhill–Nerode classes). +4. **Invert/orient** for analysis (surface → underlying+gloss): the analyzer walks the input + word through the transducer, reading off morpheme IDs / `Properties` on accepting paths — + the same IDs HC's `Word` carries, so the consumer mapping (FieldWorks → LCM) is unchanged. + +## 3. Graceful degradation — the tiered hybrid (the key design) + +The architecture must never regress: the FST is a **sound optimization layered over the proven +search engine**. Three tiers, chosen automatically by the compile-time census: + +- **Tier 1 — fully FS-able grammar (e.g. Sena).** The whole grammar compiles; the transducer is + **complete**. Analysis = automaton walk only; the search engine is never invoked. Maximum win. +- **Tier 2 — FS-able with isolated non-FS rules.** Compile the FS fraction into the transducer; + mark each non-FS operation with an **escape** (flag-diacritic-style arc). At runtime: + - cheaply detect whether any non-FS rule *could* apply to this word (e.g. a reduplication + signature, or a segment a rewrite rule targets); + - if **not** → the transducer is complete for that word → fast path; + - if **yes** → fall back to the existing `Morpher` search for that word (or delegate just the + escaped sub-operation, then resume the walk). + Most words avoid the islands → mostly fast, with the slow path only where needed. +- **Tier 3 — pervasively non-FS (heavy rewrite rules, α-variables, productive reduplication).** + The FST covers too little; **disable it** and use today's search engine. No regression. + +**Soundness contract (non-negotiable):** the FST must (a) never emit a wrong analysis, and +(b) for any word it claims complete, never miss one. Guaranteed by: only compiling +*provably*-FS-able rules; in Tier 2, falling back whenever completeness is uncertain (conservative); +and a **verification mode** during rollout that runs FST + search and asserts identical analyses +across a corpus (we already have the Sena rig + signature comparison for exactly this). + +The degradation is *monotone in grammar complexity*: more FS-able ⇒ more handled by the fast +walk; less FS-able ⇒ more fall-back, down to pure search. Nothing ever gets slower than today. + +## 4. Where it bolts onto the code + +- New `FstMorpherCompiler`: `Language → ComposedAnalyzerFst` (+ the per-grammar tier decision). +- New `FstMorpher : IMorphologicalAnalyzer` (and `IMorphologicalGenerator` for the reverse): walks + the transducer, emits `WordAnalysis` / the morph `Properties`; on a Tier-2 escape, delegates to + an inner `Morpher`. +- Reuse: `RootAllomorphTrie` (lexicon FST), the `Fst` algebra, the `ShapeNode`/`FeatureStruct` + model, the census classifier, and the benchmark + signature comparison for verification. +- Consumers are unaffected: same `IMorphologicalAnalyzer` interface; FieldWorks keeps mapping + morpheme IDs → LCM. + +## 5. Risks & mitigations + +| Risk | Mitigation | +|---|---| +| `Fst.Compose`/`Minimize` unproven on large **unification** transducers | Spike on Sena first; validate output == HC output on the corpus before scaling; fall back to plain-symbol flattening (Sena has no variables) if needed | +| State/alphabet blowup | The **eager/lazy partition knob** (§10): a state/memory budget that auto-demotes expensive-cold layers from precompiled (A) to on-the-fly (B); completeness is invariant under the knob (composition associativity), so bounding size never drops analyses. Minimize-after-compose only on safe (non-unification) layers | +| Tier-2 "is the FST complete for this word?" detector unsound → missed analyses | Make it conservative (fall back when unsure); verification mode catches misses in rollout | +| **Closure**: a normal (FST) step *feeds* an escape, so the automaton's "no path" is a false "done" | Confirm FST closure (§9): static feeding-closure pass (`range(F) ∩ T_E = ∅` via `Fst.Intersect`) + stratal containment; corpus closure verification (set parity) gates replacing the search engine. Undecidable feeding (non-regular escapes in a loop) ⇒ conservatively keep those words on the search backstop | +| Generator (synthesis) direction | Same transducer inverts; or keep HC synthesis initially and only FST-accelerate analysis | +| Grammar-specificity | The census decides the tier per grammar; production grammars must be censused before enabling Tier 1/2 | + +## 6. Phased plan + +1. **Spike (decisive):** compile Sena's lexicon ∘ concatenative-affixes into one transducer via + `Fst.Compose`/`Minimize`; build a minimal `FstMorpher.AnalyzeWord`; **verify** its analyses + equal `Morpher.AnalyzeWord` on the Sena corpus (signature comparison); **measure** clones (→~0), + allocation, and wall-time vs. the search engine. This proves or kills the SIL.Machine-FST stack. +2. **Complete Tier 1:** add templates, environment-allomorphy, bounded compounding; full Sena + parity + the parallel-scaling re-measurement (expect the 8-thread/3× ceiling to lift, since the + walk barely allocates). Build the compiler as a **pipeline of composable layers behind an + eager/lazy interface** (§10) from the start — the partition knob and state budget are Phase 1–2 + architecture, not a later bolt-on. +3. **Tier 2 hybrid:** census-driven escape arcs + per-word fallback detection + verification mode, + gated on **confirming FST closure** (§9) — the static feeding-closure pass + corpus closure + verification that certify the transducer's "no analysis" is a proof, not a guess. +4. **Generator + productionize:** reverse direction, the `IMorphologicalAnalyzer` wiring, and a + FieldWorks adapter; run the census on real production grammars to set each project's tier. + +## 7. Decision gate + +Step 1 (the spike) is the gate: it answers, with numbers, whether SIL.Machine's FST can compose +a real grammar correctly and how big the speedup is. If yes → proceed; if `Fst.Compose` can't +handle it → reassess (flatten to symbols, or external lib). Everything past Step 1 is contingent +on that result. + +## 8. Transducer output schema — the packed morpheme-token array + +What the analyzer transducer emits on an accepting path must be the *structured derivation* +(ordered morphemes + root), not just accept/reject — otherwise it is a **recognizer, not an +analyzer**. HC carries this today as per-segment morph annotations + an ordered allomorph list +(`Word.MorphemesInApplicationOrder` → `WordAnalysis.Morphemes`/`RootMorphemeIndex`); the FST must +emit the same structure as transducer output. + +**Encoding — one 32-bit token per morpheme, in application order:** + +``` + 31 24 23 0 ++----------------+--------------------------------+ +| 8-bit MorphOp | 24-bit morpheme index | ++----------------+--------------------------------+ +``` + +- **op (high 8 bits)** = the morpheme's *role/operation*: Root, Prefix, Suffix, Infix, + Reduplication, CircumfixPrefix/Suffix, Compound, Clitic, Process (simulfix/ModifyFromInput), + Null (zero morph). This is the "ordered operations connected to the letters" — it lets a + consumer rebuild the gloss/bracketing without re-running any rule. +- **morpheme index (low 24 bits)** = an index into the grammar's compiled morpheme table + (→ `IMorpheme.Id`/gloss via a side table — don't pack strings). +- An accepting path's output is the **`uint[]` of these tokens — that array *is* the analysis**, + and it is **self-describing**: `Morphemes` = the indices in array order; `RootMorphemeIndex` = + the position of the `Root` token (no separate field). + +**Why this shape (verdict: sound):** + +- **Compact / cache-friendly / hashable:** 4 bytes per morph (a 5-morph word = 20 bytes); analyses + compare and dedupe as plain integer arrays. +- **24-bit ceiling = 16,777,215 morphemes** — ample (largest FLEx projects are ~10⁵–10⁶ entries); + the compiler asserts `morphemeCount ≤ MaxMorphemeId`. +- **8 bits for the op** is byte-aligned headroom (only ~5 bits used); keep it for growth. + +**What it deliberately does NOT carry — keep these as separate optional channels, do not widen the +token:** + +- **Surface segmentation** (which input letters belong to which morph): if interlinear morph-breaks + are needed, the same walk emits a parallel `int[]` of morph start-offsets. The 32-bit token stays + the pure (op, morpheme) derivation. +- **Specific allomorph** (vs morpheme): an optional second channel; consumers (FieldWorks → LCM) + key on the morpheme. + +Realized now as `MorphToken` / `MorphOp` (the codec + bounds check + root recovery); the FST +compiler (the spike, §6.1) emits these tokens as arc outputs, so the analyzer is structured from +day one rather than a bare recognizer retrofitted later. + +## 9. Confirming FST closure — the completeness certificate + +An FST analyzer is only trustworthy if its **silence is a proof**: "no accepting path" must mean +"no analysis exists", and "these K paths" must mean "exactly these K analyses" (all homographs, +nothing spurious). That is **completeness**, and it does not come for free — it must be *certified* +per grammar. Completeness has two parts, and the second is the hard one: + +1. **No escape applies to the current form.** A local trigger check. Easy. +2. **No FST-able ("normal") step reachable from the input can *create* a form where an escape + then applies.** This is **feeding** (Kiparsky): rule A feeds rule B if A builds B's + environment. If a normal step can feed an escape, the compiled automaton — which excluded + escapes — is **not closed**: a valid derivation exists that it has no path for, so its silence + is a false "done". Everything rests on ruling this out. + +### 9.1 Can closure be guaranteed? Decidably yes for the regular fragment; not universally + +The universal question ("can this grammar *ever* reach an escape configuration?") is **undecidable** +in the limit — general rewriting with non-regular escapes in a feeding loop is Turing-complete. So +"guaranteed for any grammar" is impossible. **But for a given grammar it is usually decidable**, and +when the answer is yes the automaton's silence becomes a theorem. Two mechanisms: + +- **(a) Decidable feeding-closure (the computable certificate).** Each escape `E` has a *trigger + set* `T_E` — the configurations where it fires. For a *regular* escape `T_E` is a regular + language. Each FST-able rule `F` is a regular relation. The question "can `F` ever produce a form + in `T_E`?" is exactly the **regular-language emptiness test** + + ``` + range(F restricted to FST-reachable forms) ∩ T_E = ∅ ? + ``` + + which **SIL.Machine's `Fst.Intersect` + a reachable-accepting-state check computes directly**. Run + it over every (FST-rule `F`, escape `E`) pair: + - **all intersections empty** ⇒ no normal step can ever feed an escape ⇒ the FST fragment is + **closed** ⇒ "no escape now, and no path in the automaton" is a *complete certificate* — the + sufficient "done"; + - **some intersection non-empty** ⇒ feeding is possible: if the fed escape is *regular*, fold it + into the automaton (Kaplan–Kay, §7-era reasoning) and re-check; if it is *non-regular/opaque*, + closure cannot be certified and those words must fall to the search backstop. + +- **(b) Stratal containment (the practical guarantee).** HC is stratal, and strata *bound* feeding. + If every escape is confined to a stratum the FST fragment never feeds *into* — e.g. + reduplication/templatic processes apply innermost, *before* FST-able affixation/phonology — then + by construction no later normal step can reach them. Verify by checking escape-rule strata against + FST-rule strata and the (downward) feeding direction. For most real grammars the "funny" + processes are exactly the innermost ones, so this holds. + +### 9.2 The per-grammar verdict + +| Situation | Is "no FST form ⇒ done" sufficient? | +|---|---| +| No FST-rule feeds any escape (∩ = ∅), **or** escapes stratally contained upstream | **Yes — provably.** The walk enumerates all paths; absence is a theorem; all homographs surface. | +| FST-rule feeds a **regular** escape | Fold the escape in → row above. | +| FST-rule feeds a **non-regular/opaque** escape | **No.** A valid derivation can hide from the surface; those words go to the bounded search. | + +### 9.3 Homographs (positive completeness) + +"Found one, are there others?" is the *easy* direction **once closure holds**: the walk returns +**all** accepting paths, never the first only (the spike already shows this — `dat` returns both +lexical entries). A homograph is missed only by (i) **unsafely determinizing/minimizing** and +merging paths — which is exactly why the analyzer stays nondeterministic and never `Minimize`s +unification arcs — or (ii) the compiler not encoding one decomposition (a closure failure), caught +by §9.5. + +### 9.4 The search backstop's own "done" + +For words that fall out of the FST (uncertifiable feeding to a non-regular escape), completeness +comes from the existing **bounded** search: "done" = all branches within the depth bound explored. +That is sound iff the bound is a *true* upper bound on derivation length — finite exactly when the +rule-interaction graph has **no unbounded self-feeding cycle**. A grammar with such a cycle has no +finite completeness guarantee from anyone (FST or search) and should be flagged. + +### 9.5 How we make it sufficient (the work) + +- **Static feeding-closure pass** (extends `GrammarFstAdvisor`): build the feeding graph — for each + FST-able rule and each escape, the `range(F) ∩ T_E` emptiness test via `Fst.Intersect` — and emit + a per-grammar verdict: **"closed — FST silence is a proof"** vs **"rule X feeds escape Y → those + words need the search backstop"**, plus the stratal-containment check as a fast pre-filter. +- **Corpus closure verification** (empirical backstop to the static proof): run the FST and the + sound+complete search engine over a corpus and assert the analysis **sets are identical** (same + cardinality and members) for every word, including ambiguous ones. Any divergence is a missing or + spurious path — a closure bug — localized to the offending rule. This converts "closed" from a + claim into a measured guarantee, and is the gate before an FST analyzer may *replace* (not just + shadow) the search engine for a grammar. + +### 9.6 Phase placement + +Closure confirmation is **Phase 3 (Tier-2 hybrid)** in §6: the static feeding-closure pass decides, +per grammar, which words the transducer is complete for and which escape to the search; the corpus +closure verification is the rollout gate. Until it passes for a grammar, the FST runs in +**shadow/verification mode** (alongside the search, asserting set parity), never as the sole +analyzer. + +## 10. Completeness under load — the eager/lazy partition knob (designed in from day one) + +Eagerly composing the whole grammar into one transducer is fastest to *walk* but the state count is +roughly **multiplicative across composed layers**, so a single high-branching layer (a position +class with hundreds of allomorphs, productive bounded compounding, a large affix inventory) can blow +the automaton up. We must be able to **bound the compiled size without ever sacrificing +completeness**. That requires a tunable partition — and because it changes correctness-adjacent +machinery, it has to be in the architecture from the start, not retrofitted. + +### 10.1 Three buckets + +Every construct lands in exactly one bucket, and the boundary between the first two is a **knob**: + +- **A — Precompiled (eager).** Composed into the static transducer ahead of time. Fastest walk; + costs states. +- **B — On-the-fly (lazy).** Kept as a separate composable layer and **applied at analysis time by + on-demand composition** against the partial result. Bounded memory; slower per word. Still + finite-state, still complete. +- **C — Search / probe fallback.** The non-FS escapes (and any construct whose closure can't be + certified, §9). The sound backstop. + +**What bucket C actually is (sharpened — see §11.3).** C is *not* a wide, murky middle that +"spans" A and B. Formally (Kaplan & Kay) everything concatenative — affixation, derivation, +inflection, ordered phonological rewrite rules — is a **regular relation**, hence A-or-B. The only +genuinely non-regular operations are a short list: **unbounded copying (reduplication)** and +**unbounded recursion** (productive compounding/incorporation with no depth bound), plus the rarer +**bracketing paradox**. So a C construct is a **thin, local, non-regular core wrapped in B on both +sides — `B ∘ C ∘ B`** — not a fog. That thinness is what makes the §11.3 release valves work: a +local core is **detectable and peelable**. Critically, *a construct missing from the FST is not +automatically C* — it is usually just **unbuilt B** (regular, simply not yet enumerated), which is +exactly what the Sena derivation gap turned out to be (§11.2). + +The **A↔B boundary is the knob**; **C is fixed by the §9 closure analysis, not the knob**. There is +always a safe floor setting — *everything in B* (nothing precompiled) — which is bounded in memory +and still complete; the knob only interpolates between "fast and big" (more A) and "small and slow" +(more B). The automaton can therefore never be forced to explode: when eager composition would +exceed a **state/memory budget**, the compiler demotes layers A→B until under budget. + +### 10.2 Why completeness is *independent of the knob* (the load-bearing guarantee) + +This is the property the knob must never break, and it holds for three composing reasons: + +1. **Composition is associative.** `(A ∘ B) ∘ rest ≡ A ∘ (B ∘ rest)`. Precompiling a layer versus + applying it lazily denotes the **same transduction** — the split point changes *when* the work + happens, never *which* relation is recognized. So moving a rule from A to B cannot add or drop a + single analysis. +2. **The walk enumerates all paths in either bucket.** A lazy layer expands *all* its applicable + arcs on demand (not the first), exactly as a baked-in layer would, so homograph/positive + completeness (§9.3) is preserved across the split. +3. **Closure (§9) is computed on the full relation `A ∘ B`, not on the precompiled subset.** The + feeding-closure certificate and the corpus set-parity gate validate the *whole* partition, so + "no path ⇒ done" stays a proof wherever the knob sits. + +Net: the knob is a pure **space/time dial**; the **analysis set is invariant** under it. That is why +it is safe to expose it (even to auto-tune it) without re-proving correctness each time. + +### 10.3 The knob's policy — and why it is per-language (yes, it would differ) + +The optimal A/B cut is grammar- and corpus-specific. Rank each candidate layer by two measurable +quantities: + +- **state-multiplier** — how much it grows the composed automaton (measure by composing it and + diffing the minimized state count); +- **hotness** — how often a corpus sample actually exercises it. + +Precompile (A) the **cheap-and-hot** layers; keep lazy (B) the **expensive-and-cold** ones; demote +A→B in descending cost/benefit until under the state budget. These quantities vary by language: a +language with one rarely-used 200-allomorph class should keep it lazy (precompiling multiplies the +whole automaton ×200 for little corpus payoff), while a language whose hot morphology is a handful of +low-branching affixes should precompile nearly everything. **So the same construct can be A in one +project and B in another** — the partition is a *pluggable policy* (with an optional auto-tuner that +reads the state-multiplier/hotness numbers), not a hard-coded rule. + +### 10.4 What "designed in from the beginning" demands + +- The compiler is a **pipeline of self-contained composable layers**, each carrying metadata + (state-multiplier, hotness, closure status), **not** a monolithic "compose everything." +- Each layer can be realized **either** as composed-in arcs (A) **or** as a lazy applicator (B) + behind one interface, so moving the knob is a config change, not a rewrite. +- The analyzer walks the **eager core and lazily expands B-layers on demand**, accumulating the same + `MorphToken` outputs (§8) regardless of bucket. +- A **state/memory budget** is a first-class compile input; exceeding it triggers automatic A→B + demotion (never a silent truncation — log what was demoted). +- The **corpus set-parity gate (§9.5) runs against the chosen partition**, so any A/B setting that is + shipped is verified complete before it can replace the search engine. + +### 10.5 Phase placement + +The layered, lazy-capable compiler and the budget/policy interface are **Phase 1–2 architecture** +(the spike's `FstMorpher` is already structured as discrete composable pieces — lexicon chains + +affix chains — rather than a monolith, which is the seed of this). The auto-tuner and per-project +policy tuning are **Phase 4 (productionize)**. The completeness invariant (§10.2) is an **invariant +checked at every phase**, not a phase of its own. + +## 11. Findings from the Sena drive (the corrected picture) + +This section records what the actual Sena implementation taught us, *correcting* earlier divergence +analysis that was measured against a broken baseline. Read it before §9/§10 are taken as final. + +### 11.1 The measurement bug that invalidated earlier divergence numbers + +The benchmark forced `Morpher.MaxUnapplications = 3` on the **search engine used as ground truth**. +But in HC `MaxUnapplications = 0` means **unlimited** (the cap engages only when `> 0`, +`AnalysisStratumRule.cs:144`). Setting it to `3` throttled the reference search down to **0–few +analyses per word**, so every "divergence" the FST showed against it was the FST disagreeing with a +*crippled* oracle — artifacts, not morphology bugs. **Lesson: always run the reference `Morpher` +with `MaxUnapplications = 0` (unlimited) when measuring FST parity.** A `=3` ground truth is +meaningless. + +With the corrected (unlimited) oracle: + +| corpus | FST template analyzer vs search | speed | +|---|---|---| +| curated 15 words | **IDENTICAL** (sound + complete) | 2.4 vs 177.8 ms/word (**~74×**) | +| broader 60 words | 12 real divergences (below) | 2.9 vs 245.5 ms/word (~85×) | + +So the FST approach is **already sound + complete on the regular fraction** it builds; the residual +is coverage and a verification subtlety, not a flaw in the "walk the forest" design. + +### 11.2 The two real residuals (neither is bucket C) + +The 12 genuine divergences split cleanly, and **both kinds are bucket B, not C**: + +- **Over-generation** (FST proposes readings search rejects — e.g. `kulemba` as `INF+[escrever]+IND`, + `mbalira`, `ndiende`, invalid agreement combos in `akudza`/`aikwata`). These are killed cleanly by + **verify-discard** (`VerifiedFstAnalyzer` / `FstReplay`): re-synthesize each candidate through the + proven engine and drop any that does not regenerate the surface. Re-synthesis enforces *every* HC + constraint at once (category, MPR, co-occurrence, obligatoriness) — so this is the "install all the + gates" mechanism, and it removed every over-generation in the corpus with no FST-encoded gate. + +- **Under-generation** (search has readings the FST never proposes — `aikhane`, `angwera`, `kunduli`, + `paoneke`, `khalani`, `cidzo`, `ikoyiwe`). **Every one is a derivational suffix the FST build + omits:** `REC` (reciprocal), `APPLIC` (applicative), `REV` (reversive), `NZR` (nominalizer), `NEU` + (neuter/stative), `PAS` (passive), `acção`. The build covers the *inflectional* layer (subject/ + object agreement + TAM) but not the *derivational* layer between root and inflection (e.g. + `[vencer]+REV+NZR`, `[cair]+APPLIC+IND`, `[ser]+REC+NZR`). This is **unbuilt B** — concatenative, + regular — not a non-regular gap. + + *Build-order wrinkle:* derivation reintroduces the surface-vs-derivation order problem. In + `kunduli = 10+[vencer]+REV+NZR`, the class-10 *prefix* is licensed only because `NZR` (a later + suffix) nominalized the stem — a left-to-right surface walk cannot gate that. **Resolution: build + permissively (propose the derivation paths) and let verify-discard kill the bad combos.** Do not + attempt to gate derivation order in the walk. + +- **Verify false-rejections** (`kubvuna`, `akhaona`, `nyabasa`, `ndalama`): verify-discard dropped + *valid* analyses it could not re-synthesize. This is **token under-determination** — the + `(op, morpheme)` token (§8) omits an allomorph or feature needed to regenerate the surface, so the + replay fails on a legitimate analysis. This — not reduplication — is the real "last nut" for a + *lossless fast path*, because it makes verify-discard lose true analyses. (The `SoundHybridMorpher` + fallback variant stays complete by routing any unconfirmable word to full search, at the cost of a + high fallback rate — 88% here — so it is correct but not yet fast.) + +### 11.3 Bucket C in the wild, and the release valves (does it even occur here?) + +**In *this* Sena grammar: there is no bucket C.** The grammar file has **0 reduplication rules** +(`grep reduplicat` = 0; all rules are `CopyFromInput` + `InsertSegments`, i.e. ordinary affixation), +the census reports **Tier 1 / FST-CLOSED / 0 escapes**, and compounding is bounded (8 rules). So the +slow path may never need to fire for Sena; the `HybridMorpher` total-reduplication route is a +never-triggered safety net here. + +**In general, genuine C does occur** — Bantu verb reduplication (`-famba-famba` "walk around"), +Indonesian/Malay full reduplication (`buku-buku` "books"), Tagalog aspect reduplication, and +bracketing paradoxes (English `un-happi-er`). For those, three resolution paths — and the key +insight that **for copy, detection and parsing are the same local problem** (a reduplicant is an +adjacent repeated substring; detecting it *is* finding the split): + +1. **Bounded fold into B (length-cap the copy).** Precompile reduplication for stems up to length + `N` — finite, therefore regular, therefore pure B. Cost is **linear** in `N×|stems|`, not + exponential. Stems longer than `N` (vanishingly rare) fall to the backstop. Best when copy shapes + are few. +2. **Detect-and-peel (compile-replace).** At parse time run a cheap repeat-scan that *proposes* + candidate reduplicant splits; strip the copy and hand the base to the B-FST; accept any split + whose base parses and whose copy relation holds. No precompile blow-up, handles unbounded copy, + and the live work is just the scan + peel — the heavy lifting stays in B. This is the "look for it + live as well" valve, and the standard finite-state-morphology answer (Beesley & Karttunen's + `compile-replace`). **Preferred.** +3. **2-way FST.** Replace the 1-way transducer with a two-way one for the reduplicative fragment — it + re-reads its input, so it *computes* the copy a 1-way FST cannot, while staying finite-state and + linear-time (Dolatian & Heinz, computing reduplication with 2-way FSTs). Cleanest in theory; + biggest lift (SIL.Machine's `Fst` is 1-way). + +**The A/B/C balance is computable, not guessed.** For each candidate C-feature, build the FST with +and without it folded and measure `Δ|states|`/`Δ|arcs|` (the precompile blow-up), and measure the +corpus frequency of words needing it; fold iff `Δmemory` fits the budget *and* `freq × slow_latency` +saved is worth it. This is the §10.3 knob made quantitative — a knapsack over the state-multiplier +and hotness numbers the layered compiler already exposes. + +**Theory load-bearing here** (attributed by idea; verify exact citations before quoting): rewrite +rules compose to regular relations (Kaplan & Kay 1994); reduplication is *the* canonical non-regular +morphological process; 2-way FSTs can compute it (Dolatian & Heinz); subregular locality (Chandlee) +explains why everything else is cheaply finite-state. + +### 11.5 Why re-synthesis verification failed — and why it is fixable (the confirmed root cause) + +The verify-discard mechanism (§11.2) leaned on `Morpher.GenerateWords` to confirm a candidate by +re-synthesis. A round-trip self-test exposed that **HC's own search analyses do not round-trip +through `GenerateWords`** for derivational/inflected *verb* forms (`aikhane`/`angwera`/`kunduli`/ +`ikoyiwe` → all NO), while *noun*/simple forms do (`kulemba`/`mbalira` → OK). A deep probe of +`aikhane` settled the cause — and it is **not** fundamental loss: + +- All its morphemes are plain `AffixProcessRule`; the analysis's `RealizationalFeatureStruct` is + empty (`ANY`). So it is not a realizational-FS reconstruction problem. +- Re-synthesis reproduced the surface under **none** of: all-morphemes-as-rules, non-realizational + only, empty FS, or the ground-truth FS. +- The grammar has **0 phonological rules**, so it is not opacity. + +The real cause is the **two synthesis doors** in `Morpher`: + +| Door | Input | Behavior | +|---|---|---| +| `Synthesize` (internal, used by `ParseWord`) | the **rich analysis `Word`** (stripped shape + exact template/slot structure + features, via `LexicalLookup`) | **faithful** — reproduces every valid analysis | +| `GenerateWords` (public convenience) | a **flat bag** of morphemes, re-permuted and applied as **free** morphological rules | **lossy** — re-guesses order/context, bypasses templates | + +Confirmed in the grammar: the inflectional affixes (`3P+2`, `SBJV`, …) are **template-slot rules** +(`mrule26+`, inside ``), while only compounding/derivation (`mrule1–25`) are free stratum +rules. `GenerateWords` applies the slot rules as free rules — no slot order, no obligatoriness, no +template gating — so feature-dependent verb combinations never synthesize, even from the exact right +morphemes. A simple noun + class-prefix (one slot, no interdependency) happens to survive, which is +why nouns round-trip and verbs do not. + +**The under-determination is therefore self-inflicted, not fundamental.** The FST *walk* knows +exactly which template and slots it traversed and in what order — it discarded that when it emitted +the lean `(op, morpheme)` token (§8). The fix is to **preserve the template/slot path the walk took +and verify through HC's faithful door** (`Synthesize`-style, template-aware directed synthesis), +rather than the flat `GenerateWords`. That makes verify both **sound and lossless**: a real +over-generation (e.g. an object marker on an intransitive stem) still fails HC's template-aware +synthesis and is dropped, while a valid verb form now confirms instead of being false-rejected. This +also collapses the 90% `SoundHybridMorpher` fallback (which was driven by false-rejection, not by +genuine over-generation). + +### 11.6 The measured corpus picture (200 Sena words, unlimited oracle) + +| analyzer | result | speed | +|---|---|---| +| search (oracle) | 480 analyses | 224 ms/word | +| raw FST template+derivation | 49/200 diverge (~24.5%): **~19% over-gen, ~7% under-gen** | 3.5 ms/word (**~64×**) | +| verify-discard (`GenerateWords`) | 48/200 — barely helps (the §11.5 lossy door) | 8.3 ms/word | +| sound fallback | 2/200 — near parity, but **90% fallback** (false-rejection driven) | — | + +Reading: completeness is *nearly* there (the derivation layer cut under-gen but ~7% remains — +category-changing derivation, §11.4 Part 2, and prefixal derivation). Over-gen (~19%) is the larger +axis and is what the template-aware verify (§11.5) must remove. The headline speed (~64×) is real; +the open work is making the *verified* path lossless so the fallback rate falls from 90% toward the +true over-gen rate. + +### 11.7 Status: correctness essentially done; speed is the one remaining lever + +A check of the `SoundHybridMorpher` path on the full 200-word corpus settles where the project is: +**both residual divergences (`miwiri`, `mitemo`) are `extra=[]` — pure under-generation, zero +over-generation.** So: + +- **Sound** — the hybrid never emits a wrong analysis (the fallback catches every over-gen). ✓ +- **~99% complete** — 198/200 exact set-match; 2 residual under-gen. ✓ +- **Not yet fast** — 90% fallback, so no net speedup *yet*. ⚠ + +Correctness is therefore effectively achieved. The single open axis is **speed**, and it has one +precise lever: the 90% fallback is driven by the **lossy `GenerateWords` verify false-rejecting valid +words** (§11.5), *not* by genuine errors. A lossless verify collapses the fallback toward zero and +unlocks the ~64× the raw FST already shows. + +### 11.8 The precise remaining build — a faithful (lossless) verify + +`GenerateWords` fails because it re-synthesizes from a **flat, permuted pool of rules**, losing the +**cross-stratum / template-slot ordering** that HC's internal `Synthesize` reads off the rich +analysis `Word`. Confirmed on `aikhane`: stem shape = root citation shape = `ikh` (so it is *not* a +stem-shape problem); its rules `a-5 -e -an` mix template-slot inflection (`a-5`, `-e`) with a +free derivational rule (`-an`, REC) that live in **different strata**, and the flat pool cannot +reconstruct the stratum order. The FST walk, by contrast, *knows* the stratum/template/slot/order it +traversed. + +**Caveat (measured):** `GenerateWords(WordAnalysis)` *permutes* the rule order — so it already tries +the correct order — and still fails. So the missing ingredient is **not** merely rule ordering; it is +state the rich analysis `Word` carries (syntactic features established during un-application) that a +from-citation synthesis does not re-establish. That makes a *cheap* faithful verify harder than +"apply the rules in the right order," and points to **two viable routes** (pick by measured payoff): + +- **Route A — faithful reconstruction verify.** Reconstruct enough of the rich analysis `Word` + (root + ordered rules + stratum/template/slot context the walk knows) to drive HC's internal + `Synthesize` rather than `GenerateWords`. Lossless if the reconstruction is faithful; the open risk + is whether the analysis-derived syntactic features are reconstructable from the walk's knowledge. +- **Route B — build-time constraint gates (make the FST faithful, no verify).** The over-generation + is concrete constraints — e.g. an object marker on an intransitive stem is a **subcategorization** + fact known at build time, hence order-independent and gateable like the existing category gate. + Encode the few over-gen-causing constraints on the FST arcs so it stops proposing them; then the + FST is faithful and needs no per-word verify. Cross-slot *feeding* constraints that are genuinely + not left-to-right gateable route to the search backstop (§9). + +Either route ends the same place: `VerifiedFstAnalyzer`/raw FST becomes sound *and* complete with +**near-zero fallback**, at full FST speed. + +**Decision: Route A** (chosen). Route B *duplicates* HC's constraint logic as a parallel set of FST +arc-gates that must be kept aligned with the real engine and debugged independently — a second +morphology engine, the anti-pattern this whole design avoids. Route A *reuses* HC: the constraints +stay where they already live and are already correct. + +**Route A, sharpened — "directed un-application, then `Synthesize`":** HC parsing is *search +backward* (the slow combinatorial un-application, ~10k clones/word) → *synthesize forward* to confirm +(cheap, ~2.7 ms). The FST replaces only the slow backward search — it already knows the exact path +(root + ordered rules + stratum/template). So the verify should: +1. **Directed un-application** — apply the analysis rules for *only the FST's chosen path* (no search + breadth) to the surface, producing HC's own rich analysis `Word` (with the syntactic features that + `GenerateWords`-from-citation never establishes — the §11.8 caveat). +2. **`Synthesize`** that rich `Word` through HC's existing machinery and check it matches the surface. + +Faithful by construction (HC's exact pipeline with the FST navigating instead of brute force), and +the cost is ~(rules in the path) × per-rule-apply rather than the full fan-out — the source of the +≥10×. The remaining engineering question is the cleanest way to drive HC's per-rule analysis +un-application from the FST token sequence (the rules are recoverable from the `(op, morpheme)` +tokens via the codec; the analysis-rule objects are `mrule.CompileAnalysisRule`). + +**DONE — Route A is implemented and works (the cleanest possible form).** HC's `Morpher` exposes +settable `LexEntrySelector`/`RuleSelector` (default `=> true`), checked at every analysis *and* +synthesis step. So the verify never reconstructs anything: it simply runs HC's own `AnalyzeWord` with +those selectors **pinned to the candidate's root and rules**, which prunes the combinatorial fan-out +to the single path the FST found. A candidate is valid iff it appears in that restricted result +(restriction can only remove paths, never fabricate one — HC still runs full synthesis + surface +match). Implemented in `FstReplay.Reproduces`; `VerifiedFstAnalyzer` keeps confirmed candidates and +discards the rest. **Measured (200 Sena words, unlimited oracle): verify-discard went from 48 → 14 +divergences (186/200 set-match) at 15.6 ms/word vs 234 ms/word oracle (~15×), with ALL +over-generation removed and zero false-rejection (lossless).** The 14 residual are pure +under-generation. This is the thin wrapper the design wanted — HC's real engine, navigated by the +FST, no reimplemented constraints. + +**Feasibility confirmed (why this works where `GenerateWords` fails).** `AnalysisAffixTemplateRule` +unifies the template's `RequiredSyntacticFeatureStruct` and **writes it onto the word** +(`outWord.SyntacticFeatureStruct.Add(fs)`, plus each slot rule's analysis sets its features). That +populated `SyntacticFeatureStruct` is the precondition the inflectional rules check during synthesis +— and is precisely what a from-citation `GenerateWords` never establishes (root citation form carries +bare features), which is why even the correct rule order fails there. Directed un-application calls +those *same* `CompileAnalysisRule` objects along the FST's path, so it reconstructs the populated +`SyntacticFeatureStruct` for free, then `Synthesize` succeeds. Reuse, not reimplementation. The build +applies the FST path's analysis rules (template + free derivation) to the surface `Word` — bounded by +the path, not the full search — yielding rich analysis `Word`(s) to hand to the existing `Synthesize`. + +### 11.4 The path to a full solution (what "done" means for Sena) + +1. ✅ **Re-validated the gates** built against the broken oracle: the `mbale` obligatoriness gate is + still load-bearing under the unlimited oracle (5→4 divergences); the category gate is faithful + build-time logic. +2. ✅ **Built the derivation layer** into the FST (§11.2 under-gen largely closed — `aikhane`/ + `angwera`/`paoneke`/`ikoyiwe` now proposed). +3. ✅ **Faithful (lossless) verify** (§11.8) — done via restricted re-analysis; sound + lossless at + ~15×, no fallback. `verify-discard` = 186/200 set-match (was 151 raw / 152 old verify). +4. ✅ **Category-changing derivation** — `DerivableToCategory` attaches a template over a derived + stem of its output category (verb + `NZR` → noun + class prefix), closing `kunduli`/`cidzo`/ + `khalani`. Took `verify-discard` from 14 → **6** divergences (194/200 set-match). +5. ⬜ **The last 6 (diverse proposer gaps, diminishing returns)** — all pure under-gen, all in the + *proposer*: **prefixal derivation** (`nyari` = `nominalizador`-prefix + `[ser]`; `cawo` associative), + **depth-3 derivation** (`miwiri` = `[ter]+PAS+APPLIC+NZR`; depth 3 gains it but ~2× verify cost, so + left to the backstop), and **copula/TAM** constructions (`ndico`/`ndimwe` = `é+[ele]`/`é+[vós]`; + `kuumadi` = `INF+…+IND+EVID`). Each is a small proposer-coverage item; a prefixal derivation layer + (mirror of the suffix layer) would close the first two. +6. **Target metric:** FST analyses == search analyses (set parity), at ≥10×. **Achieved: sound ✓, + lossless verify ✓, ~13× ✓ (17.2 ms/word vs 237 ms oracle), no fallback ✓, 194/200 (97%) + set-match.** The last 6 are diverse proposer coverage gaps, not a verify or soundness issue. + +### 11.9 Metric correctness and two productionization caveats + +**The parity signature was sharpened (important).** It was `join(morpheme.Id) + ":" + rootIndex`, but +affix `Morpheme.Id` is empty in this grammar, so it encoded only *(morpheme count, root position)* — +collapsing distinct affixes of the same shape (e.g. subject markers `3P+2` / `3S+1` / `6`) into one +key and hiding same-shape under-generation. Replaced with **per-morpheme object identity** (both +analyzers reference the same `Morpheme` instances from the `Language`, so it is a faithful shared +discriminator). Under the strict signature the raw-FST divergences rose 44 → 90 (shape-parity *had* +been hiding raw over-gen), but **`verify-discard` stayed at 6 (194/200), all pure under-gen** — i.e. +the verify result is robust to the metric and the soundness/lossless claim is real, not a shape +artifact. `FstReplay`'s candidate-match signature was sharpened the same way. + +**Caveat 1 — the verify mutates shared `Morpher` selectors (thread-safety).** `FstReplay` sets +`LexEntrySelector`/`RuleSelector` on the morpher with try/finally restore — correct sequentially, but +two words verified concurrently on one morpher would race the selectors. Since a core motivation is +lifting the parallel ceiling, production must give the verify a **per-thread morpher or a morpher +pool** (the analysis FST walk itself is allocation-light and parallel-friendly; only the verify step +carries this constraint). + +**Caveat 2 — the ~13× is vs the unlimited-unapplication oracle** (`MaxUnapplications=0`, 237 ms/word). +That is the sound+complete baseline (the only correct one — §11.1), and is what the FST must match. +If production HC runs a *bounded* cap for speed, it trades completeness for time, so the real-world +multiple against that configuration should be sanity-checked separately before quoting a single +headline number. + +## 12. The completeness certificate — a grammar-level proof (not per-word) + +Completeness is not a per-word heuristic; it is a **property of the grammar's rule structure**, +certified once. The contract is two exhaustive enumerators joined at a cut no derivation can cross: + +- **Side B (precompute / FST) is complete** because the regular sub-relation is a *finite automaton*: + by Myhill–Nerode it has finitely many states, and walking **all** accepting paths enumerates **all** + analyses — "enumerated absolutely everything," mechanically. (Never `Minimize` underspecified-feature + arcs: that merges distinct paths and destroys the guarantee — §9.3.) +- **Side A (live) is complete** iff (1) it tries *every applicable rule* at each node (HC's `RuleBatch` + does), and (2) a **well-founded measure** strictly decreases each step (un-application shortens the + surface; or a stratum/depth bound), so the finite search tree is fully visited. This is "I check + these N things, then I'm done." + +### 12.1 Why two complete halves can still miss — and the cut that fixes it + +If a derivation **weaves** across the boundary (`A→B→A→B`), B enumerates only B-internal paths and A +only A-internal paths, so the interleaving is **silently missed** even though each half is complete. +The fix is a **clean directed cut**: every feeding edge crosses the boundary in *one* direction. Inner +morphology feeds outer (the inner stem is what an outer affix attaches to), never the reverse — so put +**A = inner, B = outer**. Then every derivation factors uniquely as `(A-core) ∘ (B-shell)`: analysis +peels the B-shell with the FST (all ways) and hands each residual stem to A (all ways); the composition +is provably the whole analysis set. No weaving ⇒ no gap. + +### 12.2 The graph theory of a valid cut + +Model the grammar as a **feeding graph** `G` (nodes = rule/construct classes; edge `r→s` iff `r` can +create the environment `s` needs — Kiparsky feeding). + +1. Condense strongly-connected components (Tarjan) → a DAG of SCCs (an SCC = mutually-feeding rules, + i.e. a potential cycle). +2. A **valid cut** is a downward-closed set in the DAG's topological order — a *topological separator* + with all cross-edges pointing `A→B`. (HC's strata are a hand-built such stratification.) +3. Two further obligations: the **B-side relation must be regular** (Kaplan–Kay: concatenation + + ordered rewrite = regular), and every **SCC kept in A must be well-founded** (no unbounded-growth + cycle — bounded copy ok, unbounded copy not). + +A grammar admitting such a cut with B regular and A well-founded has, by construction, +`A-complete ∧ B-complete ⇒ whole-complete`. **This is the certificate, computed on the grammar.** + +### 12.3 The construct-coverage half (why "FST-closed" is necessary but not sufficient) + +`GrammarFstClosure` / the census already certify the *regularity / no-escape* half (the B-side relation +is regular; for Sena, 0 escapes). That is necessary but **not** sufficient: the FST must also actually +**enumerate every construct on the B-side**. A regular construct the builder never emits is a +*hole inside B* — a silent under-generation, not a boundary problem. So the certificate has two +mechanical checks: + +- **Closure** — the B-side is regular / no un-handled escape (existing `GrammarFstClosure`). +- **Coverage** — every grammar construct on the B-side (every affix rule in a template slot or as a + standalone morphological rule, every compounding rule, every root) is represented on some FST arc. + +`Closure ∧ Coverage` over the cut ⇒ the FST enumerates the entire B-relation ⇒ **complete for every +word** with no per-word check. If coverage fails, the certificate **names the uncovered constructs** +and the build is *flagged* (those derivations route to the proven engine) — never a silent miss. + +### 12.4 Sena under the certificate + +Census: 0 escapes, 0 reduplication, 0 phonological rules → the entire feeding graph is regular, with no +non-regular SCC. So the unique maximal valid cut is **A = ∅, B = everything**: Sena is provably +completable *entirely* in the FST, with **no live side needed**. The residual divergences are therefore +not a cut/soundness issue — they are **coverage holes** (constructs the builder omits: prefixal +derivation, depth-3 derivation chains, copula/compounding). The certificate's job is to (a) confirm +`A = ∅` and (b) list exactly those holes, turning "97% empirically" into "complete once coverage = 100%, +and known-incomplete-where-flagged until then." + +### 12.5 Why this does not balloon (size rationale) + +B is an **automaton with shared structure**, not a stored list of words: size ≈ `|lexicon trie| + +|affix inventory × template structure|` — **additive**, not the multiplicative `|roots| × |affix +combinations|` of a materialized word list. Measured on Sena: **50,673 states from 1,463 root +allomorphs + 24 templates**, sub-second build, a few MB. "Enumerate everything" means *walk all paths +at parse time*, not *materialize the cross-product at build time*. The genuine blow-up risks — eager +composition+determinization across layers, high-branching position classes, productive deep +compounding/reduplication — are bounded by the **§10 eager/lazy partition knob + state budget**, which +auto-demotes expensive layers from precompiled (A-eager) to on-the-fly (B-lazy). **Completeness is +invariant under the knob** (composition associativity: precompiling vs applying lazily denote the same +relation), so the size dial never drops an analysis; worst case "everything lazy" is bounded memory, +slower per word, still provably complete. + +### 12.6 Implementation and proof (built + stress-tested) + +Implemented: +- `FstCompletenessCertificate.Certify(language, codec)` → `FstCompletenessReport`: the closure half + (`GrammarFstClosure`) + the coverage half (every affix rule emitted by the FST, read from the codec's + covered-morpheme set), plus the compounding-rule count. `IsCertified` = closed ∧ all affixes covered + ∧ no compounding. It **names the uncovered constructs** when it fails. +- `FstTemplateAnalyzer.CoversAnalysis(WordAnalysis)`: the sound structural predicate of what the FST + provably enumerates — single root (no compounding), every morpheme covered, ≤ `DerivDepth` + derivational affixes per side, **and the canonical morph order** `[infl-prefix][deriv-prefix][root] + [deriv-suffix][infl-suffix]`. (The stress test forced each of these: depth, compounding, and order + were all discovered as required constraints by analyses that broke a weaker predicate.) +- `CompleteHybridMorpher`: the provably-complete analyzer. Certified grammar → the fast verified FST + (complete by §12.3); else → the search engine (complete; the known slow path). Completeness is by + construction, decided by the grammar-level certificate — **no per-word heuristic.** + +**Certification is the EMPIRICAL set-parity gate, not the static coverage check.** A first attempt +made `IsCertified` = closed ∧ all-affixes-covered ∧ no-compounding. A stress test exposed this as +**unsound**: `cawo = coisa + d'eles` has every morpheme covered yet the FST cannot build it (a prefix +on a pronoun root that takes no template), so a grammar could pass the static check and still silently +drop `cawo`-type words — precisely the forbidden failure. Rule/symbol coverage is **necessary, not +sufficient**; completeness is about *paths (attachments)*, not symbols present. So the static check is +demoted to a fast **pre-filter / gap-namer** (`PreFilterPasses`), and the real gate is +`FstCompletenessCertificate.CertifyEmpirically` — **FST analyses == search analyses (morpheme-identity +set parity) over a representative corpus** (§9.5). It is path-level, so it catches `cawo`. + +**Proof (stress test `Prove_CertificateCompleteness`, 200 hard Sena words, unlimited oracle):** +- *FST path tested directly* (non-vacuous): the FST itself produces **467/480** search analyses; 13 it + misses route to the engine. +- *Static check shown unsound*: **1** analysis (`cawo`) is "in-class" by the static predicate yet + missed by the FST — the concrete witness that coverage ⇏ completeness. +- *Empirical gate*: Sena is **NOT certified** (5 divergent words), so `CompleteHybridMorpher` routes to + the engine; **complete-system misses = 0** — every true analysis is returned. + +**What the stress test taught (the key result).** A *predictive per-analysis* coverage predicate is +whack-a-mole (it broke on derivation depth, then morph order, then the template-less prefix `cawo`), +and even grammar-level *symbol* coverage is unsound. **Soundness rests on the empirical set-parity gate ++ engine backstop**, not any static predicate: certified (set parity holds) ⇒ FST-only is evidence- +backed complete; uncertified ⇒ the engine guarantees completeness, and the gate names exactly which +words still diverge. The system is **100% complete today** (0 misses, via the engine for the 5 +divergent words), and the path to FST-only speed is to drive those divergences to 0 (build the 3 +remaining prefixes, compounding, deeper derivation, template-less prefixation) until the grammar +certifies — never at the cost of a silent miss. + +## 13. The two-path caching analyzer (fast + slow, the shipped front end) + +The FST fast path is **sound but not guaranteed complete** — it answers *"does this have at least one +FST-findable valid analysis?"* (a trustworthy *yes*-detector for "is this a word", never the complete +analysis set, and able to false-negative on words whose only readings use un-built constructs, e.g. a +pure compound). On its own that is not safe for a consumer that needs all readings. The shipped design +pairs it with the proven engine behind a cache: + +- **Slow path = truth, cached.** HC's search engine is complete; its result per word is stored in + `AnalysisCache`. For a fixed corpus the cache is **warmed** (in the background, in parallel) until + every word has its complete analysis — after which queries are fast *and* complete. +- **Fast path = immediate, provisional.** The verified FST answers instantly on a cache miss; its + result is flagged provisional (`FastAnalysisResult.IsComplete == false`). +- **Default is guaranteed (backwards-compatible).** `CachingMorphologicalAnalyzer.AnalyzeWord` returns + the cached complete analyses, or computes them with the engine on a miss and caches them. Existing + callers get the same analyses as before — faster once warm, never wrong. +- **Fast is opt-in.** `AnalyzeWordFast` returns the cached complete set if warm, else the provisional + FST result, and never runs the slow engine. Applications (FieldWorks) can show the fast result now + and the authoritative result once cached, querying both. +- **Persistence (fixed corpora across sessions).** `AnalysisCacheSerializer` writes/reads the cache as + text, keying morphemes by `MorphemeRegistry` (a deterministic morpheme↔key map rebuilt from the + grammar) and guarding with a **grammar-version** string — a cache built against a different grammar + is rejected, forcing a re-warm (the one way this design could otherwise serve stale, unsound + analyses). Confirmed non-words (empty analysis) are cached too, so they are not recomputed. + +Net: correctness equals the engine (the cache never invents or hides an analysis), the FST removes the +cold-start latency, and a warmed fixed corpus resolves every word fast and complete. The FST's +incompleteness — including the "is this a word" false-negative — is corrected the moment a word's +complete analysis lands in the cache. diff --git a/src/SIL.Machine.Morphology.HermitCrab/AffixTemplate.cs b/src/SIL.Machine.Morphology.HermitCrab/AffixTemplate.cs index 02e16e8e1..ebbb8d061 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/AffixTemplate.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/AffixTemplate.cs @@ -76,12 +76,12 @@ public Stratum Stratum } } - public override IRule CompileAnalysisRule(Morpher morpher) + public override IRule CompileAnalysisRule(Morpher morpher) { return new AnalysisAffixTemplateRule(morpher, this); } - public override IRule CompileSynthesisRule(Morpher morpher) + public override IRule CompileSynthesisRule(Morpher morpher) { return new SynthesisAffixTemplateRule(morpher, this); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/AllomorphEnvironment.cs b/src/SIL.Machine.Morphology.HermitCrab/AllomorphEnvironment.cs index c1ac4f768..a08a76099 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/AllomorphEnvironment.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/AllomorphEnvironment.cs @@ -12,16 +12,12 @@ namespace SIL.Machine.Morphology.HermitCrab public class AllomorphEnvironment : IEquatable { private readonly ConstraintType _type; - private readonly Pattern _leftEnv; - private readonly Matcher _leftEnvMatcher; - private readonly Pattern _rightEnv; - private readonly Matcher _rightEnvMatcher; - - public AllomorphEnvironment( - ConstraintType type, - Pattern leftEnv, - Pattern rightEnv - ) + private readonly Pattern _leftEnv; + private readonly Matcher _leftEnvMatcher; + private readonly Pattern _rightEnv; + private readonly Matcher _rightEnvMatcher; + + public AllomorphEnvironment(ConstraintType type, Pattern leftEnv, Pattern rightEnv) { _type = type; if (leftEnv != null && !leftEnv.IsLeaf) @@ -29,9 +25,9 @@ Pattern rightEnv if (!leftEnv.IsFrozen) throw new ArgumentException("The pattern is not frozen.", "leftEnv"); _leftEnv = leftEnv; - _leftEnvMatcher = new Matcher( + _leftEnvMatcher = new Matcher( leftEnv, - new MatcherSettings + new MatcherSettings { AnchoredToStart = true, Direction = Direction.RightToLeft, @@ -47,9 +43,9 @@ Pattern rightEnv if (!rightEnv.IsFrozen) throw new ArgumentException("The pattern is not frozen.", "rightEnv"); _rightEnv = rightEnv; - _rightEnvMatcher = new Matcher( + _rightEnvMatcher = new Matcher( rightEnv, - new MatcherSettings + new MatcherSettings { AnchoredToStart = true, Filter = ann => @@ -68,12 +64,12 @@ public ConstraintType Type public string Name { get; set; } - public Pattern LeftEnvironment + public Pattern LeftEnvironment { get { return _leftEnv; } } - public Pattern RightEnvironment + public Pattern RightEnvironment { get { return _rightEnv; } } @@ -87,10 +83,24 @@ public bool IsWordValid(Word word, Annotation morph) private bool IsMatch(Word word, Annotation morph) { - if (_leftEnvMatcher != null && !_leftEnvMatcher.IsMatch(word, morph.Range.Start.Prev)) + // RUSTIFY Stage 2: the env matchers are Matcher; pass the bracketing node's + // direction-aware start offset (left env matches RtL, right env LtR — see the ctor). + if ( + _leftEnvMatcher != null + && !_leftEnvMatcher.IsMatch( + word, + word.Shape.MatchStartOffset(morph.Range.Start.Prev, Direction.RightToLeft) + ) + ) return false; - if (_rightEnvMatcher != null && !_rightEnvMatcher.IsMatch(word, morph.Range.End.Next)) + if ( + _rightEnvMatcher != null + && !_rightEnvMatcher.IsMatch( + word, + word.Shape.MatchStartOffset(morph.Range.End.Next, Direction.LeftToRight) + ) + ) return false; return true; diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisAffixTemplateRule.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisAffixTemplateRule.cs index 6331e2995..f401ce0fa 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/AnalysisAffixTemplateRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisAffixTemplateRule.cs @@ -1,29 +1,27 @@ -using System.Collections.Generic; +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; using System.Linq; +using System.Threading.Tasks; using SIL.Machine.Annotations; using SIL.Machine.FeatureModel; using SIL.Machine.Rules; using SIL.ObjectModel; -#if !SINGLE_THREADED -using System; -using System.Collections.Concurrent; -using System.Threading.Tasks; -#endif namespace SIL.Machine.Morphology.HermitCrab { - internal class AnalysisAffixTemplateRule : IRule + internal class AnalysisAffixTemplateRule : IRule { private readonly Morpher _morpher; private readonly AffixTemplate _template; - private readonly List> _rules; + private readonly List> _rules; public AnalysisAffixTemplateRule(Morpher morpher, AffixTemplate template) { _morpher = morpher; _template = template; - _rules = new List>( - template.Slots.Select(slot => new RuleBatch( + _rules = new List>( + template.Slots.Select(slot => new RuleBatch( slot.Rules.Select(mr => mr.CompileAnalysisRule(morpher)), false, FreezableEqualityComparer.Default @@ -47,18 +45,24 @@ public IEnumerable Apply(Word input) inWord.Freeze(); var output = new HashSet(FreezableEqualityComparer.Default); -#if SINGLE_THREADED - ApplySlots(inWord, _rules.Count - 1, output); -#else - ParallelApplySlots(inWord, output); -#endif + if (_morpher.MaxDegreeOfParallelism == 1) + ApplySlots(inWord, _rules.Count - 1, output); + else + ParallelApplySlots(inWord, output); foreach (Word outWord in output) - outWord.SyntacticFeatureStruct.Add(fs); + { + // Clone-then-reassign, not an in-place mutation: outWord may already be frozen (it + // came out of the rule cascade above), and a frozen FeatureStruct must not be mutated + // in place — a future memoized/shared result instance would otherwise leak this edit + // into every branch that shares it. + FeatureStruct sfs = outWord.SyntacticFeatureStruct.Clone(); + sfs.Add(fs); + outWord.SyntacticFeatureStruct = sfs; + } return output; } -#if SINGLE_THREADED private void ApplySlots(Word inWord, int index, HashSet output) { for (int i = index; i >= 0; i--) @@ -78,9 +82,10 @@ private void ApplySlots(Word inWord, int index, HashSet output) _morpher.TraceManager.EndUnapplyTemplate(_template, inWord, true); output.Add(inWord); } -#else + private void ParallelApplySlots(Word inWord, HashSet output) { + var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = _morpher.MaxDegreeOfParallelism }; var outStack = new ConcurrentStack(); var from = new ConcurrentStack>(); from.Push(Tuple.Create(inWord, _rules.Count - 1)); @@ -90,6 +95,7 @@ private void ParallelApplySlots(Word inWord, HashSet output) to.Clear(); Parallel.ForEach( from, + parallelOptions, work => { bool add = true; @@ -126,6 +132,5 @@ private void ParallelApplySlots(Word inWord, HashSet output) output.UnionWith(outStack); } -#endif } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisLanguageRule.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisLanguageRule.cs index b4673ca55..4bdd3c959 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/AnalysisLanguageRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisLanguageRule.cs @@ -6,11 +6,11 @@ namespace SIL.Machine.Morphology.HermitCrab { - internal class AnalysisLanguageRule : IRule + internal class AnalysisLanguageRule : IRule { private readonly Morpher _morpher; private readonly List _strata; - private readonly List> _rules; + private readonly List> _rules; public AnalysisLanguageRule(Morpher morpher, Language language) { diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisStratumRule.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisStratumRule.cs index 36d9557ad..aadef0838 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/AnalysisStratumRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisStratumRule.cs @@ -8,11 +8,11 @@ namespace SIL.Machine.Morphology.HermitCrab { - internal class AnalysisStratumRule : IRule + internal class AnalysisStratumRule : IRule { - private readonly IRule _mrulesRule; - private readonly IRule _prulesRule; - private readonly IRule _templatesRule; + private readonly IRule _mrulesRule; + private readonly IRule _prulesRule; + private readonly IRule _templatesRule; private readonly Stratum _stratum; private readonly Morpher _morpher; @@ -20,16 +20,16 @@ public AnalysisStratumRule(Morpher morpher, Stratum stratum) { _stratum = stratum; _morpher = morpher; - _prulesRule = new LinearRuleCascade( + _prulesRule = new LinearRuleCascade( stratum.PhonologicalRules.Select(prule => CompilePhonologicalRule(prule, morpher)).Reverse() ); - _templatesRule = new RuleBatch( + _templatesRule = new RuleBatch( stratum.AffixTemplates.Select(template => CompileAffixTemplate(template, morpher)), false, FreezableEqualityComparer.Default ); _mrulesRule = null; - IEnumerable> mrules = stratum + IEnumerable> mrules = stratum .MorphologicalRules.Select(mrule => CompileMorphologicalRule(mrule, morpher)) .Reverse(); switch (stratum.MorphologicalRuleOrder) @@ -39,31 +39,38 @@ public AnalysisStratumRule(Morpher morpher, Stratum stratum) // because morphological rules should be considered optional // during unapplication (they are obligatory during application, // but we don't know they have been applied during unapplication). - _mrulesRule = new PermutationRuleCascade( + _mrulesRule = new PermutationRuleCascade( mrules, true, FreezableEqualityComparer.Default ); break; case MorphologicalRuleOrder.Unordered: -#if SINGLE_THREADED - _mrulesRule = new CombinationRuleCascade( - mrules, - true, - FreezableEqualityComparer.Default - ); -#else - _mrulesRule = new ParallelCombinationRuleCascade( - mrules, - true, - FreezableEqualityComparer.Default - ); -#endif + // Single-threaded when the caller caps within-word parallelism (e.g. it + // parallelizes across words itself); parallel cascade otherwise. + _mrulesRule = + morpher.MaxDegreeOfParallelism == 1 + ? (IRule) + new CombinationRuleCascade( + mrules, + true, + FreezableEqualityComparer.Default + ) + : new ParallelCombinationRuleCascade( + mrules, + true, + FreezableEqualityComparer.Default + ) + { + // Honor the within-word parallelism cap rather than running at + // the default (effectively unbounded) scheduler degree. + MaxDegreeOfParallelism = morpher.MaxDegreeOfParallelism, + }; break; } } - private IRule CompileAffixTemplate(AffixTemplate template, Morpher morpher) + private IRule CompileAffixTemplate(AffixTemplate template, Morpher morpher) { try { @@ -75,7 +82,7 @@ private IRule CompileAffixTemplate(AffixTemplate template, Morp } } - private IRule CompileMorphologicalRule(IMorphologicalRule mrule, Morpher morpher) + private IRule CompileMorphologicalRule(IMorphologicalRule mrule, Morpher morpher) { try { @@ -87,7 +94,7 @@ private IRule CompileMorphologicalRule(IMorphologicalRule mrule } } - private IRule CompilePhonologicalRule(IPhonologicalRule prule, Morpher morpher) + private IRule CompilePhonologicalRule(IPhonologicalRule prule, Morpher morpher) { try { @@ -149,7 +156,7 @@ public IEnumerable Apply(Word input) private IEnumerable ApplyMorphologicalRules(Word input) { - foreach (Word mruleOutWord in _mrulesRule.Apply(input).Distinct(FreezableEqualityComparer.Default)) + foreach (Word mruleOutWord in _mrulesRule.Apply(input)) { switch (_stratum.MorphologicalRuleOrder) { @@ -168,7 +175,7 @@ private IEnumerable ApplyMorphologicalRules(Word input) private IEnumerable ApplyTemplates(Word input) { - foreach (Word tempOutWord in _templatesRule.Apply(input).Distinct(FreezableEqualityComparer.Default)) + foreach (Word tempOutWord in _templatesRule.Apply(input)) { switch (_stratum.MorphologicalRuleOrder) { diff --git a/src/SIL.Machine.Morphology.HermitCrab/ComposedPhonologyProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/ComposedPhonologyProposer.cs new file mode 100644 index 000000000..7a0af2ad9 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/ComposedPhonologyProposer.cs @@ -0,0 +1,91 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Annotations; +using SIL.Machine.Morphology; +using SIL.Machine.Rules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Point 4 (C-exact phonology) by composition with HC's phonology inverse + /// (FST_FULL_PLAN.md). Un-applies the grammar's phonological rules to the surface — reusing each + /// stratum's , exactly the rules + /// runs (surface stratum first, rules reversed within a stratum) — + /// to recover the underlying form, then walks the underlying-arc morphotactic FST on it + /// (). That is literally phonology⁻¹ ∘ morphotactics. + /// + /// Because the inverse is applied to the assembled surface, this covers a large share of + /// bounded phonology — including the cross-boundary, stem-conditioned alternations the + /// per-morpheme precompile (Point 1) cannot see. The un-applied shape carries under-specified + /// nodes (analysis is non-deterministic), which the unification walk matches against every + /// compatible arc; verify prunes the spurious ones, so it stays a sound superset. An unbounded + /// self-feeding cycle is not a regular relation and simply will not be found this way — those + /// words stay unparsed by design, not incorrectly parsed. + /// + /// Thread-safe. The inverse cascade is compiled once against a private + /// with its own (not the caller's), and each + /// applies it to a fresh local — no per-call mutation of shared state — so the + /// composite stays safe on the parallel path supports. + /// + public class ComposedPhonologyProposer : IConstructProposer + { + private static readonly MorphOp[] _ops = new MorphOp[0]; + private readonly FstTemplateAnalyzer _fst; + private readonly Stratum _surfaceStratum; + private readonly CharacterDefinitionTable _table; + private readonly LinearRuleCascade _inverse; + private readonly bool _hasPhonology; + + public ComposedPhonologyProposer(Language language, FstTemplateAnalyzer fst) + { + _fst = fst; + _surfaceStratum = language.SurfaceStratum; + _table = language.SurfaceStratum.CharacterDefinitionTable; + // Compile against a private Morpher with its own TraceManager — the analysis rules read + // _morpher.TraceManager (and the morpher's selectors), so this proposer must not share the + // factory's morpher (mirrors MorpherPool giving each rented morpher its own TraceManager). + var morpher = new Morpher(new TraceManager(), language); + // Inverse order mirrors AnalysisLanguageRule/AnalysisStratumRule: strata surface→inner, and + // within each stratum the synthesis rules are un-applied in reverse application order. + var rules = new List>(); + foreach (Stratum stratum in language.Strata.Reverse()) + { + foreach (IPhonologicalRule prule in stratum.PhonologicalRules.Reverse()) + { + rules.Add(prule.CompileAnalysisRule(morpher)); + } + } + _hasPhonology = rules.Count > 0; + _inverse = new LinearRuleCascade(rules); + } + + /// Phonology completeness is not a per-construct MorphOp, so this covers none; its value + /// is validated empirically by the parity gate. + public IReadOnlyCollection CoveredOps => _ops; + + public IEnumerable AnalyzeWord(string word) + { + if (!_hasPhonology) + { + yield break; // no phonology ⇒ the bare FST proposer already covers everything + } + Shape shape; + try + { + shape = _table.Segment(word); + } + catch (InvalidShapeException) + { + yield break; + } + // Un-apply phonology in place (the cascade mutates the word's shape, as AnalysisStratumRule + // relies on); the resulting under-specified shape is the underlying form to walk. + var inverseWord = new Word(_surfaceStratum, shape); + _inverse.Apply(inverseWord).ToList(); + foreach (WordAnalysis candidate in _fst.AnalyzeShape(inverseWord.Shape)) + { + yield return candidate; + } + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs new file mode 100644 index 000000000..30bb869fc --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs @@ -0,0 +1,134 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Morphology; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Unions the candidate sets of several proposers into one (FST_FULL_PLAN.md). The FST proposer + /// () covers the regular bulk; sibling generators + /// (, ) contribute candidates for + /// constructs the FST skips. Every candidate still flows through the single + /// verify gate, so the composite is sound by the same argument as + /// each part — a generator that over-generates has its junk pruned, one that under-generates only + /// loses acceleration (the parity gate falls those words back to the engine). + /// + /// Candidates are deduped by signature before they leave the composite: when two generators + /// (or a generator and the FST) propose the same morpheme set, verify would otherwise confirm it + /// twice and emit a duplicate analysis. The signature is order-sensitive morpheme identity + root + /// index, mirroring 's match semantics. + /// + /// aggregates coverage at the MorphOp level: the FST's uncovered + /// ops minus the ops the sibling generators cover. It is a cheap, build-time coverage diagnostic — + /// which construct types this composite claims to handle at all — not a soundness or completeness + /// guarantee; a generator that covers an op only partially still under-generates on some words for + /// that op (verify + the probe's own gap diagnostics are what actually reveal that). + /// + public class CompositeProposer : IMorphologicalAnalyzer + { + private readonly IReadOnlyList _proposers; + private readonly bool _coversAllConstructs; + + public CompositeProposer(FstTemplateAnalyzer fst, params IConstructProposer[] generators) + { + var proposers = new List { fst }; + var covered = new HashSet(); + foreach (IConstructProposer generator in generators) + { + proposers.Add(generator); + foreach (MorphOp op in generator.CoveredOps) + { + covered.Add(op); + } + } + _proposers = proposers; + _coversAllConstructs = fst.UncoveredOps.All(covered.Contains); + } + + /// The standard production proposer: the FST plus the reduplication and infix + /// generators, the phonology-composition proposer (Point 4, all bounded phonology including + /// cross-boundary), and the lockstep-composition phonology proposer (LEVER_2.md/ + /// FST_FAST_PATH_PLAN.md Phase 3 — the lexicon-constrained auto-compiled Pinv walk; narrower + /// coverage than the composition proposer today, additive not a replacement, see + /// ). For a grammar without a given construct the + /// corresponding generator is inert (it holds no rules and yields nothing — both phonology + /// proposers short-circuit when the grammar has no matching phonological rules), so this adds + /// near-zero overhead and does not change behavior; that is why the factories wire it + /// unconditionally rather than as an opt-in. + /// + /// (opt-in, default off) adds the + /// — a build-time root × affix-combo synthesis precompile + /// that covers boundary-conditioned morphophonemics (e.g. Indonesian meN- nasal substitution) the + /// inverse-based phonology proposer cannot. It is opt-in because its build cost grows with + /// lexicon × affix permutations: appropriate for bounded-affixation grammars / fixed corpora, not + /// for heavily-inflecting templatic systems. bounds the combo + /// depth. + public static CompositeProposer ForLanguage( + Language language, + FstTemplateAnalyzer fst, + bool forwardSynthesis = false, + int maxAffixes = 2 + ) + { + var generators = new List + { + new ReduplicationProposer(language, fst), + new InfixProposer(language, fst), + new ComposedPhonologyProposer(language, fst), + new LockstepPhonologyProposer(language, new Morpher(new TraceManager(), language)), + }; + if (forwardSynthesis) + { + generators.Insert( + 0, + new ForwardSynthesisProposer(language, new Morpher(new TraceManager(), language), maxAffixes) + ); + } + return new CompositeProposer(fst, generators.ToArray()); + } + + /// True iff every construct the FST proposer skipped is claimed by a sibling generator. + /// A coverage diagnostic only — see class remarks. + public bool CoversAllConstructs => _coversAllConstructs; + + public IEnumerable AnalyzeWord(string word) + { + var ids = new Dictionary(); + var seen = new HashSet(); + foreach (IMorphologicalAnalyzer proposer in _proposers) + { + foreach (WordAnalysis candidate in proposer.AnalyzeWord(word)) + { + if (seen.Add(Signature(candidate, ids))) + { + yield return candidate; + } + } + } + } + + /// Order-sensitive morpheme-identity signature (same scheme as ). + private static string Signature(WordAnalysis analysis, Dictionary ids) + { + return string.Join("+", analysis.Morphemes.Select(m => Id(m, ids))) + ":" + analysis.RootMorphemeIndex; + } + + private static int Id(IMorpheme morpheme, Dictionary ids) + { + if (!ids.TryGetValue(morpheme, out int id)) + { + id = ids.Count; + ids[morpheme] = id; + } + return id; + } + } + + /// A candidate generator for a specific non-FST construct (reduplication, infixation). It + /// proposes a sound superset for that construct and declares which s it covers + /// so the composite can aggregate the build-time coverage signal. + public interface IConstructProposer : IMorphologicalAnalyzer + { + IReadOnlyCollection CoveredOps { get; } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/ForwardSynthesisProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/ForwardSynthesisProposer.cs new file mode 100644 index 000000000..ace50a9dc --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/ForwardSynthesisProposer.cs @@ -0,0 +1,222 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.FeatureModel; +using SIL.Machine.Morphology; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// A candidate generator for boundary-conditioned morphophonemics — the case the phonology + /// inverse cannot handle (FST_FULL_PLAN.md, Point 1/4 reconciliation). Indonesian meN- nasal + /// substitution (tulis → menulis, the prefix nasal assimilates AND the root-initial deletes) + /// is conditioned on the morpheme boundary, so un-applying phonology on the bare surface + /// over-generates and cannot be cleanly composed. Forward synthesis, however, is + /// boundary-correct: applies the rules with the boundary present + /// and yields the true surface. + /// + /// So at build time this enumerates each root × every bounded combination of morphological rules, + /// synthesizes the surface form(s), and tabulates surface → (root + affixes). Analysis is then + /// a dictionary lookup; still confirms each candidate (and supplies + /// the real category). Sound by construction (a tabulated entry is a real synthesized word) and + /// covers every construct synthesis handles — including reduplication and infixation, which it gets + /// for free. The cost is build-time enumeration (root × affix-combos), bounded by + /// maxAffixes and a hard budget; it trades the FST's compactness for coverage, which + /// fits languages with bounded productive affixation (it does not scale to heavily-inflecting + /// templatic systems — those keep riding the engine via the parity gate). + /// + public class ForwardSynthesisProposer : IConstructProposer + { + private readonly Dictionary> _table = new Dictionary>( + StringComparer.Ordinal + ); + private readonly MorphOp[] _coveredOps; + + /// Build the surface→analysis table. bounds how many + /// morphological rules may co-occur on a word; hard-caps the number of + /// tabulated entries so a productive grammar degrades (fewer combos) rather than exploding. + public ForwardSynthesisProposer(Language language, Morpher morpher, int maxAffixes = 2, int budget = 500_000) + { + var rules = language + .Strata.SelectMany(s => s.MorphologicalRules) + .OfType() + .ToList(); + var roots = language.Strata.SelectMany(s => s.Entries).ToList(); + var covered = new HashSet(); + int entries = 0; + bool capped = false; + + foreach (LexEntry root in roots) + { + foreach (List combo in Combinations(rules, maxAffixes)) + { + if (entries >= budget) + { + capped = true; + break; + } + IReadOnlyCollection surfaces; + try + { + surfaces = morpher + .GenerateWords(root, combo, new FeatureStruct()) + .Select(Normalize) + .Distinct() + .ToList(); + } + catch (Exception) + { + continue; // an invalid combo (category clash, obligatoriness) — synthesis declines + } + if (surfaces.Count == 0) + { + continue; + } + WordAnalysis analysis = BuildAnalysis(root, combo); + foreach (string surface in surfaces) + { + if (!_table.TryGetValue(surface, out List list)) + { + list = new List(); + _table[surface] = list; + } + list.Add(analysis); + entries++; + } + foreach (MorphemicMorphologicalRule rule in combo) + { + covered.Add(RuleOp(rule)); + } + } + if (capped) + { + break; + } + } + // Claim every non-concatenative construct we actually synthesized — reduplication, infix, + // circumfix, process. Synthesis (GenerateWords) handles them all, so a tabulated entry is + // genuine coverage; plain prefix/suffix are already covered by the FST. Verify still + // confirms every candidate, so a false claim here costs coverage, never soundness. + _coveredOps = covered + .Where(o => + o == MorphOp.Reduplication + || o == MorphOp.Infix + || o == MorphOp.Process + || o == MorphOp.CircumfixPrefix + || o == MorphOp.CircumfixSuffix + ) + .ToArray(); + WasCapped = capped; + EntryCount = entries; + } + + /// True if the entry budget was hit (coverage is partial; more combos were skipped). + public bool WasCapped { get; } + + /// Number of tabulated surface→analysis entries. + public int EntryCount { get; } + + public IReadOnlyCollection CoveredOps => _coveredOps; + + public IEnumerable AnalyzeWord(string word) + { + return _table.TryGetValue(Normalize(word), out List list) + ? list + : Enumerable.Empty(); + } + + /// The candidate analysis in HC application order: prefixes, then the root, then the + /// remaining affixes (suffix/reduplication/infix). Verify confirms the order against the engine. + private static WordAnalysis BuildAnalysis(LexEntry root, List combo) + { + var prefixes = new List(); + var rest = new List(); + foreach (MorphemicMorphologicalRule rule in combo) + { + if (RuleOp(rule) == MorphOp.Prefix) + { + prefixes.Add(rule); + } + else + { + rest.Add(rule); + } + } + var morphemes = new List(prefixes.Count + 1 + rest.Count); + morphemes.AddRange(prefixes); + morphemes.Add(root); + morphemes.AddRange(rest); + return new WordAnalysis(morphemes, prefixes.Count, null); + } + + private static MorphOp RuleOp(MorphemicMorphologicalRule rule) + { + IEnumerable allomorphs; + switch (rule) + { + case AffixProcessRule affix: + allomorphs = affix.Allomorphs; + break; + case RealizationalAffixProcessRule realizational: + allomorphs = realizational.Allomorphs; + break; + default: + return MorphOp.None; + } + foreach (AffixProcessAllomorph allomorph in allomorphs) + { + return MorphTokenCodec.ClassifyOp(allomorph, false); + } + return MorphOp.None; + } + + private static string Normalize(string s) => s.Normalize(System.Text.NormalizationForm.FormD); + + /// All ORDERED sequences of 0.. distinct rules. Order matters: + /// is sensitive to the morpheme-list order (meN·Cont yields + /// the real "menulis-nulis"; Cont·meN yields a different form), so every permutation is tried — + /// the wrong orders simply tabulate forms that are never queried. + private static IEnumerable> Combinations( + List rules, + int max + ) + { + yield return new List(); + for (int size = 1; size <= max; size++) + { + foreach (List seq in PermutationsOfSize(rules, size, new bool[rules.Count])) + { + yield return seq; + } + } + } + + private static IEnumerable> PermutationsOfSize( + List rules, + int size, + bool[] used + ) + { + if (size == 0) + { + yield return new List(); + yield break; + } + for (int i = 0; i < rules.Count; i++) + { + if (used[i]) + { + continue; + } + used[i] = true; + foreach (List tail in PermutationsOfSize(rules, size - 1, used)) + { + tail.Insert(0, rules[i]); + yield return tail; + } + used[i] = false; + } + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstCoverageProbe.cs b/src/SIL.Machine.Morphology.HermitCrab/FstCoverageProbe.cs new file mode 100644 index 000000000..f58d96d89 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/FstCoverageProbe.cs @@ -0,0 +1,232 @@ +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// A deliberately bounded, opt-in tool for grammar authoring: run a wordlist through the FST fast + /// path — over the full composite ( + /// plus every sibling generator: , , + /// , ) — and report + /// coverage, or diff coverage between two versions of a grammar. This exists to answer "did my + /// grammar edit make parsing better or worse?" in milliseconds, as a fast proxy for the real engine + /// — never as a replacement for it, and never behind any notion of the fast path being "proven" + /// complete for a grammar. + /// + /// Contract (read before trusting the numbers): + /// + /// Sound on positives. A word this reports as parsed was confirmed by HC's own + /// restricted re-analysis () — it is a genuine engine analysis, not FST + /// over-generation. + /// Known-incomplete on negatives — by design, not by accident. The fast path does not + /// (yet) model every construct — compounding, clitics, templatic multi-slot infixation, and + /// phonological rules outside 's and + /// 's combined reach are the current gaps (see + /// FST_FAST_PATH_PLAN.md's KNOWN_GAPS) — so an uncovered word shows up as "unparsed" here even when + /// the real engine parses it. Do not read "unparsed" as "invalid"; read a coverage-count *change* + /// between two grammar versions as the signal. + /// Never wired into production analysis. This type is not used by + /// — it exists solely for a grammar engineer (or a script) to call directly + /// while iterating on a grammar. + /// + /// + public sealed class FstCoverageProbe + { + private readonly VerifiedFstAnalyzer _analyzer; + private readonly bool _coversAllConstructs; + private readonly IReadOnlyCollection _uncoveredConstructs; + private readonly int _unsupportedPhonologyRuleCount; + + private FstCoverageProbe( + VerifiedFstAnalyzer analyzer, + bool coversAllConstructs, + IReadOnlyCollection uncoveredConstructs, + int unsupportedPhonologyRuleCount + ) + { + _analyzer = analyzer; + _coversAllConstructs = coversAllConstructs; + _uncoveredConstructs = uncoveredConstructs; + _unsupportedPhonologyRuleCount = unsupportedPhonologyRuleCount; + } + + /// Build the full-composite fast path for one grammar. Cheap enough to call after + /// every edit: no corpus comparison against the engine, ever. + /// (opt-in, default off) adds — see + /// for its build-cost tradeoff. + public static FstCoverageProbe ForLanguage(Language language, bool forwardSynthesis = false) + { + var fst = new FstTemplateAnalyzer(language, new Morpher(new TraceManager(), language)); + var lockstep = new LockstepPhonologyProposer(language, new Morpher(new TraceManager(), language)); + var generators = new List + { + new ReduplicationProposer(language, fst), + new InfixProposer(language, fst), + new ComposedPhonologyProposer(language, fst), + lockstep, + }; + if (forwardSynthesis) + { + generators.Insert(0, new ForwardSynthesisProposer(language, new Morpher(new TraceManager(), language))); + } + var composite = new CompositeProposer(fst, generators.ToArray()); + var pool = new MorpherPool(() => new Morpher(new TraceManager(), language)); + var analyzer = new VerifiedFstAnalyzer(composite, pool); + return new FstCoverageProbe( + analyzer, + composite.CoversAllConstructs, + fst.UncoveredOps, + lockstep.UnsupportedRuleCount + ); + } + + /// Run the fast path over and summarize coverage. + public ProbeReport Probe(IEnumerable words) + { + var sw = Stopwatch.StartNew(); + int total = 0; + int parsed = 0; + int totalAnalyses = 0; + var unparsed = new List(); + foreach (string word in words) + { + total++; + int count = _analyzer.AnalyzeWord(word).Count(); + if (count > 0) + { + parsed++; + totalAnalyses += count; + } + else + { + unparsed.Add(word); + } + } + sw.Stop(); + return new ProbeReport( + total, + parsed, + totalAnalyses, + unparsed, + _coversAllConstructs, + _uncoveredConstructs, + _unsupportedPhonologyRuleCount, + sw.Elapsed + ); + } + + /// Probe and over the same corpus and + /// diff coverage — the direct answer to "did this grammar edit make parsing better or worse?". + /// Each grammar gets its own fresh fast-path build, so this is exactly two + /// calls plus a set diff; no engine comparison, ever. + public static CoverageDiff CompareGrammars(Language before, Language after, IEnumerable words) + { + List corpus = words.ToList(); + ProbeReport beforeReport = ForLanguage(before).Probe(corpus); + ProbeReport afterReport = ForLanguage(after).Probe(corpus); + var beforeUnparsed = new HashSet(beforeReport.UnparsedWords); + var afterUnparsed = new HashSet(afterReport.UnparsedWords); + List gained = beforeUnparsed.Where(w => !afterUnparsed.Contains(w)).OrderBy(w => w).ToList(); + List lost = afterUnparsed.Where(w => !beforeUnparsed.Contains(w)).OrderBy(w => w).ToList(); + return new CoverageDiff(beforeReport, afterReport, gained, lost); + } + } + + /// Coverage summary for one grammar over one corpus — see for + /// what "parsed" does and does not guarantee. + public sealed class ProbeReport + { + public int TotalWords { get; } + public int ParsedWords { get; } + public int TotalAnalyses { get; } + public IReadOnlyList UnparsedWords { get; } + + /// True iff every construct the bare FST cannot build (reduplication/infix/etc.) is + /// claimed by a sibling generator — a build-time coverage diagnostic, not a soundness or + /// per-word completeness guarantee (see ). + public bool CoversAllConstructs { get; } + + /// The s no generator in this composite claims to cover — a + /// grammar using one of these constructs will systematically under-generate on words that need + /// it. Empty when is true. + public IReadOnlyCollection UncoveredConstructs { get; } + + /// How many (rule, subrule) pairs could not fit + /// into its v1 supported shape (see its class remarks) — a phonology-specific coverage + /// diagnostic distinct from , which only tracks whole + /// categories. + public int UnsupportedPhonologyRuleCount { get; } + + /// Wall-clock time for this call. + public System.TimeSpan Elapsed { get; } + + public double CoverageRate => TotalWords == 0 ? 0 : (double)ParsedWords / TotalWords; + public double AverageAnalysesPerParsedWord => ParsedWords == 0 ? 0 : (double)TotalAnalyses / ParsedWords; + + internal ProbeReport( + int totalWords, + int parsedWords, + int totalAnalyses, + IReadOnlyList unparsedWords, + bool coversAllConstructs, + IReadOnlyCollection uncoveredConstructs, + int unsupportedPhonologyRuleCount, + System.TimeSpan elapsed + ) + { + TotalWords = totalWords; + ParsedWords = parsedWords; + TotalAnalyses = totalAnalyses; + UnparsedWords = unparsedWords; + CoversAllConstructs = coversAllConstructs; + UncoveredConstructs = uncoveredConstructs; + UnsupportedPhonologyRuleCount = unsupportedPhonologyRuleCount; + Elapsed = elapsed; + } + + public override string ToString() => + $"{ParsedWords}/{TotalWords} words parsed ({CoverageRate:P1}), " + + $"{AverageAnalysesPerParsedWord:F2} analyses/parsed word, " + + $"{Elapsed.TotalMilliseconds:F0} ms" + + (CoversAllConstructs ? "" : $", uncovered constructs: [{string.Join(",", UncoveredConstructs)}]") + + ( + UnsupportedPhonologyRuleCount > 0 + ? $", {UnsupportedPhonologyRuleCount} unsupported phonology rule(s)" + : "" + ); + } + + /// The coverage delta between two grammar versions over the same corpus. + /// and are the words whose fast-path parse status flipped — the direct answer to + /// "what did this edit change?". + public sealed class CoverageDiff + { + public ProbeReport Before { get; } + public ProbeReport After { get; } + + /// Unparsed under , parsed under . + public IReadOnlyList Gained { get; } + + /// Parsed under , unparsed under . + public IReadOnlyList Lost { get; } + + internal CoverageDiff( + ProbeReport before, + ProbeReport after, + IReadOnlyList gained, + IReadOnlyList lost + ) + { + Before = before; + After = after; + Gained = gained; + Lost = lost; + } + + public override string ToString() => + $"before: {Before}{System.Environment.NewLine}" + + $"after: {After}{System.Environment.NewLine}" + + $"gained {Gained.Count} word(s), lost {Lost.Count} word(s)"; + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstReplay.cs b/src/SIL.Machine.Morphology.HermitCrab/FstReplay.cs new file mode 100644 index 000000000..82b8ded44 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/FstReplay.cs @@ -0,0 +1,115 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Verification by restricted re-analysis (HERMITCRAB_FST_PLAN.md §11.8, Route A): confirm + /// an FST candidate by running HC's own with the rule/lexicon + /// selectors pinned to just this candidate's root and rules. That prunes HC's combinatorial + /// fan-out to the single path the FST already found — a few ms, not the full search — while reusing + /// HC's real analysis+synthesis validation end to end (no reimplemented constraints). + /// + /// A candidate is valid iff HC's restricted analysis of the surface yields it: restriction can only + /// remove paths HC would not take, never fabricate one (HC still runs full synthesis + surface + /// match), so membership in the restricted result is exactly "is a valid HC analysis". The Morpher + /// is ed so concurrent verification is thread-safe (the selectors are + /// mutable instance state). The matched HC analysis is returned (not the FST candidate) so + /// the caller emits a genuine engine — with its real category — rather + /// than the category-less proposal. + /// + /// Compounds (Phase G2, FST_FULL_GRAMMAR_PLAN.md): a candidate may contain a SECOND + /// at a non-head position (the FST's compound loop proposes one candidate per head choice — see + /// FstTemplateAnalyzer.ToWordAnalyses). That second root is admitted into the lexicon + /// selector alongside the head, and is opened in the rule selector + /// (only when a compound is actually present, so an ordinary word's fan-out stays exactly as tight + /// as before). This was the ONLY real blocker for compounding — earlier documentation claiming a + /// cross-cutting `WordAnalysis`/`MorphToken` data-model lift was needed was wrong; both types already + /// represent compounds (see `MorphOp.Compound`), as this fix demonstrates. + /// + internal static class FstReplay + { + /// The matched HC analysis of equal to , or null if HC does not produce it. + public static WordAnalysis Confirm(MorpherPool pool, WordAnalysis candidate, string word) + { + int rootIndex = candidate.RootMorphemeIndex; + IReadOnlyList morphemes = candidate.Morphemes; + if (rootIndex < 0 || rootIndex >= morphemes.Count || !(morphemes[rootIndex] is LexEntry root)) + { + return null; + } + + var rules = new HashSet(); + var extraRoots = new HashSet(); + for (int i = 0; i < morphemes.Count; i++) + { + if (i == rootIndex) + { + continue; + } + if (morphemes[i] is LexEntry nonHeadRoot) + { + extraRoots.Add(nonHeadRoot); // a compound's non-head root, not a rule + continue; + } + if (!(morphemes[i] is IHCRule rule)) + { + return null; + } + rules.Add(rule); + } + + Morpher morpher = pool.Rent(); + try + { + // Pin HC to this candidate's path: only this root (plus any compound non-head roots), + // only its morphological rules. Templates and strata stay open (they are containers the + // path threads through), and phonological rules ALWAYS stay open — they are obligatory, + // deterministic rewrites, not a fan-out choice, and un-applying them is exactly how a + // phonologically-altered surface (e.g. an FST candidate proposed from a surface + // allomorph) reduces back to its root. Gating only the leaf morphological rules + the + // root(s) is what collapses the fan-out. + morpher.LexEntrySelector = e => e == root || extraRoots.Contains(e); + morpher.RuleSelector = r => + r is AffixTemplate + || r is Stratum + || r is IPhonologicalRule + || rules.Contains(r) + || (extraRoots.Count > 0 && r is CompoundingRule); + + var ids = new Dictionary(); + string target = Signature(candidate, ids); + foreach (WordAnalysis analysis in morpher.AnalyzeWord(word)) + { + if (Signature(analysis, ids) == target) + { + return analysis; // the genuine HC analysis (carries the real category) + } + } + return null; + } + finally + { + pool.Return(morpher); + } + } + + /// Signature by per-morpheme identity (affix Morpheme.Id is empty, so shape-only would + /// falsely match a same-shape but different-morpheme analysis); same objects on both sides. + private static string Signature(WordAnalysis analysis, Dictionary ids) + { + return string.Join("+", analysis.Morphemes.Select(m => Id(m, ids))) + ":" + analysis.RootMorphemeIndex; + } + + private static int Id(IMorpheme morpheme, Dictionary ids) + { + if (!ids.TryGetValue(morpheme, out int id)) + { + id = ids.Count; + ids[morpheme] = id; + } + return id; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs new file mode 100644 index 000000000..17cd4ca74 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs @@ -0,0 +1,1437 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Annotations; +using SIL.Machine.DataStructures; +using SIL.Machine.FeatureModel; +using SIL.Machine.FiniteState; +using SIL.Machine.Morphology; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// A token-accumulating FST analyzer for grammars whose affixation is organized into + /// affix templates (position classes) — the real-grammar case (HERMITCRAB_FST_PLAN.md + /// §6 Phase 2, §10). Each template becomes prefix-slot automaton → root → suffix-slot automaton; + /// a template attaches to a root only when the root passes the build-time gate: + /// + /// category — the root's syntactic features unify with the template's + /// RequiredSyntacticFeatureStruct; and + /// stratum — the root is at the template's stratum or an inner one (a template + /// cannot apply to a root introduced in a later/outer stratum). + /// + /// Gating both prevents over-generation and lets same-category roots share the template's + /// slot-automaton (states ≈ roots + Σ template automata, not roots × slot-combinations). + /// Tokens are accumulated along the DFS path (a state carries the morpheme token emitted on + /// entry). Prefix slots surface in reverse template order (slot 0 applies first → innermost), + /// suffix slots in template order. A budget (the §10 knob) aborts + /// before a blowup. Bounded phonology is precompiled into the arcs (surface-allomorph precompile, + /// C-internal tier — see ); a construct the FST cannot model + /// (reduplication/infix/circumfix/process) is skipped and its recorded in + /// as a coverage diagnostic, unless a sibling generator (see + /// ) covers it — it degrades gracefully, never mis-parses. + /// + public class FstTemplateAnalyzer : IMorphologicalAnalyzer + { + private readonly Fst _fsa; + private readonly State _start; + private readonly Dictionary, uint> _tokenOnEntry = + new Dictionary, uint>(); + private readonly Dictionary, int> _stateIds = + new Dictionary, int>(); + private readonly MorphTokenCodec _codec = new MorphTokenCodec(); + private readonly CharacterDefinitionTable _table; + private readonly Func, bool> _filter; + private readonly int _maxStates; + private readonly Func> _bareRootSurfaces; + private readonly Func> _affixSurfaces; + private readonly Func< + string, + IReadOnlyCollection<(string AffixSurface, FeatureStruct DeletedNeighbor)> + > _junctionDeletions; + private readonly List _derivSuffixRules = new List(); + private readonly List _derivPrefixRules = new List(); + + // Phase G2 (FST_FULL_GRAMMAR_PLAN.md): compounding rules, used ONLY by DerivableToCategory's + // frontier expansion below — a root that reaches a template-qualifying category via COMPOUNDING + // (then possibly further derivation) must be recognized as qualifying for that template, or the + // template's own class-prefix slot (gated on the post-compound-and-derive category) never gets + // offered to either half of the compound at all. + private readonly List _compoundingRules = new List(); + + // A root's underlying segment chain is identical no matter which attachment site (bare, the + // template-less path, or any given template) reaches it, so it is built ONCE here and every + // site links to it by a single epsilon arc in and a single epsilon arc out — rather than the + // roots-x-sites duplication a naive per-site build would produce. This is the same + // shared-substructure-plus-epsilon-fan-in/out technique already used for the derivation + // layers below (tokens accumulate per walk PATH, not per state, so sharing a state across + // many incoming paths never conflates their token histories). + private readonly Dictionary< + RootAllomorph, + (State Entry, State End) + > _rootChains = new Dictionary Entry, State End)>(); + + // Per-root list of states reached after consuming 0, 1, 2, ... of the root's own leading segments + // — index 0 is the chain's Entry. Only consulted by RootChainAfterSkip, for junction-deletion + // arcs (Phase C); every other consumer uses the plain Entry/End pair above. + private readonly Dictionary>> _rootCheckpoints = + new Dictionary>>(); + private int _stateCount; + private readonly HashSet _uncoveredOps = new HashSet(); + + /// + /// Max stacked derivational affixes modelled per side before inflection (tunable per grammar). + /// 2 (e.g. REV+NZR) is the speed/coverage sweet spot for Sena: depth 3 (PAS+APPLIC+NZR) gains a + /// word or two but roughly doubles verify cost (more over-gen proposals to reject). Deeper + /// stacks than this are left to the search backstop rather than inflating every verification. + /// + private readonly int _derivDepth; + + public MorphTokenCodec Codec => _codec; + + /// Number of FST states built (the precomputed size — to watch for state blow-up). + public int StateCount => _stateCount; + + /// + /// False if the build skipped a construct it cannot model (an infix/circumfix/reduplication/ + /// process slot or rule). The proposer degrades gracefully — it skips such constructs and + /// builds the rest — so a grammar using them under-generates on the fast path unless a sibling + /// generator (see ) covers it. A coverage diagnostic, not a + /// correctness signal: verify guarantees soundness regardless. + /// + public bool CoversAllConstructs => _uncoveredOps.Count == 0; + + /// The set of s the build skipped because it cannot model them in + /// the FST (infix/circumfix/reduplication/process). A sibling generator that covers one of these + /// (see ) removes it from the composite's uncovered set. + public IReadOnlyCollection UncoveredOps => _uncoveredOps; + + /// Build without obligatoriness: every root may stand bare (fine for toy grammars). + public FstTemplateAnalyzer(Language language, int maxStates = 1_000_000, int derivDepth = 2) + : this( + language, + root => new[] { UnderlyingForm(root) }, + s => new[] { s }, + s => Array.Empty<(string, FeatureStruct)>(), + maxStates, + derivDepth + ) { } + + /// + /// Build with obligatory-inflection enforcement AND surface-allomorph precompile (§C): a root's + /// bare surface realizations are obtained by synthesizing it bare (HC's own finality check). If + /// synthesis returns nothing, the bare reading is suppressed (obligatory inflection); if it + /// returns a phonologically-ALTERED surface, the proposer builds an arc for that surface so a + /// phonologically-altered bare root is matched (not just the underlying form). Also wires + /// junction-deletion probing (Phase C, FST_FULL_GRAMMAR_PLAN.md) from the SAME + /// instance, so its memoized cascade results are shared rather than + /// recomputed. + /// + public FstTemplateAnalyzer(Language language, Morpher morpher, int maxStates = 1_000_000, int derivDepth = 2) + : this( + language, + root => BareRootSurfaces(morpher, root), + new SurfacePhonology(language, morpher), + maxStates, + derivDepth + ) { } + + private FstTemplateAnalyzer( + Language language, + Func> bareRootSurfaces, + SurfacePhonology surfacePhonology, + int maxStates, + int derivDepth + ) + : this( + language, + bareRootSurfaces, + surfacePhonology.Variants, + surfacePhonology.DeletionJunctions, + maxStates, + derivDepth + ) { } + + private FstTemplateAnalyzer( + Language language, + Func> bareRootSurfaces, + Func> affixSurfaces, + Func> junctionDeletions, + int maxStates, + int derivDepth + ) + { + _bareRootSurfaces = bareRootSurfaces; + _affixSurfaces = affixSurfaces; + _junctionDeletions = junctionDeletions; + _maxStates = maxStates; + _derivDepth = derivDepth; + _table = language.SurfaceStratum.CharacterDefinitionTable; + _filter = ann => ann.Type() == HCFeatureSystem.Segment; + _fsa = new Fst { Filter = _filter, UseUnification = true }; + _start = NewState(); + _fsa.StartState = _start; + + // Collect every root with the stratum index it is introduced at. + var roots = new List(); + for (int si = 0; si < language.Strata.Count; si++) + { + foreach (LexEntry entry in language.Strata[si].Entries) + { + foreach (RootAllomorph allomorph in entry.Allomorphs) + { + roots.Add(new RootRef(allomorph, entry.SyntacticFeatureStruct, si)); + } + } + } + + // Phase G2 (FST_FULL_GRAMMAR_PLAN.md): only build the compound loop machinery at all if the + // grammar has any CompoundingRule — a grammar without one pays zero extra states/arcs. + _compoundingRules.AddRange(language.Strata.SelectMany(s => s.MorphologicalRules).OfType()); + bool hasCompoundingRules = _compoundingRules.Count > 0; + + // Standalone derivational affix rules (REC/APPLIC/REV/NZR/NEU/PAS/...), distinct from + // inflectional template slots and from compounding. Suffixal ones become an optional, + // bounded layer between the root and the inflectional suffix slots (§11.2). + foreach (Stratum stratum in language.Strata) + { + foreach (IMorphologicalRule mrule in stratum.MorphologicalRules) + { + if (!(mrule is MorphemicMorphologicalRule rule)) + { + continue; + } + MorphOp ruleOp = RuleOp(rule); + switch (ruleOp) + { + case MorphOp.Suffix: + _derivSuffixRules.Add(rule); + break; + case MorphOp.Prefix: + _derivPrefixRules.Add(rule); + break; + case MorphOp.None: + break; + default: + // A standalone rule the proposer cannot build (reduplication/infix/process). + // Record the op as uncovered — a sibling generator (see CompositeProposer) + // may still cover it; otherwise this construct just stays unparsed. + _uncoveredOps.Add(ruleOp); + break; + } + } + } + + // Bare-root paths — only for roots the grammar allows to stand uninflected. Surface-allomorph + // precompile (§C): build a chain for the underlying form AND for each phonologically-altered + // bare surface realization, so an altered bare root is matched. The emitted token is always + // the underlying root morpheme; verify re-runs HC (with real phonology) to confirm. The + // underlying chain is the SHARED one (see GetOrBuildRootChain) — the surface-variant chains + // are bare-root-only and stay unshared (templates never use them). + foreach (RootRef root in roots) + { + IReadOnlyCollection surfaces = _bareRootSurfaces(root.Allomorph); + if (surfaces.Count == 0) + { + continue; // bare root not valid (obligatory inflection) + } + (State entry, State end) = GetOrBuildRootChain(root.Allomorph); + _start.Arcs.Add(entry); // epsilon: enter the shared root chain + end.IsAccepting = true; + string underlying = UnderlyingForm(root.Allomorph); + foreach (string s in surfaces) + { + if (s == underlying) + { + continue; // already built from the underlying shape + } + State surfaceEnd = BuildRootChainFromSurface(_start, s, root.Allomorph.Morpheme); + if (surfaceEnd != null) + { + surfaceEnd.IsAccepting = true; + } + } + } + + // Template-less derivational stems: optional derivational prefixes + root + optional + // derivational suffixes, with NO inflectional template — for roots that derive/associate + // without inflecting (e.g. a pronoun taking an associative prefix: coisa + d'eles). + // Shared prefix/suffix derivation layers (built once) keep this additive. Verify-discard + // removes any over-generation, including a bare stem that should not stand alone. + // Also the home of the Phase G2 compound loop (hasCompoundingRules) — a grammar that + // compounds roots but has no OTHER standalone derivational rule still needs this path built. + if (_derivPrefixRules.Count > 0 || _derivSuffixRules.Count > 0 || hasCompoundingRules) + { + State tlPrefixEntry = NewState(); + (State tlRootStart, var tlPendingSkips) = BuildDerivationPrefixLayer(tlPrefixEntry); + _start.Arcs.Add(tlPrefixEntry); // epsilon: enter the template-less path + State tlSuffixEntry = NewState(); + State tlSuffixExit = BuildDerivationSuffixLayer(tlSuffixEntry); + tlSuffixExit.IsAccepting = true; + State tlCompoundJoin = hasCompoundingRules + ? BuildCompoundLoop(roots, tlSuffixEntry) + : null; + foreach (RootRef root in roots) + { + (State entry, State end) = GetOrBuildRootChain(root.Allomorph); + tlRootStart.Arcs.Add(entry); // epsilon: enter the shared root chain + end.Arcs.Add(tlSuffixEntry); // epsilon: root → shared derivational suffixes → accept + if (tlCompoundJoin != null) + { + end.Arcs.Add(tlCompoundJoin); // epsilon: root → (Phase G2) a second root → suffixes + } + WireDeletionSkips(tlPendingSkips, root.Allomorph); // Phase C: junction-deletion arcs + } + } + + // Each template: prefix automaton → (gated roots) → suffix automaton. + for (int ti = 0; ti < language.Strata.Count; ti++) + { + foreach (AffixTemplate template in language.Strata[ti].AffixTemplates) + { + var prefixSlots = new List(); + var suffixSlots = new List(); + ClassifyTemplate(template, prefixSlots, suffixSlots); + + State prefixEntry = NewState(); + State prefixExit = AppendSlots( + prefixEntry, + prefixSlots, + MorphOp.Prefix, + template.RequiredSyntacticFeatureStruct + ); + // Shared derivational-prefix layer between the inflectional prefixes and the root + // (surface order: class-prefix → derivational-prefix → root, e.g. + // 10 + nominalizador + [ser]). Roots start after it. + (State rootStart, var templatePendingSkips) = BuildDerivationPrefixLayer( + prefixExit + ); + State suffixEntry = NewState(); + State suffixExit = AppendSlots( + suffixEntry, + suffixSlots, + MorphOp.Suffix, + template.RequiredSyntacticFeatureStruct + ); + suffixExit.IsAccepting = true; + + // One derivation layer per template, shared by all its roots (tokens accumulate + // on the walk path, so sharing avoids a roots×derivations blowup): root → + // derivation suffixes → inflectional suffix slots. + State derivEntry = NewState(); + State derivExit = BuildDerivationSuffixLayer(derivEntry); + derivExit.Arcs.Add(suffixEntry); // epsilon: derivation → inflectional suffixes + + State templateCompoundJoin = hasCompoundingRules + ? BuildCompoundLoop(roots, derivEntry) + : null; + + _start.Arcs.Add(prefixEntry); // epsilon: enter this template + + foreach (RootRef root in roots) + { + // Attach the root to this template if its category matches directly, OR if a + // derivational suffix in the layer changes the root's category to the + // template's (e.g. a nominalizer feeding a noun-class template: vencer[verb] + + // NZR → noun, then class-10 prefix). The category-changing suffix is in the + // shared derivation layer; verify-discard removes any resulting over-gen (§11.4). + if ( + root.StratumIndex <= ti + && ( + CategoryMatches(root.Category, template.RequiredSyntacticFeatureStruct) + || DerivableToCategory(root.Category, template.RequiredSyntacticFeatureStruct) + ) + ) + { + (State entry, State end) = GetOrBuildRootChain( + root.Allomorph + ); + rootStart.Arcs.Add(entry); // epsilon: enter the shared root chain + end.Arcs.Add(derivEntry); // epsilon: root → derivation → suffix slots + if (templateCompoundJoin != null) + { + end.Arcs.Add(templateCompoundJoin); // epsilon: root → (Phase G2) a second root + } + WireDeletionSkips(templatePendingSkips, root.Allomorph); // Phase C + } + } + } + } + } + + public IEnumerable AnalyzeWord(string word) + { + Shape shape; + try + { + shape = _table.Segment(word); + } + catch (InvalidShapeException) + { + // A word with a phoneme outside this table cannot be a surface form here. + return Enumerable.Empty(); + } + return AnalyzeShape(shape); + } + + /// + /// Walk the morphotactic FST over the segments of an already-built . + /// Used both by (segmenting the surface) and by + /// , which feeds an underlying shape obtained by + /// un-applying phonology — letting the underlying arcs match cross-boundary surfaces the + /// per-morpheme precompile misses. Segments are matched by unification, so an underspecified + /// node (from analysis) matches every arc it unifies with; verify prunes the spurious ones. + /// + internal IEnumerable AnalyzeShape(Shape shape) + { + var segments = new List(); + for ( + ShapeNode node = shape.GetFirst(n => _filter(n.Annotation)); + node != shape.End; + node = node.GetNext(n => _filter(n.Annotation)) + ) + { + segments.Add(node.Annotation.FeatureStruct); + } + + // NFA simulation: a set of (state, accumulated tokens) configurations advanced one + // segment at a time, deduped by (state, tokens) so shared states are not re-explored + // (a naive recursive DFS is exponential on a real grammar's nondeterminism). Dedup keys + // are struct-based (state id + token array), not strings — a string.Join per config per + // segment was the dominant per-word allocator on real grammars. + List current = EpsilonClosure(new List { Enter(_start, EmptyTokens) }); + foreach (FeatureStruct segment in segments) + { + var next = new List(); + var seen = new HashSet(); + foreach (Config config in current) + { + for (int a = 0; a < config.State.Arcs.Count; a++) + { + Arc arc = config.State.Arcs[a]; + if (!arc.Input.IsEpsilon && arc.Input.FeatureStruct.IsUnifiable(segment)) + { + Config nc = Enter(arc.Target, config.Tokens); + if (seen.Add(Key(nc))) + { + next.Add(nc); + } + } + } + } + current = EpsilonClosure(next); + if (current.Count == 0) + { + break; + } + } + + var results = new List(); + var emitted = new HashSet(); + foreach (Config config in current) + { + if (config.State.IsAccepting && emitted.Add(new TokenArrayKey(config.Tokens))) + { + results.AddRange(ToWordAnalyses(config.Tokens)); + } + } + return results; + } + + private static readonly uint[] EmptyTokens = new uint[0]; + + private List EpsilonClosure(List configs) + { + var result = new List(); + var seen = new HashSet(); + var stack = new Stack(); + foreach (Config config in configs) + { + if (seen.Add(Key(config))) + { + stack.Push(config); + result.Add(config); + } + } + while (stack.Count > 0) + { + Config config = stack.Pop(); + for (int a = 0; a < config.State.Arcs.Count; a++) + { + Arc arc = config.State.Arcs[a]; + if (arc.Input.IsEpsilon) + { + Config nc = Enter(arc.Target, config.Tokens); + if (seen.Add(Key(nc))) + { + stack.Push(nc); + result.Add(nc); + } + } + } + } + return result; + } + + private Config Enter(State state, uint[] tokens) + { + return _tokenOnEntry.TryGetValue(state, out uint token) + ? new Config(state, Append(tokens, token)) + : new Config(state, tokens); + } + + private ConfigKey Key(Config config) + { + return new ConfigKey(_stateIds[config.State], config.Tokens); + } + + /// Usually one candidate. A token array with MORE THAN ONE + /// token (Phase G2, FST_FULL_GRAMMAR_PLAN.md — the compound loop, see ) + /// yields one candidate PER root position: the trie doesn't statically know which root a + /// compounding rule treats as head, so it proposes every choice and lets verify + /// (, which pins the real CompoundingRule) confirm whichever + /// headedness the grammar actually licenses. Sound regardless: a wrong headedness guess is just a + /// rejected candidate, matching this file's usual permissive-build/verify-prune pattern. + private IEnumerable ToWordAnalyses(uint[] tokens) + { + var morphemes = new List(tokens.Length); + var rootIndices = new List(); + for (int i = 0; i < tokens.Length; i++) + { + morphemes.Add(_codec.GetMorpheme(MorphToken.GetMorphemeId(tokens[i]))); + if (MorphToken.GetOp(tokens[i]) == MorphOp.Root) + { + rootIndices.Add(i); + } + } + if (rootIndices.Count <= 1) + { + yield return new WordAnalysis(morphemes, rootIndices.Count == 1 ? rootIndices[0] : -1, null); + yield break; + } + foreach (int rootIndex in rootIndices) + { + yield return new WordAnalysis(morphemes, rootIndex, null); + } + } + + private readonly struct PConfig + { + public PConfig(int pinvState, Config lex) + { + PinvState = pinvState; + Lex = lex; + } + + public int PinvState { get; } + public Config Lex { get; } + } + + /// + /// Lever 2 (LEVER_2.md): lazily compose an inverse-phonology transducer (surface→underlying) with + /// this morphotactic acceptor and walk the product over the surface. Pinv consumes surface and + /// emits underlying segments; each must unify a lexicon arc, advancing the lexicon and accruing + /// its token. Pinv ε-input arcs restore deleted segments and survive only where the lexicon has + /// the underlying arc — so the over-generation that broke the boundary-less runtime inverse is + /// pruned in lockstep. The lexicon network must be the underlying-only one (default ctor), so + /// phonology is applied once (by Pinv), not twice. + /// + internal IEnumerable AnalyzeComposed(string word, InversePhonology pinv) + { + Shape shape; + try + { + shape = _table.Segment(word); + } + catch (InvalidShapeException) + { + return Enumerable.Empty(); + } + + var segments = new List(); + for ( + ShapeNode node = shape.GetFirst(n => _filter(n.Annotation)); + node != shape.End; + node = node.GetNext(n => _filter(n.Annotation)) + ) + { + segments.Add(node.Annotation.FeatureStruct); + } + + List current = ComposedClosure( + pinv, + new List { new PConfig(pinv.StartState, Enter(_start, EmptyTokens)) } + ); + foreach (FeatureStruct segment in segments) + { + var next = new List(); + var seen = new HashSet(); + foreach (PConfig pc in current) + { + foreach (InversePhonology.Arc parc in pinv.ArcsFrom(pc.PinvState)) + { + if (parc.IsEpsilonInput || !parc.SurfaceInput.IsUnifiable(segment)) + { + continue; // ε-arcs are taken in the closure; this arc must consume the surface segment + } + // Pinv consumed the surface segment and emits an underlying segment: it must match + // a (non-ε) lexicon arc, which advances the lexicon walk and accrues its token. + for (int a = 0; a < pc.Lex.State.Arcs.Count; a++) + { + Arc larc = pc.Lex.State.Arcs[a]; + if (!larc.Input.IsEpsilon && larc.Input.FeatureStruct.IsUnifiable(parc.UnderlyingOutput)) + { + var nc = new PConfig(parc.Target, Enter(larc.Target, pc.Lex.Tokens)); + if (seen.Add(PKey(nc))) + { + next.Add(nc); + } + } + } + } + } + current = ComposedClosure(pinv, next); + if (current.Count == 0) + { + break; + } + } + + var results = new List(); + var emitted = new HashSet(); + foreach (PConfig pc in current) + { + if ( + pinv.IsAccepting(pc.PinvState) + && pc.Lex.State.IsAccepting + && emitted.Add(new TokenArrayKey(pc.Lex.Tokens)) + ) + { + results.AddRange(ToWordAnalyses(pc.Lex.Tokens)); + } + } + return results; + } + + /// Closure over both ε kinds: lexicon ε-arcs (advance the lexicon alone) and Pinv + /// ε-input restorations (advance Pinv + a matching lexicon arc, consuming no surface). + private List ComposedClosure(InversePhonology pinv, List configs) + { + var result = new List(); + var seen = new HashSet(); + var stack = new Stack(); + foreach (PConfig pc in configs) + { + if (seen.Add(PKey(pc))) + { + result.Add(pc); + stack.Push(pc); + } + } + while (stack.Count > 0) + { + PConfig pc = stack.Pop(); + // (a) lexicon ε-arcs: the morphotactic network's slot-entry/skip transitions. + for (int a = 0; a < pc.Lex.State.Arcs.Count; a++) + { + Arc larc = pc.Lex.State.Arcs[a]; + if (larc.Input.IsEpsilon) + { + var nc = new PConfig(pc.PinvState, Enter(larc.Target, pc.Lex.Tokens)); + if (seen.Add(PKey(nc))) + { + result.Add(nc); + stack.Push(nc); + } + } + } + // (b) Pinv ε-input (deletion-restoration) arcs: emit an underlying segment that must unify + // a lexicon arc — the lexicon constraint that prunes spurious restorations. + foreach (InversePhonology.Arc parc in pinv.ArcsFrom(pc.PinvState)) + { + if (!parc.IsEpsilonInput) + { + continue; + } + for (int a = 0; a < pc.Lex.State.Arcs.Count; a++) + { + Arc larc = pc.Lex.State.Arcs[a]; + if (!larc.Input.IsEpsilon && larc.Input.FeatureStruct.IsUnifiable(parc.UnderlyingOutput)) + { + var nc = new PConfig(parc.Target, Enter(larc.Target, pc.Lex.Tokens)); + if (seen.Add(PKey(nc))) + { + result.Add(nc); + stack.Push(nc); + } + } + } + } + } + return result; + } + + private PConfigKey PKey(PConfig pc) => new PConfigKey(pc.PinvState, _stateIds[pc.Lex.State], pc.Lex.Tokens); + + /// Split a template's slots into prefix and suffix; prefixes are reversed to surface order. + private void ClassifyTemplate( + AffixTemplate template, + List prefixSlots, + List suffixSlots + ) + { + foreach (AffixTemplateSlot slot in template.Slots) + { + switch (SlotOp(slot)) + { + case MorphOp.Prefix: + prefixSlots.Add(slot); + break; + case MorphOp.Suffix: + suffixSlots.Add(slot); + break; + default: + // A slot the proposer cannot build (infix/circumfix/reduplication/process). + // Skip it and record the construct op(s) as uncovered — those words stay + // unparsed on the fast path unless a sibling generator covers the op. + // (Was a hard throw that aborted the whole build.) + foreach (MorphemicMorphologicalRule rule in slot.Rules) + { + MorphOp ruleOp = RuleOp(rule); + if (ruleOp != MorphOp.Prefix && ruleOp != MorphOp.Suffix && ruleOp != MorphOp.None) + { + _uncoveredOps.Add(ruleOp); + } + } + break; + } + } + prefixSlots.Reverse(); // slot 0 applies first (innermost) → rightmost prefix on the surface + } + + /// The slot's surface role: the first rule that is a prefix or suffix. A slot whose + /// only rules are zero-segment affixes is a (position-less) suffix so it still builds; a slot + /// with no prefix/suffix/zero rule (e.g. infix/reduplication only) is None → skipped. + private static MorphOp SlotOp(AffixTemplateSlot slot) + { + bool hasZero = false; + foreach (MorphemicMorphologicalRule rule in slot.Rules) + { + MorphOp op = RuleOp(rule); + if (op == MorphOp.Prefix || op == MorphOp.Suffix) + { + return op; + } + if (op == MorphOp.None) + { + hasZero = true; // a zero/empty-segment affix — no surface position + } + } + return hasZero ? MorphOp.Suffix : MorphOp.None; + } + + /// The surface role (prefix/suffix/…) of a morphological rule, from its first allomorph. + private static MorphOp RuleOp(MorphemicMorphologicalRule rule) + { + foreach (AffixProcessAllomorph allomorph in Allomorphs(rule)) + { + return MorphTokenCodec.ClassifyOp(allomorph, false); + } + return MorphOp.None; + } + + /// + /// An optional, bounded chain of derivational suffixes (the stratum's standalone affix + /// rules), shared by every root of a template. Permissive by design: a category-illegal + /// derivation (e.g. a nominalizer feeding a verbal suffix) is proposed here and removed by + /// re-synthesis verification (), per the plan §11.2. + /// + private State BuildDerivationSuffixLayer(State entry) + { + return BuildDerivationLayer(entry, _derivSuffixRules, MorphOp.Suffix, null); + } + + /// + /// Compound loop (Phase G2, FST_FULL_GRAMMAR_PLAN.md): lets a root's chain continue into a + /// SECOND root before reaching (this attachment site's + /// derivation/suffix continuation — tlSuffixEntry for the template-less path, or a + /// template's own derivEntry). Returns a shared "join" state with an epsilon arc into + /// EVERY root's shared chain entry; the caller wires each qualifying root's own chain End to + /// this join (in addition to its normal arc) so a compound is + /// just an alternative path through the SAME states, not a separate construct. + /// + /// Permissive by design, like every other build-time gate in this file: it does not check the + /// grammar's CompoundingRule head/non-head part-of-speech pairing (any root may follow any + /// root) — pins the real rule and prunes any pairing the grammar + /// does not actually license, so over-generation here costs a rejected verify candidate, never a + /// wrong answer. Bounded to exactly one extra root: the second root's chain End goes straight to + /// , never back into this join, so a chain of 3+ roots is not + /// modeled (not attested in either real grammar this phase targets). + /// + private State BuildCompoundLoop(List roots, State continuation) + { + State join = NewState(); + foreach (RootRef r in roots) + { + (State entry, State end) = GetOrBuildRootChain(r.Allomorph); + join.Arcs.Add(entry); // epsilon: enter the second root's shared chain + end.Arcs.Add(continuation); // epsilon: second root → this site's continuation + } + return join; + } + + /// + /// An optional, bounded chain of derivational prefixes (the stratum's standalone prefix affix + /// rules) between the inflectional prefixes and the root — mirror of the suffix layer (§12.4). + /// Also returns any junction-deletion exit states this layer's affixes produced (Phase C, + /// FST_FULL_GRAMMAR_PLAN.md — e.g. Indonesian's meN- deleting a following voiceless obstruent): + /// the caller, once it knows the actual root list, wires each to + /// for roots whose own leading segment matches the recorded + /// class (see ). Empty unless the grammar's morpher/SurfacePhonology + /// probing actually found such a case — no-op-cost otherwise. + /// + private ( + State Current, + List<(State ExitState, FeatureStruct RootOnsetClass)> PendingSkips + ) BuildDerivationPrefixLayer(State entry) + { + var pendingSkips = new List<(State, FeatureStruct)>(); + State current = BuildDerivationLayer( + entry, + _derivPrefixRules, + MorphOp.Prefix, + pendingSkips + ); + return (current, pendingSkips); + } + + /// Shared builder for an optional, bounded derivational-affix layer of the given op. + /// , when non-null, collects junction-deletion exit states for + /// this layer's affixes (prefix layer only — see ). + private State BuildDerivationLayer( + State entry, + List rules, + MorphOp op, + List<(State ExitState, FeatureStruct RootOnsetClass)> pendingSkips + ) + { + State current = entry; + for (int k = 0; k < _derivDepth; k++) + { + State after = NewState(); + current.Arcs.Add(after); // epsilon: apply no derivation at this level + foreach (MorphemicMorphologicalRule rule in rules) + { + foreach (AffixProcessAllomorph allomorph in Allomorphs(rule)) + { + if (MorphTokenCodec.ClassifyOp(allomorph, false) != op) + { + continue; + } + uint token = MorphToken.Encode(op, _codec.GetOrAddIndex(allomorph.Morpheme)); + State tokenState = NewState(); + _tokenOnEntry[tokenState] = token; + current.Arcs.Add(tokenState); // epsilon: enter this derivational affix + InsertSegments insert = allomorph.Rhs.OfType().FirstOrDefault(); + BuildAffixArcs(tokenState, after, insert); + if (pendingSkips != null && insert != null) + { + BuildDeletionJunctionArcs(tokenState, insert, pendingSkips); + } + } + } + current = after; + } + return current; + } + + /// + /// Build an affix's segment arcs from to : + /// the underlying form AND each phonologically-altered surface realization (surface-allomorph + /// precompile, Point 1, C-internal tier), so an affix whose surface differs from its underlying + /// segments (e.g. a suffix that devoices word-finally) is matched. A zero-segment affix (null + /// ) just reconverges. Sound: the underlying path is always built, the + /// emitted token is the underlying morpheme, and verify confirms with real phonology; a variant + /// not actually attested is pruned by verify, a missed cross-boundary variant rides the engine. + /// + private void BuildAffixArcs( + State tokenState, + State after, + InsertSegments insert + ) + { + if (insert == null) + { + tokenState.Arcs.Add(after); // zero/empty-segment affix: token only + return; + } + State s = tokenState; + foreach (FeatureStruct fs in GetSegments(insert.Segments.Shape)) + { + s = AddArc(s, fs); + } + s.Arcs.Add(after); + + string underlying = insert.Segments.Representation; + foreach (string variant in _affixSurfaces(underlying)) + { + if (variant == underlying) + { + continue; // underlying path already built + } + Shape vshape; + try + { + vshape = _table.Segment(variant); + } + catch (InvalidShapeException) + { + continue; + } + State sv = tokenState; + foreach (FeatureStruct fs in GetSegments(vshape)) + { + sv = AddArc(sv, fs); + } + sv.Arcs.Add(after); + } + } + + /// + /// Build one arc chain per junction-deletion outcome (Phase C, FST_FULL_GRAMMAR_PLAN.md): the + /// affix's own surface up to but NOT including the neighbor segment it deletes (e.g. "mem" for + /// Indonesian meN- before a bilabial voiceless obstruent that gets deleted), ending at a FRESH + /// exit state kept separate from the layer's shared after — its continuation is + /// root-specific (it must skip exactly the root's own deleted leading segment, not just + /// reconverge normally), so it cannot be wired here; it is recorded in + /// for to finish once the actual + /// root list is known. Distinct outcome strings that collapse to the same surface (multiple + /// alphabet representatives triggering the same assimilated-nasal-then-deletion result) share one + /// exit state rather than building duplicate chains. + /// + private void BuildDeletionJunctionArcs( + State tokenState, + InsertSegments insert, + List<(State ExitState, FeatureStruct RootOnsetClass)> pendingSkips + ) + { + string underlying = insert.Segments.Representation; + var exitByString = new Dictionary>(); + foreach ((string affixSurface, FeatureStruct deletedNeighbor) in _junctionDeletions(underlying)) + { + if (!exitByString.TryGetValue(affixSurface, out State s)) + { + Shape shape; + try + { + shape = _table.Segment(affixSurface); + } + catch (InvalidShapeException) + { + continue; + } + s = tokenState; + foreach (FeatureStruct fs in GetSegments(shape)) + { + s = AddArc(s, fs); + } + exitByString[affixSurface] = s; + } + pendingSkips.Add((s, deletedNeighbor)); + } + } + + /// Finish the wiring could not: for THIS root, + /// connect each pending junction-deletion exit whose recorded class the root's own leading + /// segment unifies with — a build-time gate, so a root NOT starting with the deleted class never + /// gets the skip arc (over-generation would still be sound via verify, but this keeps the branch + /// factor tied to actual grammar shape rather than the full alphabet). Skips exactly the one + /// leading segment the probe found deleted; a root with no segments is never gated (nothing to + /// skip into — would return null anyway). + private void WireDeletionSkips( + List<(State ExitState, FeatureStruct RootOnsetClass)> pendingSkips, + RootAllomorph root + ) + { + if (pendingSkips.Count == 0) + { + return; + } + IReadOnlyList segments = GetSegments(root.Segments.Shape); + if (segments.Count == 0) + { + return; + } + foreach ((State exitState, FeatureStruct onsetClass) in pendingSkips) + { + if (!segments[0].IsUnifiable(onsetClass)) + { + continue; + } + State afterSkip = RootChainAfterSkip(root, 1); + if (afterSkip != null) + { + exitState.Arcs.Add(afterSkip); + } + } + } + + /// Allomorphs of a slot rule — both AffixProcessRule and its realizational sibling. + /// + /// True iff this root may surface uninflected — i.e. synthesizing it with no affixes yields + /// its own surface form. If the grammar makes a bare stem non-final (obligatory inflection), + /// synthesis returns nothing and the bare reading is correctly suppressed. + /// + private static string UnderlyingForm(RootAllomorph root) + { + return root.Segments.Representation.Normalize(System.Text.NormalizationForm.FormD); + } + + /// + /// The bare-root surface realizations: the surface forms HC synthesizes for the root with no + /// affixes (phonology applied). Empty ⇒ the bare root is not a valid word (obligatory + /// inflection). A form ≠ the underlying representation is a phonologically-altered surface the + /// proposer must match (Solution 1, §C). Reuses the same GenerateWords call the obligatoriness + /// check needed, so it is zero extra build cost. + /// + private static IReadOnlyCollection BareRootSurfaces(Morpher morpher, RootAllomorph root) + { + if (!(root.Morpheme is LexEntry entry)) + { + return new[] { UnderlyingForm(root) }; + } + return morpher + .GenerateWords(entry, System.Linq.Enumerable.Empty(), new FeatureStruct()) + .Select(g => g.Normalize(System.Text.NormalizationForm.FormD)) + .Distinct() + .ToList(); + } + + private static FeatureStruct RequiredCategory(MorphemicMorphologicalRule rule) + { + switch (rule) + { + case AffixProcessRule affix: + return affix.RequiredSyntacticFeatureStruct; + case RealizationalAffixProcessRule realizational: + return realizational.RequiredSyntacticFeatureStruct; + default: + return null; + } + } + + /// The category a derivational rule outputs (its OutSyntacticFeatureStruct). + private static FeatureStruct OutCategory(MorphemicMorphologicalRule rule) + { + return rule is AffixProcessRule affix ? affix.OutSyntacticFeatureStruct : null; + } + + /// + /// True iff can be transformed into + /// by a chain of ≤ the derivation-depth bound derivational suffixes (a category-changing + /// derivation, e.g. verb → noun via a nominalizer). Lets a template attach over a derived stem + /// of its output category; the category-changing suffix rides the shared derivation layer. + /// + /// True iff can reach + /// through a bounded chain of derivational affixes AND/OR compounding steps (Phase G2, + /// FST_FULL_GRAMMAR_PLAN.md — added so a root that only qualifies for a template AFTER being + /// compounded, e.g. Sena's class-prefix template gated on a compound's post-NZR category, is + /// still recognized as qualifying). Compounding is modeled permissively, same as everywhere else + /// in this file: may fill EITHER the head or non-head role of any + /// compounding rule it unifies with, without checking that a real partner root exists for the + /// other role — 's CompoundingRule re-check is what prunes + /// a pairing the grammar does not actually license. + private bool DerivableToCategory(FeatureStruct rootCategory, FeatureStruct templateCategory) + { + if (rootCategory == null || templateCategory == null || templateCategory.IsEmpty) + { + return false; + } + var frontier = new List { rootCategory }; + for (int depth = 0; depth < _derivDepth && frontier.Count > 0; depth++) + { + var next = new List(); + foreach (FeatureStruct cat in frontier) + { + foreach (MorphemicMorphologicalRule rule in _derivSuffixRules.Concat(_derivPrefixRules)) + { + FeatureStruct outCat = OutCategory(rule); + if (outCat == null || outCat.IsEmpty) + { + continue; // not a category-changing derivation + } + FeatureStruct inCat = RequiredCategory(rule); + if (inCat != null && !inCat.IsEmpty && !cat.IsUnifiable(inCat)) + { + continue; // rule does not apply to this stem category + } + if (outCat.IsUnifiable(templateCategory)) + { + return true; + } + next.Add(outCat); + } + foreach (CompoundingRule rule in _compoundingRules) + { + FeatureStruct outCat = rule.OutSyntacticFeatureStruct; + if (outCat == null || outCat.IsEmpty) + { + continue; + } + FeatureStruct headReq = rule.HeadRequiredSyntacticFeatureStruct; + FeatureStruct nonHeadReq = rule.NonHeadRequiredSyntacticFeatureStruct; + bool canHead = headReq == null || headReq.IsEmpty || cat.IsUnifiable(headReq); + bool canNonHead = nonHeadReq == null || nonHeadReq.IsEmpty || cat.IsUnifiable(nonHeadReq); + if (!canHead && !canNonHead) + { + continue; // this category fits neither role of this compounding rule + } + if (outCat.IsUnifiable(templateCategory)) + { + return true; + } + next.Add(outCat); + } + } + frontier = next; + } + return false; + } + + private static IEnumerable Allomorphs(MorphemicMorphologicalRule rule) + { + switch (rule) + { + case AffixProcessRule affix: + return affix.Allomorphs; + case RealizationalAffixProcessRule realizational: + return realizational.Allomorphs; + default: + return Enumerable.Empty(); + } + } + + /// Build the slot sequence from ; returns the state after the last slot. + private State AppendSlots( + State start, + List slots, + MorphOp op, + FeatureStruct templateCategory + ) + { + State current = start; + foreach (AffixTemplateSlot slot in slots) + { + State after = NewState(); + if (slot.Optional) + { + current.Arcs.Add(after); // epsilon: skip this slot + } + foreach (MorphemicMorphologicalRule rule in slot.Rules) + { + // Build-time category gate (faithful for inflectional templates, where the + // category is ~constant): a rule whose RequiredSyntacticFeatureStruct cannot + // unify with the template's category can never apply here, so omit it. This is + // HC's Required.Unify(stem) check, hoisted to compile time — no walk-order issue. + FeatureStruct required = RequiredCategory(rule); + if ( + templateCategory != null + && required != null + && !required.IsEmpty + && !templateCategory.IsUnifiable(required) + ) + { + continue; + } + foreach (AffixProcessAllomorph allomorph in Allomorphs(rule)) + { + MorphOp aop = MorphTokenCodec.ClassifyOp(allomorph, false); + if (aop != op && aop != MorphOp.None) + { + // A rule the proposer can't build in this slot (infix/circumfix/redup/ + // process). Skip it and record the op as uncovered; the engine/cache backstop + // and parity gate handle those words unless a sibling generator covers the op. + // (Was a hard throw.) + _uncoveredOps.Add(aop); + continue; + } + // aop == op (normal affix) or aop == None (a true zero-segment affix: no + // InsertSegments) — both emit the morpheme token at this slot's position; a + // zero affix simply adds no segment arcs. + uint affixToken = MorphToken.Encode(op, _codec.GetOrAddIndex(allomorph.Morpheme)); + // Enter the affix through a token-bearing state, so the morpheme is emitted + // even for a zero/empty-segment affix (its token would otherwise be lost). + State tokenState = NewState(); + _tokenOnEntry[tokenState] = affixToken; + current.Arcs.Add(tokenState); // epsilon: enter this affix + BuildAffixArcs(tokenState, after, allomorph.Rhs.OfType().FirstOrDefault()); + } + } + current = after; + } + return current; + } + + /// The shared underlying chain for one root allomorph, built once and reused by every + /// attachment site (bare, template-less, and each qualifying template): callers add an epsilon + /// arc from their own entry state to Entry and an epsilon arc from End to their + /// own continuation. Memoized per so a root attaching to N sites + /// costs one segment chain plus N cheap epsilon arcs, not N segment chains. + private (State Entry, State End) GetOrBuildRootChain(RootAllomorph root) + { + if (_rootChains.TryGetValue(root, out var cached)) + { + return cached; + } + State entry = NewState(); + var checkpoints = new List> { entry }; + State end = BuildRootChain(entry, root, checkpoints); + var chain = (entry, end); + _rootChains[root] = chain; + _rootCheckpoints[root] = checkpoints; + return chain; + } + + /// The state reached after skipping leading segments of + /// 's shared chain — used ONLY for a junction-deletion arc (Phase C, + /// FST_FULL_GRAMMAR_PLAN.md), where the real synthesis cascade deletes the root's own leading + /// segment(s) (e.g. Indonesian meN- + a voiceless obstruent onset). Returns null if the root has + /// fewer than segments (nothing to skip into). Builds the chain (via + /// ) as a side effect if this root has not been attached yet, so + /// callers may use this before or after the root's own normal attachment. + private State RootChainAfterSkip(RootAllomorph root, int skipCount) + { + GetOrBuildRootChain(root); // ensures _rootCheckpoints is populated + IReadOnlyList> checkpoints = _rootCheckpoints[root]; + return skipCount < checkpoints.Count ? checkpoints[skipCount] : null; + } + + private State BuildRootChain( + State from, + RootAllomorph root, + List> checkpoints + ) + { + State state = from; + foreach (FeatureStruct fs in GetSegments(root.Segments.Shape)) + { + state = AddArc(state, fs); + checkpoints.Add(state); + } + _tokenOnEntry[state] = MorphToken.Encode(MorphOp.Root, _codec.GetOrAddIndex(root.Morpheme)); + return state; + } + + /// Build a root chain from a surface STRING (a phonologically-altered realization), + /// segmenting it via the table; the chain ends in the underlying root morpheme's token. Returns + /// null if the surface has a segment outside the table. + private State BuildRootChainFromSurface( + State from, + string surface, + Morpheme morpheme + ) + { + Shape shape; + try + { + shape = _table.Segment(surface); + } + catch (InvalidShapeException) + { + return null; + } + State state = from; + foreach (FeatureStruct fs in GetSegments(shape)) + { + state = AddArc(state, fs); + } + _tokenOnEntry[state] = MorphToken.Encode(MorphOp.Root, _codec.GetOrAddIndex(morpheme)); + return state; + } + + private static bool CategoryMatches(FeatureStruct rootCategory, FeatureStruct required) + { + if (required == null || required.IsEmpty) + { + return true; + } + return rootCategory != null && rootCategory.IsUnifiable(required); + } + + private IReadOnlyList GetSegments(Shape shape) + { + var segments = new List(); + for ( + ShapeNode node = shape.GetFirst(n => _filter(n.Annotation)); + node != null && node != shape.End; + node = node.GetNext(n => _filter(n.Annotation)) + ) + { + FeatureStruct fs = node.Annotation.FeatureStruct.Clone(); + fs.Freeze(); + segments.Add(fs); + } + return segments; + } + + private State AddArc(State state, FeatureStruct condition) + { + State next = NewState(); + state.Arcs.Add(condition, next); + return next; + } + + private State NewState() + { + _stateCount++; + if (_stateCount > _maxStates) + { + throw new NotSupportedException( + $"FstTemplateAnalyzer exceeded the state budget ({_maxStates}); this grammar needs the " + + "lazy / on-the-fly partition (HERMITCRAB_FST_PLAN.md §10) rather than an eager build." + ); + } + State state = _fsa.CreateState(); + _stateIds[state] = _stateCount; + return state; + } + + private static uint[] Append(uint[] tokens, uint token) + { + var result = new uint[tokens.Length + 1]; + tokens.CopyTo(result, 0); + result[tokens.Length] = token; + return result; + } + + private readonly struct Config + { + public Config(State state, uint[] tokens) + { + State = state; + Tokens = tokens; + } + + public State State { get; } + public uint[] Tokens { get; } + } + + /// Value-equality wrapper over a token array, for use as a dictionary/hash-set key + /// without the string-allocation a string.Join key would cost per lookup. + private readonly struct TokenArrayKey : IEquatable + { + private readonly uint[] _tokens; + + public TokenArrayKey(uint[] tokens) + { + _tokens = tokens; + } + + public bool Equals(TokenArrayKey other) + { + if (_tokens.Length != other._tokens.Length) + { + return false; + } + for (int i = 0; i < _tokens.Length; i++) + { + if (_tokens[i] != other._tokens[i]) + { + return false; + } + } + return true; + } + + public override bool Equals(object obj) => obj is TokenArrayKey other && Equals(other); + + // netstandard2.0 has no System.HashCode; combine with the standard FNV-ish rolling hash. + public override int GetHashCode() + { + unchecked + { + int hash = 17; + foreach (uint t in _tokens) + { + hash = (hash * 31) + (int)t; + } + return hash; + } + } + } + + /// Dedup key for a walk : the FSA state id plus the accumulated + /// token history. Two configs with the same state but different token histories are distinct + /// paths (different morpheme sequences reaching the same point) and must not be merged. + private readonly struct ConfigKey : IEquatable + { + private readonly int _stateId; + private readonly TokenArrayKey _tokens; + + public ConfigKey(int stateId, uint[] tokens) + { + _stateId = stateId; + _tokens = new TokenArrayKey(tokens); + } + + public bool Equals(ConfigKey other) => _stateId == other._stateId && _tokens.Equals(other._tokens); + + public override bool Equals(object obj) => obj is ConfigKey other && Equals(other); + + public override int GetHashCode() + { + unchecked + { + return (_stateId * 397) ^ _tokens.GetHashCode(); + } + } + } + + /// Dedup key for a : the inverse-phonology state id, the lexicon + /// state id, and the token history. + private readonly struct PConfigKey : IEquatable + { + private readonly int _pinvState; + private readonly int _lexStateId; + private readonly TokenArrayKey _tokens; + + public PConfigKey(int pinvState, int lexStateId, uint[] tokens) + { + _pinvState = pinvState; + _lexStateId = lexStateId; + _tokens = new TokenArrayKey(tokens); + } + + public bool Equals(PConfigKey other) => + _pinvState == other._pinvState && _lexStateId == other._lexStateId && _tokens.Equals(other._tokens); + + public override bool Equals(object obj) => obj is PConfigKey other && Equals(other); + + public override int GetHashCode() + { + unchecked + { + int hash = _pinvState; + hash = (hash * 397) ^ _lexStateId; + hash = (hash * 397) ^ _tokens.GetHashCode(); + return hash; + } + } + } + + private readonly struct RootRef + { + public RootRef(RootAllomorph allomorph, FeatureStruct category, int stratumIndex) + { + Allomorph = allomorph; + Category = category; + StratumIndex = stratumIndex; + } + + public RootAllomorph Allomorph { get; } + public FeatureStruct Category { get; } + public int StratumIndex { get; } + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstVerification.cs b/src/SIL.Machine.Morphology.HermitCrab/FstVerification.cs new file mode 100644 index 000000000..db88ce024 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/FstVerification.cs @@ -0,0 +1,138 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using SIL.Machine.Morphology; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// One word on which a candidate analyzer's analysis set differs from the reference's: + /// are analyses the reference found but the candidate did not + /// (completeness failures), are analyses the candidate produced + /// that the reference rejects (soundness / over-generation failures). + /// + public sealed class AnalysisDivergence + { + public AnalysisDivergence( + string word, + IReadOnlyList missingFromCandidate, + IReadOnlyList extraInCandidate + ) + { + Word = word; + MissingFromCandidate = missingFromCandidate; + ExtraInCandidate = extraInCandidate; + } + + public string Word { get; } + public IReadOnlyList MissingFromCandidate { get; } + public IReadOnlyList ExtraInCandidate { get; } + } + + /// The result of an FST-vs-search corpus comparison — a manual gap-inspection report, + /// not a gate. Nothing in this codebase treats a clean result as a license to stop running the + /// real engine; it exists purely to show a grammar engineer where the fast path currently + /// diverges from the reference and why. + public sealed class AnalysisComparison + { + public AnalysisComparison(int wordsChecked, IReadOnlyList divergences) + { + WordsChecked = wordsChecked; + Divergences = divergences; + } + + public int WordsChecked { get; } + public IReadOnlyList Divergences { get; } + + /// Words whose analysis sets matched exactly. + public int Matches => WordsChecked - Divergences.Count; + + /// True iff the candidate's analysis SET equals the reference's for every word + /// checked — no missing and no spurious analyses, on this corpus only. Not a proof, and not + /// a signal to disable the reference engine; see . + public bool MatchesReferenceExactly => Divergences.Count == 0; + + /// A readable dump. + public string Format() + { + var sb = new StringBuilder(); + sb.AppendLine( + $"checked {WordsChecked}, {Matches} match, {Divergences.Count} diverge — " + + (MatchesReferenceExactly ? "EXACT MATCH (set parity)" : "DIVERGENCES") + ); + foreach (AnalysisDivergence d in Divergences) + { + sb.AppendLine( + $" {d.Word}: missing=[{string.Join(" | ", d.MissingFromCandidate)}] " + + $"extra=[{string.Join(" | ", d.ExtraInCandidate)}]" + ); + } + return sb.ToString(); + } + } + + /// + /// A manual divergence-inspection tool for benchmarks and ad hoc investigation: run a candidate + /// analyzer (e.g. ) beside the reference () + /// over a corpus and report, per word, where their analysis SETS differ — missing analyses + /// (fast-path gaps) and extra analyses (would be a soundness bug in the candidate, since + /// is supposed to never over-generate). This is diagnostic + /// only: nothing in this codebase gates behavior on its result, and it is not intended to run in + /// CI (the reference can be extremely slow on some words) — use it from + /// [Explicit] benchmarks with a capped corpus. + /// + public static class FstVerification + { + public static AnalysisComparison Compare( + IMorphologicalAnalyzer reference, + IMorphologicalAnalyzer candidate, + IEnumerable words + ) + { + // Identity key per distinct morpheme object: affix Morpheme.Id is empty in many grammars, + // so a name/id-string signature would collapse different affixes of the same shape (e.g. the + // subject markers 3P+2 / 3S+1 / 6) into one key and hide same-shape under-generation. Both + // analyzers reference the SAME Morpheme instances from the Language, so object identity is a + // faithful, shared discriminator. + var ids = new Dictionary(); + string Sig(WordAnalysis a) => Signature(a, ids); + + var divergences = new List(); + int count = 0; + foreach (string word in words) + { + count++; + var referenceSet = new HashSet(reference.AnalyzeWord(word).Select(Sig)); + var candidateSet = new HashSet(candidate.AnalyzeWord(word).Select(Sig)); + if (referenceSet.SetEquals(candidateSet)) + { + continue; + } + List missing = referenceSet + .Except(candidateSet) + .OrderBy(s => s, StringComparer.Ordinal) + .ToList(); + List extra = candidateSet.Except(referenceSet).OrderBy(s => s, StringComparer.Ordinal).ToList(); + divergences.Add(new AnalysisDivergence(word, missing, extra)); + } + return new AnalysisComparison(count, divergences); + } + + /// A signature of one analysis: per-morpheme identity ids (in morph order) + root index. + private static string Signature(WordAnalysis analysis, Dictionary ids) + { + return string.Join("+", analysis.Morphemes.Select(m => Id(m, ids))) + ":" + analysis.RootMorphemeIndex; + } + + private static int Id(IMorpheme morpheme, Dictionary ids) + { + if (!ids.TryGetValue(morpheme, out int id)) + { + id = ids.Count; + ids[morpheme] = id; + } + return id; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs b/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs new file mode 100644 index 000000000..80e191bb4 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs @@ -0,0 +1,614 @@ +using System.Collections.Generic; +using System.Linq; +using System.Text; +using SIL.Machine.Annotations; +using SIL.Machine.DataStructures; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; +using SIL.Machine.Morphology.HermitCrab.PhonologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// How costly a flagged rule is for parsing. + /// + public enum GrammarAdvisorySeverity + { + /// Finite-state-able; informational only. + Info, + + /// Stays finite-state but inflates the combinatorial search fan-out. + Cost, + + /// Breaks finite-state compilation — forces the slow combinatorial search. + Escape, + } + + /// + /// One advisory about a single grammar rule: what makes it expensive, and how to keep + /// (or get) it back on the fast finite-state path. + /// + public sealed class GrammarAdvisory + { + public GrammarAdvisory( + string rule, + string stratum, + string kind, + GrammarAdvisorySeverity severity, + string issue, + string advice, + bool? probeable = null, + bool? regular = null + ) + { + Rule = rule; + Stratum = stratum; + Kind = kind; + Severity = severity; + Issue = issue; + Advice = advice; + Probeable = probeable; + Regular = regular; + } + + /// Name of the offending rule. + public string Rule { get; } + + /// Name of the stratum the rule lives in (rules can appear in more than one). + public string Stratum { get; } + + /// Rule kind (affix / phonological / compounding). + public string Kind { get; } + + public GrammarAdvisorySeverity Severity { get; } + + /// One sentence: what is expensive and why. + public string Issue { get; } + + /// "Constrain it like this" and/or "try this instead". + public string Advice { get; } + + /// + /// For an : whether a per-word un-application + /// probe (strip the affix / de-reduplicate, then re-parse the residue with the FST) is + /// sound for this rule. True = "clean": no phonological rule at or after its + /// stratum can rewrite the affixed span, so the affix surfaces literally and stripping it + /// recovers the stem exactly — the slow path collapses to a cheap local guess+verify. + /// False = "opaque": a later rule may alter the span, so literal stripping can miss an + /// analysis and the search backstop is required. Null = not an insertion escape / N/A. + /// + public bool? Probeable { get; } + + /// + /// For an : whether the construct denotes a + /// regular relation (an FST exists for it in principle). True = regular — it could + /// be reclaimed onto the fast path once the FST compiler exists (state-encode a spreading + /// feature, bounded-fold a finite copy, …); by Kaplan & Kay (1994) every standard + /// rewrite rule is regular regardless of how long its environment is. False = genuinely + /// non-regular (unbounded copy) or unconfirmable. Null = N/A. + /// + /// IMPORTANT: this is a reclaim path, NOT a cost downgrade. A Regular + /// escape is still Escape severity because it is slow in today's engine — + /// the FST compiler that would make it fast is not built yet. Severity tells the truth + /// about today; Regular tells you whether the slowness is fixable by compilation. + /// + public bool? Regular { get; } + } + + /// + /// The result of : the per-rule advisories + /// plus an overall tier verdict. + /// + public sealed class GrammarFstReport + { + public GrammarFstReport( + IReadOnlyList advisories, + int affixRulesExamined, + int phonologicalRulesExamined, + int compoundingRulesExamined + ) + { + Advisories = advisories; + AffixRulesExamined = affixRulesExamined; + PhonologicalRulesExamined = phonologicalRulesExamined; + CompoundingRulesExamined = compoundingRulesExamined; + + // Count per RULE, not per advisory: advisories are emitted per allomorph, so several can + // refer to one rule. Group by (Rule, Stratum, Kind) and take each rule's worst severity, so + // the counts reflect distinct rules and the partitions stay consistent + // (Probeable+Opaque = Escape, Regular+NonRegular = Escape). + List> byRule = advisories + .GroupBy(a => (a.Rule, a.Stratum, a.Kind)) + .ToList(); + EscapeCount = byRule.Count(g => g.Max(a => a.Severity) == GrammarAdvisorySeverity.Escape); + CostCount = byRule.Count(g => g.Max(a => a.Severity) == GrammarAdvisorySeverity.Cost); + InfoCount = byRule.Count(g => g.Max(a => a.Severity) == GrammarAdvisorySeverity.Info); + + // Among escaping rules, a rule is opaque/non-regular if ANY of its escape advisories is + // (the conservative aggregate); the complements partition the escape count exactly. + List> escapeRules = byRule + .Where(g => g.Max(a => a.Severity) == GrammarAdvisorySeverity.Escape) + .ToList(); + OpaqueEscapeCount = escapeRules.Count(g => + g.Any(a => a.Severity == GrammarAdvisorySeverity.Escape && a.Probeable == false) + ); + ProbeableEscapeCount = EscapeCount - OpaqueEscapeCount; + NonRegularEscapeCount = escapeRules.Count(g => + g.Any(a => a.Severity == GrammarAdvisorySeverity.Escape && a.Regular != true) + ); + RegularEscapeCount = EscapeCount - NonRegularEscapeCount; + } + + public IReadOnlyList Advisories { get; } + + /// Affix-process rules inspected (those without an advisory are clean/FST-able). + public int AffixRulesExamined { get; } + + /// Phonological rules (rewrite + metathesis) inspected. + public int PhonologicalRulesExamined { get; } + + /// Compounding rules inspected. + public int CompoundingRulesExamined { get; } + + /// Number of rules that break finite-state compilation. + public int EscapeCount { get; } + + /// Number of rules that inflate the search but stay finite-state. + public int CostCount { get; } + + public int InfoCount { get; } + + /// Escapes for which the per-word un-application probe is sound (clean). + public int ProbeableEscapeCount { get; } + + /// Escapes that may interact with a later rule, so the search backstop is needed. + public int OpaqueEscapeCount { get; } + + /// + /// Escapes that are regular (an FST could reclaim them once the compiler exists). They are + /// still slow in today's engine — this is a reclaim path, not a cost downgrade. + /// + public int RegularEscapeCount { get; } + + /// Escapes that are genuinely non-regular or unconfirmable (no FST in principle). + public int NonRegularEscapeCount { get; } + + /// + /// Static tier candidate. The static report cannot compute the corpus-weighted fallback + /// rate, so for a few escapes it reports the candidate; the FST pipeline's corpus + /// pass confirms whether Tier 2 is worth it vs. Tier 3. + /// + public string Tier => + EscapeCount == 0 ? "Tier 1 candidate — fully FST-able" + : ProbeableEscapeCount == EscapeCount + ? "Tier 2⁺ candidate — every escape is probe-able (surface-invariant): a per-word " + + "un-application probe WOULD recover the fast path once the probe runtime exists; " + + "all escapes are slow in today's engine" + : EscapeCount <= 3 + ? "Tier 2 candidate — hybrid (opaque/non-probe-able escapes fall back to search); confirm with corpus fallback rate" + : "Tier 3 — pervasive escapes, search engine only"; + + /// The rules that break FST compilation (the warnings that flip the tier). + public IEnumerable Escapes => + Advisories.Where(a => a.Severity == GrammarAdvisorySeverity.Escape); + + /// A readable dump of the report. + public string Format() + { + var sb = new StringBuilder(); + sb.AppendLine(Tier); + sb.AppendLine( + $" examined {AffixRulesExamined} affix, {PhonologicalRulesExamined} phonological, " + + $"{CompoundingRulesExamined} compounding rule(s)" + ); + sb.AppendLine( + $" {EscapeCount} escape(s) ({ProbeableEscapeCount} probe-able, {OpaqueEscapeCount} opaque), " + + $"{CostCount} cost(s), {InfoCount} info — {Advisories.Count} rule advisories" + ); + if (EscapeCount > 0) + { + sb.AppendLine( + $" reclaim path: {RegularEscapeCount} of {EscapeCount} escape(s) are FST-reclaimable " + + "(regular) once the FST compiler exists; ALL " + + $"{EscapeCount} are slow in today's engine. {NonRegularEscapeCount} are genuinely " + + "non-regular (per-word probe or search only)." + ); + } + foreach ( + GrammarAdvisory a in Advisories + .OrderByDescending(a => a.Severity) + .ThenBy(a => a.Rule, System.StringComparer.Ordinal) + ) + { + string probe = + a.Probeable == true ? " [probe-able]" + : a.Probeable == false ? " [opaque]" + : ""; + string regular = + a.Regular == true ? " [regular: FST-reclaimable, slow today]" + : a.Regular == false ? " [non-regular]" + : ""; + sb.AppendLine(); + sb.AppendLine($"[{a.Severity}]{probe}{regular} {a.Rule} ({a.Kind}, stratum '{a.Stratum}')"); + sb.AppendLine($" issue : {a.Issue}"); + if (a.Advice.Length > 0) + sb.AppendLine($" advice: {a.Advice}"); + } + return sb.ToString(); + } + } + + /// + /// Static grammar linter for the FST acceleration work (see fst.md / HERMITCRAB_FST_PLAN.md). + /// It walks a compiled and flags, per rule, what makes parsing expensive + /// or blocks finite-state compilation, with an actionable write-up (why it's costly, how to + /// constrain it, what to try instead) and an overall tier verdict. + /// + /// This is pure static analysis of the object model — no parsing, no corpus needed — so it can + /// run at grammar-authoring time or in CI: a new + /// that flips the tier is the "one new rule blew up the grammar" warning. + /// + public static class GrammarFstAdvisor + { + /// + /// Analyze every rule in . + /// + /// A compiled grammar. + /// + /// Above this allomorph count a rule earns a note. + /// + public static GrammarFstReport Analyze(Language language, int manyAllomorphsThreshold = 8) + { + var advisories = new List(); + int affixExamined = 0; + int phonExamined = 0; + int compoundExamined = 0; + + // For the clean/opaque (probe-ability) test: an insertion escape in stratum i is sound + // to un-apply by stripping iff no phonological rule at stratum i or later could rewrite + // the affixed span. Precompute the count of phonological rules at or after each stratum. + IList strata = language.Strata; + var phonAtOrAfter = new int[strata.Count + 1]; + for (int i = strata.Count - 1; i >= 0; i--) + phonAtOrAfter[i] = phonAtOrAfter[i + 1] + strata[i].PhonologicalRules.Count; + + for (int s = 0; s < strata.Count; s++) + { + Stratum stratum = strata[s]; + bool surfaceInvariant = phonAtOrAfter[s] == 0; + foreach (IMorphologicalRule mrule in stratum.MorphologicalRules) + { + switch (mrule) + { + case AffixProcessRule affix: + affixExamined++; + AnalyzeAffix( + affix.Name, + affix.Allomorphs, + stratum.Name, + surfaceInvariant, + advisories, + manyAllomorphsThreshold + ); + break; + case RealizationalAffixProcessRule realizational: + // Realizational affixes also have Allomorphs and can encode + // reduplication/infixation — examine them too (previously skipped). + affixExamined++; + AnalyzeAffix( + realizational.Name, + realizational.Allomorphs, + stratum.Name, + surfaceInvariant, + advisories, + manyAllomorphsThreshold + ); + break; + case CompoundingRule compound: + compoundExamined++; + advisories.Add( + new GrammarAdvisory( + compound.Name, + stratum.Name, + "compounding", + GrammarAdvisorySeverity.Info, + "Compounding rule; bounded by MaxStemCount, so it stays finite-state.", + "Keep MaxStemCount as low as the language needs; unbounded compounding is not finite-state." + ) + ); + break; + } + } + + foreach (IPhonologicalRule prule in stratum.PhonologicalRules) + { + phonExamined++; + AnalyzePhonological(prule, stratum.Name, advisories); + } + } + return new GrammarFstReport(advisories, affixExamined, phonExamined, compoundExamined); + } + + private static void AnalyzeAffix( + string ruleName, + IList allomorphs, + string stratum, + bool surfaceInvariant, + List advisories, + int manyAllomorphsThreshold + ) + { + // An insertion escape is "probe-able" (a per-word strip-and-reparse un-application is + // sound) only when nothing downstream can rewrite the affixed span — i.e. no + // phonological rule applies at or after this rule's stratum. + string probeNote = surfaceInvariant + ? " This escape is PROBE-ABLE: no phonological rule applies after it, so the affix " + + "surfaces literally — a per-word probe that strips the candidate affix and re-parses " + + "the residue with the FST recovers the analysis without the search engine." + : " This escape is OPAQUE: a phonological rule applies after it and may rewrite the " + + "affixed span, so a literal strip-and-reparse probe can miss an analysis; the search " + + "backstop is required."; + + foreach (AffixProcessAllomorph allomorph in allomorphs) + { + // Reduplication: the same input part is copied two or more times. Copying an + // unbounded span is not regular, so the rule is not finite-state. + IGrouping duplicated = allomorph + .Rhs.OfType() + .GroupBy(c => c.PartName) + .FirstOrDefault(g => g.Count() >= 2); + if (duplicated != null) + { + // Boundedness of the copied part decides regularity: a fixed-size reduplicant + // (CV/CVC) is a finite copy → regular (reclaimable by bounded fold); copying an + // unbounded part (the whole stem) is the one genuinely non-regular operation + // ({ww} is not regular). Unresolved part → treat as non-regular (warn). + bool bounded = IsPartBounded(allomorph, duplicated.Key); + string regularNote = bounded + ? " REGULAR (bounded reduplicant = finite copy): an FST could reclaim it by " + + "bounded-folding the copy — once the FST compiler exists. It is still slow in " + + "today's engine." + : " GENUINELY NON-REGULAR (unbounded copy — {ww} is not a regular relation): no FST " + + "exists for it; only the per-word strip-and-reparse probe (when surface-invariant) " + + "or the search engine. Slow today."; + advisories.Add( + new GrammarAdvisory( + ruleName, + stratum, + "affix", + GrammarAdvisorySeverity.Escape, + $"Reduplication: part '{duplicated.Key}' is copied {duplicated.Count()}×, so the " + + "parser falls back to the slow combinatorial search for any word this rule " + + "could apply to.", + "If the reduplicant is a fixed size (e.g. one CV syllable), bound the copied part's " + + "length so it becomes finite-state. If only a handful of forms reduplicate, list " + + "them as lexical entries instead. Otherwise this rule keeps the whole grammar in " + + "the hybrid/search tier." + + probeNote + + regularNote, + surfaceInvariant, + bounded + ) + ); + } + else if (HasInfixedCopy(allomorph.Rhs)) + { + // Infixation: a non-copy action (inserted material) sits BETWEEN two copies of + // the stem (copy…insert…copy), so the stem is split at an internal position. + // Contiguous copies with inserts only at the ends (copy/copy/insert, + // insert/copy/copy, insert/copy/copy/insert) are ordinary prefix / suffix / + // circumfix over a split stem — finite-state, NOT flagged. + advisories.Add( + new GrammarAdvisory( + ruleName, + stratum, + "affix", + GrammarAdvisorySeverity.Escape, + "Infixation: material is inserted between two copies of the stem, splitting it at " + + "an internal position.", + "If the infix position is fixed (a known slot), encode it as a bounded split so it " + + "stays finite-state. A variable, content-determined split blocks FST compilation." + + probeNote + + " REGULAR (the split is described by a regular pattern): an FST could reclaim it " + + "by bounded-folding the split, or the per-word probe handles it — once those exist. " + + "It is still slow in today's engine.", + surfaceInvariant, + regular: true + ) + ); + } + + if (allomorph.Rhs.OfType().Any()) + { + advisories.Add( + new GrammarAdvisory( + ruleName, + stratum, + "affix", + GrammarAdvisorySeverity.Info, + "Process modification (ModifyFromInput) rewrites stem segments; finite-state only if " + + "the change is local and bounded.", + "A feature change in a fixed context is fine; a non-local or agreement-driven change " + + "blocks FST — consider a bounded reformulation." + ) + ); + } + } + + if (allomorphs.Count > manyAllomorphsThreshold) + { + advisories.Add( + new GrammarAdvisory( + ruleName, + stratum, + "affix", + GrammarAdvisorySeverity.Cost, + $"{allomorphs.Count} allomorphs; each one multiplies the un-application branching " + + "during analysis.", + "Consolidate allomorphs via environment conditioning where the language allows it." + ) + ); + } + } + + /// + /// True when a non-copy action (inserted material) appears strictly between the first and + /// last in — i.e. copy…insert…copy, the + /// signature of infixation. Contiguous copies (inserts only at the ends) return false. + /// + private static bool HasInfixedCopy(IList rhs) + { + int first = -1; + int last = -1; + for (int i = 0; i < rhs.Count; i++) + { + if (rhs[i] is CopyFromInput) + { + if (first < 0) + first = i; + last = i; + } + } + if (first < 0 || last == first) + return false; + for (int i = first + 1; i < last; i++) + { + if (!(rhs[i] is CopyFromInput)) + return true; + } + return false; + } + + private static void AnalyzePhonological( + IPhonologicalRule prule, + string stratum, + List advisories + ) + { + switch (prule) + { + case RewriteRule rewrite: + AnalyzeRewrite(rewrite, stratum, advisories); + break; + case MetathesisRule metathesis: + advisories.Add( + new GrammarAdvisory( + metathesis.Name, + stratum, + "phonological", + GrammarAdvisorySeverity.Info, + "Metathesis (segment reordering); finite-state over a bounded span.", + "Keep the reordered span bounded; unbounded metathesis blocks FST." + ) + ); + break; + } + } + + private static void AnalyzeRewrite(RewriteRule rule, string stratum, List advisories) + { + bool unboundedEnvironment = rule.Subrules.Any(sr => + HasUnboundedQuantifier(sr.LeftEnvironment) || HasUnboundedQuantifier(sr.RightEnvironment) + ); + + if (unboundedEnvironment) + { + // Kaplan & Kay (1994): a context-sensitive rewrite rule with regular φ/ψ/λ/ρ, + // applied directionally, denotes a REGULAR relation no matter how long the + // environment is — so an unbounded environment does not make the rule non-regular. + // It is regular iff the rule's own Lhs/Rhs are bounded (only the environment is + // unbounded); if the Lhs/Rhs are themselves unbounded we cannot confirm it. + bool rewriteBounded = + !HasUnboundedQuantifier(rule.Lhs) && rule.Subrules.All(sr => !HasUnboundedQuantifier(sr.Rhs)); + advisories.Add( + new GrammarAdvisory( + rule.Name, + stratum, + "phonological", + GrammarAdvisorySeverity.Escape, + "Unbounded rule environment: the left/right context matches an arbitrary-length span, so " + + "today's engine un-applies it at many positions — slow, and the composed automaton " + + "gains states.", + "Replace the '+'/'*' context with the fixed window the rule actually needs (usually 1–2 " + + "segments)." + + ( + rewriteBounded + ? " REGULAR (Kaplan & Kay 1994: a directional rewrite rule is a regular " + + "relation however long its environment): the long-distance dependency " + + "(e.g. vowel harmony / spreading) can be state-encoded into the FST — once " + + "the compiler exists. It is still slow in today's engine." + : " The rule's own LHS/RHS is unbounded, so regularity cannot be confirmed — " + + "treat as non-regular." + ), + regular: rewriteBounded + ) + ); + } + else + { + advisories.Add( + new GrammarAdvisory( + rule.Name, + stratum, + "phonological", + GrammarAdvisorySeverity.Info, + "Rewrite rule with a bounded environment: finite-state. It adds states to the composed " + + "transducer.", + "Keep the environment as tight as the language requires." + ) + ); + } + + // Deletion: the LHS is longer than every subrule's RHS. During analysis the parser must + // guess where the deleted segments were and re-insert them (× DeletionReapplications), + // which multiplies the search. + int lhsSegments = CountConstraints(rule.Lhs); + if (lhsSegments > 0 && rule.Subrules.All(sr => CountConstraints(sr.Rhs) < lhsSegments)) + { + advisories.Add( + new GrammarAdvisory( + rule.Name, + stratum, + "phonological", + GrammarAdvisorySeverity.Cost, + "Deletion rule (LHS longer than RHS): during analysis the parser guesses where the " + + "deleted segments were and re-inserts them (× DeletionReapplications), multiplying " + + "the search.", + "Keep DeletionReapplications as low as the language needs; a bounded deletion context is " + + "still finite-state." + ) + ); + } + } + + /// + /// Whether the copied part named is length-bounded — i.e. its + /// defining pattern has no unbounded quantifier. + /// Bounded ⇒ a finite copy ⇒ regular. Unresolved part ⇒ false (conservative: warn). + /// + private static bool IsPartBounded(AffixProcessAllomorph allomorph, string partName) + { + Pattern part = allomorph.Lhs.FirstOrDefault(p => p.Name == partName); + if (part == null) + return false; + return !HasUnboundedQuantifier(part); + } + + private static bool HasUnboundedQuantifier(Pattern pattern) + { + if (pattern == null) + return false; + return pattern + .GetNodesDepthFirst() + .OfType>() + .Any(q => q.MaxOccur == Quantifier.Infinite); + } + + private static int CountConstraints(Pattern pattern) + { + if (pattern == null) + return 0; + return pattern.GetNodesDepthFirst().OfType>().Count(); + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/HCRuleBase.cs b/src/SIL.Machine.Morphology.HermitCrab/HCRuleBase.cs index 4e1fa8c68..4c204feeb 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/HCRuleBase.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/HCRuleBase.cs @@ -16,9 +16,9 @@ protected HCRuleBase() public string Name { get; set; } - public abstract IRule CompileAnalysisRule(Morpher morpher); + public abstract IRule CompileAnalysisRule(Morpher morpher); - public abstract IRule CompileSynthesisRule(Morpher morpher); + public abstract IRule CompileSynthesisRule(Morpher morpher); public IDictionary Properties { diff --git a/src/SIL.Machine.Morphology.HermitCrab/HermitCrabExtensions.cs b/src/SIL.Machine.Morphology.HermitCrab/HermitCrabExtensions.cs index bd05a1c74..5cf2ad5af 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/HermitCrabExtensions.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/HermitCrabExtensions.cs @@ -3,6 +3,7 @@ using System.Text; using System.Text.RegularExpressions; using SIL.Machine.Annotations; +using SIL.Machine.DataStructures; using SIL.Machine.FeatureModel; using SIL.Machine.Matching; using SIL.ObjectModel; @@ -21,11 +22,64 @@ public static FeatureSymbol Type(this Annotation ann) return (FeatureSymbol)ann.FeatureStruct.GetValue(HCFeatureSystem.Type); } - public static FeatureSymbol Type(this Constraint constraint) + public static FeatureSymbol Type(this Constraint constraint) { return (FeatureSymbol)constraint.FeatureStruct.GetValue(HCFeatureSystem.Type); } + // RUSTIFY Stage 2: the FST binds as Fst and its matcher filters / inspects the + // shape's int-offset annotation projection (Annotation), which shares the FeatureStruct + // with the ShapeNode annotations — so these read identically to the ShapeNode overloads. + public static FeatureSymbol Type(this Annotation ann) + { + return (FeatureSymbol)ann.FeatureStruct.GetValue(HCFeatureSystem.Type); + } + + internal static bool IsDeleted(this Annotation ann) + { + SymbolicFeatureValue sfv; + if (ann.FeatureStruct.TryGetValue(HCFeatureSystem.Deletion, out sfv)) + return ((FeatureSymbol)sfv) == HCFeatureSystem.Deleted; + return false; + } + + // ---- RUSTIFY Stage 2: int match/group offset -> ShapeNode resolution ---- + // The FST binds as Fst with offset = node Tag and half-open annotation ranges + // [tag, tag+1). A match/group Range is therefore [leftmostTag, rightmostTag+1): the + // leftmost node is NodeAt(Start) and the rightmost is NodeAt(End-1). These helpers re-express + // the old ShapeNode range navigation (range.Start/.End/.GetStart(dir)/.GetEnd(dir)) over int + // offsets so rule RHS code can keep operating on the segment graph. + + internal static ShapeNode StartNode(this Shape shape, Range range) + { + return shape.NodeAt(range.Start); + } + + internal static ShapeNode EndNode(this Shape shape, Range range) + { + return shape.NodeAt(range.End - 1); + } + + internal static ShapeNode GetStartNode(this Shape shape, Range range, Direction dir) + { + return dir == Direction.LeftToRight ? shape.NodeAt(range.Start) : shape.NodeAt(range.End - 1); + } + + internal static ShapeNode GetEndNode(this Shape shape, Range range, Direction dir) + { + return dir == Direction.LeftToRight ? shape.NodeAt(range.End - 1) : shape.NodeAt(range.Start); + } + + internal static Range ToShapeRange(this Shape shape, Range range) + { + return Range.Create(shape.NodeAt(range.Start), shape.NodeAt(range.End - 1)); + } + + internal static IEnumerable GetNodes(this Shape shape, Range range) + { + return shape.GetNodes(shape.ToShapeRange(range)); + } + internal static FeatureStruct AntiFeatureStruct(this FeatureStruct fs) { // TODO: handle reentrance properly @@ -140,14 +194,14 @@ internal static IEnumerable RemoveDuplicates(this IEnumerable words) return output; } - internal static IEnumerable> DeepCloneExceptBoundaries( - this IEnumerable> nodes + internal static IEnumerable> DeepCloneExceptBoundaries( + this IEnumerable> nodes ) { - foreach (PatternNode node in nodes) + foreach (PatternNode node in nodes) { if ( - node is Constraint constraint + node is Constraint constraint && (constraint.FeatureStruct.IsEmpty || constraint.Type() != HCFeatureSystem.Boundary) ) { @@ -155,27 +209,25 @@ node is Constraint constraint continue; } - if (node is Alternation alternation) + if (node is Alternation alternation) { - var newAlteration = new Alternation( - alternation.Children.DeepCloneExceptBoundaries() - ); + var newAlteration = new Alternation(alternation.Children.DeepCloneExceptBoundaries()); if (newAlteration.Children.Count > 0) yield return newAlteration; continue; } - if (node is Group group) + if (node is Group group) { - var newGroup = new Group(group.Name, group.Children.DeepCloneExceptBoundaries()); + var newGroup = new Group(group.Name, group.Children.DeepCloneExceptBoundaries()); if (newGroup.Children.Count > 0) yield return newGroup; continue; } - if (node is Quantifier quantifier) + if (node is Quantifier quantifier) { - var newQuantifier = new Quantifier( + var newQuantifier = new Quantifier( quantifier.MinOccur, quantifier.MaxOccur, quantifier.Children.DeepCloneExceptBoundaries().SingleOrDefault() @@ -185,12 +237,9 @@ node is Constraint constraint continue; } - if (node is Pattern pattern) + if (node is Pattern pattern) { - var newPattern = new Pattern( - pattern.Name, - pattern.Children.DeepCloneExceptBoundaries() - ); + var newPattern = new Pattern(pattern.Name, pattern.Children.DeepCloneExceptBoundaries()); if (newPattern.Children.Count > 0) yield return newPattern; } diff --git a/src/SIL.Machine.Morphology.HermitCrab/IHCRule.cs b/src/SIL.Machine.Morphology.HermitCrab/IHCRule.cs index 9ae4e7c27..3c98ae8a3 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/IHCRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/IHCRule.cs @@ -7,7 +7,7 @@ public interface IHCRule { string Name { get; set; } - IRule CompileAnalysisRule(Morpher morpher); - IRule CompileSynthesisRule(Morpher morpher); + IRule CompileAnalysisRule(Morpher morpher); + IRule CompileSynthesisRule(Morpher morpher); } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/InfixProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/InfixProposer.cs new file mode 100644 index 000000000..b1fe23101 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/InfixProposer.cs @@ -0,0 +1,117 @@ +using System; +using System.Collections.Generic; +using System.Text; +using SIL.Machine.Morphology; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// A candidate generator for infixation (an affix inserted inside the stem, e.g. Tagalog + /// -um-: sulat → s·um·ulat) — a regular construct the FST proposer recognizes but does not build + /// (FST_FULL_PLAN.md, Point 2). Handled here as a sibling generator feeding the same + /// gate. + /// + /// Mechanism (remove + recurse): for each infix and each interior position where the infix's surface + /// segments occur, remove them and recurse the residual through the FST proposer (so an + /// infixed form of an inflected stem is covered), then append the infix morpheme in HC application + /// order (root·…·INF). Over-approximation: every interior occurrence is tried; verify prunes the + /// wrong splits (a wrong removal won't re-synthesize to the surface). `O(surface-length × infixes × + /// surface-variants)` candidates — bounded. + /// + /// Surface variants: an infix whose own surface is phonologically altered (e.g. it devoices between + /// two consonants) would otherwise never be found by a literal search for its underlying form, so + /// each infix's variants (the same isolation/boundary-probe precompile + /// built for regular affix arcs) are searched too — the underlying form is always included, so a + /// 0-phonology grammar is unaffected. + /// + /// Scope (first cut): the infix must be a single contiguous run of inserted segments. Templatic + /// multi-slot infixes (separate insert runs) are left to the engine (the parity gate keeps results + /// correct — those words simply ride the slow path). + /// + public class InfixProposer : IConstructProposer + { + private static readonly MorphOp[] _ops = { MorphOp.Infix }; + private readonly IMorphologicalAnalyzer _baseProposer; + private readonly List<(MorphemicMorphologicalRule Rule, IReadOnlyCollection Surfaces)> _infixes; + + public InfixProposer(Language language, IMorphologicalAnalyzer baseProposer) + { + _baseProposer = baseProposer; + _infixes = new List<(MorphemicMorphologicalRule, IReadOnlyCollection)>(); + var surfacePhonology = new SurfacePhonology(language, new Morpher(new TraceManager(), language)); + foreach (Stratum stratum in language.Strata) + { + foreach (IMorphologicalRule mrule in stratum.MorphologicalRules) + { + if (!(mrule is AffixProcessRule rule)) + { + continue; + } + foreach (AffixProcessAllomorph allomorph in rule.Allomorphs) + { + if (MorphTokenCodec.ClassifyOp(allomorph, false) != MorphOp.Infix) + { + continue; + } + string infix = InfixString(allomorph); + if (!string.IsNullOrEmpty(infix)) + { + _infixes.Add((rule, surfacePhonology.Variants(infix))); + } + } + } + } + } + + public IReadOnlyCollection CoveredOps => _ops; + + public IEnumerable AnalyzeWord(string word) + { + foreach ((MorphemicMorphologicalRule rule, IReadOnlyCollection surfaces) in _infixes) + { + foreach (string infix in surfaces) + { + // Interior occurrences only: stem material both before (i >= 1) and after the infix. + int i = word.IndexOf(infix, 1, StringComparison.Ordinal); + while (i >= 1 && i + infix.Length < word.Length) + { + string residual = word.Remove(i, infix.Length); + foreach (WordAnalysis baseAnalysis in _baseProposer.AnalyzeWord(residual)) + { + var morphemes = new List(baseAnalysis.Morphemes) { rule }; + yield return new WordAnalysis(morphemes, baseAnalysis.RootMorphemeIndex, null); + } + i = word.IndexOf(infix, i + 1, StringComparison.Ordinal); + } + } + } + } + + /// The infix's inserted material iff it is a single contiguous run of inserted segments; + /// null for templatic multi-slot infixes (left to the engine). + private static string InfixString(AffixProcessAllomorph allomorph) + { + var runs = new List(); + StringBuilder current = null; + foreach (MorphologicalOutputAction action in allomorph.Rhs) + { + if (action is InsertSegments insert) + { + current = current ?? new StringBuilder(); + current.Append(insert.Segments.Representation); + } + else if (current != null) + { + runs.Add(current.ToString()); + current = null; + } + } + if (current != null) + { + runs.Add(current.ToString()); + } + return runs.Count == 1 ? runs[0] : null; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/InversePhonology.cs b/src/SIL.Machine.Morphology.HermitCrab/InversePhonology.cs new file mode 100644 index 000000000..c02800439 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/InversePhonology.cs @@ -0,0 +1,63 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.FeatureModel; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// An inverse-phonology transducer (surface → underlying) for Lever 2 lazy composition + /// (LEVER_2.md). States are ints; an arc carries a surface input feature structure + /// (null = ε-input, i.e. a deletion restoration that consumes no surface) and an + /// underlying output feature structure. walks + /// this against the morphotactic acceptor as a product automaton: the underlying output must unify a + /// lexicon arc, so a restoration only survives where the lexicon actually has that underlying segment + /// — the constraint the runtime inverse lacked. + /// + /// This is the consuming end of Lever 2 (proven); building a general + /// from a grammar's phonological rules (substitution + deletion + cascades) is Blocker 2's remaining + /// compiler work. + /// + public sealed class InversePhonology + { + public readonly struct Arc + { + public Arc(FeatureStruct surfaceInput, FeatureStruct underlyingOutput, int target) + { + SurfaceInput = surfaceInput; + UnderlyingOutput = underlyingOutput; + Target = target; + } + + /// The surface segment consumed, or null for an ε-input (restoration) arc. + public FeatureStruct SurfaceInput { get; } + + /// The underlying segment emitted (matched against a lexicon arc). + public FeatureStruct UnderlyingOutput { get; } + + public int Target { get; } + + public bool IsEpsilonInput => SurfaceInput == null; + } + + private readonly Dictionary> _arcs = new Dictionary>(); + private readonly HashSet _accepting = new HashSet(); + + public int StartState { get; set; } + + public void AddArc(int from, FeatureStruct surfaceInput, FeatureStruct underlyingOutput, int to) + { + if (!_arcs.TryGetValue(from, out List list)) + { + _arcs[from] = list = new List(); + } + list.Add(new Arc(surfaceInput, underlyingOutput, to)); + } + + public void SetAccepting(int state) => _accepting.Add(state); + + public bool IsAccepting(int state) => _accepting.Contains(state); + + public IReadOnlyList ArcsFrom(int state) => + _arcs.TryGetValue(state, out List list) ? list : (IReadOnlyList)System.Array.Empty(); + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/Language.cs b/src/SIL.Machine.Morphology.HermitCrab/Language.cs index b88afaf37..fd8e983b8 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/Language.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/Language.cs @@ -137,14 +137,14 @@ public ICollection PhonologicalRules get { return _allomorphCoOccurRules; } } - public override IRule CompileAnalysisRule(Morpher morpher) + public override IRule CompileAnalysisRule(Morpher morpher) { return new AnalysisLanguageRule(morpher, this); } - public override IRule CompileSynthesisRule(Morpher morpher) + public override IRule CompileSynthesisRule(Morpher morpher) { - return new PipelineRuleCascade( + return new PipelineRuleCascade( _strata.Select(stratum => stratum.CompileSynthesisRule(morpher)), FreezableEqualityComparer.Default ); diff --git a/src/SIL.Machine.Morphology.HermitCrab/LockstepPhonologyProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/LockstepPhonologyProposer.cs new file mode 100644 index 000000000..f08b989ca --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/LockstepPhonologyProposer.cs @@ -0,0 +1,69 @@ +using System.Collections.Generic; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Phonology coverage via LEVER_2.md's lazy lockstep composition: + /// auto-builds an from the grammar's rewrite rules, and + /// walks it against the underlying-only lexicon in + /// lockstep — the lexicon constrains every restoration as it happens, so boundary-conditioned rules + /// (where 's boundary-less un-apply over-generates) stay + /// sound here by construction. + /// + /// Narrower than today, not a replacement (yet). The + /// v1 compiler only handles single-segment, right-context-only rules with no true multi-rule + /// cascade composition (see for the exact supported shape) — it + /// is wired in ADDITIVELY alongside the existing phonology proposer, not instead of it. Retiring + /// / is gated on this + /// compiler demonstrably matching or exceeding their coverage on real grammars (see + /// FST_FAST_PATH_PLAN.md Phase 3d) — not assumed. + /// + public sealed class LockstepPhonologyProposer : IConstructProposer + { + private static readonly MorphOp[] _ops = new MorphOp[0]; + private readonly FstTemplateAnalyzer _underlyingOnlyFst; + private readonly InversePhonology _pinv; + private readonly bool _hasArcs; + + public LockstepPhonologyProposer(Language language, Morpher morpher) + { + // Lex must be the underlying-only acceptor (default ctor, no surface precompile) — see + // LEVER_2.md: composing against the surface-precompiled FST would apply phonology twice. + _underlyingOnlyFst = new FstTemplateAnalyzer(language); + (InversePhonology pinv, int unsupported) = PhonologyRuleCompiler.Compile(language, morpher); + _pinv = pinv; + UnsupportedRuleCount = unsupported; + // An all-identity Pinv (no rule contributed a restoration/substitution branch) walks + // exactly like the plain underlying lexicon, so skip it — the bare FST already covers that. + _hasArcs = HasNonIdentityArcs(pinv); + } + + /// How many (rule, subrule) pairs the compiler could not put into this Pinv — a + /// coverage diagnostic (see ), not a soundness signal. + public int UnsupportedRuleCount { get; } + + /// Phonology completeness is not a per-construct MorphOp; validated empirically. + public IReadOnlyCollection CoveredOps => _ops; + + public IEnumerable AnalyzeWord(string word) + { + if (!_hasArcs) + { + return System.Linq.Enumerable.Empty(); + } + return _underlyingOnlyFst.AnalyzeComposed(word, _pinv); + } + + private static bool HasNonIdentityArcs(InversePhonology pinv) + { + foreach (InversePhonology.Arc arc in pinv.ArcsFrom(pinv.StartState)) + { + if (arc.IsEpsilonInput || !arc.SurfaceInput.ValueEquals(arc.UnderlyingOutput)) + { + return true; + } + } + return false; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphToken.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphToken.cs new file mode 100644 index 000000000..867a5be5c --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphToken.cs @@ -0,0 +1,111 @@ +using System; +using System.Collections.Generic; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// The role/operation of a morpheme in a derivation — the high 8-bit field of a packed + /// . It is the "ordered operation connected to the letters": it lets a + /// consumer rebuild the gloss/bracketing of an analysis without re-running any rule. + /// + public enum MorphOp : byte + { + /// Unset / not a morpheme boundary. + None = 0, + + /// The root (stem) morpheme. + Root = 1, + + /// A prefix. + Prefix = 2, + + /// A suffix. + Suffix = 3, + + /// An infix (inserted inside the stem). + Infix = 4, + + /// Reduplication. + Reduplication = 5, + + /// The prefixal half of a circumfix. + CircumfixPrefix = 6, + + /// The suffixal half of a circumfix. + CircumfixSuffix = 7, + + /// A compounding element (a non-head stem). + Compound = 8, + + /// A clitic. + Clitic = 9, + + /// A process / simulfix (a ModifyFromInput-style change, no added segments). + Process = 10, + + /// A zero (null) morph. + Null = 11, + } + + /// + /// A 32-bit packed analysis token: high 8 bits = , low 24 bits = a + /// morpheme index into the grammar's compiled morpheme table. The analyzer transducer emits + /// one token per morpheme, in application order; the resulting uint[] IS the structured + /// analysis. It is self-describing — the morpheme order is the array order, and the root + /// position is the index of the token, so no separate + /// RootMorphemeIndex field is needed. See HERMITCRAB_FST_PLAN.md §8. + /// + public static class MorphToken + { + /// Number of low bits reserved for the morpheme index. + public const int MorphemeIdBits = 24; + + /// Largest encodable morpheme index (16,777,215). + public const int MaxMorphemeId = (1 << MorphemeIdBits) - 1; + + private const uint MorphemeIdMask = (1u << MorphemeIdBits) - 1; + + /// Pack a (role, morpheme index) pair into one 32-bit token. + /// + /// does not fit in bits. + /// + public static uint Encode(MorphOp op, int morphemeId) + { + if (morphemeId < 0 || morphemeId > MaxMorphemeId) + { + throw new ArgumentOutOfRangeException( + nameof(morphemeId), + morphemeId, + $"morpheme index must be in [0, {MaxMorphemeId}] to fit in {MorphemeIdBits} bits" + ); + } + return ((uint)op << MorphemeIdBits) | (uint)morphemeId; + } + + /// The morpheme's role/operation. + public static MorphOp GetOp(uint token) => (MorphOp)(token >> MorphemeIdBits); + + /// The morpheme index into the grammar's compiled morpheme table. + public static int GetMorphemeId(uint token) => (int)(token & MorphemeIdMask); + + /// + /// Index of the token in a derivation array, or -1 if none. + /// This recovers WordAnalysis.RootMorphemeIndex from the token array itself. + /// + public static int RootIndex(IReadOnlyList tokens) + { + if (tokens == null) + { + return -1; + } + for (int i = 0; i < tokens.Count; i++) + { + if (GetOp(tokens[i]) == MorphOp.Root) + { + return i; + } + } + return -1; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphTokenCodec.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphTokenCodec.cs new file mode 100644 index 000000000..58104cb57 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphTokenCodec.cs @@ -0,0 +1,131 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Converts a parsed into the packed 32-bit morpheme-token array + /// (HERMITCRAB_FST_PLAN.md §8) and assigns each morpheme a stable 24-bit index. This is the + /// reference encoder the FST compiler will emit as arc outputs; it also proves the schema + /// faithfully reproduces a real HC analysis — encoding a and decoding it + /// yields the same morphemes (and root) that WordAnalysis carries, with the operation + /// of each morpheme recovered from the rule that introduced it. + /// + public class MorphTokenCodec + { + private readonly Dictionary _indexByMorpheme = new Dictionary(); + private readonly List _morphemesByIndex = new List(); + + /// Number of distinct morphemes that have been assigned an index. + public int MorphemeCount => _morphemesByIndex.Count; + + /// The morpheme assigned a given 24-bit index. + public Morpheme GetMorpheme(int index) => _morphemesByIndex[index]; + + /// + /// Encode a parsed word as its derivation token array: one per + /// morpheme in application order, the head root tagged . Mirrors + /// the morpheme order and root choice that Morpher.CreateWordAnalysis produces. + /// + public uint[] Encode(Word word) + { + var tokens = new List(); + foreach (Allomorph allo in word.AllomorphsInMorphOrder) + { + MorphOp op = ClassifyOp(allo, allo == word.RootAllomorph); + tokens.Add(MorphToken.Encode(op, GetOrAddIndex(allo.Morpheme))); + } + return tokens.ToArray(); + } + + /// Assign (or look up) the stable 24-bit index for a morpheme. + public int GetOrAddIndex(Morpheme morpheme) + { + if (!_indexByMorpheme.TryGetValue(morpheme, out int index)) + { + index = _morphemesByIndex.Count; + _indexByMorpheme[morpheme] = index; + _morphemesByIndex.Add(morpheme); + } + return index; + } + + /// + /// Determine the role/operation of an applied allomorph: the head root is + /// ; any other root (a compound stem) is + /// ; an affix is classified from its output actions. + /// + public static MorphOp ClassifyOp(Allomorph allomorph, bool isHeadRoot) + { + if (isHeadRoot) + { + return MorphOp.Root; + } + if (allomorph is RootAllomorph) + { + return MorphOp.Compound; + } + if (allomorph is AffixProcessAllomorph affix) + { + return ClassifyAffix(affix.Rhs); + } + return MorphOp.None; + } + + private static MorphOp ClassifyAffix(IList rhs) + { + // Reduplication: the same input part is copied two or more times. + bool reduplication = rhs.OfType().GroupBy(c => c.PartName).Any(g => g.Count() >= 2); + if (reduplication) + { + return MorphOp.Reduplication; + } + + int firstCopy = -1; + int lastCopy = -1; + for (int i = 0; i < rhs.Count; i++) + { + if (rhs[i] is CopyFromInput) + { + if (firstCopy < 0) + { + firstCopy = i; + } + lastCopy = i; + } + } + + if (firstCopy < 0) + { + // No copy of the stem: a pure insertion, or a process (ModifyFromInput) change. + return rhs.OfType().Any() ? MorphOp.Process : MorphOp.None; + } + + // Inserted material BETWEEN two copies of the stem = infixation. + for (int i = firstCopy + 1; i < lastCopy; i++) + { + if (!(rhs[i] is CopyFromInput)) + { + return MorphOp.Infix; + } + } + + bool leadingInsert = firstCopy > 0; + bool trailingInsert = lastCopy < rhs.Count - 1; + if (leadingInsert && trailingInsert) + { + return MorphOp.CircumfixPrefix; + } + if (leadingInsert) + { + return MorphOp.Prefix; + } + if (trailingInsert) + { + return MorphOp.Suffix; + } + return MorphOp.None; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphemicMorphologicalRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphemicMorphologicalRule.cs index 0381fe255..8a1e76e51 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphemicMorphologicalRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphemicMorphologicalRule.cs @@ -13,7 +13,7 @@ public override MorphemeType MorphemeType get { return MorphemeType.Affix; } } - public abstract IRule CompileAnalysisRule(Morpher morpher); - public abstract IRule CompileSynthesisRule(Morpher morpher); + public abstract IRule CompileAnalysisRule(Morpher morpher); + public abstract IRule CompileSynthesisRule(Morpher morpher); } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs b/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs index e4fd1879d..10cdc45c6 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs @@ -1,9 +1,12 @@ using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; +using System.Threading.Tasks; using SIL.Extensions; using SIL.Machine.Annotations; using SIL.Machine.FeatureModel; +using SIL.Machine.FiniteState; using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; using SIL.Machine.Rules; using SIL.ObjectModel; @@ -11,27 +14,36 @@ using System.IO; #endif -#if !SINGLE_THREADED -using System.Collections.Concurrent; -using System.Threading.Tasks; -#endif - namespace SIL.Machine.Morphology.HermitCrab { public class Morpher : IMorphologicalAnalyzer, IMorphologicalGenerator { private readonly Language _lang; - private readonly IRule _analysisRule; - private readonly IRule _synthesisRule; + private readonly IRule _analysisRule; + private readonly IRule _synthesisRule; private readonly Dictionary _allomorphTries; private readonly ITraceManager _traceManager; private readonly ReadOnlyObservableCollection _morphemes; private readonly IList _lexicalPatterns = new List(); public Morpher(ITraceManager traceManager, Language lang) + : this(traceManager, lang, -1) { } + + /// + /// Caps the parallelism used within a single parse. A value of 1 makes the + /// morpher fully single-threaded (analysis cascade, affix-template unapplication, and + /// synthesis all run sequentially) — this is the mode a caller should use when it + /// parallelizes across words itself (e.g. FieldWorks' "Parse All Words"), to + /// avoid nested parallelism / thread-pool oversubscription. Any value <= 0 defaults + /// to (the historical behavior). + /// + public Morpher(ITraceManager traceManager, Language lang, int maxDegreeOfParallelism) { _lang = lang; _traceManager = traceManager; + // Must be set before CompileAnalysisRule: the analysis rules choose a sequential vs. + // parallel cascade at construction time based on this value. + MaxDegreeOfParallelism = maxDegreeOfParallelism <= 0 ? Environment.ProcessorCount : maxDegreeOfParallelism; _allomorphTries = new Dictionary(); var morphemes = new ObservableList(); foreach (Stratum stratum in _lang.Strata) @@ -84,6 +96,12 @@ public ITraceManager TraceManager /// public bool MergeEquivalentAnalyses { get; set; } + /// + /// Caps parallelism used within a single parse; 1 = fully single-threaded. + /// Set via the constructor (it influences how the analysis rules are compiled). + /// + public int MaxDegreeOfParallelism { get; } + public Func LexEntrySelector { get; set; } public Func RuleSelector { get; set; } @@ -121,7 +139,7 @@ public IEnumerable ParseWord(string word, out object trace, bool guessRoot trace = input.CurrentTrace; // Unapply rules - var analyses = new ConcurrentQueue(_analysisRule.Apply(input)); + IList analyses = _analysisRule.Apply(input).ToList(); #if OUTPUT_ANALYSES var lines = new List(); @@ -134,7 +152,8 @@ public IEnumerable ParseWord(string word, out object trace, bool guessRoot File.WriteAllLines("analyses.txt", lines.OrderBy(l => l)); #endif - IList origAnalyses = guessRoot ? analyses.ToList() : null; + // analyses is already materialized and Synthesize doesn't mutate it, so no copy needed. + IList origAnalyses = guessRoot ? analyses : null; IList syntheses = Synthesize(word, analyses).ToList(); if (guessRoot && syntheses.Count == 0) { @@ -196,6 +215,7 @@ out object trace a => rulePermutations, (a, p) => new { Allomorph = a, RulePermutation = p } ), + new ParallelOptions { MaxDegreeOfParallelism = MaxDegreeOfParallelism }, (synthesisInfo, state) => { try @@ -279,53 +299,32 @@ Stack> permutation in PermuteRules( } } -#if SINGLE_THREADED - private IEnumerable Synthesize(string word, IEnumerable analyses) + private IEnumerable Synthesize(string word, IList analyses) { - var matches = new HashSet(FreezableEqualityComparer.Default); - foreach (Word analysisWord in analyses) + // Single-threaded: used when the caller parallelizes across words itself. + if (MaxDegreeOfParallelism == 1) { - foreach (Word synthesisWord in LexicalLookup(analysisWord)) + var matches = new HashSet(FreezableEqualityComparer.Default); + foreach (Word analysisWord in analyses) { - foreach (Word alternative in synthesisWord.ExpandAlternatives()) - { - foreach (Word validWord in _synthesisRule.Apply(alternative).Where(IsWordValid)) - { - if (IsMatch(word, validWord)) - matches.Add(validWord); - } - } + foreach (Word validWord in SynthesizeAnalysis(word, analysisWord)) + matches.Add(validWord); } + return matches; } - return matches; - } -#else - private IEnumerable Synthesize(string word, ConcurrentQueue analyses) - { - var matches = new ConcurrentBag(); + + // Parallel across the candidate analyses of this one word. + var parallelMatches = new ConcurrentBag(); Exception exception = null; Parallel.ForEach( - Partitioner.Create(0, analyses.Count), - new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount }, - (range, state) => + analyses, + new ParallelOptions { MaxDegreeOfParallelism = MaxDegreeOfParallelism }, + (analysisWord, state) => { try { - for (int i = 0; i < range.Item2 - range.Item1; i++) - { - analyses.TryDequeue(out Word analysisWord); - foreach (Word synthesisWord in LexicalLookup(analysisWord)) - { - foreach (Word alternative in synthesisWord.ExpandAlternatives()) - { - foreach (Word validWord in _synthesisRule.Apply(alternative).Where(IsWordValid)) - { - if (IsMatch(word, validWord)) - matches.Add(validWord); - } - } - } - } + foreach (Word validWord in SynthesizeAnalysis(word, analysisWord)) + parallelMatches.Add(validWord); } catch (Exception e) { @@ -336,9 +335,23 @@ private IEnumerable Synthesize(string word, ConcurrentQueue analyses ); if (exception != null) throw exception; - return matches.Distinct(FreezableEqualityComparer.Default); + return parallelMatches.Distinct(FreezableEqualityComparer.Default); + } + + private IEnumerable SynthesizeAnalysis(string word, Word analysisWord) + { + foreach (Word synthesisWord in LexicalLookup(analysisWord)) + { + foreach (Word alternative in synthesisWord.ExpandAlternatives()) + { + foreach (Word validWord in _synthesisRule.Apply(alternative).Where(IsWordValid)) + { + if (IsMatch(word, validWord)) + yield return validWord; + } + } + } } -#endif internal IEnumerable SearchRootAllomorphs(Stratum stratum, Shape shape) { @@ -378,7 +391,7 @@ private IEnumerable LexicalGuess(Word input) if (_traceManager.IsTracing) _traceManager.LexicalLookup(input.Stratum, input); CharacterDefinitionTable table = input.Stratum.CharacterDefinitionTable; - IEnumerable shapeNodes = input.Shape.GetNodes(input.Range); + IEnumerable shapeNodes = input.Shape.GetNodes(input.Shape.Range); foreach (RootAllomorph lexicalPattern in _lexicalPatterns) { HashSet shapeSet = new HashSet(); diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorpherPool.cs b/src/SIL.Machine.Morphology.HermitCrab/MorpherPool.cs new file mode 100644 index 000000000..e04bc6054 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/MorpherPool.cs @@ -0,0 +1,42 @@ +using System; +using System.Collections.Concurrent; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// A thread-safe pool of instances for the FST verify / engine-backstop + /// paths. Verification pins the engine's / + /// per candidate, which is mutable instance state — so a single + /// shared cannot be used from multiple threads (the selectors would race). + /// Each parse instead s its own Morpher and s it (selectors + /// reset) when done; the Morpher's own internal parallelism is safe because the rented instance has + /// a single owner for the duration of the call. Morphers are built once (compiling the grammar is + /// expensive) and reused across words. + /// + public sealed class MorpherPool + { + private readonly Func _factory; + private readonly ConcurrentBag _available = new ConcurrentBag(); + + /// Creates a fresh (each must be independent — its + /// own — so pooled instances never share mutable state). + public MorpherPool(Func factory) + { + _factory = factory; + } + + /// Borrow a Morpher with default (unrestricted) selectors. Always pair with . + public Morpher Rent() + { + return _available.TryTake(out Morpher morpher) ? morpher : _factory(); + } + + /// Reset the selectors and return the Morpher to the pool for reuse. + public void Return(Morpher morpher) + { + morpher.LexEntrySelector = _ => true; + morpher.RuleSelector = _ => true; + _available.Add(morpher); + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AffixProcessAllomorph.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AffixProcessAllomorph.cs index cf4200c2a..bb3aad8b4 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AffixProcessAllomorph.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AffixProcessAllomorph.cs @@ -27,7 +27,7 @@ public enum ReduplicationHint public class AffixProcessAllomorph : Allomorph { - private readonly List> _lhs; + private readonly List> _lhs; private readonly List _rhs; private readonly MprFeatureSet _requiredMprFeatures; private readonly MprFeatureSet _excludedMprFeatures; @@ -35,7 +35,7 @@ public class AffixProcessAllomorph : Allomorph public AffixProcessAllomorph() { - _lhs = new List>(); + _lhs = new List>(); _rhs = new List(); _requiredMprFeatures = new MprFeatureSet(); _excludedMprFeatures = new MprFeatureSet(); @@ -45,7 +45,7 @@ public AffixProcessAllomorph() public ReduplicationHint ReduplicationHint { get; set; } - public IList> Lhs + public IList> Lhs { get { return _lhs; } } @@ -80,7 +80,7 @@ protected override bool ConstraintsEqual(Allomorph other) return base.ConstraintsEqual(other) && _requiredMprFeatures.SetEquals(otherAllo._requiredMprFeatures) && _excludedMprFeatures.SetEquals(otherAllo._excludedMprFeatures) - && _lhs.SequenceEqual(otherAllo._lhs, FreezableEqualityComparer>.Default) + && _lhs.SequenceEqual(otherAllo._lhs, FreezableEqualityComparer>.Default) && RequiredSyntacticFeatureStruct.ValueEquals(otherAllo.RequiredSyntacticFeatureStruct); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AffixProcessRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AffixProcessRule.cs index 70a8fbf28..ff3eeb30f 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AffixProcessRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AffixProcessRule.cs @@ -77,12 +77,12 @@ public IList Allomorphs get { return _allomorphs; } } - public override IRule CompileAnalysisRule(Morpher morpher) + public override IRule CompileAnalysisRule(Morpher morpher) { return new AnalysisAffixProcessRule(morpher, this); } - public override IRule CompileSynthesisRule(Morpher morpher) + public override IRule CompileSynthesisRule(Morpher morpher) { return new SynthesisAffixProcessRule(morpher, this); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisAffixProcessAllomorphRuleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisAffixProcessAllomorphRuleSpec.cs index 9ce94c148..64f1993aa 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisAffixProcessAllomorphRuleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisAffixProcessAllomorphRuleSpec.cs @@ -15,7 +15,7 @@ public AnalysisAffixProcessAllomorphRuleSpec(AffixProcessAllomorph allomorph) Pattern.Freeze(); } - public override Word ApplyRhs(PatternRule rule, Match match) + public override Word ApplyRhs(PatternRule rule, Match match) { Word output = match.Input.Clone(); GenerateShape(_allomorph.Lhs, output.Shape, match); diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisAffixProcessRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisAffixProcessRule.cs index 4e89fef97..b9f6d4acc 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisAffixProcessRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisAffixProcessRule.cs @@ -1,29 +1,30 @@ using System.Collections.Generic; using System.Linq; using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; using SIL.Machine.Matching; using SIL.Machine.Rules; namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class AnalysisAffixProcessRule : IRule + public class AnalysisAffixProcessRule : IRule { private readonly Morpher _morpher; private readonly AffixProcessRule _rule; - private readonly List> _rules; + private readonly List> _rules; public AnalysisAffixProcessRule(Morpher morpher, AffixProcessRule rule) { _morpher = morpher; _rule = rule; - _rules = new List>(); + _rules = new List>(); foreach (AffixProcessAllomorph allo in rule.Allomorphs) { _rules.Add( - new MultiplePatternRule( + new MultiplePatternRule( new AnalysisAffixProcessAllomorphRuleSpec(allo), - new MatcherSettings + new MatcherSettings { Filter = ann => ann.Type() == HCFeatureSystem.Segment, MatchingMethod = MatchingMethod.Unification, @@ -55,10 +56,19 @@ public IEnumerable Apply(Word input) bool unapplied = false; foreach (Word outWord in _rules[i].Apply(input).RemoveDuplicates()) { + // Clone-then-reassign, not an in-place mutation: outWord may already be frozen by + // the pattern rule that produced it, and a frozen FeatureStruct must not be + // mutated in place (see Word.FreezeImpl's comment). if (!_rule.RequiredSyntacticFeatureStruct.IsEmpty) - outWord.SyntacticFeatureStruct.Add(_rule.RequiredSyntacticFeatureStruct); + { + FeatureStruct sfs = outWord.SyntacticFeatureStruct.Clone(); + sfs.Add(_rule.RequiredSyntacticFeatureStruct); + outWord.SyntacticFeatureStruct = sfs; + } else if (_rule.OutSyntacticFeatureStruct.IsEmpty) - outWord.SyntacticFeatureStruct.Clear(); + { + outWord.SyntacticFeatureStruct = new FeatureStruct(); + } outWord.MorphologicalRuleUnapplied(_rule); outWord.Freeze(); if (_morpher.TraceManager.IsTracing) diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisCompoundingRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisCompoundingRule.cs index 6dc2a0c28..b5013d4ee 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisCompoundingRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisCompoundingRule.cs @@ -1,29 +1,30 @@ using System.Collections.Generic; using System.Linq; using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; using SIL.Machine.Matching; using SIL.Machine.Rules; namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class AnalysisCompoundingRule : IRule + public class AnalysisCompoundingRule : IRule { private readonly Morpher _morpher; private readonly CompoundingRule _rule; - private readonly List> _rules; + private readonly List> _rules; public AnalysisCompoundingRule(Morpher morpher, CompoundingRule rule) { _morpher = morpher; _rule = rule; - _rules = new List>(); + _rules = new List>(); foreach (CompoundingSubrule sr in rule.Subrules) { _rules.Add( - new MultiplePatternRule( + new MultiplePatternRule( new AnalysisCompoundingSubruleRuleSpec(sr), - new MatcherSettings + new MatcherSettings { Filter = ann => ann.Type() == HCFeatureSystem.Segment, MatchingMethod = MatchingMethod.Unification, @@ -126,10 +127,18 @@ RootAllomorph allo in _morpher.SearchRootAllomorphs(_rule.Stratum, outWord.Curre bool unapplied = false; foreach (Word outWord in srOutput) { + // Clone-then-reassign, not an in-place mutation: outWord may already be frozen (see + // Word.FreezeImpl's comment). if (!_rule.HeadRequiredSyntacticFeatureStruct.IsEmpty) - outWord.SyntacticFeatureStruct.Add(_rule.HeadRequiredSyntacticFeatureStruct); + { + FeatureStruct sfs = outWord.SyntacticFeatureStruct.Clone(); + sfs.Add(_rule.HeadRequiredSyntacticFeatureStruct); + outWord.SyntacticFeatureStruct = sfs; + } else if (_rule.OutSyntacticFeatureStruct.IsEmpty) - outWord.SyntacticFeatureStruct.Clear(); + { + outWord.SyntacticFeatureStruct = new FeatureStruct(); + } outWord.MorphologicalRuleUnapplied(_rule); outWord.Freeze(); diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisCompoundingSubruleRuleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisCompoundingSubruleRuleSpec.cs index 50f6177bb..6172e2cb0 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisCompoundingSubruleRuleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisCompoundingSubruleRuleSpec.cs @@ -17,7 +17,7 @@ public AnalysisCompoundingSubruleRuleSpec(CompoundingSubrule subrule) Pattern.Freeze(); } - public override Word ApplyRhs(PatternRule rule, Match match) + public override Word ApplyRhs(PatternRule rule, Match match) { Word output = match.Input.Clone(); GenerateShape(_subrule.HeadLhs, output.Shape, match); diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisMorphologicalTransform.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisMorphologicalTransform.cs index 93e7a0ef6..832f55e68 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisMorphologicalTransform.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisMorphologicalTransform.cs @@ -9,18 +9,15 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { public class AnalysisMorphologicalTransform { - private readonly Pattern _pattern; + private readonly Pattern _pattern; private readonly Dictionary> _modifyFromInfos; private readonly Dictionary _capturedParts; - public AnalysisMorphologicalTransform( - IEnumerable> lhs, - IList rhs - ) + public AnalysisMorphologicalTransform(IEnumerable> lhs, IList rhs) { - Dictionary> partLookup = lhs.ToDictionary(p => p.Name); + Dictionary> partLookup = lhs.ToDictionary(p => p.Name); _modifyFromInfos = new Dictionary>(); - _pattern = new Pattern(); + _pattern = new Pattern(); _capturedParts = new Dictionary(); foreach (MorphologicalOutputAction outputAction in rhs) { @@ -46,19 +43,19 @@ protected IDictionary CapturedParts get { return _capturedParts; } } - public Pattern Pattern + public Pattern Pattern { get { return _pattern; } } - public void GenerateShape(IList> lhs, Shape shape, Match match) + public void GenerateShape(IList> lhs, Shape shape, Match match) { shape.Clear(); - foreach (Pattern part in lhs) + foreach (Pattern part in lhs) AddPartNodes(part, match, shape); } - private void AddPartNodes(Pattern part, Match match, Shape output) + private void AddPartNodes(Pattern part, Match match, Shape output) { int count; if (_capturedParts.TryGetValue(part.Name, out count)) @@ -83,15 +80,18 @@ private void AddPartNodes(Pattern part, Match private bool AddCapturedPartNodes( string partName, int index, - Match match, + Match match, FeatureStruct modifyFromFS, Shape output ) { - GroupCapture inputGroup = match.GroupCaptures[GetGroupName(partName, index)]; + GroupCapture inputGroup = match.GroupCaptures[GetGroupName(partName, index)]; if (inputGroup.Success) { - Range outputRange = match.Input.Shape.CopyTo(inputGroup.Range, output); + Range outputRange = match.Input.Shape.CopyTo( + match.Input.Shape.ToShapeRange(inputGroup.Range), + output + ); if (modifyFromFS != null) { foreach (ShapeNode node in output.GetNodes(outputRange)) @@ -106,15 +106,15 @@ Shape output } private void Untruncate( - PatternNode patternNode, + PatternNode patternNode, Shape output, bool optional, VariableBindings varBindings ) { - foreach (PatternNode node in patternNode.Children) + foreach (PatternNode node in patternNode.Children) { - if (node is Constraint constraint && constraint.Type() == HCFeatureSystem.Segment) + if (node is Constraint constraint && constraint.Type() == HCFeatureSystem.Segment) { FeatureStruct fs = constraint.FeatureStruct.Clone(); fs.ReplaceVariables(varBindings); @@ -122,7 +122,7 @@ VariableBindings varBindings } else { - if (node is Quantifier quantifier) + if (node is Quantifier quantifier) { for (int i = 0; i < quantifier.MaxOccur; i++) Untruncate(quantifier, output, i >= quantifier.MinOccur, varBindings); diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisMorphologicalTransformRuleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisMorphologicalTransformRuleSpec.cs index a23ae84b1..2ef6d56ca 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisMorphologicalTransformRuleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisMorphologicalTransformRuleSpec.cs @@ -7,10 +7,10 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { public abstract class AnalysisMorphologicalTransformRuleSpec : AnalysisMorphologicalTransform, - IPatternRuleSpec + IPatternRuleSpec { protected AnalysisMorphologicalTransformRuleSpec( - IEnumerable> lhs, + IEnumerable> lhs, IList rhs ) : base(lhs, rhs) { } @@ -20,7 +20,7 @@ public bool IsApplicable(Word input) return true; } - protected bool IsPartCaptured(Match match, string partName) + protected bool IsPartCaptured(Match match, string partName) { int count; if (CapturedParts.TryGetValue(partName, out count)) @@ -34,6 +34,6 @@ protected bool IsPartCaptured(Match match, string partName) return false; } - public abstract Word ApplyRhs(PatternRule rule, Match match); + public abstract Word ApplyRhs(PatternRule rule, Match match); } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisRealizationalAffixProcessRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisRealizationalAffixProcessRule.cs index 749aa7b4d..031c6fbad 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisRealizationalAffixProcessRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/AnalysisRealizationalAffixProcessRule.cs @@ -7,24 +7,24 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class AnalysisRealizationalAffixProcessRule : IRule + public class AnalysisRealizationalAffixProcessRule : IRule { private readonly Morpher _morpher; private readonly RealizationalAffixProcessRule _rule; - private readonly List> _rules; + private readonly List> _rules; public AnalysisRealizationalAffixProcessRule(Morpher morpher, RealizationalAffixProcessRule rule) { _morpher = morpher; _rule = rule; - _rules = new List>(); + _rules = new List>(); foreach (AffixProcessAllomorph allo in rule.Allomorphs) { _rules.Add( - new MultiplePatternRule( + new MultiplePatternRule( new AnalysisAffixProcessAllomorphRuleSpec(allo), - new MatcherSettings + new MatcherSettings { Filter = ann => ann.Type() == HCFeatureSystem.Segment, MatchingMethod = MatchingMethod.Unification, diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/CompoundingRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/CompoundingRule.cs index a5280ba6a..fca007daf 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/CompoundingRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/CompoundingRule.cs @@ -58,12 +58,12 @@ public ICollection ObligatorySyntacticFeatures public Stratum Stratum { get; set; } - public override IRule CompileAnalysisRule(Morpher morpher) + public override IRule CompileAnalysisRule(Morpher morpher) { return new AnalysisCompoundingRule(morpher, this); } - public override IRule CompileSynthesisRule(Morpher morpher) + public override IRule CompileSynthesisRule(Morpher morpher) { return new SynthesisCompoundingRule(morpher, this); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/CompoundingSubrule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/CompoundingSubrule.cs index 7001098d0..6ae61c2dd 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/CompoundingSubrule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/CompoundingSubrule.cs @@ -6,8 +6,8 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { public class CompoundingSubrule { - private readonly List> _headLhs; - private readonly List> _nonHeadLhs; + private readonly List> _headLhs; + private readonly List> _nonHeadLhs; private readonly List _rhs; private readonly MprFeatureSet _requiredMprFeatures; @@ -16,8 +16,8 @@ public class CompoundingSubrule public CompoundingSubrule() { - _headLhs = new List>(); - _nonHeadLhs = new List>(); + _headLhs = new List>(); + _nonHeadLhs = new List>(); _rhs = new List(); _requiredMprFeatures = new MprFeatureSet(); @@ -25,12 +25,12 @@ public CompoundingSubrule() _outMprFeatures = new MprFeatureSet(); } - public IList> HeadLhs + public IList> HeadLhs { get { return _headLhs; } } - public IList> NonHeadLhs + public IList> NonHeadLhs { get { return _nonHeadLhs; } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/CopyFromInput.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/CopyFromInput.cs index a5ffa91e2..1f4a32f3e 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/CopyFromInput.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/CopyFromInput.cs @@ -13,24 +13,22 @@ public CopyFromInput(string partName) : base(partName) { } public override void GenerateAnalysisLhs( - Pattern analysisLhs, - IDictionary> partLookup, + Pattern analysisLhs, + IDictionary> partLookup, IDictionary capturedParts ) { - Pattern pattern = partLookup[PartName]; + Pattern pattern = partLookup[PartName]; int count = capturedParts.GetOrCreate(PartName, () => 0); string groupName = AnalysisMorphologicalTransform.GetGroupName(PartName, count); - analysisLhs.Children.Add( - new Group(groupName, pattern.Children.DeepCloneExceptBoundaries()) - ); + analysisLhs.Children.Add(new Group(groupName, pattern.Children.DeepCloneExceptBoundaries())); capturedParts[PartName]++; } - public override IEnumerable> Apply(Match match, Word output) + public override IEnumerable> Apply(Match match, Word output) { var mappings = new List>(); - GroupCapture inputGroup = match.GroupCaptures[PartName]; + GroupCapture inputGroup = match.GroupCaptures[PartName]; if (inputGroup.Success) { foreach ( diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/InsertSegments.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/InsertSegments.cs index 0ffebcc64..a24167382 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/InsertSegments.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/InsertSegments.cs @@ -27,19 +27,19 @@ public Segments Segments } public override void GenerateAnalysisLhs( - Pattern analysisLhs, - IDictionary> partLookup, + Pattern analysisLhs, + IDictionary> partLookup, IDictionary capturedParts ) { foreach (ShapeNode node in _segments.Shape) { if (node.Annotation.Type() != HCFeatureSystem.Boundary) - analysisLhs.Children.Add(new Constraint(node.Annotation.FeatureStruct)); + analysisLhs.Children.Add(new Constraint(node.Annotation.FeatureStruct)); } } - public override IEnumerable> Apply(Match match, Word output) + public override IEnumerable> Apply(Match match, Word output) { Shape shape = _segments.Shape; var mappings = new List>(); diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/InsertSimpleContext.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/InsertSimpleContext.cs index ab652c765..bf882a4bf 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/InsertSimpleContext.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/InsertSimpleContext.cs @@ -29,15 +29,15 @@ public SimpleContext SimpleContext } public override void GenerateAnalysisLhs( - Pattern analysisLhs, - IDictionary> partLookup, + Pattern analysisLhs, + IDictionary> partLookup, IDictionary capturedParts ) { - analysisLhs.Children.Add(new Constraint(_simpleCtxt.FeatureStruct.Clone())); + analysisLhs.Children.Add(new Constraint(_simpleCtxt.FeatureStruct.Clone())); } - public override IEnumerable> Apply(Match match, Word output) + public override IEnumerable> Apply(Match match, Word output) { FeatureStruct fs = _simpleCtxt.FeatureStruct.Clone(); fs.ReplaceVariables(match.VariableBindings); diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/ModifyFromInput.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/ModifyFromInput.cs index 70cae03ac..a92cb6bd1 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/ModifyFromInput.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/ModifyFromInput.cs @@ -31,19 +31,19 @@ public SimpleContext SimpleContext } public override void GenerateAnalysisLhs( - Pattern analysisLhs, - IDictionary> partLookup, + Pattern analysisLhs, + IDictionary> partLookup, IDictionary capturedParts ) { - Pattern pattern = partLookup[PartName]; + Pattern pattern = partLookup[PartName]; int count = capturedParts.GetOrCreate(PartName, () => 0); string groupName = AnalysisMorphologicalTransform.GetGroupName(PartName, count); - var group = new Group(groupName, pattern.Children.DeepCloneExceptBoundaries()); + var group = new Group(groupName, pattern.Children.DeepCloneExceptBoundaries()); foreach ( - Constraint constraint in group + Constraint constraint in group .GetNodesDepthFirst() - .OfType>() + .OfType>() .Where(c => c.Type() == (FeatureSymbol)_simpleCtxt.FeatureStruct.GetValue(HCFeatureSystem.Type)) ) { @@ -53,10 +53,10 @@ Constraint constraint in group capturedParts[PartName]++; } - public override IEnumerable> Apply(Match match, Word output) + public override IEnumerable> Apply(Match match, Word output) { var mappings = new List>(); - GroupCapture inputGroup = match.GroupCaptures[PartName]; + GroupCapture inputGroup = match.GroupCaptures[PartName]; foreach ( ShapeNode inputNode in GetSkippedOptionalNodes(match.Input.Shape, inputGroup.Range) .Concat(match.Input.Shape.GetNodes(inputGroup.Range)) diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/MorphologicalOutputAction.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/MorphologicalOutputAction.cs index 5595a232f..619625343 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/MorphologicalOutputAction.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/MorphologicalOutputAction.cs @@ -25,8 +25,8 @@ public string PartName } public abstract void GenerateAnalysisLhs( - Pattern analysisLhs, - IDictionary> partLookup, + Pattern analysisLhs, + IDictionary> partLookup, IDictionary capturedParts ); @@ -35,11 +35,12 @@ IDictionary capturedParts /// /// The match. /// The output word synthesis. - public abstract IEnumerable> Apply(Match match, Word output); + public abstract IEnumerable> Apply(Match match, Word output); - protected IEnumerable GetSkippedOptionalNodes(Shape shape, Range range) + // RUSTIFY Stage 2: range is now an int-offset match/group range; its leftmost node is NodeAt(Start). + protected IEnumerable GetSkippedOptionalNodes(Shape shape, Range range) { - ShapeNode node = range.Start.Prev; + ShapeNode node = shape.NodeAt(range.Start).Prev; var skippedNodes = new List(); while (node.Annotation.Optional) { diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/RealizationalAffixProcessRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/RealizationalAffixProcessRule.cs index da20c683e..ebe39caf5 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/RealizationalAffixProcessRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/RealizationalAffixProcessRule.cs @@ -58,12 +58,12 @@ public IList Allomorphs get { return _allomorphs; } } - public override IRule CompileAnalysisRule(Morpher morpher) + public override IRule CompileAnalysisRule(Morpher morpher) { return new AnalysisRealizationalAffixProcessRule(morpher, this); } - public override IRule CompileSynthesisRule(Morpher morpher) + public override IRule CompileSynthesisRule(Morpher morpher) { return new SynthesisRealizationalAffixProcessRule(morpher, this); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisAffixProcessAllomorphRuleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisAffixProcessAllomorphRuleSpec.cs index aab74102d..79a97ee99 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisAffixProcessAllomorphRuleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisAffixProcessAllomorphRuleSpec.cs @@ -8,17 +8,17 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class SynthesisAffixProcessAllomorphRuleSpec : IPatternRuleSpec + public class SynthesisAffixProcessAllomorphRuleSpec : IPatternRuleSpec { private readonly AffixProcessAllomorph _allomorph; - private readonly Pattern _pattern; + private readonly Pattern _pattern; private readonly HashSet _nonAllomorphActions; public SynthesisAffixProcessAllomorphRuleSpec(AffixProcessAllomorph allomorph) { _allomorph = allomorph; - IList> lhs = _allomorph.Lhs; + IList> lhs = _allomorph.Lhs; IList rhs = _allomorph.Rhs; _nonAllomorphActions = new HashSet(); var redupParts = new List>(); @@ -119,12 +119,12 @@ List partActions in rhs.Where(action => } } - _pattern = new Pattern(); - foreach (Pattern part in lhs) - _pattern.Children.Add(new Group(part.Name, part.Children.CloneItems())); + _pattern = new Pattern(); + foreach (Pattern part in lhs) + _pattern.Children.Add(new Group(part.Name, part.Children.CloneItems())); } - public Pattern Pattern + public Pattern Pattern { get { return _pattern; } } @@ -134,7 +134,7 @@ public bool IsApplicable(Word input) return true; } - public Word ApplyRhs(PatternRule rule, Match match) + public Word ApplyRhs(PatternRule rule, Match match) { Word output = match.Input.Clone(); output.Shape.Clear(); diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisAffixProcessRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisAffixProcessRule.cs index f7dc9c0dc..98a3895d0 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisAffixProcessRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisAffixProcessRule.cs @@ -8,24 +8,24 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class SynthesisAffixProcessRule : IRule + public class SynthesisAffixProcessRule : IRule { private readonly Morpher _morpher; private readonly AffixProcessRule _rule; - private readonly List> _rules; + private readonly List> _rules; public SynthesisAffixProcessRule(Morpher morpher, AffixProcessRule rule) { _morpher = morpher; _rule = rule; - _rules = new List>(); + _rules = new List>(); foreach (AffixProcessAllomorph allo in rule.Allomorphs) { var ruleSpec = new SynthesisAffixProcessAllomorphRuleSpec(allo); _rules.Add( - new PatternRule( + new PatternRule( ruleSpec, - new MatcherSettings + new MatcherSettings { Filter = ann => ann.Type().IsOneOf(HCFeatureSystem.Segment, HCFeatureSystem.Boundary) @@ -178,8 +178,13 @@ public IEnumerable Apply(Word input) Word outWord = _rules[i].Apply(input).SingleOrDefault(); if (outWord != null) { - outWord.SyntacticFeatureStruct = syntacticFS; - outWord.SyntacticFeatureStruct.PriorityUnion(_rule.OutSyntacticFeatureStruct); + // Clone before mutating: syntacticFS is shared across every loop iteration + // (computed once, above), so mutating it in place would alias every outWord + // assigned from an earlier iteration. Also protects against outWord already being + // frozen (see Word.FreezeImpl's comment). + FeatureStruct sfs = syntacticFS.Clone(); + sfs.PriorityUnion(_rule.OutSyntacticFeatureStruct); + outWord.SyntacticFeatureStruct = sfs; foreach (Feature obligFeature in _rule.ObligatorySyntacticFeatures) outWord.ObligatorySyntacticFeatures.Add(obligFeature); diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisCompoundingRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisCompoundingRule.cs index a8f16e650..29e3bd5f3 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisCompoundingRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisCompoundingRule.cs @@ -9,30 +9,30 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class SynthesisCompoundingRule : IRule + public class SynthesisCompoundingRule : IRule { private readonly Morpher _morpher; private readonly CompoundingRule _rule; - private readonly List, Matcher>> _subruleMatchers; + private readonly List, Matcher>> _subruleMatchers; public SynthesisCompoundingRule(Morpher morpher, CompoundingRule rule) { _morpher = morpher; _rule = rule; - _subruleMatchers = new List, Matcher>>(); + _subruleMatchers = new List, Matcher>>(); foreach (CompoundingSubrule sr in rule.Subrules) _subruleMatchers.Add(Tuple.Create(BuildMatcher(sr.HeadLhs), BuildMatcher(sr.NonHeadLhs))); } - private Matcher BuildMatcher(IEnumerable> lhs) + private Matcher BuildMatcher(IEnumerable> lhs) { - var pattern = new Pattern(); - foreach (Pattern part in lhs) - pattern.Children.Add(new Group(part.Name, part.Children.CloneItems())); + var pattern = new Pattern(); + foreach (Pattern part in lhs) + pattern.Children.Add(new Group(part.Name, part.Children.CloneItems())); - return new Matcher( + return new Matcher( pattern, - new MatcherSettings + new MatcherSettings { Filter = ann => ann.Type().IsOneOf(HCFeatureSystem.Segment, HCFeatureSystem.Boundary) && !ann.IsDeleted(), @@ -167,10 +167,10 @@ public IEnumerable Apply(Word input) continue; } - Match headMatch = _subruleMatchers[i].Item1.Match(input); + Match headMatch = _subruleMatchers[i].Item1.Match(input); if (headMatch.Success) { - Match nonHeadMatch = _subruleMatchers[i].Item2.Match(input.CurrentNonHead); + Match nonHeadMatch = _subruleMatchers[i].Item2.Match(input.CurrentNonHead); if (nonHeadMatch.Success) { Word outWord = ApplySubrule(_rule.Subrules[i], headMatch, nonHeadMatch); @@ -178,8 +178,13 @@ public IEnumerable Apply(Word input) outWord.MprFeatures.AddOutput(_rule.Subrules[i].OutMprFeatures); outWord.MprFeatures.AddOutput(_rule.OutputProdRestrictionsMprFeatures); - outWord.SyntacticFeatureStruct = syntacticFS; - outWord.SyntacticFeatureStruct.PriorityUnion(_rule.OutSyntacticFeatureStruct); + // Clone before mutating: syntacticFS is shared across every loop iteration + // (computed once, above), so mutating it in place would alias every outWord + // assigned from an earlier iteration. Also protects against outWord already + // being frozen (see Word.FreezeImpl's comment). + FeatureStruct sfs = syntacticFS.Clone(); + sfs.PriorityUnion(_rule.OutSyntacticFeatureStruct); + outWord.SyntacticFeatureStruct = sfs; foreach (Feature feature in _rule.ObligatorySyntacticFeatures) outWord.ObligatorySyntacticFeatures.Add(feature); @@ -226,11 +231,7 @@ public IEnumerable Apply(Word input) return output; } - private Word ApplySubrule( - CompoundingSubrule sr, - Match headMatch, - Match nonHeadMatch - ) + private Word ApplySubrule(CompoundingSubrule sr, Match headMatch, Match nonHeadMatch) { // TODO: unify the variable bindings from the head and non-head matches Word output = headMatch.Input.Clone(); diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisRealizationalAffixProcessRule.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisRealizationalAffixProcessRule.cs index 1ea640909..bd1717f82 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisRealizationalAffixProcessRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphologicalRules/SynthesisRealizationalAffixProcessRule.cs @@ -9,23 +9,23 @@ namespace SIL.Machine.Morphology.HermitCrab.MorphologicalRules { - public class SynthesisRealizationalAffixProcessRule : IRule + public class SynthesisRealizationalAffixProcessRule : IRule { private readonly Morpher _morpher; private readonly RealizationalAffixProcessRule _rule; - private readonly List> _rules; + private readonly List> _rules; public SynthesisRealizationalAffixProcessRule(Morpher morpher, RealizationalAffixProcessRule rule) { _morpher = morpher; _rule = rule; - _rules = new List>(); + _rules = new List>(); foreach (AffixProcessAllomorph allo in rule.Allomorphs) { _rules.Add( - new PatternRule( + new PatternRule( new SynthesisAffixProcessAllomorphRuleSpec(allo), - new MatcherSettings + new MatcherSettings { Filter = ann => ann.Type().IsOneOf(HCFeatureSystem.Segment, HCFeatureSystem.Boundary) @@ -118,8 +118,14 @@ public IEnumerable Apply(Word input) Word outWord = _rules[i].Apply(input).SingleOrDefault(); if (outWord != null) { - outWord.SyntacticFeatureStruct = syntacticFS; - outWord.SyntacticFeatureStruct.PriorityUnion(_rule.RealizationalFeatureStruct); + // Clone before mutating: syntacticFS is shared across every loop iteration (it's + // computed once, above), so mutating it in place here would alias every outWord + // assigned from an earlier iteration to whatever the last PriorityUnion produced. + // Also protects against outWord already being frozen (see Word.FreezeImpl's + // comment). + FeatureStruct sfs = syntacticFS.Clone(); + sfs.PriorityUnion(_rule.RealizationalFeatureStruct); + outWord.SyntacticFeatureStruct = sfs; outWord.MorphologicalRuleApplied(_rule, appliedAllomorphIndices); appliedAllomorphIndices.Add(i); diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisMetathesisRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisMetathesisRule.cs index 8b21c1853..5d160243f 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisMetathesisRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisMetathesisRule.cs @@ -8,7 +8,7 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { - public class AnalysisMetathesisRule : IRule + public class AnalysisMetathesisRule : IRule { private readonly Morpher _morpher; private readonly MetathesisRule _rule; @@ -21,7 +21,7 @@ public AnalysisMetathesisRule(Morpher morpher, MetathesisRule rule) var ruleSpec = new AnalysisMetathesisRuleSpec(rule.Pattern, rule.LeftSwitchName, rule.RightSwitchName); - var settings = new MatcherSettings + var settings = new MatcherSettings { Direction = rule.Direction == Direction.LeftToRight ? Direction.RightToLeft : Direction.LeftToRight, Filter = ann => ann.Type().IsOneOf(HCFeatureSystem.Segment, HCFeatureSystem.Anchor), diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisMetathesisRuleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisMetathesisRuleSpec.cs index 6581ffff4..baa0d1a9f 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisMetathesisRuleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisMetathesisRuleSpec.cs @@ -12,21 +12,19 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { public class AnalysisMetathesisRuleSpec : IPhonologicalPatternRuleSpec, IPhonologicalPatternSubruleSpec { - private readonly Pattern _pattern; + private readonly Pattern _pattern; private readonly string _leftGroupName; private readonly string _rightGroupName; - public AnalysisMetathesisRuleSpec(Pattern pattern, string leftGroupName, string rightGroupName) + public AnalysisMetathesisRuleSpec(Pattern pattern, string leftGroupName, string rightGroupName) { _leftGroupName = leftGroupName; _rightGroupName = rightGroupName; - Group[] groupOrder = pattern.Children.OfType>().ToArray(); - Dictionary> groups = groupOrder.ToDictionary(g => g.Name); - _pattern = new Pattern(); - foreach ( - PatternNode node in pattern.Children.TakeWhile(n => !(n is Group)) - ) + Group[] groupOrder = pattern.Children.OfType>().ToArray(); + Dictionary> groups = groupOrder.ToDictionary(g => g.Name); + _pattern = new Pattern(); + foreach (PatternNode node in pattern.Children.TakeWhile(n => !(n is Group))) { _pattern.Children.Add(node.Clone()); } @@ -35,9 +33,9 @@ PatternNode node in pattern.Children.TakeWhile(n => !(n is Grou AddGroup(groups, rightGroupName); foreach ( - PatternNode node in pattern + PatternNode node in pattern .Children.GetNodes(Direction.RightToLeft) - .TakeWhile(n => !(n is Group)) + .TakeWhile(n => !(n is Group)) .Reverse() ) { @@ -46,41 +44,43 @@ PatternNode node in pattern _pattern.Freeze(); } - private void AddGroup(Dictionary> groups, string name) + private void AddGroup(Dictionary> groups, string name) { - var newGroup = new Group(name); - foreach ( - Constraint constraint in groups[name].Children.Cast>() - ) + var newGroup = new Group(name); + foreach (Constraint constraint in groups[name].Children.Cast>()) { - Constraint newConstraint = constraint.Clone(); + Constraint newConstraint = constraint.Clone(); newConstraint.FeatureStruct.AddValue(HCFeatureSystem.Modified, HCFeatureSystem.Clean); newGroup.Children.Add(newConstraint); } _pattern.Children.Add(newGroup); } - public Pattern Pattern + public Pattern Pattern { get { return _pattern; } } public bool MatchSubrule( PhonologicalPatternRule rule, - Match match, + Match match, out PhonologicalSubruleMatch subruleMatch ) { - subruleMatch = new PhonologicalSubruleMatch(this, match.Range, match.VariableBindings); + subruleMatch = new PhonologicalSubruleMatch( + this, + match.Input.Shape.ToShapeRange(match.Range), + match.VariableBindings + ); return true; } - Matcher IPhonologicalPatternSubruleSpec.LeftEnvironmentMatcher + Matcher IPhonologicalPatternSubruleSpec.LeftEnvironmentMatcher { get { return null; } } - Matcher IPhonologicalPatternSubruleSpec.RightEnvironmentMatcher + Matcher IPhonologicalPatternSubruleSpec.RightEnvironmentMatcher { get { return null; } } @@ -91,24 +91,26 @@ bool IPhonologicalPatternSubruleSpec.IsApplicable(Word input) } void IPhonologicalPatternSubruleSpec.ApplyRhs( - Match targetMatch, + Match targetMatch, Range range, VariableBindings varBindings ) { - ShapeNode start = null, - end = null; - foreach (GroupCapture gc in targetMatch.GroupCaptures) + int? startTag = null, + endTag = null; + foreach (GroupCapture gc in targetMatch.GroupCaptures) { - if (start == null || gc.Range.Start.CompareTo(start) < 0) - start = gc.Range.Start; - if (end == null || gc.Range.End.CompareTo(end) > 0) - end = gc.Range.End; + if (!gc.Success) + continue; + if (startTag == null || gc.Range.Start < startTag) + startTag = gc.Range.Start; + if (endTag == null || gc.Range.End > endTag) + endTag = gc.Range.End; } - Debug.Assert(start != null && end != null); + Debug.Assert(startTag != null && endTag != null); - GroupCapture leftGroup = targetMatch.GroupCaptures[_leftGroupName]; - GroupCapture rightGroup = targetMatch.GroupCaptures[_rightGroupName]; + GroupCapture leftGroup = targetMatch.GroupCaptures[_leftGroupName]; + GroupCapture rightGroup = targetMatch.GroupCaptures[_rightGroupName]; foreach ( Tuple tuple in targetMatch diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisRewriteRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisRewriteRule.cs index 95eacbf73..e691b4c0a 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisRewriteRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisRewriteRule.cs @@ -10,7 +10,7 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { - public class AnalysisRewriteRule : IRule + public class AnalysisRewriteRule : IRule { private enum ReapplyType { @@ -28,7 +28,7 @@ public AnalysisRewriteRule(Morpher morpher, RewriteRule rule) _morpher = morpher; _rule = rule; - var settings = new MatcherSettings + var settings = new MatcherSettings { Direction = rule.Direction == Direction.LeftToRight ? Direction.RightToLeft : Direction.LeftToRight, Filter = ann => ann.Type().IsOneOf(HCFeatureSystem.Segment, HCFeatureSystem.Anchor), @@ -49,11 +49,7 @@ public AnalysisRewriteRule(Morpher morpher, RewriteRule rule) ruleSpec = new FeatureAnalysisRewriteRuleSpec(settings, rule.Lhs, sr); if (_rule.ApplicationMode == RewriteApplicationMode.Simultaneous) { - foreach ( - Constraint constraint in sr.Rhs.Children.Cast< - Constraint - >() - ) + foreach (Constraint constraint in sr.Rhs.Children.Cast>()) { if (constraint.Type() == HCFeatureSystem.Segment) { @@ -106,12 +102,9 @@ Constraint constraint in sr.Rhs.Children.Cast< } } - private static bool IsUnifiable(Constraint constraint, Pattern env) + private static bool IsUnifiable(Constraint constraint, Pattern env) { - foreach ( - Constraint curConstraint in env.GetNodesDepthFirst() - .OfType>() - ) + foreach (Constraint curConstraint in env.GetNodesDepthFirst().OfType>()) { if ( curConstraint.Type() == HCFeatureSystem.Segment diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisRewriteSubruleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisRewriteSubruleSpec.cs index 540267e6d..fd42491ea 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisRewriteSubruleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/AnalysisRewriteSubruleSpec.cs @@ -7,12 +7,12 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { public class AnalysisRewriteSubruleSpec : RewriteSubruleSpec { - private readonly Action, Range, VariableBindings> _applyAction; + private readonly Action, Range, VariableBindings> _applyAction; public AnalysisRewriteSubruleSpec( - MatcherSettings matcherSettings, + MatcherSettings matcherSettings, RewriteSubrule subrule, - Action, Range, VariableBindings> applyAction + Action, Range, VariableBindings> applyAction ) : base( matcherSettings, @@ -23,16 +23,16 @@ Action, Range, VariableBindings> applyAction _applyAction = applyAction; } - private static Pattern CreateEnvironmentPattern(Pattern env) + private static Pattern CreateEnvironmentPattern(Pattern env) { - Pattern pattern = null; + Pattern pattern = null; if (!env.IsEmpty) - pattern = new Pattern(env.Children.DeepCloneExceptBoundaries()); + pattern = new Pattern(env.Children.DeepCloneExceptBoundaries()); return pattern; } public override void ApplyRhs( - Match targetMatch, + Match targetMatch, Range range, VariableBindings varBindings ) diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/EpenthesisAnalysisRewriteRuleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/EpenthesisAnalysisRewriteRuleSpec.cs index 6c29369b9..225cb3254 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/EpenthesisAnalysisRewriteRuleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/EpenthesisAnalysisRewriteRuleSpec.cs @@ -9,15 +9,15 @@ public class EpenthesisAnalysisRewriteRuleSpec : RewriteRuleSpec { private readonly int _targetCount; - public EpenthesisAnalysisRewriteRuleSpec(MatcherSettings matcherSettings, RewriteSubrule subrule) + public EpenthesisAnalysisRewriteRuleSpec(MatcherSettings matcherSettings, RewriteSubrule subrule) : base(false) { Pattern.Acceptable = IsUnapplicationNonvacuous; _targetCount = subrule.Rhs.Children.Count; - foreach (Constraint constraint in subrule.Rhs.Children.Cast>()) + foreach (Constraint constraint in subrule.Rhs.Children.Cast>()) { - Constraint newConstraint = constraint.Clone(); + Constraint newConstraint = constraint.Clone(); newConstraint.FeatureStruct.AddValue(HCFeatureSystem.Modified, HCFeatureSystem.Clean); Pattern.Children.Add(newConstraint); } @@ -26,7 +26,7 @@ public EpenthesisAnalysisRewriteRuleSpec(MatcherSettings matcherSetti SubruleSpecs.Add(new AnalysisRewriteSubruleSpec(matcherSettings, subrule, Unapply)); } - private static bool IsUnapplicationNonvacuous(Match match) + private static bool IsUnapplicationNonvacuous(Match match) { foreach (ShapeNode node in match.Input.Shape.GetNodes(match.Range)) { @@ -37,7 +37,7 @@ private static bool IsUnapplicationNonvacuous(Match match) return false; } - private void Unapply(Match targetMatch, Range range, VariableBindings varBindings) + private void Unapply(Match targetMatch, Range range, VariableBindings varBindings) { ShapeNode curNode = range.Start; for (int i = 0; i < _targetCount; i++) diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/EpenthesisSynthesisRewriteSubruleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/EpenthesisSynthesisRewriteSubruleSpec.cs index c184bd4dc..6c1f0bdc6 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/EpenthesisSynthesisRewriteSubruleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/EpenthesisSynthesisRewriteSubruleSpec.cs @@ -7,10 +7,10 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { public class EpenthesisSynthesisRewriteSubruleSpec : SynthesisRewriteSubruleSpec { - private readonly Pattern _rhs; + private readonly Pattern _rhs; public EpenthesisSynthesisRewriteSubruleSpec( - MatcherSettings matcherSettings, + MatcherSettings matcherSettings, bool isIterative, RewriteSubrule subrule, int index @@ -21,17 +21,17 @@ int index } public override void ApplyRhs( - Match targetMatch, + Match targetMatch, Range range, VariableBindings varBindings ) { ShapeNode curNode = range.Start; - foreach (PatternNode node in _rhs.Children.GetNodes(targetMatch.Matcher.Direction)) + foreach (PatternNode node in _rhs.Children.GetNodes(targetMatch.Matcher.Direction)) { if (targetMatch.Input.Shape.Count == 256) throw new InfiniteLoopException("An epenthesis rewrite rule is stuck in an infinite loop."); - var constraint = (Constraint)node; + var constraint = (Constraint)node; FeatureStruct fs = constraint.FeatureStruct.Clone(); if (varBindings != null) fs.ReplaceVariables(varBindings); diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/FeatureAnalysisRewriteRuleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/FeatureAnalysisRewriteRuleSpec.cs index e98d95ec6..75c66c0e8 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/FeatureAnalysisRewriteRuleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/FeatureAnalysisRewriteRuleSpec.cs @@ -10,19 +10,19 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { public class FeatureAnalysisRewriteRuleSpec : RewriteRuleSpec { - private readonly Pattern _analysisRhs; + private readonly Pattern _analysisRhs; public FeatureAnalysisRewriteRuleSpec( - MatcherSettings matcherSettings, - Pattern lhs, + MatcherSettings matcherSettings, + Pattern lhs, RewriteSubrule subrule ) : base(false) { var rhsAntiFSs = new List(); foreach ( - Constraint constraint in subrule - .Rhs.Children.OfType>() + Constraint constraint in subrule + .Rhs.Children.OfType>() .Where(c => c.Type() == HCFeatureSystem.Segment) ) { @@ -31,28 +31,26 @@ Constraint constraint in subrule Pattern.Acceptable = match => IsUnapplicationNonvacuous(match, rhsAntiFSs); - _analysisRhs = new Pattern(); + _analysisRhs = new Pattern(); int i = 0; foreach ( - Tuple, PatternNode> tuple in lhs.Children.Zip( - subrule.Rhs.Children - ) + Tuple, PatternNode> tuple in lhs.Children.Zip(subrule.Rhs.Children) ) { - var lhsConstraint = (Constraint)tuple.Item1; - var rhsConstraint = (Constraint)tuple.Item2; + var lhsConstraint = (Constraint)tuple.Item1; + var rhsConstraint = (Constraint)tuple.Item2; if (lhsConstraint.Type() == HCFeatureSystem.Segment && rhsConstraint.Type() == HCFeatureSystem.Segment) { - Constraint targetConstraint = lhsConstraint.Clone(); + Constraint targetConstraint = lhsConstraint.Clone(); targetConstraint.FeatureStruct.PriorityUnion(rhsConstraint.FeatureStruct); targetConstraint.FeatureStruct.AddValue(HCFeatureSystem.Modified, HCFeatureSystem.Clean); - Pattern.Children.Add(new Group("target" + i) { Children = { targetConstraint } }); + Pattern.Children.Add(new Group("target" + i) { Children = { targetConstraint } }); FeatureStruct fs = rhsConstraint.FeatureStruct.AntiFeatureStruct(); fs.Subtract(lhsConstraint.FeatureStruct.AntiFeatureStruct()); fs.AddValue(HCFeatureSystem.Type, HCFeatureSystem.Segment); - _analysisRhs.Children.Add(new Constraint(fs)); + _analysisRhs.Children.Add(new Constraint(fs)); i++; } @@ -62,12 +60,15 @@ Tuple, PatternNode> tuple in lhs.C SubruleSpecs.Add(new AnalysisRewriteSubruleSpec(matcherSettings, subrule, Unapply)); } - private bool IsUnapplicationNonvacuous(Match match, IEnumerable rhsAntiFSs) + private bool IsUnapplicationNonvacuous(Match match, IEnumerable rhsAntiFSs) { int i = 0; foreach (FeatureStruct fs in rhsAntiFSs) { - ShapeNode node = match.GroupCaptures["target" + i].Range.GetStart(match.Matcher.Direction); + ShapeNode node = match.Input.Shape.GetStartNode( + match.GroupCaptures["target" + i].Range, + match.Matcher.Direction + ); foreach (SymbolicFeature sf in fs.Features.OfType()) { SymbolicFeatureValue sfv = fs.GetValue(sf); @@ -97,14 +98,15 @@ private bool IsUnapplicationNonvacuous(Match match, IEnumerable return false; } - private void Unapply(Match targetMatch, Range range, VariableBindings varBindings) + private void Unapply(Match targetMatch, Range range, VariableBindings varBindings) { int i = 0; - foreach ( - Constraint constraint in _analysisRhs.Children.Cast>() - ) + foreach (Constraint constraint in _analysisRhs.Children.Cast>()) { - ShapeNode node = targetMatch.GroupCaptures["target" + i].Range.GetStart(targetMatch.Matcher.Direction); + ShapeNode node = targetMatch.Input.Shape.GetStartNode( + targetMatch.GroupCaptures["target" + i].Range, + targetMatch.Matcher.Direction + ); FeatureStruct fs = node.Annotation.FeatureStruct.Clone(); fs.PriorityUnion(constraint.FeatureStruct); node.Annotation.FeatureStruct.Union(fs, varBindings); diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/FeatureSynthesisRewriteSubruleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/FeatureSynthesisRewriteSubruleSpec.cs index c961fe6c7..58d5beb55 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/FeatureSynthesisRewriteSubruleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/FeatureSynthesisRewriteSubruleSpec.cs @@ -8,10 +8,10 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { public class FeatureSynthesisRewriteSubruleSpec : SynthesisRewriteSubruleSpec { - private readonly Pattern _rhs; + private readonly Pattern _rhs; public FeatureSynthesisRewriteSubruleSpec( - MatcherSettings matcherSettings, + MatcherSettings matcherSettings, bool isIterative, RewriteSubrule subrule, int index @@ -22,18 +22,18 @@ int index } public override void ApplyRhs( - Match targetMatch, + Match targetMatch, Range range, VariableBindings varBindings ) { foreach ( - Tuple> tuple in targetMatch + Tuple> tuple in targetMatch .Input.Shape.GetNodes(range) .Zip(_rhs.Children) ) { - var constraints = (Constraint)tuple.Item2; + var constraints = (Constraint)tuple.Item2; tuple.Item1.Annotation.FeatureStruct.PriorityUnion(constraints.FeatureStruct, varBindings); if (IsIterative) tuple.Item1.SetDirty(true); diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/IPhonologicalPatternRuleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/IPhonologicalPatternRuleSpec.cs index a26f66f6e..4130df814 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/IPhonologicalPatternRuleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/IPhonologicalPatternRuleSpec.cs @@ -5,10 +5,10 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { public interface IPhonologicalPatternRuleSpec { - Pattern Pattern { get; } + Pattern Pattern { get; } bool MatchSubrule( PhonologicalPatternRule rule, - Match match, + Match match, out PhonologicalSubruleMatch subruleMatch ); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/IPhonologicalPatternSubruleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/IPhonologicalPatternSubruleSpec.cs index 25964c1c2..92484e61e 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/IPhonologicalPatternSubruleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/IPhonologicalPatternSubruleSpec.cs @@ -6,10 +6,10 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { public interface IPhonologicalPatternSubruleSpec { - Matcher LeftEnvironmentMatcher { get; } - Matcher RightEnvironmentMatcher { get; } + Matcher LeftEnvironmentMatcher { get; } + Matcher RightEnvironmentMatcher { get; } bool IsApplicable(Word input); - void ApplyRhs(Match targetMatch, Range range, VariableBindings varBindings); + void ApplyRhs(Match targetMatch, Range range, VariableBindings varBindings); } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/IterativePhonologicalPatternRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/IterativePhonologicalPatternRule.cs index 4046e4319..2471bb2b7 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/IterativePhonologicalPatternRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/IterativePhonologicalPatternRule.cs @@ -10,33 +10,40 @@ public class IterativePhonologicalPatternRule : PhonologicalPatternRule { public IterativePhonologicalPatternRule( IPhonologicalPatternRuleSpec ruleSpec, - MatcherSettings matcherSettings + MatcherSettings matcherSettings ) : base(ruleSpec, matcherSettings) { } public override IEnumerable Apply(Word input) { bool applied = false; - Match targetMatch = Matcher.Match(input); + Match targetMatch = Matcher.Match(input); while (targetMatch.Success) { ShapeNode start; PhonologicalSubruleMatch srMatch; + // RUSTIFY Stage 2: int offsets in targetMatch.Range go stale once ApplyRhs mutates the + // shape (the projection re-densifies), so resolve the directional end/start NODE now — + // a ShapeNode handle survives mutation, exactly as the old ShapeNode match range did. + // Only one of end/start is ever used per iteration, so resolve it inside its own + // branch instead of paying for both NodeAt lookups unconditionally. if (RuleSpec.MatchSubrule(this, targetMatch, out srMatch)) { + ShapeNode matchEndNode = input.Shape.GetEndNode(targetMatch.Range, Matcher.Direction); srMatch.SubruleSpec.ApplyRhs(targetMatch, srMatch.Range, srMatch.VariableBindings); applied = true; - start = targetMatch.Range.GetEnd(Matcher.Direction).GetNext(Matcher.Direction); + start = matchEndNode.GetNext(Matcher.Direction); } else { - start = targetMatch.Range.GetStart(Matcher.Direction).GetNext(Matcher.Direction); + ShapeNode matchStartNode = input.Shape.GetStartNode(targetMatch.Range, Matcher.Direction); + start = matchStartNode.GetNext(Matcher.Direction); } if (start == null) break; - targetMatch = Matcher.Match(input, start); + targetMatch = Matcher.Match(input, input.Shape.MatchStartOffset(start, Matcher.Direction)); } if (applied) diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/MetathesisRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/MetathesisRule.cs index 0fd0693c7..b926a69c9 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/MetathesisRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/MetathesisRule.cs @@ -13,23 +13,23 @@ public class MetathesisRule : HCRuleBase, IPhonologicalRule { public MetathesisRule() { - Pattern = Pattern.New().Value; + Pattern = Pattern.New().Value; } public Direction Direction { get; set; } - public Pattern Pattern { get; set; } + public Pattern Pattern { get; set; } public string LeftSwitchName { get; set; } public string RightSwitchName { get; set; } - public override IRule CompileAnalysisRule(Morpher morpher) + public override IRule CompileAnalysisRule(Morpher morpher) { return new AnalysisMetathesisRule(morpher, this); } - public override IRule CompileSynthesisRule(Morpher morpher) + public override IRule CompileSynthesisRule(Morpher morpher) { return new SynthesisMetathesisRule(morpher, this); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/NarrowAnalysisRewriteRuleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/NarrowAnalysisRewriteRuleSpec.cs index 469f3020a..df0c7aede 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/NarrowAnalysisRewriteRuleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/NarrowAnalysisRewriteRuleSpec.cs @@ -8,12 +8,12 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { public class NarrowAnalysisRewriteRuleSpec : RewriteRuleSpec { - private readonly Pattern _analysisRhs; + private readonly Pattern _analysisRhs; private readonly int _targetCount; public NarrowAnalysisRewriteRuleSpec( - MatcherSettings matcherSettings, - Pattern lhs, + MatcherSettings matcherSettings, + Pattern lhs, RewriteSubrule subrule ) : base(subrule.Rhs.IsEmpty) @@ -24,7 +24,7 @@ RewriteSubrule subrule if (subrule.Rhs.IsEmpty) { Pattern.Children.Add( - new Constraint( + new Constraint( FeatureStruct.New().Symbol(HCFeatureSystem.Segment, HCFeatureSystem.Anchor).Value ) ); @@ -38,12 +38,10 @@ RewriteSubrule subrule SubruleSpecs.Add(new AnalysisRewriteSubruleSpec(matcherSettings, subrule, Unapply)); } - private void Unapply(Match targetMatch, Range range, VariableBindings varBindings) + private void Unapply(Match targetMatch, Range range, VariableBindings varBindings) { ShapeNode curNode = IsTargetEmpty ? range.Start : range.End; - foreach ( - Constraint constraint in _analysisRhs.Children.Cast>() - ) + foreach (Constraint constraint in _analysisRhs.Children.Cast>()) { FeatureStruct fs = constraint.FeatureStruct.Clone(); if (varBindings != null) diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/NarrowSynthesisRewriteSubruleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/NarrowSynthesisRewriteSubruleSpec.cs index 99957f357..c6fd2d182 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/NarrowSynthesisRewriteSubruleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/NarrowSynthesisRewriteSubruleSpec.cs @@ -7,11 +7,11 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { public class NarrowSynthesisRewriteSubruleSpec : SynthesisRewriteSubruleSpec { - private readonly Pattern _rhs; + private readonly Pattern _rhs; private readonly int _targetCount; public NarrowSynthesisRewriteSubruleSpec( - MatcherSettings matcherSettings, + MatcherSettings matcherSettings, bool isIterative, int targetCount, RewriteSubrule subrule, @@ -24,15 +24,15 @@ int index } public override void ApplyRhs( - Match targetMatch, + Match targetMatch, Range range, VariableBindings varBindings ) { ShapeNode curNode = range.End; - foreach (PatternNode node in _rhs.Children) + foreach (PatternNode node in _rhs.Children) { - var constraint = (Constraint)node; + var constraint = (Constraint)node; FeatureStruct fs = constraint.FeatureStruct.Clone(); if (varBindings != null) fs.ReplaceVariables(varBindings); diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/PhonologicalPatternRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/PhonologicalPatternRule.cs index b55bf7d2a..ca16bb736 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/PhonologicalPatternRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/PhonologicalPatternRule.cs @@ -5,21 +5,18 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { - public abstract class PhonologicalPatternRule : IRule + public abstract class PhonologicalPatternRule : IRule { private readonly IPhonologicalPatternRuleSpec _ruleSpec; - private readonly Matcher _matcher; + private readonly Matcher _matcher; - protected PhonologicalPatternRule( - IPhonologicalPatternRuleSpec ruleSpec, - MatcherSettings matcherSettings - ) + protected PhonologicalPatternRule(IPhonologicalPatternRuleSpec ruleSpec, MatcherSettings matcherSettings) { _ruleSpec = ruleSpec; - _matcher = new Matcher(_ruleSpec.Pattern, matcherSettings); + _matcher = new Matcher(_ruleSpec.Pattern, matcherSettings); } - public Matcher Matcher + public Matcher Matcher { get { return _matcher; } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteRule.cs index 30aa423c6..1972fdb71 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteRule.cs @@ -18,11 +18,11 @@ public class RewriteRule : HCRuleBase, IPhonologicalRule public RewriteRule() { - Lhs = Pattern.New().Value; + Lhs = Pattern.New().Value; _subrules = new List(); } - public Pattern Lhs { get; set; } + public Pattern Lhs { get; set; } public IList Subrules { @@ -33,12 +33,12 @@ public IList Subrules public RewriteApplicationMode ApplicationMode { get; set; } - public override IRule CompileAnalysisRule(Morpher morpher) + public override IRule CompileAnalysisRule(Morpher morpher) { return new AnalysisRewriteRule(morpher, this); } - public override IRule CompileSynthesisRule(Morpher morpher) + public override IRule CompileSynthesisRule(Morpher morpher) { return new SynthesisRewriteRule(morpher, this); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteRuleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteRuleSpec.cs index f91899d61..2e60d8f4f 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteRuleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteRuleSpec.cs @@ -8,18 +8,18 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { public abstract class RewriteRuleSpec : IPhonologicalPatternRuleSpec { - private readonly Pattern _pattern; + private readonly Pattern _pattern; private readonly List _subruleSpecs; private readonly bool _isTargetEmpty; protected RewriteRuleSpec(bool isTargetEmpty) { - _pattern = new Pattern(); + _pattern = new Pattern(); _subruleSpecs = new List(); _isTargetEmpty = isTargetEmpty; } - public Pattern Pattern + public Pattern Pattern { get { return _pattern; } } @@ -36,10 +36,16 @@ protected bool IsTargetEmpty public bool MatchSubrule( PhonologicalPatternRule rule, - Match match, + Match match, out PhonologicalSubruleMatch subruleMatch ) { + // RUSTIFY Stage 2: match.Range is now Range ([leftmostTag, rightmostTag+1)); resolve its + // bracketing nodes via the shape once (match.Input/match.Range are invariant across the + // subrules tried below), then navigate the segment graph as before. + Shape shape = match.Input.Shape; + ShapeNode rangeStart = shape.NodeAt(match.Range.Start); + ShapeNode rangeEnd = shape.NodeAt(match.Range.End - 1); foreach (RewriteSubruleSpec subruleSpec in _subruleSpecs) { if (!subruleSpec.IsApplicable(match.Input)) @@ -53,13 +59,13 @@ out PhonologicalSubruleMatch subruleMatch { if (match.Matcher.Direction == Direction.LeftToRight) { - leftNode = match.Range.Start; - rightNode = match.Range.End.Next; + leftNode = rangeStart; + rightNode = rangeEnd.Next; } else { - leftNode = match.Range.Start.Prev; - rightNode = match.Range.End; + leftNode = rangeStart.Prev; + rightNode = rangeEnd; } startNode = leftNode; @@ -67,10 +73,10 @@ out PhonologicalSubruleMatch subruleMatch } else { - leftNode = match.Range.Start.Prev; - rightNode = match.Range.End.Next; - startNode = match.Range.Start; - endNode = match.Range.End; + leftNode = rangeStart.Prev; + rightNode = rangeEnd.Next; + startNode = rangeStart; + endNode = rangeEnd; } if (leftNode == null || rightNode == null) @@ -80,9 +86,10 @@ out PhonologicalSubruleMatch subruleMatch } VariableBindings varBindings = match.VariableBindings; - Match leftEnvMatch = subruleSpec.LeftEnvironmentMatcher?.Match( + // left environment is matched right-to-left (see RewriteSubruleSpec) + Match leftEnvMatch = subruleSpec.LeftEnvironmentMatcher?.Match( match.Input, - leftNode, + shape.MatchStartOffset(leftNode, Direction.RightToLeft), varBindings ); if (leftEnvMatch == null || leftEnvMatch.Success) @@ -90,9 +97,10 @@ out PhonologicalSubruleMatch subruleMatch if (leftEnvMatch != null && leftEnvMatch.VariableBindings != null) varBindings = leftEnvMatch.VariableBindings; - Match rightEnvMatch = subruleSpec.RightEnvironmentMatcher?.Match( + // right environment is matched left-to-right (see RewriteSubruleSpec) + Match rightEnvMatch = subruleSpec.RightEnvironmentMatcher?.Match( match.Input, - rightNode, + shape.MatchStartOffset(rightNode, Direction.LeftToRight), varBindings ); if (rightEnvMatch == null || rightEnvMatch.Success) diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteSubrule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteSubrule.cs index 66331295d..9db8ab38c 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteSubrule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteSubrule.cs @@ -11,17 +11,17 @@ public class RewriteSubrule public RewriteSubrule() { - Rhs = Pattern.New().Value; - LeftEnvironment = Pattern.New().Value; - RightEnvironment = Pattern.New().Value; + Rhs = Pattern.New().Value; + LeftEnvironment = Pattern.New().Value; + RightEnvironment = Pattern.New().Value; RequiredSyntacticFeatureStruct = FeatureStruct.New().Value; _requiredMprFeatures = new MprFeatureSet(); _excludedMprFeatures = new MprFeatureSet(); } - public Pattern Rhs { get; set; } - public Pattern LeftEnvironment { get; set; } - public Pattern RightEnvironment { get; set; } + public Pattern Rhs { get; set; } + public Pattern LeftEnvironment { get; set; } + public Pattern RightEnvironment { get; set; } public FeatureStruct RequiredSyntacticFeatureStruct { get; set; } public MprFeatureSet RequiredMprFeatures { diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteSubruleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteSubruleSpec.cs index a8219cc83..1fa74dbc9 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteSubruleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/RewriteSubruleSpec.cs @@ -7,38 +7,38 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { public abstract class RewriteSubruleSpec : IPhonologicalPatternSubruleSpec { - private readonly Matcher _leftEnvMatcher; - private readonly Matcher _rightEnvMatcher; + private readonly Matcher _leftEnvMatcher; + private readonly Matcher _rightEnvMatcher; protected RewriteSubruleSpec( - MatcherSettings matcherSettings, - Pattern leftEnv, - Pattern rightEnv + MatcherSettings matcherSettings, + Pattern leftEnv, + Pattern rightEnv ) { if (leftEnv != null && !leftEnv.IsEmpty) { - MatcherSettings leftEnvMatcherSettings = matcherSettings.Clone(); + MatcherSettings leftEnvMatcherSettings = matcherSettings.Clone(); leftEnvMatcherSettings.Direction = Direction.RightToLeft; leftEnvMatcherSettings.AnchoredToStart = true; - _leftEnvMatcher = new Matcher(leftEnv, leftEnvMatcherSettings); + _leftEnvMatcher = new Matcher(leftEnv, leftEnvMatcherSettings); } if (rightEnv != null && !rightEnv.IsEmpty) { - MatcherSettings rightEnvMatcherSettings = matcherSettings.Clone(); + MatcherSettings rightEnvMatcherSettings = matcherSettings.Clone(); rightEnvMatcherSettings.Direction = Direction.LeftToRight; rightEnvMatcherSettings.AnchoredToStart = true; - _rightEnvMatcher = new Matcher(rightEnv, rightEnvMatcherSettings); + _rightEnvMatcher = new Matcher(rightEnv, rightEnvMatcherSettings); } } - public Matcher LeftEnvironmentMatcher + public Matcher LeftEnvironmentMatcher { get { return _leftEnvMatcher; } } - public Matcher RightEnvironmentMatcher + public Matcher RightEnvironmentMatcher { get { return _rightEnvMatcher; } } @@ -49,7 +49,7 @@ public virtual bool IsApplicable(Word input) } public abstract void ApplyRhs( - Match targetMatch, + Match targetMatch, Range range, VariableBindings varBindings ); diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SimultaneousPhonologicalPatternRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SimultaneousPhonologicalPatternRule.cs index e95ab5690..3c965fe56 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SimultaneousPhonologicalPatternRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SimultaneousPhonologicalPatternRule.cs @@ -12,7 +12,7 @@ public class SimultaneousPhonologicalPatternRule : PhonologicalPatternRule public SimultaneousPhonologicalPatternRule( IPhonologicalPatternRuleSpec ruleSpec, - MatcherSettings matcherSettings + MatcherSettings matcherSettings ) : base(ruleSpec, matcherSettings) { @@ -21,15 +21,15 @@ MatcherSettings matcherSettings public override IEnumerable Apply(Word input) { - var matches = new List, PhonologicalSubruleMatch>>(); - foreach (Match targetMatch in Matcher.AllMatches(input)) + var matches = new List, PhonologicalSubruleMatch>>(); + foreach (Match targetMatch in Matcher.AllMatches(input)) { PhonologicalSubruleMatch srMatch; if (_ruleSpec.MatchSubrule(this, targetMatch, out srMatch)) matches.Add(Tuple.Create(targetMatch, srMatch)); } - foreach (Tuple, PhonologicalSubruleMatch> match in matches) + foreach (Tuple, PhonologicalSubruleMatch> match in matches) match.Item2.SubruleSpec.ApplyRhs(match.Item1, match.Item2.Range, match.Item2.VariableBindings); return input.ToEnumerable(); diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisMetathesisRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisMetathesisRule.cs index 2ea546df2..2d8c3af5a 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisMetathesisRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisMetathesisRule.cs @@ -7,7 +7,7 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { - public class SynthesisMetathesisRule : IRule + public class SynthesisMetathesisRule : IRule { private readonly Morpher _morpher; private readonly MetathesisRule _rule; @@ -20,7 +20,7 @@ public SynthesisMetathesisRule(Morpher morpher, MetathesisRule rule) var ruleSpec = new SynthesisMetathesisRuleSpec(rule.Pattern, rule.LeftSwitchName, rule.RightSwitchName); - var settings = new MatcherSettings + var settings = new MatcherSettings { Direction = rule.Direction, Filter = ann => diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisMetathesisRuleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisMetathesisRuleSpec.cs index 312e0a677..01e1a27f2 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisMetathesisRuleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisMetathesisRuleSpec.cs @@ -8,30 +8,24 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { public class SynthesisMetathesisRuleSpec : IPhonologicalPatternRuleSpec, IPhonologicalPatternSubruleSpec { - private readonly Pattern _pattern; + private readonly Pattern _pattern; private readonly string _leftGroupName; private readonly string _rightGroupName; - public SynthesisMetathesisRuleSpec( - Pattern pattern, - string leftGroupName, - string rightGroupName - ) + public SynthesisMetathesisRuleSpec(Pattern pattern, string leftGroupName, string rightGroupName) { _leftGroupName = leftGroupName; _rightGroupName = rightGroupName; - _pattern = new Pattern(); - foreach (PatternNode node in pattern.Children) + _pattern = new Pattern(); + foreach (PatternNode node in pattern.Children) { - if (node is Group group) + if (node is Group group) { - var newGroup = new Group(group.Name); - foreach ( - Constraint constraint in group.Children.Cast>() - ) + var newGroup = new Group(group.Name); + foreach (Constraint constraint in group.Children.Cast>()) { - Constraint newConstraint = constraint.Clone(); + Constraint newConstraint = constraint.Clone(); newConstraint.FeatureStruct.AddValue(HCFeatureSystem.Modified, HCFeatureSystem.Clean); newGroup.Children.Add(newConstraint); } @@ -45,27 +39,31 @@ Constraint constraint in group.Children.Cast Pattern + public Pattern Pattern { get { return _pattern; } } public bool MatchSubrule( PhonologicalPatternRule rule, - Match match, + Match match, out PhonologicalSubruleMatch subruleMatch ) { - subruleMatch = new PhonologicalSubruleMatch(this, match.Range, match.VariableBindings); + subruleMatch = new PhonologicalSubruleMatch( + this, + match.Input.Shape.ToShapeRange(match.Range), + match.VariableBindings + ); return true; } - Matcher IPhonologicalPatternSubruleSpec.LeftEnvironmentMatcher + Matcher IPhonologicalPatternSubruleSpec.LeftEnvironmentMatcher { get { return null; } } - Matcher IPhonologicalPatternSubruleSpec.RightEnvironmentMatcher + Matcher IPhonologicalPatternSubruleSpec.RightEnvironmentMatcher { get { return null; } } @@ -75,18 +73,35 @@ bool IPhonologicalPatternSubruleSpec.IsApplicable(Word input) return true; } - public void ApplyRhs(Match targetMatch, Range range, VariableBindings varBindings) + public void ApplyRhs(Match targetMatch, Range range, VariableBindings varBindings) { - ShapeNode start = null, - end = null; - foreach (GroupCapture gc in targetMatch.GroupCaptures) + // RUSTIFY Stage 2: group captures are int offsets that go stale on the first structural + // mutation (morph.Remove / MoveNodesAfter re-densify the projection), so resolve EVERYTHING + // to ShapeNode refs up front — those survive the moves, as the old ShapeNode ranges did. + Shape shape = targetMatch.Input.Shape; + int? startTag = null, + endTag = null; + foreach (GroupCapture gc in targetMatch.GroupCaptures) { - if (start == null || gc.Range.Start.CompareTo(start) < 0) - start = gc.Range.Start; - if (end == null || gc.Range.End.CompareTo(end) > 0) - end = gc.Range.End; + if (!gc.Success) + continue; + if (startTag == null || gc.Range.Start < startTag) + startTag = gc.Range.Start; + if (endTag == null || gc.Range.End > endTag) + endTag = gc.Range.End; } - Debug.Assert(start != null && end != null); + Debug.Assert(startTag != null && endTag != null); + ShapeNode start = shape.NodeAt(startTag.Value); + ShapeNode end = shape.NodeAt(endTag.Value - 1); + + GroupCapture leftGroup = targetMatch.GroupCaptures[_leftGroupName]; + GroupCapture rightGroup = targetMatch.GroupCaptures[_rightGroupName]; + Range leftRange = shape.ToShapeRange(leftGroup.Range); + Range rightRange = shape.ToShapeRange(rightGroup.Range); + // Already resolved above via ToShapeRange (leftRange.End == EndNode(leftGroup.Range), + // rightRange.Start == NodeAt(rightGroup.Range.Start)) — reuse instead of a second NodeAt lookup. + ShapeNode leftEnd = leftRange.End; + ShapeNode beforeRightGroup = rightRange.Start.Prev; var morphs = targetMatch .Input.Morphs.Where(ann => ann.Range.Overlaps(start, end)) @@ -95,12 +110,8 @@ public void ApplyRhs(Match targetMatch, Range range, foreach (var morph in morphs) morph.Annotation.Remove(); - GroupCapture leftGroup = targetMatch.GroupCaptures[_leftGroupName]; - GroupCapture rightGroup = targetMatch.GroupCaptures[_rightGroupName]; - - ShapeNode beforeRightGroup = rightGroup.Range.Start.Prev; - MoveNodesAfter(targetMatch.Input.Shape, leftGroup.Range.End, rightGroup.Range); - MoveNodesAfter(targetMatch.Input.Shape, beforeRightGroup, leftGroup.Range); + MoveNodesAfter(shape, leftEnd, rightRange); + MoveNodesAfter(shape, beforeRightGroup, leftRange); foreach (var morph in morphs) { @@ -110,7 +121,7 @@ public void ApplyRhs(Match targetMatch, Range range, morph.Annotation.FeatureStruct ); newMorphAnn.Children.AddRange(morph.Children); - targetMatch.Input.Annotations.Add(newMorphAnn, false); + shape.Annotations.Add(newMorphAnn, false); } } diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteRule.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteRule.cs index 1dc7e3ca5..ecf84a7dc 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteRule.cs @@ -8,7 +8,7 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules { - public class SynthesisRewriteRule : IRule + public class SynthesisRewriteRule : IRule { private readonly Morpher _morpher; private readonly RewriteRule _rule; @@ -19,7 +19,7 @@ public SynthesisRewriteRule(Morpher morpher, RewriteRule rule) _morpher = morpher; _rule = rule; - var settings = new MatcherSettings + var settings = new MatcherSettings { Direction = rule.Direction, Filter = ann => diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteRuleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteRuleSpec.cs index 87fa10955..bab19a4e8 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteRuleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteRuleSpec.cs @@ -11,9 +11,9 @@ namespace SIL.Machine.Morphology.HermitCrab.PhonologicalRules public class SynthesisRewriteRuleSpec : RewriteRuleSpec { public SynthesisRewriteRuleSpec( - MatcherSettings matcherSettings, + MatcherSettings matcherSettings, bool isIterative, - Pattern lhs, + Pattern lhs, IEnumerable subrules ) : base(lhs.IsEmpty) @@ -23,14 +23,14 @@ IEnumerable subrules if (lhs.IsEmpty) { Pattern.Children.Add( - new Constraint( + new Constraint( FeatureStruct.New().Symbol(HCFeatureSystem.Segment, HCFeatureSystem.Anchor).Value ) ); } else { - foreach (Constraint constraint in lhs.Children.Cast>()) + foreach (Constraint constraint in lhs.Children.Cast>()) { var newConstraint = constraint.Clone(); if (isIterative) @@ -82,15 +82,15 @@ IEnumerable subrules } } - private static bool CheckTarget(Match match, Pattern lhs) + private static bool CheckTarget(Match match, Pattern lhs) { foreach ( - Tuple> tuple in match + Tuple> tuple in match .Input.Shape.GetNodes(match.Range) .Zip(lhs.Children) ) { - var constraints = (Constraint)tuple.Item2; + var constraints = (Constraint)tuple.Item2; if (tuple.Item1.Annotation.Type() != constraints.Type()) return false; } diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteSubruleSpec.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteSubruleSpec.cs index a604aaba9..2c93bccd1 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteSubruleSpec.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologicalRules/SynthesisRewriteSubruleSpec.cs @@ -11,7 +11,7 @@ public abstract class SynthesisRewriteSubruleSpec : RewriteSubruleSpec private readonly bool _isIterative; protected SynthesisRewriteSubruleSpec( - MatcherSettings matcherSettings, + MatcherSettings matcherSettings, bool isIterative, RewriteSubrule subrule, int index diff --git a/src/SIL.Machine.Morphology.HermitCrab/PhonologyRuleCompiler.cs b/src/SIL.Machine.Morphology.HermitCrab/PhonologyRuleCompiler.cs new file mode 100644 index 000000000..6162ee595 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/PhonologyRuleCompiler.cs @@ -0,0 +1,349 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.PhonologicalRules; +using SIL.Machine.Rules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Builds an (surface→underlying) automatically from a grammar's + /// phonological rules — the "Pinv compiler" LEVER_2.md left as the frontier. Consumed by + /// , which walks it in LOCKSTEP against the + /// morphotactic lexicon (a real product-automaton composition, not the boundary-less runtime + /// inversion uses) — the lexicon constrains every + /// restoration, so the boundary-conditioning problem that broke Indonesian meN- does not + /// recur here: the walk only restores an underlying segment where a lexicon arc actually has it. + /// + /// B-direct, per rule. For each supported rule this compiles its own arcs by PROBING that + /// ONE rule's synthesis behavior in isolation (build a tiny underlying string, apply just this + /// rule via its own , observe the surface) — + /// never by probing the grammar's combined multi-rule effect (which LEVER_2.md's cascade + /// experiment showed misreads feeding/bleeding). Each rule's arcs are added as an independent + /// branch from the shared "outside any rule" state 0, so genuinely INTERACTING rules (true + /// feeding/bleeding cascades, e.g. Indonesian's assimilation+deletion) are not composed together + /// — that remains LEVER_2.md's still-open frontier. Non-interacting rules (the common case) work + /// correctly under this v1: each is independently invertible and none needs the others' state. + /// + /// Supported shape (v1): a with a + /// AND a + /// of plain Constraint nodes (either or both may be empty, but a deletion needs at least one + /// non-empty — unconditioned deletion would over-restore everywhere), a single-segment + /// (exactly one , no + /// quantifiers/groups/alternations), a of length 0 (deletion) or + /// 1 (feature-change substitution), and no syntactic-feature/MPR-feature gating. The left and + /// right environments are independent, symmetric chains of identity arcs bracketing the + /// restoration/substitution arc — each environment is probed and applied in isolation, so a rule + /// needing BOTH environments to interact with EACH OTHER (not just with the Lhs) is outside this + /// v1's reach. Everything else (multi-segment Lhs, epenthesis, metathesis, α-variables, gated + /// subrules, true multi-rule feeding/bleeding cascades) is skipped — not silently: + /// reports how many were skipped, and skipping only costs + /// coverage (verify still guards soundness on whatever this DOES propose). + /// + public sealed class PhonologyRuleCompiler + { + private readonly CharacterDefinitionTable _table; + private readonly List _alphabet; + private int _nextState = 1; // state 0 is reserved for the shared "outside any rule" state + private int _unsupportedRuleCount; + + private PhonologyRuleCompiler(CharacterDefinitionTable table) + { + _table = table; + _alphabet = table.Where(cd => cd.Type == HCFeatureSystem.Segment).ToList(); + } + + /// Number of (rule, subrule) pairs skipped because their shape is outside the v1 + /// supported set (see class remarks) — a coverage diagnostic, never a soundness concern. + public int UnsupportedRuleCount => _unsupportedRuleCount; + + /// Compile every stratum's supported phonological rules into one . + /// Returns an all-identity (no-op) transducer if the grammar has no phonological rules or none + /// are in the supported shape — safe: the composed walk then degenerates to matching the + /// underlying-only lexicon directly. + public static (InversePhonology Pinv, int UnsupportedCount) Compile(Language language, Morpher morpher) + { + var compiler = new PhonologyRuleCompiler(language.SurfaceStratum.CharacterDefinitionTable); + var pinv = new InversePhonology { StartState = 0 }; + pinv.SetAccepting(0); + foreach (CharacterDefinition cd in compiler._alphabet) + { + pinv.AddArc(0, cd.FeatureStruct, cd.FeatureStruct, 0); // identity: everything outside a rule + } + foreach (Stratum stratum in language.Strata) + { + foreach (IPhonologicalRule prule in stratum.PhonologicalRules) + { + if (!(prule is RewriteRule rule)) + { + continue; // metathesis and other non-rewrite rule types: not yet supported + } + IRule compiled = rule.CompileSynthesisRule(morpher); + foreach (RewriteSubrule subrule in rule.Subrules) + { + compiler.TryCompileSubrule(pinv, rule, subrule, stratum, compiled, morpher); + } + } + } + return (pinv, compiler._unsupportedRuleCount); + } + + private void TryCompileSubrule( + InversePhonology pinv, + RewriteRule rule, + RewriteSubrule subrule, + Stratum stratum, + IRule compiled, + Morpher morpher + ) + { + if ( + !IsTrivial(subrule.RequiredSyntacticFeatureStruct) + || subrule.RequiredMprFeatures.Count > 0 + || subrule.ExcludedMprFeatures.Count > 0 + || !TryGetConstraints(subrule.LeftEnvironment, out List leftEnv) + || !TryGetConstraints(rule.Lhs, out List lhs) + || lhs.Count != 1 // v1: single-segment Lhs only + || !TryGetConstraints(subrule.Rhs, out List rhs) + || rhs.Count > 1 // v1: deletion (0) or plain substitution (1) only + || !TryGetConstraints(subrule.RightEnvironment, out List rightEnv) + ) + { + _unsupportedRuleCount++; + return; + } + bool isDeletion = rhs.Count == 0; + if (isDeletion && leftEnv.Count == 0 && rightEnv.Count == 0) + { + _unsupportedRuleCount++; // unconditioned deletion would over-restore everywhere + return; + } + + string leftEnvProbe = BuildProbeString(leftEnv); + string rightEnvProbe = BuildProbeString(rightEnv); + if (leftEnvProbe == null || rightEnvProbe == null) + { + _unsupportedRuleCount++; // an environment has a class with no representative segment + return; + } + int targetIndex = leftEnv.Count; + + // The left-environment chain is the same for every candidate below (it depends only on + // the rule's environment, not on which Lhs segment is being probed), so build it once. + int fromState = ChainLeftEnvironment(pinv, leftEnv); + + bool addedAny = false; + foreach (CharacterDefinition candidate in _alphabet) + { + if (!candidate.FeatureStruct.IsUnifiable(lhs[0])) + { + continue; + } + string underlyingRep = candidate.Representations.FirstOrDefault(); + if (underlyingRep == null) + { + continue; + } + string probeString = leftEnvProbe + underlyingRep + rightEnvProbe; + List before = SegmentFeatureStructs(probeString); + if (before == null || before.Count == 0) + { + continue; + } + List after = ApplyRule(compiled, stratum, probeString); + if (after == null) + { + continue; + } + if (isDeletion && after.Count == before.Count - 1) + { + // The candidate segment vanished. (A weaker check than also confirming the + // environment surfaced byte-for-byte unchanged — FeatureStruct.ValueEquals proved + // too strict for segments that round-tripped through rule application without a + // real change, likely picking up incidental instance state. Verify is the actual + // soundness backstop regardless: a wrong restoration here costs a rejected + // candidate, never a wrong answer.) + AddRestorationBranch(pinv, fromState, candidate.FeatureStruct, rightEnv); + addedAny = true; + } + else if ( + !isDeletion + && after.Count == before.Count + && !after[targetIndex].ValueEquals(before[targetIndex]) + ) + { + AddSubstitutionBranch(pinv, fromState, after[targetIndex], candidate.FeatureStruct, rightEnv); + addedAny = true; + } + // else: this candidate is unaffected by the rule in this context — no arc added. + } + if (!addedAny) + { + _unsupportedRuleCount++; // probing found no effect — nothing this rule does is invertible here + } + } + + /// ε-input: restore from + /// (state 0 if this rule has no left environment, or the state reached after matching the + /// left-environment chain), then consume the right-environment segments as identity back to + /// state 0 — mirrors the hand-verified LEVER_2.md deletion spike. With no right context either, + /// the restoration arc goes straight back to state 0. + private void AddRestorationBranch( + InversePhonology pinv, + int fromState, + FeatureStruct underlying, + List rightEnv + ) + { + if (rightEnv.Count == 0) + { + pinv.AddArc(fromState, null, underlying, 0); + return; + } + int state = NewState(); + pinv.AddArc(fromState, null, underlying, state); + ChainRightEnvironment(pinv, state, rightEnv); + } + + /// A real arc, from (state 0, or the state reached after + /// matching the left-environment chain), consuming the surfaced segment and emitting the + /// underlying one, then the right-environment chain back to state 0. With no right context at + /// all, the arc goes straight back to state 0 — no intermediate state needed. + private void AddSubstitutionBranch( + InversePhonology pinv, + int fromState, + FeatureStruct surface, + FeatureStruct underlying, + List rightEnv + ) + { + if (rightEnv.Count == 0) + { + pinv.AddArc(fromState, surface, underlying, 0); + return; + } + int state = NewState(); + pinv.AddArc(fromState, surface, underlying, state); + ChainRightEnvironment(pinv, state, rightEnv); + } + + /// Consume each right-environment segment as an identity transition, ending back at + /// state 0. Callers only invoke this with a non-empty environment (the zero-length case is + /// handled directly by the caller). + private void ChainRightEnvironment(InversePhonology pinv, int from, List rightEnv) + { + int state = from; + for (int i = 0; i < rightEnv.Count; i++) + { + int next = i == rightEnv.Count - 1 ? 0 : NewState(); + pinv.AddArc(state, rightEnv[i], rightEnv[i], next); // environment passes through as identity + state = next; + } + } + + /// Consume each left-environment segment as an identity transition FROM state 0, + /// returning the state reached once the whole environment has matched (state 0 itself if the + /// environment is empty) — the symmetric mirror of , built + /// forward instead of backward. This runs in parallel with state 0's own identity self-loops + /// (this is an NFA: a segment that starts matching a left environment is also still a candidate + /// for every other live path), so no existing arc is disturbed. + private int ChainLeftEnvironment(InversePhonology pinv, List leftEnv) + { + int state = 0; + foreach (FeatureStruct env in leftEnv) + { + int next = NewState(); + pinv.AddArc(state, env, env, next); // environment passes through as identity + state = next; + } + return state; + } + + private int NewState() => _nextState++; + + private static bool IsTrivial(FeatureStruct fs) => fs == null || fs.IsEmpty; + + /// True iff every top-level child of is a plain + /// (no quantifiers/groups/alternations) — the bounded, + /// fixed-length window this compiler supports. Returns the ordered constraint FeatureStructs. + private static bool TryGetConstraints(Pattern pattern, out List constraints) + { + constraints = new List(); + if (pattern == null) + { + return true; + } + foreach (PatternNode child in pattern.Children) + { + if (!(child is Constraint c)) + { + constraints = null; + return false; + } + constraints.Add(c.FeatureStruct); + } + return true; + } + + /// One concrete alphabet representative per environment constraint, concatenated in + /// order (used for both the left- and right-environment side — direction-agnostic; the caller + /// decides where the result is concatenated). Returns null if some constraint has no unifiable + /// alphabet member. + private string BuildProbeString(List envConstraints) + { + var sb = new System.Text.StringBuilder(); + foreach (FeatureStruct fs in envConstraints) + { + CharacterDefinition rep = _alphabet.FirstOrDefault(cd => cd.FeatureStruct.IsUnifiable(fs)); + string str = rep?.Representations.FirstOrDefault(); + if (str == null) + { + return null; + } + sb.Append(str); + } + return sb.ToString(); + } + + private List SegmentFeatureStructs(string str) + { + Shape shape; + try + { + shape = _table.Segment(str); + } + catch (InvalidShapeException) + { + return null; + } + return shape + .Where(n => n.Annotation.Type() == HCFeatureSystem.Segment && !n.IsDeleted()) + .Select(n => n.Annotation.FeatureStruct) + .ToList(); + } + + private List ApplyRule(IRule compiled, Stratum stratum, string underlying) + { + Shape shape; + try + { + shape = _table.Segment(underlying); + } + catch (InvalidShapeException) + { + return null; + } + var word = new Word(stratum, shape); + Word result = compiled.Apply(word).DefaultIfEmpty(word).First(); + // A deletion rule marks a node IsDeleted() rather than physically removing it from the + // Shape (HermitCrabExtensions), so that flag must be filtered here or "deleted" segments + // would still show up in the count and mask the deletion. + return result + .Shape.Where(n => n.Annotation.Type() == HCFeatureSystem.Segment && !n.IsDeleted()) + .Select(n => n.Annotation.FeatureStruct) + .ToList(); + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/ReduplicationProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/ReduplicationProposer.cs new file mode 100644 index 000000000..5d7c61877 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/ReduplicationProposer.cs @@ -0,0 +1,249 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Annotations; +using SIL.Machine.Morphology; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// A candidate generator for reduplication — full copy (surface = base·base) or partial + /// prefix/suffix copy (e.g. Tagalog CV-reduplication: surface = CV·base) — the one provably + /// non-regular construct (Dolatian & Heinz 2020), handled beside the FST rather than + /// inside it (FST_FULL_PLAN.md, Point 3). It does not need to be regular because the + /// gate re-runs HC to confirm every candidate. + /// + /// Mechanism (strip + recurse): for every copy length from 1 up to half the word, check both a + /// PREFIX copy (surface starts with its own next `len` characters repeated — full reduplication is + /// just the `len == word.Length / 2` case of this) and a SUFFIX copy (surface ends with its own + /// preceding `len` characters repeated). For each match, strip the copy and recurse the residual + /// base through the FST proposer — so an inflected reduplicant (e.g. REDUP of an affixed + /// stem) is covered, not just a bare root — then wrap each returned base analysis with the + /// reduplication morpheme (prepended, matching HC's RED root … order). This is a bounded, + /// O(word length²) scan (trivial); "well enough" for the attested cases, and anything it misses + /// simply fails parity and rides the engine — never a wrong answer. + /// + /// A THIRD shape (Phase C/D, FST_FULL_GRAMMAR_PLAN.md): a single-character SEPARATOR between base + /// and copy, where the copy is a TAIL of the base rather than an identical adjacent repeat — e.g. + /// Indonesian's -Cont gives menulis-nulis (`nulis` = the last 5 characters of + /// `menulis`, not a copy of the WHOLE prefixed word). HC's own morph-boundary bookkeeping makes the + /// copy exclude fixed-inserted prefix text like `me` in a way this generator does not try to model + /// underlyingly; empirically the copy is always a genuine surface TAIL, so scanning for that shape + /// directly (any single separator character, any tail length) recovers it — a wrong guess (an + /// unrelated character that happens to precede a coincidental tail match) is pruned by verify like + /// any other candidate here, so trying every position costs time, never soundness. + /// + /// Soundness: a coincidental repeat (a word that merely looks copied at some length but is not + /// reduplicated) is proposed but pruned by verify, because HC's synthesis of base + REDUP (or + /// REDUP + base) will not reproduce it. + /// + /// A FOURTH shape (Phase G1, FST_FULL_GRAMMAR_PLAN.md): a suffix stacked OUTSIDE the reduplication + /// itself, on the copy side only — e.g. Indonesian's mengamat-amati is + /// meng+amat-Contmengamat-amat-i(LOC) → mengamat-amati, + /// so the copy (amati) is the base's tail PLUS a known suffix surface, not a plain tail. When + /// the plain separator-scan match fails, this generator additionally tries peeling each grammar + /// suffix's own surface text off the END of the copy and re-testing the remainder as a tail — a + /// single layer only (this corpus needs no more), and a wrong peel is pruned by verify exactly like + /// every other candidate here. + /// + public class ReduplicationProposer : IConstructProposer + { + private static readonly MorphOp[] _ops = { MorphOp.Reduplication }; + private readonly IMorphologicalAnalyzer _baseProposer; + private readonly List _redupRules; + private readonly List<(string SurfaceText, MorphemicMorphologicalRule Rule)> _suffixSurfaces; + + public ReduplicationProposer(Language language, IMorphologicalAnalyzer baseProposer) + { + _baseProposer = baseProposer; + _redupRules = new List(); + _suffixSurfaces = new List<(string, MorphemicMorphologicalRule)>(); + CharacterDefinitionTable table = language.SurfaceStratum.CharacterDefinitionTable; + foreach (Stratum stratum in language.Strata) + { + foreach (IMorphologicalRule mrule in stratum.MorphologicalRules) + { + if (!(mrule is MorphemicMorphologicalRule rule)) + { + continue; + } + if (IsReduplication(rule)) + { + _redupRules.Add(rule); + continue; + } + foreach (AffixProcessAllomorph allomorph in Allomorphs(rule)) + { + if (MorphTokenCodec.ClassifyOp(allomorph, false) != MorphOp.Suffix) + { + continue; + } + InsertSegments insert = allomorph.Rhs.OfType().FirstOrDefault(); + if (insert == null) + { + continue; + } + // The underlying representation may include boundary characters (e.g. + // Indonesian's "-i" LOC suffix is underlyingly "+i") that never appear on the + // surface — strip them by keeping only Segment-type nodes when rendering. + string surfaceText = RenderSurfaceOnly(table, insert.Segments.Shape); + if (!string.IsNullOrEmpty(surfaceText)) + { + _suffixSurfaces.Add((surfaceText, rule)); + } + } + } + } + } + + private static IEnumerable Allomorphs(MorphemicMorphologicalRule rule) + { + switch (rule) + { + case AffixProcessRule affix: + return affix.Allomorphs; + case RealizationalAffixProcessRule realizational: + return realizational.Allomorphs; + default: + return Enumerable.Empty(); + } + } + + private static string RenderSurfaceOnly(CharacterDefinitionTable table, Shape shape) + { + var sb = new System.Text.StringBuilder(); + foreach (ShapeNode node in shape) + { + if (node.Annotation.Type() != HCFeatureSystem.Segment) + { + continue; + } + string rep = table.GetMatchingStrReps(node).FirstOrDefault(); + if (string.IsNullOrEmpty(rep)) + { + return null; + } + sb.Append(rep); + } + return sb.ToString(); + } + + public IReadOnlyCollection CoveredOps => _ops; + + public IEnumerable AnalyzeWord(string word) + { + if (_redupRules.Count == 0) + { + yield break; + } + int maxCopyLen = word.Length / 2; + for (int len = 1; len <= maxCopyLen; len++) + { + // Prefix copy: the first `len` characters repeat immediately (surface = copy·base, and + // the base itself starts with that same `len`-character prefix). Strip the copy. + if (string.Equals(word.Substring(0, len), word.Substring(len, len), StringComparison.Ordinal)) + { + foreach (WordAnalysis analysis in ProposeForResidual(word.Substring(len))) + { + yield return analysis; + } + } + // Suffix copy: the last `len` characters repeat the `len` characters before them + // (surface = base·copy). Strip the trailing copy. + if ( + string.Equals( + word.Substring(word.Length - len, len), + word.Substring(word.Length - (2 * len), len), + StringComparison.Ordinal + ) + ) + { + foreach (WordAnalysis analysis in ProposeForResidual(word.Substring(0, word.Length - len))) + { + yield return analysis; + } + } + } + // Separator + tail copy: base + one literal character + a TAIL of base (not necessarily the + // whole base) — see class remarks. `sepPos` is the separator's index; everything after it is + // the candidate copy, everything before it is the candidate base. + for (int sepPos = 1; sepPos < word.Length - 1; sepPos++) + { + string before = word.Substring(0, sepPos); + string copy = word.Substring(sepPos + 1); + if (copy.Length == 0) + { + continue; + } + if (before.Length >= copy.Length && before.EndsWith(copy, StringComparison.Ordinal)) + { + foreach (WordAnalysis analysis in ProposeForResidual(before)) + { + yield return analysis; + } + continue; // plain tail matched — no need to also try peeling a suffix off this copy + } + // Fourth shape (Phase G1): the copy didn't match as a plain tail — try peeling a known + // suffix surface off the END of the copy and re-testing the remainder as a tail. + foreach ((string suffixText, MorphemicMorphologicalRule suffixRule) in _suffixSurfaces) + { + if (!copy.EndsWith(suffixText, StringComparison.Ordinal)) + { + continue; + } + string strippedCopy = copy.Substring(0, copy.Length - suffixText.Length); + if ( + strippedCopy.Length > 0 + && before.Length >= strippedCopy.Length + && before.EndsWith(strippedCopy, StringComparison.Ordinal) + ) + { + foreach (WordAnalysis analysis in ProposeForResidual(before, suffixRule)) + { + yield return analysis; + } + } + } + } + } + + private IEnumerable ProposeForResidual( + string residual, + MorphemicMorphologicalRule extraSuffix = null + ) + { + foreach (WordAnalysis baseAnalysis in _baseProposer.AnalyzeWord(residual)) + { + foreach (MorphemicMorphologicalRule redup in _redupRules) + { + // Application order: root (and its affixes), then the reduplication rule, then (if + // present) a suffix stacked OUTSIDE the reduplication (Phase G1) — matching HC's + // WordAnalysis.Morphemes order (root·…·RED·suffix), so the root index is unchanged. + var morphemes = new List(baseAnalysis.Morphemes) { redup }; + if (extraSuffix != null) + { + morphemes.Add(extraSuffix); + } + yield return new WordAnalysis(morphemes, baseAnalysis.RootMorphemeIndex, null); + } + } + } + + private static bool IsReduplication(MorphemicMorphologicalRule rule) + { + if (!(rule is AffixProcessRule affix)) + { + return false; + } + foreach (AffixProcessAllomorph allomorph in affix.Allomorphs) + { + if (MorphTokenCodec.ClassifyOp(allomorph, false) == MorphOp.Reduplication) + { + return true; + } + } + return false; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/Stratum.cs b/src/SIL.Machine.Morphology.HermitCrab/Stratum.cs index f0fc7e197..823e8402b 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/Stratum.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/Stratum.cs @@ -125,12 +125,12 @@ public ICollection Entries /// The morphological rule order. public MorphologicalRuleOrder MorphologicalRuleOrder { get; set; } - public override IRule CompileAnalysisRule(Morpher morpher) + public override IRule CompileAnalysisRule(Morpher morpher) { return new AnalysisStratumRule(morpher, this); } - public override IRule CompileSynthesisRule(Morpher morpher) + public override IRule CompileSynthesisRule(Morpher morpher) { return new SynthesisStratumRule(morpher, this); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/SurfacePhonology.cs b/src/SIL.Machine.Morphology.HermitCrab/SurfacePhonology.cs new file mode 100644 index 000000000..82d75d639 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/SurfacePhonology.cs @@ -0,0 +1,341 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; +using SIL.Machine.Morphology.HermitCrab.PhonologicalRules; +using SIL.Machine.Rules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Forward phonology for the surface-allomorph precompile (FST_FULL_PLAN.md, Point 1). Applies the + /// grammar's synthesis phonological rules to a morpheme's underlying segment string and + /// returns the distinct surface realizations. Reuses HC's own compiled synthesis rules — no + /// reimplemented phonology — exactly the rules runs. + /// + /// Two tiers, both precompiled into the proposer's arcs: + /// + /// C-internal (1a): apply rules to the morpheme in isolation (word-edge context) + /// — catches edge-conditioned and morpheme-internal alternations. + /// C-boundary (1b): apply rules to the morpheme with each single neighbor segment of the + /// surface alphabet on each side, and (when the rule is length-preserving) read back the morpheme's + /// own surface portion — catches an affix whose own surface is conditioned by a neighbor across + /// the seam. Bounded by alphabet size × 2; a length-changing context is skipped (no reliable + /// portion), so it stays a sound superset. + /// + /// What remains — a neighbor's surface changing (e.g. a root devoicing before an affix), and any + /// longer-distance interaction — is covered completely by + /// (Point 4), which un-applies phonology on the assembled surface. So this helper is the cheap + /// fast-path; the composition proposer is the complete backstop. + /// + internal sealed class SurfacePhonology + { + private readonly CharacterDefinitionTable _table; + private readonly Stratum _surfaceStratum; + private readonly List> _strataPrules; + private readonly List _alphabet; + + // Variants(underlying) is a pure function of the fixed strata/alphabet above, but the FST builder + // calls it once per build SITE (per allomorph x slot x template x derivation depth/side) rather + // than once per distinct affix string - the same underlying segment string is re-cascaded many + // times over. Memoize so build cost scales with the affix inventory, not the template structure. + private readonly Dictionary> _variantsCache = + new Dictionary>(); + + // DeletionJunctions(underlying) is likewise a pure function of the fixed strata/alphabet, but + // FstTemplateAnalyzer.BuildDeletionJunctionArcs calls it per prefix-affix allomorph PER + // derivation-layer build PER depth level (Phase H, FST_FULL_GRAMMAR_PLAN.md) - the same handful + // of distinct affix strings get re-probed dozens of times over on a grammar with many templates. + // Memoize for the same reason Variants is memoized. + private readonly Dictionary> _deletionJunctionsCache = + new Dictionary>(); + + // Capability gates (Phase H): computed once from the grammar's own rule shapes, not the + // alphabet, so they're free to check on every call. A grammar with no phonological rules at + // all (e.g. Sena) can never alter a surface form, so Variants degenerates to identity with zero + // probing; one with no deletion-shaped subrule (empty Rhs) can never delete a neighbor, so + // DeletionJunctions' alphabet/alphabet^2 probing - which previously ran to exhaustion finding + // nothing on exactly such a grammar - is skipped entirely. + private readonly bool _anyPhonologicalRules; + private readonly bool _anyDeletionSubrule; + + public SurfacePhonology(Language language, Morpher morpher) + { + _table = language.SurfaceStratum.CharacterDefinitionTable; + _surfaceStratum = language.SurfaceStratum; + _strataPrules = new List>(); + foreach (Stratum stratum in language.Strata) + { + _strataPrules.Add( + new LinearRuleCascade( + stratum.PhonologicalRules.Select(p => p.CompileSynthesisRule(morpher)) + ) + ); + foreach (IPhonologicalRule prule in stratum.PhonologicalRules) + { + _anyPhonologicalRules = true; + if (prule is RewriteRule rewrite) + { + foreach (RewriteSubrule subrule in rewrite.Subrules) + { + if (!subrule.Rhs.Children.Any()) + { + _anyDeletionSubrule = true; + } + } + } + } + } + // The surface alphabet: one representative per segment character definition (the neighbor + // segments used to probe boundary-conditioned alternations). + _alphabet = new List(); + foreach (CharacterDefinition cd in _table) + { + if (cd.Type == HCFeatureSystem.Segment) + { + string rep = cd.Representations.FirstOrDefault(); + if (!string.IsNullOrEmpty(rep)) + { + _alphabet.Add(rep); + } + } + } + } + + /// The distinct surface realizations of — its isolation + /// form (always included, so the 0-phonology path is unchanged) plus each boundary-context + /// realization recovered when the rule is length-preserving. + public IReadOnlyCollection Variants(string underlying) + { + if (_variantsCache.TryGetValue(underlying, out IReadOnlyCollection cached)) + { + return cached; + } + IReadOnlyCollection computed = ComputeVariants(underlying); + _variantsCache[underlying] = computed; + return computed; + } + + private IReadOnlyCollection ComputeVariants(string underlying) + { + if (!_anyPhonologicalRules) + { + return new[] { underlying }; // no rule exists ⇒ identity is exact, not an approximation + } + var result = new HashSet { underlying }; + int underlyingLen = NodeCount(underlying); + if (underlyingLen < 0) + { + return new[] { underlying }; // unsegmentable + } + + // C-internal: the morpheme in isolation. + string isolation = SurfaceOf(underlying); + if (isolation != null) + { + result.Add(isolation); + } + + // C-boundary: the morpheme with one neighbor segment on each side. When the context is + // length-preserving, read back just the morpheme's own surface nodes. + foreach (string c in _alphabet) + { + AddBoundaryVariant(c + underlying, underlyingLen, fromEnd: true, result); // left neighbor + AddBoundaryVariant(underlying + c, underlyingLen, fromEnd: false, result); // right neighbor + } + return result.ToList(); + } + + private void AddBoundaryVariant(string context, int underlyingLen, bool fromEnd, HashSet result) + { + List outNodes = SurfaceNodes(context); + if (outNodes == null || outNodes.Count != underlyingLen + 1) + { + return; // unsegmentable, or a length-changing rule fired ⇒ no reliable morpheme portion + } + // The neighbor is one node; the morpheme is the remaining contiguous nodes. + IEnumerable morphemeNodes = fromEnd + ? outNodes.Skip(1) // left neighbor consumed the first node + : outNodes.Take(underlyingLen); // right neighbor is the last node + string rendered = RenderNodes(morphemeNodes); + if (rendered != null) + { + result.Add(rendered); + } + } + + /// Render nodes to their surface string, OMITTING any IsDeleted() node (HC marks a + /// deletion rather than physically removing the node — see PhonologyRuleCompiler's remarks + /// — so a naive render would still print the pre-deletion segment). This is what lets a + /// deletion-shortened morpheme (e.g. Indonesian's meN- nasal deleting before a sonorant root) show + /// up as a genuinely shorter variant instead of silently reproducing the underlying form. Returns + /// null if some surviving node has no single representation. + private string RenderNodes(IEnumerable nodes) + { + var sb = new System.Text.StringBuilder(); + foreach (ShapeNode node in nodes) + { + if (node.IsDeleted()) + { + continue; + } + string rep = _table.GetMatchingStrReps(node).FirstOrDefault(); + if (string.IsNullOrEmpty(rep)) + { + return null; // an under-specified node has no single representation — skip this context + } + sb.Append(rep); + } + return sb.ToString(); + } + + /// + /// Junction probe (Phase C, FST_FULL_GRAMMAR_PLAN.md): for each alphabet representative as the + /// RIGHT neighbor of , reports every case where the real synthesis + /// cascade deletes that NEIGHBOR itself (not just changes the morpheme's own segments) — e.g. + /// Indonesian's meN- causing a following voiceless obstruent to delete after nasal assimilation. + /// already handles a neighbor that survives unchanged or whose OWN + /// substitution is length-preserving; this method is the complementary case where the neighbor + /// disappears. Each result pairs the morpheme's own resulting surface (skipping any deleted node + /// within its own span too, so a length-preserving deletion inside the morpheme itself — e.g. the + /// nasal deleting before a sonorant root — is also captured here without double-counting against + /// ) with the neighbor's own underlying , so the + /// caller can build-time-gate a "the next real segment was deleted" arc to roots whose own leading + /// segment actually unifies with it — never a general runtime mechanism, just a bounded lookup + /// table keyed by the alphabet (size ~dozens), not by the lexicon. + /// + /// Tries a single trailing neighbor first (enough for a rule whose own environment needs nothing + /// beyond the deleted segment); if that finds nothing for a given candidate, falls back to probing + /// WITH A SECOND trailing segment too — some rules (Indonesian's voiceless-obstruent deletion) + /// require a further segment of right context (e.g. "and then a vowel") to satisfy their own + /// environment, which a single neighbor can never supply. Bounded by alphabet² in the worst case + /// (dozens², not lexicon-sized). + /// + public IReadOnlyCollection<(string AffixSurface, FeatureStruct DeletedNeighbor)> DeletionJunctions( + string underlying + ) + { + if (_deletionJunctionsCache.TryGetValue(underlying, out var cached)) + { + return cached; + } + IReadOnlyCollection<(string, FeatureStruct)> computed = ComputeDeletionJunctions(underlying); + _deletionJunctionsCache[underlying] = computed; + return computed; + } + + private IReadOnlyCollection<(string AffixSurface, FeatureStruct DeletedNeighbor)> ComputeDeletionJunctions( + string underlying + ) + { + var result = new List<(string, FeatureStruct)>(); + if (!_anyDeletionSubrule) + { + return result; // no rule can ever delete a segment ⇒ nothing to find, by construction + } + int underlyingLen = NodeCount(underlying); + if (underlyingLen < 0) + { + return result; + } + foreach (string c1 in _alphabet) + { + if (TryProbeDeletion(underlying, c1, null, underlyingLen, out var hit)) + { + result.Add(hit); + continue; + } + foreach (string c2 in _alphabet) + { + if (TryProbeDeletion(underlying, c1, c2, underlyingLen, out var hit2)) + { + result.Add(hit2); + break; // one confirming c2 is enough to know c1's class deletes in SOME context + } + } + } + return result; + } + + private bool TryProbeDeletion( + string underlying, + string c1, + string c2, + int underlyingLen, + out (string AffixSurface, FeatureStruct DeletedNeighbor) hit + ) + { + hit = default; + int extra = c2 == null ? 1 : 2; + List outNodes = SurfaceNodes(underlying + c1 + c2); + if (outNodes == null || outNodes.Count != underlyingLen + extra) + { + return false; // unsegmentable, or a length-changing rule fired elsewhere in the window + } + if (!outNodes[underlyingLen].IsDeleted()) + { + return false; // c1 survived — Variants() already covers that case + } + string affixSurface = RenderNodes(outNodes.Take(underlyingLen)); + if (affixSurface == null) + { + return false; + } + CharacterDefinition cd = _table.FirstOrDefault(d => + d.Type == HCFeatureSystem.Segment && d.Representations.Contains(c1) + ); + if (cd == null) + { + return false; + } + hit = (affixSurface, cd.FeatureStruct); + return true; + } + + /// Apply forward phonology to a segment string and return the surface string, or null if + /// it cannot be segmented. + private string SurfaceOf(string underlying) + { + List nodes = SurfaceNodes(underlying); + return nodes == null ? null : RenderNodes(nodes); + } + + /// Apply forward phonology to a segment string and return the surface segment nodes, or + /// null if it cannot be segmented. + private List SurfaceNodes(string str) + { + Shape shape; + try + { + shape = _table.Segment(str); + } + catch (InvalidShapeException) + { + return null; + } + var word = new Word(_surfaceStratum, shape); + foreach (LinearRuleCascade cascade in _strataPrules) + { + word = cascade.Apply(word).DefaultIfEmpty(word).First(); + } + return word.Shape.Where(n => n.Annotation.Type() == HCFeatureSystem.Segment).ToList(); + } + + /// The number of segment nodes after segmentation (before any phonology), or -1 if the + /// string cannot be segmented. This is the reference length for boundary extraction: a neighbor + /// adds exactly one node, so a length-preserving context yields underlyingLen + 1 nodes. + private int NodeCount(string str) + { + Shape shape; + try + { + shape = _table.Segment(str); + } + catch (InvalidShapeException) + { + return -1; + } + return shape.Count(n => n.Annotation.Type() == HCFeatureSystem.Segment); + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplateRule.cs b/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplateRule.cs index e8cf7ee17..21248d002 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplateRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplateRule.cs @@ -6,18 +6,18 @@ namespace SIL.Machine.Morphology.HermitCrab { - internal class SynthesisAffixTemplateRule : IRule + internal class SynthesisAffixTemplateRule : IRule { private readonly Morpher _morpher; private readonly AffixTemplate _template; - private readonly List> _rules; + private readonly List> _rules; public SynthesisAffixTemplateRule(Morpher morpher, AffixTemplate template) { _morpher = morpher; _template = template; - _rules = new List>( - template.Slots.Select(slot => new RuleBatch( + _rules = new List>( + template.Slots.Select(slot => new RuleBatch( slot.Rules.Select(mr => mr.CompileSynthesisRule(morpher)), false, FreezableEqualityComparer.Default diff --git a/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplatesRule.cs b/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplatesRule.cs index 1d878cd75..a5ab1aa2a 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplatesRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/SynthesisAffixTemplatesRule.cs @@ -7,12 +7,12 @@ namespace SIL.Machine.Morphology.HermitCrab { - internal class SynthesisAffixTemplatesRule : IRule + internal class SynthesisAffixTemplatesRule : IRule { private readonly Morpher _morpher; private readonly Stratum _stratum; private readonly List _templates; - private readonly List> _templateRules; + private readonly List> _templateRules; public SynthesisAffixTemplatesRule(Morpher morpher, Stratum stratum) { diff --git a/src/SIL.Machine.Morphology.HermitCrab/SynthesisStratumRule.cs b/src/SIL.Machine.Morphology.HermitCrab/SynthesisStratumRule.cs index a15a6de5b..72ff8b24b 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/SynthesisStratumRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/SynthesisStratumRule.cs @@ -7,10 +7,10 @@ namespace SIL.Machine.Morphology.HermitCrab { - internal class SynthesisStratumRule : IRule + internal class SynthesisStratumRule : IRule { - private readonly IRule _mrulesRule; - private readonly IRule _prulesRule; + private readonly IRule _mrulesRule; + private readonly IRule _prulesRule; private readonly SynthesisAffixTemplatesRule _templatesRule; private readonly Stratum _stratum; private readonly Morpher _morpher; @@ -19,27 +19,27 @@ public SynthesisStratumRule(Morpher morpher, Stratum stratum) { _templatesRule = new SynthesisAffixTemplatesRule(morpher, stratum); _mrulesRule = null; - IEnumerable> mrules = stratum.MorphologicalRules.Select(mrule => + IEnumerable> mrules = stratum.MorphologicalRules.Select(mrule => mrule.CompileSynthesisRule(morpher) ); switch (stratum.MorphologicalRuleOrder) { case MorphologicalRuleOrder.Linear: - _mrulesRule = new LinearRuleCascade( + _mrulesRule = new LinearRuleCascade( mrules, true, FreezableEqualityComparer.Default ); break; case MorphologicalRuleOrder.Unordered: - _mrulesRule = new CombinationRuleCascade( + _mrulesRule = new CombinationRuleCascade( mrules, true, FreezableEqualityComparer.Default ); break; } - _prulesRule = new LinearRuleCascade( + _prulesRule = new LinearRuleCascade( stratum.PhonologicalRules.Select(prule => prule.CompileSynthesisRule(morpher)) ); _stratum = stratum; diff --git a/src/SIL.Machine.Morphology.HermitCrab/VerifiedFstAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/VerifiedFstAnalyzer.cs new file mode 100644 index 000000000..99d4a1b08 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/VerifiedFstAnalyzer.cs @@ -0,0 +1,50 @@ +using System.Collections.Generic; +using SIL.Machine.Morphology; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// The FST proposes candidates fast; each is confirmed by restricted re-analysis + /// () — HC's own pinned to the candidate's + /// root and rules — and a candidate HC does not confirm is discarded (not a fallback). The + /// confirmed, genuine HC analysis is emitted. Because verification runs HC's real analysis + + /// synthesis, this enforces every constraint (category, MPR, co-occurrence, obligatoriness) without + /// reimplementing any of them. + /// + /// Sound by construction (a kept analysis is a real HC analysis) and lossless (a valid candidate is + /// never false-rejected). It does not add analyses the proposer never produced, so under-generation + /// (coverage) must be closed in the proposer. Thread-safe: the immutable proposer is shared + /// and each verification rents a from the pool, so many words can be analyzed + /// in parallel. + /// + public class VerifiedFstAnalyzer : IMorphologicalAnalyzer + { + private readonly IMorphologicalAnalyzer _proposer; + private readonly MorpherPool _pool; + + public VerifiedFstAnalyzer(IMorphologicalAnalyzer proposer, MorpherPool pool) + { + _proposer = proposer; + _pool = pool; + } + + /// Build the proposer and a verify Morpher pool from a language. + public VerifiedFstAnalyzer(TraceManager traceManager, Language language) + : this( + new FstTemplateAnalyzer(language, new Morpher(traceManager, language)), + new MorpherPool(() => new Morpher(new TraceManager(), language)) + ) { } + + public IEnumerable AnalyzeWord(string word) + { + foreach (WordAnalysis candidate in _proposer.AnalyzeWord(word)) + { + WordAnalysis confirmed = FstReplay.Confirm(_pool, candidate, word); + if (confirmed != null) + { + yield return confirmed; + } + } + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/Word.cs b/src/SIL.Machine.Morphology.HermitCrab/Word.cs index 9b29429e9..96748875f 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/Word.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/Word.cs @@ -10,7 +10,7 @@ namespace SIL.Machine.Morphology.HermitCrab { - public class Word : Freezable, IAnnotatedData, ICloneable + public class Word : Freezable, IAnnotatedData, ICloneable { public const string RootMorphID = "ROOT"; @@ -19,8 +19,12 @@ public class Word : Freezable, IAnnotatedData, ICloneable private Shape _shape; private readonly List _mruleApps; private int _mruleAppIndex = -1; - private readonly Dictionary _mrulesUnapplied; - private readonly Dictionary _mrulesApplied; + + // RUSTIFY lever 2: lazily allocated — these morphological-rule bookkeeping maps stay empty through + // the phonological-analysis cascade (where ~345 clones/word happen), so cloning them eagerly per + // candidate allocated an empty dictionary for nothing. Null means empty; created on first write. + private Dictionary _mrulesUnapplied; + private Dictionary _mrulesApplied; private readonly List _nonHeadApps; private int _nonHeadAppIndex = -1; private readonly MprFeatureSet _mprFeatures; @@ -29,7 +33,7 @@ public class Word : Freezable, IAnnotatedData, ICloneable private Stratum _stratum; private bool? _isLastAppliedRuleFinal; private bool _isPartial; - private readonly Dictionary> _disjunctiveAllomorphIndices; + private Dictionary> _disjunctiveAllomorphIndices; // lazily allocated (see above) private int _mruleAppCount = 0; private readonly IList _alternatives = new List(); @@ -42,12 +46,10 @@ public Word(RootAllomorph rootAllomorph, FeatureStruct realizationalFS) SetRootAllomorph(rootAllomorph); RealizationalFeatureStruct = realizationalFS; _mruleApps = new List(); - _mrulesUnapplied = new Dictionary(); - _mrulesApplied = new Dictionary(); + // _mrulesUnapplied / _mrulesApplied / _disjunctiveAllomorphIndices are lazily allocated (null = empty). _nonHeadApps = new List(); _obligatorySyntacticFeatures = new IDBearerSet(); _isLastAppliedRuleFinal = null; - _disjunctiveAllomorphIndices = new Dictionary>(); } public Word(Stratum stratum, Shape shape) @@ -60,13 +62,11 @@ public Word(Stratum stratum, Shape shape) RealizationalFeatureStruct = new FeatureStruct(); _mprFeatures = new MprFeatureSet(); _mruleApps = new List(); - _mrulesUnapplied = new Dictionary(); - _mrulesApplied = new Dictionary(); + // _mrulesUnapplied / _mrulesApplied / _disjunctiveAllomorphIndices are lazily allocated (null = empty). _nonHeadApps = new List(); _obligatorySyntacticFeatures = new IDBearerSet(); _isLastAppliedRuleFinal = null; _isPartial = false; - _disjunctiveAllomorphIndices = new Dictionary>(); } protected Word(Word word) @@ -82,18 +82,29 @@ protected Word(Word word) _mprFeatures = word.MprFeatures.Clone(); _mruleApps = new List(word._mruleApps); _mruleAppIndex = word._mruleAppIndex; - _mrulesUnapplied = new Dictionary(word._mrulesUnapplied); - _mrulesApplied = new Dictionary(word._mrulesApplied); + // Lazily-allocated maps: copy only when the source actually has entries (null = empty), so a + // candidate cloned during phonological analysis allocates none of these dictionaries. + _mrulesUnapplied = + word._mrulesUnapplied == null || word._mrulesUnapplied.Count == 0 + ? null + : new Dictionary(word._mrulesUnapplied); + _mrulesApplied = + word._mrulesApplied == null || word._mrulesApplied.Count == 0 + ? null + : new Dictionary(word._mrulesApplied); _nonHeadApps = new List(word._nonHeadApps.CloneItems()); _nonHeadAppIndex = word._nonHeadAppIndex; _obligatorySyntacticFeatures = new IDBearerSet(word._obligatorySyntacticFeatures); _isLastAppliedRuleFinal = word._isLastAppliedRuleFinal; _isPartial = word._isPartial; CurrentTrace = word.CurrentTrace; - _disjunctiveAllomorphIndices = word._disjunctiveAllomorphIndices.ToDictionary( - kvp => kvp.Key, - kvp => new HashSet(kvp.Value) - ); + _disjunctiveAllomorphIndices = + word._disjunctiveAllomorphIndices == null || word._disjunctiveAllomorphIndices.Count == 0 + ? null + : word._disjunctiveAllomorphIndices.ToDictionary( + kvp => kvp.Key, + kvp => new HashSet(kvp.Value) + ); _mruleAppCount = word._mruleAppCount; } @@ -102,7 +113,7 @@ public IEnumerable> Morphs get { var morphs = new List>(); - foreach (Annotation ann in Annotations) + foreach (Annotation ann in _shape.Annotations) { ann.PostorderTraverse(a => { @@ -173,14 +184,17 @@ public ICollection ObligatorySyntacticFeatures get { return _obligatorySyntacticFeatures; } } - public Range Range + // RUSTIFY Stage 2: Word is the FST's IAnnotatedData and the FST now binds as Fst + // (offset = node Tag), so these expose the shape's int-offset projection. Code that wants the + // ShapeNode-level annotations/range uses word.Shape.Annotations / word.Shape.Range directly. + public Range Range { - get { return _shape.Range; } + get { return _shape.IntRange; } } - public AnnotationList Annotations + public AnnotationList Annotations { - get { return _shape.Annotations; } + get { return _shape.IntAnnotations; } } public Stratum Stratum @@ -318,7 +332,11 @@ internal void MorphologicalRuleUnapplied(IMorphologicalRule mrule) { CheckFrozen(); if (mrule != null) - _mrulesUnapplied.UpdateValue(mrule, () => 0, count => count + 1); + (_mrulesUnapplied = _mrulesUnapplied ?? new Dictionary()).UpdateValue( + mrule, + () => 0, + count => count + 1 + ); if (!(mrule is RealizationalAffixProcessRule)) { _mruleApps.Add(mrule); @@ -333,7 +351,7 @@ internal void MorphologicalRuleUnapplied(IMorphologicalRule mrule) /// The number of unapplications. internal int GetUnapplicationCount(IMorphologicalRule mrule) { - if (!_mrulesUnapplied.TryGetValue(mrule, out int numUnapplies)) + if (_mrulesUnapplied == null || !_mrulesUnapplied.TryGetValue(mrule, out int numUnapplies)) numUnapplies = 0; return numUnapplies; } @@ -349,9 +367,15 @@ internal void MorphologicalRuleApplied(IMorphologicalRule mrule, IEnumerable 0, count => count + 1); + (_mrulesApplied = _mrulesApplied ?? new Dictionary()).UpdateValue( + mrule, + () => 0, + count => count + 1 + ); if (allomorphIndices != null) - _disjunctiveAllomorphIndices.GetOrCreate(_mruleAppCount.ToString()).UnionWith(allomorphIndices); + (_disjunctiveAllomorphIndices = _disjunctiveAllomorphIndices ?? new Dictionary>()) + .GetOrCreate(_mruleAppCount.ToString()) + .UnionWith(allomorphIndices); _mruleAppCount++; } @@ -372,7 +396,7 @@ internal bool? IsLastAppliedRuleFinal /// The number of applications. internal int GetApplicationCount(IMorphologicalRule mrule) { - if (!_mrulesApplied.TryGetValue(mrule, out int numApplies)) + if (_mrulesApplied == null || !_mrulesApplied.TryGetValue(mrule, out int numApplies)) numApplies = 0; return numApplies; } @@ -464,7 +488,10 @@ internal IEnumerable> GetMorphs(Allomorph allomorph) internal IEnumerable GetDisjunctiveAllomorphApplications(Annotation morph) { var morphID = (string)morph.FeatureStruct.GetValue(HCFeatureSystem.MorphID); - if (_disjunctiveAllomorphIndices.TryGetValue(morphID, out HashSet indices)) + if ( + _disjunctiveAllomorphIndices != null + && _disjunctiveAllomorphIndices.TryGetValue(morphID, out HashSet indices) + ) return indices; return null; } @@ -508,6 +535,13 @@ internal void ResetDirty() protected override int FreezeImpl() { int code = 23; + // Freezing SyntacticFeatureStruct is correctness hardening only: it makes the 8 + // mutate-after-freeze call sites elsewhere in this namespace (which clone-then-reassign + // rather than mutate in place, see AnalysisAffixTemplateRule etc.) actually enforced by + // CheckFrozen(), instead of the invariant being purely conventional. Deliberately NOT + // folded into the frozen hash/ValueEquals below — those predate this and dedup must stay + // unchanged. + SyntacticFeatureStruct.Freeze(); _shape.Freeze(); code = code * 31 + _shape.GetFrozenHashCode(); _realizationalFS.Freeze(); diff --git a/src/SIL.Machine.Morphology.HermitCrab/XmlLanguageLoader.cs b/src/SIL.Machine.Morphology.HermitCrab/XmlLanguageLoader.cs index 6469f7990..efea6e214 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/XmlLanguageLoader.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/XmlLanguageLoader.cs @@ -540,12 +540,12 @@ CharacterDefinitionTable defaultTable { var variables = new Dictionary>(); - Pattern leftEnv = LoadPhoneticTemplate( + Pattern leftEnv = LoadPhoneticTemplate( envElem.Elements("LeftEnvironment").Elements("PhoneticTemplate").SingleOrDefault(), variables, defaultTable ); - Pattern rightEnv = LoadPhoneticTemplate( + Pattern rightEnv = LoadPhoneticTemplate( envElem.Elements("RightEnvironment").Elements("PhoneticTemplate").SingleOrDefault(), variables, defaultTable @@ -1078,7 +1078,7 @@ private void LoadMorphologicalLhs( XElement reqPhonInputElem, Dictionary> variables, Dictionary partNames, - IList> lhs, + IList> lhs, CharacterDefinitionTable defaultTable, string partNamePrefix = null ) @@ -1377,7 +1377,7 @@ private Dictionary> LoadVariables(XElemen return variables; } - private IEnumerable> LoadPatternNodes( + private IEnumerable> LoadPatternNodes( XElement pseqElem, Dictionary> variables, CharacterDefinitionTable defaultTable, @@ -1386,12 +1386,12 @@ Dictionary groupNames { foreach (XElement recElem in pseqElem.Elements()) { - PatternNode node = null; + PatternNode node = null; switch (recElem.Name.LocalName) { case "SimpleContext": SimpleContext simpleCtxt = LoadSimpleContext(recElem, variables); - node = new Constraint(simpleCtxt.FeatureStruct) { Tag = simpleCtxt }; + node = new Constraint(simpleCtxt.FeatureStruct) { Tag = simpleCtxt }; break; case "Segment": @@ -1399,7 +1399,7 @@ Dictionary groupNames CharacterDefinition cd = _charDefs[ (string)recElem.Attribute(recElem.Name.LocalName == "Segment" ? "segment" : "boundary") ]; - node = new Constraint(cd.FeatureStruct) { Tag = cd }; + node = new Constraint(cd.FeatureStruct) { Tag = cd }; break; case "OptionalSegmentSequence": @@ -1407,10 +1407,10 @@ Dictionary groupNames int min = string.IsNullOrEmpty(minStr) ? 0 : int.Parse(minStr); var maxStr = (string)recElem.Attribute("max"); int max = string.IsNullOrEmpty(maxStr) ? -1 : int.Parse(maxStr); - node = new Quantifier( + node = new Quantifier( min, max, - new Group(LoadPatternNodes(recElem, variables, defaultTable, groupNames)) + new Group(LoadPatternNodes(recElem, variables, defaultTable, groupNames)) ); break; @@ -1418,8 +1418,8 @@ Dictionary groupNames CharacterDefinitionTable segsTable = GetTable(recElem, defaultTable); var shapeStr = (string)recElem.Element("PhoneticShape"); var segments = new Segments(segsTable, shapeStr); - node = new Group( - segments.Shape.Select(n => new Constraint(n.Annotation.FeatureStruct)) + node = new Group( + segments.Shape.Select(n => new Constraint(n.Annotation.FeatureStruct)) ) { Tag = segments, @@ -1433,7 +1433,7 @@ Dictionary groupNames if (groupNames == null || string.IsNullOrEmpty(id) || !groupNames.TryGetValue(id, out groupName)) yield return node; else - yield return new Group(groupName, node); + yield return new Group(groupName, node); } } @@ -1460,20 +1460,20 @@ Dictionary> variables return new SimpleContext(nc, ctxtVars); } - private Pattern LoadPhoneticTemplate( + private Pattern LoadPhoneticTemplate( XElement ptempElem, Dictionary> variables, CharacterDefinitionTable defaultTable = null, Dictionary groupNames = null ) { - var pattern = new Pattern(); + var pattern = new Pattern(); if (ptempElem != null) { if ((string)ptempElem.Attribute("initialBoundaryCondition") == "true") - pattern.Children.Add(new Constraint(HCFeatureSystem.LeftSideAnchor)); + pattern.Children.Add(new Constraint(HCFeatureSystem.LeftSideAnchor)); foreach ( - PatternNode node in LoadPatternNodes( + PatternNode node in LoadPatternNodes( ptempElem.Element("PhoneticSequence"), variables, defaultTable, @@ -1484,13 +1484,13 @@ PatternNode node in LoadPatternNodes( pattern.Children.Add(node); } if ((string)ptempElem.Attribute("finalBoundaryCondition") == "true") - pattern.Children.Add(new Constraint(HCFeatureSystem.RightSideAnchor)); + pattern.Children.Add(new Constraint(HCFeatureSystem.RightSideAnchor)); } pattern.Freeze(); return pattern; } - private Pattern LoadPhoneticSequence( + private Pattern LoadPhoneticSequence( XElement pseqElem, Dictionary> variables, CharacterDefinitionTable defaultTable = null, @@ -1498,8 +1498,8 @@ private Pattern LoadPhoneticSequence( ) { if (pseqElem == null) - return Pattern.New().Value; - var pattern = new Pattern(name, LoadPatternNodes(pseqElem, variables, defaultTable, null)); + return Pattern.New().Value; + var pattern = new Pattern(name, LoadPatternNodes(pseqElem, variables, defaultTable, null)); pattern.Freeze(); return pattern; } diff --git a/src/SIL.Machine.Morphology.HermitCrab/XmlLanguageWriter.cs b/src/SIL.Machine.Morphology.HermitCrab/XmlLanguageWriter.cs index 4dec714cb..5f7e5663e 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/XmlLanguageWriter.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/XmlLanguageWriter.cs @@ -1119,7 +1119,7 @@ private XElement WriteProperties(IDictionary properties) } private XElement WritePhoneticTemplate( - Pattern pattern, + Pattern pattern, Dictionary> variables, CharacterDefinitionTable defaultTable = null, string prefix = null @@ -1134,9 +1134,9 @@ private XElement WritePhoneticTemplate( return phonTempElem; } - private bool IsAnchor(PatternNode node, FeatureSymbol type) + private bool IsAnchor(PatternNode node, FeatureSymbol type) { - if (node is Constraint constraint) + if (node is Constraint constraint) { return constraint.Type() == HCFeatureSystem.Anchor && (FeatureSymbol)constraint.FeatureStruct.GetValue(HCFeatureSystem.AnchorType) == type; @@ -1146,7 +1146,7 @@ private bool IsAnchor(PatternNode node, FeatureSymbol type) } private XElement WritePhoneticSequence( - Pattern pattern, + Pattern pattern, Dictionary> variables, CharacterDefinitionTable defaultTable = null, string prefix = null @@ -1155,20 +1155,20 @@ private XElement WritePhoneticSequence( var seqElem = new XElement("PhoneticSequence"); if (!string.IsNullOrEmpty(pattern.Name)) seqElem.Add(new XAttribute("id", Normalize((prefix ?? "") + pattern.Name))); - foreach (PatternNode node in pattern.Children) + foreach (PatternNode node in pattern.Children) seqElem.Add(WritePatternNodes(node, variables, defaultTable, prefix ?? "", null)); return seqElem; } private IEnumerable WritePatternNodes( - PatternNode node, + PatternNode node, Dictionary> variables, CharacterDefinitionTable defaultTable, string prefix, string id ) { - if (node is Constraint constraint) + if (node is Constraint constraint) { if (constraint.Tag == null) yield break; @@ -1187,13 +1187,13 @@ string id yield break; } - if (node is Quantifier quantifier) + if (node is Quantifier quantifier) { yield return WriteOptionalSegmentSequence(quantifier, variables, defaultTable, id); yield break; } - if (node is Group group) + if (node is Group group) { if (!string.IsNullOrEmpty(group.Name)) { @@ -1219,7 +1219,7 @@ XElement elem in WritePatternNodes( else { // Normal group - foreach (PatternNode childNode in group.Children) + foreach (PatternNode childNode in group.Children) { foreach (XElement elem in WritePatternNodes(childNode, variables, defaultTable, prefix, id)) yield return elem; @@ -1277,7 +1277,7 @@ private XElement WriteSimpleContext( } private XElement WriteOptionalSegmentSequence( - Quantifier quantifier, + Quantifier quantifier, Dictionary> variables, CharacterDefinitionTable defaultTable, string id diff --git a/src/SIL.Machine/Annotations/Annotation.cs b/src/SIL.Machine/Annotations/Annotation.cs index 7292fc591..d4c8c1a42 100644 --- a/src/SIL.Machine/Annotations/Annotation.cs +++ b/src/SIL.Machine/Annotations/Annotation.cs @@ -124,7 +124,18 @@ public bool Optional set { CheckFrozen(); + if (_optional == value) + return; _optional = value; + // Shape's int-offset projection copies Optional by value and caches against the root + // annotation list's Version (see AnnotationList.IncrementVersion). Optional is part of + // the projected view but flipping it is not a structural change, so bump the root + // list's version here to invalidate the cache — otherwise the matcher keeps seeing the + // stale flag and never forks the optional-skip instances (RUSTIFY Stage 2). + Annotation top = this; + while (top.Parent != null) + top = top.Parent; + (top.List as AnnotationList)?.IncrementVersion(); } } diff --git a/src/SIL.Machine/Annotations/AnnotationList.cs b/src/SIL.Machine/Annotations/AnnotationList.cs index c57fa9e24..20536e3f4 100644 --- a/src/SIL.Machine/Annotations/AnnotationList.cs +++ b/src/SIL.Machine/Annotations/AnnotationList.cs @@ -17,6 +17,81 @@ public class AnnotationList private int _currentID; private readonly Annotation _parent; private int _hashCode; + private int _version; + + /// + /// Monotonically increments on every structural change (add/remove/clear). Lets a consumer + /// (e.g. 's lazily-built int-offset annotation projection) cheaply + /// detect when a cached derivative is stale without diffing the list. + /// + internal int Version + { + get { return _version; } + } + + /// + /// Bumps for a non-structural change that a cached derivative still + /// depends on. Specifically: 's int-offset projection copies each + /// annotation's flag by value, so flipping + /// Optional (during analysis/unapplication) must invalidate that cache even though the list + /// structure is unchanged. 's setter calls this on + /// the root list. + /// + internal void IncrementVersion() + { + _version++; + } + + // Cache of filtered+direction-sorted annotation views for FST traversal (see + // TraversalMethodBase.Reset). Only populated on FROZEN lists — a frozen list (and its + // annotations' FeatureStructs) is immutable, so the filtered view is final; for unfrozen + // lists a rule's in-place FeatureStruct edit could silently invalidate a cached view, so + // they never cache. Keyed by filter-delegate reference: filters come from a handful of + // compiler-cached non-capturing lambdas (one per rule-class call site), so the chain stays + // tiny (≤ filters × directions). Lock-free CAS publish; a lost race just rebuilds once. + private sealed class FilteredView + { + internal readonly object Filter; + internal readonly Direction Direction; + internal readonly List> Annotations; + internal readonly FilteredView Next; + + internal FilteredView( + object filter, + Direction direction, + List> annotations, + FilteredView next + ) + { + Filter = filter; + Direction = direction; + Annotations = annotations; + Next = next; + } + } + + private FilteredView _filteredViews; + + internal List> GetFilteredView(object filter, Direction dir) + { + for (FilteredView v = _filteredViews; v != null; v = v.Next) + { + if (ReferenceEquals(v.Filter, filter) && v.Direction == dir) + return v.Annotations; + } + return null; + } + + internal void AddFilteredView(object filter, Direction dir, List> annotations) + { + while (true) + { + FilteredView head = _filteredViews; + var entry = new FilteredView(filter, dir, annotations, head); + if (System.Threading.Interlocked.CompareExchange(ref _filteredViews, entry, head) == head) + return; + } + } public AnnotationList() : base(new AnnotationComparer(), begin => new Annotation(Range.Null)) { } @@ -121,6 +196,7 @@ public override void Add(Annotation node) public void Add(Annotation node, bool subsume) { CheckFrozen(); + _version++; if (_parent != null && !_parent.Range.Contains(node.Range)) { throw new ArgumentException( @@ -160,6 +236,7 @@ public override bool Remove(Annotation node) public bool Remove(Annotation node, bool preserveChildren) { CheckFrozen(); + _version++; if (base.Remove(node)) { if (preserveChildren) @@ -281,6 +358,7 @@ public IEnumerable> GetNodes(Range range, Direction public override void Clear() { CheckFrozen(); + _version++; base.Clear(); } diff --git a/src/SIL.Machine/Annotations/Shape.cs b/src/SIL.Machine/Annotations/Shape.cs index ecea24a02..5e4dd3063 100644 --- a/src/SIL.Machine/Annotations/Shape.cs +++ b/src/SIL.Machine/Annotations/Shape.cs @@ -1,32 +1,82 @@ using System; +using System.Collections; using System.Collections.Generic; using System.Linq; -using SIL.Extensions; using SIL.Machine.DataStructures; using SIL.Machine.FeatureModel; using SIL.ObjectModel; namespace SIL.Machine.Annotations { + /// + /// An ordered sequence of s plus their annotation tree. + /// + /// As of the RUSTIFY flat-shape rework (Phase 3b-impl, Stage 1) a owns its nodes + /// in flat backing arrays addressed by a stable per-node : the prev/next + /// links (an in-array doubly-linked list, so + /// and stay O(1) and the tag-relabel order maintenance is preserved) + /// and the per-node frozen flag live here rather than on the node. The list machinery that used to be + /// inherited from OrderedBidirList<ShapeNode> is reimplemented over those arrays. The + /// objects added to the shape are retained as the canonical one-per-slot + /// handles, so reference identity is unchanged and behavior is byte-identical. + /// public class Shape - : OrderedBidirList, + : IOrderedBidirList, IAnnotatedData, ICloneable, IFreezable, IValueEquatable { + // Link sentinel for "no node" (the old null Next/Prev). + private const int Nil = -1; + private readonly Func _marginSelector; private readonly AnnotationList _annotations; + private readonly IEqualityComparer _comparer; private int _hashCode; + // Flat backing. Slot 0 = Begin margin, slot 1 = End margin, content nodes from slot 2 up. + private ShapeNode[] _nodes; // canonical handle per slot (null = free slot) + private int[] _next; // forward link by slot (Nil = none) + private int[] _prev; // backward link by slot (Nil = none) + private bool[] _frozen; // per-node frozen flag by slot + private int _capacity; + private int _used; // high-water count of slots ever handed out + private readonly Stack _free; // reclaimed slots below the high-water mark + private int _size; // content node count (excludes the two margins) + + private readonly ShapeNode _begin; + private readonly ShapeNode _end; + + // RUSTIFY Stage 3 (III): copy-on-write clone. A clone of a *frozen* shape stores its source here + // and does NOT copy the node graph: the hot read path (the FST matcher) consumes the clone only + // through the int-offset projection (IntAnnotations/IntRange), which is served from the frozen + // source — so a clone that is only traversed (never has a ShapeNode/Annotation handle handed out + // and is never mutated) costs a shell, not N nodes + N annotations + their skip-list towers. The + // first access that needs the real node graph — any flat-backing link read, enumeration, handle + // bridge (NodeAt), .Annotations access, or mutation — calls EnsureInflated() to materialize it. + private Shape _cowSource; + public Shape(Func marginSelector) : this(marginSelector, new AnnotationList()) { } public Shape(Func marginSelector, AnnotationList annotations) - : base(EqualityComparer.Default, marginSelector) { _marginSelector = marginSelector; _annotations = annotations; + _comparer = EqualityComparer.Default; + _free = new Stack(); + _capacity = 0; + _used = 0; + _size = 0; + + _begin = marginSelector(true); + _end = marginSelector(false); + Adopt(_begin, AllocSlot()); + Adopt(_end, AllocSlot()); + _next[_begin.Index] = _end.Index; + _prev[_end.Index] = _begin.Index; + Begin.Tag = int.MinValue; End.Tag = int.MaxValue; _annotations.Add(Begin.Annotation, false); @@ -36,9 +86,115 @@ public Shape(Func marginSelector, AnnotationList ann protected Shape(Shape shape) : this(shape._marginSelector) { - shape.CopyTo(this); + // Copy-on-write only when the source is frozen (immutable, so safe to share): the common + // case, since words are frozen before being cloned. Flatten any COW chain to the real source. + if (shape.IsFrozen) + _cowSource = shape._cowSource ?? shape; + else + shape.CopyTo(this); + } + + // Materialize a copy-on-write clone's real node graph on first access that needs it (see _cowSource). + // Idempotent and not reentrant: clears _cowSource first, then does the real copy from the (frozen, + // non-COW) source; re-freezes if this clone had already been frozen-by-sharing. + private void EnsureInflated() + { + if (_cowSource == null) + return; + Shape src = _cowSource; + _cowSource = null; + bool wasFrozen = IsFrozen; + if (wasFrozen) + IsFrozen = false; + src.CopyTo(this); + if (wasFrozen) + Freeze(); + } + + #region Flat backing helpers + + private void EnsureCapacity(int n) + { + if (n <= _capacity) + return; + int newCap = _capacity == 0 ? 4 : _capacity * 2; + while (newCap < n) + newCap *= 2; + Array.Resize(ref _nodes, newCap); + Array.Resize(ref _next, newCap); + Array.Resize(ref _prev, newCap); + Array.Resize(ref _frozen, newCap); + _capacity = newCap; + } + + private int AllocSlot() + { + if (_free.Count > 0) + return _free.Pop(); + int idx = _used++; + EnsureCapacity(_used); + return idx; + } + + private void Adopt(ShapeNode node, int idx) + { + _nodes[idx] = node; + _next[idx] = Nil; + _prev[idx] = Nil; + _frozen[idx] = false; + node.Owner = this; + node.Index = idx; + } + + // Detaches a node from this shape (the old OrderedBidirListNode.Clear): frees its slot and + // resets the handle to the detached state. Does not adjust _size; callers manage that. + // Preserves a frozen node's IsFrozen==true for its remaining (detached) lifetime by carrying + // the flag over to _detachedFrozen before clearing Owner — ShapeNode.IsFrozen used to be a + // permanent per-node bool that never reset once set, and detaching a node must not silently + // un-freeze it. + private void Detach(ShapeNode node) + { + int idx = node.Index; + bool wasFrozen = _frozen[idx]; + _nodes[idx] = null; + _next[idx] = Nil; + _prev[idx] = Nil; + _frozen[idx] = false; + node.Owner = null; + node.Index = -1; + if (wasFrozen) + node.MarkDetachedFrozen(); + _free.Push(idx); } + internal ShapeNode GetNextLink(int index) + { + if (_cowSource != null) + EnsureInflated(); + int n = _next[index]; + return n < 0 ? null : _nodes[n]; + } + + internal ShapeNode GetPrevLink(int index) + { + if (_cowSource != null) + EnsureInflated(); + int p = _prev[index]; + return p < 0 ? null : _nodes[p]; + } + + internal bool IsNodeFrozen(int index) + { + return _frozen[index]; + } + + internal void SetNodeFrozen(int index) + { + _frozen[index] = true; + } + + #endregion + public Range Range { get { return Range.Create(Begin, End); } @@ -46,16 +202,46 @@ public Range Range public AnnotationList Annotations { - get { return _annotations; } + get + { + // Hands out the ShapeNode-keyed annotation tree (morph extraction, rule code, result + // comparison) — needs the real node graph. + EnsureInflated(); + return _annotations; + } } public bool IsFrozen { get; private set; } + // ---- RUSTIFY Stage 2: int-offset projection (the Fst bridge) ---- + // The FST binds as Fst with offset = a DENSE per-projection node position (0..N+1 in + // node order: Begin=0, content 1..N, End=N+1). Dense contiguous offsets — rather than the + // shape's sparse Tag — are what keep the int model byte-identical: they never collide with the + // Range.Null = [-1,-1] sentinel, never overflow the half-open +1 (Tag's End == int.MaxValue + // did), and keep the End anchor a non-empty [N+1, N+2) (matching the ShapeNode anchor's length). + // These views are rebuilt lazily, gated on the annotation list Version (+ frozen state), so a + // stable/frozen shape builds them once and reuses them across the thousands of Transduce calls + // per word, while a shape mutated in place by an iterative rewrite rule rebuilds on next access. + private AnnotationList _intAnnotations; + private Dictionary _byOffset; + private Dictionary _nodeOffset; + private int _intProjectionVersion = -1; + private bool _intProjectionFrozen; + public void Freeze() { if (IsFrozen) return; + // A copy-on-write clone equals its already-frozen source: adopt the frozen state (and its + // hash) without materializing the node graph, so freeze-then-traverse stays handle-free. + if (_cowSource != null) + { + IsFrozen = true; + _hashCode = _cowSource.GetFrozenHashCode(); + return; + } + IsFrozen = true; Begin.Freeze(); int i = 0; @@ -71,6 +257,155 @@ public void Freeze() _hashCode = 23; _hashCode = _hashCode * 31 + Count; _hashCode = _hashCode * 31 + _annotations.GetFrozenHashCode(); + + // Build the int-offset projection now, while frozen and single-threaded. A frozen shape is + // immutable, so this projection is final — and (RUSTIFY Stage 3 / COW) copy-on-write clones + // delegate their IntAnnotations to this frozen source, possibly from several parse threads at + // once. Building eagerly here means those concurrent reads always hit a complete cache rather + // than racing a lazy first build of the offset dictionaries. No extra work overall: a frozen + // shape that is frozen is one that will be traversed (by itself or its COW clones). + EnsureIntProjection(); + // Freeze the (final) projection so the FST traversal can cache filtered views on it + // (AnnotationList.GetFilteredView gates on IsFrozen — for an unfrozen list, in-place + // FeatureStruct edits could silently invalidate a cached view). Also fail-fast hardens + // the COW invariant: any unexpected mutation of a shared projection now throws. + _intAnnotations.Freeze(); + } + + // Maps a ShapeNode annotation range to its int-offset range using the dense per-projection node + // positions: a single node [n, n] -> half-open [off(n), off(n)+1); a span [s, e] -> + // [off(s), off(e)+1). Relationship-preserving vs the inclusive ShapeNode form (see the + // IntOffsetRangeMapping parity test); dense offsets make it free of the Tag edge cases. + private Range ToIntRange(Range r) + { + return Range.Create(_nodeOffset[r.Start], _nodeOffset[r.End] + 1); + } + + private void EnsureIntProjection() + { + if ( + _intAnnotations != null + && _intProjectionVersion == _annotations.Version + && _intProjectionFrozen == IsFrozen + ) + { + return; + } + + // Assign dense offsets to every node in node order: Begin=0, content 1..N, End=N+1. + _nodeOffset = new Dictionary(); + _byOffset = new Dictionary(); + int pos = 0; + AssignOffset(Begin, ref pos); + foreach (ShapeNode node in this) + AssignOffset(node, ref pos); + AssignOffset(End, ref pos); + + var dest = new AnnotationList(); + foreach (Annotation top in _annotations) + dest.Add(ProjectAnnotation(top), false); + + _intAnnotations = dest; + _intProjectionVersion = _annotations.Version; + _intProjectionFrozen = IsFrozen; + } + + private void AssignOffset(ShapeNode node, ref int pos) + { + _nodeOffset[node] = pos; + _byOffset[pos] = node; + pos++; + } + + private Annotation ProjectAnnotation(Annotation src) + { + // Share the FeatureStruct by reference (no clone): the int annotation is a view, and a + // rule's in-place FeatureStruct edit on a matched node must remain visible. + var ann = new Annotation(ToIntRange(src.Range), src.FeatureStruct) { Optional = src.Optional }; + if (!src.IsLeaf) + { + foreach (Annotation child in src.Children) + ann.Children.Add(ProjectAnnotation(child), false); + } + return ann; + } + + /// + /// The int-offset projection of this shape's annotations (RUSTIFY Stage 2): the + /// the Fst<Word,int> traversal consumes. Built + /// lazily and cached against the annotation . + /// + public AnnotationList IntAnnotations + { + get + { + // The whole point of COW: serve the projection from the frozen source without + // materializing this clone's node graph. This is the FST matcher's only access path. + if (_cowSource != null) + return _cowSource.IntAnnotations; + EnsureIntProjection(); + return _intAnnotations; + } + } + + /// + /// The whole-shape int range — the half-open image of the inclusive ShapeNode range + /// [Begin, End], i.e. [off(Begin), off(End) + 1). The +1 matters: the only + /// framework consumer is Matcher.GetStartAnnotation via Range.GetStart(dir), and a + /// right-to-left match starts at GetStart(RtL) == End. The End anchor's dense node range + /// is [off(End), off(End)+1), whose RtL start coordinate is off(End)+1 — so End + /// must be off(End)+1 for a RtL match to begin at the End anchor rather than at + /// the last content node (which would skip any edit adjacent to End, e.g. inserting a deleted + /// segment after the final vowel during analysis). + /// + public Range IntRange + { + get + { + if (_cowSource != null) + return _cowSource.IntRange; + EnsureIntProjection(); + return Range.Create(_nodeOffset[Begin], _nodeOffset[End] + 1); + } + } + + /// + /// Resolves an int offset (a dense node position) back to its node — the reverse of the + /// int-offset projection, used by rule RHS code to act on the segment graph. Works on frozen + /// and unfrozen shapes. + /// + public ShapeNode NodeAt(int offset) + { + // Hands out a real ShapeNode of this shape (rule-RHS / mutation path) — must materialize. + EnsureInflated(); + EnsureIntProjection(); + return _byOffset[offset]; + } + + /// + /// The int offset (dense node position) of a node. Companion to . + /// + public int OffsetOf(ShapeNode node) + { + EnsureInflated(); + EnsureIntProjection(); + return _nodeOffset[node]; + } + + /// + /// The offset to pass to Matcher.Match(input, start) to begin matching at + /// in direction . A node's half-open annotation + /// is [off, off+1), and the matcher locates the start annotation by its + /// Range.GetStart(dir): that is off left-to-right but off+1 right-to-left. + /// (With the old inclusive [node, node] ShapeNode ranges this was direction-agnostic; + /// the dense half-open int model needs this adjustment to stay byte-identical for RtL matches.) + /// + public int MatchStartOffset(ShapeNode node, Direction dir) + { + EnsureInflated(); + EnsureIntProjection(); + int off = _nodeOffset[node]; + return dir == Direction.LeftToRight ? off : off + 1; } private void CheckFrozen() @@ -79,6 +414,178 @@ private void CheckFrozen() throw new InvalidOperationException("The shape is immutable."); } + #region ICollection / IBidirList + + public int Count + { + // COW-safe: the clone's content count equals its frozen source's, without inflating. + get { return _cowSource != null ? _cowSource.Count : _size; } + } + + bool ICollection.IsReadOnly + { + get { return false; } + } + + public ShapeNode Begin + { + get { return _begin; } + } + + public ShapeNode End + { + get { return _end; } + } + + public ShapeNode GetBegin(Direction dir) + { + return dir == Direction.LeftToRight ? Begin : End; + } + + public ShapeNode GetEnd(Direction dir) + { + return dir == Direction.LeftToRight ? End : Begin; + } + + public ShapeNode First + { + // Count is COW-aware; GetNextLink inflates if needed, so this hands out a real node. + get { return Count == 0 ? null : GetNextLink(_begin.Index); } + } + + public ShapeNode Last + { + get { return Count == 0 ? null : GetPrevLink(_end.Index); } + } + + public ShapeNode GetFirst(Direction dir) + { + return dir == Direction.LeftToRight ? First : Last; + } + + public ShapeNode GetLast(Direction dir) + { + return dir == Direction.LeftToRight ? Last : First; + } + + public ShapeNode GetNext(ShapeNode cur) + { + return GetNext(cur, Direction.LeftToRight); + } + + public ShapeNode GetNext(ShapeNode cur, Direction dir) + { + if (cur.List != this) + throw new ArgumentException("cur is not a member of this collection.", "cur"); + return dir == Direction.LeftToRight ? cur.Next : cur.Prev; + } + + public ShapeNode GetPrev(ShapeNode cur) + { + return GetPrev(cur, Direction.LeftToRight); + } + + public ShapeNode GetPrev(ShapeNode cur, Direction dir) + { + if (cur.List != this) + throw new ArgumentException("cur is not a member of this collection.", "cur"); + return dir == Direction.LeftToRight ? cur.Prev : cur.Next; + } + + public bool Find(ShapeNode example, out ShapeNode result) + { + return Find(example, Direction.LeftToRight, out result); + } + + public bool Find(ShapeNode start, ShapeNode example, out ShapeNode result) + { + return Find(start, example, Direction.LeftToRight, out result); + } + + public bool Find(ShapeNode example, Direction dir, out ShapeNode result) + { + return Find(GetFirst(dir), example, dir, out result); + } + + public bool Find(ShapeNode start, ShapeNode example, Direction dir, out ShapeNode result) + { + for (ShapeNode n = start; n != GetEnd(dir); n = n.GetNext(dir)) + { + if (_comparer.Equals(example, n)) + { + result = n; + return true; + } + } + result = null; + return false; + } + + public bool Contains(ShapeNode node) + { + return node.List == this; + } + + public void CopyTo(ShapeNode[] array, int arrayIndex) + { + foreach (ShapeNode node in this) + array[arrayIndex++] = node; + } + + IEnumerator IEnumerable.GetEnumerator() + { + // Count is COW-aware; First inflates if needed. (Use Count, not _size — a COW clone has + // _size == 0 until inflated.) + if (Count == 0) + yield break; + + for (ShapeNode node = First; node != End; node = node.Next) + yield return node; + } + + IEnumerator IEnumerable.GetEnumerator() + { + return ((IEnumerable)this).GetEnumerator(); + } + + public void Add(ShapeNode node) + { + AddAfter(_end.Prev, node, Direction.LeftToRight); + } + + public void AddRange(IEnumerable e) + { + foreach (ShapeNode node in e) + Add(node); + } + + public void AddRangeAfter(ShapeNode node, IEnumerable newNodes, Direction dir) + { + if (_size == 0 && node == null) + node = GetBegin(dir); + + if (node.List != this) + throw new ArgumentException("node is not a member of this collection.", "node"); + + foreach (ShapeNode newNode in newNodes) + { + AddAfter(node, newNode, dir); + node = newNode; + } + } + + public void AddRangeAfter(ShapeNode node, IEnumerable newNodes) + { + AddRangeAfter(node, newNodes, Direction.LeftToRight); + } + + public void AddAfter(ShapeNode node, ShapeNode newNode) + { + AddAfter(node, newNode, Direction.LeftToRight); + } + + #endregion + public ShapeNode Add(FeatureStruct fs) { return Add(fs, false); @@ -104,10 +611,30 @@ public Range CopyTo(ShapeNode srcStart, ShapeNode srcEnd, Shape dest) return CopyTo(Range.Create(srcStart, srcEnd), dest); } + // Per-thread scratch map reused across CopyTo calls. CopyTo runs on every Word.Clone + // (hundreds per parse on a real grammar) and the map is fully consumed before CopyTo + // returns (never retained) and CopyTo is not reentrant, so reusing one map per thread + // removes a per-clone Dictionary allocation without any sharing hazard. This is a SAFE + // pool — unlike the across-word FST arena (RUSTIFY Phase 1b), nothing here survives the + // call, so it cannot promote parse data to Gen2 / regress parallel parsing. + [ThreadStatic] + private static Dictionary CloneMapping; + public Range CopyTo(Range srcRange, Shape dest) { + // Reads this shape's real node graph + annotations as the copy source — materialize if COW. + // (When called from EnsureInflated the source is the real frozen shape, so this is a no-op.) + EnsureInflated(); ShapeNode startNode = null; ShapeNode endNode = null; + // Build the src->dest node mapping inline while cloning, instead of a second pass + // with GetNodes().Zip().ToDictionary(). CopyTo runs on every Word.Clone (thousands + // per parse on a real grammar), so eliminating the extra enumerations + LINQ + // allocations per clone is a measurable GC win. + Dictionary mapping = CloneMapping; + if (mapping == null) + mapping = CloneMapping = new Dictionary(); + mapping.Clear(); foreach (ShapeNode node in GetNodes(srcRange)) { ShapeNode newNode = node.Clone(); @@ -115,12 +642,10 @@ public Range CopyTo(Range srcRange, Shape dest) startNode = newNode; endNode = newNode; dest.Add(newNode); + mapping[node] = newNode; } Range destRange = Range.Create(startNode, endNode); - Dictionary mapping = GetNodes(srcRange) - .Zip(dest.GetNodes(destRange)) - .ToDictionary(tuple => tuple.Item1, tuple => tuple.Item2); foreach (Annotation ann in _annotations.GetNodes(srcRange)) CopyAnnotations(dest._annotations, ann, mapping); @@ -175,9 +700,10 @@ public ShapeNode AddAfter(ShapeNode node, FeatureStruct fs, bool optional, Direc return newNode; } - public override void AddAfter(ShapeNode node, ShapeNode newNode, Direction dir) + public void AddAfter(ShapeNode node, ShapeNode newNode, Direction dir) { CheckFrozen(); + EnsureInflated(); if (newNode.List == this) throw new ArgumentException("newNode is already a member of this collection.", "newNode"); if (node != null && node.List != this) @@ -221,20 +747,51 @@ public override void AddAfter(ShapeNode node, ShapeNode newNode, Direction dir) } } - base.AddAfter(node, newNode, dir); + // Splice newNode into the in-array linked list (was OrderedBidirList.AddAfter). + if (Count == 0 && node == null) + node = GetBegin(dir); + + newNode.Remove(); + Adopt(newNode, AllocSlot()); + + ShapeNode anchor = node; + if (dir == Direction.RightToLeft) + anchor = anchor.Prev; + + int aIdx = anchor.Index; + int sIdx = newNode.Index; + int afterIdx = _next[aIdx]; + _next[sIdx] = afterIdx; + _next[aIdx] = sIdx; + _prev[sIdx] = aIdx; + if (afterIdx >= 0) + _prev[afterIdx] = sIdx; + + _size++; _annotations.Add(newNode.Annotation); } - public override bool Remove(ShapeNode node) + public bool Remove(ShapeNode node) { CheckFrozen(); + EnsureInflated(); if (node.List != this) return false; node.Annotation.Remove(); UpdateAnnotations(_annotations, node); - return base.Remove(node); + + int idx = node.Index; + int p = _prev[idx]; + int n = _next[idx]; + if (p >= 0) + _next[p] = n; + if (n >= 0) + _prev[n] = p; + Detach(node); + _size--; + return true; } private void UpdateAnnotations(AnnotationList annList, ShapeNode node) @@ -290,10 +847,15 @@ Annotation ann in annList } } - public override void Clear() + public void Clear() { CheckFrozen(); - base.Clear(); + EnsureInflated(); + foreach (ShapeNode node in this.ToArray()) + Detach(node); + _next[_begin.Index] = _end.Index; + _prev[_end.Index] = _begin.Index; + _size = 0; _annotations.Clear(); _annotations.Add(Begin.Annotation); _annotations.Add(End.Annotation); @@ -387,6 +949,7 @@ public IEnumerable GetNodes(Range range) public IEnumerable GetNodes(Range range, Direction dir) { + EnsureInflated(); return this.GetNodes(range.GetStart(dir), range.GetEnd(dir), dir); } @@ -395,7 +958,13 @@ public bool ValueEquals(Shape other) if (Count != other.Count) return false; - return _annotations.ValueEquals(other._annotations); + // Compare via the int-offset projection, not Annotations: IntAnnotations is served + // lazily from a COW clone's frozen source without materializing the ShapeNode graph + // (see the IntAnnotations getter above), whereas Annotations forces EnsureInflated() on + // both operands. This method is the equality FreezableEqualityComparer uses for + // rule-cascade dedup, so every hash-collision check would otherwise de-COW both + // candidate words for no reason other than the comparison itself. + return IntAnnotations.ValueEquals(other.IntAnnotations); } public int GetFrozenHashCode() diff --git a/src/SIL.Machine/Annotations/ShapeNode.cs b/src/SIL.Machine/Annotations/ShapeNode.cs index cf72e7c8f..ff6eebdab 100644 --- a/src/SIL.Machine/Annotations/ShapeNode.cs +++ b/src/SIL.Machine/Annotations/ShapeNode.cs @@ -7,8 +7,17 @@ namespace SIL.Machine.Annotations { + /// + /// A node in a . As of the RUSTIFY flat-shape rework (Phase 3b-impl, Stage 1) + /// this is a handle into its owning 's flat backing arrays rather than a + /// self-contained doubly-linked-list node: the prev/next links and the frozen flag live in the owner + /// arrays addressed by . The handle object added to a shape is stored as the + /// canonical one-per-slot handle, so reference identity (and therefore ==, dictionary keys and + /// endpoint identity) is preserved exactly as before. + /// stays on the node so it survives a node being moved between shapes. + /// public class ShapeNode - : OrderedBidirListNode, + : IOrderedBidirListNode, IComparable, IComparable, ICloneable, @@ -17,11 +26,29 @@ public class ShapeNode { private readonly Annotation _ann; private int _tag; + private bool _detachedFrozen; + + // Equals() is intentionally left as the default (reference equality) — ShapeNode is used as + // an identity key in Shape.EnsureIntProjection's per-Freeze() Dictionary + // (_nodeOffset). A CPU profile showed the CLR's identity-hash fallback (assigned via + // AssignOffset's dictionary inserts) contributing real self-time on that hot per-word path. + // _id is a construction-order sequence number, immutable for the instance's lifetime (unlike + // _tag/Index, which are reassigned as the node moves/is frozen), so it changes nothing about + // which nodes compare equal. + private static int NextId; + private readonly int _id = System.Threading.Interlocked.Increment(ref NextId); + + // The owning shape, or null when this node is detached (created but not yet added, or removed). + internal Shape Owner { get; set; } + + // Slot index into the owner's flat arrays; -1 when detached. + internal int Index { get; set; } public ShapeNode(FeatureStruct fs) { _ann = new Annotation(Range.Create(this), fs); _tag = int.MinValue; + Index = -1; } protected ShapeNode(ShapeNode node) @@ -45,6 +72,54 @@ public Annotation Annotation get { return _ann; } } + public IBidirList List + { + get { return Owner; } + } + + public ShapeNode Next + { + get { return Owner?.GetNextLink(Index); } + } + + public ShapeNode Prev + { + get { return Owner?.GetPrevLink(Index); } + } + + public ShapeNode GetNext(Direction dir) + { + if (Owner == null) + return null; + return Owner.GetNext(this, dir); + } + + public ShapeNode GetPrev(Direction dir) + { + if (Owner == null) + return null; + return Owner.GetPrev(this, dir); + } + + public bool Remove() + { + if (Owner == null) + return false; + return Owner.Remove(this); + } + + public void AddAfter(ShapeNode newNode, Direction dir) + { + if (Owner == null) + return; + Owner.AddAfter(this, newNode, dir); + } + + public void AddAfter(ShapeNode newNode) + { + AddAfter(newNode, Direction.LeftToRight); + } + public int CompareTo(ShapeNode other) { if (other.List != List) @@ -113,13 +188,31 @@ private void CheckFrozen() throw new InvalidOperationException("The shape node is immutable."); } - public bool IsFrozen { get; private set; } + public bool IsFrozen + { + get { return Owner != null ? Owner.IsNodeFrozen(Index) : _detachedFrozen; } + } public void Freeze() { if (IsFrozen) return; - IsFrozen = true; + if (Owner != null) + Owner.SetNodeFrozen(Index); + else + _detachedFrozen = true; + } + + // Called by Shape.Detach when a frozen node is removed/cleared, so IsFrozen keeps reporting + // true for the node's remaining (now-detached) lifetime instead of silently flipping to false. + internal void MarkDetachedFrozen() + { + _detachedFrozen = true; + } + + public override int GetHashCode() + { + return _id; } public int GetFrozenHashCode() diff --git a/src/SIL.Machine/DataStructures/BidirList.cs b/src/SIL.Machine/DataStructures/BidirList.cs index a3dad1ce3..580ba9980 100644 --- a/src/SIL.Machine/DataStructures/BidirList.cs +++ b/src/SIL.Machine/DataStructures/BidirList.cs @@ -12,22 +12,43 @@ public abstract class BidirList : IBidirList private readonly TNode _end; private readonly IComparer _comparer; - private readonly Random _rand = new Random(); + // [ThreadStatic] instead of one Random per BidirList instance: skip-list level selection is + // statistical (result shape doesn't affect the byte-identical parse output, only balance), so + // sharing one Random per thread is safe. A CPU profile showed constructing a fresh + // System.Random per BidirList (i.e. per AnnotationList, i.e. effectively per Word.Clone) — + // including its OS-entropy-seeded Xoshiro256** state — as real, avoidable self-time. Each + // Word/Shape clone's BidirLists are only ever mutated by the single thread that owns that + // clone (the COW invariant: a shared/frozen Shape is read-only), so no cross-thread Random + // sharing occurs. + [ThreadStatic] + private static Random ThreadRand; + + private static Random Rand + { + get + { + if (ThreadRand == null) + ThreadRand = new Random(); + return ThreadRand; + } + } + private int _size; protected BidirList(IComparer comparer, Func marginSelector) { _begin = marginSelector(true); _end = marginSelector(false); - _begin.Init(this, 33); + // The Begin/End margins grow their tower arrays on demand (see GrowMargins) rather than + // pre-allocating the 33-level skip-list maximum: most lists stay shallow, so the eager [33] + // margin towers were pure waste — ~70% of the per-AnnotationList tower-array allocation, the + // dominant Word.Clone sub-cost on Sena (RUSTIFY Stage 3, increment II). Start at level 0 only. + _begin.Init(this, 1); _begin.Levels = 1; - _end.Init(this, 33); + _end.Init(this, 1); _end.Levels = 1; - for (int i = 0; i < 33; i++) - { - _begin.SetNext(i, _end); - _end.SetPrev(i, _begin); - } + _begin.SetNext(0, _end); + _end.SetPrev(0, _begin); _comparer = comparer; } @@ -55,13 +76,12 @@ public virtual void Add(TNode node) // 1-bits before we encounter the first 0-bit is the level of the node. Since R is // 32-bit, the level can be at most 32. int level = 0; - for (int r = _rand.Next(); (r & 1) == 1; r >>= 1) + for (int r = Rand.Next(); (r & 1) == 1; r >>= 1) { level++; if (level == _begin.Levels) { - _begin.Levels++; - _end.Levels++; + GrowMargins(); break; } } @@ -92,15 +112,29 @@ public virtual void Add(TNode node) _size++; } + // Raise the skip list's height by one level: ensure the margins' tower arrays can hold the new + // level, link Begin<->End at it, then bump the margin levels. Replaces the old eager 33-level + // margin pre-allocation; called only when a freshly added node reaches the current max height. + private void GrowMargins() + { + int newLevel = _begin.Levels; + _begin.EnsureLevelCapacity(newLevel + 1); + _end.EnsureLevelCapacity(newLevel + 1); + _begin.SetNext(newLevel, _end); + _end.SetPrev(newLevel, _begin); + _begin.Levels = newLevel + 1; + _end.Levels = newLevel + 1; + } + public virtual void Clear() { foreach (TNode node in this.ToArray()) node.Clear(); - for (int i = 0; i < 33; i++) - { - _begin.SetNext(i, _end); - _end.SetPrev(i, _begin); - } + // Reset to height 1; only level 0 needs relinking (higher levels are above Levels and are + // never read until GrowMargins re-links them as the list grows tall again). The margin + // arrays keep whatever capacity they grew to, which is reused. + _begin.SetNext(0, _end); + _end.SetPrev(0, _begin); _begin.Levels = 1; _end.Levels = 1; _size = 0; diff --git a/src/SIL.Machine/DataStructures/BidirListNode.cs b/src/SIL.Machine/DataStructures/BidirListNode.cs index dcb26a32e..dce752943 100644 --- a/src/SIL.Machine/DataStructures/BidirListNode.cs +++ b/src/SIL.Machine/DataStructures/BidirListNode.cs @@ -3,9 +3,15 @@ namespace SIL.Machine.DataStructures public abstract class BidirListNode : IBidirListNode where TNode : BidirListNode { + // Skip-list tower links. Level 0 (the only level ~50% of nodes have) is stored inline in fields so + // those nodes allocate no tower array at all, and every taller node's array is one slot shorter; + // levels 1.. live in _nextHigh/_prevHigh (null when Levels <= 1). The per-node `new TNode[levels]` + // towers were the dominant Word.Clone sub-cost on Sena (RUSTIFY Stage 3, increment II). private BidirList _list; - private TNode[] _next; - private TNode[] _prev; + private TNode _next0; + private TNode _prev0; + private TNode[] _nextHigh; + private TNode[] _prevHigh; public IBidirList List { @@ -14,24 +20,12 @@ public IBidirList List public TNode Next { - get - { - if (_next == null) - return null; - - return _next[0]; - } + get { return Levels == 0 ? null : _next0; } } public TNode Prev { - get - { - if (_prev == null) - return null; - - return _prev[0]; - } + get { return Levels == 0 ? null : _prev0; } } /// @@ -73,16 +67,35 @@ public bool Remove() protected internal virtual void Init(BidirList list, int levels) { _list = list; - _next = new TNode[levels]; - _prev = new TNode[levels]; + _next0 = null; + _prev0 = null; + _nextHigh = levels > 1 ? new TNode[levels - 1] : null; + _prevHigh = levels > 1 ? new TNode[levels - 1] : null; Levels = levels; } + // Grow this node's high-level tower arrays to hold a list of total height `levels`. Used by + // BidirList for the Begin/End margins, which grow as the skip list gets taller instead of being + // pre-allocated to the 33-level maximum up front (most skip lists stay shallow). Right-sizes the + // exact level: margins grow one level at a time and the shallow majority never reach here, so + // geometric growth would only over-allocate; the O(height^2) churn it avoids is bounded by the + // ~31-level skip-list cap and only reached by rare very large lists. + internal void EnsureLevelCapacity(int levels) + { + int needHigh = levels - 1; + if (needHigh <= 0 || (_nextHigh?.Length ?? 0) >= needHigh) + return; + System.Array.Resize(ref _nextHigh, needHigh); + System.Array.Resize(ref _prevHigh, needHigh); + } + protected internal virtual void Clear() { _list = null; - _next = null; - _prev = null; + _next0 = null; + _prev0 = null; + _nextHigh = null; + _prevHigh = null; Levels = 0; } @@ -90,22 +103,28 @@ protected internal virtual void Clear() internal TNode GetNext(int level) { - return _next[level]; + return level == 0 ? _next0 : _nextHigh[level - 1]; } internal void SetNext(int level, TNode node) { - _next[level] = node; + if (level == 0) + _next0 = node; + else + _nextHigh[level - 1] = node; } internal TNode GetPrev(int level) { - return _prev[level]; + return level == 0 ? _prev0 : _prevHigh[level - 1]; } internal void SetPrev(int level, TNode node) { - _prev[level] = node; + if (level == 0) + _prev0 = node; + else + _prevHigh[level - 1] = node; } } } diff --git a/src/SIL.Machine/DataStructures/DataStructuresExtensions.cs b/src/SIL.Machine/DataStructures/DataStructuresExtensions.cs index 4f950c37f..9806c227d 100644 --- a/src/SIL.Machine/DataStructures/DataStructuresExtensions.cs +++ b/src/SIL.Machine/DataStructures/DataStructuresExtensions.cs @@ -263,6 +263,82 @@ bool preorder action((TNode)node); } + /// + /// Walks two structurally-isomorphic forests in lockstep (preorder), invoking + /// on each corresponding node pair. Used to pair a cloned tree with + /// its source without allocating the Queue + SelectMany/Zip iterator chain that + /// roots1.SelectMany(GetNodesBreadthFirst).Zip(roots2.SelectMany(GetNodesBreadthFirst)) + /// builds. The two forests MUST be isomorphic (e.g. one is a Clone of the other); the + /// resulting set of node pairs is independent of traversal order, so a preorder walk is + /// interchangeable with the BFS-zip form. is threaded through so the + /// callback can be a static (allocation-free) lambda rather than a closure. + /// + public static void PairedPreorderTraverse( + IEnumerable roots1, + IEnumerable roots2, + TState state, + Action action, + Direction dir + ) + where TNode : class, IBidirTreeNode + { + IEnumerator e1 = roots1.GetEnumerator(); + IEnumerator e2 = roots2.GetEnumerator(); + try + { + bool m1, + m2; + while ((m1 = e1.MoveNext()) & (m2 = e2.MoveNext())) + PairedPreorderNode(e1.Current, e2.Current, state, action, dir); + System.Diagnostics.Debug.Assert( + m1 == m2, + "PairedPreorderTraverse: forests are not isomorphic (root count mismatch)" + ); + } + finally + { + e1.Dispose(); + e2.Dispose(); + } + } + + private static void PairedPreorderNode( + TNode n1, + TNode n2, + TState state, + Action action, + Direction dir + ) + where TNode : class, IBidirTreeNode + { + action(state, n1, n2); + System.Diagnostics.Debug.Assert( + n1.IsLeaf == n2.IsLeaf, + "PairedPreorderTraverse: forests are not isomorphic (leaf mismatch)" + ); + if (!n1.IsLeaf) + { + IEnumerator c1 = n1.Children.GetNodes(dir).GetEnumerator(); + IEnumerator c2 = n2.Children.GetNodes(dir).GetEnumerator(); + try + { + bool m1, + m2; + while ((m1 = c1.MoveNext()) & (m2 = c2.MoveNext())) + PairedPreorderNode(c1.Current, c2.Current, state, action, dir); + System.Diagnostics.Debug.Assert( + m1 == m2, + "PairedPreorderTraverse: forests are not isomorphic (child count mismatch)" + ); + } + finally + { + c1.Dispose(); + c2.Dispose(); + } + } + } + public static void LevelOrderTraverse(this IBidirTreeNode root, Action action) where TNode : class, IBidirTreeNode { diff --git a/src/SIL.Machine/DataStructures/IDBearerBase.cs b/src/SIL.Machine/DataStructures/IDBearerBase.cs index a018efa33..ee62f45f0 100644 --- a/src/SIL.Machine/DataStructures/IDBearerBase.cs +++ b/src/SIL.Machine/DataStructures/IDBearerBase.cs @@ -21,5 +21,19 @@ public override string ToString() { return Description; } + + // Equals() is intentionally left as the default (reference equality) — this override only + // makes GetHashCode() cheap. Without it, every derived type (Feature and its subclasses, + // symbols, etc.) falls back to the CLR's identity hash, which a CPU profile showed + // contributing real self-time when these objects are used as Dictionary/HashSet keys (e.g. + // FeatureStruct._definite's Dictionary, rebuilt on every unify output). + // _id is immutable and set once at construction, so hashing on it changes nothing about + // which objects compare equal: two objects Equal by the untouched reference-equality Equals + // are the same instance, hence share the same _id, hence the same hash — the + // Equals/GetHashCode contract holds trivially. + public override int GetHashCode() + { + return _id == null ? 0 : _id.GetHashCode(); + } } } diff --git a/src/SIL.Machine/FeatureModel/FeatureStruct.cs b/src/SIL.Machine/FeatureModel/FeatureStruct.cs index 8fe64c34c..cc9c083a7 100644 --- a/src/SIL.Machine/FeatureModel/FeatureStruct.cs +++ b/src/SIL.Machine/FeatureModel/FeatureStruct.cs @@ -2,8 +2,8 @@ using System.Collections.Generic; using System.Linq; using System.Text; +using System.Threading; using SIL.Extensions; -using SIL.Machine.DataStructures; using SIL.Machine.FeatureModel.Fluent; using SIL.ObjectModel; @@ -51,15 +51,44 @@ public static IFeatureStructSyntax NewMutable(FeatureSystem featSys, FeatureStru return new FeatureStructBuilder(featSys, fs.Clone(), true); } - private readonly IDBearerDictionary _definite; + // Plain Dictionary rather than IDBearerDictionary: the latter kept a *second* parallel + // Dictionary to serve string-ID lookups, doubling the dictionary + // allocation on every unify-output / COW-inflation. String-ID lookups are rare (cold external + // API) so they now scan _definite by Feature.ID instead (see TryGetValueById/ContainsKeyById). + private Dictionary _definite; private int? _hashCode; + /// + /// On/off switch for the bit-packed flat-vector unify fast path. Default on; internal so a + /// test can flip it to verify parity against the original unification engine. Not part of + /// the public API. + /// + internal static bool FlatUnifyEnabled = true; + + // Bit-packed flat unify vector, computed lazily and cached (reset on mutation): + // _flatBits[feature.FlatIndex] = allowed-symbol bits (present) or ~0UL (absent = unconstrained). + // _flatState: 0 = not computed, 1 = computed. + // _flatComplete: every feature was bit-packable -> safe to use as the *constraint* (arc input). + // _flatSafeSegment: every NON-packable feature is non-symbolic (string/complex), which a + // symbolic input can never constrain -> safe to use as the *segment* (extras are ignored). + private ulong[] _flatBits; + private byte _flatState; + private bool _flatComplete; + private bool _flatSafeSegment; + + // Copy-on-write: a clone of a FROZEN feature struct borrows the source's (immutable) + // backing dictionary instead of deep-copying it. _shared is true until the first + // mutation inflates a private copy; _sharedSource is the frozen FS we borrowed from + // (needed to seed the re-entrancy map on inflate so the deep copy matches a normal clone). + private bool _shared; + private FeatureStruct _sharedSource; + /// /// Initializes a new instance of the class. /// public FeatureStruct() { - _definite = new IDBearerDictionary(); + _definite = new Dictionary(); } protected FeatureStruct(FeatureStruct other) @@ -78,6 +107,14 @@ private FeatureStruct(FeatureStruct other, IDictionary /// Gets the features. /// @@ -172,7 +209,7 @@ public void AddValue(Feature feature, FeatureValue value) if (value == null) throw new ArgumentNullException("value"); - CheckFrozen(); + EnsureWritable(); _definite[feature] = value; } @@ -183,7 +220,7 @@ public void AddValue(IEnumerable path, FeatureValue value) if (value == null) throw new ArgumentNullException("value"); - CheckFrozen(); + EnsureWritable(); Feature lastFeature; FeatureStruct lastFS; if (FollowPath(path, out lastFeature, out lastFS)) @@ -197,7 +234,7 @@ public void RemoveValue(Feature feature) if (feature == null) throw new ArgumentNullException("feature"); - CheckFrozen(); + EnsureWritable(); _definite.Remove(feature); } @@ -206,7 +243,7 @@ public void RemoveValue(IEnumerable path) if (path == null) throw new ArgumentNullException("path"); - CheckFrozen(); + EnsureWritable(); Feature lastFeature; FeatureStruct lastFS; if (FollowPath(path, out lastFeature, out lastFS)) @@ -217,7 +254,7 @@ public void RemoveValue(IEnumerable path) public void ReplaceVariables(VariableBindings varBindings) { - CheckFrozen(); + EnsureWritable(); ReplaceVariables(varBindings, new HashSet()); } @@ -254,7 +291,7 @@ private void ReplaceVariables(VariableBindings varBindings, ISet public void RemoveVariables() { - CheckFrozen(); + EnsureWritable(); RemoveVariables(new HashSet()); } @@ -293,7 +330,7 @@ public void PriorityUnion(FeatureStruct other, VariableBindings varBindings) if (other == null) throw new ArgumentNullException("other"); - CheckFrozen(); + EnsureWritable(); PriorityUnion(other, varBindings, new Dictionary()); } @@ -377,7 +414,7 @@ public void Union(FeatureStruct other, VariableBindings varBindings) if (other == null) throw new ArgumentNullException("other"); - CheckFrozen(); + EnsureWritable(); UnionImpl(other, varBindings, new Dictionary>()); } @@ -423,7 +460,7 @@ public void Add(FeatureStruct other, VariableBindings varBindings) if (other == null) throw new ArgumentNullException("other"); - CheckFrozen(); + EnsureWritable(); AddImpl(other, varBindings, new Dictionary>()); } @@ -477,7 +514,7 @@ public void Subtract(FeatureStruct other, VariableBindings varBindings) if (other == null) throw new ArgumentNullException("other"); - CheckFrozen(); + EnsureWritable(); SubtractImpl(other, varBindings, new Dictionary>()); } @@ -513,7 +550,7 @@ IDictionary> visited public void Clear() { - CheckFrozen(); + EnsureWritable(); _definite.Clear(); } @@ -667,12 +704,42 @@ public bool TryGetValue(string featureID, out T value) throw new ArgumentNullException("featureID"); FeatureValue val; - if (_definite.TryGetValue(featureID, out val)) + if (TryGetValueById(_definite, featureID, out val)) return Dereference(val, out value); value = null; return false; } + // String-ID lookups over the plain _definite dictionary (replaces the dropped parallel + // string-keyed dictionary). Feature IDs are unique within a struct, so first match wins. + private static bool TryGetValueById( + Dictionary definite, + string id, + out FeatureValue value + ) + { + foreach (KeyValuePair kvp in definite) + { + if (kvp.Key.ID == id) + { + value = kvp.Value; + return true; + } + } + value = null; + return false; + } + + private static bool ContainsKeyById(Dictionary definite, string id) + { + foreach (KeyValuePair kvp in definite) + { + if (kvp.Key.ID == id) + return true; + } + return false; + } + public bool TryGetValue(IEnumerable path, out T value) where T : FeatureValue { @@ -702,7 +769,7 @@ public bool TryGetValue(IEnumerable path, out T value) if (FollowPath(path, out lastID, out lastFS)) { FeatureValue val; - if (lastFS._definite.TryGetValue(lastID, out val)) + if (TryGetValueById(lastFS._definite, lastID, out val)) return Dereference(val, out value); } value = null; @@ -722,7 +789,7 @@ public bool ContainsFeature(string featureID) if (featureID == null) throw new ArgumentNullException("featureID"); - return _definite.ContainsKey(featureID); + return ContainsKeyById(_definite, featureID); } public bool ContainsFeature(IEnumerable path) @@ -745,7 +812,7 @@ public bool ContainsFeature(IEnumerable path) string lastID; FeatureStruct lastFS; if (FollowPath(path, out lastID, out lastFS)) - return lastFS._definite.ContainsKey(lastID); + return ContainsKeyById(lastFS._definite, lastID); return false; } @@ -758,7 +825,7 @@ private bool FollowPath(IEnumerable path, out string lastID, out Feature if (lastID != null) { FeatureValue curValue; - if (!lastFS._definite.TryGetValue(lastID, out curValue) || !Dereference(curValue, out lastFS)) + if (!TryGetValueById(lastFS._definite, lastID, out curValue) || !Dereference(curValue, out lastFS)) { lastID = null; lastFS = null; @@ -793,6 +860,94 @@ private bool FollowPath(IEnumerable path, out Feature lastFeature, out return true; } + // Builds (once, on a frozen struct) the bit-packed flat unify vector. _flatState becomes + // 1 (Simple: vector valid) only if every feature is a flat-indexed symbolic feature with a + // non-empty ulong value and no variable; otherwise 2 (Complex: must use the slow path). + private void EnsureFlat() + { + // Volatile.Read/Write instead of a lock: frozen structs are read concurrently from every + // parallel FST traversal thread (Input.Matches -> TryFastUnifiable), and a plain field + // write here would let one thread observe _flatState==1 before _flatBits' array + // reference is visible (store reordering), reading a stale/null array. Redundant + // concurrent computation is harmless (deterministic, frozen input) and cheaper than a + // lock; only the publish order needs the release/acquire fence. + if (Volatile.Read(ref _flatState) != 0) + return; + int maxIdx = -1; + bool complete = true; // all features bit-packable (usable as a constraint/input) + bool safeSegment = true; // every non-packable feature is non-symbolic (ignorable in a segment) + foreach (KeyValuePair featVal in _definite) + { + if ( + featVal.Key is SymbolicFeature sf + && sf.FlatIndex >= 0 + && Dereference(featVal.Value) is SymbolicFeatureValue sv + && sv.TryGetFlatBits(out _) + ) + { + if (sf.FlatIndex > maxIdx) + maxIdx = sf.FlatIndex; + } + else + { + complete = false; + // A symbolic-but-unpackable feature (variable/empty/>64 symbols) CAN be + // constrained by a symbolic input, so it can't be safely ignored in a segment. + if (featVal.Key is SymbolicFeature) + safeSegment = false; + } + } + var arr = new ulong[maxIdx + 1]; + for (int i = 0; i <= maxIdx; i++) + arr[i] = ulong.MaxValue; // absent feature = unconstrained + foreach (KeyValuePair featVal in _definite) + { + if ( + featVal.Key is SymbolicFeature sf + && sf.FlatIndex >= 0 + && Dereference(featVal.Value) is SymbolicFeatureValue sv + && sv.TryGetFlatBits(out ulong bits) + ) + { + arr[sf.FlatIndex] = bits; + } + } + _flatBits = arr; + _flatComplete = complete; + _flatSafeSegment = safeSegment; + Volatile.Write(ref _flatState, 1); + } + + // Bit-packed unifiability fast path. Returns false (not handled) when either struct isn't a + // frozen, Simple symbolic struct; otherwise sets result and returns true. Provably identical + // to IsUnifiable(other, useDefaults:false, varBindings:null) for the simple/no-variable case: + // a feature absent on either side is ~0 (the "no constraint" branch), and overlap == unifiable. + // this = the segment being matched; other = the arc-input constraint. + internal bool TryFastUnifiable(FeatureStruct other, out bool result) + { + result = false; + if (!FlatUnifyEnabled) + return false; + EnsureFlat(); + other.EnsureFlat(); + // The constraint (input) must be fully bit-packed; the segment may carry extra + // non-symbolic features the symbolic input can't constrain (so they're ignorable). + if (!other._flatComplete || !_flatSafeSegment) + return false; + ulong[] a = _flatBits; + ulong[] b = other._flatBits; + int n = a.Length > b.Length ? a.Length : b.Length; + for (int i = 0; i < n; i++) + { + ulong av = i < a.Length ? a[i] : ulong.MaxValue; + ulong bv = i < b.Length ? b[i] : ulong.MaxValue; + if ((av & bv) == 0) + return true; // result already false: a feature has no common symbol + } + result = true; + return true; + } + public bool IsUnifiable(FeatureStruct other) { return IsUnifiable(other, false); @@ -1099,6 +1254,10 @@ internal override void FindReentrances(IDictionary reentranc public new FeatureStruct Clone() { + // A clone of a frozen FS borrows its immutable backing (copy-on-write); a clone of an + // unfrozen FS must be an independent deep copy, since the caller may mutate both. + if (IsFrozen) + return new FeatureStruct(this, sharedClone: true); return new FeatureStruct(this); } @@ -1188,10 +1347,25 @@ public override bool ValueEquals(FeatureValue other) public bool IsFrozen { get; private set; } - private void CheckFrozen() + // Guards every mutation. Frozen structs stay immutable (throw). A copy-on-write shell + // that is still borrowing a frozen backing inflates a private deep copy first, so neither + // this struct's mutation nor any recursion into its children can touch shared frozen data. + private void EnsureWritable() { if (IsFrozen) throw new InvalidOperationException("The feature structure is immutable."); + // Any mutation invalidates the cached flat unify vector. + _flatState = 0; + _flatBits = null; + if (!_shared) + return; + var copies = new Dictionary { [_sharedSource] = this }; + var owned = new Dictionary(); + foreach (KeyValuePair featVal in _definite) + owned[featVal.Key] = Dereference(featVal.Value).CloneImpl(copies); + _definite = owned; + _shared = false; + _sharedSource = null; } public void Freeze() @@ -1211,7 +1385,13 @@ internal override int FreezeImpl(ISet visited) IsFrozen = true; int code = 23; - foreach (KeyValuePair kvp in _definite.OrderBy(kvp => kvp.Key.ID)) + // Ordinal, not the LINQ default (culture-aware, CompareInfo.Compare): feature IDs are + // opaque grammar identifiers, not user-facing text, and a CPU profile showed the + // culture-aware comparison contributing real self-time on this hot per-Freeze() sort + // (needed only for a deterministic hash — Dictionary iteration order isn't guaranteed). + foreach ( + KeyValuePair kvp in _definite.OrderBy(kvp => kvp.Key.ID, StringComparer.Ordinal) + ) { code = code * 31 + kvp.Key.GetHashCode(); FeatureValue value = Dereference(kvp.Value); @@ -1255,7 +1435,12 @@ internal override string ToStringImpl(ISet visited, IDictionary 0) sb.Append("["); - foreach (KeyValuePair kvp in _definite.OrderBy(kvp => kvp.Key.Description)) + foreach ( + KeyValuePair kvp in _definite.OrderBy( + kvp => kvp.Key.Description, + StringComparer.Ordinal + ) + ) { FeatureValue value = Dereference(kvp.Value); if (!firstFeature) diff --git a/src/SIL.Machine/FeatureModel/FeatureValue.cs b/src/SIL.Machine/FeatureModel/FeatureValue.cs index 19d18ebe9..c345f7429 100644 --- a/src/SIL.Machine/FeatureModel/FeatureValue.cs +++ b/src/SIL.Machine/FeatureModel/FeatureValue.cs @@ -5,8 +5,27 @@ namespace SIL.Machine.FeatureModel { public abstract class FeatureValue : ICloneable { + // Equals() is intentionally left as the default (reference equality) — every subclass + // (FeatureStruct, SimpleFeatureValue, ...) is tracked by IDENTITY in the visited-node + // dictionaries/sets used throughout unification (e.g. AddImpl/UnionImpl's + // IDictionary, CloneImpl's IDictionary): + // structurally-identical-but-distinct instances must stay distinct nodes during a graph + // traversal, so content-based equality here would be a correctness bug. This override only + // makes GetHashCode() cheap: a CPU profile showed the CLR's default identity hash (assigning + // a sync-block hash code on first use) dominating self-time, driven by these dictionaries — + // FeatureStruct instances are created on nearly every clone/unify-output. _id is a + // construction-order sequence number, unique and stable for the instance's lifetime, so it + // changes nothing about which objects compare equal (still exactly reference equality). + private static int NextId; + private readonly int _id = System.Threading.Interlocked.Increment(ref NextId); + internal FeatureValue Forward { get; set; } + public override int GetHashCode() + { + return _id; + } + internal abstract bool UnionImpl( FeatureValue other, VariableBindings varBindings, diff --git a/src/SIL.Machine/FeatureModel/StringFeatureValue.cs b/src/SIL.Machine/FeatureModel/StringFeatureValue.cs index ae05be2a9..3772efffd 100644 --- a/src/SIL.Machine/FeatureModel/StringFeatureValue.cs +++ b/src/SIL.Machine/FeatureModel/StringFeatureValue.cs @@ -234,7 +234,11 @@ protected override int GetValuesHashCode() { int code = base.GetValuesHashCode(); code = code * 31 + Not.GetHashCode(); - code = code * 31 + _values.OrderBy(str => str).GetSequenceHashCode(); + // Ordinal: these are opaque grammar string-feature values, not user-facing text, and this + // hash is computed on the Freeze() hot path (a CPU profile showed the culture-aware default + // contributing real self-time via CompareInfo.Compare — same class of fix as FeatureStruct's + // OrderBy sites). + code = code * 31 + _values.OrderBy(str => str, StringComparer.Ordinal).GetSequenceHashCode(); return code; } diff --git a/src/SIL.Machine/FeatureModel/SymbolicFeature.cs b/src/SIL.Machine/FeatureModel/SymbolicFeature.cs index 4fad07741..bee22db64 100644 --- a/src/SIL.Machine/FeatureModel/SymbolicFeature.cs +++ b/src/SIL.Machine/FeatureModel/SymbolicFeature.cs @@ -7,6 +7,29 @@ public class SymbolicFeature : Feature { private readonly PossibleSymbolCollection _possibleSymbols; + // Process-wide counter for globally-unique flat indices (see FlatIndex). + private static int NextFlatIndex = -1; + private int _flatIndex = -1; + + /// + /// Globally-unique dense index used to place this feature's allowed-symbol bits in a + /// FeatureStruct's flat unify vector. Assigned lazily and once (so it works regardless of + /// whether/when the owning FeatureSystem is frozen — loaded grammars don't always freeze it). + /// Returns -1 for features with > 64 symbols, which forces the slow unification path. + /// + internal int FlatIndex + { + get + { + if (_flatIndex < 0 && _possibleSymbols.Count <= sizeof(ulong) * 8) + { + int idx = System.Threading.Interlocked.Increment(ref NextFlatIndex); + System.Threading.Interlocked.CompareExchange(ref _flatIndex, idx, -1); + } + return _flatIndex; + } + } + public SymbolicFeature(string id, params FeatureSymbol[] possibleSymbols) : this(id, (IEnumerable)possibleSymbols) { } diff --git a/src/SIL.Machine/FeatureModel/SymbolicFeatureValue.cs b/src/SIL.Machine/FeatureModel/SymbolicFeatureValue.cs index 724911e0a..480bfe87c 100644 --- a/src/SIL.Machine/FeatureModel/SymbolicFeatureValue.cs +++ b/src/SIL.Machine/FeatureModel/SymbolicFeatureValue.cs @@ -94,6 +94,18 @@ public IEnumerable Values get { return _feature.PossibleSymbols.Where(_flags.Get); } } + // For the flat bit-packed unify fast path: this value's allowed symbols as a raw ulong + // bitset. Returns false (forcing the slow path) for variables or non-ulong (>64 symbol) + // backing, or an empty set (so a fs-only empty value can't be wrongly skipped). + internal bool TryGetFlatBits(out ulong bits) + { + bits = 0; + if (IsVariable || !(_flags is UlongSymbolicFeatureValueFlags ulong_flags)) + return false; + bits = ulong_flags.RawFlags; + return bits != 0; + } + public bool IsSupersetOf(SymbolicFeatureValue other, bool notOther = false) { return IsSupersetOf(false, other, notOther); diff --git a/src/SIL.Machine/FeatureModel/UlongSymbolicFeatureValueFlags.cs b/src/SIL.Machine/FeatureModel/UlongSymbolicFeatureValueFlags.cs index bdb1596c4..09f4d92f1 100644 --- a/src/SIL.Machine/FeatureModel/UlongSymbolicFeatureValueFlags.cs +++ b/src/SIL.Machine/FeatureModel/UlongSymbolicFeatureValueFlags.cs @@ -9,10 +9,20 @@ internal class UlongSymbolicFeatureValueFlags : ISymbolicFeatureValueFlags private readonly ulong _mask; private ulong _flags = 0; + /// The set of allowed symbols as a raw bitset (bit i = symbol with Index i). + internal ulong RawFlags => _flags; + public UlongSymbolicFeatureValueFlags(SymbolicFeature feature) { _feature = feature; - _mask = (1UL << feature.PossibleSymbols.Count) - 1UL; + int count = feature.PossibleSymbols.Count; + // A feature with exactly 64 symbols occupies bits 0..63 (the whole ulong). Computing + // the mask as `(1UL << count) - 1` would be wrong here: C# masks a ulong shift count to + // its low 6 bits, so `1UL << 64` == `1UL << 0` == 1, giving _mask == 0 — which silently + // breaks every mask-dependent op (HasAllSet, negation, and the `not`/`notOther` branches + // of IsSupersetOf/Overlaps/IntersectWith/UnionWith/ExceptWith/Not). The dispatch guard in + // SymbolicFeatureValue.CreateFlags admits counts up to 64, so this boundary is reachable. + _mask = count >= 64 ? ulong.MaxValue : (1UL << count) - 1UL; } private UlongSymbolicFeatureValueFlags(SymbolicFeature feature, ulong mask, ulong flags) diff --git a/src/SIL.Machine/FiniteState/DeterministicFsaTraversalMethod.cs b/src/SIL.Machine/FiniteState/DeterministicFsaTraversalMethod.cs index 5470fa689..577268e58 100644 --- a/src/SIL.Machine/FiniteState/DeterministicFsaTraversalMethod.cs +++ b/src/SIL.Machine/FiniteState/DeterministicFsaTraversalMethod.cs @@ -1,6 +1,5 @@ using System.Collections.Generic; using SIL.Machine.Annotations; -using SIL.Machine.FeatureModel; namespace SIL.Machine.FiniteState { @@ -8,19 +7,12 @@ internal class DeterministicFsaTraversalMethod : TraversalMethodBase> where TData : IAnnotatedData { - public DeterministicFsaTraversalMethod( - Fst fst, - TData data, - VariableBindings varBindings, - bool startAnchor, - bool endAnchor, - bool useDefaults - ) - : base(fst, data, varBindings, startAnchor, endAnchor, useDefaults) { } + public DeterministicFsaTraversalMethod(Fst fst) + : base(fst) { } - public override IEnumerable> Traverse( + public override List> Traverse( ref int annIndex, - Register[,] initRegisters, + Register[] initRegisters, IList initCmds, ISet initAnns ) @@ -75,23 +67,17 @@ protected override DeterministicFsaTraversalInstance CreateInsta private Stack> InitializeStack( ref int annIndex, - Register[,] registers, + Register[] registers, IList cmds, ISet initAnns ) { var instStack = new Stack>(); - foreach ( - DeterministicFsaTraversalInstance inst in Initialize( - ref annIndex, - registers, - cmds, - initAnns - ) - ) - { + List> insts = InitializeBuffer; + insts.Clear(); + Initialize(ref annIndex, registers, cmds, initAnns, insts); + foreach (DeterministicFsaTraversalInstance inst in insts) instStack.Push(inst); - } return instStack; } diff --git a/src/SIL.Machine/FiniteState/DeterministicFstTraversalInstance.cs b/src/SIL.Machine/FiniteState/DeterministicFstTraversalInstance.cs index fc8b85bdd..b6b94b23a 100644 --- a/src/SIL.Machine/FiniteState/DeterministicFstTraversalInstance.cs +++ b/src/SIL.Machine/FiniteState/DeterministicFstTraversalInstance.cs @@ -1,8 +1,6 @@ using System.Collections.Generic; -using System.Linq; using SIL.Extensions; using SIL.Machine.Annotations; -using SIL.Machine.DataStructures; namespace SIL.Machine.FiniteState { @@ -34,16 +32,11 @@ public override void CopyTo(TraversalInstance other) base.CopyTo(other); var otherDfst = (DeterministicFstTraversalInstance)other; - Dictionary, Annotation> outputMappings = Output - .Annotations.SelectMany(a => a.GetNodesBreadthFirst()) - .Zip(Output.Annotations.SelectMany(a => a.GetNodesBreadthFirst())) - .ToDictionary(t => t.Item1, t => t.Item2); - otherDfst.Mappings.AddRange( - _mappings.Select(kvp => new KeyValuePair, Annotation>( - kvp.Key, - outputMappings[kvp.Value] - )) - ); + // Identity map: the original zipped this.Output's node sequence with itself, so + // outputMappings[v] == v and the block reduces to copying _mappings unchanged. + // Avoids a Dictionary + two SelectMany(BFS) + Zip + Select per instance copy. + // Byte-identical; otherDfst.Mappings is empty here (GetCachedInstance -> Clear()). + otherDfst.Mappings.AddRange(_mappings); foreach (Annotation ann in _queue) otherDfst.Queue.Enqueue(ann); } diff --git a/src/SIL.Machine/FiniteState/DeterministicFstTraversalMethod.cs b/src/SIL.Machine/FiniteState/DeterministicFstTraversalMethod.cs index 534a2dcd1..10cee5e3b 100644 --- a/src/SIL.Machine/FiniteState/DeterministicFstTraversalMethod.cs +++ b/src/SIL.Machine/FiniteState/DeterministicFstTraversalMethod.cs @@ -1,6 +1,4 @@ using System.Collections.Generic; -using System.Linq; -using SIL.Extensions; using SIL.Machine.Annotations; using SIL.Machine.DataStructures; using SIL.Machine.FeatureModel; @@ -12,19 +10,12 @@ internal class DeterministicFstTraversalMethod : TraversalMethodBase> where TData : IAnnotatedData { - public DeterministicFstTraversalMethod( - Fst fst, - TData data, - VariableBindings varBindings, - bool startAnchor, - bool endAnchor, - bool useDefaults - ) - : base(fst, data, varBindings, startAnchor, endAnchor, useDefaults) { } + public DeterministicFstTraversalMethod(Fst fst) + : base(fst) { } - public override IEnumerable> Traverse( + public override List> Traverse( ref int annIndex, - Register[,] initRegisters, + Register[] initRegisters, IList initCmds, ISet initAnns ) @@ -137,28 +128,27 @@ Queue> queue private Stack> InitializeStack( ref int annIndex, - Register[,] registers, + Register[] registers, IList cmds, ISet initAnns ) { var instStack = new Stack>(); - foreach ( - DeterministicFstTraversalInstance inst in Initialize( - ref annIndex, - registers, - cmds, - initAnns - ) - ) + List> insts = InitializeBuffer; + insts.Clear(); + Initialize(ref annIndex, registers, cmds, initAnns, insts); + foreach (DeterministicFstTraversalInstance inst in insts) { inst.Output = ((ICloneable)Data).Clone(); - inst.Mappings.AddRange( - Data.Annotations.SelectMany(a => a.GetNodesBreadthFirst()) - .Zip( - inst.Output.Annotations.SelectMany(a => a.GetNodesBreadthFirst()), - (a1, a2) => new KeyValuePair, Annotation>(a1, a2) - ) + // Pair each source annotation with its clone via a lockstep preorder walk of the two + // isomorphic forests — same result as zipping the two BFS node sequences (dict order + // is irrelevant) but without the per-call Queue + SelectMany/Zip iterators + KVPs. + DataStructuresExtensions.PairedPreorderTraverse( + Data.Annotations, + inst.Output.Annotations, + inst.Mappings, + (mappings, a1, a2) => mappings[a1] = a2, + Direction.LeftToRight ); instStack.Push(inst); } diff --git a/src/SIL.Machine/FiniteState/Fst.cs b/src/SIL.Machine/FiniteState/Fst.cs index 04fb681c4..32f58c941 100644 --- a/src/SIL.Machine/FiniteState/Fst.cs +++ b/src/SIL.Machine/FiniteState/Fst.cs @@ -25,6 +25,16 @@ public class Fst : IFreezable private int _nextTag; private readonly Dictionary _groups; private readonly List _initializers; + + // Frozen-time partition of _initializers (see Freeze): the Dest!=0 commands are the per-call + // `cmds` list (read-only in traversal), and the Dest==0 commands drive the per-annotation + // SetOffset. Precomputing once at Freeze removes a List allocation + the + // filter loop from every Transduce call. Null until frozen → Transduce falls back to the + // inline build, so unfrozen callers are unaffected. Immutable after Freeze (the FST is shared + // read-only across parsing threads), so concurrent reads of the shared cmds list are safe. + private List _nonZeroDestInitializers; + private List _zeroDestInitializers; + private int _registerCount; private Direction _dir; private Func, bool> _filter; @@ -114,11 +124,11 @@ public bool IsAcceptor get { return _operations == null; } } - public bool GetOffsets(string groupName, Register[,] registers, out TOffset start, out TOffset end) + public bool GetOffsets(string groupName, Register[] registers, out TOffset start, out TOffset end) { int tag = _groups[groupName]; - Register startValue = registers[tag, 0]; - Register endValue = registers[tag + 1, 1]; + Register startValue = registers[tag * 2]; + Register endValue = registers[(tag + 1) * 2 + 1]; if ( startValue.HasOffset && endValue.HasOffset @@ -245,7 +255,7 @@ public IFstOperations Operations get { return _operations; } } - internal IEqualityComparer[,]> RegistersEqualityComparer + internal IEqualityComparer[]> RegistersEqualityComparer { get { return _registersEqualityComparer; } } @@ -312,83 +322,68 @@ private bool Transduce( out IEnumerable> results ) { - ITraversalMethod traversalMethod; - if (_operations != null) - { - if (IsDeterministic) - { - traversalMethod = new DeterministicFstTraversalMethod( - this, - data, - varBindings, - startAnchor, - endAnchor, - useDefaults - ); - } - else - { - traversalMethod = new NondeterministicFstTraversalMethod( - this, - data, - varBindings, - startAnchor, - endAnchor, - useDefaults - ); - } - } - else - { - if (IsDeterministic) - { - traversalMethod = new DeterministicFsaTraversalMethod( - this, - data, - varBindings, - startAnchor, - endAnchor, - useDefaults - ); - } - else - { - traversalMethod = new NondeterministicFsaTraversalMethod( - this, - data, - varBindings, - startAnchor, - endAnchor, - useDefaults - ); - } - } + // A fresh traversal method per Transduce call. Pooling it per-thread across a word was + // tried and reverted: the pooled method survives a Gen0 collection, promotes to Gen2, + // and the stop-the-world Gen2 serializes parallel parsing (see RUSTIFY Phase 1b). With + // allocation now driven down elsewhere, short-lived (die-in-Gen0) is the right tradeoff. + ITraversalMethod traversalMethod = CreateTraversalMethod(); + traversalMethod.Reset(data, varBindings, startAnchor, endAnchor, useDefaults); List> resultList = null; int annIndex = traversalMethod.Annotations.IndexOf(start); var initAnns = new HashSet(); + // Reuse the frozen-time initializer partition when available (the hot, shared-grammar + // path); fall back to building cmds inline for an unfrozen FST. + List nonZeroDestInit = _nonZeroDestInitializers; + // RUSTIFY lever 1: allocate the initial-register scaffold once and clear it per start + // position instead of `new Register[regCount,2]` every outer iteration. Traverse only ever + // Array.Copy's it into the initial instances (never retains it), so reuse-after-clear is + // byte-identical — and AllMatches (analysis) runs one iteration per start, so this removes + // (starts-1) register-array allocations per matcher call. + // Flat array (not Register[,]): a CPU profile showed rectangular-array allocation + // (Array.CreateInstanceMDArray) dominating self-time on this hot path — see TraversalInstance. + var initRegisters = new Register[_registerCount * 2]; + bool firstIteration = true; while (annIndex < traversalMethod.Annotations.Count && annIndex > -1) { - var initRegisters = new Register[_registerCount, 2]; + if (!firstIteration) + Array.Clear(initRegisters, 0, initRegisters.Length); + firstIteration = false; - var cmds = new List(); - foreach (TagMapCommand cmd in _initializers) + List cmds; + if (nonZeroDestInit != null) { - if (cmd.Dest == 0) + foreach (TagMapCommand cmd in _zeroDestInitializers) { - initRegisters[cmd.Dest, 0] + initRegisters[cmd.Dest * 2] .SetOffset(traversalMethod.Annotations[annIndex].Range.GetStart(_dir), true); } - else + cmds = nonZeroDestInit; + } + else + { + cmds = new List(); + foreach (TagMapCommand cmd in _initializers) { - cmds.Add(cmd); + if (cmd.Dest == 0) + { + initRegisters[cmd.Dest * 2] + .SetOffset(traversalMethod.Annotations[annIndex].Range.GetStart(_dir), true); + } + else + { + cmds.Add(cmd); + } } } - List> curResults = traversalMethod - .Traverse(ref annIndex, initRegisters, cmds, initAnns) - .ToList(); + List> curResults = traversalMethod.Traverse( + ref annIndex, + initRegisters, + cmds, + initAnns + ); if (curResults.Count > 0) { if (resultList == null) @@ -409,10 +404,31 @@ out IEnumerable> results return false; } - results = allMatches ? resultList.Distinct() : resultList; + // Distinct() materializes a lazy iterator + internal set every time it is enumerated; + // for 0/1 results there is nothing to dedupe (resultList is non-null with Count >= 1 + // here), so return the list directly and skip the iterator in that common case. + results = (allMatches && resultList.Count > 1) ? resultList.Distinct() : resultList; return true; } + private ITraversalMethod CreateTraversalMethod() + { + return CreateTraversalMethodCore(); + } + + private ITraversalMethod CreateTraversalMethodCore() + { + if (_operations != null) + { + return IsDeterministic + ? (ITraversalMethod)new DeterministicFstTraversalMethod(this) + : new NondeterministicFstTraversalMethod(this); + } + return IsDeterministic + ? (ITraversalMethod)new DeterministicFsaTraversalMethod(this) + : new NondeterministicFsaTraversalMethod(this); + } + private int ResultCompare(FstResult x, FstResult y) { int compare = x.Priority.CompareTo(y.Priority); @@ -2122,6 +2138,21 @@ public void Freeze() IsFrozen = true; foreach (State state in _states) state.Freeze(); + + // Partition the (now immutable) initializers once so Transduce reuses them instead of + // rebuilding the cmds list every call. Build into locals and publish the gating field + // (_nonZeroDestInitializers) last, so a reader never observes a partially filled list. + var zeroDest = new List(); + var nonZeroDest = new List(); + foreach (TagMapCommand cmd in _initializers) + { + if (cmd.Dest == 0) + zeroDest.Add(cmd); + else + nonZeroDest.Add(cmd); + } + _zeroDestInitializers = zeroDest; + _nonZeroDestInitializers = nonZeroDest; } public int GetFrozenHashCode() diff --git a/src/SIL.Machine/FiniteState/FstResult.cs b/src/SIL.Machine/FiniteState/FstResult.cs index 42fa9688f..ee202a840 100644 --- a/src/SIL.Machine/FiniteState/FstResult.cs +++ b/src/SIL.Machine/FiniteState/FstResult.cs @@ -7,8 +7,8 @@ namespace SIL.Machine.FiniteState { public class FstResult : IEquatable> { - private readonly IEqualityComparer[,]> _registersEqualityComparer; - private readonly Register[,] _registers; + private readonly IEqualityComparer[]> _registersEqualityComparer; + private readonly Register[] _registers; private readonly TData _output; private readonly VariableBindings _varBindings; private readonly string _id; @@ -19,9 +19,9 @@ public class FstResult : IEquatable> private readonly int _order; internal FstResult( - IEqualityComparer[,]> registersEqualityComparer, + IEqualityComparer[]> registersEqualityComparer, string id, - Register[,] registers, + Register[] registers, TData output, VariableBindings varBindings, int priority, @@ -48,7 +48,7 @@ public string ID get { return _id; } } - public Register[,] Registers + public Register[] Registers { get { return _registers; } } diff --git a/src/SIL.Machine/FiniteState/ITraversalMethod.cs b/src/SIL.Machine/FiniteState/ITraversalMethod.cs index d11dfab46..071171982 100644 --- a/src/SIL.Machine/FiniteState/ITraversalMethod.cs +++ b/src/SIL.Machine/FiniteState/ITraversalMethod.cs @@ -1,5 +1,6 @@ using System.Collections.Generic; using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; namespace SIL.Machine.FiniteState { @@ -7,9 +8,10 @@ internal interface ITraversalMethod where TData : IAnnotatedData { IList> Annotations { get; } - IEnumerable> Traverse( + void Reset(TData data, VariableBindings varBindings, bool startAnchor, bool endAnchor, bool useDefaults); + List> Traverse( ref int annIndex, - Register[,] initRegisters, + Register[] initRegisters, IList initCmds, ISet initAnns ); diff --git a/src/SIL.Machine/FiniteState/Input.cs b/src/SIL.Machine/FiniteState/Input.cs index 3fb4fc761..4f4304694 100644 --- a/src/SIL.Machine/FiniteState/Input.cs +++ b/src/SIL.Machine/FiniteState/Input.cs @@ -50,11 +50,46 @@ public bool Matches(FeatureStruct fs, bool unification, bool useDefaults, Variab { if (unification) { - return fs.IsUnifiable(_fs, useDefaults, varBindings) - && _negatedFSs.All(nfs => !fs.IsUnifiable(nfs, useDefaults)); + // Bit-packed fast path for the common phonological case (no defaults, no negation, + // both operands simple symbolic structs). Identical result, no varBindings clone, + // no dictionary walk. Falls back to the full engine otherwise. + if (!useDefaults && _negatedFSs.Count == 0 && fs.TryFastUnifiable(_fs, out bool fastResult)) + return fastResult; + + if (!fs.IsUnifiable(_fs, useDefaults, varBindings)) + return false; + return NoneUnifiable(fs, useDefaults); } - return _fs.Subsumes(fs, useDefaults, varBindings) && _negatedFSs.All(nfs => !nfs.Subsumes(fs, useDefaults)); + return _fs.Subsumes(fs, useDefaults, varBindings) && NoneSubsumed(fs, useDefaults); + } + + // Explicit loops instead of `_negatedFSs.All(nfs => ...)`: the lambda's closure (capturing fs + // and useDefaults) and the boxed HashSet.Enumerator (via the IEnumerable extension-method + // path) were allocated on every call, even for the common case where _negatedFSs is empty. + // A plain `foreach` on the concrete HashSet reference uses its unboxed struct enumerator. + private bool NoneUnifiable(FeatureStruct fs, bool useDefaults) + { + if (_negatedFSs.Count == 0) + return true; + foreach (FeatureStruct nfs in _negatedFSs) + { + if (fs.IsUnifiable(nfs, useDefaults)) + return false; + } + return true; + } + + private bool NoneSubsumed(FeatureStruct fs, bool useDefaults) + { + if (_negatedFSs.Count == 0) + return true; + foreach (FeatureStruct nfs in _negatedFSs) + { + if (nfs.Subsumes(fs, useDefaults)) + return false; + } + return true; } public bool IsSatisfiable diff --git a/src/SIL.Machine/FiniteState/NondeterministicFsaTraversalInstance.cs b/src/SIL.Machine/FiniteState/NondeterministicFsaTraversalInstance.cs index 2f084aa9e..f87c72e73 100644 --- a/src/SIL.Machine/FiniteState/NondeterministicFsaTraversalInstance.cs +++ b/src/SIL.Machine/FiniteState/NondeterministicFsaTraversalInstance.cs @@ -1,4 +1,3 @@ -using System.Collections.Generic; using SIL.Machine.Annotations; namespace SIL.Machine.FiniteState @@ -6,24 +5,33 @@ namespace SIL.Machine.FiniteState internal class NondeterministicFsaTraversalInstance : TraversalInstance where TData : IAnnotatedData { - private readonly HashSet> _visited; + // RUSTIFY lever 1: a value-type bitset over state indices instead of a HashSet — no + // per-instance set allocation (the instance is created ~2,927x/word on Sena). + private VisitedStates _visited; public NondeterministicFsaTraversalInstance(int registerCount) - : base(registerCount, false) + : base(registerCount, false) { } + + public bool IsVisited(State state) { - _visited = new HashSet>(); + return _visited.Contains(state.Index); } - public ISet> Visited + public void MarkVisited(State state) { - get { return _visited; } + _visited.Add(state.Index); + } + + public void ClearVisited() + { + _visited.Clear(); } public override void CopyTo(TraversalInstance other) { base.CopyTo(other); var otherNfsa = (NondeterministicFsaTraversalInstance)other; - otherNfsa.Visited.UnionWith(_visited); + otherNfsa._visited.UnionWith(in _visited); } public override void Clear() diff --git a/src/SIL.Machine/FiniteState/NondeterministicFsaTraversalMethod.cs b/src/SIL.Machine/FiniteState/NondeterministicFsaTraversalMethod.cs index b5d3b3d5e..e2576e168 100644 --- a/src/SIL.Machine/FiniteState/NondeterministicFsaTraversalMethod.cs +++ b/src/SIL.Machine/FiniteState/NondeterministicFsaTraversalMethod.cs @@ -1,5 +1,4 @@ -using System; -using System.Collections.Generic; +using System.Collections.Generic; using SIL.Machine.Annotations; using SIL.Machine.FeatureModel; using SIL.ObjectModel; @@ -10,19 +9,21 @@ internal class NondeterministicFsaTraversalMethod : TraversalMethodBase> where TData : IAnnotatedData { - public NondeterministicFsaTraversalMethod( - Fst fst, - TData data, - VariableBindings varBindings, - bool startAnchor, - bool endAnchor, - bool useDefaults - ) - : base(fst, data, varBindings, startAnchor, endAnchor, useDefaults) { } + // Hoisted out of Traverse: building this per call allocated a comparer object plus two bound + // delegates (KeyEquals/KeyGetHashCode are instance methods) on every Traverse call — thousands + // per word. The comparer only closes over `this` (via Fst), so one instance is reusable for the + // life of this traversal method. + private readonly IEqualityComparer _traversalKeyComparer; - public override IEnumerable> Traverse( + public NondeterministicFsaTraversalMethod(Fst fst) + : base(fst) + { + _traversalKeyComparer = AnonymousEqualityComparer.Create(KeyEquals, KeyGetHashCode); + } + + public override List> Traverse( ref int annIndex, - Register[,] initRegisters, + Register[] initRegisters, IList initCmds, ISet initAnns ) @@ -35,12 +36,10 @@ ISet initAnns ); var curResults = new List>(); - var traversed = new HashSet, int, Register[,]>>( - AnonymousEqualityComparer.Create, int, Register[,]>>( - KeyEquals, - KeyGetHashCode - ) - ); + // The dedup key is a value type (was Tuple<,,>): the HashSet stores it inline in its slot + // array, so there is no per-push heap object — `traversed.Add` is the hottest allocation in + // nondeterministic traversal. Byte-identical equality/hash (same fields, same comparers). + var traversed = new HashSet(_traversalKeyComparer); while (instStack.Count != 0) { NondeterministicFsaTraversalInstance inst = instStack.Pop(); @@ -53,7 +52,7 @@ ISet initAnns bool isInstReusable = i == inst.State.Arcs.Count - 1; if (arc.Input.IsEpsilon) { - if (!inst.Visited.Contains(arc.Target)) + if (!inst.IsVisited(arc.Target)) { NondeterministicFsaTraversalInstance ti; if (isInstReusable) @@ -68,22 +67,15 @@ ISet initAnns ti.VariableBindings = varBindings; } - ti.Visited.Add(arc.Target); + ti.MarkVisited(arc.Target); NondeterministicFsaTraversalInstance newInst = EpsilonAdvance( ti, arc, curResults ); - Tuple, int, Register[,]> key = Tuple.Create( - newInst.State, - newInst.AnnotationIndex, - newInst.Registers - ); - if (!traversed.Contains(key)) - { + var key = new TraversalKey(newInst.State, newInst.AnnotationIndex, newInst.Registers); + if (traversed.Add(key)) instStack.Push(newInst); - traversed.Add(key); - } if (isInstReusable) releaseInstance = false; varBindings = null; @@ -108,17 +100,10 @@ NondeterministicFsaTraversalInstance newInst in Advance( ) ) { - newInst.Visited.Clear(); - Tuple, int, Register[,]> key = Tuple.Create( - newInst.State, - newInst.AnnotationIndex, - newInst.Registers - ); - if (!traversed.Contains(key)) - { + newInst.ClearVisited(); + var key = new TraversalKey(newInst.State, newInst.AnnotationIndex, newInst.Registers); + if (traversed.Add(key)) instStack.Push(newInst); - traversed.Add(key); - } } if (isInstReusable) releaseInstance = false; @@ -142,44 +127,52 @@ protected override NondeterministicFsaTraversalInstance CreateIn return new NondeterministicFsaTraversalInstance(Fst.RegisterCount); } - private bool KeyEquals( - Tuple, int, Register[,]> x, - Tuple, int, Register[,]> y - ) + // Value-type dedup key (was Tuple): stored inline in the `traversed` + // HashSet so a push no longer allocates a heap Tuple. Holds the instance's live Registers by + // reference exactly as the Tuple did (same reference + hash-at-Add semantics). + private readonly struct TraversalKey { - return x.Item1.Equals(y.Item1) - && x.Item2.Equals(y.Item2) - && Fst.RegistersEqualityComparer.Equals(x.Item3, y.Item3); + public readonly State State; + public readonly int AnnotationIndex; + public readonly Register[] Registers; + + public TraversalKey(State state, int annotationIndex, Register[] registers) + { + State = state; + AnnotationIndex = annotationIndex; + Registers = registers; + } + } + + private bool KeyEquals(TraversalKey x, TraversalKey y) + { + return x.State.Equals(y.State) + && x.AnnotationIndex.Equals(y.AnnotationIndex) + && Fst.RegistersEqualityComparer.Equals(x.Registers, y.Registers); } - private int KeyGetHashCode(Tuple, int, Register[,]> m) + private int KeyGetHashCode(TraversalKey m) { int code = 23; - code = code * 31 + m.Item1.GetHashCode(); - code = code * 31 + m.Item2.GetHashCode(); - code = code * 31 + Fst.RegistersEqualityComparer.GetHashCode(m.Item3); + code = code * 31 + m.State.GetHashCode(); + code = code * 31 + m.AnnotationIndex.GetHashCode(); + code = code * 31 + Fst.RegistersEqualityComparer.GetHashCode(m.Registers); return code; } private Stack> InitializeStack( ref int annIndex, - Register[,] registers, + Register[] registers, IList cmds, ISet initAnns ) { var instStack = new Stack>(); - foreach ( - NondeterministicFsaTraversalInstance inst in Initialize( - ref annIndex, - registers, - cmds, - initAnns - ) - ) - { + List> insts = InitializeBuffer; + insts.Clear(); + Initialize(ref annIndex, registers, cmds, initAnns, insts); + foreach (NondeterministicFsaTraversalInstance inst in insts) instStack.Push(inst); - } return instStack; } diff --git a/src/SIL.Machine/FiniteState/NondeterministicFstTraversalInstance.cs b/src/SIL.Machine/FiniteState/NondeterministicFstTraversalInstance.cs index 3583c6ccf..8ce17b7a2 100644 --- a/src/SIL.Machine/FiniteState/NondeterministicFstTraversalInstance.cs +++ b/src/SIL.Machine/FiniteState/NondeterministicFstTraversalInstance.cs @@ -1,29 +1,38 @@ using System.Collections.Generic; -using System.Linq; using SIL.Extensions; using SIL.Machine.Annotations; -using SIL.Machine.DataStructures; namespace SIL.Machine.FiniteState { internal class NondeterministicFstTraversalInstance : TraversalInstance where TData : IAnnotatedData { - private readonly HashSet> _visited; + // RUSTIFY lever 1: value-type bitset over state indices instead of a HashSet (no + // per-instance set allocation). + private VisitedStates _visited; private readonly Dictionary, Annotation> _mappings; private readonly List> _outputs; public NondeterministicFstTraversalInstance(int registerCount) : base(registerCount, false) { - _visited = new HashSet>(); _mappings = new Dictionary, Annotation>(); _outputs = new List>(); } - public ISet> Visited + public bool IsVisited(State state) { - get { return _visited; } + return _visited.Contains(state.Index); + } + + public void MarkVisited(State state) + { + _visited.Add(state.Index); + } + + public void ClearVisited() + { + _visited.Clear(); } public IDictionary, Annotation> Mappings @@ -42,17 +51,15 @@ public override void CopyTo(TraversalInstance other) var otherNfst = (NondeterministicFstTraversalInstance)other; - otherNfst._visited.UnionWith(_visited); - Dictionary, Annotation> outputMappings = Output - .Annotations.SelectMany(a => a.GetNodesBreadthFirst()) - .Zip(Output.Annotations.SelectMany(a => a.GetNodesBreadthFirst())) - .ToDictionary(t => t.Item1, t => t.Item2); - otherNfst._mappings.AddRange( - _mappings.Select(kvp => new KeyValuePair, Annotation>( - kvp.Key, - outputMappings[kvp.Value] - )) - ); + otherNfst._visited.UnionWith(in _visited); + // The original built `outputMappings` by zipping this.Output's node sequence with itself + // — a deterministic (Queue-based BFS) enumeration paired element-for-element, i.e. the + // identity map — so `outputMappings[v] == v` and the whole block reduces to copying + // _mappings unchanged. Doing that directly avoids a Dictionary + two SelectMany(BFS, + // each allocating a Queue + iterator) + Zip + Select per instance copy (very hot in + // nondeterministic traversal). Byte-identical; otherNfst._mappings is empty here + // (GetCachedInstance -> Clear()). + otherNfst._mappings.AddRange(_mappings); otherNfst._outputs.AddRange(_outputs); } diff --git a/src/SIL.Machine/FiniteState/NondeterministicFstTraversalMethod.cs b/src/SIL.Machine/FiniteState/NondeterministicFstTraversalMethod.cs index e171f4410..a4e1da9be 100644 --- a/src/SIL.Machine/FiniteState/NondeterministicFstTraversalMethod.cs +++ b/src/SIL.Machine/FiniteState/NondeterministicFstTraversalMethod.cs @@ -13,19 +13,12 @@ internal class NondeterministicFstTraversalMethod : TraversalMethodBase> where TData : IAnnotatedData { - public NondeterministicFstTraversalMethod( - Fst fst, - TData data, - VariableBindings varBindings, - bool startAnchor, - bool endAnchor, - bool useDefaults - ) - : base(fst, data, varBindings, startAnchor, endAnchor, useDefaults) { } + public NondeterministicFstTraversalMethod(Fst fst) + : base(fst) { } - public override IEnumerable> Traverse( + public override List> Traverse( ref int annIndex, - Register[,] initRegisters, + Register[] initRegisters, IList initCmds, ISet initAnns ) @@ -38,12 +31,11 @@ ISet initAnns ); var curResults = new List>(); - var traversed = new HashSet< - Tuple, int, Register[,], Output[]> - >( - AnonymousEqualityComparer.Create< - Tuple, int, Register[,], Output[]> - >(KeyEquals, KeyGetHashCode) + // Value-type dedup key (was Tuple<,,,>): stored inline in the HashSet, so a push no longer + // allocates a heap Tuple. The per-push Outputs snapshot array remains (the key must capture + // the outputs at push time, since the instance's Outputs list keeps growing afterward). + var traversed = new HashSet( + AnonymousEqualityComparer.Create(KeyEquals, KeyGetHashCode) ); while (instStack.Count != 0) { @@ -57,7 +49,7 @@ ISet initAnns bool isInstReusable = i == inst.State.Arcs.Count - 1; if (arc.Input.IsEpsilon) { - if (!inst.Visited.Contains(arc.Target)) + if (!inst.IsVisited(arc.Target)) { NondeterministicFstTraversalInstance ti; if (isInstReusable) @@ -79,24 +71,23 @@ ISet initAnns ti.Outputs.Add(arc.Outputs[0]); } - ti.Visited.Add(arc.Target); + ti.MarkVisited(arc.Target); NondeterministicFstTraversalInstance newInst = EpsilonAdvance( inst, arc, curResults ); - Tuple, int, Register[,], Output[]> key = - Tuple.Create( - newInst.State, - newInst.AnnotationIndex, - newInst.Registers, - newInst.Outputs.ToArray() - ); - if (!traversed.Contains(key)) - { + var key = new TraversalKey( + newInst.State, + newInst.AnnotationIndex, + newInst.Registers, + newInst.Outputs.ToArray() + ); + // Add returns false if already present; this single hash/lookup replaces + // the Contains-then-Add pair (the structural key hash over registers + + // outputs is expensive and this is the innermost traversal loop). + if (traversed.Add(key)) instStack.Push(newInst); - traversed.Add(key); - } if (isInstReusable) releaseInstance = false; varBindings = null; @@ -128,19 +119,16 @@ NondeterministicFstTraversalInstance newInst in Advance( ) ) { - newInst.Visited.Clear(); - Tuple, int, Register[,], Output[]> key = - Tuple.Create( - newInst.State, - newInst.AnnotationIndex, - newInst.Registers, - newInst.Outputs.ToArray() - ); - if (!traversed.Contains(key)) - { + newInst.ClearVisited(); + var key = new TraversalKey( + newInst.State, + newInst.AnnotationIndex, + newInst.Registers, + newInst.Outputs.ToArray() + ); + // Single hash/lookup (Add returns false if present) — see note above. + if (traversed.Add(key)) instStack.Push(newInst); - traversed.Add(key); - } } if (isInstReusable) releaseInstance = false; @@ -164,51 +152,71 @@ protected override NondeterministicFstTraversalInstance CreateIn return new NondeterministicFstTraversalInstance(Fst.RegisterCount); } - private bool KeyEquals( - Tuple, int, Register[,], Output[]> x, - Tuple, int, Register[,], Output[]> y - ) + // Value-type dedup key (was Tuple): stored inline in the + // `traversed` HashSet so a push no longer allocates a heap Tuple. Holds the instance's live + // Registers by reference and a snapshot of its Outputs, exactly as the Tuple did. + private readonly struct TraversalKey { - return x.Item1.Equals(y.Item1) - && x.Item2.Equals(y.Item2) - && Fst.RegistersEqualityComparer.Equals(x.Item3, y.Item3) - && x.Item4.SequenceEqual(y.Item4); + public readonly State State; + public readonly int AnnotationIndex; + public readonly Register[] Registers; + public readonly Output[] Outputs; + + public TraversalKey( + State state, + int annotationIndex, + Register[] registers, + Output[] outputs + ) + { + State = state; + AnnotationIndex = annotationIndex; + Registers = registers; + Outputs = outputs; + } } - private int KeyGetHashCode(Tuple, int, Register[,], Output[]> m) + private bool KeyEquals(TraversalKey x, TraversalKey y) + { + return x.State.Equals(y.State) + && x.AnnotationIndex.Equals(y.AnnotationIndex) + && Fst.RegistersEqualityComparer.Equals(x.Registers, y.Registers) + && x.Outputs.SequenceEqual(y.Outputs); + } + + private int KeyGetHashCode(TraversalKey m) { int code = 23; - code = code * 31 + m.Item1.GetHashCode(); - code = code * 31 + m.Item2.GetHashCode(); - code = code * 31 + Fst.RegistersEqualityComparer.GetHashCode(m.Item3); - code = code * 31 + m.Item4.GetSequenceHashCode(); + code = code * 31 + m.State.GetHashCode(); + code = code * 31 + m.AnnotationIndex.GetHashCode(); + code = code * 31 + Fst.RegistersEqualityComparer.GetHashCode(m.Registers); + code = code * 31 + m.Outputs.GetSequenceHashCode(); return code; } private Stack> InitializeStack( ref int annIndex, - Register[,] registers, + Register[] registers, IList cmds, ISet initAnns ) { var instStack = new Stack>(); - foreach ( - NondeterministicFstTraversalInstance inst in Initialize( - ref annIndex, - registers, - cmds, - initAnns - ) - ) + List> insts = InitializeBuffer; + insts.Clear(); + Initialize(ref annIndex, registers, cmds, initAnns, insts); + foreach (NondeterministicFstTraversalInstance inst in insts) { inst.Output = ((ICloneable)Data).Clone(); - inst.Mappings.AddRange( - Data.Annotations.SelectMany(a => a.GetNodesBreadthFirst()) - .Zip( - inst.Output.Annotations.SelectMany(a => a.GetNodesBreadthFirst()), - (a1, a2) => new KeyValuePair, Annotation>(a1, a2) - ) + // Pair each source annotation with its clone via a lockstep preorder walk of the two + // isomorphic forests — same result as zipping the two BFS node sequences (dict order + // is irrelevant) but without the per-call Queue + SelectMany/Zip iterators + KVPs. + DataStructuresExtensions.PairedPreorderTraverse( + Data.Annotations, + inst.Output.Annotations, + inst.Mappings, + (mappings, a1, a2) => mappings[a1] = a2, + Direction.LeftToRight ); instStack.Push(inst); } diff --git a/src/SIL.Machine/FiniteState/RegistersEqualityComparer.cs b/src/SIL.Machine/FiniteState/RegistersEqualityComparer.cs index 5337cab37..36aab51d3 100644 --- a/src/SIL.Machine/FiniteState/RegistersEqualityComparer.cs +++ b/src/SIL.Machine/FiniteState/RegistersEqualityComparer.cs @@ -1,45 +1,60 @@ -using System.Collections.Generic; +using System.Collections.Generic; namespace SIL.Machine.FiniteState { - internal class RegistersEqualityComparer : IEqualityComparer[,]> + // Registers is a flat Register[] of length 2*registerCount (see RUSTIFY MD-array note + // on TraversalInstance/Fst.Transduce): index i's pair lives at [2*i] (start) / [2*i+1] (end). + internal class RegistersEqualityComparer : IEqualityComparer[]> { private readonly IEqualityComparer _offsetEqualityComparer; + // Devirtualizes the common case: EqualityComparer.Default.Equals/GetHashCode are + // JIT-inlined for a value-type TOffset (HermitCrab's int), so skip the interface-dispatch + // field entirely when the caller passed the default comparer. + private readonly bool _isDefault; + public RegistersEqualityComparer(IEqualityComparer offsetEqualityComparer) { _offsetEqualityComparer = offsetEqualityComparer; + _isDefault = ReferenceEquals(offsetEqualityComparer, EqualityComparer.Default); } - public bool Equals(Register[,] x, Register[,] y) + public bool Equals(Register[] x, Register[] y) { - for (int i = 0; i < x.GetLength(0); i++) + for (int i = 0; i < x.Length; i++) { - for (int j = 0; j < 2; j++) - { - if (!x[i, j].ValueEquals(y[i, j], _offsetEqualityComparer)) - return false; - } + if (!RegisterEquals(x[i], y[i])) + return false; } return true; } - public int GetHashCode(Register[,] obj) + private bool RegisterEquals(Register x, Register y) + { + return _isDefault + ? x.ValueEquals(y, EqualityComparer.Default) + : x.ValueEquals(y, _offsetEqualityComparer); + } + + public int GetHashCode(Register[] obj) { int code = 23; - for (int i = 0; i < obj.GetLength(0); i++) + for (int i = 0; i < obj.Length; i++) { - for (int j = 0; j < 2; j++) + if (obj[i].HasOffset) + { + code = + code * 31 + + ( + _isDefault + ? EqualityComparer.Default.GetHashCode(obj[i].Offset) + : _offsetEqualityComparer.GetHashCode(obj[i].Offset) + ); + code = code * 31 + obj[i].IsStart.GetHashCode(); + } + else { - if (obj[i, j].HasOffset) - { - code = code * 31 + _offsetEqualityComparer.GetHashCode(obj[i, j].Offset); - code = code * 31 + obj[i, j].IsStart.GetHashCode(); - } - else - { - code = code * 31 + 0; - } + code = code * 31 + 0; } } return code; diff --git a/src/SIL.Machine/FiniteState/State.cs b/src/SIL.Machine/FiniteState/State.cs index f01aa3959..ce7676c0c 100644 --- a/src/SIL.Machine/FiniteState/State.cs +++ b/src/SIL.Machine/FiniteState/State.cs @@ -96,6 +96,18 @@ public override string ToString() return string.Format("State {0}", _index); } + // Without this override, GetHashCode() falls back to the CLR's default identity hash + // (RuntimeHelpers.GetHashCode's sync-block-index path) — a CPU profile showed that call + // dominating self-time on the hot nondeterministic-traversal dedup path (TraversalKey's + // hash folds in State.GetHashCode() once per pushed instance). _index is a stable, + // already-unique-per-Fst int assigned once at construction, so it is a valid, far cheaper + // hash; Equals() is intentionally left as reference equality (state objects are singletons + // within their Fst, never recreated), so the Equals/GetHashCode contract still holds. + public override int GetHashCode() + { + return _index; + } + private void CheckFrozen() { if (IsFrozen) diff --git a/src/SIL.Machine/FiniteState/TraversalInstance.cs b/src/SIL.Machine/FiniteState/TraversalInstance.cs index 100f728d5..3e6b27e68 100644 --- a/src/SIL.Machine/FiniteState/TraversalInstance.cs +++ b/src/SIL.Machine/FiniteState/TraversalInstance.cs @@ -10,12 +10,17 @@ namespace SIL.Machine.FiniteState internal abstract class TraversalInstance where TData : IAnnotatedData { - private readonly Register[,] _registers; + // Flat (SZ, single-dimension zero-lower-bound) array instead of Register[,]: the + // CLR allocates rectangular (multi-dim) arrays through the general-purpose + // Array.CreateInstanceMDArray runtime helper, which a CPU profile showed dominating + // self-time on this hot path — SZ arrays get the JIT-inlined fast allocation path instead. + // Index i's (start, end) pair lives at [2*i] / [2*i+1]. + private readonly Register[] _registers; private readonly List _priorities; protected TraversalInstance(int registerCount, bool deterministic) { - _registers = new Register[registerCount, 2]; + _registers = new Register[registerCount * 2]; if (!deterministic) _priorities = new List(); } @@ -29,7 +34,7 @@ public IList Priorities get { return _priorities; } } - public Register[,] Registers + public Register[] Registers { get { return _registers; } } diff --git a/src/SIL.Machine/FiniteState/TraversalMethodBase.cs b/src/SIL.Machine/FiniteState/TraversalMethodBase.cs index c5934d991..fb7a969e7 100644 --- a/src/SIL.Machine/FiniteState/TraversalMethodBase.cs +++ b/src/SIL.Machine/FiniteState/TraversalMethodBase.cs @@ -13,54 +13,112 @@ internal abstract class TraversalMethodBase : ITraversalM where TInst : TraversalInstance { private readonly Fst _fst; - private readonly TData _data; - private readonly VariableBindings _varBindings; - private readonly bool _startAnchor; - private readonly bool _endAnchor; - private readonly bool _useDefaults; - private readonly List> _annotations; + private TData _data; + private VariableBindings _varBindings; + private bool _startAnchor; + private bool _endAnchor; + private bool _useDefaults; + + // Either this method's own scratch list (built by Reset) or a shared filtered view cached on + // a frozen AnnotationList (see Reset). When shared (_annotationsShared), it must never be + // mutated — traversal only reads it after Reset, so the only guarded site is Reset's Clear(). + private List> _annotations; + private bool _annotationsShared; + + // Instance free-list, kept across Reset() calls so a traversal method pooled for the + // duration of one word (see Fst.Transduce + Morpher per-word reset) reuses instances across + // the thousands of Transduce calls that word triggers. private readonly Queue _cachedInstances; - protected TraversalMethodBase( - Fst fst, - TData data, - VariableBindings varBindings, - bool startAnchor, - bool endAnchor, - bool useDefaults - ) + // Cached delegate for the per-annotation insertion sort in Reset(). Allocated once here + // rather than per Reset() call so the depth-first walk uses the allocation-free + // PreorderTraverse(action) form instead of GetNodesDepthFirst(), whose yield state machine + // was heap-allocated on every Transduce (Reset runs once per Transduce, thousands per word). + private readonly Action> _insertAnnotation; + + protected TraversalMethodBase(Fst fst) { _fst = fst; + // _annotations is created lazily in Reset: on the (common) cached-view hit path this + // method never needs a scratch list of its own. + _cachedInstances = new Queue(); + _insertAnnotation = InsertAnnotation; + } + + /// + /// Re-targets this (pooled) traversal method at a new input without reallocating it or its + /// instance free-list. Rebuilds the per-input annotation list; keeps . + /// + public void Reset(TData data, VariableBindings varBindings, bool startAnchor, bool endAnchor, bool useDefaults) + { _data = data; _varBindings = varBindings; _startAnchor = startAnchor; _endAnchor = endAnchor; _useDefaults = useDefaults; - _annotations = new List>(); - // insertion sort - foreach (Annotation topAnn in _data.Annotations.GetNodes(_fst.Direction)) + + // The filtered+sorted list built below depends only on (annotation list, filter, + // direction) — NOT on which FST asks — and on the sena grammar ~89% of Transduce calls + // re-derive a view that was already built for the same frozen list (COW clones share the + // frozen source's projection, and rule filters are a handful of compiler-cached lambdas). + // Frozen lists are immutable, so a cached view is final; unfrozen lists never cache + // (their annotations' FeatureStructs can be edited in place, silently invalidating a + // cached view). + AnnotationList annList = _data.Annotations; + bool cacheable = annList.IsFrozen; + if (cacheable) { - foreach (Annotation ann in topAnn.GetNodesDepthFirst(_fst.Direction)) + List> cached = annList.GetFilteredView(_fst.Filter, _fst.Direction); + if (cached != null) { - if (!_fst.Filter(ann)) - continue; - - int i = _annotations.Count - 1; - while (i >= 0 && CompareAnnotations(_annotations[i], ann) > 0) - { - if (i + 1 == _annotations.Count) - _annotations.Add(_annotations[i]); - else - _annotations[i + 1] = _annotations[i]; - i--; - } - if (i + 1 == _annotations.Count) - _annotations.Add(ann); - else - _annotations[i + 1] = ann; + _annotations = cached; + _annotationsShared = true; + return; } } - _cachedInstances = new Queue(); + + if (_annotations == null || _annotationsShared) + { + _annotations = new List>(); + _annotationsShared = false; + } + else + { + _annotations.Clear(); + } + // insertion sort (PreorderTraverse with a cached delegate — same depth-first order as + // GetNodesDepthFirst but no per-call yield-iterator allocation; see _insertAnnotation). + foreach (Annotation topAnn in annList.GetNodes(_fst.Direction)) + topAnn.PreorderTraverse(_insertAnnotation, _fst.Direction); + + if (cacheable) + { + // Publish for the next Transduce against the same frozen list. This method keeps + // using the (now-shared) list read-only; mark it shared so a hypothetical re-Reset + // of this method starts a fresh scratch list instead of clearing the published one. + annList.AddFilteredView(_fst.Filter, _fst.Direction, _annotations); + _annotationsShared = true; + } + } + + private void InsertAnnotation(Annotation ann) + { + if (!_fst.Filter(ann)) + return; + + int i = _annotations.Count - 1; + while (i >= 0 && CompareAnnotations(_annotations[i], ann) > 0) + { + if (i + 1 == _annotations.Count) + _annotations.Add(_annotations[i]); + else + _annotations[i + 1] = _annotations[i]; + i--; + } + if (i + 1 == _annotations.Count) + _annotations.Add(ann); + else + _annotations[i + 1] = ann; } private int CompareAnnotations(Annotation x, Annotation y) @@ -87,33 +145,56 @@ public IList> Annotations get { return _annotations; } } - public abstract IEnumerable> Traverse( + public abstract List> Traverse( ref int annIndex, - Register[,] initRegisters, + Register[] initRegisters, IList initCmds, ISet initAnns ); + private static void ApplyCommand( + Register[] registers, + TagMapCommand cmd, + Register start, + Register end + ) + { + if (cmd.Src == TagMapCommand.CurrentPosition) + { + registers[cmd.Dest * 2] = start; + registers[cmd.Dest * 2 + 1] = end; + } + else + { + registers[cmd.Dest * 2] = registers[cmd.Src * 2]; + registers[cmd.Dest * 2 + 1] = registers[cmd.Src * 2 + 1]; + } + } + protected static void ExecuteCommands( - Register[,] registers, + Register[] registers, IEnumerable cmds, Register start, Register end ) { foreach (TagMapCommand cmd in cmds) - { - if (cmd.Src == TagMapCommand.CurrentPosition) - { - registers[cmd.Dest, 0] = start; - registers[cmd.Dest, 1] = end; - } - else - { - registers[cmd.Dest, 0] = registers[cmd.Src, 0]; - registers[cmd.Dest, 1] = registers[cmd.Src, 1]; - } - } + ApplyCommand(registers, cmd, start, end); + } + + // Concrete-List overload: the hot callers (arc.Commands, state.Finishers) pass a List, so an + // index for-loop avoids boxing the List.Enumerator struct that the IEnumerable foreach incurs + // on every arc-advance. Overload resolution routes List args here; the cold IList init path keeps + // the IEnumerable overload above. + protected static void ExecuteCommands( + Register[] registers, + List cmds, + Register start, + Register end + ) + { + for (int i = 0; i < cmds.Count; i++) + ApplyCommand(registers, cmds[i], start, end); } protected bool CheckInputMatch(Arc arc, int annIndex, VariableBindings varBindings) @@ -129,7 +210,7 @@ protected bool CheckInputMatch(Arc arc, int annIndex, VariableBi private void CheckAccepting( int annIndex, - Register[,] registers, + Register[] registers, TData output, VariableBindings varBindings, State state, @@ -141,7 +222,7 @@ IList priorities { Annotation ann = annIndex < _annotations.Count ? _annotations[annIndex] : _data.Annotations.GetEnd(_fst.Direction); - var matchRegisters = (Register[,])registers.Clone(); + var matchRegisters = (Register[])registers.Clone(); ExecuteCommands(matchRegisters, state.Finishers, new Register(), new Register()); if (state.AcceptInfos.Count > 0) { @@ -190,14 +271,17 @@ IList priorities } } - protected IEnumerable Initialize( + // De-iterator (RUSTIFY lever 1): fills the caller-provided buffer instead of allocating a fresh + // List per call (plus a nested List per recursive optional-skip). The buffer is reused per + // Transduce by the traversal method (see InitializeStack); recursion appends to the same buffer. + protected void Initialize( ref int annIndex, - Register[,] registers, + Register[] registers, IList cmds, - ISet initAnns + ISet initAnns, + List output ) { - var insts = new List(); TOffset offset = _annotations[annIndex].Range.GetStart(_fst.Direction); if (_startAnchor) @@ -212,11 +296,7 @@ ISet initAnns { int nextIndex = GetNextNonoverlappingAnnotationIndex(i); if (nextIndex != _annotations.Count) - { - insts.AddRange( - Initialize(ref nextIndex, (Register[,])registers.Clone(), cmds, initAnns) - ); - } + Initialize(ref nextIndex, (Register[])registers.Clone(), cmds, initAnns, output); } } } @@ -237,20 +317,46 @@ ISet initAnns Array.Copy(registers, inst.Registers, registers.Length); if (!_fst.IgnoreVariables) inst.VariableBindings = _varBindings != null ? _varBindings.Clone() : new VariableBindings(); - insts.Add(inst); + output.Add(inst); initAnns.Add(annIndex); } } + } - return insts; + // RUSTIFY lever 1 (de-iterator): Advance was a `yield`-based iterator, so every call (one per + // matched arc, recursively for optional-skip forks — millions/word) allocated an iterator state + // machine. It now fills a reusable per-method buffer instead. The traversal method is created + // fresh per Transduce (dies in Gen0), so the buffer carries no cross-word retention (the Phase-1b + // regression), and Advance is not re-entrant within one method (a re-entrant Transduce gets its + // own method instance + buffer). Byte-identical: same results in the same order. + // One reusable result buffer per traversal method (per-Transduce → no cross-word retention; can't + // be a thread-static — CheckAccepting's Acceptable predicate can re-enter Transduce). Shared by + // Initialize and Advance: Initialize fills it once at the start of Traverse and the caller fully + // consumes it building the work stack before the main loop's first Advance reuses it, so they + // never overlap. + private readonly List _buffer = new List(); + + protected List InitializeBuffer => _buffer; + + protected List Advance( + TInst inst, + VariableBindings varBindings, + Arc arc, + ICollection> curResults + ) + { + _buffer.Clear(); + AdvanceInto(inst, varBindings, arc, curResults, false, _buffer); + return _buffer; } - protected IEnumerable Advance( + private void AdvanceInto( TInst inst, VariableBindings varBindings, Arc arc, ICollection> curResults, - bool optional = false + bool optional, + List output ) { inst.Priorities?.Add(arc.Priority); @@ -271,8 +377,10 @@ protected IEnumerable Advance( if (nextIndex < _annotations.Count) { - var anns = new List(); bool cloneOutputs = false; + // The same-offset window is a contiguous index range [nextIndex, annsEnd); track its + // end bound instead of materializing a List per Advance call (hot path). + int annsEnd = nextIndex; for ( int i = nextIndex; i < _annotations.Count && _annotations[i].Range.GetStart(_fst.Direction).Equals(nextOffset); @@ -283,13 +391,12 @@ protected IEnumerable Advance( { TInst ti = CopyInstance(inst); ti.AnnotationIndex = i; - foreach (TInst ni in Advance(ti, varBindings, arc, curResults, true)) - { - yield return ni; + int before = output.Count; + AdvanceInto(ti, varBindings, arc, curResults, true, output); + if (output.Count > before) cloneOutputs = true; - } } - anns.Add(i); + annsEnd = i + 1; } ExecuteCommands( @@ -314,13 +421,13 @@ protected IEnumerable Advance( inst.State = arc.Target; bool first = true; - foreach (int curIndex in anns) + for (int curIndex = nextIndex; curIndex < annsEnd; curIndex++) { TInst ni = first ? inst : CopyInstance(inst); ni.AnnotationIndex = curIndex; if (varBindings != null) inst.VariableBindings = cloneOutputs ? varBindings.Clone() : varBindings; - yield return ni; + output.Add(ni); cloneOutputs = true; first = false; } @@ -346,7 +453,7 @@ protected IEnumerable Advance( inst.State = arc.Target; inst.AnnotationIndex = nextIndex; inst.VariableBindings = varBindings; - yield return inst; + output.Add(inst); } } @@ -384,7 +491,7 @@ ICollection> curResults protected void CheckAcceptingStartState( ISet anns, - Register[,] registers, + Register[] registers, ICollection> curResults ) { diff --git a/src/SIL.Machine/FiniteState/VisitedStates.cs b/src/SIL.Machine/FiniteState/VisitedStates.cs new file mode 100644 index 000000000..72a220ad1 --- /dev/null +++ b/src/SIL.Machine/FiniteState/VisitedStates.cs @@ -0,0 +1,58 @@ +using System; + +namespace SIL.Machine.FiniteState +{ + /// + /// A value-type set of FST state indices used by the nondeterministic traversal to avoid epsilon + /// loops. States have a dense (0..N-1), so membership is a + /// bitset: states 0–63 live in an inline ulong field (zero heap allocation — the common case, + /// HC rule FSTs have only a handful of states) and an overflow ulong[] is allocated lazily only + /// for FSTs with 64+ states. RUSTIFY lever 1: replaces the per-instance HashSet<State> + /// (~1.17M allocated per word on Sena) so creating a traversal instance no longer allocates a set. + /// + internal struct VisitedStates + { + private ulong _bits0; // states 0..63 + private ulong[] _overflow; // states 64.., word i covers states [64*(i+1) .. 64*(i+1)+63] + + public bool Contains(int index) + { + if (index < 64) + return (_bits0 & (1UL << index)) != 0; + int w = index / 64 - 1; + return _overflow != null && w < _overflow.Length && (_overflow[w] & (1UL << (index & 63))) != 0; + } + + public void Add(int index) + { + if (index < 64) + { + _bits0 |= 1UL << index; + return; + } + int w = index / 64 - 1; + if (_overflow == null || w >= _overflow.Length) + Array.Resize(ref _overflow, w + 1); + _overflow[w] |= 1UL << (index & 63); + } + + public void Clear() + { + _bits0 = 0; + if (_overflow != null) + Array.Clear(_overflow, 0, _overflow.Length); + } + + public void UnionWith(in VisitedStates other) + { + _bits0 |= other._bits0; + if (other._overflow != null) + { + if (_overflow == null || _overflow.Length < other._overflow.Length) + Array.Resize(ref _overflow, other._overflow.Length); + for (int i = 0; i < other._overflow.Length; i++) + _overflow[i] |= other._overflow[i]; + } + } + } +} diff --git a/src/SIL.Machine/Matching/Matcher.cs b/src/SIL.Machine/Matching/Matcher.cs index 9d73cfe4e..3c0174a14 100644 --- a/src/SIL.Machine/Matching/Matcher.cs +++ b/src/SIL.Machine/Matching/Matcher.cs @@ -19,6 +19,12 @@ public class Matcher private Fst _fsa; private readonly IEqualityComparer> _matchComparer; + // Memoizes match.ID -> split parts. IDs come from the (fixed, small) set of accepting-state + // labels, so re-splitting the same string on every result is wasted allocation. The matcher is + // shared read-only across parallel parses, so the cache must be concurrent. + private readonly System.Collections.Concurrent.ConcurrentDictionary _idSplitCache = + new System.Collections.Concurrent.ConcurrentDictionary(); + public Matcher(Pattern pattern) : this(pattern, new MatcherSettings()) { } @@ -201,7 +207,9 @@ private Match CreatePatternMatch(TData input, FstResult() + : _idSplitCache.GetOrAdd(match.ID, id => id.Split('*')), match.VariableBindings, match.NextAnnotation ); diff --git a/src/SIL.Machine/Rules/ParallelCombinationRuleCascade.cs b/src/SIL.Machine/Rules/ParallelCombinationRuleCascade.cs index 9a68ea244..b698989f1 100644 --- a/src/SIL.Machine/Rules/ParallelCombinationRuleCascade.cs +++ b/src/SIL.Machine/Rules/ParallelCombinationRuleCascade.cs @@ -29,8 +29,16 @@ IEqualityComparer comparer ) : base(rules, multiApp, comparer) { } + /// + /// Caps the parallelism used by . Default -1 (unbounded, the .NET + /// default). Set to the morpher's MaxDegreeOfParallelism so the cap is actually honored + /// rather than the parallel path running at the default scheduler degree. + /// + public int MaxDegreeOfParallelism { get; set; } = -1; + public override IEnumerable Apply(TData input) { + var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = MaxDegreeOfParallelism }; var output = new ConcurrentStack(); var from = new ConcurrentStack>>(); from.Push(Tuple.Create(input, !MultipleApplication ? new HashSet() : null)); @@ -40,6 +48,7 @@ public override IEnumerable Apply(TData input) to.Clear(); Parallel.ForEach( from, + parallelOptions, work => { for (int i = 0; i < Rules.Count; i++) diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/AffixTemplateTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/AffixTemplateTests.cs index 54786c82f..bf564246f 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/AffixTemplateTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/AffixTemplateTests.cs @@ -1,5 +1,4 @@ using NUnit.Framework; -using SIL.Machine.Annotations; using SIL.Machine.FeatureModel; using SIL.Machine.Matching; using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; @@ -60,8 +59,8 @@ public void RealizationalRule() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(alvStop).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(alvStop).Value, }, Rhs = { new CopyFromInput("1"), new CopyFromInput("2"), new InsertSegments(Table3, "ɯd") }, } @@ -69,14 +68,14 @@ public void RealizationalRule() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(voicelessCons).Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(voicelessCons).Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "t") }, } ); edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "d") }, } ); @@ -97,8 +96,8 @@ public void RealizationalRule() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(labiodental).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(labiodental).Value, }, Rhs = { new CopyFromInput("1"), new ModifyFromInput("2", voiced), new InsertSegments(Table3, "z") }, } @@ -106,7 +105,7 @@ public void RealizationalRule() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(strident).Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(strident).Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "ɯz") }, } ); @@ -115,8 +114,8 @@ public void RealizationalRule() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(voicelessCons).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(voicelessCons).Value, }, Rhs = { new CopyFromInput("1"), new CopyFromInput("2"), new InsertSegments(Table3, "s") }, } @@ -124,7 +123,7 @@ public void RealizationalRule() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "z") }, } ); @@ -143,7 +142,7 @@ public void RealizationalRule() evidential.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "v") }, } ); @@ -245,8 +244,8 @@ public void NonFinalTemplate() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(alvStop).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(alvStop).Value, }, Rhs = { new CopyFromInput("1"), new CopyFromInput("2"), new InsertSegments(Table3, "ɯd") }, } @@ -254,14 +253,14 @@ public void NonFinalTemplate() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(voicelessCons).Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(voicelessCons).Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "t") }, } ); edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "d") }, } ); @@ -284,7 +283,7 @@ public void NonFinalTemplate() nominalizer.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "v") }, } ); @@ -303,8 +302,8 @@ public void NonFinalTemplate() crule.Subrules.Add( new CompoundingSubrule { - HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, - NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, + HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, + NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("head"), new InsertSegments(Table3, "+"), new CopyFromInput("nonHead") }, } ); @@ -319,7 +318,7 @@ public void NonFinalTemplate() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, } ); @@ -363,7 +362,7 @@ public void AffixTemplateAppliedAfterMorphologicalRule() nominalizer.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "v") }, } ); @@ -379,7 +378,7 @@ public void AffixTemplateAppliedAfterMorphologicalRule() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, } ); @@ -415,7 +414,7 @@ public void SameRuleUsedInMultipleTemplates() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "d") }, } ); @@ -447,7 +446,7 @@ public void SameRuleUsedInMultipleTemplates() nominalizer.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "v") }, } ); diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstCoverageProbeTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstCoverageProbeTests.cs new file mode 100644 index 000000000..0a4c13127 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstCoverageProbeTests.cs @@ -0,0 +1,164 @@ +using NUnit.Framework; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; +using SIL.Machine.Morphology.HermitCrab.PhonologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// CI coverage for the bounded, opt-in grammar-tuning probe (): it reports +/// coverage over a wordlist and diffs coverage between two grammar versions, without ever running the +/// full search engine. +/// +public class FstCoverageProbeTests : HermitCrabTestBase +{ + private AffixProcessRule AddSuffix() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var sSuffix = new AffixProcessRule + { + Name = "s_suffix", + Gloss = "NMLZ", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + sSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, + } + ); + Morphophonemic.MorphologicalRules.Add(sSuffix); + return sSuffix; + } + + [Test] + public void Probe_ReportsCoverageAndUnparsedWords() + { + string[] corpus = { "sag", "dat", "zzz" }; // two bare roots, one non-word + ProbeReport report = FstCoverageProbe.ForLanguage(Language).Probe(corpus); + + Assert.That(report.TotalWords, Is.EqualTo(3)); + Assert.That(report.ParsedWords, Is.EqualTo(2)); + Assert.That(report.UnparsedWords, Is.EquivalentTo(new[] { "zzz" })); + Assert.That(report.CoverageRate, Is.EqualTo(2.0 / 3).Within(0.0001)); + } + + [Test] + public void Probe_NeverReportsANonWordAsParsed() + { + // Soundness contract: "sagg" does not parse in the base grammar (shared negative-control word + // used across the FST test suite); the probe must agree, never over-generating a false positive. + ProbeReport report = FstCoverageProbe.ForLanguage(Language).Probe(new[] { "sagg" }); + + Assert.That(report.ParsedWords, Is.Zero); + Assert.That(report.UnparsedWords, Is.EquivalentTo(new[] { "sagg" })); + } + + [Test] + public void CompareGrammars_SameGrammarTwice_NoGainedOrLost() + { + string[] corpus = { "sag", "dat", "zzz" }; + CoverageDiff diff = FstCoverageProbe.CompareGrammars(Language, Language, corpus); + + Assert.That(diff.Gained, Is.Empty); + Assert.That(diff.Lost, Is.Empty); + Assert.That(diff.Before.ParsedWords, Is.EqualTo(diff.After.ParsedWords)); + } + + [Test] + public void Probe_DetectsGainedCoverage_AfterAddingSuffixRule() + { + // The direct "did this grammar edit make parsing better or worse" workflow: probe before the + // edit, apply the edit, probe again, and confirm the newly-coverable word is picked up. This is + // the affix-rule edit class of FST_FAST_PATH_PLAN.md's Phase 5.4 edit-loop promise. + string[] corpus = { "sag", "sags", "dat" }; + ProbeReport before = FstCoverageProbe.ForLanguage(Language).Probe(corpus); + Assert.That(before.UnparsedWords, Does.Contain("sags"), "precondition: sags not yet coverable"); + + AffixProcessRule suffix = AddSuffix(); + try + { + ProbeReport after = FstCoverageProbe.ForLanguage(Language).Probe(corpus); + Assert.That(after.UnparsedWords, Does.Not.Contain("sags")); + Assert.That(after.ParsedWords, Is.EqualTo(before.ParsedWords + 1)); + } + finally + { + Morphophonemic.MorphologicalRules.Remove(suffix); + } + } + + [Test] + public void Probe_DetectsGainedCoverage_AfterAddingPhonologicalRule() + { + // The phonological-rule edit class of the Phase 5.4 edit-loop promise: an unconditional t->d + // rule means bare root "dat" (entry 8) now surfaces only as "dad" — invisible to the probe + // until the rule exists. ("dat" itself is deliberately excluded from the corpus: the same + // unconditional rule also makes the literal string "dat" stop being a valid surface form of + // anything once every underlying "t" surfaces as "d" — a real "gained dad, lost dat" situation, + // not tested here to keep this assertion to a single, unconfounded gain.) + string[] corpus = { "sag", "dad" }; + ProbeReport before = FstCoverageProbe.ForLanguage(Language).Probe(corpus); + Assert.That(before.UnparsedWords, Does.Contain("dad"), "precondition: dad not yet coverable"); + + var tToD = new RewriteRule + { + Name = "t_to_d_probe", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tToD.Subrules.Add( + new RewriteSubrule { Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value } + ); + Surface.PhonologicalRules.Add(tToD); + try + { + ProbeReport after = FstCoverageProbe.ForLanguage(Language).Probe(corpus); + Assert.That(after.UnparsedWords, Does.Not.Contain("dad")); + Assert.That(after.ParsedWords, Is.EqualTo(before.ParsedWords + 1)); + } + finally + { + Surface.PhonologicalRules.Remove(tToD); + } + } + + [Test] + public void Probe_DetectsGainedCoverage_AfterAddingReduplicationRule() + { + // The reduplication-rule edit class of the Phase 5.4 edit-loop promise: a full-copy rule means + // "sagsag" (RED('sag')) is only coverable once the rule exists. + string[] corpus = { "sag", "sagsag", "dat" }; + ProbeReport before = FstCoverageProbe.ForLanguage(Language).Probe(corpus); + Assert.That(before.UnparsedWords, Does.Contain("sagsag"), "precondition: sagsag not yet coverable"); + + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule + { + Name = "redup_probe", + Gloss = "RED", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + try + { + ProbeReport after = FstCoverageProbe.ForLanguage(Language).Probe(corpus); + Assert.That(after.UnparsedWords, Does.Not.Contain("sagsag")); + Assert.That(after.ParsedWords, Is.EqualTo(before.ParsedWords + 1)); + } + finally + { + Morphophonemic.MorphologicalRules.Remove(redup); + } + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstSenaBenchmark.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstSenaBenchmark.cs new file mode 100644 index 000000000..e96ed106e --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstSenaBenchmark.cs @@ -0,0 +1,560 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using NUnit.Framework; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// Manual end-to-end benchmark on a real grammar: grammar census, build, per-analyzer timing + a +/// gap-diagnostic comparison vs the search engine, a negative-example soundness check, and a +/// parallel-consistency check. [Explicit] — set HC_GRAMMAR (an HC config XML) and HC_WORDS (one word +/// per line); optionally HC_MAX_WORDS. The reference oracle runs with unlimited unapplications (the +/// only sound+complete baseline). Run: +/// $env:HC_GRAMMAR=...; $env:HC_WORDS=...; dotnet test --filter "FullyQualifiedName~FstSenaBenchmark" +/// +[TestFixture] +[Explicit("Manual FST-vs-search benchmark on an external grammar; not part of CI.")] +public class FstSenaBenchmark +{ + [Test] + public void Benchmark_FstVsSearch() + { + (Language language, List words) = Load(); + var search = new Morpher(new TraceManager(), language) { MaxUnapplications = 0 }; + + GrammarFstReport census = GrammarFstAdvisor.Analyze(language); + TestContext.Out.WriteLine($"census : {census.Tier} ({census.EscapeCount} escapes)"); + + var fst = new FstTemplateAnalyzer(language, search); + TestContext.Out.WriteLine($"FST states: {fst.StateCount}"); + var verified = new VerifiedFstAnalyzer(fst, new MorpherPool(() => new Morpher(new TraceManager(), language))); + + long searchMs = TimeParse("search ", words, w => search.AnalyzeWord(w).Count()); + TimeParse("verified", words, w => verified.AnalyzeWord(w).Count()); + + AnalysisComparison gap = FstVerification.Compare(search, verified, words); + TestContext.Out.WriteLine( + $"verified vs search : {(gap.MatchesReferenceExactly ? "IDENTICAL" : gap.Divergences.Count + " divergent words")}" + ); + TestContext.Out.WriteLine($"(search total {searchMs} ms)"); + } + + /// + /// Composite (FST + reduplication + infix + phonology-composition generators) vs the bare FST, + /// both verified, against the search oracle: how many words each fully covers (set parity), whether + /// the composite is a sound subset of search (no false positives), and the extra coverage the + /// generators buy on a real grammar. + /// + [Test] + public void Benchmark_CompositeVsSearch() + { + (Language language, List words) = Load(); + var search = new Morpher(new TraceManager(), language) { MaxUnapplications = 0 }; + var bare = new VerifiedFstAnalyzer( + new FstTemplateAnalyzer(language, search), + new MorpherPool(() => new Morpher(new TraceManager(), language)) + ); + CompositeProposer composite = CompositeProposer.ForLanguage( + language, + new FstTemplateAnalyzer(language, new Morpher(new TraceManager(), language)) + ); + var composed = new VerifiedFstAnalyzer( + composite, + new MorpherPool(() => new Morpher(new TraceManager(), language)) + ); + + int bareFull = 0, + compFull = 0, + unsound = 0, + wordsWithAnalysis = 0; + foreach (string w in words) + { + var oracle = new HashSet(search.AnalyzeWord(w).Select(Sig)); + var bareSet = new HashSet(bare.AnalyzeWord(w).Select(Sig)); + var compSet = new HashSet(composed.AnalyzeWord(w).Select(Sig)); + if (oracle.Count > 0) + wordsWithAnalysis++; + if (bareSet.SetEquals(oracle)) + bareFull++; + if (compSet.SetEquals(oracle)) + compFull++; + if (!compSet.IsSubsetOf(oracle)) + unsound++; // composite produced an analysis the engine did not — a soundness failure + } + TestContext.Out.WriteLine($"words: {words.Count} ({wordsWithAnalysis} with an analysis)"); + TestContext.Out.WriteLine($"fully covered — bare FST: {bareFull}, composite: {compFull}"); + TestContext.Out.WriteLine($"composite unsound words (⊄ search): {unsound}"); + Assert.That(unsound, Is.Zero, "soundness: composite must never produce a non-engine analysis"); + Assert.That( + compFull, + Is.GreaterThanOrEqualTo(bareFull), + "composite must cover at least as much as the bare FST" + ); + } + + private static string Sig(WordAnalysis a) => + string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? "?")) + ":" + a.RootMorphemeIndex; + + /// Measure the forward-synthesis precompile: build cost, table size, how many words it lifts + /// to full coverage over the bare composite, and that it stays a sound subset of the engine. + [Test] + public void Benchmark_ForwardSynthVsSearch() + { + (Language language, List words) = Load(); + var search = new Morpher(new TraceManager(), language) { MaxUnapplications = 0 }; + var pool = new MorpherPool(() => new Morpher(new TraceManager(), language)); + + var bareComposite = CompositeProposer.ForLanguage( + language, + new FstTemplateAnalyzer(language, new Morpher(new TraceManager(), language)) + ); + var bare = new VerifiedFstAnalyzer(bareComposite, pool); + + var sw = Stopwatch.StartNew(); + int maxAffixes = int.TryParse(Environment.GetEnvironmentVariable("HC_MAX_AFFIXES"), out int ma) ? ma : 2; + var synth = new ForwardSynthesisProposer(language, new Morpher(new TraceManager(), language), maxAffixes); + sw.Stop(); + var fullComposite = new CompositeProposer( + new FstTemplateAnalyzer(language, new Morpher(new TraceManager(), language)), + synth, + new ReduplicationProposer(language, new FstTemplateAnalyzer(language)), + new InfixProposer(language, new FstTemplateAnalyzer(language)) + ); + var full = new VerifiedFstAnalyzer(fullComposite, pool); + TestContext.Out.WriteLine( + $"forward-synth build: {sw.ElapsedMilliseconds} ms, {synth.EntryCount} entries, capped={synth.WasCapped}" + ); + + int bareFull = 0, + fullFull = 0, + unsound = 0, + analyzable = 0; + foreach (string w in words) + { + var oracle = search.AnalyzeWord(w).Select(Sig).ToHashSet(); + var b = bare.AnalyzeWord(w).Select(Sig).ToHashSet(); + var f = full.AnalyzeWord(w).Select(Sig).ToHashSet(); + if (oracle.Count > 0) + analyzable++; + if (b.SetEquals(oracle)) + bareFull++; + if (f.SetEquals(oracle)) + { + fullFull++; + } + else if (oracle.Count > 0) + { + TestContext.Out.WriteLine( + $" still missed {w}: engine={oracle.Count} fst={f.Count} | {string.Join(" ; ", oracle.Except(f))}" + ); + } + if (!f.IsSubsetOf(oracle)) + unsound++; + } + TestContext.Out.WriteLine($"words: {words.Count} ({analyzable} analyzable)"); + TestContext.Out.WriteLine($"fully covered — bare composite: {bareFull}, +forward-synth: {fullFull}"); + TestContext.Out.WriteLine($"forward-synth unsound words (⊄ search): {unsound}"); + Assert.That(unsound, Is.Zero, "soundness: forward-synth must never produce a non-engine analysis"); + Assert.That(fullFull, Is.GreaterThanOrEqualTo(bareFull)); + } + + /// Diagnostic: list the words the composite under-generates on, with what the engine found + /// that the FST missed, and dump the census escapes — to see WHICH constructs block coverage. + [Test] + public void Diagnose_Divergences() + { + (Language language, List words) = Load(); + var search = new Morpher(new TraceManager(), language) { MaxUnapplications = 0 }; + CompositeProposer composite = CompositeProposer.ForLanguage( + language, + new FstTemplateAnalyzer(language, new Morpher(new TraceManager(), language)) + ); + var composed = new VerifiedFstAnalyzer( + composite, + new MorpherPool(() => new Morpher(new TraceManager(), language)) + ); + + GrammarFstReport census = GrammarFstAdvisor.Analyze(language); + TestContext.Out.WriteLine($"=== census escapes ({census.EscapeCount}) ==="); + foreach (GrammarAdvisory e in census.Escapes.Take(40)) + { + TestContext.Out.WriteLine( + $" ESCAPE [{e.Kind}] rule={e.Rule} stratum={e.Stratum} regular={e.Regular}: {e.Issue}" + ); + } + + TestContext.Out.WriteLine("=== divergent words (engine finds, FST misses) ==="); + foreach (string w in words) + { + var oracle = search.AnalyzeWord(w).Select(Sig).ToHashSet(); + var comp = composed.AnalyzeWord(w).Select(Sig).ToHashSet(); + if (!comp.SetEquals(oracle)) + { + var missed = oracle.Except(comp).ToList(); + TestContext.Out.WriteLine( + $" {w}: engine={oracle.Count} fst={comp.Count} | missed: {string.Join(" ; ", missed)}" + ); + } + } + } + + /// + /// Soundness on NEGATIVE examples: plausible-looking non-words (real words over-prefixed, + /// over-suffixed, prefix-swapped, fake-reduplicated, fake-compounded) must analyze to NOTHING. We + /// keep only true negatives (search = ∅), preferring those the raw FST proposes for (so the verify + /// is exercised), then require the verified FST to also return ∅. A non-empty result is a false + /// positive — the soundness failure this hunts for. + /// + [Test] + public void Soundness_NegativeExamples() + { + (Language language, List real0) = Load(); + int targetCount = int.TryParse(Environment.GetEnvironmentVariable("HC_NEG_COUNT"), out int nc) ? nc : 50; + var search = new Morpher(new TraceManager(), language) { MaxUnapplications = 0 }; + var raw = new FstTemplateAnalyzer(language, search); + var verified = new VerifiedFstAnalyzer( + new FstTemplateAnalyzer(language, search), + new MorpherPool(() => new Morpher(new TraceManager(), language)) + ); + + List real = real0.Take(80).ToList(); + string[] pre = { "ku", "a", "ci", "ka", "mu", "ma", "ni", "wa", "ti", "pa" }; + string[] suf = { "a", "e", "ira", "isa", "ka", "ni", "wa", "esa" }; + var candidates = new List(); + for (int i = 0; i < real.Count; i++) + { + string w = real[i].ToLowerInvariant(); + foreach (string p in pre) + { + candidates.Add(p + w); + if (w.Length > p.Length + 1 && w.StartsWith(p, StringComparison.Ordinal)) + { + foreach (string p2 in pre) + { + if (p2 != p) + { + candidates.Add(string.Concat(p2.AsSpan(), w.AsSpan(p.Length))); + } + } + } + } + foreach (string s in suf) + { + candidates.Add(w + s); + } + candidates.Add(string.Concat(w.AsSpan(0, 2), w)); + if (i + 1 < real.Count) + { + candidates.Add(w + real[i + 1].ToLowerInvariant()); + } + } + + int chosen = 0; + int fstProposed = 0; + int falsePositives = 0; + var fp = new List(); + var seen = new HashSet(); + foreach (string c in candidates) + { + if (chosen >= targetCount || !seen.Add(c)) + { + continue; + } + try + { + if (search.AnalyzeWord(c).Any()) + { + continue; // actually parses — not a negative + } + int rawCount = raw.AnalyzeWord(c).Count(); + int verifiedCount = verified.AnalyzeWord(c).Count(); + chosen++; + if (rawCount > 0) + { + fstProposed++; + } + if (verifiedCount != 0) + { + falsePositives++; + if (fp.Count < 20) + { + fp.Add(c); + } + } + } + catch (Exception) { } + } + + TestContext.Out.WriteLine( + $"negatives: {chosen}; raw FST proposed {fstProposed}; false positives {falsePositives}" + ); + foreach (string e in fp) + { + TestContext.Out.WriteLine($" FALSE POSITIVE: {e}"); + } + Assert.That(chosen, Is.GreaterThanOrEqualTo(targetCount), "could not assemble enough true negatives"); + Assert.That(falsePositives, Is.Zero, "soundness FAILURE: verified FST analyzed a non-word"); + } + + /// Parallel-consistency: parsing the corpus concurrently must give the same analyses as + /// sequentially (validates the pooled-Morpher thread-safety fix). + [Test] + public void Concurrent_MatchesSequential() + { + (Language language, List words) = Load(); + var verified = new VerifiedFstAnalyzer( + new FstTemplateAnalyzer(language, new Morpher(new TraceManager(), language)), + new MorpherPool(() => new Morpher(new TraceManager(), language)) + ); + + Dictionary sequential = words.Distinct().ToDictionary(w => w, w => SigSet(verified, w)); + var parallel = new ConcurrentDictionary(); + Parallel.ForEach(words.Distinct(), w => parallel[w] = SigSet(verified, w)); + + int mismatches = sequential.Count(kv => parallel[kv.Key] != kv.Value); + TestContext.Out.WriteLine($"parallel vs sequential: {mismatches} mismatches of {sequential.Count} words"); + Assert.That(mismatches, Is.Zero, "thread-safety FAILURE: concurrent analyses differ from sequential"); + } + + /// Parallel wall-clock throughput at a controlled degree of parallelism (HC_THREADS, default + /// 16): the pooled search engine (oracle, unbounded) vs the bare verified-FST path (the always-fast, + /// not-necessarily-complete "first pass" — , unconditional and + /// always used, never gated on anything). Both sides rent a per-call from a + /// — a single shared hammered by concurrent + /// AnalyzeWord calls is not an established-safe pattern in this codebase (only the + /// pool-per-call pattern is validated by Concurrent_MatchesSequential), so the oracle side must + /// pool too for the comparison to be apples-to-apples. Reports whether the process actually got Server + /// GC (DOTNET_gcServer=1 env var, checked at process start). + [Test] + public void Benchmark_ParallelThroughput() + { + (Language language, List words) = Load(); + int degree = int.TryParse(Environment.GetEnvironmentVariable("HC_THREADS"), out int t) ? t : 16; + List load = words.Distinct().ToList(); + + var searchPool = new MorpherPool(() => new Morpher(new TraceManager(), language) { MaxUnapplications = 0 }); + var verified = new VerifiedFstAnalyzer( + new FstTemplateAnalyzer(language, new Morpher(new TraceManager(), language)), + new MorpherPool(() => new Morpher(new TraceManager(), language)) + ); + + TestContext.Out.WriteLine($"Server GC: {System.Runtime.GCSettings.IsServerGC}"); + TestContext.Out.WriteLine($"threads={degree} distinct words={load.Count}"); + + var po = new ParallelOptions { MaxDegreeOfParallelism = degree }; + + // warm up (JIT + FST build) outside the timed region + Morpher warm = searchPool.Rent(); + warm.AnalyzeWord(load[0]).Count(); + searchPool.Return(warm); + verified.AnalyzeWord(load[0]).Count(); + + long searchAnalyses = 0; + var swSearch = Stopwatch.StartNew(); + Parallel.ForEach( + load, + po, + w => + { + Morpher m = searchPool.Rent(); + try + { + int n = m.AnalyzeWord(w).Count(); + Interlocked.Add(ref searchAnalyses, n); + } + finally + { + searchPool.Return(m); + } + } + ); + swSearch.Stop(); + + long fstAnalyses = 0; + var swFst = Stopwatch.StartNew(); + Parallel.ForEach( + load, + po, + w => + { + int n = verified.AnalyzeWord(w).Count(); + Interlocked.Add(ref fstAnalyses, n); + } + ); + swFst.Stop(); + + TestContext.Out.WriteLine( + $"search (parallel, pooled) : {swSearch.ElapsedMilliseconds, 7} ms " + + $"({(double)swSearch.ElapsedMilliseconds / load.Count:F3} ms/word, {searchAnalyses} analyses)" + ); + TestContext.Out.WriteLine( + $"verified (parallel) : {swFst.ElapsedMilliseconds, 7} ms " + + $"({(double)swFst.ElapsedMilliseconds / load.Count:F3} ms/word, {fstAnalyses} analyses)" + ); + TestContext.Out.WriteLine( + $"speedup: {(double)swSearch.ElapsedMilliseconds / Math.Max(1, swFst.ElapsedMilliseconds):F1}x" + ); + } + + private static string SigSet(IMorphologicalAnalyzer analyzer, string word) + { + return string.Join( + "|", + analyzer + .AnalyzeWord(word) + .Select(a => + string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? "?")) + ":" + a.RootMorphemeIndex + ) + .OrderBy(s => s, StringComparer.Ordinal) + ); + } + + /// End-to-end FstCoverageProbe benchmark over the FULL wordlist (not the HC_MAX_WORDS-capped + /// slice the other benchmarks use): coverage + p50/p95 per-word latency. Set HC_PROBE_MAX_WORDS to + /// cap it (default: the whole file — Sena's is 7,121 words). + [Test] + public void Benchmark_FullCorpusProbe() + { + string? grammarPath = Environment.GetEnvironmentVariable("HC_GRAMMAR"); + string? wordsPath = Environment.GetEnvironmentVariable("HC_WORDS"); + if (string.IsNullOrEmpty(grammarPath) || string.IsNullOrEmpty(wordsPath)) + { + Assert.Ignore("set HC_GRAMMAR and HC_WORDS"); + } + int maxWords = int.TryParse(Environment.GetEnvironmentVariable("HC_PROBE_MAX_WORDS"), out int mw) + ? mw + : int.MaxValue; + Language language = XmlLanguageLoader.Load(grammarPath!); + List words = File.ReadAllLines(wordsPath!) + .Select(w => w.Trim()) + .Where(w => w.Length > 0) + .Take(maxWords) + .ToList(); + + FstCoverageProbe probe = FstCoverageProbe.ForLanguage(language); + var latenciesMs = new List(words.Count); + int parsed = 0; + var sw = Stopwatch.StartNew(); + foreach (string word in words) + { + ProbeReport single = probe.Probe(new[] { word }); + latenciesMs.Add(single.Elapsed.TotalMilliseconds); + if (single.ParsedWords > 0) + { + parsed++; + } + } + sw.Stop(); + latenciesMs.Sort(); + double P(double pct) => latenciesMs[(int)System.Math.Min(latenciesMs.Count - 1, pct * latenciesMs.Count)]; + + TestContext.Out.WriteLine($"words: {words.Count}, parsed: {parsed} ({(double)parsed / words.Count:P1})"); + TestContext.Out.WriteLine($"total wall time: {sw.ElapsedMilliseconds} ms"); + TestContext.Out.WriteLine($"p50: {P(0.50):F3} ms/word, p95: {P(0.95):F3} ms/word, p99: {P(0.99):F3} ms/word"); + } + + /// Genuinely parallel (HC_THREADS, default 16), FST-only (no oracle, so no open-ended + /// runtime risk on a large corpus) throughput benchmark over the FULL wordlist. Distinct from + /// Benchmark_FullCorpusProbe (deliberately single-threaded, for clean per-word latency percentiles) + /// and Benchmark_ParallelThroughput (parallel, but pairs against the oracle, and HC_MAX_WORDS + /// defaults to a 60-word cap) — this is the "how fast is the FST alone across all cores over + /// everything" number. + [Test] + public void Benchmark_FullCorpusParallelThroughput() + { + string? grammarPath = Environment.GetEnvironmentVariable("HC_GRAMMAR"); + string? wordsPath = Environment.GetEnvironmentVariable("HC_WORDS"); + if (string.IsNullOrEmpty(grammarPath) || string.IsNullOrEmpty(wordsPath)) + { + Assert.Ignore("set HC_GRAMMAR and HC_WORDS"); + } + int maxWords = int.TryParse(Environment.GetEnvironmentVariable("HC_PROBE_MAX_WORDS"), out int mw) + ? mw + : int.MaxValue; + int degree = int.TryParse(Environment.GetEnvironmentVariable("HC_THREADS"), out int t) ? t : 16; + Language language = XmlLanguageLoader.Load(grammarPath!); + List words = File.ReadAllLines(wordsPath!) + .Select(w => w.Trim()) + .Where(w => w.Length > 0) + .Take(maxWords) + .ToList(); + + var fst = new FstTemplateAnalyzer(language, new Morpher(new TraceManager(), language)); + CompositeProposer composite = CompositeProposer.ForLanguage(language, fst); + var pool = new MorpherPool(() => new Morpher(new TraceManager(), language)); + var verified = new VerifiedFstAnalyzer(composite, pool); + + TestContext.Out.WriteLine($"Server GC: {System.Runtime.GCSettings.IsServerGC}"); + TestContext.Out.WriteLine($"threads={degree} words={words.Count}"); + + var po = new ParallelOptions { MaxDegreeOfParallelism = degree }; + verified.AnalyzeWord(words[0]).Count(); // warm up outside the timed region + + long analyses = 0; + int parsed = 0; + var sw = Stopwatch.StartNew(); + Parallel.ForEach( + words, + po, + w => + { + int n = verified.AnalyzeWord(w).Count(); + Interlocked.Add(ref analyses, n); + if (n > 0) + { + Interlocked.Increment(ref parsed); + } + } + ); + sw.Stop(); + + TestContext.Out.WriteLine($"parsed: {parsed}/{words.Count} ({(double)parsed / words.Count:P1})"); + TestContext.Out.WriteLine( + $"total wall time: {sw.ElapsedMilliseconds} ms " + + $"({(double)sw.ElapsedMilliseconds / words.Count:F3} ms/word average, {analyses} analyses)" + ); + TestContext.Out.WriteLine($"throughput: {words.Count / sw.Elapsed.TotalSeconds:F0} words/sec"); + } + + private static (Language, List) Load() + { + string? grammarPath = Environment.GetEnvironmentVariable("HC_GRAMMAR"); + string? wordsPath = Environment.GetEnvironmentVariable("HC_WORDS"); + if (string.IsNullOrEmpty(grammarPath) || string.IsNullOrEmpty(wordsPath)) + { + Assert.Ignore("set HC_GRAMMAR and HC_WORDS"); + } + int maxWords = int.TryParse(Environment.GetEnvironmentVariable("HC_MAX_WORDS"), out int mw) ? mw : 60; + Language language = XmlLanguageLoader.Load(grammarPath!); + List words = File.ReadAllLines(wordsPath!) + .Select(w => w.Trim()) + .Where(w => w.Length > 0) + .Take(maxWords) + .ToList(); + return (language, words); + } + + private static long TimeParse(string label, List words, Func parse) + { + try + { + parse(words[0]); // warm up + } + catch (Exception) { } + var sw = Stopwatch.StartNew(); + long total = 0; + foreach (string w in words) + { + try + { + total += parse(w); + } + catch (Exception) { } + } + sw.Stop(); + TestContext.Out.WriteLine( + $"{label} : {sw.ElapsedMilliseconds, 7} ms ({(double)sw.ElapsedMilliseconds / words.Count:F1} ms/word, {total} analyses)" + ); + return sw.ElapsedMilliseconds; + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstTemplateAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstTemplateAnalyzerTests.cs new file mode 100644 index 000000000..04946f25b --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstTemplateAnalyzerTests.cs @@ -0,0 +1,206 @@ +using NUnit.Framework; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// Template-based analysis with build-time category gating (HERMITCRAB_FST_PLAN.md §6/§10): a +/// suffixing affix template attaches only to roots whose category matches, and the token- +/// accumulating walk reproduces the search engine's analyses — including NOT over-generating the +/// template onto a wrong-category root. +/// +public class FstTemplateAnalyzerTests : HermitCrabTestBase +{ + private AffixProcessRule Suffix(string name, string gloss, string seg) + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var rule = new AffixProcessRule { Name = name, Gloss = gloss }; + rule.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, seg) }, + } + ); + return rule; + } + + private AffixProcessRule Prefix(string name, string gloss, string seg) + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var rule = new AffixProcessRule { Name = name, Gloss = gloss }; + rule.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new InsertSegments(Table3, seg), new CopyFromInput("1") }, + } + ); + return rule; + } + + [Test] + public void Analyze_SlotAffixWrongCategory_PrunedNotOvergenerated() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + AffixProcessRule ok = Suffix("ok_suffix", "OK", "d"); // no category requirement → applies to V + var wrong = new AffixProcessRule + { + Name = "n_only_suffix", + Gloss = "NS", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + wrong.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "z") }, + } + ); + var verbTemplate = new AffixTemplate + { + Name = "verb", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + verbTemplate.Slots.Add(new AffixTemplateSlot(ok) { Optional = true }); + verbTemplate.Slots.Add(new AffixTemplateSlot(wrong) { Optional = true }); + Morphophonemic.AffixTemplates.Add(verbTemplate); + + var search = new Morpher(TraceManager, Language); + var fst = new FstTemplateAnalyzer(Language); + + // sag is V. "sagd" uses the OK suffix (valid); "sagz" would use the N-only suffix on a V + // root — the build-time category gate prunes it, so the FST must NOT over-generate it. + string[] corpus = { "sag", "sagd", "sagz" }; + AnalysisComparison comparison = FstVerification.Compare(search, fst, corpus); + Assert.That(comparison.MatchesReferenceExactly, Is.True, comparison.Format()); + + Morphophonemic.AffixTemplates.Remove(verbTemplate); + } + + [Test] + public void Build_ReduplicationSlot_DegradesGracefully_DoesNotThrow() + { + // A reduplication slot is non-regular and unbuildable. The proposer must SKIP it (degrade), + // not throw and abort the whole build — and flag the grammar as not fully covered. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule { Name = "redup", Gloss = "RED" }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, // copy the stem twice = reduplication + } + ); + var t = new AffixTemplate + { + Name = "redup_tmpl", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + t.Slots.Add(new AffixTemplateSlot(redup) { Optional = true }); + Morphophonemic.AffixTemplates.Add(t); + + FstTemplateAnalyzer? fst = null; + Assert.DoesNotThrow( + () => fst = new FstTemplateAnalyzer(Language), + "an unbuildable slot must degrade, not throw" + ); + Assert.That( + fst!.CoversAllConstructs, + Is.False, + "reduplication slot → grammar not fully covered by the bare FST" + ); + Assert.That(fst!.AnalyzeWord("sag"), Is.Not.Empty, "the rest of the grammar still analyzes"); + + Morphophonemic.AffixTemplates.Remove(t); + } + + [Test] + public void Analyze_ZeroSegmentSuffix_IsEmitted_NotDropped() + { + // A true zero-segment affix (CopyFromInput only, no InsertSegments) must still emit its + // morpheme token (it adds no segments). Previously it threw / was silently dropped. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var zero = new AffixProcessRule { Name = "zero_sfx", Gloss = "Z" }; + zero.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1") }, // copy stem, insert nothing = zero affix + } + ); + var t = new AffixTemplate + { + Name = "zero_tmpl", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + t.Slots.Add(new AffixTemplateSlot(zero) { Optional = true }); + Morphophonemic.AffixTemplates.Add(t); + + var search = new Morpher(TraceManager, Language); + var fst = new FstTemplateAnalyzer(Language); + Assert.That(fst.CoversAllConstructs, Is.True, "a zero-segment affix is buildable, not a skipped construct"); + // Whatever the engine yields for "sag" (bare root and/or root+Z), the FST must match it — + // i.e. it must not drop the zero-suffixed analysis. + AnalysisComparison comparison = FstVerification.Compare(search, fst, new[] { "sag" }); + Assert.That(comparison.MatchesReferenceExactly, Is.True, comparison.Format()); + + Morphophonemic.AffixTemplates.Remove(t); + } + + [Test] + public void Analyze_PrefixAndSuffixTemplate_MatchesSearch() + { + // A verb template with a prefix slot (di-) and a suffix slot (-d), restricted to V roots. + AffixProcessRule di = Prefix("di_prefix", "PST", "di"); + AffixProcessRule ed = Suffix("ed_suffix", "PERF", "d"); + var verbTemplate = new AffixTemplate + { + Name = "verb", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + verbTemplate.Slots.Add(new AffixTemplateSlot(di) { Optional = true }); + verbTemplate.Slots.Add(new AffixTemplateSlot(ed) { Optional = true }); + Morphophonemic.AffixTemplates.Add(verbTemplate); + + var search = new Morpher(TraceManager, Language); + var fst = new FstTemplateAnalyzer(Language); + + // sag (V, Morphophonemic): bare, prefixed (disag), suffixed (sagd), both (disagd). + string[] corpus = { "sag", "disag", "sagd", "disagd", "gab", "digab" }; + AnalysisComparison comparison = FstVerification.Compare(search, fst, corpus); + Assert.That(comparison.MatchesReferenceExactly, Is.True, comparison.Format()); + + Morphophonemic.AffixTemplates.Remove(verbTemplate); + } + + [Test] + public void Analyze_SuffixTemplateWithCategoryGate_MatchesSearch() + { + // A verb template, restricted to V roots, with two optional suffix slots. + AffixProcessRule ed = Suffix("ed_suffix", "PAST", "d"); + AffixProcessRule wit = Suffix("evidential", "WIT", "v"); + var verbTemplate = new AffixTemplate + { + Name = "verb", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + verbTemplate.Slots.Add(new AffixTemplateSlot(ed) { Optional = true }); + verbTemplate.Slots.Add(new AffixTemplateSlot(wit) { Optional = true }); + Morphophonemic.AffixTemplates.Add(verbTemplate); + + var search = new Morpher(TraceManager, Language); + var fst = new FstTemplateAnalyzer(Language); + + // Same-stratum (Morphophonemic) roots so only the category gate is in play: sag (32, V) + // takes the template; gab (11, A) must NOT. "sagdv" exercises both slots; "gabd" must yield + // no analysis in either engine (the gate blocks the verb template on the A root). + string[] corpus = { "sag", "sagd", "sagdv", "gab", "gabd" }; + AnalysisComparison comparison = FstVerification.Compare(search, fst, corpus); + Assert.That(comparison.MatchesReferenceExactly, Is.True, comparison.Format()); + + Morphophonemic.AffixTemplates.Remove(verbTemplate); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstVerificationTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstVerificationTests.cs new file mode 100644 index 000000000..04eab0681 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstVerificationTests.cs @@ -0,0 +1,74 @@ +using NUnit.Framework; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// FstVerification.Compare is a manual gap-inspection tool: it measures FST-vs-search analysis-set +/// divergence over a corpus, reporting missing analyses (fast-path gaps) and extra analyses (which +/// would be a soundness bug) at once. Diagnostic only — nothing gates behavior on its result. +/// +public class FstVerificationTests : HermitCrabTestBase +{ + private AffixProcessRule AddSuffix() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var sSuffix = new AffixProcessRule + { + Name = "s_suffix", + Gloss = "NMLZ", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + sSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, + } + ); + Morphophonemic.MorphologicalRules.Add(sSuffix); + return sSuffix; + } + + private sealed class EmptyAnalyzer : IMorphologicalAnalyzer + { + public IEnumerable AnalyzeWord(string word) => Enumerable.Empty(); + } + + [Test] + public void Compare_FstVsSearch_MatchesOnConcatenativeCorpus() + { + AffixProcessRule suffix = AddSuffix(); + IMorphologicalAnalyzer search = new Morpher(TraceManager, Language); + IMorphologicalAnalyzer fst = new VerifiedFstAnalyzer(TraceManager, Language); + + // A mix: inflected, bare root, homograph (dat = entries 8 & 9), and a non-word. + string[] corpus = { "sag", "sags", "dat", "sagg" }; + AnalysisComparison comparison = FstVerification.Compare(search, fst, corpus); + + Assert.That(comparison.WordsChecked, Is.EqualTo(corpus.Length)); + Assert.That(comparison.MatchesReferenceExactly, Is.True, comparison.Format()); + + Morphophonemic.MorphologicalRules.Remove(suffix); + } + + [Test] + public void Compare_DetectsMissingAnalyses_NotVacuous() + { + AffixProcessRule suffix = AddSuffix(); + IMorphologicalAnalyzer search = new Morpher(TraceManager, Language); + + // A candidate that finds nothing must be flagged incomplete on a word that has an analysis. + AnalysisComparison comparison = FstVerification.Compare(search, new EmptyAnalyzer(), new[] { "sag" }); + + Assert.That(comparison.MatchesReferenceExactly, Is.False); + Assert.That(comparison.Divergences, Has.Count.EqualTo(1)); + Assert.That(comparison.Divergences[0].MissingFromCandidate, Is.Not.Empty); + Assert.That(comparison.Divergences[0].ExtraInCandidate, Is.Empty); + + Morphophonemic.MorphologicalRules.Remove(suffix); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorBenchmark.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorBenchmark.cs new file mode 100644 index 000000000..656228ef1 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorBenchmark.cs @@ -0,0 +1,29 @@ +using NUnit.Framework; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// Runs against a real FLEx-exported grammar and prints the +/// report. [Explicit] so it never runs in CI. Point HC_GRAMMAR at an HC config XML: +/// $env:HC_GRAMMAR="...\sena-hc.xml"; dotnet test --filter "FullyQualifiedName~GrammarFstAdvisorBenchmark" +/// +[TestFixture] +[Explicit("Manual grammar-linter run against an external grammar; not part of CI.")] +public class GrammarFstAdvisorBenchmark +{ + [Test] + public void Advise_OnExternalGrammar() + { + string? grammarPath = Environment.GetEnvironmentVariable("HC_GRAMMAR"); + Assert.That(grammarPath, Is.Not.Null.And.Not.Empty, "set HC_GRAMMAR to an HC config XML path"); + Assert.That(File.Exists(grammarPath), Is.True, $"grammar not found: {grammarPath}"); + + Language language = XmlLanguageLoader.Load(grammarPath!); + GrammarFstReport report = GrammarFstAdvisor.Analyze(language); + + TestContext.Out.WriteLine($"Grammar: {Path.GetFileName(grammarPath)}"); + TestContext.Out.WriteLine($"Strata : {language.Strata.Count}"); + TestContext.Out.WriteLine(""); + TestContext.Out.WriteLine(report.Format()); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs new file mode 100644 index 000000000..85427ea8a --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs @@ -0,0 +1,258 @@ +using NUnit.Framework; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; +using SIL.Machine.Morphology.HermitCrab.PhonologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// Verifies the grammar linter (): a plain concatenative grammar +/// is a Tier 1 (fully FST-able) candidate with no escapes, and adding a single reduplication rule +/// flips the verdict — the offending rule is flagged +/// with a reduplication write-up. This is the "one new rule blew up the grammar" guard. +/// +public class GrammarFstAdvisorTests : HermitCrabTestBase +{ + [Test] + public void Analyze_ConcatenativeGrammar_Tier1NoEscapes() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + // A plain suffix: copy the whole stem, then add segments. Fully finite-state. + var sSuffix = new AffixProcessRule { Name = "s_suffix", Gloss = "PL" }; + sSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, + } + ); + Morphophonemic.MorphologicalRules.Add(sSuffix); + + // A suffix over a SPLIT stem (copy part 1, copy part 2, then insert): the copies are + // contiguous, so this is an ordinary suffix — finite-state, must NOT be flagged. + var splitSuffix = new AffixProcessRule { Name = "split_suffix", Gloss = "PST" }; + splitSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = + { + Pattern.New("1").Annotation(any).Value, + Pattern.New("2").Annotation(any).OneOrMore.Value, + }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("2"), new InsertSegments(Table3, "d") }, + } + ); + Morphophonemic.MorphologicalRules.Add(splitSuffix); + + GrammarFstReport report = GrammarFstAdvisor.Analyze(Language); + + Assert.That(report.EscapeCount, Is.EqualTo(0), report.Format()); + Assert.That(report.Tier, Does.StartWith("Tier 1")); + + Morphophonemic.MorphologicalRules.Remove(sSuffix); + Morphophonemic.MorphologicalRules.Remove(splitSuffix); + } + + [Test] + public void Analyze_BoundedReduplicant_IsRegular() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + // A fixed-size reduplicant: the copied part "1" matches a SINGLE segment (no OneOrMore), + // so the copy is finite → regular (reclaimable by bounded fold), unlike whole-stem copy. + var redup = new AffixProcessRule { Name = "credup", Gloss = "PL" }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + + GrammarFstReport report = GrammarFstAdvisor.Analyze(Language); + + GrammarAdvisory escape = report.Escapes.Single(a => a.Rule == "credup"); + // Still slow today (Escape preserved), but regular = FST-reclaimable. + Assert.That(escape.Severity, Is.EqualTo(GrammarAdvisorySeverity.Escape)); + Assert.That(escape.Regular, Is.True, report.Format()); + Assert.That(report.RegularEscapeCount, Is.EqualTo(1)); + + Morphophonemic.MorphologicalRules.Remove(redup); + } + + [Test] + public void Analyze_TrueInfix_FlaggedEscape() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + // Infixation: insert material BETWEEN two copies of the stem (copy…insert…copy). + var infix = new AffixProcessRule { Name = "infix", Gloss = "PERF" }; + infix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = + { + Pattern.New("1").Annotation(any).Value, + Pattern.New("2").Annotation(any).OneOrMore.Value, + }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "a"), new CopyFromInput("2") }, + } + ); + Morphophonemic.MorphologicalRules.Add(infix); + + GrammarFstReport report = GrammarFstAdvisor.Analyze(Language); + + GrammarAdvisory escape = report.Escapes.Single(a => a.Rule == "infix"); + Assert.That(escape.Issue, Does.Contain("Infixation")); + // Severity is preserved — infixation is slow in today's engine — but it is regular (the + // split is pattern-defined), so it carries the reclaim path. + Assert.That(escape.Severity, Is.EqualTo(GrammarAdvisorySeverity.Escape)); + Assert.That(escape.Regular, Is.True); + Assert.That(report.Tier, Does.StartWith("Tier 2")); + + Morphophonemic.MorphologicalRules.Remove(infix); + } + + [Test] + public void Analyze_HarmonyRewrite_StaysEscapeButIsRegular() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + // A vowel-harmony-style rewrite: bounded LHS/RHS, but an UNBOUNDED left environment + // ("...anything... ___"). By Kaplan & Kay this is a regular relation, but in today's + // engine it un-applies at many positions and is slow. + var harmony = new RewriteRule { Name = "harmony", Lhs = Pattern.New().Annotation(any).Value }; + harmony.Subrules.Add( + new RewriteSubrule + { + Rhs = Pattern.New().Annotation(any).Value, + LeftEnvironment = Pattern.New().Annotation(any).OneOrMore.Value, + } + ); + Allophonic.PhonologicalRules.Add(harmony); + + GrammarFstReport report = GrammarFstAdvisor.Analyze(Language); + + GrammarAdvisory escape = report.Escapes.Single(a => a.Rule == "harmony"); + // The non-expert sanity check: the headline still WARNS (escape present, not Tier 1) ... + Assert.That(escape.Severity, Is.EqualTo(GrammarAdvisorySeverity.Escape)); + Assert.That(report.Tier, Does.Not.StartWith("Tier 1")); + Assert.That(report.EscapeCount, Is.GreaterThanOrEqualTo(1)); + // ... and the reclaim path is reported separately: regular (FST-reclaimable), not "fine". + Assert.That(escape.Regular, Is.True); + Assert.That(report.RegularEscapeCount, Is.GreaterThanOrEqualTo(1)); + Assert.That(escape.Advice, Does.Contain("today's engine")); + + Allophonic.PhonologicalRules.Remove(harmony); + } + + [Test] + public void Analyze_ReduplicationRule_FlaggedEscapeAndTierDowngraded() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + GrammarFstReport before = GrammarFstAdvisor.Analyze(Language); + Assert.That(before.EscapeCount, Is.EqualTo(0), "baseline grammar should have no escapes"); + + // Total reduplication: copy the stem ("1") twice. Copying an unbounded span is not + // finite-state — exactly the rule that should blow up the grammar. + var redup = new AffixProcessRule { Name = "redup", Gloss = "INTENS" }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + + GrammarFstReport after = GrammarFstAdvisor.Analyze(Language); + + Assert.That(after.EscapeCount, Is.EqualTo(1), after.Format()); + GrammarAdvisory escape = after.Escapes.Single(); + Assert.That(escape.Rule, Is.EqualTo("redup")); + Assert.That(escape.Severity, Is.EqualTo(GrammarAdvisorySeverity.Escape)); + Assert.That(escape.Issue, Does.Contain("Reduplication")); + Assert.That(escape.Advice, Is.Not.Empty); + // No phonological rule applies after it, so the escape is probe-able (clean). + Assert.That(escape.Probeable, Is.True); + Assert.That(after.ProbeableEscapeCount, Is.EqualTo(1)); + // Copying the whole stem (part "1" is OneOrMore) is the one genuinely non-regular case. + Assert.That(escape.Regular, Is.False); + Assert.That(after.NonRegularEscapeCount, Is.EqualTo(1)); + // The tier verdict changed: this is the warning a grammar engineer sees. + Assert.That(after.Tier, Is.Not.EqualTo(before.Tier)); + Assert.That(after.Tier, Does.StartWith("Tier 2")); + + Morphophonemic.MorphologicalRules.Remove(redup); + } + + [Test] + public void Analyze_RealizationalReduplication_IsExamined() + { + // RealizationalAffixProcessRule also implements IMorphologicalRule and has Allomorphs, so a + // reduplication encoded on one must be examined and flagged — it was previously skipped. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + Assert.That(GrammarFstAdvisor.Analyze(Language).EscapeCount, Is.EqualTo(0), "baseline has no escapes"); + + var redup = new RealizationalAffixProcessRule + { + Name = "real_redup", + Gloss = "INTENS", + RealizationalFeatureStruct = FeatureStruct + .New(Language.SyntacticFeatureSystem) + .Feature(Head) + .EqualTo(head => head.Feature("tense").EqualTo("past")) + .Value, + }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + + GrammarFstReport after = GrammarFstAdvisor.Analyze(Language); + Assert.That(after.EscapeCount, Is.EqualTo(1), after.Format()); + Assert.That(after.Escapes.Single().Rule, Is.EqualTo("real_redup")); + Assert.That(after.Escapes.Single().Issue, Does.Contain("Reduplication")); + + Morphophonemic.MorphologicalRules.Remove(redup); + } + + [Test] + public void Analyze_ReduplicationWithLaterPhonology_IsOpaque() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + var redup = new AffixProcessRule { Name = "redup", Gloss = "INTENS" }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + + // A phonological rule in a LATER stratum can rewrite the reduplicated span, so the + // strip-and-reparse probe is no longer sound — the escape is opaque (needs the backstop). + var rule = new RewriteRule { Name = "t_rule", Lhs = Pattern.New().Annotation(any).Value }; + Surface.PhonologicalRules.Add(rule); + + GrammarFstReport report = GrammarFstAdvisor.Analyze(Language); + + GrammarAdvisory escape = report.Escapes.Single(a => a.Rule == "redup"); + Assert.That(escape.Probeable, Is.False, report.Format()); + Assert.That(report.OpaqueEscapeCount, Is.EqualTo(1)); + Assert.That(report.Tier, Does.StartWith("Tier 2 candidate — hybrid")); + + Morphophonemic.MorphologicalRules.Remove(redup); + Surface.PhonologicalRules.Remove(rule); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/HermitCrabTestBase.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/HermitCrabTestBase.cs index 2d1afc619..065d47fef 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/HermitCrabTestBase.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/HermitCrabTestBase.cs @@ -1,6 +1,5 @@ using System.Text; using NUnit.Framework; -using SIL.Machine.Annotations; using SIL.Machine.FeatureModel; using SIL.Machine.Matching; using SIL.ObjectModel; @@ -682,11 +681,7 @@ public void FixtureSetUp() entry .Allomorphs[0] .Environments.Add( - new AllomorphEnvironment( - ConstraintType.Require, - null, - Pattern.New().Annotation(vowel).Value - ) + new AllomorphEnvironment(ConstraintType.Require, null, Pattern.New().Annotation(vowel).Value) ); entry = AddEntry( @@ -710,26 +705,18 @@ public void FixtureSetUp() new AllomorphEnvironment( ConstraintType.Require, null, - Pattern.New().Annotation(unroundedVowel).Value + Pattern.New().Annotation(unroundedVowel).Value ) ); entry .Allomorphs[1] .Environments.Add( - new AllomorphEnvironment( - ConstraintType.Require, - null, - Pattern.New().Annotation(vowel).Value - ) + new AllomorphEnvironment(ConstraintType.Require, null, Pattern.New().Annotation(vowel).Value) ); entry .Allomorphs[2] .Environments.Add( - new AllomorphEnvironment( - ConstraintType.Require, - null, - Pattern.New().Annotation(vowel).Value - ) + new AllomorphEnvironment(ConstraintType.Require, null, Pattern.New().Annotation(vowel).Value) ); entry = AddEntry( diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/LeverTwoSpikeTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/LeverTwoSpikeTests.cs new file mode 100644 index 000000000..208c7ef63 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/LeverTwoSpikeTests.cs @@ -0,0 +1,224 @@ +using NUnit.Framework; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// LEVER_2.md spike (algorithm-level, symbol alphabet): prove that LAZY composition of an inverse- +/// phonology transducer (Pinv: surface→underlying) with a morphotactic acceptor (Lex: underlying, tokens +/// on states) recovers a deleted segment AND that the lexicon constrains the restoration — +/// the property that the runtime inverse lacked (it restored deleted segments everywhere → garbage). +/// +/// Toy: root "sat" + suffix "-d", rule t→∅ / _d, so sat+d = "satd" → surface "sad". Analyzing "sad" must +/// restore the deleted "t" to recover [sat, -d] — but a t-restoration must only survive where Lex has a +/// "t" arc. Deletion (not substitution) is the case every prior approach died on, so the spike targets it. +/// +public class LeverTwoSpikeTests +{ + // A tiny transition graph. Arc: (input symbol or "" for ε, output symbol, target state). + private sealed class Graph + { + public readonly Dictionary> Arcs = new(); + public readonly Dictionary TokenOnEntry = new(); + public readonly HashSet Accepting = new(); + public int Start; + + public void Add(int from, string inSym, string outSym, int to) + { + if (!Arcs.TryGetValue(from, out var list)) + Arcs[from] = list = new List<(string, string, int)>(); + list.Add((inSym, outSym, to)); + } + + public IEnumerable<(string In, string Out, int To)> From(int s) => + Arcs.TryGetValue(s, out var list) ? list : Enumerable.Empty<(string, string, int)>(); + } + + // Pinv: surface → underlying. Identity on s/a/d, plus an ε:t arc (restore a deleted t) that must be + // followed by consuming a "d" — i.e. "a t was deleted before this d". + private static Graph BuildPinv() + { + var g = new Graph { Start = 0 }; + g.Accepting.Add(0); + g.Add(0, "s", "s", 0); + g.Add(0, "a", "a", 0); + g.Add(0, "d", "d", 0); + g.Add(0, "", "t", 1); // ε-input: restore an underlying t (consumes no surface) + g.Add(1, "d", "d", 0); // the restored t must be immediately before a surface d + return g; + } + + // Lex: underlying acceptor. Path s-a-t-(root "sat")-d-(suffix "-d"). Optionally a second root "sad" + // (no t) to show the t-restoration is NOT taken where the lexicon lacks a t. + private static Graph BuildLex(bool includeSadRoot) + { + var g = new Graph { Start = 0 }; + // root "sat" then suffix "d": 0-s->1-a->2-t->3 (root) -d->4 (suffix) + g.Add(0, "s", "s", 1); + g.Add(1, "a", "a", 2); + g.Add(2, "t", "t", 3); + g.TokenOnEntry[3] = "sat"; + g.Add(3, "d", "d", 4); + g.TokenOnEntry[4] = "-d"; + g.Accepting.Add(4); + if (includeSadRoot) + { + // a distinct bare root "sad": 0-s->10-a->11-d->12 (root "sad", accepting). No t arc. + g.Add(0, "s", "s", 10); + g.Add(10, "a", "a", 11); + g.Add(11, "d", "d", 12); + g.TokenOnEntry[12] = "sad"; + g.Accepting.Add(12); + } + return g; + } + + /// On-the-fly product walk of Pinv ⊗ Lex over the surface. A config is + /// (pinvState, lexState, tokens). Pinv consumes surface and emits underlying; that underlying must + /// unify (here: equal) a Lex arc, which advances Lex and accrues its token. ε-input Pinv arcs + /// (restorations) advance in the closure without consuming surface. + private static HashSet Analyze(Graph pinv, Graph lex, string surface) + { + var start = new List<(int P, int L, string Toks)> { (pinv.Start, lex.Start, "") }; + List<(int P, int L, string Toks)> frontier = Closure(pinv, lex, start); + foreach (char c in surface) + { + string s = c.ToString(); + var next = new List<(int, int, string)>(); + foreach ((int p, int l, string toks) in frontier) + { + foreach ((string inSym, string outSym, int pTo) in pinv.From(p)) + { + if (inSym != s) + continue; // this arc consumes a different surface symbol + foreach ((string lin, string _, int lTo) in lex.From(l)) + { + if (lin == outSym) + next.Add((pTo, lTo, toks + Tok(lex, lTo))); + } + } + } + frontier = Closure(pinv, lex, next); + if (frontier.Count == 0) + break; + } + var results = new HashSet(); + foreach ((int p, int l, string toks) in frontier) + { + if (pinv.Accepting.Contains(p) && lex.Accepting.Contains(l)) + results.Add(toks.Trim('+')); + } + return results; + } + + // Apply ε-input Pinv arcs (deletion restorations) to fixpoint: each emits an underlying symbol that + // must unify a Lex arc (the lexicon constraint that prunes spurious restorations). + private static List<(int P, int L, string Toks)> Closure( + Graph pinv, + Graph lex, + List<(int P, int L, string Toks)> configs + ) + { + var seen = new HashSet<(int, int, string)>(configs); + var result = new List<(int, int, string)>(configs); + var stack = new Stack<(int P, int L, string Toks)>(configs); + while (stack.Count > 0) + { + (int p, int l, string toks) = stack.Pop(); + foreach ((string inSym, string outSym, int pTo) in pinv.From(p)) + { + if (inSym != "") + continue; // only ε-input arcs in the closure + foreach ((string lin, string _, int lTo) in lex.From(l)) + { + if (lin != outSym) + continue; // restoration only survives where the lexicon has this underlying symbol + var nc = (pTo, lTo, toks + Tok(lex, lTo)); + if (seen.Add(nc)) + { + result.Add(nc); + stack.Push(nc); + } + } + } + } + return result; + } + + private static string Tok(Graph lex, int state) => + lex.TokenOnEntry.TryGetValue(state, out string? t) ? "+" + t : ""; + + [Test] + public void LazyComposition_RecoversDeletedSegment() + { + HashSet got = Analyze(BuildPinv(), BuildLex(includeSadRoot: false), "sad"); + Assert.That(got, Does.Contain("sat+-d"), "must restore the deleted t and recover [sat, -d]"); + Assert.That(got.Count, Is.EqualTo(1), "no spurious analyses — restoration is lexicon-constrained"); + } + + [Test] + public void LazyComposition_RestorationIsLexiconConstrained() + { + // With a bare root "sad" added, "sad" has TWO valid analyses: the bare root (no restoration) and + // the deleted-t form. The walk finds exactly those — restoration fires only where Lex has a t. + HashSet got = Analyze(BuildPinv(), BuildLex(includeSadRoot: true), "sad"); + Assert.That(got, Does.Contain("sat+-d"), "deleted-t analysis"); + Assert.That(got, Does.Contain("sad"), "bare-root analysis (no restoration)"); + Assert.That( + got.Count, + Is.EqualTo(2), + "exactly the two lexicon-valid analyses — no garbage from over-restoration" + ); + } + + [Test] + public void LazyComposition_NonWordYieldsNothing() + { + // "saa": no Lex path accepts it, with or without restoration → empty (the t-restoration cannot + // rescue it because Lex never has the needed arcs). Soundness of the mechanism. + HashSet got = Analyze(BuildPinv(), BuildLex(includeSadRoot: true), "saa"); + Assert.That(got, Is.Empty, "a non-word must yield nothing even with deletion-restoration available"); + } + + // ---- Two-rule FEEDING/OPACITY cascade: N→n / _t, then t→∅ / n_. Underlying aN+t = "aNt" → "ant" + // (assimilation) → "an" (the t that TRIGGERED the assimilation then deletes — counterbleeding + // opacity). On the surface "an" the conditioning t is gone, so this is exactly the interacting-rule + // case the advisor flagged and that produced ⁿmeⁿnⁿpuⁿlis in the runtime inverse. ---- + + private static Graph BuildCascadePinv() + { + var g = new Graph { Start = 0 }; + g.Accepting.Add(0); + g.Add(0, "a", "a", 0); + g.Add(0, "n", "n", 0); // identity: surface n was underlying n + g.Add(0, "n", "N", 1); // un-assimilate: surface n was underlying N (before a now-deleted t)... + g.Add(1, "", "t", 0); // ...so restore the deleted t (ε-input). The two ops are COUPLED via state 1. + return g; + } + + private static Graph BuildCascadeLex() + { + // root "aN" (underlying, ends in the archiphoneme N) + suffix "-t": 0-a->1-N->2 (root) -t->3 (suffix) + var g = new Graph { Start = 0 }; + g.Add(0, "a", "a", 1); + g.Add(1, "N", "N", 2); + g.TokenOnEntry[2] = "aN"; + g.Add(2, "t", "t", 3); + g.TokenOnEntry[3] = "-t"; + g.Accepting.Add(3); + return g; + } + + [Test] + public void LazyComposition_RecoversOpaqueTwoRuleCascade() + { + // The decisive test: a bounded-context Pinv that COUPLES un-assimilation (n→N) with deletion- + // restoration (ε→t) recovers the opaque underlying form. surface "an" → [aN, -t]. + HashSet got = Analyze(BuildCascadePinv(), BuildCascadeLex(), "an"); + Assert.That(got, Does.Contain("aN+-t"), "lazy composition must recover the opaque cascade aNt→an"); + Assert.That( + got.Count, + Is.EqualTo(1), + "exactly the lexicon-valid analysis — opacity restoration is constrained" + ); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/LexEntryTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/LexEntryTests.cs index 191a6b5ed..e789d1c0d 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/LexEntryTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/LexEntryTests.cs @@ -24,7 +24,7 @@ public void DisjunctiveAllomorphs() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+ɯd") }, } ); @@ -62,14 +62,14 @@ public void FreeFluctuation() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+t") }, } ); edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+"), new InsertSimpleContext(d) }, } ); @@ -101,7 +101,7 @@ public void StemNames() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+ɯd") }, } ); @@ -121,7 +121,7 @@ public void StemNames() tSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+t") }, } ); @@ -141,7 +141,7 @@ public void StemNames() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+s") }, } ); @@ -180,7 +180,7 @@ public void BoundRootAllomorph() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+ɯd") }, } ); @@ -196,7 +196,7 @@ public void AllomorphEnvironments() var vowel = FeatureStruct.New(Language.PhonologicalFeatureSystem).Symbol("voc+").Value; LexEntry headEntry = Entries["32"]; - Pattern envPattern = Pattern.New().Annotation(vowel).Value; + Pattern envPattern = Pattern.New().Annotation(vowel).Value; var env = new AllomorphEnvironment(ConstraintType.Require, null, envPattern); headEntry.PrimaryAllomorph.Environments.Add(env); @@ -276,7 +276,7 @@ public void PartialEntry() nominalizer.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "v") }, } ); @@ -297,7 +297,7 @@ public void PartialEntry() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, } ); diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenCodecTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenCodecTests.cs new file mode 100644 index 000000000..bcfe83cf9 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenCodecTests.cs @@ -0,0 +1,122 @@ +using NUnit.Framework; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// Proves the packed-token schema (HERMITCRAB_FST_PLAN.md §8) faithfully represents a real HC +/// analysis: encoding a parsed and decoding it reproduces the morphemes and +/// root that WordAnalysis carries, with the operation populated from the actual rule — +/// including the multi-stem (compound) case that the flat array must not lose. +/// +public class MorphTokenCodecTests : HermitCrabTestBase +{ + [Test] + public void Encode_Suffix_RoundTripsToWordAnalysis() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var sSuffix = new AffixProcessRule + { + Name = "s_suffix", + Gloss = "NMLZ", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + sSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, + } + ); + Morphophonemic.MorphologicalRules.Add(sSuffix); + + var morpher = new Morpher(TraceManager, Language); + List words = morpher.ParseWord("sags").ToList(); + List analyses = morpher.AnalyzeWord("sags").ToList(); + Assert.That(words, Has.Count.EqualTo(1)); + Assert.That(analyses, Has.Count.EqualTo(1)); + + var codec = new MorphTokenCodec(); + uint[] tokens = codec.Encode(words[0]); + WordAnalysis wa = analyses[0]; + + // Morpheme channel: decoded indices reproduce WordAnalysis.Morphemes, in order. + Assert.That( + tokens.Select(t => codec.GetMorpheme(MorphToken.GetMorphemeId(t)).Id), + Is.EqualTo(wa.Morphemes.Select(m => m.Id)) + ); + // Root recovered purely from the op codes == HC's RootMorphemeIndex (no separate field). + Assert.That(MorphToken.RootIndex(tokens), Is.EqualTo(wa.RootMorphemeIndex)); + // Op channel is populated from the real rule: a root and a suffix. + var ops = tokens.Select(MorphToken.GetOp).ToList(); + Assert.That(ops, Does.Contain(MorphOp.Root)); + Assert.That(ops, Does.Contain(MorphOp.Suffix)); + + Morphophonemic.MorphologicalRules.Remove(sSuffix); + } + + [Test] + public void Encode_Compound_KeepsBothStems_OneRoot() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var rule1 = new CompoundingRule { Name = "rule1" }; + Allophonic.MorphologicalRules.Add(rule1); + rule1.Subrules.Add( + new CompoundingSubrule + { + HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, + NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("head"), new InsertSegments(Table3, "+"), new CopyFromInput("nonHead") }, + } + ); + + var morpher = new Morpher(TraceManager, Language); + List words = morpher.ParseWord("pʰutdat").ToList(); + List analyses = morpher.AnalyzeWord("pʰutdat").ToList(); + Assert.That(words, Is.Not.Empty); + + // Match each encoded word to a WordAnalysis by morpheme sequence (decoupled from order). + var codec = new MorphTokenCodec(); + foreach (Word w in words) + { + uint[] tokens = codec.Encode(w); + + // Two stems → two morphemes; exactly one tagged Root, the other Compound (not lost). + Assert.That(tokens, Has.Length.EqualTo(2)); + Assert.That(tokens.Count(t => MorphToken.GetOp(t) == MorphOp.Root), Is.EqualTo(1)); + Assert.That(tokens.Select(MorphToken.GetOp), Does.Contain(MorphOp.Compound)); + + string[] decoded = tokens.Select(t => codec.GetMorpheme(MorphToken.GetMorphemeId(t)).Id).ToArray(); + WordAnalysis? match = analyses.FirstOrDefault(a => a.Morphemes.Select(m => m.Id).SequenceEqual(decoded)); + Assert.That(match, Is.Not.Null, $"no WordAnalysis matches decoded morphemes [{string.Join(",", decoded)}]"); + Assert.That(MorphToken.RootIndex(tokens), Is.EqualTo(match!.RootMorphemeIndex)); + } + + Allophonic.MorphologicalRules.Remove(rule1); + } + + [Test] + public void ClassifyOp_PopulatesAffixRolesFromOutputActions() + { + Assert.That(RoleOf(new CopyFromInput("1"), new CopyFromInput("1")), Is.EqualTo(MorphOp.Reduplication)); + Assert.That( + RoleOf(new CopyFromInput("1"), new InsertSegments(Table3, "a"), new CopyFromInput("2")), + Is.EqualTo(MorphOp.Infix) + ); + Assert.That(RoleOf(new InsertSegments(Table3, "di"), new CopyFromInput("1")), Is.EqualTo(MorphOp.Prefix)); + Assert.That(RoleOf(new CopyFromInput("1"), new InsertSegments(Table3, "s")), Is.EqualTo(MorphOp.Suffix)); + } + + private static MorphOp RoleOf(params MorphologicalOutputAction[] rhs) + { + var allo = new AffixProcessAllomorph(); + foreach (MorphologicalOutputAction action in rhs) + { + allo.Rhs.Add(action); + } + return MorphTokenCodec.ClassifyOp(allo, isHeadRoot: false); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenTests.cs new file mode 100644 index 000000000..fb9d1590a --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenTests.cs @@ -0,0 +1,67 @@ +using NUnit.Framework; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// The packed 32-bit analysis token (HERMITCRAB_FST_PLAN.md §8): 8-bit MorphOp + 24-bit morpheme +/// index, with the derivation array being self-describing (morpheme order = array order; root = +/// the Root token's position). +/// +public class MorphTokenTests +{ + [Test] + public void Encode_RoundTripsOpAndMorphemeId() + { + foreach (MorphOp op in System.Enum.GetValues(typeof(MorphOp))) + { + foreach (int id in new[] { 0, 1, 42, MorphToken.MaxMorphemeId }) + { + uint token = MorphToken.Encode(op, id); + Assert.That(MorphToken.GetOp(token), Is.EqualTo(op), $"op for id {id}"); + Assert.That(MorphToken.GetMorphemeId(token), Is.EqualTo(id), $"id for op {op}"); + } + } + } + + [Test] + public void Encode_IdOutOfRange_Throws() + { + Assert.Throws(() => + MorphToken.Encode(MorphOp.Root, MorphToken.MaxMorphemeId + 1) + ); + Assert.Throws(() => MorphToken.Encode(MorphOp.Root, -1)); + } + + [Test] + public void Encode_DistinctInputsGiveDistinctTokens() + { + // Different op, same id → different token. + Assert.That(MorphToken.Encode(MorphOp.Prefix, 7), Is.Not.EqualTo(MorphToken.Encode(MorphOp.Suffix, 7))); + // Same op, different id → different token. + Assert.That(MorphToken.Encode(MorphOp.Suffix, 7), Is.Not.EqualTo(MorphToken.Encode(MorphOp.Suffix, 8))); + } + + [Test] + public void Derivation_ArrayIsSelfDescribing() + { + // prefix m10 · root m20 · suffix m30 — a whole WordAnalysis in 12 bytes. + uint[] derivation = + { + MorphToken.Encode(MorphOp.Prefix, 10), + MorphToken.Encode(MorphOp.Root, 20), + MorphToken.Encode(MorphOp.Suffix, 30), + }; + + // Morphemes in order = the array's morpheme indices in array order. + Assert.That(System.Array.ConvertAll(derivation, MorphToken.GetMorphemeId), Is.EqualTo(new[] { 10, 20, 30 })); + // RootMorphemeIndex falls out of the op codes — no separate field needed. + Assert.That(MorphToken.RootIndex(derivation), Is.EqualTo(1)); + } + + [Test] + public void RootIndex_NoRoot_ReturnsMinusOne() + { + uint[] derivation = { MorphToken.Encode(MorphOp.Prefix, 1), MorphToken.Encode(MorphOp.Suffix, 2) }; + Assert.That(MorphToken.RootIndex(derivation), Is.EqualTo(-1)); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs index eb8944ad0..8245d17a1 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs @@ -25,7 +25,7 @@ public void AnalyzeWord_CanAnalyze_ReturnsCorrectAnalysis() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+d") }, } ); @@ -54,7 +54,7 @@ public void AnalyzeWord_CanAnalyzeLinear_ReturnsCorrectAnalysis() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+d") }, } ); @@ -71,7 +71,7 @@ public void AnalyzeWord_CanAnalyzeLinear_ReturnsCorrectAnalysis() tSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+t") }, } ); @@ -82,10 +82,10 @@ public void AnalyzeWord_CanAnalyzeLinear_ReturnsCorrectAnalysis() var rule1 = new RewriteRule { Name = "rule1", - Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, }; rule1.Subrules.Add( - new RewriteSubrule { Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value } + new RewriteSubrule { Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value } ); Morphophonemic.PhonologicalRules.Add(rule1); @@ -113,7 +113,7 @@ public void AnalyzeWord_CannotAnalyze_ReturnsEmptyEnumerable() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+d") }, } ); @@ -143,7 +143,7 @@ public void AnalyzeWord_CannotAnalyzeDueToAllomorphCooccurenceFailure_ReturnsEmp edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+d") }, } ); @@ -202,7 +202,7 @@ public void AnalyzeWord_CannotAnalyzeDueToMorphemeCooccurenceFailure_ReturnsEmpt edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+d") }, } ); @@ -253,7 +253,7 @@ public void AnalyzeWord_CanGuess_ReturnsCorrectAnalysis() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+d") }, } ); @@ -288,7 +288,7 @@ public void GenerateWords_CanGenerate_ReturnsCorrectWord() siPrefix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new InsertSegments(Table3, "si+"), new CopyFromInput("1") }, } ); @@ -304,7 +304,7 @@ public void GenerateWords_CanGenerate_ReturnsCorrectWord() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+ɯd") }, } ); @@ -333,7 +333,7 @@ public void GenerateWords_CannotGenerate_ReturnsEmptyEnumerable() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+ɯd") }, } ); @@ -454,4 +454,103 @@ IList GetNodes(string pattern) Shape shape = new Segments(Table2, pattern, true).Shape; return shape.GetNodes(shape.Range).ToList(); } + + [Test] + public void AnalyzeWord_SingleThreaded_MatchesParallel() + { + // Build a small Unordered grammar (the order FieldWorks uses, which exercises the + // parallel analysis cascade and parallel affix-template unapplication). + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var edSuffix = new AffixProcessRule + { + Id = "PAST", + Name = "ed_suffix", + Gloss = "PAST", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + edSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+d") }, + } + ); + Morphophonemic.MorphologicalRules.Add(edSuffix); + + var parallel = new Morpher(TraceManager, Language); // default: Environment.ProcessorCount + var singleThreaded = new Morpher(TraceManager, Language, maxDegreeOfParallelism: 1); + + Assert.That(singleThreaded.MaxDegreeOfParallelism, Is.EqualTo(1)); + + // The single-threaded cascade (MaxDegreeOfParallelism == 1) must produce the same analyses + // as the parallel cascade. + IEnumerable singleResult = singleThreaded.AnalyzeWord("sagd").ToList(); + IEnumerable parallelResult = parallel.AnalyzeWord("sagd").ToList(); + Assert.That( + singleResult, + Is.EquivalentTo(parallelResult), + "single-threaded analysis must match the parallel analysis" + ); + } + + [Test] + public void AnalyzeWord_ConcurrentRepeatedParsing_IsDeterministic() + { + // Concurrency safety net for the copy-on-write refactors (Plans A & B): many threads + // parse against one shared frozen grammar whose FeatureStructs become shared into + // per-parse clones. A COW race would show up as a nondeterministic analysis. Unordered + // order exercises the parallel cascade + affix-template paths. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var edSuffix = new AffixProcessRule + { + Id = "PAST", + Name = "ed_suffix", + Gloss = "PAST", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + edSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+d") }, + } + ); + Morphophonemic.MorphologicalRules.Add(edSuffix); + + var morpher = new Morpher(TraceManager, Language); + var words = new[] { "sagd", "sag", "tag", "tagd", "gag", "xyzzy" }; + Dictionary baseline = words.ToDictionary(w => w, w => AnalysisSignature(morpher, w)); + + for (int iter = 0; iter < 50; iter++) + { + var results = new System.Collections.Concurrent.ConcurrentDictionary(); + System.Threading.Tasks.Parallel.ForEach( + Enumerable.Range(0, 250), + i => + { + string w = words[i % words.Length]; + results[w] = AnalysisSignature(morpher, w); + } + ); + foreach (string w in words) + { + Assert.That( + results[w], + Is.EqualTo(baseline[w]), + $"nondeterministic analysis for '{w}' on iteration {iter}" + ); + } + } + } + + private static string AnalysisSignature(Morpher morpher, string word) + { + return string.Join( + "|", + morpher + .AnalyzeWord(word) + .Select(a => string.Join("+", a.Morphemes.Select(m => m.Id)) + ":" + a.RootMorphemeIndex) + .OrderBy(s => s, System.StringComparer.Ordinal) + ); + } } diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphologicalRules/AffixProcessRuleTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphologicalRules/AffixProcessRuleTests.cs index 329074617..2d399183b 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphologicalRules/AffixProcessRuleTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphologicalRules/AffixProcessRuleTests.cs @@ -1,6 +1,5 @@ using NUnit.Framework; using SIL.Extensions; -using SIL.Machine.Annotations; using SIL.Machine.FeatureModel; using SIL.Machine.Matching; using SIL.Machine.Morphology.HermitCrab.PhonologicalRules; @@ -24,7 +23,7 @@ public void MorphosyntacticRules() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, } ); @@ -89,7 +88,7 @@ public void MorphosyntacticRules() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "d") }, } ); @@ -121,7 +120,7 @@ public void PercolationRules() rule1.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "z") }, } ); @@ -331,21 +330,21 @@ public void SuffixRules() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(strident).Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(strident).Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "ɯz") }, } ); sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(voicelessCons).Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(voicelessCons).Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, } ); sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "z") }, } ); @@ -367,8 +366,8 @@ public void SuffixRules() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(alvStop).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(alvStop).Value, }, Rhs = { new CopyFromInput("1"), new CopyFromInput("2"), new InsertSegments(Table3, "+ɯd") }, } @@ -376,14 +375,14 @@ public void SuffixRules() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(voicelessCons).Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(voicelessCons).Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+t") }, } ); edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+"), new InsertSimpleContext(d) }, } ); @@ -392,13 +391,13 @@ public void SuffixRules() var prule1 = new RewriteRule { Name = "rule1", - Lhs = Pattern.New().Annotation(Character(Table3, "t")).Value, + Lhs = Pattern.New().Annotation(Character(Table3, "t")).Value, }; prule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(unasp).Value, - LeftEnvironment = Pattern.New().Annotation(cons).Value, + Rhs = Pattern.New().Annotation(unasp).Value, + LeftEnvironment = Pattern.New().Annotation(cons).Value, } ); Allophonic.PhonologicalRules.Add(prule1); @@ -445,7 +444,7 @@ public void SuffixRules() { Lhs = { - Pattern + Pattern .New("1") .Annotation(any) .OneOrMore.Annotation( @@ -536,21 +535,21 @@ public void PrefixRules() sPrefix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(strident).Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(strident).Annotation(any).OneOrMore.Value }, Rhs = { new InsertSegments(Table3, "zi"), new CopyFromInput("1") }, } ); sPrefix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(voicelessCons).Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(voicelessCons).Annotation(any).OneOrMore.Value }, Rhs = { new InsertSegments(Table3, "s"), new CopyFromInput("1") }, } ); sPrefix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new InsertSegments(Table3, "z"), new CopyFromInput("1") }, } ); @@ -570,21 +569,21 @@ public void PrefixRules() edPrefix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(alvStop).Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(alvStop).Annotation(any).OneOrMore.Value }, Rhs = { new InsertSegments(Table3, "di+"), new CopyFromInput("1") }, } ); edPrefix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(voicelessCons).Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(voicelessCons).Annotation(any).OneOrMore.Value }, Rhs = { new InsertSegments(Table3, "t+"), new CopyFromInput("1") }, } ); edPrefix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new InsertSegments(Table3, "d+"), new CopyFromInput("1") }, } ); @@ -593,9 +592,9 @@ public void PrefixRules() var aspiration = new RewriteRule { Name = "aspiration", - Lhs = Pattern.New().Annotation(voicelessStop).Value, + Lhs = Pattern.New().Annotation(voicelessStop).Value, }; - aspiration.Subrules.Add(new RewriteSubrule { Rhs = Pattern.New().Annotation(unasp).Value }); + aspiration.Subrules.Add(new RewriteSubrule { Rhs = Pattern.New().Annotation(unasp).Value }); Allophonic.PhonologicalRules.Add(aspiration); var morpher = new Morpher(TraceManager, Language); @@ -620,7 +619,7 @@ public void PrefixRules() { Lhs = { - Pattern + Pattern .New("1") .Annotation( FeatureStruct @@ -659,8 +658,8 @@ public void PrefixRules() { Lhs = { - Pattern.New("1").Annotation(cons).Value, - Pattern + Pattern.New("1").Annotation(cons).Value, + Pattern .New("2") .Annotation( FeatureStruct @@ -670,7 +669,7 @@ public void PrefixRules() .Value ) .Value, - Pattern.New("3").Annotation(any).OneOrMore.Value, + Pattern.New("3").Annotation(any).OneOrMore.Value, }, Rhs = { @@ -728,9 +727,9 @@ public void InfixRules() { Lhs = { - Pattern.New("1").Annotation(cons).Value, - Pattern.New("2").Annotation(cons).Value, - Pattern.New("3").Annotation(cons).Value, + Pattern.New("1").Annotation(cons).Value, + Pattern.New("2").Annotation(cons).Value, + Pattern.New("3").Annotation(cons).Value, }, Rhs = { @@ -760,9 +759,9 @@ public void InfixRules() { Lhs = { - Pattern.New("1").Annotation(cons).Value, - Pattern.New("2").Annotation(cons).Value, - Pattern.New("3").Annotation(cons).Value, + Pattern.New("1").Annotation(cons).Value, + Pattern.New("2").Annotation(cons).Value, + Pattern.New("3").Annotation(cons).Value, }, Rhs = { @@ -792,8 +791,8 @@ public void InfixRules() { Lhs = { - Pattern.New("1").Annotation(cons).Annotation(cons).Value, - Pattern.New("2").Annotation(cons).Value, + Pattern.New("1").Annotation(cons).Annotation(cons).Value, + Pattern.New("2").Annotation(cons).Value, }, Rhs = { @@ -822,9 +821,9 @@ public void InfixRules() { Lhs = { - Pattern.New("1").Annotation(cons).Value, - Pattern.New("2").Annotation(cons).Value, - Pattern.New("3").Annotation(cons).Value, + Pattern.New("1").Annotation(cons).Value, + Pattern.New("2").Annotation(cons).Value, + Pattern.New("3").Annotation(cons).Value, }, Rhs = { @@ -841,9 +840,9 @@ public void InfixRules() var aspiration = new RewriteRule { Name = "aspiration", - Lhs = Pattern.New().Annotation(voicelessStop).Value, + Lhs = Pattern.New().Annotation(voicelessStop).Value, }; - aspiration.Subrules.Add(new RewriteSubrule { Rhs = Pattern.New().Annotation(unasp).Value }); + aspiration.Subrules.Add(new RewriteSubrule { Rhs = Pattern.New().Annotation(unasp).Value }); Allophonic.PhonologicalRules.Add(aspiration); var morpher = new Morpher(TraceManager, Language); @@ -902,8 +901,8 @@ public void SimulfixRules() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(p).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(p).Value, }, Rhs = { new CopyFromInput("1"), new ModifyFromInput("2", voiced) }, } @@ -919,8 +918,8 @@ public void SimulfixRules() { Lhs = { - Pattern.New("1").Annotation(p).Value, - Pattern.New("2").Annotation(any).OneOrMore.Value, + Pattern.New("1").Annotation(p).Value, + Pattern.New("2").Annotation(any).OneOrMore.Value, }, Rhs = { new ModifyFromInput("1", voiced), new CopyFromInput("2") }, } @@ -936,9 +935,9 @@ public void SimulfixRules() { Lhs = { - Pattern.New("1").Annotation(cons).Optional.Value, - Pattern.New("2").Annotation(vowel).Value, - Pattern.New("3").Annotation(any).OneOrMore.Value, + Pattern.New("1").Annotation(cons).Optional.Value, + Pattern.New("2").Annotation(vowel).Value, + Pattern.New("3").Annotation(any).OneOrMore.Value, }, Rhs = { new CopyFromInput("1"), new ModifyFromInput("2", nonround), new CopyFromInput("3") }, } @@ -953,9 +952,9 @@ public void SimulfixRules() { Lhs = { - Pattern.New("1").Annotation(cons).Optional.Value, - Pattern.New("2").Annotation(vowel).Range(1, 2).Value, - Pattern.New("3").Annotation(any).OneOrMore.Value, + Pattern.New("1").Annotation(cons).Optional.Value, + Pattern.New("2").Annotation(vowel).Range(1, 2).Value, + Pattern.New("3").Annotation(any).OneOrMore.Value, }, Rhs = { new CopyFromInput("1"), new ModifyFromInput("2", nonround), new CopyFromInput("3") }, } @@ -1007,8 +1006,8 @@ public void ReduplicationRules() { Lhs = { - Pattern.New("1").Annotation(cons).Annotation(vowel).Value, - Pattern.New("2").Annotation(any).OneOrMore.Value, + Pattern.New("1").Annotation(cons).Annotation(vowel).Value, + Pattern.New("2").Annotation(any).OneOrMore.Value, }, Rhs = { new CopyFromInput("1"), new CopyFromInput("1"), new CopyFromInput("2") }, } @@ -1021,14 +1020,14 @@ public void ReduplicationRules() var voicing = new RewriteRule { Name = "voicing", - Lhs = Pattern.New().Annotation(Character(Table1, "s")).Value, + Lhs = Pattern.New().Annotation(Character(Table1, "s")).Value, }; voicing.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(voiced).Value, - LeftEnvironment = Pattern.New().Annotation(vowel).Value, - RightEnvironment = Pattern.New().Annotation(vowel).Value, + Rhs = Pattern.New().Annotation(voiced).Value, + LeftEnvironment = Pattern.New().Annotation(vowel).Value, + RightEnvironment = Pattern.New().Annotation(vowel).Value, } ); Allophonic.PhonologicalRules.Add(voicing); @@ -1039,13 +1038,13 @@ public void ReduplicationRules() var affrication = new RewriteRule { Name = "affrication", - Lhs = Pattern.New().Annotation(Character(Table1, "s")).Value, + Lhs = Pattern.New().Annotation(Character(Table1, "s")).Value, }; affrication.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(affricate).Value, - LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, + Rhs = Pattern.New().Annotation(affricate).Value, + LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, } ); Allophonic.PhonologicalRules.Add(affrication); @@ -1059,8 +1058,8 @@ public void ReduplicationRules() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(vowel).Annotation(cons).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(vowel).Annotation(cons).Value, }, Rhs = { new CopyFromInput("1"), new CopyFromInput("2"), new CopyFromInput("2") }, } @@ -1077,8 +1076,8 @@ public void ReduplicationRules() { Lhs = { - Pattern.New("1").Annotation(any).ZeroOrMore.Value, - Pattern.New("2").Annotation(cons).Annotation(vowel).Annotation(cons).Value, + Pattern.New("1").Annotation(any).ZeroOrMore.Value, + Pattern.New("2").Annotation(cons).Annotation(vowel).Annotation(cons).Value, }, Rhs = { new CopyFromInput("1"), new CopyFromInput("2"), new CopyFromInput("2") }, } @@ -1094,8 +1093,8 @@ public void ReduplicationRules() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(vowel).Annotation(cons).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(vowel).Annotation(cons).Value, }, Rhs = { new CopyFromInput("1"), new CopyFromInput("2"), new CopyFromInput("2") }, } @@ -1104,13 +1103,13 @@ public void ReduplicationRules() var gDelete = new RewriteRule { Name = "g_delete", - Lhs = Pattern.New().Annotation(Character(Table1, "g")).Value, + Lhs = Pattern.New().Annotation(Character(Table1, "g")).Value, }; gDelete.Subrules.Add( new RewriteSubrule { - LeftEnvironment = Pattern.New().Annotation(vowel).Value, - RightEnvironment = Pattern.New().Annotation(vowel).Value, + LeftEnvironment = Pattern.New().Annotation(vowel).Value, + RightEnvironment = Pattern.New().Annotation(vowel).Value, } ); Allophonic.PhonologicalRules.Add(gDelete); @@ -1122,7 +1121,7 @@ public void ReduplicationRules() gDelete.Subrules.Add( new RewriteSubrule { - RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, + RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, } ); @@ -1138,9 +1137,9 @@ public void ReduplicationRules() { Lhs = { - Pattern.New("1").Annotation(cons).Value, - Pattern.New("2").Annotation(vowel).Annotation(vowel).Value, - Pattern.New("3").Annotation(cons).Value, + Pattern.New("1").Annotation(cons).Value, + Pattern.New("2").Annotation(vowel).Annotation(vowel).Value, + Pattern.New("3").Annotation(cons).Value, }, Rhs = { @@ -1189,8 +1188,8 @@ public void TruncateRules() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(Character(Table3, "g")).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(Character(Table3, "g")).Value, }, Rhs = { new CopyFromInput("1") }, } @@ -1206,8 +1205,8 @@ public void TruncateRules() { Lhs = { - Pattern.New("1").Annotation(Character(Table3, "s")).Value, - Pattern.New("2").Annotation(any).OneOrMore.Value, + Pattern.New("1").Annotation(Character(Table3, "s")).Value, + Pattern.New("2").Annotation(any).OneOrMore.Value, }, Rhs = { new CopyFromInput("2") }, } @@ -1222,8 +1221,8 @@ public void TruncateRules() { Lhs = { - Pattern.New("1").Annotation(fricative).Value, - Pattern.New("2").Annotation(any).OneOrMore.Value, + Pattern.New("1").Annotation(fricative).Value, + Pattern.New("2").Annotation(any).OneOrMore.Value, }, Rhs = { new CopyFromInput("2") }, } @@ -1238,8 +1237,8 @@ public void TruncateRules() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(velarStop).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(velarStop).Value, }, Rhs = { new CopyFromInput("1") }, } @@ -1254,8 +1253,8 @@ public void TruncateRules() { Lhs = { - Pattern.New("1").Annotation(Character(Table3, "s")).Optional.Value, - Pattern.New("2").Annotation(any).OneOrMore.Value, + Pattern.New("1").Annotation(Character(Table3, "s")).Optional.Value, + Pattern.New("2").Annotation(any).OneOrMore.Value, }, Rhs = { new InsertSegments(Table3, "g"), new CopyFromInput("2") }, } @@ -1285,19 +1284,19 @@ public void RequiredEnvironments() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, Environments = { new AllomorphEnvironment( ConstraintType.Require, null, - Pattern.New().Annotation(Character(Table3, "a")).Value + Pattern.New().Annotation(Character(Table3, "a")).Value ), new AllomorphEnvironment( ConstraintType.Require, null, - Pattern.New().Annotation(Character(Table3, "ɯ")).Value + Pattern.New().Annotation(Character(Table3, "ɯ")).Value ), }, } @@ -1305,7 +1304,7 @@ public void RequiredEnvironments() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "z") }, } ); @@ -1325,7 +1324,7 @@ public void RequiredEnvironments() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+ɯd") }, } ); @@ -1357,7 +1356,7 @@ public void RequiredSyntacticFeatureStruct() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, RequiredSyntacticFeatureStruct = FeatureStruct .New(Language.SyntacticFeatureSystem) @@ -1369,7 +1368,7 @@ public void RequiredSyntacticFeatureStruct() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "z") }, } ); @@ -1389,7 +1388,7 @@ public void RequiredSyntacticFeatureStruct() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+ɯd") }, } ); @@ -1421,14 +1420,14 @@ public void FreeFluctuation() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, } ); sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "z") }, } ); @@ -1452,7 +1451,7 @@ public void CircumfixRules() circumfix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new InsertSegments(Table3, "ta"), new CopyFromInput("1"), new InsertSegments(Table3, "od") }, } ); @@ -1462,7 +1461,7 @@ public void CircumfixRules() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, } ); @@ -1504,8 +1503,8 @@ public void BoundaryRules() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern .New("2") .Annotation( FeatureStruct @@ -1554,10 +1553,10 @@ public void WordSynthesisWithBoundaryAtBeginning() { Lhs = { - Pattern.New("1").Annotation(any).ZeroOrMore.Value, - Pattern.New("2").Annotation(cons).Value, - Pattern.New("3").Annotation(vowel).Value, - Pattern.New("4").Annotation(cons).Value, + Pattern.New("1").Annotation(any).ZeroOrMore.Value, + Pattern.New("2").Annotation(cons).Value, + Pattern.New("3").Annotation(vowel).Value, + Pattern.New("4").Annotation(cons).Value, }, Rhs = { @@ -1586,7 +1585,7 @@ public void WordSynthesisWithBoundaryAtBeginning() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+ɯd") }, } ); @@ -1623,8 +1622,8 @@ public void PartialRule() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(alvStop).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(alvStop).Value, }, Rhs = { new CopyFromInput("1"), new CopyFromInput("2"), new InsertSegments(Table3, "ɯd") }, } @@ -1632,14 +1631,14 @@ public void PartialRule() edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(voicelessCons).Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(voicelessCons).Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "t") }, } ); edSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "d") }, } ); @@ -1663,7 +1662,7 @@ public void PartialRule() sSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, } ); @@ -1680,7 +1679,7 @@ public void PartialRule() nominalizer.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "v") }, } ); @@ -1696,7 +1695,7 @@ public void PartialRule() uSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "u") }, } ); @@ -1711,7 +1710,7 @@ public void PartialRule() pSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "p") }, } ); @@ -1760,14 +1759,14 @@ public void DisjunctiveAllomorphs() esSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(vowel).Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Annotation(vowel).Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, } ); esSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "ɯs") }, } ); @@ -1782,14 +1781,14 @@ public void DisjunctiveAllomorphs() guSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "gun") }, Environments = { new AllomorphEnvironment( ConstraintType.Require, null, - Pattern.New().Annotation(vowel).Value + Pattern.New().Annotation(vowel).Value ), }, } @@ -1797,7 +1796,7 @@ public void DisjunctiveAllomorphs() guSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "gu") }, } ); @@ -1830,7 +1829,7 @@ public void SubsumedAffix() uSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "u") }, } ); @@ -1847,8 +1846,8 @@ public void SubsumedAffix() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(vowel).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(vowel).Value, }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, } @@ -1866,7 +1865,7 @@ public void SubsumedAffix() nominalizer.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "v") }, } ); @@ -1883,8 +1882,8 @@ public void SubsumedAffix() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(vowel).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(vowel).Value, }, Rhs = { new CopyFromInput("1") }, } @@ -1927,8 +1926,8 @@ public void ModifyFromInputRules() { Lhs = { - Pattern.New("1").Annotation(any).OneOrMore.Value, - Pattern.New("2").Annotation(vowel).Value, + Pattern.New("1").Annotation(any).OneOrMore.Value, + Pattern.New("2").Annotation(vowel).Value, }, Rhs = { @@ -1994,9 +1993,9 @@ public void NonContiguousRules() { Lhs = { - Pattern.New("1").Annotation(cons).Value, - Pattern.New("2").Annotation(cons).Value, - Pattern.New("3").Annotation(cons).Value, + Pattern.New("1").Annotation(cons).Value, + Pattern.New("2").Annotation(cons).Value, + Pattern.New("3").Annotation(cons).Value, }, Rhs = { @@ -2015,13 +2014,13 @@ public void NonContiguousRules() { Name = "rule1", ApplicationMode = RewriteApplicationMode.Iterative, - Lhs = Pattern.New().Annotation(lowVowel).Value, + Lhs = Pattern.New().Annotation(lowVowel).Value, }; rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(i).Value, - RightEnvironment = Pattern.New().Annotation(voicedCons).Value, + Rhs = Pattern.New().Annotation(i).Value, + RightEnvironment = Pattern.New().Annotation(voicedCons).Value, } ); Allophonic.PhonologicalRules.Add(rule1); diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphologicalRules/CompoundingRuleTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphologicalRules/CompoundingRuleTests.cs index 93192e758..8f5523508 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphologicalRules/CompoundingRuleTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphologicalRules/CompoundingRuleTests.cs @@ -1,5 +1,4 @@ using NUnit.Framework; -using SIL.Machine.Annotations; using SIL.Machine.FeatureModel; using SIL.Machine.Matching; @@ -16,8 +15,8 @@ public void SimpleRules() rule1.Subrules.Add( new CompoundingSubrule { - HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, - NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, + HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, + NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("head"), new InsertSegments(Table3, "+"), new CopyFromInput("nonHead") }, } ); @@ -33,8 +32,8 @@ public void SimpleRules() rule1.Subrules.Add( new CompoundingSubrule { - HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, - NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, + HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, + NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("nonHead"), new InsertSegments(Table3, "+"), new CopyFromInput("head") }, } ); @@ -61,7 +60,7 @@ public void SimpleRules() prefix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new InsertSegments(Table3, "di+"), new CopyFromInput("1") }, } ); @@ -78,8 +77,8 @@ public void SimpleRules() rule1.Subrules.Add( new CompoundingSubrule { - HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, - NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, + HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, + NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("head"), new InsertSegments(Table3, "+"), new CopyFromInput("nonHead") }, } ); @@ -96,8 +95,8 @@ public void SimpleRules() rule2.Subrules.Add( new CompoundingSubrule { - HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, - NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, + HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, + NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("nonHead"), new InsertSegments(Table3, "+"), new CopyFromInput("head") }, } ); @@ -124,8 +123,8 @@ public void MorphosyntacticRules() rule1.Subrules.Add( new CompoundingSubrule { - HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, - NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, + HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, + NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("head"), new InsertSegments(Table3, "+"), new CopyFromInput("nonHead") }, } ); @@ -181,8 +180,8 @@ public void ProdRestrictRule() rule1.Subrules.Add( new CompoundingSubrule { - HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, - NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, + HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, + NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("head"), new InsertSegments(Table3, "+"), new CopyFromInput("nonHead") }, } ); diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/PhonologicalRules/MetathesisRuleTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/PhonologicalRules/MetathesisRuleTests.cs index 54d0e7451..ea7c8954f 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/PhonologicalRules/MetathesisRuleTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/PhonologicalRules/MetathesisRuleTests.cs @@ -1,5 +1,4 @@ using NUnit.Framework; -using SIL.Machine.Annotations; using SIL.Machine.FeatureModel; using SIL.Machine.Matching; using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; @@ -14,7 +13,7 @@ public void SimpleRule() var rule1 = new MetathesisRule { Name = "rule1", - Pattern = Pattern + Pattern = Pattern .New() .Group("1", group => group.Annotation(Character(Table3, "i"))) .Group("2", group => group.Annotation(Character(Table3, "u"))) @@ -36,7 +35,7 @@ public void ComplexRule() var rule1 = new MetathesisRule { Name = "rule1", - Pattern = Pattern + Pattern = Pattern .New() .Group("1", group => group.Annotation(Character(Table3, "i"))) .Group("middle", group => group.Annotation(Character(Table3, "+"))) @@ -53,7 +52,7 @@ public void ComplexRule() uSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+u") }, } ); @@ -70,7 +69,7 @@ public void SimpleRuleNotUnapplied() var prule = new MetathesisRule { Name = "rule1", - Pattern = Pattern + Pattern = Pattern .New() .Group("1", group => group.Annotation(Character(Table3, "i"))) .Group("2", group => group.Annotation(Character(Table3, "u"))) @@ -85,7 +84,7 @@ public void SimpleRuleNotUnapplied() iSuffix.Allomorphs.Add( new AffixProcessAllomorph { - Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "i") }, } ); diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/PhonologicalRules/RewriteRuleTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/PhonologicalRules/RewriteRuleTests.cs index f7c56ecaa..68913bba7 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/PhonologicalRules/RewriteRuleTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/PhonologicalRules/RewriteRuleTests.cs @@ -1,5 +1,4 @@ using NUnit.Framework; -using SIL.Machine.Annotations; using SIL.Machine.DataStructures; using SIL.Machine.FeatureModel; using SIL.Machine.Matching; @@ -25,35 +24,35 @@ public void SimpleRules() var rule1 = new RewriteRule { Name = "rule1", - Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, }; Allophonic.PhonologicalRules.Add(rule1); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(asp).Value, - LeftEnvironment = Pattern.New().Annotation(nonCons).Value, + Rhs = Pattern.New().Annotation(asp).Value, + LeftEnvironment = Pattern.New().Annotation(nonCons).Value, } ); var rule2 = new RewriteRule { Name = "rule2", - Lhs = Pattern.New().Annotation(Character(Table3, "p")).Value, + Lhs = Pattern.New().Annotation(Character(Table3, "p")).Value, }; Allophonic.PhonologicalRules.Add(rule2); Morphophonemic.PhonologicalRules.Add(rule2); rule2.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(asp).Value, + Rhs = Pattern.New().Annotation(asp).Value, // the following should be a NOOP because it accepts the empty string. - LeftEnvironment = Pattern + LeftEnvironment = Pattern .New() .Annotation(nonCons) .Optional.Annotation(nonCons) .Optional.Value, - RightEnvironment = Pattern.New().Annotation(nonCons).Value, + RightEnvironment = Pattern.New().Annotation(nonCons).Value, } ); @@ -105,17 +104,13 @@ public void LongDistanceRules() .Symbol("voc+") .Value; - var rule3 = new RewriteRule - { - Name = "rule3", - Lhs = Pattern.New().Annotation(highVowel).Value, - }; + var rule3 = new RewriteRule { Name = "rule3", Lhs = Pattern.New().Annotation(highVowel).Value }; Allophonic.PhonologicalRules.Add(rule3); rule3.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(backRnd).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(backRnd).Value, + LeftEnvironment = Pattern .New() .Annotation(rndVowel) .Annotation(cons) @@ -132,8 +127,8 @@ public void LongDistanceRules() rule3.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(backRnd).Value, - RightEnvironment = Pattern + Rhs = Pattern.New().Annotation(backRnd).Value, + RightEnvironment = Pattern .New() .Annotation(cons) .Annotation(lowVowel) @@ -150,8 +145,8 @@ public void LongDistanceRules() rule3.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(backRnd).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(backRnd).Value, + LeftEnvironment = Pattern .New() .Annotation(highVowel) .Annotation(cons) @@ -188,12 +183,12 @@ public void AnchorRules() .Symbol("voc+") .Value; - var rule3 = new RewriteRule { Name = "rule3", Lhs = Pattern.New().Annotation(cons).Value }; + var rule3 = new RewriteRule { Name = "rule3", Lhs = Pattern.New().Annotation(cons).Value }; rule3.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(vlUnasp).Value, - RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, + Rhs = Pattern.New().Annotation(vlUnasp).Value, + RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, } ); Allophonic.PhonologicalRules.Add(rule3); @@ -205,8 +200,8 @@ public void AnchorRules() rule3.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(vlUnasp).Value, - RightEnvironment = Pattern + Rhs = Pattern.New().Annotation(vlUnasp).Value, + RightEnvironment = Pattern .New() .Annotation(vowel) .Annotation(cons) @@ -222,8 +217,8 @@ public void AnchorRules() rule3.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(vlUnasp).Value, - LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, + Rhs = Pattern.New().Annotation(vlUnasp).Value, + LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, } ); @@ -234,8 +229,8 @@ public void AnchorRules() rule3.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(vlUnasp).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(vlUnasp).Value, + LeftEnvironment = Pattern .New() .Annotation(HCFeatureSystem.LeftSideAnchor) .Annotation(cons) @@ -292,17 +287,13 @@ public void QuantifierRules() .Symbol("round+") .Value; - var rule3 = new RewriteRule - { - Name = "rule3", - Lhs = Pattern.New().Annotation(highVowel).Value, - }; + var rule3 = new RewriteRule { Name = "rule3", Lhs = Pattern.New().Annotation(highVowel).Value }; Allophonic.PhonologicalRules.Add(rule3); rule3.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(backRnd).Value, - RightEnvironment = Pattern + Rhs = Pattern.New().Annotation(backRnd).Value, + RightEnvironment = Pattern .New() .Group(g => g.Annotation(cons).Annotation(lowVowel)) .LazyRange(1, 2) @@ -312,17 +303,13 @@ public void QuantifierRules() } ); - var rule4 = new RewriteRule - { - Name = "rule4", - Lhs = Pattern.New().Annotation(highVowel).Value, - }; + var rule4 = new RewriteRule { Name = "rule4", Lhs = Pattern.New().Annotation(highVowel).Value }; Allophonic.PhonologicalRules.Add(rule4); rule4.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(backRnd).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(backRnd).Value, + LeftEnvironment = Pattern .New() .Annotation(rndVowel) .Group(g => g.Annotation(cons).Annotation(lowVowel)) @@ -340,17 +327,13 @@ public void QuantifierRules() Allophonic.PhonologicalRules.Clear(); - var rule1 = new RewriteRule - { - Name = "rule1", - Lhs = Pattern.New().Annotation(highVowel).Value, - }; + var rule1 = new RewriteRule { Name = "rule1", Lhs = Pattern.New().Annotation(highVowel).Value }; Allophonic.PhonologicalRules.Add(rule1); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(backRnd).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(backRnd).Value, + LeftEnvironment = Pattern .New() .Annotation(backRndVowel) .Annotation(highVowel) @@ -401,24 +384,24 @@ public void MultipleSegmentRules() var rule1 = new RewriteRule { Name = "rule1", - Lhs = Pattern.New().Annotation(highVowel).Annotation(highVowel).Value, + Lhs = Pattern.New().Annotation(highVowel).Annotation(highVowel).Value, }; Allophonic.PhonologicalRules.Add(rule1); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(backRnd).Annotation(backRnd).Value, - LeftEnvironment = Pattern.New().Annotation(backRndVowel).Value, + Rhs = Pattern.New().Annotation(backRnd).Annotation(backRnd).Value, + LeftEnvironment = Pattern.New().Annotation(backRndVowel).Value, } ); var morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("buuubuuu"), "27"); - var rule2 = new RewriteRule { Name = "rule2", Lhs = Pattern.New().Annotation(t).Value }; + var rule2 = new RewriteRule { Name = "rule2", Lhs = Pattern.New().Annotation(t).Value }; Allophonic.PhonologicalRules.Add(rule2); rule2.Subrules.Add( - new RewriteSubrule { RightEnvironment = Pattern.New().Annotation(backRndVowel).Value } + new RewriteSubrule { RightEnvironment = Pattern.New().Annotation(backRndVowel).Value } ); morpher = new Morpher(TraceManager, Language); @@ -447,11 +430,11 @@ public void MultipleDeletionRules() var rule1 = new RewriteRule { Name = "rule1", - Lhs = Pattern.New().Annotation(highVowel).Annotation(highVowel).Value, + Lhs = Pattern.New().Annotation(highVowel).Annotation(highVowel).Value, }; Allophonic.PhonologicalRules.Add(rule1); rule1.Subrules.Add( - new RewriteSubrule { LeftEnvironment = Pattern.New().Annotation(backRndVowel).Value } + new RewriteSubrule { LeftEnvironment = Pattern.New().Annotation(backRndVowel).Value } ); var morpher = new Morpher(TraceManager, Language); @@ -490,14 +473,14 @@ public void MergeRules() var rule1 = new RewriteRule { Name = "rule1", - Lhs = Pattern.New().Annotation(highVowel).Annotation(highVowel).Value, + Lhs = Pattern.New().Annotation(highVowel).Annotation(highVowel).Value, }; Allophonic.PhonologicalRules.Add(rule1); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(t).Value, - LeftEnvironment = Pattern.New().Annotation(backRndVowel).Value, + Rhs = Pattern.New().Annotation(t).Value, + LeftEnvironment = Pattern.New().Annotation(backRndVowel).Value, } ); @@ -537,17 +520,10 @@ public void MultipleMergeRules() var rule1 = new RewriteRule { Name = "rule1", - Lhs = Pattern - .New() - .Annotation(backRndVowel) - .Annotation(highVowel) - .Annotation(highVowel) - .Value, + Lhs = Pattern.New().Annotation(backRndVowel).Annotation(highVowel).Annotation(highVowel).Value, }; Allophonic.PhonologicalRules.Add(rule1); - rule1.Subrules.Add( - new RewriteSubrule { Rhs = Pattern.New().Annotation(t).Annotation(t).Value } - ); + rule1.Subrules.Add(new RewriteSubrule { Rhs = Pattern.New().Annotation(t).Annotation(t).Value }); var morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("bttbtt"), "27"); @@ -572,17 +548,10 @@ public void ExpandRules() .Symbol("round+") .Value; - var rule1 = new RewriteRule - { - Name = "rule1", - Lhs = Pattern.New().Annotation(backRndVowel).Value, - }; + var rule1 = new RewriteRule { Name = "rule1", Lhs = Pattern.New().Annotation(backRndVowel).Value }; Allophonic.PhonologicalRules.Add(rule1); rule1.Subrules.Add( - new RewriteSubrule - { - Rhs = Pattern.New().Annotation(highVowel).Annotation(highVowel).Value, - } + new RewriteSubrule { Rhs = Pattern.New().Annotation(highVowel).Annotation(highVowel).Value } ); var morpher = new Morpher(TraceManager, Language); @@ -679,17 +648,13 @@ public void BoundaryRules() .Symbol("asp+") .Value; - var rule1 = new RewriteRule - { - Name = "rule1", - Lhs = Pattern.New().Annotation(highVowel).Value, - }; + var rule1 = new RewriteRule { Name = "rule1", Lhs = Pattern.New().Annotation(highVowel).Value }; Morphophonemic.PhonologicalRules.Add(rule1); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(backRnd).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(backRnd).Value, + LeftEnvironment = Pattern .New() .Annotation(backRndVowel) .Annotation(Character(Table3, "+")) @@ -704,8 +669,8 @@ public void BoundaryRules() rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(unbackUnrnd).Value, - RightEnvironment = Pattern + Rhs = Pattern.New().Annotation(unbackUnrnd).Value, + RightEnvironment = Pattern .New() .Annotation(Character(Table3, "+")) .Annotation(unbackUnrndVowel) @@ -720,8 +685,8 @@ public void BoundaryRules() rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(backRnd).Value, - LeftEnvironment = Pattern.New().Annotation(backRndVowel).Value, + Rhs = Pattern.New().Annotation(backRnd).Value, + LeftEnvironment = Pattern.New().Annotation(backRndVowel).Value, } ); @@ -732,33 +697,26 @@ public void BoundaryRules() rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(unbackUnrnd).Value, - RightEnvironment = Pattern.New().Annotation(unbackUnrndVowel).Value, + Rhs = Pattern.New().Annotation(unbackUnrnd).Value, + RightEnvironment = Pattern.New().Annotation(unbackUnrndVowel).Value, } ); morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("biib"), "30", "31"); - rule1.Lhs = Pattern.New().Annotation(Character(Table3, "i")).Value; + rule1.Lhs = Pattern.New().Annotation(Character(Table3, "i")).Value; rule1.Subrules.Clear(); rule1.Subrules.Add( - new RewriteSubrule - { - RightEnvironment = Pattern.New().Annotation(Character(Table3, "b")).Value, - } + new RewriteSubrule { RightEnvironment = Pattern.New().Annotation(Character(Table3, "b")).Value } ); - var rule2 = new RewriteRule - { - Name = "rule2", - Lhs = Pattern.New().Annotation(backVowel).Value, - }; + var rule2 = new RewriteRule { Name = "rule2", Lhs = Pattern.New().Annotation(backVowel).Value }; rule2.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(Character(Table3, "a")).Value, - RightEnvironment = Pattern + Rhs = Pattern.New().Annotation(Character(Table3, "a")).Value, + RightEnvironment = Pattern .New() .Group(group => group.Annotation(Character(Table3, "+")).Annotation(Character(Table3, "b"))) .Value, @@ -769,22 +727,19 @@ public void BoundaryRules() morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("bab"), "30"); - rule1.Lhs = Pattern.New().Annotation(Character(Table3, "u")).Value; + rule1.Lhs = Pattern.New().Annotation(Character(Table3, "u")).Value; rule1.Subrules.Clear(); rule1.Subrules.Add( - new RewriteSubrule - { - LeftEnvironment = Pattern.New().Annotation(Character(Table3, "b")).Value, - } + new RewriteSubrule { LeftEnvironment = Pattern.New().Annotation(Character(Table3, "b")).Value } ); - rule2.Lhs = Pattern.New().Annotation(unrndVowel).Value; + rule2.Lhs = Pattern.New().Annotation(unrndVowel).Value; rule2.Subrules.Clear(); rule2.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(lowBack).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(lowBack).Value, + LeftEnvironment = Pattern .New() .Annotation(Character(Table3, "b")) .Annotation(Character(Table3, "+")) @@ -797,7 +752,7 @@ public void BoundaryRules() Morphophonemic.PhonologicalRules.Remove(rule2); - rule1.Lhs = Pattern + rule1.Lhs = Pattern .New() .Annotation(bilabialCons) .Annotation(Character(Table3, "+")) @@ -806,41 +761,41 @@ public void BoundaryRules() rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern + Rhs = Pattern .New() .Annotation(unvdUnasp) .Annotation(Character(Table3, "+")) .Annotation(unvdUnasp) .Value, - LeftEnvironment = Pattern.New().Annotation(vowel).Value, - RightEnvironment = Pattern.New().Annotation(vowel).Value, + LeftEnvironment = Pattern.New().Annotation(vowel).Value, + RightEnvironment = Pattern.New().Annotation(vowel).Value, } ); morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("appa"), "39"); - rule1.Lhs = Pattern.New().Annotation(bilabialCons).Annotation(bilabialCons).Value; + rule1.Lhs = Pattern.New().Annotation(bilabialCons).Annotation(bilabialCons).Value; rule1.Subrules.Clear(); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(unvdUnasp).Annotation(unvdUnasp).Value, - LeftEnvironment = Pattern.New().Annotation(vowel).Value, - RightEnvironment = Pattern.New().Annotation(vowel).Value, + Rhs = Pattern.New().Annotation(unvdUnasp).Annotation(unvdUnasp).Value, + LeftEnvironment = Pattern.New().Annotation(vowel).Value, + RightEnvironment = Pattern.New().Annotation(vowel).Value, } ); morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("appa"), "40"); - rule1.Lhs = Pattern.New().Annotation(cons).Value; + rule1.Lhs = Pattern.New().Annotation(cons).Value; rule1.Subrules.Clear(); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(asp).Value, - LeftEnvironment = Pattern.New().Annotation(Character(Table3, "+")).Value, + Rhs = Pattern.New().Annotation(asp).Value, + LeftEnvironment = Pattern.New().Annotation(Character(Table3, "+")).Value, } ); @@ -854,13 +809,13 @@ public void BoundaryRules() rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern + Rhs = Pattern .New() .Annotation(Character(Table1, "t")) .Annotation(Character(Table1, "a")) .Value, - LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, - RightEnvironment = Pattern + LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, + RightEnvironment = Pattern .New() .Annotation(cons) .Annotation(vowel) @@ -909,15 +864,15 @@ public void CommonFeatureRules() var rule1 = new RewriteRule { Name = "rule1", - Lhs = Pattern.New().Annotation(Character(Table1, "p")).Value, + Lhs = Pattern.New().Annotation(Character(Table1, "p")).Value, }; Allophonic.PhonologicalRules.Add(rule1); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(vdLabFric).Value, - LeftEnvironment = Pattern.New().Annotation(vowel).Value, - RightEnvironment = Pattern.New().Annotation(vowel).Value, + Rhs = Pattern.New().Annotation(vdLabFric).Value, + LeftEnvironment = Pattern.New().Annotation(vowel).Value, + RightEnvironment = Pattern.New().Annotation(vowel).Value, } ); @@ -928,9 +883,9 @@ public void CommonFeatureRules() rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(Character(Table1, "v")).Value, - LeftEnvironment = Pattern.New().Annotation(vowel).Value, - RightEnvironment = Pattern.New().Annotation(vowel).Value, + Rhs = Pattern.New().Annotation(Character(Table1, "v")).Value, + LeftEnvironment = Pattern.New().Annotation(vowel).Value, + RightEnvironment = Pattern.New().Annotation(vowel).Value, } ); @@ -1005,16 +960,12 @@ public void AlphaVariableRules() .Symbol("nasal-") .Value; - var rule1 = new RewriteRule - { - Name = "rule1", - Lhs = Pattern.New().Annotation(highVowel).Value, - }; + var rule1 = new RewriteRule { Name = "rule1", Lhs = Pattern.New().Annotation(highVowel).Value }; Morphophonemic.PhonologicalRules.Add(rule1); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern + Rhs = Pattern .New() .Annotation( FeatureStruct @@ -1027,7 +978,7 @@ public void AlphaVariableRules() .Value ) .Value, - LeftEnvironment = Pattern + LeftEnvironment = Pattern .New() .Annotation( FeatureStruct @@ -1046,12 +997,12 @@ public void AlphaVariableRules() var morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("bububu"), "42", "43"); - rule1.Lhs = Pattern.New().Annotation(nasalCons).Value; + rule1.Lhs = Pattern.New().Annotation(nasalCons).Value; rule1.Subrules.Clear(); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern + Rhs = Pattern .New() .Annotation( FeatureStruct @@ -1062,7 +1013,7 @@ public void AlphaVariableRules() .Value ) .Value, - RightEnvironment = Pattern + RightEnvironment = Pattern .New() .Annotation( FeatureStruct @@ -1081,7 +1032,7 @@ public void AlphaVariableRules() Morphophonemic.PhonologicalRules.Clear(); Allophonic.PhonologicalRules.Add(rule1); - rule1.Lhs = Pattern + rule1.Lhs = Pattern .New() .Annotation( FeatureStruct @@ -1095,8 +1046,8 @@ public void AlphaVariableRules() rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(asp).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(asp).Value, + LeftEnvironment = Pattern .New() .Annotation( FeatureStruct @@ -1109,18 +1060,18 @@ public void AlphaVariableRules() .Value, } ); - rule1.Subrules.Add(new RewriteSubrule { Rhs = Pattern.New().Annotation(unasp).Value }); + rule1.Subrules.Add(new RewriteSubrule { Rhs = Pattern.New().Annotation(unasp).Value }); morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("pipʰ"), "41"); - rule1.Lhs = Pattern.New().Value; + rule1.Lhs = Pattern.New().Value; rule1.Subrules.Clear(); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(Character(Table1, "f")).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(Character(Table1, "f")).Value, + LeftEnvironment = Pattern .New() .Annotation( FeatureStruct @@ -1134,7 +1085,7 @@ public void AlphaVariableRules() .Value ) .Value, - RightEnvironment = Pattern + RightEnvironment = Pattern .New() .Annotation( FeatureStruct @@ -1161,7 +1112,7 @@ public void AlphaVariableRules() rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern + Rhs = Pattern .New() .Annotation( FeatureStruct @@ -1171,7 +1122,7 @@ public void AlphaVariableRules() .Value ) .Value, - LeftEnvironment = Pattern + LeftEnvironment = Pattern .New() .Annotation( FeatureStruct @@ -1181,7 +1132,7 @@ public void AlphaVariableRules() .Value ) .Value, - RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, + RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, } ); @@ -1242,8 +1193,8 @@ public void EpenthesisRules() rule4.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(highFrontUnrndVowel).Value, - LeftEnvironment = Pattern.New().Annotation(highVowel).Value, + Rhs = Pattern.New().Annotation(highFrontUnrndVowel).Value, + LeftEnvironment = Pattern.New().Annotation(highVowel).Value, } ); @@ -1254,8 +1205,8 @@ public void EpenthesisRules() rule4.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(Character(Table1, "i")).Value, - RightEnvironment = Pattern.New().Annotation(highVowel).Value, + Rhs = Pattern.New().Annotation(Character(Table1, "i")).Value, + RightEnvironment = Pattern.New().Annotation(highVowel).Value, } ); @@ -1267,9 +1218,9 @@ public void EpenthesisRules() rule4.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(highFrontUnrndVowel).Value, - LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, - RightEnvironment = Pattern.New().Annotation(cons).Value, + Rhs = Pattern.New().Annotation(highFrontUnrndVowel).Value, + LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, + RightEnvironment = Pattern.New().Annotation(cons).Value, } ); @@ -1280,9 +1231,9 @@ public void EpenthesisRules() rule4.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(highFrontUnrndVowel).Value, - LeftEnvironment = Pattern.New().Annotation(cons).Value, - RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, + Rhs = Pattern.New().Annotation(highFrontUnrndVowel).Value, + LeftEnvironment = Pattern.New().Annotation(cons).Value, + RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, } ); @@ -1293,9 +1244,9 @@ public void EpenthesisRules() rule4.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(highFrontUnrndVowel).Value, - LeftEnvironment = Pattern.New().Annotation(cons).Value, - RightEnvironment = Pattern.New().Annotation(highBackRndVowel).Value, + Rhs = Pattern.New().Annotation(highFrontUnrndVowel).Value, + LeftEnvironment = Pattern.New().Annotation(cons).Value, + RightEnvironment = Pattern.New().Annotation(highBackRndVowel).Value, } ); @@ -1307,7 +1258,7 @@ public void EpenthesisRules() rule4.Subrules.Add( new RewriteSubrule { - Rhs = Pattern + Rhs = Pattern .New() .Annotation( FeatureStruct @@ -1319,7 +1270,7 @@ public void EpenthesisRules() .Value ) .Value, - LeftEnvironment = Pattern + LeftEnvironment = Pattern .New() .Annotation( FeatureStruct @@ -1341,12 +1292,8 @@ public void EpenthesisRules() rule4.Subrules.Add( new RewriteSubrule { - Rhs = Pattern - .New() - .Annotation(highFrontUnrndVowel) - .Annotation(highFrontUnrndVowel) - .Value, - LeftEnvironment = Pattern.New().Annotation(highVowel).Value, + Rhs = Pattern.New().Annotation(highFrontUnrndVowel).Annotation(highFrontUnrndVowel).Value, + LeftEnvironment = Pattern.New().Annotation(highVowel).Value, } ); @@ -1359,8 +1306,8 @@ public void EpenthesisRules() rule4.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(highFrontUnrndVowel).Value, - LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, + Rhs = Pattern.New().Annotation(highFrontUnrndVowel).Value, + LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, } ); @@ -1369,13 +1316,13 @@ public void EpenthesisRules() Allophonic.PhonologicalRules.Clear(); - var rule1 = new RewriteRule { Name = "rule1", Lhs = Pattern.New().Annotation(vowel).Value }; + var rule1 = new RewriteRule { Name = "rule1", Lhs = Pattern.New().Annotation(vowel).Value }; Allophonic.PhonologicalRules.Add(rule1); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(highBackRnd).Value, - LeftEnvironment = Pattern.New().Annotation(highBackRndVowel).Value, + Rhs = Pattern.New().Annotation(highBackRnd).Value, + LeftEnvironment = Pattern.New().Annotation(highBackRndVowel).Value, } ); @@ -1384,9 +1331,9 @@ public void EpenthesisRules() rule2.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(Character(Table1, "t")).Value, - LeftEnvironment = Pattern.New().Annotation(vowel).Value, - RightEnvironment = Pattern.New().Annotation(vowel).Value, + Rhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + LeftEnvironment = Pattern.New().Annotation(vowel).Value, + RightEnvironment = Pattern.New().Annotation(vowel).Value, } ); @@ -1454,11 +1401,11 @@ public void DeletionRules() var rule4 = new RewriteRule { Name = "rule4", - Lhs = Pattern.New().Annotation(highFrontUnrndVowel).Value, + Lhs = Pattern.New().Annotation(highFrontUnrndVowel).Value, }; Allophonic.PhonologicalRules.Add(rule4); rule4.Subrules.Add( - new RewriteSubrule { LeftEnvironment = Pattern.New().Annotation(highVowel).Value } + new RewriteSubrule { LeftEnvironment = Pattern.New().Annotation(highVowel).Value } ); var morpher = new Morpher(TraceManager, Language); @@ -1468,25 +1415,19 @@ public void DeletionRules() AssertMorphsEqual(morpher.ParseWord("bubu"), "24", "25", "26", "27", "19"); rule4.Subrules.Clear(); - rule4.Subrules.Add( - new RewriteSubrule { RightEnvironment = Pattern.New().Annotation(cons).Value } - ); + rule4.Subrules.Add(new RewriteSubrule { RightEnvironment = Pattern.New().Annotation(cons).Value }); morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("bubu"), "25", "19"); - rule4.Lhs = Pattern - .New() - .Annotation(highFrontUnrndVowel) - .Annotation(highFrontUnrndVowel) - .Value; + rule4.Lhs = Pattern.New().Annotation(highFrontUnrndVowel).Annotation(highFrontUnrndVowel).Value; morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("bubu"), "29", "19"); rule4.Subrules.Clear(); rule4.Subrules.Add( - new RewriteSubrule { LeftEnvironment = Pattern.New().Annotation(highBackRndVowel).Value } + new RewriteSubrule { LeftEnvironment = Pattern.New().Annotation(highBackRndVowel).Value } ); morpher = new Morpher(TraceManager, Language); @@ -1495,20 +1436,20 @@ public void DeletionRules() Allophonic.PhonologicalRules.Clear(); Morphophonemic.PhonologicalRules.Add(rule4); - rule4.Lhs = Pattern.New().Annotation(Character(Table3, "b")).Value; + rule4.Lhs = Pattern.New().Annotation(Character(Table3, "b")).Value; rule4.Subrules.Clear(); rule4.Subrules.Add( new RewriteSubrule { - LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, - RightEnvironment = Pattern.New().Annotation(Character(Table3, "+")).Value, + LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, + RightEnvironment = Pattern.New().Annotation(Character(Table3, "+")).Value, } ); var rule5 = new RewriteRule { Name = "rule5", - Lhs = Pattern + Lhs = Pattern .New() .Annotation(Character(Table3, "u")) .Annotation(Character(Table3, "b")) @@ -1519,22 +1460,22 @@ public void DeletionRules() rule5.Subrules.Add( new RewriteSubrule { - LeftEnvironment = Pattern.New().Annotation(Character(Table3, "+")).Value, - RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, + LeftEnvironment = Pattern.New().Annotation(Character(Table3, "+")).Value, + RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, } ); var rule1 = new RewriteRule { Name = "rule1", - Lhs = Pattern.New().Annotation(Character(Table3, "t")).Value, + Lhs = Pattern.New().Annotation(Character(Table3, "t")).Value, }; Morphophonemic.PhonologicalRules.Add(rule1); rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(asp).Value, - LeftEnvironment = Pattern.New().Annotation(nonCons).Value, + Rhs = Pattern.New().Annotation(asp).Value, + LeftEnvironment = Pattern.New().Annotation(nonCons).Value, } ); @@ -1546,15 +1487,15 @@ public void DeletionRules() Allophonic.PhonologicalRules.Add(rule5); Allophonic.PhonologicalRules.Add(rule1); - rule4.Subrules[0].LeftEnvironment = Pattern.New().Value; + rule4.Subrules[0].LeftEnvironment = Pattern.New().Value; - rule5.Lhs = Pattern + rule5.Lhs = Pattern .New() .Annotation(Character(Table3, "u")) .Annotation(Character(Table3, "b")) .Annotation(Character(Table3, "i")) .Value; - rule5.Subrules[0].RightEnvironment = Pattern.New().Value; + rule5.Subrules[0].RightEnvironment = Pattern.New().Value; morpher = new Morpher(TraceManager, Language); Assert.That(morpher.ParseWord("b"), Is.Empty); @@ -1563,7 +1504,7 @@ public void DeletionRules() Allophonic.PhonologicalRules.Add(rule4); Morphophonemic.PhonologicalRules.Add(rule5); - rule4.Lhs = Pattern + rule4.Lhs = Pattern .New() .Annotation( FeatureStruct @@ -1583,7 +1524,7 @@ public void DeletionRules() rule4.Subrules.Add( new RewriteSubrule { - RightEnvironment = Pattern + RightEnvironment = Pattern .New() .Annotation( FeatureStruct @@ -1602,14 +1543,14 @@ public void DeletionRules() } ); - rule5.Lhs = Pattern.New().Annotation(cons).Value; + rule5.Lhs = Pattern.New().Annotation(cons).Value; rule5.Subrules.Clear(); rule5.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(voiced).Value, - LeftEnvironment = Pattern.New().Annotation(vowel).Value, - RightEnvironment = Pattern.New().Annotation(vowel).Value, + Rhs = Pattern.New().Annotation(voiced).Value, + LeftEnvironment = Pattern.New().Annotation(vowel).Value, + RightEnvironment = Pattern.New().Annotation(vowel).Value, } ); @@ -1741,31 +1682,27 @@ public void DisjunctiveRules() .Symbol("cont-") .Value; - var disrule1 = new RewriteRule - { - Name = "disrule1", - Lhs = Pattern.New().Annotation(stop).Value, - }; + var disrule1 = new RewriteRule { Name = "disrule1", Lhs = Pattern.New().Annotation(stop).Value }; Allophonic.PhonologicalRules.Add(disrule1); disrule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(asp).Value, - LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, + Rhs = Pattern.New().Annotation(asp).Value, + LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, } ); - disrule1.Subrules.Add(new RewriteSubrule { Rhs = Pattern.New().Annotation(unasp).Value }); + disrule1.Subrules.Add(new RewriteSubrule { Rhs = Pattern.New().Annotation(unasp).Value }); var morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("pʰip"), "41"); - disrule1.Lhs = Pattern.New().Annotation(highVowel).Value; + disrule1.Lhs = Pattern.New().Annotation(highVowel).Value; disrule1.Subrules.Clear(); disrule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(backRnd).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(backRnd).Value, + LeftEnvironment = Pattern .New() .Annotation(backRndVowel) .Group(g => g.Annotation(cons).Annotation(highFrontVowel)) @@ -1776,8 +1713,8 @@ public void DisjunctiveRules() disrule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(frontRnd).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(frontRnd).Value, + LeftEnvironment = Pattern .New() .Annotation(frontRndVowel) .Group(g => g.Annotation(cons).Annotation(highFrontVowel)) @@ -1788,8 +1725,8 @@ public void DisjunctiveRules() disrule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(backUnrnd).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(backUnrnd).Value, + LeftEnvironment = Pattern .New() .Annotation(backUnrndVowel) .Group(g => g.Annotation(cons).Annotation(highFrontVowel)) @@ -1800,8 +1737,8 @@ public void DisjunctiveRules() disrule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(frontUnrnd).Value, - LeftEnvironment = Pattern + Rhs = Pattern.New().Annotation(frontUnrnd).Value, + LeftEnvironment = Pattern .New() .Annotation(frontUnrndVowel) .Group(g => g.Annotation(cons).Annotation(highFrontVowel)) @@ -1813,56 +1750,56 @@ public void DisjunctiveRules() morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("bububu"), "42", "43"); - disrule1.Lhs = Pattern.New().Annotation(stop).Value; + disrule1.Lhs = Pattern.New().Annotation(stop).Value; disrule1.Subrules.Clear(); disrule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(asp).Value, - LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, + Rhs = Pattern.New().Annotation(asp).Value, + LeftEnvironment = Pattern.New().Annotation(HCFeatureSystem.LeftSideAnchor).Value, } ); disrule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(unasp).Value, - RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, + Rhs = Pattern.New().Annotation(unasp).Value, + RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, } ); morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("pʰip"), "41"); - disrule1.Lhs = Pattern.New().Annotation(p).Value; + disrule1.Lhs = Pattern.New().Annotation(p).Value; disrule1.Subrules.Clear(); disrule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(vd).Value, - LeftEnvironment = Pattern.New().Annotation(vowel).Value, + Rhs = Pattern.New().Annotation(vd).Value, + LeftEnvironment = Pattern.New().Annotation(vowel).Value, } ); disrule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(asp).Value, - RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, + Rhs = Pattern.New().Annotation(asp).Value, + RightEnvironment = Pattern.New().Annotation(HCFeatureSystem.RightSideAnchor).Value, } ); morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("bubu"), "46", "19"); - disrule1.Lhs = Pattern.New().Annotation(voicelessStop).Value; + disrule1.Lhs = Pattern.New().Annotation(voicelessStop).Value; disrule1.Subrules.Clear(); disrule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(asp).Value, - LeftEnvironment = Pattern.New().Annotation(voicelessStop).Value, + Rhs = Pattern.New().Annotation(asp).Value, + LeftEnvironment = Pattern.New().Annotation(voicelessStop).Value, } ); - disrule1.Subrules.Add(new RewriteSubrule { Rhs = Pattern.New().Annotation(unasp).Value }); + disrule1.Subrules.Add(new RewriteSubrule { Rhs = Pattern.New().Annotation(unasp).Value }); morpher = new Morpher(TraceManager, Language); AssertMorphsEqual(morpher.ParseWord("ktʰb"), "49"); @@ -1904,13 +1841,13 @@ public void MultipleApplicationRules() { Name = "rule1", ApplicationMode = RewriteApplicationMode.Simultaneous, - Lhs = Pattern.New().Annotation(highVowel).Value, + Lhs = Pattern.New().Annotation(highVowel).Value, }; rule1.Subrules.Add( new RewriteSubrule { - Rhs = Pattern.New().Annotation(backRnd).Value, - LeftEnvironment = Pattern.New().Annotation(i).Annotation(cons).Value, + Rhs = Pattern.New().Annotation(backRnd).Value, + LeftEnvironment = Pattern.New().Annotation(i).Annotation(cons).Value, } ); Allophonic.PhonologicalRules.Add(rule1); diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/PhonologyRuleCompilerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/PhonologyRuleCompilerTests.cs new file mode 100644 index 000000000..d259de3c2 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/PhonologyRuleCompilerTests.cs @@ -0,0 +1,317 @@ +using NUnit.Framework; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; +using SIL.Machine.Morphology.HermitCrab.PhonologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// CI coverage for the auto-compiled lockstep phonology (FST_FAST_PATH_PLAN.md Phase 3): does +/// build the SAME kind of Pinv the LEVER_2.md spike hand-built and +/// proved sound (LeverTwo_LazyComposition_RecoversBoundaryDeletion_RealTypes in +/// VerifiedFstAnalyzerTests.cs), and does wire it correctly. +/// +public class PhonologyRuleCompilerTests : HermitCrabTestBase +{ + private static string Sig(WordAnalysis a) => + string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? "?")) + ":" + a.RootMorphemeIndex; + + [Test] + public void Compile_AutoRecoversBoundaryDeletion() + { + // Same shape as the hand-built LEVER_2 spike: a "kd" suffix whose "k" deletes before "d" (a + // plain right-context deletion rule), so sag+KD = "sagkd" -> "sagd". The compiler must find + // this on its own — no hand-built Pinv. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var kdSuffix = new AffixProcessRule + { + Name = "kd_suffix", + Gloss = "KD", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + kdSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table1, "kd") }, + } + ); + Morphophonemic.MorphologicalRules.Add(kdSuffix); + var kDel = new RewriteRule + { + Name = "k_deletion", + Lhs = Pattern.New().Annotation(Character(Table1, "k")).Value, + }; + kDel.Subrules.Add( + new RewriteSubrule // no Rhs => deletion of k before d + { + RightEnvironment = Pattern.New().Annotation(Character(Table1, "d")).Value, + } + ); + Surface.PhonologicalRules.Add(kDel); + try + { + var search = new Morpher(TraceManager, Language); + var engine = new HashSet(search.AnalyzeWord("sagd").Select(Sig)); + Assert.That(engine.Any(s => s.Contains("KD")), Is.True, "precondition: 'sagd' = sag+KD (k->0/_d)"); + + var morpher = new Morpher(new TraceManager(), Language); + (InversePhonology pinv, int unsupported) = PhonologyRuleCompiler.Compile(Language, morpher); + Assert.That(unsupported, Is.Zero, "this rule is entirely within the v1 supported shape"); + + var lex = new FstTemplateAnalyzer(Language); // default ctor: underlying-only arcs + var composed = new HashSet(lex.AnalyzeComposed("sagd", pinv).Select(Sig)); + + Assert.That( + composed.Any(s => s.Contains("KD")), + Is.True, + "auto-compiled Pinv must recover the deletion form" + ); + Assert.That( + composed.IsSubsetOf(engine), + Is.True, + "soundness: composed candidates must be a subset of the engine's" + ); + Assert.That(lex.AnalyzeComposed("saga", pinv), Is.Empty, "a non-word must yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(kDel); + Morphophonemic.MorphologicalRules.Remove(kdSuffix); + } + } + + [Test] + public void Compile_AutoRecoversUnconditionedSubstitution() + { + // An unconditional (no left/right environment) t->d rule: bare root "dat" (entry 8) surfaces + // only as "dad". Exercises the zero-right-environment substitution branch. + var tToD = new RewriteRule + { + Name = "t_to_d", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tToD.Subrules.Add( + new RewriteSubrule { Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value } + ); + Surface.PhonologicalRules.Add(tToD); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("dad").Any(), Is.True, "precondition: 'dad' analyzes"); + + var morpher = new Morpher(new TraceManager(), Language); + (InversePhonology pinv, int unsupported) = PhonologyRuleCompiler.Compile(Language, morpher); + Assert.That(unsupported, Is.Zero); + + var lex = new FstTemplateAnalyzer(Language); + Assert.That(lex.AnalyzeWord("dad"), Is.Empty, "baseline: the underlying-only walk alone must miss 'dad'"); + + var composed = lex.AnalyzeComposed("dad", pinv); + Assert.That(composed, Is.Not.Empty, "auto-compiled Pinv must recover the substituted bare root"); + Assert.That(lex.AnalyzeComposed("zzz", pinv), Is.Empty, "a non-word must yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(tToD); + } + } + + [Test] + public void Compile_SkipsUnconditionedDeletion_AsUnsupported() + { + // Deletion with NO right environment would over-restore everywhere (there's nothing to bound + // it), so the compiler must decline this shape rather than build an unsound-in-practice arc. + var kDel = new RewriteRule + { + Name = "unconditioned_k_deletion", + Lhs = Pattern.New().Annotation(Character(Table1, "k")).Value, + }; + kDel.Subrules.Add(new RewriteSubrule()); // empty Rhs, empty environments => deletes k everywhere + Surface.PhonologicalRules.Add(kDel); + try + { + var morpher = new Morpher(new TraceManager(), Language); + (InversePhonology pinv, int unsupported) = PhonologyRuleCompiler.Compile(Language, morpher); + Assert.That(unsupported, Is.EqualTo(1), "unconditioned deletion must be marked unsupported, not compiled"); + } + finally + { + Surface.PhonologicalRules.Remove(kDel); + } + } + + [Test] + public void Compile_AutoRecoversLeftContextDeletion() + { + // Mirror of Compile_AutoRecoversBoundaryDeletion but conditioned on the LEFT side: a "dk" + // suffix whose "k" deletes after "d" (left-context-only deletion, no right environment at + // all), so sag+DK = "sagdk" -> "sagd". + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var dkSuffix = new AffixProcessRule + { + Name = "dk_suffix", + Gloss = "DK", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + dkSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table1, "dk") }, + } + ); + Morphophonemic.MorphologicalRules.Add(dkSuffix); + var kDel = new RewriteRule + { + Name = "k_deletion_left", + Lhs = Pattern.New().Annotation(Character(Table1, "k")).Value, + }; + kDel.Subrules.Add( + new RewriteSubrule // no Rhs => deletion of k after d; no right environment at all + { + LeftEnvironment = Pattern.New().Annotation(Character(Table1, "d")).Value, + } + ); + Surface.PhonologicalRules.Add(kDel); + try + { + var search = new Morpher(TraceManager, Language); + var engine = new HashSet(search.AnalyzeWord("sagd").Select(Sig)); + Assert.That(engine.Any(s => s.Contains("DK")), Is.True, "precondition: 'sagd' = sag+DK (k->0/d_)"); + + var morpher = new Morpher(new TraceManager(), Language); + (InversePhonology pinv, int unsupported) = PhonologyRuleCompiler.Compile(Language, morpher); + Assert.That(unsupported, Is.Zero, "left-context-only deletion is within the supported shape"); + + var lex = new FstTemplateAnalyzer(Language); + var composed = new HashSet(lex.AnalyzeComposed("sagd", pinv).Select(Sig)); + + Assert.That( + composed.Any(s => s.Contains("DK")), + Is.True, + "auto-compiled Pinv must recover the left-context deletion form" + ); + Assert.That( + composed.IsSubsetOf(engine), + Is.True, + "soundness: composed candidates must be a subset of the engine's" + ); + Assert.That(lex.AnalyzeComposed("saga", pinv), Is.Empty, "a non-word must yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(kDel); + Morphophonemic.MorphologicalRules.Remove(dkSuffix); + } + } + + [Test] + public void Compile_AutoRecoversLeftContextSubstitution() + { + // A t->d rule triggered only when preceded by "g" (left environment, no right environment): + // exercises the zero-right-environment branch of the left-context chain. Bare root "kagt" + // (Surface stratum, so no morphological rule is needed) surfaces only as "kagd". + LexEntry entry = AddEntry( + "kagt_root", + FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + Surface, + "kagt" + ); + var tToD = new RewriteRule + { + Name = "t_to_d_left", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tToD.Subrules.Add( + new RewriteSubrule + { + LeftEnvironment = Pattern.New().Annotation(Character(Table1, "g")).Value, + Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value, + } + ); + Surface.PhonologicalRules.Add(tToD); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("kagd").Any(), Is.True, "precondition: 'kagd' = kagt (t->d/g_)"); + + var morpher = new Morpher(new TraceManager(), Language); + (InversePhonology pinv, int unsupported) = PhonologyRuleCompiler.Compile(Language, morpher); + Assert.That(unsupported, Is.Zero); + + var lex = new FstTemplateAnalyzer(Language); + Assert.That(lex.AnalyzeWord("kagd"), Is.Empty, "baseline: the underlying-only walk alone must miss 'kagd'"); + + var composed = lex.AnalyzeComposed("kagd", pinv); + Assert.That(composed, Is.Not.Empty, "auto-compiled Pinv must recover the left-conditioned substitution"); + Assert.That(lex.AnalyzeComposed("kagz", pinv), Is.Empty, "a non-word must yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(tToD); + Surface.Entries.Remove(entry); + Entries.Remove("kagt_root"); + } + } + + [Test] + public void LockstepPhonologyProposer_CoversDeletion_WiredThroughComposite() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var kdSuffix = new AffixProcessRule + { + Name = "kd_suffix2", + Gloss = "KD2", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + kdSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table1, "kd") }, + } + ); + Morphophonemic.MorphologicalRules.Add(kdSuffix); + var kDel = new RewriteRule + { + Name = "k_deletion2", + Lhs = Pattern.New().Annotation(Character(Table1, "k")).Value, + }; + kDel.Subrules.Add( + new RewriteSubrule { RightEnvironment = Pattern.New().Annotation(Character(Table1, "d")).Value } + ); + Surface.PhonologicalRules.Add(kDel); + try + { + var search = new Morpher(TraceManager, Language); + var oracle = new HashSet(search.AnalyzeWord("sagd").Select(Sig)); + + CompositeProposer composite = CompositeProposer.ForLanguage( + Language, + new FstTemplateAnalyzer(Language, new Morpher(new TraceManager(), Language)) + ); + var fast = new VerifiedFstAnalyzer( + composite, + new MorpherPool(() => new Morpher(new TraceManager(), Language)) + ); + var got = new HashSet(fast.AnalyzeWord("sagd").Select(Sig)); + + Assert.That( + got.SetEquals(oracle), + Is.True, + "composite (via the lockstep proposer) must match the engine for 'sagd'" + ); + Assert.That(fast.AnalyzeWord("zzz"), Is.Empty, "soundness: a non-word must still yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(kDel); + Morphophonemic.MorphologicalRules.Remove(kdSuffix); + } + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/SurfacePhonologyJunctionTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/SurfacePhonologyJunctionTests.cs new file mode 100644 index 000000000..b0018cd5b --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/SurfacePhonologyJunctionTests.cs @@ -0,0 +1,151 @@ +using NUnit.Framework; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; +using SIL.Machine.Morphology.HermitCrab.PhonologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// CI coverage for junction-deletion probing (Phase C, FST_FULL_GRAMMAR_PLAN.md): a prefix whose +/// insertion abuts a morpheme boundary, where a phonological rule then deletes the FOLLOWING root's +/// own leading segment (Indonesian meN- + a voiceless obstruent onset is the real-grammar case this +/// mirrors — SurfacePhonology.DeletionJunctions + FstTemplateAnalyzer's root-chain +/// checkpoints). Deliberately requires a second segment of right context beyond the deleted one (a +/// following vowel) to satisfy the rule's own environment — the exact shape that broke the first +/// (single-neighbor-only) version of the probe. +/// +public class SurfacePhonologyJunctionTests : HermitCrabTestBase +{ + [OneTimeSetUp] + public void AddBoundaryToTable1() + { + // language.SurfaceStratum (== Surface, Table1-based) is what SurfacePhonology/FstTemplateAnalyzer + // actually segment strings with, regardless of which table an affix's InsertSegments cited when + // the rule was defined — so the boundary character needed for this test must live on Table1. + Table1.AddBoundary("+"); + } + + private AffixProcessRule AddMePrefix() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var prefix = new AffixProcessRule + { + Name = "me_prefix", + Gloss = "AV", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + prefix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new InsertSegments(Table1, "m+"), new CopyFromInput("1") }, + } + ); + Surface.MorphologicalRules.Add(prefix); + return prefix; + } + + private RewriteRule AddVoicelessDeletionAfterBoundary() + { + // p → ∅ / + _ a (mirrors Indonesian's voiceless-obstruent deletion: needs BOTH a left boundary + // AND a right-context vowel beyond the deleted segment itself to fire). + var rule = new RewriteRule + { + Name = "voiceless_deletion", + Lhs = Pattern.New().Annotation(Character(Table1, "p")).Value, + }; + rule.Subrules.Add( + new RewriteSubrule + { + LeftEnvironment = Pattern.New().Annotation(Character(Table1, "+")).Value, + RightEnvironment = Pattern.New().Annotation(Character(Table1, "a")).Value, + } + ); + Surface.PhonologicalRules.Add(rule); + return rule; + } + + [Test] + public void Junction_RecoversRootOnsetDeletion_RequiringTwoSegmentProbe() + { + LexEntry root = AddEntry( + "pat_root", + FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + Surface, + "pat" + ); + AffixProcessRule prefix = AddMePrefix(); + RewriteRule delRule = AddVoicelessDeletionAfterBoundary(); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That( + search.AnalyzeWord("mat").Any(), + Is.True, + "precondition: 'mat' = m+pat with p deleted after the boundary before a vowel" + ); + Assert.That( + search.AnalyzeWord("mpat"), + Is.Empty, + "precondition: the underlying (undeleted) form must not itself surface" + ); + + var fst = new FstTemplateAnalyzer(Language, new Morpher(new TraceManager(), Language)); + List found = fst.AnalyzeWord("mat").ToList(); + Assert.That(found, Is.Not.Empty, "the junction-deletion arc must recover 'mat' directly"); + + var verified = new VerifiedFstAnalyzer(TraceManager, Language); + Assert.That( + verified.AnalyzeWord("mat").Any(), + Is.True, + "the full propose-and-verify path must also recover 'mat'" + ); + Assert.That(verified.AnalyzeWord("mpit"), Is.Empty, "a non-word must yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(delRule); + Surface.MorphologicalRules.Remove(prefix); + Surface.Entries.Remove(root); + Entries.Remove("pat_root"); + } + } + + [Test] + public void Junction_DoesNotSkip_WhenRootOnsetIsNotTheDeletedClass() + { + // "dat" starts with a VOICED obstruent — outside the deletion rule's Lhs entirely — so the + // build-time onset gate (WireDeletionSkips) must never offer the skip arc for this root: only + // the normal, unskipped concatenation "mdat" should be recoverable. + LexEntry root = AddEntry( + "dat_root", + FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + Surface, + "dat" + ); + AffixProcessRule prefix = AddMePrefix(); + RewriteRule delRule = AddVoicelessDeletionAfterBoundary(); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("mdat").Any(), Is.True, "precondition: 'mdat' = m+dat, unaltered"); + + var verified = new VerifiedFstAnalyzer(TraceManager, Language); + Assert.That(verified.AnalyzeWord("mdat").Any(), Is.True, "the unskipped path must still work"); + Assert.That( + verified.AnalyzeWord("mat"), + Is.Empty, + "soundness: skipping 'd' would wrongly recover a word only 'pat' should produce" + ); + } + finally + { + Surface.PhonologicalRules.Remove(delRule); + Surface.MorphologicalRules.Remove(prefix); + Surface.Entries.Remove(root); + Entries.Remove("dat_root"); + } + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs new file mode 100644 index 000000000..b31af2cd8 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs @@ -0,0 +1,880 @@ +using System.Collections.Concurrent; +using NUnit.Framework; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; +using SIL.Machine.Morphology.HermitCrab.PhonologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// CI coverage for the propose-and-verify spine (HERMITCRAB_FST_PLAN.md §11.8/§12): the FST proposes, +/// HC's own engine confirms each candidate by restricted re-analysis (), and +/// the confirmed engine analysis is emitted. Exercises soundness (no false positives), the M2 fix +/// (yields genuine HC analyses with their category), the per-word opt-out, and thread-safety. +/// +public class VerifiedFstAnalyzerTests : HermitCrabTestBase +{ + private AffixProcessRule AddSuffix() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var sSuffix = new AffixProcessRule + { + Name = "s_suffix", + Gloss = "NMLZ", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + sSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, + } + ); + Morphophonemic.MorphologicalRules.Add(sSuffix); + return sSuffix; + } + + [Test] + public void Verified_MatchesSearch_OnConcatenativeCorpus() + { + AffixProcessRule suffix = AddSuffix(); + IMorphologicalAnalyzer search = new Morpher(TraceManager, Language); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(TraceManager, Language); + string[] corpus = { "sag", "sags", "dat", "sagg" }; // inflected, bare, homograph, non-word + AnalysisComparison comparison = FstVerification.Compare(search, verified, corpus); + Assert.That(comparison.MatchesReferenceExactly, Is.True, comparison.Format()); + Morphophonemic.MorphologicalRules.Remove(suffix); + } + + [Test] + public void Verified_RejectsNonWord_NoFalsePositive() + { + IMorphologicalAnalyzer search = new Morpher(TraceManager, Language); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(TraceManager, Language); + Assert.That(search.AnalyzeWord("sagg"), Is.Empty, "precondition: sagg is a non-word"); + Assert.That(verified.AnalyzeWord("sagg"), Is.Empty, "verify must not analyze a non-word"); + } + + [Test] + public void Verified_YieldsGenuineEngineAnalyses_WithCategory() + { + // M2: VerifiedFstAnalyzer must yield the matched HC analysis (real category), not the + // category-less FST candidate. WordAnalysis.Equals includes Category, so set-equality vs the + // engine fails if the category is dropped. + var search = new Morpher(TraceManager, Language); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(TraceManager, Language); + foreach (string word in new[] { "sag", "dat" }) + { + var fromSearch = new HashSet(search.AnalyzeWord(word)); + List fromVerified = verified.AnalyzeWord(word).ToList(); + Assert.That(fromVerified, Is.Not.Empty, $"expected analyses for {word}"); + foreach (WordAnalysis a in fromVerified) + { + Assert.That(a.Category, Is.Not.Null, $"verified analysis of {word} lost its category"); + Assert.That( + fromSearch, + Does.Contain(a), + $"verified analysis of {word} is not a genuine engine analysis" + ); + } + } + } + + [Test] + public void Verified_ParallelMatchesSequential() + { + AddSuffix(); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(TraceManager, Language); + var corpus = new List(); + for (int i = 0; i < 50; i++) + { + corpus.AddRange(new[] { "sag", "sags", "dat", "sat", "saz", "sas", "sagg" }); + } + Dictionary sequential = corpus.Distinct().ToDictionary(w => w, w => SigSet(verified, w)); + var parallel = new ConcurrentDictionary(); + Parallel.ForEach(corpus, w => parallel[w] = SigSet(verified, w)); + Assert.That( + corpus.Distinct().All(w => parallel[w] == sequential[w]), + Is.True, + "concurrent analyses diverged from sequential" + ); + } + + [Test] + public void Verified_CoversPhonologicallyAlteredBareRoot() + { + // Surface-allomorph precompile (§C): an unconditional t→d rule means the underlying bare root + // "dat" (entry 8) can ONLY surface as "dad". The old proposer (underlying arcs) misses it — its + // "t" arc can't match surface "d", and BareRootValid rejected it (it doesn't surface as itself). + // The surface-precompile builds an arc from the actual generated surface ("dad"), so the altered + // bare root is now matched. Confirmed via probe: gen dat(8)→dad, and "dad" analyzes while "dat" + // no longer does. + var tToD = new RewriteRule + { + Name = "t_to_d", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tToD.Subrules.Add( + new RewriteSubrule { Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value } + ); + Surface.PhonologicalRules.Add(tToD); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That( + search.AnalyzeWord("dad").Any(), + Is.True, + "precondition: 'dad' analyzes (bare root 'dat' surfaces as 'dad')" + ); + + // Baseline: the underlying-only proposer (no-morpher ctor builds arcs from underlying shapes) + // misses the altered surface — both "dad" readings are underlying "dat", so it has no "dad" arc. + Assert.That( + new FstTemplateAnalyzer(Language).AnalyzeWord("dad"), + Is.Empty, + "baseline: the underlying-only proposer must miss the phonologically-altered surface" + ); + + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(TraceManager, Language); + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "dad" }); + Assert.That(cmp.MatchesReferenceExactly, Is.True, "altered bare root not covered: " + cmp.Format()); + + Assert.That(verified.AnalyzeWord("zzz"), Is.Empty, "soundness: a non-word must still yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(tToD); + } + } + + [Test] + public void ComposedPhonology_CoversCrossBoundaryAlternation_WherePrecompileMisses() + { + // Point 4 (C-exact, composition with phonology inverse): a CROSS-BOUNDARY rule the per-morpheme + // precompile cannot see. A suffix inserts "t"; the root-final "g" devoices to "k" before that + // suffixal "t" — so sag+SUF = "sagt" -> "sakt". The precompile sees the bare root ("sag", no + // following t -> no devoicing) and the affix ("t") only in isolation, so it builds a "sagt" path + // and MISSES "sakt". Composition un-applies the rule on the assembled surface and recovers it. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var tSuffix = new AffixProcessRule + { + Name = "t_suffix", + Gloss = "TSF", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + tSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table1, "t") }, + } + ); + Morphophonemic.MorphologicalRules.Add(tSuffix); + var gDevoice = new RewriteRule + { + Name = "g_devoice", + Lhs = Pattern.New().Annotation(Character(Table1, "g")).Value, + }; + gDevoice.Subrules.Add( + new RewriteSubrule + { + Rhs = Pattern.New().Annotation(Character(Table1, "k")).Value, + RightEnvironment = Pattern.New().Annotation(Character(Table1, "t")).Value, + } + ); + Surface.PhonologicalRules.Add(gDevoice); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("sakt").Any(), Is.True, "precondition: 'sakt' = sag+TSF (g->k / _t)"); + + // Even the surface-precompile proposer misses the cross-boundary form. + var fst = new FstTemplateAnalyzer(Language, new Morpher(TraceManager, Language)); + Assert.That( + fst.AnalyzeWord("sakt"), + Is.Empty, + "baseline: per-morpheme precompile misses cross-boundary 'sakt'" + ); + + var composed = new ComposedPhonologyProposer(Language, fst); + var pool = new MorpherPool(() => new Morpher(new TraceManager(), Language)); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(new CompositeProposer(fst, composed), pool); + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "sakt" }); + Assert.That( + cmp.MatchesReferenceExactly, + Is.True, + "cross-boundary alternation not covered: " + cmp.Format() + ); + + Assert.That(verified.AnalyzeWord("zzz"), Is.Empty, "soundness: a non-word must still yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(gDevoice); + Morphophonemic.MorphologicalRules.Remove(tSuffix); + } + } + + [Test] + public void Composite_CoversFullReduplication_WhereFstAloneMisses() + { + // Point 3: full reduplication (copy the whole stem) is non-regular — the FST cannot represent + // it, but the ReduplicationProposer strips one copy, recurses the residual through the FST, and + // wraps it with the reduplication morpheme; verify confirms it as a genuine HC analysis. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule + { + Name = "redup", + Gloss = "RED", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, // copy the stem twice + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("sagsag").Any(), Is.True, "precondition: 'sagsag' = RED('sag')"); + + var fst = new FstTemplateAnalyzer(Language, new Morpher(TraceManager, Language)); + Assert.That(fst.AnalyzeWord("sagsag"), Is.Empty, "baseline: the FST alone cannot represent reduplication"); + Assert.That(fst.CoversAllConstructs, Is.False, "reduplication marks the FST not-fully-covered"); + + var composite = new CompositeProposer(fst, new ReduplicationProposer(Language, fst)); + Assert.That(composite.CoversAllConstructs, Is.True, "the reduplication generator covers the skipped op"); + + var pool = new MorpherPool(() => new Morpher(new TraceManager(), Language)); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(composite, pool); + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "sagsag" }); + Assert.That(cmp.MatchesReferenceExactly, Is.True, "reduplication not covered: " + cmp.Format()); + + Assert.That(verified.AnalyzeWord("zzz"), Is.Empty, "soundness: a non-word must still yield nothing"); + + // Soundness of the generalized (any-length, not just half-word) copy scan added for partial + // (CV-style) reduplication support: "sasag" has an incidental short prefix repeat ("sa"+"sag" + // starts with "sa" again) that is NOT a real application of this full-copy-only rule. The + // raw proposer may well propose it (that is the point of scanning every length), but verify + // must reject it — this grammar's redup rule only produces base+base, never CV+base. + Assert.That( + verified.AnalyzeWord("sasag"), + Is.Empty, + "soundness: a coincidental short prefix repeat must not be confirmed by a full-copy-only rule" + ); + } + finally + { + Morphophonemic.MorphologicalRules.Remove(redup); + } + } + + [Test] + public void Composite_CoversSeparatorReduplication_WhereFstAloneMisses() + { + // Phase D (FST_FULL_GRAMMAR_PLAN.md): a copy separated by a literal character rather than + // sitting immediately adjacent to the base — the shape Indonesian's "-Cont" produces + // (menulis-nulis). This toy rule uses a full copy (base+sep+base) rather than a genuine partial + // tail copy (base+sep+TAIL — the real Indonesian shape, which needs a multi-group Lhs pattern; + // no existing test in this repo builds one, so it's unvalidated territory not attempted here, + // same call Phase 4 made for CV-template partial reduplication). The full-121-word Indonesian + // corpus benchmark is the positive evidence for the TAIL case specifically (93/121 -> 120/121 + // this session); this test covers the separator-scan mechanism itself and its soundness. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule + { + Name = "sep_redup", + Gloss = "CONT", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table1, "z"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("sagzsag").Any(), Is.True, "precondition: 'sagzsag' = CONT('sag')"); + + var fst = new FstTemplateAnalyzer(Language, new Morpher(TraceManager, Language)); + Assert.That( + fst.AnalyzeWord("sagzsag"), + Is.Empty, + "baseline: the FST alone cannot represent separator reduplication" + ); + + var composite = new CompositeProposer(fst, new ReduplicationProposer(Language, fst)); + var pool = new MorpherPool(() => new Morpher(new TraceManager(), Language)); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(composite, pool); + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "sagzsag" }); + Assert.That(cmp.MatchesReferenceExactly, Is.True, "separator reduplication not covered: " + cmp.Format()); + + // "sagzag" passes the surface shape check the scan looks for ("ag" is a genuine tail of + // "sag", so the separator scan DOES propose residual "sag") but this toy rule only produces + // a FULL copy (base+sep+base = "sagzsag"), never a tail copy — verify must reject it. + Assert.That( + verified.AnalyzeWord("sagzag"), + Is.Empty, + "soundness: a tail-copy candidate must not be confirmed by a full-copy-only rule" + ); + } + finally + { + Morphophonemic.MorphologicalRules.Remove(redup); + } + } + + [Test] + public void Composite_CoversSuffixStackedOutsideReduplication_WhereSeparatorScanAloneMisses() + { + // Phase G1 (FST_FULL_GRAMMAR_PLAN.md): Indonesian's mengamat-amati is meng+amat -> -Cont -> + // mengamat-amat -> -i(LOC) -> mengamat-amati - a plain suffix rule applied AFTER reduplication, + // which (since it just appends at the very end) lands on the tail of the copy. The separator + // scan alone sees copy="sags" against base="sagzsag" and finds no tail match (the trailing "s" + // isn't part of any copy); it must additionally try peeling a known suffix surface off the + // copy's end before re-testing the remainder as a tail. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule + { + Name = "sep_redup2", + Gloss = "CONT", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table1, "z"), new CopyFromInput("1") }, + } + ); + var trailingSuffix = new AffixProcessRule + { + Name = "trailing_suffix", + Gloss = "TRL", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + trailingSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table1, "s") }, // suffix order + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + Morphophonemic.MorphologicalRules.Add(trailingSuffix); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That( + search.AnalyzeWord("sagzsags").Any(), + Is.True, + "precondition: 'sagzsags' = TRL(CONT('sag')) — the engine must stack the plain suffix on top of the reduplicated form" + ); + + var fst = new FstTemplateAnalyzer(Language, new Morpher(TraceManager, Language)); + var composite = new CompositeProposer(fst, new ReduplicationProposer(Language, fst)); + var pool = new MorpherPool(() => new Morpher(new TraceManager(), Language)); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(composite, pool); + + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "sagzsags" }); + Assert.That( + cmp.MatchesReferenceExactly, + Is.True, + "suffix stacked outside reduplication not covered: " + cmp.Format() + ); + + Assert.That( + verified.AnalyzeWord("sagzdats"), + Is.Empty, + "soundness: a suffix-peeled candidate whose stripped copy isn't a real tail must be rejected" + ); + } + finally + { + Morphophonemic.MorphologicalRules.Remove(redup); + Morphophonemic.MorphologicalRules.Remove(trailingSuffix); + } + } + + [Test] + public void Fst_CoversCompound_ViaTheCompoundLoop() + { + // Phase G2 (FST_FULL_GRAMMAR_PLAN.md): the FST couldn't represent a compound at all until the + // compound loop landed directly in FstTemplateAnalyzer (a shared "join" state every root's + // chain end feeds into and every root's chain entry feeds out of — unlike reduplication/infix, + // this needed no sibling IConstructProposer) plus the FstReplay.Confirm fix (it used to reject + // any candidate with a second LexEntry morpheme outright). The originally-documented + // "cross-cutting WordAnalysis/MorphToken data-model lift" premise for this was wrong — both + // types already represent compounds (MorphOp.Compound); the only real blocker was FstReplay. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + LexEntry head = AddEntry( + "compound_head", + FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + Surface, + "pat" + ); + LexEntry nonHead = AddEntry( + "compound_nonhead", + FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + Surface, + "tak" + ); + var compound = new CompoundingRule { Name = "compound_rule" }; + Surface.MorphologicalRules.Add(compound); + compound.Subrules.Add( + new CompoundingSubrule + { + HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, + NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("head"), new InsertSegments(Table3, "+"), new CopyFromInput("nonHead") }, + } + ); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("pattak").Any(), Is.True, "precondition: 'pattak' = pat+tak compound"); + + var fst = new FstTemplateAnalyzer(Language, new Morpher(TraceManager, Language)); + Assert.That( + fst.AnalyzeWord("pattak"), + Is.Not.Empty, + "the compound loop must let the bare FST propose the compound directly (no sibling generator needed)" + ); + + var pool = new MorpherPool(() => new Morpher(new TraceManager(), Language)); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(fst, pool); + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "pattak" }); + Assert.That(cmp.MatchesReferenceExactly, Is.True, "compound not covered: " + cmp.Format()); + + // Soundness / boundedness: the compound loop is bounded to exactly ONE extra root (no arc + // back into the join), matching CompoundingRule's own default MaxApplicationCount of 1 — a + // three-root chain must be rejected by both the real engine and the verified FST alike. + Assert.That( + search.AnalyzeWord("pattakpat"), + Is.Empty, + "precondition: a 3-root chain exceeds MaxApplicationCount=1, so the engine itself rejects it" + ); + Assert.That( + verified.AnalyzeWord("pattakpat"), + Is.Empty, + "soundness: the compound loop must not chain a third root" + ); + } + finally + { + Surface.MorphologicalRules.Remove(compound); + Surface.Entries.Remove(head); + Surface.Entries.Remove(nonHead); + Entries.Remove("compound_head"); + Entries.Remove("compound_nonhead"); + } + } + + [Test] + public void SurfacePhonology_AppliesRulesForwardToASegmentString() + { + // The forward helper applies synthesis phonology to a segment string in isolation: an + // unconditional t->d rule means "t" surfaces as "d" (and the underlying form is always kept). + var tToD = new RewriteRule + { + Name = "t_to_d", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tToD.Subrules.Add( + new RewriteSubrule { Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value } + ); + Surface.PhonologicalRules.Add(tToD); + try + { + var sp = new SurfacePhonology(Language, new Morpher(TraceManager, Language)); + Assert.That(sp.Variants("t"), Does.Contain("d"), "'t' must surface as 'd'"); + Assert.That(sp.Variants("t"), Does.Contain("t"), "the underlying form is always included"); + } + finally + { + Surface.PhonologicalRules.Remove(tToD); + } + } + + [Test] + public void SurfacePhonology_BoundaryTier_RecoversAffixSurfaceFromNeighborContext() + { + // Point 1b (C-boundary): a suffixal "t" voices to "d" only AFTER "g". In isolation "t" stays + // "t" (1a misses the alternation); with the left neighbor "g" the boundary tier recovers "d". + var tVoice = new RewriteRule + { + Name = "t_voice", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tVoice.Subrules.Add( + new RewriteSubrule + { + Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value, + LeftEnvironment = Pattern.New().Annotation(Character(Table1, "g")).Value, + } + ); + Surface.PhonologicalRules.Add(tVoice); + try + { + var sp = new SurfacePhonology(Language, new Morpher(TraceManager, Language)); + IReadOnlyCollection variants = sp.Variants("t"); + Assert.That(variants, Does.Contain("t"), "underlying form is always included"); + Assert.That( + variants, + Does.Contain("d"), + "boundary tier must recover the post-'g' surface 'd' (isolation alone would miss it)" + ); + } + finally + { + Surface.PhonologicalRules.Remove(tVoice); + } + } + + [Test] + public void Proposer_CoversPhonologicallyAlteredAffix() + { + // Point 1 (affix surface-precompile): a suffix inserts "t", but an unconditional t->d rule means + // it can only surface as "d" — so "sag"+SUF = "sagt" -> "sagd". The underlying-only proposer + // builds a "t" affix arc and misses "sagd"; the surface-precompile proposer builds the "d" arc. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var tSuffix = new AffixProcessRule + { + Name = "t_suffix", + Gloss = "TSF", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + tSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table1, "t") }, + } + ); + Morphophonemic.MorphologicalRules.Add(tSuffix); + var tToD = new RewriteRule + { + Name = "t_to_d", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tToD.Subrules.Add( + new RewriteSubrule { Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value } + ); + Surface.PhonologicalRules.Add(tToD); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("sagd").Any(), Is.True, "precondition: 'sagd' = sag+TSF (t->d)"); + + Assert.That( + new FstTemplateAnalyzer(Language).AnalyzeWord("sagd"), + Is.Empty, + "baseline: the underlying-only proposer builds a 't' affix arc and misses the 'd' surface" + ); + + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(TraceManager, Language); + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "sagd" }); + Assert.That(cmp.MatchesReferenceExactly, Is.True, "altered affix not covered: " + cmp.Format()); + + Assert.That(verified.AnalyzeWord("zzz"), Is.Empty, "soundness: a non-word must still yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(tToD); + Morphophonemic.MorphologicalRules.Remove(tSuffix); + } + } + + [Test] + public void Composite_CoversInfixation_WhereFstAloneMisses() + { + // Point 2: infixation (affix inserted inside the stem). The FST recognizes but does not build + // infix slots; the InfixProposer removes the infix's segments at each interior position, recurses + // the residual through the FST, and appends the infix morpheme. Here an "a" is infixed after the + // first segment: "sag" -> "s·a·ag" = "saag". + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var infix = new AffixProcessRule + { + Name = "a_infix", + Gloss = "INF", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + infix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = + { + Pattern.New("1").Annotation(any).Value, // first segment + Pattern.New("2").Annotation(any).OneOrMore.Value, // rest of stem + }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "a"), new CopyFromInput("2") }, + } + ); + Morphophonemic.MorphologicalRules.Add(infix); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("saag").Any(), Is.True, "precondition: 'saag' = INF('sag')"); + + var fst = new FstTemplateAnalyzer(Language, new Morpher(TraceManager, Language)); + Assert.That(fst.AnalyzeWord("saag"), Is.Empty, "baseline: the FST alone does not build infix slots"); + Assert.That(fst.CoversAllConstructs, Is.False, "infixation marks the FST not-fully-covered"); + + var composite = new CompositeProposer(fst, new InfixProposer(Language, fst)); + Assert.That(composite.CoversAllConstructs, Is.True, "the infix generator covers the skipped op"); + + var pool = new MorpherPool(() => new Morpher(new TraceManager(), Language)); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(composite, pool); + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "saag" }); + Assert.That(cmp.MatchesReferenceExactly, Is.True, "infixation not covered: " + cmp.Format()); + + Assert.That(verified.AnalyzeWord("zzz"), Is.Empty, "soundness: a non-word must still yield nothing"); + } + finally + { + Morphophonemic.MorphologicalRules.Remove(infix); + } + } + + [Test] + public void Composite_WiresGenerators_ReduplicatingGrammarMatchesEngine() + { + // Integration: CompositeProposer.ForLanguage wires the FST + generators, so a reduplicating + // grammar's fast path matches the engine — not just the hand-built composite in the unit tests. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule + { + Name = "redup", + Gloss = "RED", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + try + { + string[] corpus = { "sag", "sagsag", "dat" }; // bare, reduplicated, homograph + var search = new Morpher(TraceManager, Language); + CompositeProposer composite = CompositeProposer.ForLanguage( + Language, + new FstTemplateAnalyzer(Language, new Morpher(new TraceManager(), Language)) + ); + var fast = new VerifiedFstAnalyzer( + composite, + new MorpherPool(() => new Morpher(new TraceManager(), Language)) + ); + foreach (string word in corpus.Append("zzz")) + { + var fastSet = new HashSet(fast.AnalyzeWord(word).Select(Sig)); + var oracle = new HashSet(search.AnalyzeWord(word).Select(Sig)); + Assert.That(fastSet.SetEquals(oracle), Is.True, $"fast path disagrees with the engine for {word}"); + } + } + finally + { + Morphophonemic.MorphologicalRules.Remove(redup); + } + } + + [Test] + public void Composite_WithPhonologyAndReduplication_ParallelMatchesSequential() + { + // Thread-safety on the concurrent path: the composite now runs HC's phonology inverse + // (ComposedPhonologyProposer) and the reduplication generator at analyze time. Drive both + // in parallel and assert no divergence / no exceptions. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule + { + Name = "redup", + Gloss = "RED", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + var tToD = new RewriteRule + { + Name = "t_to_d", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tToD.Subrules.Add( + new RewriteSubrule { Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value } + ); + Surface.PhonologicalRules.Add(tToD); + try + { + CompositeProposer composite = CompositeProposer.ForLanguage( + Language, + new FstTemplateAnalyzer(Language, new Morpher(new TraceManager(), Language)) + ); + var fast = new VerifiedFstAnalyzer( + composite, + new MorpherPool(() => new Morpher(new TraceManager(), Language)) + ); + var corpus = new List(); + for (int i = 0; i < 50; i++) + { + corpus.AddRange(new[] { "sag", "sagsag", "dad", "daddad", "sad", "zzz" }); + } + Dictionary sequential = corpus.Distinct().ToDictionary(w => w, w => SigSet(fast, w)); + var parallel = new ConcurrentDictionary(); + Parallel.ForEach(corpus, w => parallel[w] = SigSet(fast, w)); + Assert.That( + corpus.Distinct().All(w => parallel[w] == sequential[w]), + Is.True, + "concurrent analyses diverged from sequential (composite phonology/redup not thread-safe)" + ); + } + finally + { + Surface.PhonologicalRules.Remove(tToD); + Morphophonemic.MorphologicalRules.Remove(redup); + } + } + + [Test] + public void LeverTwo_LazyComposition_RecoversBoundaryDeletion_RealTypes() + { + // Lever 2 with REAL HC types (LEVER_2.md): lazy-compose an inverse-phonology transducer (Pinv) + // with the underlying morphotactic FST (FstTemplateAnalyzer.AnalyzeComposed). A "-d" suffix plus + // a deletion rule t→∅ / _d means sat+DSF = "satd" → "sad" (the root-final t deletes). The + // underlying-only proposer misses "sad"; lazy composition restores the deleted t — constrained by + // the lexicon — and recovers [sat, DSF]. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + // Suffix whose underlying form is "kd" but whose "k" deletes before "d" → it surfaces as "d". + // All segments are Table1 (the earlier affix-phonology test confirmed Table1 rules fire on + // Table1-inserted affix segments), so this avoids the root-table friction. + var kdSuffix = new AffixProcessRule + { + Name = "kd_suffix", + Gloss = "KD", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + kdSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table1, "kd") }, + } + ); + Morphophonemic.MorphologicalRules.Add(kdSuffix); + var kDel = new RewriteRule + { + Name = "k_deletion", + Lhs = Pattern.New().Annotation(Character(Table1, "k")).Value, + }; + kDel.Subrules.Add( + new RewriteSubrule // no Rhs ⇒ deletion of k before d + { + RightEnvironment = Pattern.New().Annotation(Character(Table1, "d")).Value, + } + ); + Surface.PhonologicalRules.Add(kDel); + try + { + var search = new Morpher(TraceManager, Language); + var engine = new HashSet(search.AnalyzeWord("sagd").Select(Sig)); + Assert.That(engine.Any(s => s.Contains("KD")), Is.True, "precondition: 'sagd' = sag+KD (k→∅/_d)"); + + // Baseline: the underlying-only proposer has a "k" arc the surface "sagd" cannot match. + Assert.That( + new FstTemplateAnalyzer(Language).AnalyzeWord("sagd").Select(Sig).Any(s => s.Contains("KD")), + Is.False, + "baseline: underlying-only proposer misses the deletion form" + ); + + // Pinv: identity on s/a/g/d, plus an ε-input arc restoring a deleted k immediately before a d. + var pinv = new InversePhonology { StartState = 0 }; + pinv.SetAccepting(0); + foreach (string c in new[] { "s", "a", "g", "d" }) + pinv.AddArc(0, Character(Table1, c), Character(Table1, c), 0); + pinv.AddArc(0, null, Character(Table1, "k"), 1); // ε: restore underlying k + pinv.AddArc(1, Character(Table1, "d"), Character(Table1, "d"), 0); // ...immediately before a d + + var lex = new FstTemplateAnalyzer(Language); // default ctor: underlying-only arcs + var composed = new HashSet(lex.AnalyzeComposed("sagd", pinv).Select(Sig)); + + Assert.That( + composed.Any(s => s.Contains("KD")), + Is.True, + "lazy composition must recover the deletion form" + ); + Assert.That(composed.IsSubsetOf(engine), Is.True, "soundness: composed candidates ⊆ engine analyses"); + Assert.That(lex.AnalyzeComposed("saga", pinv), Is.Empty, "a non-word must yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(kDel); + Morphophonemic.MorphologicalRules.Remove(kdSuffix); + } + } + + [Test] + public void ForwardSynthesis_CoversAffixedForms_AndIsSound() + { + // Forward-synthesis precompile: enumerate root × affix combos, synthesize each surface (phonology + // applied WITH the morpheme boundary present — boundary-correct, unlike the inverse), and + // tabulate surface→analysis. Here the s-suffix form "sags" is tabulated and confirmed by verify. + AffixProcessRule suffix = AddSuffix(); + try + { + var search = new Morpher(TraceManager, Language); + var synth = new ForwardSynthesisProposer(Language, new Morpher(TraceManager, Language)); + var pool = new MorpherPool(() => new Morpher(new TraceManager(), Language)); + var composite = new CompositeProposer(new FstTemplateAnalyzer(Language), synth); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(composite, pool); + + foreach (string w in new[] { "sag", "sags", "dat" }) + { + var oracle = new HashSet(search.AnalyzeWord(w).Select(Sig)); + var got = new HashSet(verified.AnalyzeWord(w).Select(Sig)); + Assert.That( + got.IsSubsetOf(oracle), + Is.True, + $"soundness: forward-synth proposed a non-engine analysis for {w}" + ); + Assert.That(got.SetEquals(oracle), Is.True, $"forward-synth + composite should fully cover {w}"); + } + Assert.That(verified.AnalyzeWord("zzz"), Is.Empty, "soundness: a non-word must yield nothing"); + } + finally + { + Morphophonemic.MorphologicalRules.Remove(suffix); + } + } + + private static string Sig(WordAnalysis a) => + string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? "?")) + ":" + a.RootMorphemeIndex; + + private static string SigSet(IMorphologicalAnalyzer analyzer, string word) => + string.Join("|", analyzer.AnalyzeWord(word).Select(Sig).OrderBy(s => s, System.StringComparer.Ordinal)); +} diff --git a/tests/SIL.Machine.Tests/Annotations/AnnotationTests.cs b/tests/SIL.Machine.Tests/Annotations/AnnotationTests.cs index db74eee40..304266a7a 100644 --- a/tests/SIL.Machine.Tests/Annotations/AnnotationTests.cs +++ b/tests/SIL.Machine.Tests/Annotations/AnnotationTests.cs @@ -266,4 +266,375 @@ public void FindDepthFirst() Assert.That(annList.FindDepthFirst(50, Direction.RightToLeft, out result), Is.True); Assert.That(result, Is.EqualTo(annList.Last.Prev)); } + + // Copy-on-write safety net for the Shape/ShapeNode refactor (Plan B): cloning a frozen + // Shape and mutating a cloned node's FeatureStruct must not change the source shape. + private static Shape BuildShape(FeatureSystem featSys) + { + var shape = new Shape(end => new ShapeNode(FeatureStruct.New().Value)); + shape.Add(FeatureStruct.New(featSys).Symbol("a1").Value); + shape.Add(FeatureStruct.New(featSys).Symbol("a2").Value); + shape.Add(FeatureStruct.New(featSys).Symbol("a3").Value); + shape.Freeze(); + return shape; + } + + [Test] + public void CloneShape_MutateClonedNodeFeatureStruct_LeavesSourceShapeUnchanged() + { + var featSys = new FeatureSystem + { + new SymbolicFeature("a", new FeatureSymbol("a1"), new FeatureSymbol("a2"), new FeatureSymbol("a3")), + new SymbolicFeature("b", new FeatureSymbol("b1"), new FeatureSymbol("b2")), + }; + featSys.Freeze(); + + Shape source = BuildShape(featSys); + Shape expected = BuildShape(featSys); + Shape clone = source.Clone(); + + // CopyTo fidelity: same node count and value-equal to the source. + Assert.That(clone.Count, Is.EqualTo(source.Count)); + Assert.That(clone.ValueEquals(source), Is.True); + + // Mutate the first cloned node's feature struct (the in-place pattern HermitCrab uses). + clone.First.Annotation.FeatureStruct.AddValue( + featSys.GetFeature("b"), + new SymbolicFeatureValue(featSys.GetSymbol("b1")) + ); + + // The source shape must be byte-for-byte unchanged. + Assert.That(source.ValueEquals(expected), Is.True, "frozen source shape changed by a clone-node mutation"); + Assert.That(source.First.Annotation.FeatureStruct.ContainsFeature(featSys.GetFeature("b")), Is.False); + Assert.That(clone.First.Annotation.FeatureStruct.ContainsFeature(featSys.GetFeature("b")), Is.True); + } + + [Test] + public void LargeShape_GrowsBackingArrays_PreservesLinkAndCloneIntegrity() + { + var featSys = new FeatureSystem { new SymbolicFeature("a", new FeatureSymbol("a1"), new FeatureSymbol("a2")) }; + featSys.Freeze(); + + // Append well past the initial backing capacity (4) through several doublings, so the + // parallel _nodes/_next/_prev/_frozen arrays are Array.Resize'd multiple times. + var shape = new Shape(end => new ShapeNode(FeatureStruct.New().Value)); + var added = new List(); + for (int i = 0; i < 50; i++) + added.Add(shape.Add(FeatureStruct.New(featSys).Symbol(i % 2 == 0 ? "a1" : "a2").Value)); + + Assert.That(shape.Count, Is.EqualTo(50)); + + // Forward links (First..Last content) preserve insertion order + node identity across growth. + var forward = new List(); + for (ShapeNode n = shape.First; n != shape.End; n = n.Next) + forward.Add(n); + Assert.That(forward, Is.EqualTo(added)); + + // Backward links consistent (Last..First reversed equals insertion order). + var backward = new List(); + for (ShapeNode n = shape.Last; n != shape.Begin; n = n.Prev) + backward.Add(n); + backward.Reverse(); + Assert.That(backward, Is.EqualTo(added)); + + // GetNodes over the content range yields the same nodes. + Assert.That(shape.GetNodes(shape.First, shape.Last).ToList(), Is.EqualTo(added)); + + // Each handle round-trips through the dense index (NodeAt(OffsetOf(n)) == n). + foreach (ShapeNode n in added) + Assert.That(shape.NodeAt(shape.OffsetOf(n)), Is.EqualTo(n)); + + // Mid-list insert then remove exercise the flat-backing mutators + slot bookkeeping. + var inserted = new ShapeNode(FeatureStruct.New(featSys).Symbol("a1").Value); + shape.AddAfter(added[24], inserted); + Assert.That(shape.Remove(added[9]), Is.True); + Assert.That(shape.Count, Is.EqualTo(50)); // 50 + 1 inserted - 1 removed + + var afterMutation = new List(); + for (ShapeNode n = shape.First; n != shape.End; n = n.Next) + afterMutation.Add(n); + Assert.That(afterMutation, Does.Not.Contain(added[9])); + Assert.That(afterMutation, Does.Contain(inserted)); + int idx24 = afterMutation.IndexOf(added[24]); + Assert.That(afterMutation[idx24 + 1], Is.EqualTo(inserted), "inserted node must follow its anchor"); + + // A clone of the large (unfrozen) shape is value-equal and independent. + Shape clone = shape.Clone(); + Assert.That(clone.Count, Is.EqualTo(shape.Count)); + Assert.That(clone.ValueEquals(shape), Is.True); + } + + // RUSTIFY Stage 2 thesis check: the FST flip from TOffset = ShapeNode to TOffset = int maps each + // annotation [startNode, endNode] to the half-open int range [startNode.Tag, endNode.Tag + 1]. + // The whole flip's correctness rests on that mapping preserving the range relationships the FST + // traversal depends on (ordering via CompareTo, Overlaps, Contains) — for SPARSE tags (an + // appended, unfrozen shape: rewrite rules mutate + match unfrozen) AND dense tags (frozen). This + // validates that thesis empirically before any code is built on it. + private static System.Collections.Generic.List> BuildSpannedShape( + FeatureSystem featSys, + bool freeze + ) + { + var shape = new Shape(end => new ShapeNode(FeatureStruct.New().Value)); + shape.Add(FeatureStruct.New(featSys).Symbol("a1").Value); + ShapeNode n1 = shape.Add(FeatureStruct.New(featSys).Symbol("a2").Value); + ShapeNode n2 = shape.Add(FeatureStruct.New(featSys).Symbol("a3").Value); + shape.Add(FeatureStruct.New(featSys).Symbol("a1").Value); + // a spanning (morph-like) annotation over the two middle nodes — exercises start != end + shape.Annotations.Add(Range.Create(n1, n2), FeatureStruct.New(featSys).Symbol("a2").Value); + if (freeze) + shape.Freeze(); + + // every annotation (leaves + the span + its children), excluding the Begin/End anchors whose + // int.MinValue/int.MaxValue tags are handled separately in the real projection + var anns = new System.Collections.Generic.List>(); + foreach (Annotation top in shape.Annotations) + { + foreach (Annotation a in top.GetNodesDepthFirst()) + { + if (a.Range.Start.Tag != int.MinValue && a.Range.End.Tag != int.MaxValue) + anns.Add(a); + } + } + return anns; + } + + private static Shape BuildSpannedShapeObject(FeatureSystem featSys, bool freeze) + { + var shape = new Shape(end => new ShapeNode(FeatureStruct.New().Value)); + shape.Add(FeatureStruct.New(featSys).Symbol("a1").Value); + ShapeNode n1 = shape.Add(FeatureStruct.New(featSys).Symbol("a2").Value); + ShapeNode n2 = shape.Add(FeatureStruct.New(featSys).Symbol("a3").Value); + shape.Add(FeatureStruct.New(featSys).Symbol("a1").Value); + shape.Annotations.Add(Range.Create(n1, n2), FeatureStruct.New(featSys).Symbol("a2").Value); + if (freeze) + shape.Freeze(); + return shape; + } + + private static void AssertProjectionMatches(Shape shape, Annotation src, Annotation proj) + { + // offset = dense node position; a node [s,e] -> half-open [off(s), off(e)+1) + Range expected = Range.Create(shape.OffsetOf(src.Range.Start), shape.OffsetOf(src.Range.End) + 1); + Assert.That(proj.Range, Is.EqualTo(expected), "projected range"); + Assert.That(proj.Optional, Is.EqualTo(src.Optional), "projected optional"); + // FeatureStruct is shared by reference so in-place edits stay visible to the int view + Assert.That(proj.FeatureStruct, Is.SameAs(src.FeatureStruct), "projected FeatureStruct identity"); + Assert.That(proj.Children.Count, Is.EqualTo(src.IsLeaf ? 0 : src.Children.Count), "projected child count"); + if (!src.IsLeaf) + { + Annotation[] sc = src.Children.ToArray(); + Annotation[] pc = proj.Children.ToArray(); + for (int k = 0; k < sc.Length; k++) + AssertProjectionMatches(shape, sc[k], pc[k]); + } + } + + [TestCase(false)] + [TestCase(true)] + public void IntAnnotationProjection_MirrorsShapeNodeAnnotations(bool freeze) + { + var featSys = new FeatureSystem + { + new SymbolicFeature("a", new FeatureSymbol("a1"), new FeatureSymbol("a2"), new FeatureSymbol("a3")), + }; + featSys.Freeze(); + + Shape shape = BuildSpannedShapeObject(featSys, freeze); + + AnnotationList proj = shape.IntAnnotations; + Assert.That(proj.Count, Is.EqualTo(shape.Annotations.Count), "top-level count"); + + // top-level annotations correspond in order (the int range mapping preserves ordering) + Annotation[] srcTop = shape.Annotations.ToArray(); + Annotation[] projTop = proj.ToArray(); + for (int k = 0; k < srcTop.Length; k++) + AssertProjectionMatches(shape, srcTop[k], projTop[k]); + + // NodeAt/OffsetOf round-trip every node (dense offset), including margins + foreach (ShapeNode node in shape) + Assert.That(shape.NodeAt(shape.OffsetOf(node)), Is.SameAs(node), "NodeAt(OffsetOf(node)) round-trip"); + Assert.That(shape.NodeAt(shape.OffsetOf(shape.Begin)), Is.SameAs(shape.Begin)); + Assert.That(shape.NodeAt(shape.OffsetOf(shape.End)), Is.SameAs(shape.End)); + + // the projection is cached against the annotation Version + Assert.That(shape.IntAnnotations, Is.SameAs(proj), "projection cached when unchanged"); + if (!freeze) + { + shape.Add(FeatureStruct.New(featSys).Symbol("a1").Value); + Assert.That(shape.IntAnnotations, Is.Not.SameAs(proj), "projection rebuilt after a mutation"); + } + } + + [TestCase(false)] + [TestCase(true)] + public void IntOffsetRangeMapping_PreservesShapeNodeRangeRelationships(bool freeze) + { + var featSys = new FeatureSystem + { + new SymbolicFeature("a", new FeatureSymbol("a1"), new FeatureSymbol("a2"), new FeatureSymbol("a3")), + }; + featSys.Freeze(); + + System.Collections.Generic.List> anns = BuildSpannedShape(featSys, freeze); + Assert.That(anns.Count, Is.GreaterThanOrEqualTo(4)); + + // sanity: appended (unfrozen) tags really are sparse, not 0..N-1 + if (!freeze) + { + var tags = anns.Select(a => a.Range.Start.Tag).Distinct().OrderBy(t => t).ToArray(); + Assert.That(tags.Length > 1 && tags[1] - tags[0] > 1, Is.True, "expected sparse appended tags"); + } + + static Range ToInt(Annotation a) => Range.Create(a.Range.Start.Tag, a.Range.End.Tag + 1); + + foreach (Annotation x in anns) + { + foreach (Annotation y in anns) + { + Range xs = x.Range, + ys = y.Range; + Range xi = ToInt(x), + yi = ToInt(y); + + Assert.That( + System.Math.Sign(xi.CompareTo(yi)), + Is.EqualTo(System.Math.Sign(xs.CompareTo(ys))), + $"CompareTo sign diverged: shape={xs}.CompareTo({ys}) int={xi}.CompareTo({yi})" + ); + Assert.That( + xi.Overlaps(yi), + Is.EqualTo(xs.Overlaps(ys)), + $"Overlaps diverged: shape={xs}/{ys} int={xi}/{yi}" + ); + Assert.That( + xi.Contains(yi), + Is.EqualTo(xs.Contains(ys)), + $"Contains diverged: shape={xs}/{ys} int={xi}/{yi}" + ); + } + } + } + + [TestCase(false)] + [TestCase(true)] + public void IntRange_StartsAtBoundaryAnchorInEachDirection(bool freeze) + { + var featSys = new FeatureSystem + { + new SymbolicFeature("a", new FeatureSymbol("a1"), new FeatureSymbol("a2"), new FeatureSymbol("a3")), + }; + featSys.Freeze(); + + Shape shape = BuildSpannedShapeObject(featSys, freeze); + + // A directional match begins at IntRange.GetStart(dir); that offset must resolve to the + // boundary anchor itself (Begin for LtR, End for RtL). The End anchor's dense node range is + // half-open [off(End), off(End)+1), so its RtL start coordinate is off(End)+1 — IntRange must + // carry the +1, or a RtL match would begin at the last content node and skip any edit adjacent + // to End (e.g. inserting a deleted segment after the final vowel during analysis). + Assert.That( + shape.IntRange.GetStart(Direction.LeftToRight), + Is.EqualTo(shape.MatchStartOffset(shape.Begin, Direction.LeftToRight)), + "a LtR match must start at the Begin anchor" + ); + Assert.That( + shape.IntRange.GetStart(Direction.RightToLeft), + Is.EqualTo(shape.MatchStartOffset(shape.End, Direction.RightToLeft)), + "a RtL match must start at the End anchor" + ); + } + + [Test] + public void Optional_FlipInvalidatesIntProjection() + { + var featSys = new FeatureSystem + { + new SymbolicFeature("a", new FeatureSymbol("a1"), new FeatureSymbol("a2"), new FeatureSymbol("a3")), + }; + featSys.Freeze(); + + // unfrozen: Optional is only ever flipped on a mutable shape (during analysis/unapplication) + Shape shape = BuildSpannedShapeObject(featSys, freeze: false); + + AnnotationList proj = shape.IntAnnotations; + ShapeNode node = shape.First; + Assert.That(node.Annotation.Optional, Is.False); + + // Flipping Optional is a non-structural change. The int projection copies Optional by value and + // caches against the annotation Version, so the flip must invalidate the cache — otherwise the + // matcher keeps seeing the stale flag and never forks the optional-skip instances. + node.Annotation.Optional = true; + + AnnotationList proj2 = shape.IntAnnotations; + Assert.That(proj2, Is.Not.SameAs(proj), "projection rebuilt after an Optional flip"); + int off = shape.OffsetOf(node); + Annotation projNode = proj2.Single(a => a.Range.Start == off); + Assert.That(projNode.Optional, Is.True, "rebuilt projection reflects the flipped Optional flag"); + } + + [Test] + public void CopyOnWriteClone_NeverInflated_ServesProjectionFromSource() + { + var featSys = new FeatureSystem + { + new SymbolicFeature("a", new FeatureSymbol("a1"), new FeatureSymbol("a2"), new FeatureSymbol("a3")), + }; + featSys.Freeze(); + + // RUSTIFY Stage 3 (III): a clone of a FROZEN shape is copy-on-write — it copies nothing and serves + // the int-offset projection from its frozen source, so a traverse-only clone never materializes. + Shape src = BuildSpannedShapeObject(featSys, freeze: true); + AnnotationList srcProj = src.IntAnnotations; + + Shape clone = src.Clone(); + Assert.That(clone.Count, Is.EqualTo(src.Count), "COW clone reports the source content count"); + Assert.That(clone.IntAnnotations, Is.SameAs(srcProj), "COW clone serves the source's projection"); + Assert.That(clone.IntRange, Is.EqualTo(src.IntRange), "COW clone serves the source's int range"); + } + + [Test] + public void CopyOnWriteClone_MutationInflatesAndDoesNotCorruptSource() + { + var featSys = new FeatureSystem + { + new SymbolicFeature("a", new FeatureSymbol("a1"), new FeatureSymbol("a2"), new FeatureSymbol("a3")), + }; + featSys.Freeze(); + + Shape src = BuildSpannedShapeObject(featSys, freeze: true); + int srcCount = src.Count; + AnnotationList srcProj = src.IntAnnotations; + + Shape clone = src.Clone(); + // Touch a handle, then mutate — this must inflate the clone's own node graph, leaving the shared + // frozen source untouched (the corruption case the gating exists to prevent). + ShapeNode first = clone.First; + clone.AddAfter(first, FeatureStruct.New(featSys).Symbol("a1").Value); + + Assert.That(clone.Count, Is.EqualTo(srcCount + 1), "the clone was mutated"); + Assert.That(src.Count, Is.EqualTo(srcCount), "the frozen source count is unchanged"); + Assert.That(src.IntAnnotations, Is.SameAs(srcProj), "the frozen source projection is unchanged"); + } + + [Test] + public void CopyOnWriteClone_FrozenBySharing_HashStableAcrossInflation() + { + var featSys = new FeatureSystem + { + new SymbolicFeature("a", new FeatureSymbol("a1"), new FeatureSymbol("a2"), new FeatureSymbol("a3")), + }; + featSys.Freeze(); + + Shape src = BuildSpannedShapeObject(featSys, freeze: true); + int srcHash = src.GetFrozenHashCode(); + + Shape clone = src.Clone(); + clone.Freeze(); // no-op: adopts the source's frozen state + hash without materializing nodes + Assert.That(clone.GetFrozenHashCode(), Is.EqualTo(srcHash), "frozen-by-sharing hash equals the source"); + + // Forcing inflation (handle access) re-materializes + re-freezes; the hash must be unchanged. + ShapeNode _ = clone.First; + Assert.That(clone.GetFrozenHashCode(), Is.EqualTo(srcHash), "hash stable across COW inflation"); + } } diff --git a/tests/SIL.Machine.Tests/FeatureModel/FeatureStructTests.cs b/tests/SIL.Machine.Tests/FeatureModel/FeatureStructTests.cs index c733110ee..2f5a6430d 100644 --- a/tests/SIL.Machine.Tests/FeatureModel/FeatureStructTests.cs +++ b/tests/SIL.Machine.Tests/FeatureModel/FeatureStructTests.cs @@ -1017,6 +1017,37 @@ public void BitArray() SkipAndCheck(featPos, 17, "v1"); } + [Test] + public void UlongSymbolicFeatureValueFlags_SixtyFourSymbols_MaskCoversWholeUlong() + { + // Regression for the 64-symbol boundary: SymbolicFeatureValue.CreateFlags routes a feature + // with up to 64 symbols to the ulong implementation, whose mask was computed as + // `(1UL << 64) - 1`. A ulong shift count is masked to its low 6 bits, so that is `1UL << 0` + // minus 1 == 0, breaking every mask-dependent op. The existing BitArray() test only checks + // positive first/last values (Set/Get), so it never exercised the mask at this boundary. + var symbols = new System.Collections.Generic.List(); + for (int i = 0; i < 64; i++) + symbols.Add(new FeatureSymbol("s" + i)); + var feature = new SymbolicFeature("f64", symbols); + + var all = new UlongSymbolicFeatureValueFlags(feature); + all.Set(feature.PossibleSymbols); + // A value holding every allowed symbol is fully instantiated / unconstrained. + Assert.That(all.HasAllSet(), Is.True, "64-symbol full set must satisfy HasAllSet (mask must be all-ones)"); + + // Negating the full set must yield the empty set (not everything, and not itself). + ISymbolicFeatureValueFlags none = all.Not(); + Assert.That(none.HasAnySet(), Is.False, "Not(full set) must be empty at 64 symbols"); + + // Negating a single-symbol value must select exactly the other 63. + var single = new UlongSymbolicFeatureValueFlags(feature); + single.Set(new[] { symbols[0] }); + ISymbolicFeatureValueFlags rest = single.Not(); + Assert.That(rest.Get(symbols[0]), Is.False); + Assert.That(rest.Get(symbols[63]), Is.True); + Assert.That(rest.HasAllSet(), Is.False); + } + private static void SkipAndCheck(SymbolicFeature featPos, int iSkip, string sFirst) { var symbols = featPos.PossibleSymbols.Skip(iSkip); @@ -1032,4 +1063,212 @@ private static void CheckFirstAndLastValues(FeatureSystem featSys, string sFirst FeatureStruct fs2 = FeatureStruct.NewMutable(featSys).Symbol("ncp").Value; Assert.That(fs2.ToString(), Is.EqualTo("[POS:ncp]")); } + + // --------------------------------------------------------------------------------- + // Copy-on-write characterization tests (safety net for the COW FeatureStruct refactor). + // Invariant under test: cloning a FROZEN feature struct and mutating the clone must never + // alter the (potentially shared) frozen source — INCLUDING nested children. A naive/shallow + // copy-on-write would let the internal recursive mutators (which have no per-level frozen + // check) write into a shared frozen child and silently corrupt the source. "No exception" + // is therefore insufficient: every test asserts the SOURCE is byte-for-byte unchanged. + // --------------------------------------------------------------------------------- + + private static FeatureSystem CowFeatSys() + { + var featSys = new FeatureSystem + { + new ComplexFeature("cx1"), + new ComplexFeature("cx2"), + new SymbolicFeature("a", new FeatureSymbol("a1"), new FeatureSymbol("a2"), new FeatureSymbol("a3")), + new SymbolicFeature("b", new FeatureSymbol("b1"), new FeatureSymbol("b2")), + new SymbolicFeature("c", new FeatureSymbol("c1"), new FeatureSymbol("c2")), + }; + featSys.Freeze(); + return featSys; + } + + // frozen [cx1:[a:a1 b:b1]] + private static FeatureStruct BuildNestedFrozen(FeatureSystem featSys) + { + return FeatureStruct.New(featSys).Feature("cx1").EqualTo(cx1 => cx1.Symbol("a1").Symbol("b1")).Value; + } + + // frozen [a:a1 b:b1] + private static FeatureStruct BuildFlatFrozen(FeatureSystem featSys) + { + return FeatureStruct.New(featSys).Symbol("a1").Symbol("b1").Value; + } + + private static void AssertSourceUnchanged(FeatureStruct source, FeatureStruct expected) + { + Assert.That(source.ValueEquals(expected), Is.True, "frozen source value changed by a clone mutation"); + Assert.That( + source.ToString(), + Is.EqualTo(expected.ToString()), + "frozen source string changed by a clone mutation" + ); + } + + [Test] + public void Clone_FrozenNested_PriorityUnionOnClone_LeavesSourceUnchanged() + { + FeatureSystem featSys = CowFeatSys(); + FeatureStruct source = BuildNestedFrozen(featSys); + FeatureStruct other = FeatureStruct.New(featSys).Feature("cx1").EqualTo(cx1 => cx1.Symbol("a2")).Value; + + FeatureStruct clone = source.Clone(); + clone.PriorityUnion(other); // recurses into and mutates the (shared) cx1 child + + AssertSourceUnchanged(source, BuildNestedFrozen(featSys)); + Assert.That(clone.ValueEquals(source), Is.False, "clone was not actually mutated"); + } + + [Test] + public void Clone_FrozenNested_UnionOnClone_LeavesSourceUnchanged() + { + FeatureSystem featSys = CowFeatSys(); + FeatureStruct source = BuildNestedFrozen(featSys); + FeatureStruct other = FeatureStruct.New(featSys).Feature("cx1").EqualTo(cx1 => cx1.Symbol("a1")).Value; + + FeatureStruct clone = source.Clone(); + clone.Union(other); + + AssertSourceUnchanged(source, BuildNestedFrozen(featSys)); + } + + [Test] + public void Clone_FrozenNested_SubtractOnClone_LeavesSourceUnchanged() + { + FeatureSystem featSys = CowFeatSys(); + FeatureStruct source = BuildNestedFrozen(featSys); + FeatureStruct other = FeatureStruct.New(featSys).Feature("cx1").EqualTo(cx1 => cx1.Symbol("a1")).Value; + + FeatureStruct clone = source.Clone(); + clone.Subtract(other); + + AssertSourceUnchanged(source, BuildNestedFrozen(featSys)); + } + + [Test] + public void Clone_FrozenFlat_AddValueOnClone_LeavesSourceUnchanged() + { + FeatureSystem featSys = CowFeatSys(); + FeatureStruct source = BuildFlatFrozen(featSys); + + FeatureStruct clone = source.Clone(); + clone.AddValue(featSys.GetFeature("c"), new SymbolicFeatureValue(featSys.GetSymbol("c1"))); + + AssertSourceUnchanged(source, BuildFlatFrozen(featSys)); + Assert.That(source.ContainsFeature(featSys.GetFeature("c")), Is.False); + Assert.That(clone.ContainsFeature(featSys.GetFeature("c")), Is.True); + } + + [Test] + public void Clone_FrozenFlat_RemoveValueOnClone_LeavesSourceUnchanged() + { + FeatureSystem featSys = CowFeatSys(); + FeatureStruct source = BuildFlatFrozen(featSys); + + FeatureStruct clone = source.Clone(); + clone.RemoveValue(featSys.GetFeature("b")); + + AssertSourceUnchanged(source, BuildFlatFrozen(featSys)); + Assert.That(source.ContainsFeature(featSys.GetFeature("b")), Is.True); + Assert.That(clone.ContainsFeature(featSys.GetFeature("b")), Is.False); + } + + [Test] + public void Clone_FrozenFlat_ClearOnClone_LeavesSourceUnchanged() + { + FeatureSystem featSys = CowFeatSys(); + FeatureStruct source = BuildFlatFrozen(featSys); + + FeatureStruct clone = source.Clone(); + clone.Clear(); + + AssertSourceUnchanged(source, BuildFlatFrozen(featSys)); + Assert.That(source.IsEmpty, Is.False); + Assert.That(clone.IsEmpty, Is.True); + } + + [Test] + public void Clone_OfFrozen_IsMutable() + { + FeatureSystem featSys = CowFeatSys(); + FeatureStruct source = BuildFlatFrozen(featSys); + + FeatureStruct clone = source.Clone(); + + // a fresh clone is NOT frozen: it has no valid frozen hash but it can be mutated + Assert.Throws(() => clone.GetFrozenHashCode()); + Assert.DoesNotThrow(() => + clone.AddValue(featSys.GetFeature("c"), new SymbolicFeatureValue(featSys.GetSymbol("c1"))) + ); + // and the frozen source still rejects mutation + Assert.Throws(() => + source.AddValue(featSys.GetFeature("c"), new SymbolicFeatureValue(featSys.GetSymbol("c1"))) + ); + } + + [Test] + public void Clone_OfFrozen_NeverMutated_EqualsSourceBothDirections() + { + FeatureSystem featSys = CowFeatSys(); + FeatureStruct source = BuildNestedFrozen(featSys); + + FeatureStruct clone = source.Clone(); + + Assert.That(source.ValueEquals(clone), Is.True); + Assert.That(clone.ValueEquals(source), Is.True); + Assert.That(FreezableEqualityComparer.Default.Equals(source, clone), Is.True); + } + + [Test] + public void Clone_FrozenReentrant_MutateClone_PreservesSharingAndLeavesSourceUnchanged() + { + FeatureSystem featSys = CowFeatSys(); + // [cx1:[a:a1](1) cx2->1] — cx1 and cx2 are the SAME structure (re-entrant) + Func build = () => + FeatureStruct + .New(featSys) + .Feature("cx1") + .EqualTo(1, cx1 => cx1.Symbol("a1")) + .Feature("cx2") + .ReferringTo(1) + .Value; + FeatureStruct source = build(); + + FeatureStruct clone = source.Clone(); + clone.AddValue(featSys.GetFeature("b"), new SymbolicFeatureValue(featSys.GetSymbol("b1"))); + + AssertSourceUnchanged(source, build()); + // the clone must still share its cx1/cx2 substructure after the inflate + Assert.That( + ReferenceEquals(clone.GetValue("cx1"), clone.GetValue("cx2")), + Is.True, + "re-entrant substructure sharing was lost by clone" + ); + } + + [Test] + public void Clone_FrozenWithVariable_ReplaceVariablesOnClone_LeavesSourceVariableIntact() + { + var featSys = new FeatureSystem + { + new SymbolicFeature("a", new FeatureSymbol("a+", "+"), new FeatureSymbol("a-", "-")), + new SymbolicFeature("b", new FeatureSymbol("b+", "+"), new FeatureSymbol("b-", "-")), + }; + featSys.Freeze(); + Func build = () => + FeatureStruct.New(featSys).Feature("a").EqualToVariable("var1").Symbol("b-").Value; + FeatureStruct source = build(); + + var bindings = new VariableBindings(); + bindings["var1"] = new SymbolicFeatureValue(featSys.GetSymbol("a+")); + FeatureStruct clone = source.Clone(); + clone.ReplaceVariables(bindings); + + AssertSourceUnchanged(source, build()); + Assert.That(clone.ValueEquals(source), Is.False, "ReplaceVariables did not change the clone"); + } } diff --git a/tests/SIL.Machine.Tests/FiniteState/FstTests.cs b/tests/SIL.Machine.Tests/FiniteState/FstTests.cs index f3fc213b1..db480b741 100644 --- a/tests/SIL.Machine.Tests/FiniteState/FstTests.cs +++ b/tests/SIL.Machine.Tests/FiniteState/FstTests.cs @@ -240,4 +240,72 @@ public void Transduce() Assert.That(resultsArray.Length, Is.EqualTo(2)); Assert.That(resultsArray.Select(r => r.Output.String), Is.EquivalentTo(new[] { "cas+.p", "cas.p" })); } + + [Test] + public void TransduceNondeterministic_MatchesDeterminized() + { + // Exercises the nondeterministic FST traversal (NondeterministicFstTraversalMethod + + // VisitedStates) by transducing an FST directly, without Determinize() first. Oracle: + // the accepted outputs must match the determinized FST's (determinization preserves the + // relation), which is the same nas-assimilation transducer used by Transduce(). + var fst = new Fst(_operations) { UseUnification = false }; + fst.StartState = fst.CreateAcceptingState(); + fst.StartState.Arcs.Add(FeatureStruct.New(PhoneticFeatSys).Symbol("nas-", "nas?").Value, fst.StartState); + fst.StartState.Arcs.Add( + FeatureStruct.New(PhoneticFeatSys).Symbol("nas+").Symbol("cor+", "cor-").Value, + fst.StartState + ); + State s1 = fst.StartState.Arcs.Add( + FeatureStruct.New(PhoneticFeatSys).Symbol("cor?").Symbol("nas+").Value, + FeatureStruct.New(PhoneticFeatSys).Symbol("cor-").Value, + fst.CreateState() + ); + s1.Arcs.Add(FeatureStruct.New(PhoneticFeatSys).Symbol("cor-").Value, fst.StartState); + State s2 = fst.StartState.Arcs.Add( + FeatureStruct.New(PhoneticFeatSys).Symbol("cor?").Symbol("nas+").Value, + FeatureStruct.New(PhoneticFeatSys).Symbol("cor+").Value, + fst.CreateAcceptingState() + ); + s2.Arcs.Add( + FeatureStruct.New(PhoneticFeatSys).Symbol("cor?").Symbol("nas+").Value, + FeatureStruct.New(PhoneticFeatSys).Symbol("cor+").Value, + s2 + ); + s2.Arcs.Add( + FeatureStruct.New(PhoneticFeatSys).Symbol("nas-", "nas?").Symbol("cor+", "cor?").Value, + fst.StartState + ); + s2.Arcs.Add(FeatureStruct.New(PhoneticFeatSys).Symbol("nas+").Symbol("cor+").Value, fst.StartState); + s2.Arcs.Add( + FeatureStruct.New(PhoneticFeatSys).Symbol("cor?").Symbol("nas+").Value, + FeatureStruct.New(PhoneticFeatSys).Symbol("cor-").Value, + s1 + ); + + Assert.That(fst.IsDeterministic, Is.False, "the raw FST must take the nondeterministic traversal path"); + Fst dfst = fst.Determinize(); + + // This transducer has no epsilon-input arcs, so the raw nondeterministic traversal and the + // determinized FST accept exactly the same (input, output) relation. (Epsilon-input FSTs + // are always determinized before transducing in production, so raw-NFST transduce of those + // is out of scope here.) + foreach (string input in new[] { "caNp", "caN", "carp" }) + { + AnnotatedStringData ndData = CreateStringData(input); + IEnumerable> ndResults; + Assert.That( + fst.Transduce(ndData, ndData.Annotations.First, null, true, true, true, out ndResults), + Is.True, + $"nondeterministic transduce of '{input}' should succeed" + ); + AnnotatedStringData dData = CreateStringData(input); + IEnumerable> dResults; + Assert.That(dfst.Transduce(dData, dData.Annotations.First, null, true, true, true, out dResults), Is.True); + Assert.That( + ndResults.Select(r => r.Output.String).Distinct(), + Is.EquivalentTo(dResults.Select(r => r.Output.String).Distinct()), + $"nondeterministic and determinized transduce of '{input}' must accept the same outputs" + ); + } + } }