Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ tests/SIL.Machine.Tests/Corpora/TestData/usfm/target/*
tests/SIL.Machine.Tests/Corpora/TestData/project/*
tests/SIL.Machine.Tests/Corpora/TestData/pretranslations.json
.idea
.worktrees

# Local-only HermitCrab benchmark fixtures (real Sena/Indonesian grammars + word lists, used
# for ad hoc perf/allocation testing) + FieldWorks project backups. Large and/or not licensed
Expand Down
360 changes: 360 additions & 0 deletions docs/hermitcrab-parse-algorithm-analysis.md

Large diffs are not rendered by default.

241 changes: 241 additions & 0 deletions src/SIL.Machine.Morphology.HermitCrab.Tool/BatchCommand.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using ManyConsole;

namespace SIL.Machine.Morphology.HermitCrab;

/// <summary>
/// Phase 0 of parse-optimization.md: parses every word in a word list and records a per-word result
/// signature plus elapsed time, so two runs (before/after an engine change) can be diffed to confirm
/// parse results are unchanged. Flushed per word and crash-resumable via --start, since some corpus
/// words are expensive enough (100+ seconds, and one has been observed to crash a process outright) that
/// losing partial progress on a multi-hour corpus run is unacceptable.
/// </summary>
internal class BatchCommand : ConsoleCommand
{
private readonly HCContext _context;
private int _startIndex;
private string _ruleStatsPath;
private bool _parallel;
private int _parallelDegree = -1;

/// <summary>
/// Registers the <c>batch</c> command with ManyConsole: its name and summary, its two positional
/// arguments, and the <c>--start</c>, <c>--rule-stats</c> and <c>--parallel</c> options.
/// </summary>
/// <param name="context">Tool context; stored so <see cref="Run"/> can reach the Morpher and the output writer.</param>
public BatchCommand(HCContext context)
{
    // Help texts are hoisted into named constants so the registration calls below read as a table.
    const string commandSummary =
        "Parses every word in a word list, recording a result signature and timing per word (see parse-optimization.md Phase 0)";
    const string startHelp = "0-based line index to resume at (for crash recovery; ignored with --parallel)";
    const string ruleStatsHelp =
        "accumulate per-rule firing stats (category/stem/allomorph/environment buckets, with example "
        + "words) across the whole run and write a report to {FILE} -- run with --sequential, the "
        + "counters are not thread-safe";
    const string parallelHelp =
        "parse words concurrently across a load-balanced, longest-word-first Parallel.ForEach "
        + "(parse-optimization.md Phase 8a) -- requires the Morpher itself to be --sequential for "
        + "the per-word memo tables to engage; degree defaults to Environment.ProcessorCount, or "
        + "{N} if given; trades --start crash-resume for speed (output is buffered and written "
        + "index-ordered at the end)";

    _context = context;

    IsCommand("batch", commandSummary);
    SkipsCommandSummaryBeforeRunning();
    HasAdditionalArguments(2, "<wordlist-file> <output-tsv>");

    HasOption(
        "start=",
        startHelp,
        value =>
        {
            _startIndex = int.Parse(value);
        }
    );
    HasOption(
        "rule-stats=",
        ruleStatsHelp,
        path =>
        {
            _ruleStatsPath = path;
        }
    );
    // "parallel:" takes an optional value: the flag alone turns parallel mode on, and a value
    // additionally sets the requested degree.
    HasOption(
        "parallel:",
        parallelHelp,
        degree =>
        {
            _parallel = true;
            if (string.IsNullOrEmpty(degree))
                return;
            _parallelDegree = int.Parse(degree);
        }
    );
}

/// <summary>
/// Loads the word list, validates the option combination, and dispatches to the sequential or
/// parallel runner.
/// </summary>
/// <param name="remainingArguments">
/// Positional arguments: <c>[0]</c> is the word-list file (one word per line; lines are trimmed and
/// blank lines dropped), <c>[1]</c> is the TSV output path.
/// </param>
/// <returns>0 on success; -1 when the options are invalid.</returns>
public override int Run(string[] remainingArguments)
{
    string wordListPath = remainingArguments[0];
    string outputPath = remainingArguments[1];

    // A negative --start would open the output in overwrite mode (append is only used when the
    // start index is > 0), truncating an earlier run's results, and then fail on the first
    // words[i] lookup. Reject it before touching any file. Under --parallel the start index is
    // ignored, so it is not an error there.
    // NOTE(review): option fields are never reset after a run -- confirm whether this command
    // instance is reused across invocations in the interactive loop.
    if (!_parallel && _startIndex < 0)
    {
        _context.Out.WriteLine("ERROR: --start must be a non-negative index.");
        return -1;
    }

    string[] words = File.ReadAllLines(wordListPath).Select(w => w.Trim()).Where(w => w.Length > 0).ToArray();

    if (_ruleStatsPath != null)
    {
        if (_parallel)
        {
            _context.Out.WriteLine(
                "ERROR: --rule-stats and --parallel cannot be combined (counters are not thread-safe)."
            );
            return -1;
        }
        if (_context.Morpher.MaxDegreeOfParallelism != 1)
        {
            _context.Out.WriteLine(
                "WARNING: --rule-stats requested without --sequential; per-rule counters are not "
                    + "thread-safe and will be unreliable under within-word parallelism."
            );
        }
        _context.Morpher.AccumulateRuleStats = true;
    }

    if (_parallel)
    {
        if (_context.Morpher.MaxDegreeOfParallelism != 1)
        {
            _context.Out.WriteLine(
                "WARNING: --parallel requested without --sequential; the per-word memo tables "
                    + "(parse-optimization.md Phases 2/3/3b) only engage on the sequential cascade, so "
                    + "this run will not get their benefit."
            );
        }
        if (_startIndex > 0)
        {
            _context.Out.WriteLine("WARNING: --start is ignored under --parallel; running the full word list.");
        }
        return RunParallel(words, outputPath);
    }

    return RunSequential(words, outputPath);
}

/// <summary>
/// Parses the words one at a time, starting at the <c>--start</c> index, writing two auto-flushed
/// lines per word to the output: a STARTED sentinel before the attempt and a result row after it.
/// </summary>
/// <param name="words">The filtered word list.</param>
/// <param name="outputPath">TSV output file; appended to when resuming (start index > 0), otherwise overwritten.</param>
/// <returns>Always 0.</returns>
private int RunSequential(string[] words, string outputPath)
{
    bool resuming = _startIndex > 0;
    using var tsv = new StreamWriter(outputPath, append: resuming) { AutoFlush = true };
    Stopwatch wallClock = Stopwatch.StartNew();
    bool wantRuleStats = _ruleStatsPath != null;
    long parsedCount = 0;
    long skippedCount = 0;

    for (int index = _startIndex; index < words.Length; index++)
    {
        string current = words[index];

        // The sentinel goes out before the parse so that, if this word takes the process down, the
        // last line of the file tells a wrapper script where to resume (see run_sena_shards.ps1
        // precedent).
        writer_WriteStarted(index, current);

        var outcome = ParseOneWord(current);
        tsv.WriteLine($"{index}\t{current}\t{outcome.elapsedMs}\t{outcome.status}\t{outcome.signature}");

        if (outcome.status != "SKIPPED")
            parsedCount++;
        else
            skippedCount++;

        if (index % 100 != 0)
            continue;

        _context.Out.WriteLine("[{0}/{1}]", index, words.Length);
        // The report is rewritten from scratch at each checkpoint, so a crash on a pathological
        // word still leaves a report covering everything parsed up to the last checkpoint.
        if (wantRuleStats)
            WriteRuleStatsReport();
    }

    wallClock.Stop();
    if (wantRuleStats)
        WriteRuleStatsReport();

    _context.Out.WriteLine(
        "batch complete: {0} words parsed ({1} skipped), {2}ms total",
        parsedCount,
        skippedCount,
        wallClock.ElapsedMilliseconds
    );
    return 0;

    // Writes the pre-attempt sentinel row for one word.
    void writer_WriteStarted(int i, string word) => tsv.WriteLine($"{i}\t{word}\tSTARTED");
}

// Phase 8a: the per-word AutoFlush writer used by the sequential path is not thread-safe, and
// crash-resume has no meaning once words are handed out out-of-order, so each row is buffered in a
// slot keyed by its original index and the whole file is written once at the end, in index order.
// Ordering the work queue longest-word-first, combined with the load-balanced (chunked, not static
// range) partitioner below, is what closes the 2.9x gap between wall clock and the perfect-packing
// bound measured on 2026-07-03 -- heavy words no longer cluster onto a few threads.
/// <summary>
/// Parses all words concurrently and writes the result rows, index-ordered, after every word is done.
/// </summary>
/// <param name="words">The filtered word list.</param>
/// <param name="outputPath">TSV output file; always overwritten.</param>
/// <returns>Always 0.</returns>
private int RunParallel(string[] words, string outputPath)
{
    var rows = new string[words.Length];
    int[] order = Enumerable.Range(0, words.Length).OrderByDescending(i => words[i].Length).ToArray();

    var totalSw = Stopwatch.StartNew();
    long parsed = 0,
        skipped = 0;
    long completed = 0;

    // The tool's output writer is not necessarily Console.Out (the context is built from a
    // caller-supplied writer), and TextWriter instance members are not guaranteed to be
    // thread-safe, so progress lines from the workers are serialized through this gate.
    var progressGate = new object();

    var parallelOptions = new ParallelOptions();
    if (_parallelDegree > 0)
        parallelOptions.MaxDegreeOfParallelism = _parallelDegree;

    Parallel.ForEach(
        Partitioner.Create(order, loadBalance: true),
        parallelOptions,
        i =>
        {
            string word = words[i];
            (string status, long elapsedMs, string signature) = ParseOneWord(word);
            // Each worker writes only its own slot, so the row buffer needs no locking.
            rows[i] = $"{i}\t{word}\t{elapsedMs}\t{status}\t{signature}";
            if (status == "SKIPPED")
                Interlocked.Increment(ref skipped);
            else
                Interlocked.Increment(ref parsed);
            long n = Interlocked.Increment(ref completed);
            if (n % 100 == 0)
            {
                lock (progressGate)
                {
                    _context.Out.WriteLine("[{0}/{1}]", n, words.Length);
                }
            }
        }
    );
    totalSw.Stop();

    using (var writer = new StreamWriter(outputPath, append: false))
    {
        foreach (string row in rows)
            writer.WriteLine(row);
    }

    _context.Out.WriteLine(
        "batch complete: {0} words parsed ({1} skipped), {2}ms total",
        parsed,
        skipped,
        totalSw.ElapsedMilliseconds
    );
    return 0;
}

/// <summary>
/// Parses a single word with the context's Morpher and summarizes the outcome.
/// </summary>
/// <param name="word">The surface form to parse.</param>
/// <returns>
/// <c>("ok", elapsed milliseconds, result signature)</c> on success, or <c>("SKIPPED", 0, "-")</c>
/// when an <see cref="InvalidShapeException"/> is thrown.
/// </returns>
private (string status, long elapsedMs, string signature) ParseOneWord(string word)
{
    Stopwatch timer = Stopwatch.StartNew();
    try
    {
        // Materialize the parses before stopping the timer so the measurement covers the full
        // enumeration; the signature is built after the timer has stopped.
        var analyses = _context.Morpher.ParseWord(word, out _);
        Word[] parses = analyses.ToArray();
        timer.Stop();
        long millis = timer.ElapsedMilliseconds;
        string resultSignature = BuildSignature(parses);
        return (status: "ok", elapsedMs: millis, signature: resultSignature);
    }
    catch (InvalidShapeException)
    {
        return (status: "SKIPPED", elapsedMs: 0, signature: "-");
    }
}

/// <summary>
/// Overwrites the <c>--rule-stats</c> report file with the Morpher's current analysis and synthesis
/// rule statistics.
/// </summary>
private void WriteRuleStatsReport()
{
    using (var report = new StreamWriter(_ruleStatsPath, append: false))
    {
        var morpher = _context.Morpher;
        RuleStatsReport.Write(report, "Analysis", morpher.AnalysisRuleStats);
        RuleStatsReport.Write(report, "Synthesis", morpher.SynthesisRuleStats);
    }
}

// The per-parse entries are sorted (ordinal) before joining, so two runs that find the same parses
// in a different internal order still produce the same signature; a change in this signature means
// parse RESULTS changed, which every phase in parse-optimization.md is required not to do.
/// <summary>
/// Builds one string describing a set of parses: each parse contributes its morpheme ids joined by
/// "+", then "|", then the regex-string form of its shape; entries are sorted and joined by ";".
/// </summary>
/// <param name="results">The parses of one word.</param>
/// <returns>The combined signature, or "-" when there are no parses.</returns>
private static string BuildSignature(IEnumerable<Word> results)
{
    var entries = new List<string>();
    foreach (Word parse in results)
    {
        string morphemeIds = string.Join("+", parse.AllomorphsInMorphOrder.Select(allo => allo.Morpheme.Id));
        string shapeText = parse.Shape.ToRegexString(parse.Stratum.CharacterDefinitionTable, true);
        entries.Add(morphemeIds + "|" + shapeText);
    }

    if (entries.Count == 0)
        return "-";

    entries.Sort(StringComparer.Ordinal);
    return string.Join(";", entries);
}
}
4 changes: 2 additions & 2 deletions src/SIL.Machine.Morphology.HermitCrab.Tool/HCContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ internal class HCContext(Language language, TextWriter outWriter)
private Morpher _morpher;
private readonly TextWriter _outWriter = outWriter;

public void Compile()
public void Compile(bool sequential = false)
{
_morpher = new Morpher(new TraceManager(), _language);
_morpher = new Morpher(new TraceManager(), _language, sequential ? 1 : -1);
}

public Language Language
Expand Down
20 changes: 19 additions & 1 deletion src/SIL.Machine.Morphology.HermitCrab.Tool/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ public static int Main(string[] args)
string scriptFile = null;
bool showHelp = false;
bool quitOnError = true;
bool sequential = false;
bool lexicalGate = false;

var p = new OptionSet
{
Expand All @@ -31,6 +33,20 @@ public static int Main(string[] args)
"continues when an error occurs while loading the configuration",
value => quitOnError = value == null
},
{
"sequential",
"parse single-threaded (maxDegreeOfParallelism: 1) -- the mode a caller that "
+ "parallelizes across words itself (e.g. batch corpus runs) should use; also the "
+ "only mode the analysis nogood cache (parse-optimization.md Phase 2) currently covers",
value => sequential = value != null
},
{
"lexical-gate",
"enable Morpher.EnableLexicalGating (parse-optimization.md Phase 5) -- default off, "
+ "highest-risk optimization; use for A/B corpus verification against a run without "
+ "this flag",
value => lexicalGate = value != null
},
{ "h|help", "show this help message and exit", value => showHelp = value != null },
};

Expand Down Expand Up @@ -66,7 +82,8 @@ public static int Main(string[] args)

context = new HCContext(language, output ?? Console.Out);
Console.Write("Compiling rules... ");
context.Compile();
context.Compile(sequential);
context.Morpher.EnableLexicalGating = lexicalGate;
Console.WriteLine("done.");
Console.WriteLine("{0} loaded.", language.Name);
Console.WriteLine();
Expand All @@ -92,6 +109,7 @@ public static int Main(string[] args)
new TracingCommand(context),
new TestCommand(context),
new StatsCommand(context),
new BatchCommand(context),
};

string input;
Expand Down
56 changes: 56 additions & 0 deletions src/SIL.Machine.Morphology.HermitCrab.Tool/RuleStatsReport.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
using System.Diagnostics;
using System.IO;
using System.Linq;
using SIL.Machine.Rules;

namespace SIL.Machine.Morphology.HermitCrab;

/// <summary>
/// Formats the InstrumentedRule tree (see Morpher.AccumulateRuleStats) as a flat, grep-able text report:
/// one line per rule with its totals, followed by its bucket breakdowns sorted so the rarest (most
/// suspicious) buckets are easy to spot against the common case -- that's the "300 times vs 4 times, are
/// the 4 wrong?" comparison this whole feature exists for.
/// </summary>
internal static class RuleStatsReport
{
/// <summary>
/// Writes one labelled section of the report: a header line, then either the rule tree under
/// <paramref name="root"/> followed by a blank line, or a placeholder line when there is no tree.
/// </summary>
/// <param name="writer">Destination for the report text.</param>
/// <param name="label">Section name shown in the header.</param>
/// <param name="root">Root of the instrumented rule tree; may be null.</param>
public static void Write(TextWriter writer, string label, InstrumentedRule<Word, int> root)
{
    writer.WriteLine($"==== {label} ====");

    if (root != null)
    {
        WriteRule(writer, root, "");
        writer.WriteLine();
    }
    else
    {
        writer.WriteLine("(no rule tree)");
    }
}

/// <summary>
/// Writes the report lines for one rule and then recurses into its sub-rules.
/// </summary>
/// <param name="writer">Destination for the report text.</param>
/// <param name="rule">The rule to report; a null rule is ignored.</param>
/// <param name="path">The " > "-separated path of the enclosing rules, or empty for the root.</param>
private static void WriteRule(TextWriter writer, InstrumentedRule<Word, int> rule, string path)
{
    if (rule == null)
        return;

    // Use the "?" placeholder for an unnamed rule at every depth, not only at the root, so a nested
    // unnamed rule does not show up as a dangling "parent > " with an empty last segment.
    string name = rule.Name ?? "?";
    string fullPath = string.IsNullOrEmpty(path) ? name : $"{path} > {name}";

    // Rules with no inputs and no buckets produce no line of their own, but their sub-rules are
    // still visited below.
    if (rule.InputCount > 0 || rule.BucketGroups.Count > 0)
    {
        // ElapsedTime is converted from Stopwatch ticks to milliseconds.
        double elapsedMs = rule.ElapsedTime * 1000.0 / Stopwatch.Frequency;
        writer.WriteLine(
            $"{fullPath}\tinputs={rule.InputCount}\tsuccesses={rule.SuccessCount}\toutputs={rule.OutputCount}\telapsedMs={elapsedMs:F0}"
        );

        foreach (var group in rule.BucketGroups.OrderBy(g => g.Key))
        {
            writer.WriteLine($"  [{group.Key}]");
            // Most frequent bucket first, so the low-count buckets end up at the bottom of the group.
            foreach (var bucket in group.Value.OrderByDescending(b => b.Value.Count))
            {
                string examples = string.Join(" | ", bucket.Value.Examples);
                writer.WriteLine($"    {bucket.Key}: {bucket.Value.Count}\te.g. {examples}");
            }
        }
    }

    foreach (var sub in rule.SubRules)
        WriteRule(writer, sub, fullPath);
}
}
Loading
Loading