diff --git a/FieldWorks.sln b/FieldWorks.sln index aae0cc092f..1b2e68a769 100644 --- a/FieldWorks.sln +++ b/FieldWorks.sln @@ -144,6 +144,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ParserCore", "Src\LexText\P EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ParserCoreTests", "Src\LexText\ParserCore\ParserCoreTests\ParserCoreTests.csproj", "{E5F82767-7DC7-599F-BC29-AAFE4AC98060}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "HCWorker", "Src\LexText\HCWorker\HCWorker.csproj", "{27CA033B-B6E6-41E0-A5C9-33DA5BB7F61D}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "HCWorkerTests", "Src\LexText\HCWorker\HCWorkerTests\HCWorkerTests.csproj", "{9CF72C1E-F5E8-463C-B53D-0F39979742F9}" +EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ParserUI", "Src\LexText\ParserUI\ParserUI.csproj", "{09D7C8FE-DD9B-5C1C-9A4D-9D61B26E878E}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ParserUITests", "Src\LexText\ParserUI\ParserUITests\ParserUITests.csproj", "{2310A14E-5FFA-5939-885C-DA681EAFC168}" @@ -709,6 +713,18 @@ Global {1DD0C70B-EA9D-593E-BF23-72FEAB6849DF}.Debug|x64.Build.0 = Debug|x64 {1DD0C70B-EA9D-593E-BF23-72FEAB6849DF}.Release|x64.ActiveCfg = Release|x64 {1DD0C70B-EA9D-593E-BF23-72FEAB6849DF}.Release|x64.Build.0 = Release|x64 + {27CA033B-B6E6-41E0-A5C9-33DA5BB7F61D}.Bounds|x64.ActiveCfg = Release|x64 + {27CA033B-B6E6-41E0-A5C9-33DA5BB7F61D}.Bounds|x64.Build.0 = Release|x64 + {27CA033B-B6E6-41E0-A5C9-33DA5BB7F61D}.Debug|x64.ActiveCfg = Debug|x64 + {27CA033B-B6E6-41E0-A5C9-33DA5BB7F61D}.Debug|x64.Build.0 = Debug|x64 + {27CA033B-B6E6-41E0-A5C9-33DA5BB7F61D}.Release|x64.ActiveCfg = Release|x64 + {27CA033B-B6E6-41E0-A5C9-33DA5BB7F61D}.Release|x64.Build.0 = Release|x64 + {9CF72C1E-F5E8-463C-B53D-0F39979742F9}.Bounds|x64.ActiveCfg = Release|x64 + {9CF72C1E-F5E8-463C-B53D-0F39979742F9}.Bounds|x64.Build.0 = Release|x64 + {9CF72C1E-F5E8-463C-B53D-0F39979742F9}.Debug|x64.ActiveCfg = 
Debug|x64 + {9CF72C1E-F5E8-463C-B53D-0F39979742F9}.Debug|x64.Build.0 = Debug|x64 + {9CF72C1E-F5E8-463C-B53D-0F39979742F9}.Release|x64.ActiveCfg = Release|x64 + {9CF72C1E-F5E8-463C-B53D-0F39979742F9}.Release|x64.Build.0 = Release|x64 {E5F82767-7DC7-599F-BC29-AAFE4AC98060}.Bounds|x64.ActiveCfg = Release|x64 {E5F82767-7DC7-599F-BC29-AAFE4AC98060}.Bounds|x64.Build.0 = Release|x64 {E5F82767-7DC7-599F-BC29-AAFE4AC98060}.Debug|x64.ActiveCfg = Debug|x64 diff --git a/Src/LexText/HCWorker/App.config b/Src/LexText/HCWorker/App.config new file mode 100644 index 0000000000..c25b035574 --- /dev/null +++ b/Src/LexText/HCWorker/App.config @@ -0,0 +1,15 @@ + + + + + + + + + + + diff --git a/Src/LexText/HCWorker/HCWorker.csproj b/Src/LexText/HCWorker/HCWorker.csproj new file mode 100644 index 0000000000..dd26f5827c --- /dev/null +++ b/Src/LexText/HCWorker/HCWorker.csproj @@ -0,0 +1,54 @@ + + + + HCWorker + SIL.FieldWorks.WordWorks.Parser.HCWorker + net48 + Exe + win-x64 168,169,219,414,649,1635,1702,1701 + false + false + + App.config + + + true + portable + false + DEBUG;TRACE + + + portable + true + TRACE + + + + + + + + + + + + + + + + + + + + + Properties\CommonAssemblyInfo.cs + + + diff --git a/Src/LexText/HCWorker/HCWorkerService.cs b/Src/LexText/HCWorker/HCWorkerService.cs new file mode 100644 index 0000000000..a404149961 --- /dev/null +++ b/Src/LexText/HCWorker/HCWorkerService.cs @@ -0,0 +1,165 @@ +// Copyright (c) 2026 SIL International +// This software is licensed under the LGPL, version 2.1 or later +// (http://www.gnu.org/licenses/lgpl-2.1.html) + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.ServiceModel; +using System.Threading.Tasks; +using SIL.FieldWorks.WordWorks.Parser; +using SIL.Machine.Annotations; +using SIL.Machine.Morphology.HermitCrab; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.FieldWorks.WordWorks.Parser.HCWorker +{ + /// + /// 
Hosts one Morpher for the lifetime of the worker process. One instance is shared across all + /// WCF calls (InstanceContextMode.Single) and calls run concurrently (ConcurrencyMode.Multiple) + /// - Morpher.ParseWord was already called this way in-process (ParserWorker's parallel batch, + /// each iteration calling HCParser.ParseWord -> Morpher.ParseWord with no external locking), so + /// moving it out-of-process introduces no new thread-safety requirement. + /// + /// The DTO extraction below (ToWordAnalysisDto) is the id-collection half of HCParser.GetMorphs, + /// running here where the Word/Allomorph/Morpheme graph lives; the LCM-object-resolution half + /// stays in HCParser.GetMorphs, which consumes the returned MorphDto[]. The Form/Msa/InflType + /// keys come from HCParser's own constants, so worker and client can never disagree on them. + /// + [ServiceBehavior(InstanceContextMode = InstanceContextMode.Single, ConcurrencyMode = ConcurrencyMode.Multiple)] + public class HCWorkerService : IHCWorkerService + { + private volatile Morpher _morpher; + + public void UpdateGrammar(HCGrammarDto grammar) + { + if (grammar == null) + throw new ArgumentNullException(nameof(grammar)); + + // XmlLanguageLoader.Load only takes a file path, so round-trip the grammar XML through a + // temp file rather than adding a string/stream overload to the HC library. 
+ string tempPath = Path.Combine(Path.GetTempPath(), $"hcworker-grammar-{Guid.NewGuid():N}.xml"); + try + { + File.WriteAllText(tempPath, grammar.CompiledGrammarXml); + Language language = XmlLanguageLoader.Load(tempPath); + _morpher = new Morpher(new TraceManager(), language) + { + DeletionReapplications = grammar.DeletionReapplications, + MaxStemCount = grammar.MaxStemCount, + MergeEquivalentAnalyses = grammar.MergeEquivalentAnalyses + }; + } + finally + { + try + { + File.Delete(tempPath); + } + catch (Exception e) when (e is IOException || e is UnauthorizedAccessException) + { + // best-effort cleanup (file locked or access denied); a stray temp file is not worth failing the grammar update over + } + } + } + + public WordAnalysisDto[] ParseWord(string word, bool guessRoots) + { + Morpher morpher = RequireMorpher(); + return morpher.ParseWord(word, out _, guessRoots).Select(ToWordAnalysisDto).ToArray(); + } + + public IDictionary ParseWordsBatch(string[] words, bool guessRoots) + { + Morpher morpher = RequireMorpher(); + var results = new ConcurrentDictionary(); + // Parses the whole batch server-side, with the DOP bounded only by the processor count: the cap in + // ParserWorker existed only to keep FieldWorks' UI thread responsive under Workstation + // GC, which no longer applies once parsing lives in this Server-GC process. + Parallel.ForEach( + words, + new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount }, + word => + { + try + { + results[word] = morpher.ParseWord(word, out _, guessRoots).Select(ToWordAnalysisDto).ToArray(); + } + catch (Exception) + { + // Guard each word so one unexpected exception (e.g. an out-of-vocabulary + // character, which throws InvalidShapeException) cannot abort the whole + // batch, mirroring ParserWorker.ParseAndUpdateWordformGuarded and + // HCParser.ParseWord's own try/catch around the equivalent call.
+ results[word] = new WordAnalysisDto[0]; + } + } + ); + // Return a plain Dictionary: DataContractSerializer's IDictionary support is defined in + // terms of the concrete Dictionary shape, so don't rely on ConcurrentDictionary matching it. + return new Dictionary(results); + } + + private Morpher RequireMorpher() + { + Morpher morpher = _morpher; + if (morpher == null) + throw new InvalidOperationException("UpdateGrammar must be called before parsing."); + return morpher; + } + + internal static WordAnalysisDto ToWordAnalysisDto(Word ws) + { + var morphemeIndices = new Dictionary(); + var morphs = new List(); + foreach (Annotation morph in ws.Morphs) + { + Allomorph allomorph = ws.GetAllomorph(morph); + int formId = ParseNullableIntProperty(allomorph.Properties, HCParser.FormID) ?? 0; + if (formId == 0) + continue; + + if (!morphemeIndices.TryGetValue(allomorph.Morpheme, out int morphemeIndex)) + { + morphemeIndex = morphemeIndices.Count; + morphemeIndices[allomorph.Morpheme] = morphemeIndex; + } + + string formStr = ws.Shape.GetNodes(morph.Range).ToString(ws.Stratum.CharacterDefinitionTable, false); + morphs.Add( + new MorphDto + { + FormId = formId, + FormId2 = ParseNullableIntProperty(allomorph.Properties, HCParser.FormID2) ?? 0, + IsAffixProcessAllomorph = allomorph is AffixProcessAllomorph, + FormStr = formStr, + Guessed = allomorph.Guessed, + MsaId = ParseIntProperty(allomorph.Morpheme.Properties, HCParser.MsaID), + InflTypeId = ParseNullableIntProperty(allomorph.Morpheme.Properties, HCParser.InflTypeID) ?? 0, + MorphemeIndex = morphemeIndex + } + ); + } + return new WordAnalysisDto { Morphs = morphs.ToArray() }; + } + + private static int ParseIntProperty(IDictionary properties, string key) + { + // Properties round-trip through XmlLanguageWriter/XmlLanguageLoader as strings even + // though HCLoader stored them as ints (hcEntry.Properties[HCParser.MsaID] = msa.Hvo), + // so parse rather than unbox. 
+ if (!properties.TryGetValue(key, out object value) || value == null) + throw new InvalidOperationException($"Morpheme is missing required property '{key}'."); + return int.Parse(Convert.ToString(value, System.Globalization.CultureInfo.InvariantCulture), System.Globalization.CultureInfo.InvariantCulture); + } + + private static int? ParseNullableIntProperty(IDictionary properties, string key) + { + if (!properties.TryGetValue(key, out object value) || value == null) + return null; + return int.Parse(Convert.ToString(value, System.Globalization.CultureInfo.InvariantCulture), System.Globalization.CultureInfo.InvariantCulture); + } + } +} diff --git a/Src/LexText/HCWorker/HCWorkerTests/HCWorkerServiceTests.cs b/Src/LexText/HCWorker/HCWorkerTests/HCWorkerServiceTests.cs new file mode 100644 index 0000000000..aec381547a --- /dev/null +++ b/Src/LexText/HCWorker/HCWorkerTests/HCWorkerServiceTests.cs @@ -0,0 +1,280 @@ +// Copyright (c) 2026 SIL International +// This software is licensed under the LGPL, version 2.1 or later +// (http://www.gnu.org/licenses/lgpl-2.1.html) + +using System.IO; +using System.Linq; +using System.ServiceModel; +using NUnit.Framework; +using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.FieldWorks.WordWorks.Parser.HCWorker +{ + /// + /// Validates the out-of-process worker along its two risk axes: + /// 1. Grammar-transfer fidelity: XmlLanguageWriter.Save -> XmlLanguageLoader.Load preserves the + /// ad hoc Properties[HCParser.FormID]/[MsaID]/[InflTypeID] tags HCLoader writes and + /// HCParser.GetMorphs needs, and the DTO carries them back across the process boundary - + /// plus a real net.pipe ServiceHost/ChannelFactory round trip (catches DataContract mistakes + /// an in-proc call would miss). Uses a single-morph grammar built from scratch with + /// NCName-safe symbol ids (XmlLanguageWriter emits ids as XML IDs). + /// 2.
DTO-extraction correctness (HCWorkerService.ToWordAnalysisDto) on a real multi-morph Word, + /// including the MorphemeIndex grouping that stands in for GetMorphs' Dictionary<Morpheme, + /// MorphInfo> reference-identity lookup. + /// + [TestFixture] + public class HCWorkerServiceTests + { + private const int RootMsaId = 42; + private const int RootFormId = 555; + + private Language _language; + private string _grammarXml; + + [SetUp] + public void SetUp() + { + // Symbol/feature ids double as XML IDs (NCName) once written via XmlLanguageWriter, so + // they must be NCName-safe - matching how HCLoader generates ids in production (e.g. + // "pos" + msa.Hvo, never a bare symbol character like "+"). + var phoneticFeatSys = new FeatureSystem + { + new SymbolicFeature("cons", new FeatureSymbol("consPos", "+"), new FeatureSymbol("consNeg", "-")), + new SymbolicFeature("voc", new FeatureSymbol("vocPos", "+"), new FeatureSymbol("vocNeg", "-")), + new SymbolicFeature("place", new FeatureSymbol("alveolar"), new FeatureSymbol("velar")) + }; + phoneticFeatSys.Freeze(); + + // "s" and "g" are both consonants; a distinguishing "place" feature keeps + // GetMatchingStrReps from returning whichever was added first for both. 
+ var table = new CharacterDefinitionTable { Name = "table" }; + AddSeg(table, phoneticFeatSys, "s", "consPos", "vocNeg", "alveolar"); + AddSeg(table, phoneticFeatSys, "a", "consNeg", "vocPos"); + AddSeg(table, phoneticFeatSys, "g", "consPos", "vocNeg", "velar"); + + var syntacticFeatSys = new SyntacticFeatureSystem(); + syntacticFeatSys.AddPartsOfSpeech(new FeatureSymbol("V", "Verb")); + syntacticFeatSys.Freeze(); + + var stratum = new Stratum(table) { Name = "Test", MorphologicalRuleOrder = MorphologicalRuleOrder.Unordered }; + + var root = new LexEntry + { + Id = "root", + Gloss = "sag", + SyntacticFeatureStruct = FeatureStruct.New(syntacticFeatSys).Symbol("V").Value + }; + root.Allomorphs.Add(new RootAllomorph(new Segments(table, "sag", true))); + // Simulate HCLoader tagging the entry's morpheme/allomorph with the ids HCParser.GetMorphs + // resolves back to live LCM objects. + root.Properties[HCParser.MsaID] = RootMsaId; + root.PrimaryAllomorph.Properties[HCParser.FormID] = RootFormId; + stratum.Entries.Add(root); + + _language = new Language + { + Name = "WorkerTest", + PhonologicalFeatureSystem = phoneticFeatSys, + SyntacticFeatureSystem = syntacticFeatSys, + Strata = { stratum } + }; + _language.CharacterDefinitionTables.Add(table); + + string tempPath = Path.Combine(Path.GetTempPath(), $"hcworker-test-{TestContext.CurrentContext.Test.ID}.xml"); + XmlLanguageWriter.Save(_language, tempPath); + _grammarXml = File.ReadAllText(tempPath); + File.Delete(tempPath); + } + + private static void AddSeg( + CharacterDefinitionTable table, + FeatureSystem phoneticFeatSys, + string strRep, + params string[] symbols + ) + { + var fs = new FeatureStruct(); + foreach (string symbolId in symbols) + { + FeatureSymbol symbol = phoneticFeatSys.GetSymbol(symbolId); + fs.AddValue(symbol.Feature, new SymbolicFeatureValue(symbol)); + } + table.AddSegment(strRep, fs); + } + + private HCGrammarDto MakeGrammarDto() => + new HCGrammarDto + { + CompiledGrammarXml = _grammarXml, + 
DeletionReapplications = 0, + MaxStemCount = 2, + MergeEquivalentAnalyses = false + }; + + [Test] + public void ParseWord_AfterGrammarRoundTrip_CarriesFieldWorksIds() + { + var service = new HCWorkerService(); + service.UpdateGrammar(MakeGrammarDto()); + WordAnalysisDto[] actual = service.ParseWord("sag", false); + + Assert.That(actual, Has.Length.EqualTo(1)); + Assert.That(actual[0].Morphs, Has.Length.EqualTo(1)); + + MorphDto root = actual[0].Morphs[0]; + Assert.That(root.FormId, Is.EqualTo(RootFormId)); + Assert.That(root.MsaId, Is.EqualTo(RootMsaId)); + Assert.That(root.FormStr, Is.EqualTo("sag")); + Assert.That(root.IsAffixProcessAllomorph, Is.False); + } + + [Test] + public void ParseWordsBatch_ReturnsOneEntryPerWord() + { + var service = new HCWorkerService(); + service.UpdateGrammar(MakeGrammarDto()); + + var result = service.ParseWordsBatch(new[] { "sag", "nonword" }, false); + + Assert.That(result.Keys, Is.EquivalentTo(new[] { "sag", "nonword" })); + Assert.That(result["sag"], Has.Length.EqualTo(1)); + Assert.That(result["nonword"], Is.Empty); + } + + [Test] + public void ParseWord_BeforeUpdateGrammar_Throws() + { + var service = new HCWorkerService(); + Assert.Throws(() => service.ParseWord("sag", false)); + } + + [Test] + public void OverWcfNamedPipe_RoundTripsCorrectly() + { + string pipeName = "hcworker-test-" + TestContext.CurrentContext.Test.ID; + NetNamedPipeBinding pipeBinding = PipeBindingFactory.Create(); + + using (var host = new ServiceHost(new HCWorkerService())) + { + host.AddServiceEndpoint(typeof(IHCWorkerService), pipeBinding, "net.pipe://localhost/" + pipeName); + host.Open(); + try + { + var factory = new ChannelFactory( + pipeBinding, + new EndpointAddress("net.pipe://localhost/" + pipeName) + ); + IHCWorkerService client = factory.CreateChannel(); + + client.UpdateGrammar(MakeGrammarDto()); + WordAnalysisDto[] result = client.ParseWord("sag", false); + + Assert.That(result, Has.Length.EqualTo(1)); + Assert.That(result[0].Morphs, 
Has.Length.EqualTo(1)); + Assert.That(result[0].Morphs[0].FormId, Is.EqualTo(RootFormId)); + + var batch = client.ParseWordsBatch(new[] { "sag" }, false); + Assert.That(batch["sag"], Has.Length.EqualTo(1)); + + ((IClientChannel)client).Close(); + factory.Close(); + } + finally + { + host.Close(); + } + } + } + + /// + /// Exercises ToWordAnalysisDto directly on a real multi-morph Word (root + suffix rule) - no + /// XML round trip, so the Pattern-serialization limitation of the round-trip path doesn't + /// apply. This is the scenario GetMorphs' second-occurrence dictionary lookup exists for; + /// MorphemeIndex is the DTO's wire-safe replacement for it. + /// + [Test] + public void ToWordAnalysisDto_MultiMorphWord_GroupsByMorphemeAndFlagsAffixProcessAllomorph() + { + var phonologicalFeatSys = new FeatureSystem + { + new SymbolicFeature("cons", new FeatureSymbol("cons+", "+"), new FeatureSymbol("cons-", "-")), + new SymbolicFeature("voc", new FeatureSymbol("voc+", "+"), new FeatureSymbol("voc-", "-")) + }; + phonologicalFeatSys.Freeze(); + var syntacticFeatSys = new SyntacticFeatureSystem(); + syntacticFeatSys.AddPartsOfSpeech(new FeatureSymbol("V", "Verb")); + syntacticFeatSys.Freeze(); + + var table = new CharacterDefinitionTable { Name = "table" }; + AddSeg(table, phonologicalFeatSys, "s", "cons+", "voc-"); + AddSeg(table, phonologicalFeatSys, "a", "cons-", "voc+"); + AddSeg(table, phonologicalFeatSys, "g", "cons+", "voc-"); + AddSeg(table, phonologicalFeatSys, "d", "cons+", "voc-"); + table.AddBoundary("+"); + + var stratum = new Stratum(table) { Name = "Test", MorphologicalRuleOrder = MorphologicalRuleOrder.Unordered }; + + var root = new LexEntry + { + Id = "root", + Gloss = "sag", + SyntacticFeatureStruct = FeatureStruct.New(syntacticFeatSys).Symbol("V").Value + }; + root.Allomorphs.Add(new RootAllomorph(new Segments(table, "sag", true))); + root.Properties[HCParser.MsaID] = RootMsaId; + root.PrimaryAllomorph.Properties[HCParser.FormID] = RootFormId; + 
stratum.Entries.Add(root); + + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var pastSuffix = new AffixProcessRule + { + Id = "PAST", + Name = "ed_suffix", + Gloss = "PAST", + RequiredSyntacticFeatureStruct = FeatureStruct.New(syntacticFeatSys).Symbol("V").Value + }; + var suffixAllomorph = new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(table, "+d") } + }; + const int suffixFormId = 777; + const int suffixMsaId = 99; + suffixAllomorph.Properties[HCParser.FormID] = suffixFormId; + pastSuffix.Properties[HCParser.MsaID] = suffixMsaId; + pastSuffix.Allomorphs.Add(suffixAllomorph); + stratum.MorphologicalRules.Add(pastSuffix); + + var language = new Language + { + Name = "WorkerTest", + PhonologicalFeatureSystem = phonologicalFeatSys, + SyntacticFeatureSystem = syntacticFeatSys, + Strata = { stratum } + }; + + var morpher = new Morpher(new TraceManager(), language); + Word word = morpher.ParseWord("sagd", out _, false).Single(); + + WordAnalysisDto dto = HCWorkerService.ToWordAnalysisDto(word); + + Assert.That(dto.Morphs, Has.Length.EqualTo(2)); + MorphDto rootMorph = dto.Morphs[0]; + MorphDto suffixMorph = dto.Morphs[1]; + + Assert.That(rootMorph.FormId, Is.EqualTo(RootFormId)); + Assert.That(rootMorph.MsaId, Is.EqualTo(RootMsaId)); + Assert.That(rootMorph.IsAffixProcessAllomorph, Is.False); + + Assert.That(suffixMorph.FormId, Is.EqualTo(suffixFormId)); + Assert.That(suffixMorph.MsaId, Is.EqualTo(suffixMsaId)); + Assert.That(suffixMorph.IsAffixProcessAllomorph, Is.True); + + Assert.That(suffixMorph.MorphemeIndex, Is.Not.EqualTo(rootMorph.MorphemeIndex)); + } + } +} diff --git a/Src/LexText/HCWorker/HCWorkerTests/HCWorkerTests.csproj b/Src/LexText/HCWorker/HCWorkerTests/HCWorkerTests.csproj new file mode 100644 index 0000000000..edbd52ba56 --- /dev/null +++ b/Src/LexText/HCWorker/HCWorkerTests/HCWorkerTests.csproj @@ -0,0 +1,56 @@ + + + + 
HCWorkerTests + SIL.FieldWorks.WordWorks.Parser.HCWorker + net48 + Library + true + + true 168,169,219,414,649,1635,1702,1701 + false + false + + + DEBUG;TRACE + true + false + portable + + + TRACE + true + true + portable + + + + + + + + + + + + + + + + + + + + + + + + + Properties\CommonAssemblyInfo.cs + + + diff --git a/Src/LexText/HCWorker/Program.cs b/Src/LexText/HCWorker/Program.cs new file mode 100644 index 0000000000..cb2d2bd3c2 --- /dev/null +++ b/Src/LexText/HCWorker/Program.cs @@ -0,0 +1,91 @@ +// Copyright (c) 2026 SIL International +// This software is licensed under the LGPL, version 2.1 or later +// (http://www.gnu.org/licenses/lgpl-2.1.html) + +using System; +using System.Diagnostics; +using System.ServiceModel; +using System.Threading; +using SIL.FieldWorks.WordWorks.Parser; + +namespace SIL.FieldWorks.WordWorks.Parser.HCWorker +{ + /// + /// Entry point for the out-of-process Server-GC HermitCrab worker. Spawned lazily by + /// FieldWorks' HCWorkerProcessManager (Src\LexText\ParserCore) as: + /// HCWorker.exe <pipeName> <parentProcessId> + /// The WCF contract (IHCWorkerService + DTOs), the pipe binding, and the Form/Msa/InflType key + /// constants all live in ParserCore, so this host and the FieldWorks-side client share one + /// definition. See RUSTIFY-fieldworks-worker-design.md. + /// + public static class Program + { + public static int Main(string[] args) + { + if (args.Length < 2 || !int.TryParse(args[1], out int parentProcessId)) + { + Console.Error.WriteLine("Usage: HCWorker.exe "); + return 1; + } + string pipeName = args[0]; + + // Safety net mirroring FLExBridgeHelper.cs's process watchdog: if FieldWorks dies + // (crash, kill, normal exit without an explicit shutdown of us) this ensures the + // worker - and its Server-GC memory footprint - does not outlive it. 
+ StartParentWatchdog(parentProcessId); + + NetNamedPipeBinding pipeBinding = PipeBindingFactory.Create(); + + using (var host = new ServiceHost(new HCWorkerService())) + { + host.AddServiceEndpoint(typeof(IHCWorkerService), pipeBinding, "net.pipe://localhost/" + pipeName); + host.Open(); + + // Readiness = the pipe is open; FieldWorks calls UpdateGrammar immediately after + // spawn and every parse call fails with a clear error until that lands + // (HCWorkerService.RequireMorpher), so there is no separate ready handshake. + Console.Out.WriteLine("READY"); + Console.Out.Flush(); + + // Block forever; the process exits via the parent watchdog above or being killed + // directly by FieldWorks (design §4 "Shutdown"). + Thread.Sleep(Timeout.Infinite); + } + return 0; + } + + private static void StartParentWatchdog(int parentProcessId) + { + Process parent; + try + { + parent = Process.GetProcessById(parentProcessId); + } + catch (ArgumentException) + { + // Parent already gone before we even started - exit immediately rather than + // leaking a Server-GC process with nothing to serve. + Environment.Exit(0); + return; + } + + var watchdog = new Thread(() => + { + try + { + parent.WaitForExit(); + } + catch (Exception) + { + // Handle may already be invalid; either way, treat it as "parent is gone." 
+ } + Environment.Exit(0); + }) + { + IsBackground = true, + Name = "HCWorker parent-process watchdog" + }; + watchdog.Start(); + } + } +} diff --git a/Src/LexText/HCWorker/Properties/AssemblyInfo.cs b/Src/LexText/HCWorker/Properties/AssemblyInfo.cs new file mode 100644 index 0000000000..37b81037c1 --- /dev/null +++ b/Src/LexText/HCWorker/Properties/AssemblyInfo.cs @@ -0,0 +1,7 @@ +// Copyright (c) 2026 SIL International +// This software is licensed under the LGPL, version 2.1 or later +// (http://www.gnu.org/licenses/lgpl-2.1.html) + +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("HCWorkerTests")] diff --git a/Src/LexText/ParserCore/HCParser.cs b/Src/LexText/ParserCore/HCParser.cs index 37015bb53b..774f3ab958 100644 --- a/Src/LexText/ParserCore/HCParser.cs +++ b/Src/LexText/ParserCore/HCParser.cs @@ -4,10 +4,12 @@ using System; using System.Collections.Generic; +using System.Diagnostics; using System.Globalization; using System.IO; using System.Linq; using System.Text; +using System.Threading; using System.Xml; using System.Xml.Linq; using SIL.LCModel; @@ -22,7 +24,20 @@ namespace SIL.FieldWorks.WordWorks.Parser public class HCParser : DisposableBase, IParser { private readonly LcmCache m_cache; - private Morpher m_morpher; + // Out-of-process Server-GC worker proxy (RUSTIFY-fieldworks-worker-design.md) that now + // does the bulk/interactive parsing an in-process m_morpher used to do directly. Tracing/ + // Try-a-Word (GetTraceMorpher/ParseToXml below) deliberately still runs in-process: its + // FwXmlTraceManager touches LCM inline while tracing, which the worker has no access to - + // see the design's §8 incremental-rollout note (bulk path first, interactive path already + // ported here since it needed no such LCM-touching trace manager; Try-a-Word is left for a + // follow-up). + private readonly HCWorkerClient m_workerClient; + // A dedicated morpher used only by the trace/Try-A-Word path. 
Tracing mutates the + // morpher's LexEntrySelector, RuleSelector, and TraceManager.IsTracing; keeping a + // separate morpher (with its own trace manager) ensures those mutations never affect + // bulk parsing, which now runs out-of-process through m_workerClient. + private Morpher m_traceMorpher; + private FwXmlTraceManager m_traceMorpherTraceManager; private Language m_language; private readonly FwXmlTraceManager m_traceManager; private readonly string m_outputDirectory; @@ -30,11 +45,23 @@ public class HCParser : DisposableBase, IParser private bool m_forceUpdate; private bool m_guessRoots; private bool m_mergeAnalyses; + private int m_delReapps; + private int m_maxStemCount; + + // Diagnostic perf counters (accumulated across all threads) splitting ParseWord time into the + // worker round trip (the morpher parse, outside the LCM read lock) vs. the LCM-read mapping (GetMorphs + // under the read lock). Near-zero overhead; used by the parser concurrency benchmark. + public static long DiagMorpherParseTicks; + public static long DiagGetMorphsTicks; // the public const strings are for GenerateHCConfigForFLExTrans and HCSynthByGlossLib internal const string CRuleID = "ID"; - internal const string FormID = "ID"; - internal const string FormID2 = "ID2"; + // FormID/FormID2 are public so the out-of-process HCWorker (Src\LexText\HCWorker) can key + // the same Allomorph.Properties bag when it projects a parsed Word down to MorphDto[] - + // keeping the worker's id extraction and this class's GetMorphs consumption on one set of + // key strings.
+ public const string FormID = "ID"; + public const string FormID2 = "ID2"; public const string InflTypeID = "InflTypeID"; public const string MsaID = "ID"; internal const string PRuleID = "ID"; @@ -50,6 +77,7 @@ public class HCParser : DisposableBase, IParser public HCParser(LcmCache cache) { m_cache = cache; + m_workerClient = new HCWorkerClient(); m_traceManager = new FwXmlTraceManager(m_cache); m_outputDirectory = Path.GetTempPath(); m_changeListener = new ParserModelChangeListener(m_cache); @@ -85,13 +113,21 @@ public ParseResult ParseWord(string word) { CheckDisposed(); - if (m_morpher == null) + if (m_language == null) return null; - IEnumerable wordAnalyses; + WordAnalysisDto[] wordAnalyses; try { - wordAnalyses = m_morpher.ParseWord(word, out _, m_guessRoots); + var morpherSw = Stopwatch.StartNew(); + // Round-trips to the worker process, OUTSIDE the LCM read lock below. The + // worker's parse is the expensive, CPU-bound part and touches only its own copy + // of the frozen HC grammar (not LCM), so keeping it off the read lock lets it run + // without holding the read lock for its whole duration - same reasoning as the + // in-process call this replaces, just across a process boundary now. 
+ wordAnalyses = m_workerClient.ParseWord(word, m_guessRoots); + morpherSw.Stop(); + Interlocked.Add(ref DiagMorpherParseTicks, morpherSw.ElapsedTicks); } catch (Exception e) { @@ -99,10 +135,11 @@ public ParseResult ParseWord(string word) } ParseResult result; + var getMorphsSw = Stopwatch.StartNew(); using (new WorkerThreadReadHelper(m_cache.ServiceLocator.GetInstance())) { var analyses = new List(); - foreach (Word wordAnalysis in wordAnalyses) + foreach (WordAnalysisDto wordAnalysis in wordAnalyses) { List morphs; if (GetMorphs(wordAnalysis, out morphs)) @@ -113,10 +150,57 @@ public ParseResult ParseWord(string word) } result = new ParseResult(analyses); } + getMorphsSw.Stop(); + Interlocked.Add(ref DiagGetMorphsTicks, getMorphsSw.ElapsedTicks); return result; } + /// + /// Bulk path (design §5): one WCF round trip for the whole batch of already-normalized + /// word forms, instead of ParserWorker's old per-wordform Parallel.ForEach each calling + /// the single-word ParseWord above. Returns null (rather than throwing) if the batch call + /// itself fails even after HCWorkerClient's own retry-once, so ParserWorker can fall back + /// to its per-wordform path for this run instead of losing it entirely (design §6). 
+ /// + public IDictionary ParseWordsBatch(string[] words) + { + CheckDisposed(); + + if (m_language == null || words.Length == 0) + return null; + + IDictionary wordAnalysesByWord; + try + { + wordAnalysesByWord = m_workerClient.ParseWordsBatch(words, m_guessRoots); + } + catch (Exception) + { + return null; + } + + var results = new Dictionary(); + using (new WorkerThreadReadHelper(m_cache.ServiceLocator.GetInstance())) + { + foreach (KeyValuePair kvp in wordAnalysesByWord) + { + var analyses = new List(); + foreach (WordAnalysisDto wordAnalysis in kvp.Value) + { + List morphs; + if (GetMorphs(wordAnalysis, out morphs)) + { + analyses.Add(new ParseAnalysis(morphs.Select(mi => + new ParseMorph(mi.Form, mi.Msa, mi.InflType, mi.GuessedString)))); + } + } + results[kvp.Key] = new ParseResult(analyses); + } + } + return results; + } + public XDocument TraceWordXml(string form, IEnumerable selectTraceMorphs) { CheckDisposed(); @@ -139,11 +223,15 @@ protected override void DisposeManagedResources() m_changeListener.Dispose(); m_changeListener = null; } + m_workerClient?.Dispose(); } private void LoadParser() { - m_morpher = null; + m_language = null; + // Force the trace morpher to be rebuilt over the freshly loaded language. + m_traceMorpher = null; + m_traceMorpherTraceManager = null; int delReapps = 0; // For Hermit Crab, the maximum number of roots/stems allowed is between one and ten. 
@@ -170,24 +258,58 @@ private void LoadParser() if (maxRootsElem != null) maxStemCount = int.Parse(maxRootsElem.Value); } - m_morpher = new Morpher(m_traceManager, m_language) { DeletionReapplications = delReapps }; - m_morpher.MaxStemCount = maxStemCount; - m_morpher.MergeEquivalentAnalyses = m_mergeAnalyses; + m_delReapps = delReapps; + m_maxStemCount = maxStemCount; + + // Ship the freshly loaded grammar to the worker (design §4/§5 "Grammar change"): the + // same HC.NET XML input format XmlLanguageLoader already reads, produced via + // XmlLanguageWriter.Save on the Language HCLoader.Load just built - no new + // serialization format, no changes to SIL.Machine.Morphology.HermitCrab itself. + string grammarFile = Path.Combine(m_outputDirectory, m_cache.ProjectId.Name + "HCGrammar.xml"); + XmlLanguageWriter.Save(m_language, grammarFile); + string grammarXml = File.ReadAllText(grammarFile); + File.Delete(grammarFile); + m_workerClient.UpdateGrammar(grammarXml, delReapps, maxStemCount, m_mergeAnalyses); + } + + /// + /// Lazily builds (and returns) the morpher used for tracing. It is built over the frozen, + /// read-only m_language and has its own mutable + /// state and its own trace manager, so enabling tracing or setting morpheme selectors + /// here cannot affect bulk parsing, which runs out-of-process through m_workerClient.
+ /// + private Morpher GetTraceMorpher() + { + if (m_traceMorpher == null) + { + m_traceMorpherTraceManager = new FwXmlTraceManager(m_cache); + m_traceMorpher = new Morpher(m_traceMorpherTraceManager, m_language) + { + DeletionReapplications = m_delReapps, + MaxStemCount = m_maxStemCount, + MergeEquivalentAnalyses = m_mergeAnalyses + }; + } + return m_traceMorpher; } private XDocument ParseToXml(string form, bool tracing, IEnumerable selectTraceMorphs) { - if (m_morpher == null) + if (m_language == null) return null; + // Use the dedicated in-process trace morpher: selectors / IsTracing set here stay local + // to it, while bulk parsing runs out-of-process through m_workerClient. + Morpher traceMorpher = GetTraceMorpher(); + var doc = new XDocument(); using (new WorkerThreadReadHelper(m_cache.ServiceLocator.GetInstance())) { if (selectTraceMorphs != null) { var selectTraceMorphsSet = new HashSet(selectTraceMorphs); - m_morpher.LexEntrySelector = entry => selectTraceMorphsSet.Contains((int) entry.Properties[MsaID]); - m_morpher.RuleSelector = rule => + traceMorpher.LexEntrySelector = entry => selectTraceMorphsSet.Contains((int) entry.Properties[MsaID]); + traceMorpher.RuleSelector = rule => { // Need to check if the rule is a morpheme and if it has a non-null msa id. // If the rule comes from an irregularly inflected form, msa id will be null.
@@ -200,15 +322,15 @@ private XDocument ParseToXml(string form, bool tracing, IEnumerable selectT
 				}
 				else
 				{
-					m_morpher.LexEntrySelector = entry => true;
-					m_morpher.RuleSelector = rule => true;
+					traceMorpher.LexEntrySelector = entry => true;
+					traceMorpher.RuleSelector = rule => true;
 				}
-				m_morpher.TraceManager.IsTracing = tracing;
+				traceMorpher.TraceManager.IsTracing = tracing;
 				var wordformElem = new XElement("Wordform", new XAttribute("form", form));
 				try
 				{
 					object trace;
-					foreach (Word wordAnalysis in m_morpher.ParseWord(form, out trace, m_guessRoots))
+					foreach (Word wordAnalysis in traceMorpher.ParseWord(form, out trace, m_guessRoots))
 					{
 						List morphs;
 						if (GetMorphs(wordAnalysis, out morphs))
@@ -438,6 +560,127 @@ private bool GetMorphs(Word ws, out List result)
 			return true;
 		}
 
+		/// <summary>
+		/// The LCM-object-resolution half of GetMorphs(Word,...) above, ported to run over the
+		/// worker's flat MorphDto[] instead of walking a live Word/Annotation&lt;ShapeNode&gt;/
+		/// Allomorph/Morpheme graph (the worker has no LcmCache, so it can't do these repository
+		/// lookups itself - see HCWorkerService.ToWordAnalysisDto and MorphDto's doc comment in
+		/// IHCWorkerService.cs). Every circumfix/infix-placement decision below is identical to
+		/// the Word-based version; only the source of FormId/FormId2/MsaId/InflTypeId/Guessed/
+		/// FormStr and the "have we seen this morpheme already" key (MorphemeIndex instead of a
+		/// Morpheme reference) changed.
+		/// </summary>
+		/// <param name="wordAnalysis">One analysis returned by the worker; its Morphs are
+		/// walked in array order.</param>
+		/// <param name="result">The resolved morphs in display order, or null when any
+		/// form/MSA/inflection-type id could not be resolved.</param>
+		/// <returns>true when every id resolved; false (with a null result) otherwise.</returns>
+		// NOTE(review): generic type arguments (List/Dictionary/GetInstance) look stripped from
+		// this copy of the patch - confirm against the real source before applying.
+		private bool GetMorphs(WordAnalysisDto wordAnalysis, out List result)
+		{
+			// Last MorphInfo produced per morpheme, keyed by MorphDto.MorphemeIndex.
+			var morphs = new Dictionary();
+
+			// Form ids of APR allomorphs already seen once whose morph type is circumfix.
+			var aprCircumfixes = new List();
+			bool isSuffixPortionOfAprCircumfix = false;
+
+			result = new List();
+			foreach (MorphDto morphDto in wordAnalysis.Morphs)
+			{
+				// The worker already skips morphs with no FormId (HCWorkerService.
+				// ToWordAnalysisDto mirrors this method's Word-based twin's `if (formID == 0)
+				// continue;`), so every entry reaching here has one.
+				int formID = morphDto.FormId;
+
+				isSuffixPortionOfAprCircumfix = false;
+				int formID2 = morphDto.FormId2;
+				if (formID2 == 0 && morphDto.IsAffixProcessAllomorph)
+				{
+					// Per the Leipzig glossing rules (https://www.eva.mpg.de/lingua/resources/glossing-rules.php),
+					// circumfixes should appear both before and after the material they attach to.
+					// HC does not have an overt marker for a circumfix when it is an affix processing rule (aka APR).
+					// The following code determines when an APR is marked as a circumfix in FLEx and ensures the
+					// two instances of it as a morph are included in the result at the correct places.
+					// This is a fix for https://jira.sil.org/browse/LT-21447
+					IMoForm circumForm;
+					if (!m_cache.ServiceLocator.GetInstance().TryGetObject(formID, out circumForm))
+					{
+						result = null;
+						return false;
+					}
+					if (circumForm.MorphTypeRA.Guid == MoMorphTypeTags.kguidMorphCircumfix)
+					{
+						if (aprCircumfixes.Contains(formID))
+						{
+							// Second sighting of the same form: this is the closing half.
+							isSuffixPortionOfAprCircumfix = true;
+						}
+						else
+						{
+							// Remember this allomorph as an APR that is a circumfix
+							aprCircumfixes.Add(formID);
+						}
+					}
+				}
+
+				int curFormID;
+				MorphInfo morphInfo;
+				// First occurrence of the morpheme (or the closing half of an APR circumfix)
+				// uses FormId; a repeat occurrence uses FormId2 when one is set; any other
+				// repeat is skipped.
+				if (!morphs.TryGetValue(morphDto.MorphemeIndex, out morphInfo) || isSuffixPortionOfAprCircumfix)
+				{
+					curFormID = formID;
+				}
+				else if (formID2 > 0)
+				{
+					// circumfix
+					curFormID = formID2;
+				}
+				else
+				{
+					continue;
+				}
+
+				IMoForm form;
+				if (!m_cache.ServiceLocator.GetInstance().TryGetObject(curFormID, out form))
+				{
+					result = null;
+					return false;
+				}
+
+				IMoMorphSynAnalysis msa;
+				if (!m_cache.ServiceLocator.GetInstance().TryGetObject(morphDto.MsaId, out msa))
+				{
+					result = null;
+					return false;
+				}
+
+				// InflTypeId is optional: it is only looked up when positive.
+				ILexEntryInflType inflType = null;
+				if (morphDto.InflTypeId > 0 && !m_cache.ServiceLocator.GetInstance().TryGetObject(morphDto.InflTypeId, out inflType))
+				{
+					result = null;
+					return false;
+				}
+
+				morphInfo = new MorphInfo
+				{
+					Form = form,
+					GuessedString = morphDto.Guessed ? morphDto.FormStr : null,
+					Msa = msa,
+					InflType = inflType,
+					IsCircumfix = formID2 > 0
+				};
+
+				morphs[morphDto.MorphemeIndex] = morphInfo;
+
+				switch ((form.MorphTypeRA == null ? Guid.Empty : form.MorphTypeRA.Guid).ToString())
+				{
+					case MoMorphTypeTags.kMorphInfix:
+					case MoMorphTypeTags.kMorphInfixingInterfix:
+						// Infixes go immediately before the most recently added morph
+						// (or first, when nothing has been added yet).
+						if (result.Count == 0)
+							result.Add(morphInfo);
+						else
+							result.Insert(result.Count - 1, morphInfo);
+						break;
+
+					default:
+						result.Add(morphInfo);
+						break;
+				}
+			}
+			return true;
+		}
+
 		private static string GetMorphTypeString(Guid typeGuid)
 		{
 			switch (typeGuid.ToString())
diff --git a/Src/LexText/ParserCore/HCWorkerClient.cs b/Src/LexText/ParserCore/HCWorkerClient.cs
new file mode 100644
index 0000000000..cb05fb2cbd
--- /dev/null
+++ b/Src/LexText/ParserCore/HCWorkerClient.cs
@@ -0,0 +1,142 @@
+// Copyright (c) 2026 SIL International
+// This software is licensed under the LGPL, version 2.1 or later
+// (http://www.gnu.org/licenses/lgpl-2.1.html)
+
+using System;
+using System.Collections.Generic;
+using System.ServiceModel;
+
+namespace SIL.FieldWorks.WordWorks.Parser
+{
+	/// <summary>
+	/// Thin WCF proxy to the out-of-process HermitCrab worker, replacing HCParser's direct
+	/// in-process Morpher calls (RUSTIFY-fieldworks-worker-design.md §2/§3). The net.pipe binding
+	/// comes from the shared PipeBindingFactory so this client and the worker's ServiceHost use one
+	/// definition.
+	///
+	/// Owns an HCWorkerProcessManager for spawn/respawn and remembers the last grammar sent so a
+	/// mid-call worker crash can be recovered from without the caller re-supplying it: on
+	/// CommunicationException/TimeoutException, respawn, replay UpdateGrammar, retry the failed
+	/// call once, then surface the error (design §6).
+	/// </summary>
+	public class HCWorkerClient : IDisposable
+	{
+		private readonly object m_channelLock = new object();
+		private readonly HCWorkerProcessManager m_processManager = new HCWorkerProcessManager();
+		private ChannelFactory<IHCWorkerService> m_factory;
+		private IHCWorkerService m_channel;
+		// Last grammar the worker acknowledged; replayed into a respawned worker.
+		private HCGrammarDto m_lastGrammar;
+
+		/// <summary>
+		/// Sends a compiled grammar and its morpher settings to the worker and, once the
+		/// worker has accepted it, remembers it for replay after a respawn.
+		/// </summary>
+		public void UpdateGrammar(string compiledGrammarXml, int deletionReapplications, int maxStemCount, bool mergeEquivalentAnalyses)
+		{
+			var grammar = new HCGrammarDto
+			{
+				CompiledGrammarXml = compiledGrammarXml,
+				DeletionReapplications = deletionReapplications,
+				MaxStemCount = maxStemCount,
+				MergeEquivalentAnalyses = mergeEquivalentAnalyses
+			};
+			CallWithRetry(channel => channel.UpdateGrammar(grammar), grammar);
+		}
+
+		/// <summary>Parses one word in the worker, using the grammar last sent.</summary>
+		public WordAnalysisDto[] ParseWord(string word, bool guessRoots)
+		{
+			WordAnalysisDto[] result = null;
+			CallWithRetry(channel => result = channel.ParseWord(word, guessRoots));
+			return result;
+		}
+
+		/// <summary>Parses a batch of words in one round trip, using the grammar last sent.</summary>
+		public IDictionary<string, WordAnalysisDto[]> ParseWordsBatch(string[] words, bool guessRoots)
+		{
+			IDictionary<string, WordAnalysisDto[]> result = null;
+			CallWithRetry(channel => result = channel.ParseWordsBatch(words, guessRoots));
+			return result;
+		}
+
+		/// <summary>
+		/// Kills the worker process (FieldWorks exit, or an idle timeout - design §4). The next
+		/// call after this lazily respawns and replays UpdateGrammar, same as a crash recovery.
+		/// </summary>
+		public void Shutdown()
+		{
+			lock (m_channelLock)
+			{
+				CloseChannel();
+				m_processManager.Shutdown();
+			}
+		}
+
+		/// <summary>
+		/// Runs <paramref name="call"/> against the worker; on a communication failure or
+		/// timeout, recovers the worker and retries exactly once before letting the error
+		/// reach the caller.
+		/// </summary>
+		/// <param name="call">The operation to invoke on the channel.</param>
+		/// <param name="grammarSentByCall">The grammar <paramref name="call"/> itself
+		/// delivers (UpdateGrammar), or null for calls that rely on the grammar the worker
+		/// already holds (ParseWord/ParseWordsBatch).</param>
+		private void CallWithRetry(Action<IHCWorkerService> call, HCGrammarDto grammarSentByCall = null)
+		{
+			IHCWorkerService channel = GetOrCreateChannel();
+			try
+			{
+				call(channel);
+			}
+			catch (Exception e) when (e is CommunicationException || e is TimeoutException)
+			{
+				if (e is FaultException)
+				{
+					// The worker answered with a fault, so the process is alive; only the
+					// channel needs replacing.
+					lock (m_channelLock)
+					{
+						CloseChannel();
+					}
+				}
+				else
+				{
+					// Crash, hang or broken pipe: closing the channel alone would reconnect to
+					// the same process whenever it has not (yet) exited, so kill it and let
+					// GetOrCreateChannel spawn a fresh worker (design §6 "respawn").
+					Shutdown();
+				}
+
+				IHCWorkerService retryChannel = GetOrCreateChannel();
+				// Replay the remembered grammar only for calls that depend on it. When the
+				// failed call is itself an UpdateGrammar, the retry delivers the new grammar
+				// and loading the stale one first would be wasted work.
+				HCGrammarDto lastGrammar = m_lastGrammar;
+				if (grammarSentByCall == null && lastGrammar != null)
+					retryChannel.UpdateGrammar(lastGrammar);
+				call(retryChannel);
+			}
+
+			// Reached only when the first attempt or the retry succeeded: remember the grammar
+			// the worker now holds for a future respawn's replay.
+			if (grammarSentByCall != null)
+				m_lastGrammar = grammarSentByCall;
+		}
+
+		/// <summary>
+		/// Returns the cached channel, or starts the worker if needed and opens a new
+		/// net.pipe channel to it.
+		/// </summary>
+		private IHCWorkerService GetOrCreateChannel()
+		{
+			lock (m_channelLock)
+			{
+				if (m_channel != null)
+					return m_channel;
+
+				string pipeName = m_processManager.EnsureStarted();
+				// One binding definition shared with the worker's ServiceHost (both sides must agree
+				// on quotas/timeouts).
+				NetNamedPipeBinding pipeBinding = PipeBindingFactory.Create();
+
+				m_factory = new ChannelFactory<IHCWorkerService>(
+					pipeBinding,
+					new EndpointAddress("net.pipe://localhost/" + pipeName));
+				m_channel = m_factory.CreateChannel();
+				return m_channel;
+			}
+		}
+
+		/// <summary>Aborts and forgets the current channel and factory, if any.</summary>
+		private void CloseChannel()
+		{
+			lock (m_channelLock)
+			{
+				try
+				{
+					(m_channel as ICommunicationObject)?.Abort();
+				}
+				catch (Exception)
+				{
+					// Best-effort teardown of a channel we already know is broken.
+				}
+				m_factory?.Abort();
+				m_channel = null;
+				m_factory = null;
+			}
+		}
+
+		public void Dispose()
+		{
+			Shutdown();
+		}
+	}
+}
diff --git a/Src/LexText/ParserCore/HCWorkerProcessManager.cs b/Src/LexText/ParserCore/HCWorkerProcessManager.cs
new file mode 100644
index 0000000000..953214a203
--- /dev/null
+++ b/Src/LexText/ParserCore/HCWorkerProcessManager.cs
@@ -0,0 +1,132 @@
+// Copyright (c) 2026 SIL International
+// This software is licensed under the LGPL, version 2.1 or later
+// (http://www.gnu.org/licenses/lgpl-2.1.html)
+
+using System;
+using System.Diagnostics;
+using System.IO;
+using System.Threading;
+using SIL.FieldWorks.Common.FwUtils;
+
+namespace SIL.FieldWorks.WordWorks.Parser
+{
+	/// <summary>
+	/// Spawns and watches the out-of-process HermitCrab worker (RUSTIFY-fieldworks-worker-design.md
+	/// §2/§4). Modeled directly on FLExBridgeHelper.cs's Process.Start + WaitForExit watchdog
+	/// pattern (Src\Common\FwUtils\FLExBridgeHelper.cs) - that is this codebase's existing
+	/// precedent for "spawn a helper process and notice if it dies," so this doesn't invent a new
+	/// one. One instance is expected to live for the lifetime of a FieldWorks process (owned by
+	/// HCWorkerClient); it does not itself talk WCF - that's HCWorkerClient's job, kept separate so
+	/// process lifecycle and channel lifecycle can fail/retry independently, matching the design's
+	/// architecture diagram (HCWorkerProcessManager box distinct from the client proxy box).
+	/// </summary>
+	public class HCWorkerProcessManager : IDisposable
+	{
+		private const string WorkerExeName = "HCWorker.exe";
+
+		private readonly object m_lock = new object();
+		private Process m_process;
+		private string m_pipeName;
+
+		/// <summary>
+		/// Pipe name of the currently running worker, or null if none is running. A new
+		/// GUID-based name is generated for every worker launch, so a respawned worker after a
+		/// crash still gets a fresh, non-colliding pipe name.
+		/// </summary>
+		public string PipeName => m_pipeName;
+
+		/// <summary>True while a spawned worker process exists and has not exited.</summary>
+		public bool IsRunning
+		{
+			get
+			{
+				lock (m_lock)
+				{
+					return m_process != null && !m_process.HasExited;
+				}
+			}
+		}
+
+		/// <summary>
+		/// Starts the worker if it is not already running (design §4: lazy start on first HC
+		/// parse request per session, not eagerly at FieldWorks startup). Returns the pipe name to
+		/// connect to. Safe to call repeatedly/concurrently.
+		/// </summary>
+		/// <returns>The pipe name of the running (possibly just-spawned) worker.</returns>
+		public string EnsureStarted()
+		{
+			lock (m_lock)
+			{
+				if (m_process != null)
+				{
+					if (!m_process.HasExited)
+						return m_pipeName;
+
+					// The previous worker died: release its OS handle before replacing it.
+					m_process.Dispose();
+					m_process = null;
+					m_pipeName = null;
+				}
+
+				// Kept local until the process has actually started, so PipeName never
+				// advertises a worker that failed to launch.
+				string pipeName = "HCWorker_" + Guid.NewGuid().ToString("N");
+				string exePath = Path.Combine(FwDirectoryFinder.ExeOrDllDirectory, WorkerExeName);
+
+				var startInfo = new ProcessStartInfo
+				{
+					UseShellExecute = false,
+					FileName = exePath,
+					Arguments = $"{pipeName} {Process.GetCurrentProcess().Id}",
+					CreateNoWindow = true,
+					RedirectStandardOutput = true
+				};
+
+				var process = new Process { StartInfo = startInfo };
+				try
+				{
+					process.Start();
+					// stdout is redirected, so it must be drained: otherwise the worker
+					// blocks as soon as the pipe buffer fills. Nothing consumes the text, so
+					// the asynchronous reader simply discards it.
+					process.BeginOutputReadLine();
+				}
+				catch (Exception)
+				{
+					process.Dispose();
+					throw;
+				}
+
+				// Mirrors FLExBridgeHelper.cs's process watchdog: a background thread that waits
+				// for the worker to exit. Detection of a dead worker by callers happens through
+				// HasExited in EnsureStarted()/IsRunning.
+				var watchdog = new Thread(() =>
+				{
+					try
+					{
+						process.WaitForExit();
+					}
+					catch (Exception)
+					{
+						// Process handle may already be invalid; either way treat it as exited.
+					}
+				})
+				{ IsBackground = true, Name = "HCWorker process watchdog" };
+				watchdog.Start();
+
+				m_process = process;
+				m_pipeName = pipeName;
+				return pipeName;
+			}
+		}
+
+		/// <summary>
+		/// Kills the worker (FieldWorks exit, or an idle timeout releasing its Server-GC memory
+		/// footprint - design §4 "Shutdown"). Safe to call when nothing is running.
+		/// </summary>
+		public void Shutdown()
+		{
+			lock (m_lock)
+			{
+				if (m_process == null)
+					return;
+				try
+				{
+					if (!m_process.HasExited)
+						m_process.Kill();
+				}
+				catch (Exception)
+				{
+					// Already exited or exiting; nothing more to do.
+				}
+				finally
+				{
+					m_process.Dispose();
+					m_process = null;
+					m_pipeName = null;
+				}
+			}
+		}
+
+		public void Dispose()
+		{
+			Shutdown();
+		}
+	}
+}
diff --git a/Src/LexText/ParserCore/IHCWorkerService.cs b/Src/LexText/ParserCore/IHCWorkerService.cs
new file mode 100644
index 0000000000..cbf9cebcb1
--- /dev/null
+++ b/Src/LexText/ParserCore/IHCWorkerService.cs
@@ -0,0 +1,89 @@
+// Copyright (c) 2026 SIL International
+// This software is licensed under the LGPL, version 2.1 or later
+// (http://www.gnu.org/licenses/lgpl-2.1.html)
+
+using System.Collections.Generic;
+using System.Runtime.Serialization;
+using System.ServiceModel;
+
+namespace SIL.FieldWorks.WordWorks.Parser
+{
+	/// <summary>
+	/// The WCF contract for the out-of-process HermitCrab worker (Src\LexText\HCWorker), defined
+	/// once here in ParserCore and referenced by BOTH the worker (which implements it) and
+	/// HCWorkerClient (which consumes it) - no hand-synced duplicate. The explicit Namespace/Name/
+	/// DataMember attributes pin the wire shape. See RUSTIFY-fieldworks-worker-design.md.
+	/// </summary>
+	[ServiceContract(Namespace = "http://sil.org/machine/hermitcrab/worker", Name = "IHCWorkerService")]
+	public interface IHCWorkerService
+	{
+		/// <summary>Delivers a compiled grammar plus morpher settings to the worker.</summary>
+		[OperationContract]
+		void UpdateGrammar(HCGrammarDto grammar);
+
+		/// <summary>Parses a single word and returns its analyses.</summary>
+		[OperationContract]
+		WordAnalysisDto[] ParseWord(string word, bool guessRoots);
+
+		/// <summary>Parses several words in one round trip.</summary>
+		// NOTE(review): the generic arguments of IDictionary look stripped from this copy of the
+		// patch (presumably word -> analyses) - confirm against the real source.
+		[OperationContract]
+		IDictionary ParseWordsBatch(string[] words, bool guessRoots);
+	}
+
+	/// <summary>
+	/// Payload of UpdateGrammar: the grammar XML and the morpher settings HCWorkerClient
+	/// sends along with it.
+	/// </summary>
+	[DataContract(Namespace = "http://sil.org/machine/hermitcrab/worker")]
+	public class HCGrammarDto
+	{
+		[DataMember]
+		public string CompiledGrammarXml { get; set; }
+
+		[DataMember]
+		public int DeletionReapplications { get; set; }
+
+		[DataMember]
+		public int MaxStemCount { get; set; }
+
+		[DataMember]
+		public bool MergeEquivalentAnalyses { get; set; }
+	}
+
+	/// <summary>One analysis of a word: its morphs, in the order the worker produced them.</summary>
+	[DataContract(Namespace = "http://sil.org/machine/hermitcrab/worker")]
+	public class WordAnalysisDto
+	{
+		[DataMember]
+		public MorphDto[] Morphs { get; set; }
+	}
+
+	/// <summary>
+	/// One morph's raw, LCM-lookup-ready fields, in the order HermitCrab.Worker encountered them
+	/// walking the parsed Word - i.e. exactly what HCParser.GetMorphs used to read directly off
+	/// the live Word/Allomorph/Morpheme object graph before parsing moved out-of-process. FormId/
+	/// MsaId/InflTypeId are the ids GetMorphs already resolves via IMoFormRepository/
+	/// IMoMorphSynAnalysisRepository/ILexEntryInflTypeRepository; the circumfix/infix placement
+	/// logic that used to run inline (checking a resolved IMoForm's MorphTypeRA) still runs here,
+	/// now over this flat list instead of over Annotation&lt;ShapeNode&gt;/Allomorph/Morpheme.
+	/// </summary>
+	[DataContract(Namespace = "http://sil.org/machine/hermitcrab/worker")]
+	public class MorphDto
+	{
+		/// <summary>Id of the form; HCParser.GetMorphs resolves it to an IMoForm.</summary>
+		[DataMember]
+		public int FormId { get; set; }
+
+		/// <summary>
+		/// Second form id. HCParser.GetMorphs treats a value greater than 0 as a circumfix
+		/// and uses it for a repeat occurrence of the same morpheme.
+		/// </summary>
+		[DataMember]
+		public int FormId2 { get; set; }
+
+		/// <summary>
+		/// When true and FormId2 is 0, HCParser.GetMorphs checks whether the form's morph
+		/// type is circumfix (affix-process-rule circumfix handling).
+		/// </summary>
+		[DataMember]
+		public bool IsAffixProcessAllomorph { get; set; }
+
+		/// <summary>Form text; used as the guessed string only when Guessed is true.</summary>
+		[DataMember]
+		public string FormStr { get; set; }
+
+		[DataMember]
+		public bool Guessed { get; set; }
+
+		/// <summary>Id resolved by HCParser.GetMorphs to an IMoMorphSynAnalysis.</summary>
+		[DataMember]
+		public int MsaId { get; set; }
+
+		/// <summary>Optional inflection-type id; only looked up when greater than 0.</summary>
+		[DataMember]
+		public int InflTypeId { get; set; }
+
+		/// <summary>
+		/// Key HCParser.GetMorphs uses to tell whether a morpheme has already been seen
+		/// within the analysis.
+		/// </summary>
+		[DataMember]
+		public int MorphemeIndex { get; set; }
+	}
+}
diff --git a/Src/LexText/ParserCore/ParseFiler.cs b/Src/LexText/ParserCore/ParseFiler.cs
index 87014466b4..45e9043302 100644
--- a/Src/LexText/ParserCore/ParseFiler.cs
+++ b/Src/LexText/ParserCore/ParseFiler.cs
@@ -13,6 +13,7 @@
 using SIL.LCModel.Core.Text;
 using SIL.LCModel.DomainServices;
 using SIL.LCModel.Infrastructure;
+using SIL.Reporting;
 using XCore;
 
 namespace SIL.FieldWorks.WordWorks.Parser
@@ -167,13 +168,17 @@ private bool UpdateWordforms(object parameter)
 			// update all of the wordforms in a batch, this might slow down the UI thread a little, if it causes too much unresponsiveness
 			// we can bail out early if there is a message in the Win32 message queue
 
-			IEnumerable results;
+			WordformUpdateWork[] results;
 			lock (m_syncRoot)
 			{
 				results = m_workQueue.ToArray();
 				m_workQueue.Clear();
 			}
 
+			// Instrumentation: filing is the serial part of bulk parsing; time it so the
+			// parse:file split can be measured against the parse timings logged by ParserWorker.
+			var filingTimer = Stopwatch.StartNew();
+
 			// Update work.Wordform with its own NonUndoableUnitOfWorkHelper
 			// so that PropChanged will be triggered when it is updated below.
 			NonUndoableUnitOfWorkHelper.Do(m_cache.ActionHandlerAccessor, () =>
@@ -243,6 +248,10 @@ from ann in m_baseAnnotationRepository.AllInstances()
 					FireWordformUpdated(work.Wordform, work.Priority, work.ParseResult, work.CheckParser);
 				}
 			});
+
+			// Only log when something was actually filed, so idle passes stay out of the log.
+			filingTimer.Stop();
+			if (results.Length > 0)
+				Logger.WriteMinorEvent("Parser filing: {0} results filed in {1} ms", results.Length, filingTimer.ElapsedMilliseconds);
 			return true;
 		}
 
diff --git a/Src/LexText/ParserCore/ParserCoreTests/ParseWorkerTests.cs b/Src/LexText/ParserCore/ParserCoreTests/ParseWorkerTests.cs
index a62cfb9e1a..3b4249c99e 100644
--- a/Src/LexText/ParserCore/ParserCoreTests/ParseWorkerTests.cs
+++ b/Src/LexText/ParserCore/ParserCoreTests/ParseWorkerTests.cs
@@ -161,6 +161,69 @@ public void UpdateWordform()
 			CheckAnalysisSize("Cats", 0, false);
 			CheckAnalysisSize("cats", 1, false);
 		}
+
+		/// <summary>
+		/// Parsing a batch of wordforms in parallel must produce exactly the same analyses as
+		/// parsing them one at a time (the key acceptance criterion for the concurrent bulk
+		/// parse). Two disjoint sets of identical-shaped wordforms are parsed, one serially and
+		/// one in parallel, and the resulting analyses are compared.
+		/// </summary>
+		// NOTE(review): generic type arguments (Func, List, Dictionary, GetInstance) look
+		// stripped from this copy of the patch - confirm against the real source.
+		[Test]
+		public void ParseAndUpdateWordforms_ParallelMatchesSerial()
+		{
+			IMoStemAllomorph catNForm = null;
+			IMoStemMsa catNMsa = null;
+			UndoableUnitOfWorkHelper.Do("Undo stuff", "Redo stuff", m_actionHandler, () =>
+			{
+				// Noun
+				ILexEntry catN = Cache.ServiceLocator.GetInstance().Create();
+				catNForm = Cache.ServiceLocator.GetInstance().Create();
+				catN.AlternateFormsOS.Add(catNForm);
+				catNForm.Form.VernacularDefaultWritingSystem = TsStringUtils.MakeString("catn", m_vernacularWS.Handle);
+				catNMsa = Cache.ServiceLocator.GetInstance().Create();
+				catN.MorphoSyntaxAnalysesOC.Add(catNMsa);
+			});
+
+			// The same single-analysis result is returned for every (lowercase) wordform.
+			var sharedResult = new ParseResult(new[]
+			{
+				new ParseAnalysis(new[] { new ParseMorph(catNForm, catNMsa) })
+			});
+
+			// Disjoint sets of distinct lowercase wordforms so the two runs do not interfere.
+			// Lowercase forms also skip the lowercase-variant re-parse, keeping the test focused.
+			var serialForms = new[] { "able", "baker", "charlie", "dog", "easy", "foxtrot", "golf" };
+			var parallelForms = new[] { "hotel", "india", "juliet", "kilo", "lima", "mike", "november" };
+
+			// Parses the given forms with a fresh worker at the requested degree of parallelism
+			// and returns the analysis count recorded per form.
+			Func> parseAll = (forms, maxDop) =>
+			{
+				var worker = new ParserWorker(Cache, null, HandleTaskUpdate, m_idleQueue, null);
+				worker.Parser = new TestParserClass(sharedResult, null);
+				var wordforms = new List();
+				foreach (string form in forms)
+					wordforms.Add(FindOrCreateWordform(form));
+
+				// SUT
+				worker.ParseAndUpdateWordforms(wordforms, ParserPriority.Low, false, maxDop);
+				ExecuteIdleQueue();
+
+				var counts = new Dictionary();
+				foreach (IWfiWordform wf in wordforms)
+					counts[wf.Form.VernacularDefaultWritingSystem.Text] = wf.AnalysesOC.Count;
+				return counts;
+			};
+
+			Dictionary serialCounts = parseAll(serialForms, 1);
+			Dictionary parallelCounts = parseAll(parallelForms, 4);
+
+			// Every serially-parsed wordform got exactly one analysis.
+			foreach (string form in serialForms)
+				Assert.That(serialCounts[form], Is.EqualTo(1), "serial analysis count for " + form);
+			// The parallel batch produced the identical result for its (equivalent) wordforms:
+			// no dropped, duplicated, or missing analyses.
+			foreach (string form in parallelForms)
+				Assert.That(parallelCounts[form], Is.EqualTo(1), "parallel analysis count for " + form);
+		}
 		#endregion // Tests
 	}
 
diff --git a/Src/LexText/ParserCore/ParserScheduler.cs b/Src/LexText/ParserCore/ParserScheduler.cs
index f96a38f754..a482675e5b 100644
--- a/Src/LexText/ParserCore/ParserScheduler.cs
+++ b/Src/LexText/ParserCore/ParserScheduler.cs
@@ -8,6 +8,7 @@
 
 using System;
 using System.Collections.Generic;
+using System.Linq;
 using SIL.FieldWorks.Common.FwUtils;
 using SIL.LCModel.Utils;
 using SIL.LCModel;
@@ -54,12 +55,17 @@ abstract class ParserWork
 		{
 			protected readonly ParserScheduler m_scheduler;
 			protected readonly ParserPriority m_priority;
+			// The number of wordforms this work item represents. The queue counts track
+			// wordforms (not work items) so the "Queue: low/med/high" display and the idle
+			// detection stay meaningful when many wordforms are batched into one work item.
+			private readonly int m_queueCount;
 
-			protected ParserWork(ParserScheduler scheduler, ParserPriority priority)
+			protected ParserWork(ParserScheduler scheduler, ParserPriority priority, int queueCount = 1)
 			{
 				m_scheduler = scheduler;
 				m_priority = priority;
-				m_scheduler.IncrementQueueCount(m_priority);
+				m_queueCount = queueCount;
+				m_scheduler.IncrementQueueCount(m_priority, m_queueCount);
 			}
 
 			public virtual void DoWork()
@@ -67,7 +73,7 @@ public virtual void DoWork()
 				// This undoes the IncrementQueueCount above.
 				// Subclasses should always call base.DoWork().
 				// Nobody else should call IncrementQueueCount or DecrementQueueCount.
-				m_scheduler.DecrementQueueCount(m_priority);
+				m_scheduler.DecrementQueueCount(m_priority, m_queueCount);
 			}
 		}
 
@@ -112,6 +118,31 @@ public override void DoWork()
 			}
 		}
 
+		/// <summary>
+		/// A batch of wordforms parsed together. The parses may run concurrently (for
+		/// thread-safe parsers); filing of results stays on the existing serial idle-queue path.
+		/// </summary>
+		class UpdateWordformsWork : ParserWork
+		{
+			private readonly IList<IWfiWordform> m_wordforms;
+			private readonly bool m_checkParser;
+			private readonly int m_maxDegreeOfParallelism;
+
+			/// <summary>
+			/// Creates the work item and adds <c>wordforms.Count</c> to the queue count for
+			/// <paramref name="priority"/> (via the base constructor).
+			/// </summary>
+			public UpdateWordformsWork(ParserScheduler scheduler, ParserPriority priority, IList<IWfiWordform> wordforms, bool checkParser, int maxDegreeOfParallelism)
+				: base(scheduler, priority, wordforms.Count)
+			{
+				m_wordforms = wordforms;
+				m_checkParser = checkParser;
+				m_maxDegreeOfParallelism = maxDegreeOfParallelism;
+			}
+
+			public override void DoWork()
+			{
+				try
+				{
+					m_scheduler.m_parserWorker.ParseAndUpdateWordforms(m_wordforms, m_priority, m_checkParser, m_maxDegreeOfParallelism);
+				}
+				finally
+				{
+					// Always give back this item's share of the queue count. If the parse
+					// throws and this is skipped, the count stays inflated by the whole chunk
+					// and the scheduler can never report idle again.
+					base.DoWork();
+				}
+			}
+		}
+
 		class ReloadGrammarAndLexiconWork : ParserWork
 		{
 			public ReloadGrammarAndLexiconWork(ParserScheduler scheduler)
@@ -254,18 +285,18 @@ public int GetQueueSize(ParserPriority priority)
 			return m_queueCounts[(int) priority];
 		}
 
-		private void IncrementQueueCount(ParserPriority priority)
+		private void IncrementQueueCount(ParserPriority priority, int count = 1)
 		{
 			lock (SyncRoot)
-				m_queueCounts[(int) priority]++;
+				m_queueCounts[(int) priority] += count;
 		}
 
-		private void DecrementQueueCount(ParserPriority priority)
+		private void DecrementQueueCount(ParserPriority priority, int count = 1)
 		{
 			bool isIdle;
 			lock (SyncRoot)
 			{
-				m_queueCounts[(int) priority]--;
+				m_queueCounts[(int) priority] -= count;
 				isIdle = m_queueCounts[(int)ParserPriority.TryAWord] == 0
 					&& m_queueCounts[(int)ParserPriority.Low] == 0
 					&& m_queueCounts[(int)ParserPriority.Medium] == 0
@@ -302,12 +333,38 @@ public void ScheduleOneWordformForUpdate(IWfiWordform wordform, ParserPriority p
 			m_thread.EnqueueWork(priority, new UpdateWordformWork(this, priority, wordform, checkParser));
 		}
 
+		/// <summary>
+		/// Number of bounded chunks-worth of parallel "waves" packed into a single batch work
+		/// item. Larger amortizes the cost of starting a parallel loop; smaller lets an
+		/// interactive Try-A-Word (higher priority) or grammar reload preempt sooner.
+		/// </summary>
+		private const int ParseChunkMultiplier = 8;
+
 		public void ScheduleWordformsForUpdate(IEnumerable<IWfiWordform> wordforms, ParserPriority priority, bool checkParser)
 		{
 			CheckDisposed();
 
-			foreach (var wordform in wordforms)
-				ScheduleOneWordformForUpdate(wordform, priority, checkParser);
+			// Materialize once: callers commonly pass lazy queries (AllInstances(), Union(), ...).
+			IList<IWfiWordform> wordformList = wordforms as IList<IWfiWordform> ?? wordforms.ToList();
+			if (wordformList.Count == 0)
+				return;
+
+			int maxDegreeOfParallelism = m_parserWorker.MaxDegreeOfParallelism;
+			// Split into bounded chunks and enqueue one batch work item per chunk. The work
+			// items go through the same priority queue, so a Try-A-Word (priority TryAWord) or
+			// a reload (priority ReloadGrammarAndLexicon) still preempts between chunks, and
+			// Stop()/Dispose only has to wait for the current chunk. A non-parallel parser keeps
+			// one wordform per work item, exactly as before.
+			int chunkSize = maxDegreeOfParallelism <= 1 ? 1 : maxDegreeOfParallelism * ParseChunkMultiplier;
+
+			for (int start = 0; start < wordformList.Count; start += chunkSize)
+			{
+				// Each chunk gets its own copy so later changes to the caller's list cannot
+				// alter work that is already queued.
+				int count = Math.Min(chunkSize, wordformList.Count - start);
+				var chunk = new List<IWfiWordform>(count);
+				for (int i = 0; i < count; i++)
+					chunk.Add(wordformList[start + i]);
+				m_thread.EnqueueWork(priority, new UpdateWordformsWork(this, priority, chunk, checkParser, maxDegreeOfParallelism));
+			}
 		}
 
 		private void HandleTaskUpdate(TaskReport task)
diff --git a/Src/LexText/ParserCore/ParserWorker.cs b/Src/LexText/ParserCore/ParserWorker.cs
index 58f494f7bd..a4ca43357f 100644
--- a/Src/LexText/ParserCore/ParserWorker.cs
+++ b/Src/LexText/ParserCore/ParserWorker.cs
@@ -26,12 +26,18 @@
 */
 
 using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
 using SIL.FieldWorks.Common.FwUtils;
 using SIL.LCModel.Core.KernelInterfaces;
 using SIL.LCModel.Core.Text;
 using SIL.LCModel;
 using
SIL.LCModel.Infrastructure;
 using SIL.ObjectModel;
+using SIL.Reporting;
 using XCore;
 using SIL.LCModel.DomainServices;
 using System.Xml.Linq;
@@ -122,7 +128,278 @@ public void TryAWord(string sForm, bool fDoTrace, int[] sSelectTraceMorphs)
 			}
 		}
 
-		public bool ParseAndUpdateWordform(IWfiWordform wordform, ParserPriority priority, bool checkParser = false)
+		/// <summary>
+		/// Indicates whether the active parser is safe to run on several wordforms concurrently.
+		/// Only the HermitCrab parser exposes a thread-safe, shared morpher; XAmple wraps a
+		/// single-instance native parser and must stay serial.
+		/// </summary>
+		internal bool SupportsParallelParsing => m_parser is HCParser;
+
+		/// <summary>
+		/// Property name used to override the bulk-parse concurrency from settings.
+		/// </summary>
+		internal const string MaxConcurrencyPropertyName = "ParserMaxConcurrency";
+
+		/// <summary>
+		/// The maximum number of wordforms to parse concurrently during bulk parsing.
+		/// Returns 1 (serial) unless the active parser is thread-safe (HermitCrab).
+		/// </summary>
+		/// <remarks>
+		/// Measured bulk HermitCrab parsing scales to only ~2.4x at 4 concurrent parses and
+		/// plateaus around ~2.8x thereafter on a representative project — the ceiling is inside
+		/// the HermitCrab morpher (shared compiled grammar / allocation-bound), not in this code.
+		/// So we default to a small cap that captures most of the available speed-up while
+		/// leaving cores free so the rest of the UI stays responsive during a "Parse All Words".
+		/// Override (up or down) with the "ParserMaxConcurrency" setting.
+		/// </remarks>
+		public int MaxDegreeOfParallelism
+		{
+			get
+			{
+				CheckDisposed();
+				if (!SupportsParallelParsing)
+					return 1;
+				int defaultDop = Math.Max(1, Math.Min(Environment.ProcessorCount - 1, 4));
+				int dop = m_propertyTable != null
+					? m_propertyTable.GetIntProperty(MaxConcurrencyPropertyName, defaultDop)
+					: defaultDop;
+				return Math.Max(1, dop);
+			}
+		}
+
+		/// <summary>
+		/// Parse a batch of wordforms, optionally in parallel, and file the results.
+		/// The grammar/lexicon update check is done once up front (it is not thread-safe and
+		/// must not run inside the parallel body). Each wordform's parse is CPU-bound and
+		/// lock-free; filing is enqueued to the thread-safe idle queue and still happens
+		/// serially on the UI thread.
+		/// </summary>
+		/// <param name="wordforms">The wordforms to parse.</param>
+		/// <param name="priority">The priority the parse is run at.</param>
+		/// <param name="checkParser">Whether this is a parser check (no model update).</param>
+		/// <param name="maxDegreeOfParallelism">Maximum wordforms to parse concurrently;
+		/// 1 means serial. Callers must pass 1 for parsers that are not thread-safe.</param>
+		/// <exception cref="ArgumentNullException"><paramref name="wordforms"/> is null.</exception>
+		public void ParseAndUpdateWordforms(IList<IWfiWordform> wordforms, ParserPriority priority, bool checkParser, int maxDegreeOfParallelism)
+		{
+			CheckDisposed();
+
+			if (wordforms == null)
+				throw new ArgumentNullException(nameof(wordforms));
+			if (wordforms.Count == 0)
+				return;
+
+			// Bring the grammar/lexicon up to date exactly once, on this (single) dispatcher
+			// thread, before any concurrent parsing begins (Update() is not thread-safe).
+			CheckNeedsUpdate();
+
+			// Instrumentation: batch wall-clock vs. summed per-wordform parse time lets us
+			// report the achieved parallelism and (together with ParseFiler's filing time) the
+			// parse:file split.
+			var batchTimer = Stopwatch.StartNew();
+			long summedWordMs = 0;
+
+			bool parallel = maxDegreeOfParallelism > 1 && wordforms.Count > 1;
+
+			// Route the whole batch through the out-of-process worker in one call instead of
+			// an in-process Parallel.ForEach over individual ParseWord calls - the worker
+			// parallelizes internally under Server GC with no artificial cap (RUSTIFY-
+			// fieldworks-worker-design.md §5), so FieldWorks no longer needs (or benefits
+			// from) its own per-wordform parallel loop for HC. If the batch call fails outright
+			// (design §6: after HCWorkerClient's own retry-once), this falls back to the
+			// per-wordform guarded loop below rather than losing the run.
+			long? batchWordMs = null;
+			if (parallel && m_parser is HCParser hcParser)
+				batchWordMs = ParseAndUpdateWordformsBatch(hcParser, wordforms, priority, checkParser);
+
+			if (batchWordMs.HasValue)
+			{
+				summedWordMs = batchWordMs.Value;
+			}
+			else if (parallel)
+			{
+				// Parallel path: non-HC thread-safe parsers, or HC after a failed batch call.
+				summedWordMs = ParseWordformsInParallel(wordforms, priority, checkParser, maxDegreeOfParallelism);
+			}
+			else
+			{
+				// Serial path: behaves exactly as the original one-wordform-at-a-time code,
+				// including letting unexpected exceptions propagate (so XAmple keeps its
+				// existing error handling).
+				foreach (IWfiWordform wordform in wordforms)
+				{
+					var wordTimer = Stopwatch.StartNew();
+					ParseAndUpdateWordform(wordform, priority, checkParser, ensureUpToDate: false);
+					summedWordMs += wordTimer.ElapsedMilliseconds;
+				}
+			}
+
+			batchTimer.Stop();
+			long wallMs = batchTimer.ElapsedMilliseconds;
+			double effectiveParallelism = wallMs > 0 ? summedWordMs / (double) wallMs : 0.0;
+			Logger.WriteMinorEvent(
+				"Parser batch: {0} wordforms in {1} ms wall (summed parse {2} ms, effective parallelism {3:0.0}x, maxDOP {4})",
+				wordforms.Count, wallMs, summedWordMs, effectiveParallelism, maxDegreeOfParallelism);
+		}
+
+		/// <summary>
+		/// Parses the wordforms with a bounded Parallel.ForEach, guarding each wordform so one
+		/// unexpected exception cannot abort the whole batch (Gotcha #6).
+		/// </summary>
+		/// <returns>The summed per-wordform elapsed milliseconds.</returns>
+		private long ParseWordformsInParallel(IList<IWfiWordform> wordforms, ParserPriority priority, bool checkParser, int maxDegreeOfParallelism)
+		{
+			long summedWordMs = 0;
+			var options = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };
+			Parallel.ForEach(wordforms, options, wordform =>
+			{
+				var wordTimer = Stopwatch.StartNew();
+				ParseAndUpdateWordformGuarded(wordform, priority, checkParser);
+				Interlocked.Add(ref summedWordMs, wordTimer.ElapsedMilliseconds);
+			});
+			return summedWordMs;
+		}
+
+		/// <summary>
+		/// Runs ParseAndUpdateWordform for one wordform, containing any unexpected
+		/// exception so that a single bad wordform cannot abort an entire parallel batch.
+		/// (HCParser already converts parse failures into ParseResult error results; this guards
+		/// against anything else.) Contained exceptions are written to the log rather than
+		/// silently dropped.
+		/// </summary>
+		private void ParseAndUpdateWordformGuarded(IWfiWordform wordform, ParserPriority priority, bool checkParser)
+		{
+			try
+			{
+				ParseAndUpdateWordform(wordform, priority, checkParser, ensureUpToDate: false);
+			}
+			catch (Exception parseError)
+			{
+				Logger.WriteError("Parser: unexpected exception while parsing a wordform in a batch", parseError);
+				// File an error result so clients still learn the parser finished with this wordform.
+				try
+				{
+					var parseResult = new ParseResult(string.Format(ParserCoreStrings.ksHCInvalidWordform, "", 0, "", ""));
+					m_parseFiler.ProcessParse(wordform, priority, parseResult, checkParser);
+				}
+				catch (Exception filingError)
+				{
+					// Nothing more we can safely do for this wordform beyond recording it.
+					Logger.WriteError("Parser: could not file the error result for a wordform", filingError);
+				}
+			}
+		}
+
+		/// <summary>
+		/// One batch item's normalized form(s), gathered under a single read-lock pass instead of
+		/// once per wordform (Src\LexText\ParserCore\ParserWorker.cs's original ParseAndUpdateWordform
+		/// re-acquired the read lock per wordform; the batch RPC below needs no lock at all, so
+		/// there is no reason to keep doing that here).
+		/// </summary>
+		private class BatchItem
+		{
+			public IWfiWordform Wordform;
+			public string Word;
+			public string LowerWord;
+			public ITsString LowerText;
+		}
+
+		/// <summary>
+		/// Design §5's bulk path: gathers every wordform's normalized form (and, same as
+		/// ParseAndUpdateWordform, its lowercase variant when different) into one word list, makes
+		/// a single HCParser.ParseWordsBatch WCF round trip for the whole list, then files results
+		/// exactly as ParseAndUpdateWordform would have per wordform.
Returns null if the batch + /// call itself failed (caller falls back to the per-wordform path), otherwise the batch's + /// wall-clock ms (there is no longer a meaningful per-wordform parse-time split once one + /// RPC covers the whole batch - see design §7's note that this instrumentation shifts to + /// worker-parse-time/IPC-overhead/LCM-lock-time instead). + /// + private long? ParseAndUpdateWordformsBatch(HCParser hcParser, IList wordforms, ParserPriority priority, bool checkParser) + { + var items = new List(wordforms.Count); + using (new WorkerThreadReadHelper(m_cache.ServiceLocator.GetInstance())) + { + var normalizer = CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFD); + foreach (IWfiWordform wordform in wordforms) + { + ITsString form = wordform.IsValidObject ? wordform.Form.VernacularDefaultWritingSystem : null; + if (form == null || string.IsNullOrEmpty(form.Text)) + { + items.Add(new BatchItem { Wordform = wordform }); + continue; + } + + var item = new BatchItem + { + Wordform = wordform, + Word = normalizer.Normalize(form.Text.Replace(' ', '.')) + }; + + var cf = new CaseFunctions(m_cache.ServiceLocator.WritingSystemManager.Get(form.get_WritingSystemAt(0))); + string sLower = cf.ToLower(form.Text); + if (sLower != form.Text) + { + item.LowerWord = normalizer.Normalize(sLower.Replace(' ', '.')); + item.LowerText = TsStringUtils.MakeString(sLower, form.get_WritingSystem(0)); + } + items.Add(item); + } + } + + string[] batchWords = items + .SelectMany(i => new[] { i.Word, i.LowerWord }) + .Where(w => w != null) + .Distinct() + .ToArray(); + if (batchWords.Length == 0) + { + // Every wordform was invalid/empty; nothing to send the worker, but still file + // per-wordform so clients learn the parser finished with each of them. 
+ foreach (BatchItem item in items) + FileInvalidWordform(item.Wordform, priority, checkParser); + return 0; + } + + var wordTimer = Stopwatch.StartNew(); + IDictionary resultsByWord; + using (var task = new TaskReport(string.Format(ParserCoreStrings.ksParsingX, batchWords[0]), m_taskUpdateHandler)) + { + resultsByWord = hcParser.ParseWordsBatch(batchWords); + } + wordTimer.Stop(); + + if (resultsByWord == null) + return null; + + long perWordMs = items.Count > 0 ? wordTimer.ElapsedMilliseconds / items.Count : 0; + foreach (BatchItem item in items) + { + if (item.Word == null) + { + FileInvalidWordform(item.Wordform, priority, checkParser); + continue; + } + + ParseResult result; + if (!resultsByWord.TryGetValue(item.Word, out result)) + result = new ParseResult(string.Format(ParserCoreStrings.ksHCInvalidWordform, "", 0, "", "")); + result.ParseTime = perWordMs; + + if (item.LowerWord != null && resultsByWord.TryGetValue(item.LowerWord, out ParseResult lcResult)) + { + lcResult.ParseTime = perWordMs; + if (lcResult.Analyses.Count > 0 && lcResult.ErrorMessage == null) + { + // Don't turn lcText into a wordform here. + // This avoids a problem with broadcasting PropChanged (cf. LT-22079). 
+ m_parseFiler.ProcessParse(item.LowerText, 0, lcResult, checkParser); + } + } + + m_parseFiler.ProcessParse(item.Wordform, priority, result, checkParser); + } + + return wordTimer.ElapsedMilliseconds; + } + + private void FileInvalidWordform(IWfiWordform wordform, ParserPriority priority, bool checkParser) + { + var parseResult = new ParseResult(string.Format(ParserCoreStrings.ksHCInvalidWordform, "", 0, "", "")); + m_parseFiler.ProcessParse(wordform, priority, parseResult, checkParser); + } + + public bool ParseAndUpdateWordform(IWfiWordform wordform, ParserPriority priority, bool checkParser = false, bool ensureUpToDate = true) { CheckDisposed(); @@ -144,7 +421,10 @@ public bool ParseAndUpdateWordform(IWfiWordform wordform, ParserPriority priorit return false; } - CheckNeedsUpdate(); + // During a parallel batch the caller has already ensured the parser is up to date; + // Update() is not thread-safe so it must not run inside the parallel body. + if (ensureUpToDate) + CheckNeedsUpdate(); var normalizer = CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFD); var word = normalizer.Normalize(form.Text.Replace(' ', '.')); ParseResult result = null; diff --git a/Src/LexText/ParserCore/PipeBindingFactory.cs b/Src/LexText/ParserCore/PipeBindingFactory.cs new file mode 100644 index 0000000000..1066354e97 --- /dev/null +++ b/Src/LexText/ParserCore/PipeBindingFactory.cs @@ -0,0 +1,42 @@ +// Copyright (c) 2026 SIL International +// This software is licensed under the LGPL, version 2.1 or later +// (http://www.gnu.org/licenses/lgpl-2.1.html) + +using System; +using System.ServiceModel; + +namespace SIL.FieldWorks.WordWorks.Parser +{ + /// + /// The one net.pipe binding definition shared by both ends of the HermitCrab worker channel: + /// the worker's ServiceHost (HCWorker.exe) and the in-FieldWorks client (HCWorkerClient). Both + /// sides must agree on quotas/timeouts, so there is exactly one copy here. 
+	///
+	public static class PipeBindingFactory
+	{
+		// Real compiled grammars can be several MB once serialized to the HC.NET XML input format
+		// (a real Sena grammar is ~1.4 MB). NetNamedPipeBinding's 64 KB default is nowhere near
+		// enough and fails with a low-level "pipe is being closed" error rather than a clear
+		// quota-exceeded one, so size generously - grammars only grow as projects grow.
+		private const long MaxMessageSize = 256L * 1024 * 1024;
+
+		/// <summary>
+		/// Builds a new net.pipe binding with security mode None, message/buffer quotas sized
+		/// from MaxMessageSize, and ten-minute send and receive timeouts.
+		/// </summary>
+		/// <returns>A freshly configured <see cref="NetNamedPipeBinding"/>.</returns>
+		public static NetNamedPipeBinding Create()
+		{
+			// The int-typed quotas cannot hold a long, so clamp once and reuse.
+			int maxSizeAsInt = ClampToInt(MaxMessageSize);
+			TimeSpan tenMinutes = TimeSpan.FromMinutes(10);
+
+			var binding = new NetNamedPipeBinding
+			{
+				MaxBufferSize = maxSizeAsInt,
+				MaxReceivedMessageSize = MaxMessageSize,
+				MaxBufferPoolSize = MaxMessageSize,
+				SendTimeout = tenMinutes,
+				ReceiveTimeout = tenMinutes
+			};
+			binding.Security.Mode = NetNamedPipeSecurityMode.None;
+
+			var quotas = binding.ReaderQuotas;
+			quotas.MaxArrayLength = maxSizeAsInt;
+			quotas.MaxStringContentLength = maxSizeAsInt;
+			quotas.MaxBytesPerRead = 65536;
+			quotas.MaxDepth = 64;
+			quotas.MaxNameTableCharCount = 65536;
+
+			return binding;
+		}
+
+		/// <summary>
+		/// Converts <paramref name="value"/> to int, substituting int.MaxValue for anything larger.
+		/// </summary>
+		private static int ClampToInt(long value)
+		{
+			return value > int.MaxValue ? int.MaxValue : (int)value;
+		}
+	}
+}
diff --git a/Src/Utilities/pcpatrflex/DisambiguateInFLExDB/DisambiguateInFLExDBTests/ParserConcurrencyBenchmark.cs b/Src/Utilities/pcpatrflex/DisambiguateInFLExDB/DisambiguateInFLExDBTests/ParserConcurrencyBenchmark.cs
new file mode 100644
index 0000000000..607baf7064
--- /dev/null
+++ b/Src/Utilities/pcpatrflex/DisambiguateInFLExDB/DisambiguateInFLExDBTests/ParserConcurrencyBenchmark.cs
@@ -0,0 +1,183 @@
+// Copyright (c) 2026 SIL International
+// This software is licensed under the LGPL, version 2.1 or later
+// (http://www.gnu.org/licenses/lgpl-2.1.html)
+//
+// Headless benchmark for concurrent bulk parsing (Parse All Words).
+// NOT part of the normal test suite ([Explicit]); run by FullyQualifiedName filter against a
+// real project whose path is given in the FW_BENCH_FWDATA environment variable.
+
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Linq;
+using GenerateHCConfig;
+using NUnit.Framework;
+using SIL.FieldWorks;
+using SIL.FieldWorks.Common.FwUtils;
+using SIL.FieldWorks.WordWorks.Parser;
+using SIL.LCModel;
+using SIL.LCModel.DomainServices;
+using SIL.LCModel.Infrastructure;
+using SIL.LCModel.Utils;
+
+namespace SIL.DisambiguateInFLExDBTests
+{
+	/// <summary>
+	/// Manual benchmark that times ParserWorker.ParseAndUpdateWordforms over a real project at
+	/// several degrees of parallelism. Controlled by environment variables: FW_BENCH_FWDATA
+	/// (required project path), FW_BENCH_LIMIT (optional positive cap on the number of
+	/// wordforms), FW_BENCH_SERIAL_ONLY=1 and FW_BENCH_PARALLEL_ONLY=1 (optional modes).
+	/// </summary>
+	[TestFixture]
+	[Explicit("Performance benchmark; run manually with FW_BENCH_FWDATA set")]
+	internal class ParserConcurrencyBenchmark
+	{
+		private LcmCache m_cache;
+
+		/// <summary>Opens the project named by FW_BENCH_FWDATA into an LcmCache.</summary>
+		[OneTimeSetUp]
+		public void Setup()
+		{
+			string fwdata = Environment.GetEnvironmentVariable("FW_BENCH_FWDATA");
+			Assert.That(fwdata, Is.Not.Null.And.Not.Empty, "Set FW_BENCH_FWDATA to a .fwdata path");
+			Assert.That(File.Exists(fwdata), Is.True, "fwdata not found: " + fwdata);
+
+			FwRegistryHelper.Initialize();
+			FwUtils.InitializeIcu();
+			var sync = new SingleThreadedSynchronizeInvoke();
+			var logger = new ConsoleLogger(sync);
+			var dirs = new NullFdoDirectories();
+			var settings = new LcmSettings();
+			var progress = new NullThreadedProgress(sync);
+			var projId = new ProjectId(fwdata);
+			m_cache = LcmCache.CreateCacheFromExistingData(projId, "en", logger, dirs, settings, progress);
+		}
+
+		/// <summary>Unlocks and disposes the cache opened by <see cref="Setup"/>, if any.</summary>
+		[OneTimeTearDown]
+		public void Teardown()
+		{
+			if (m_cache != null)
+			{
+				ProjectLockingService.UnlockCurrentProject(m_cache);
+				m_cache.Dispose();
+				m_cache = null;
+			}
+		}
+
+		/// <summary>
+		/// Runs the benchmark: a one-word warm-up, then either the serial-only run, the
+		/// parallel-only runs, or a serial baseline followed by a sweep over degrees of
+		/// parallelism, logging timings as it goes.
+		/// </summary>
+		[Test]
+		[Timeout(2400000)]
+		public void Benchmark()
+		{
+			// Make sure we exercise the HermitCrab (parallelizable) path.
+			NonUndoableUnitOfWorkHelper.Do(m_cache.ActionHandlerAccessor, () =>
+				m_cache.LanguageProject.MorphologicalDataOA.ActiveParser = "HC");
+
+			var allWordforms = m_cache.ServiceLocator.GetInstance<IWfiWordformRepository>().AllInstances().ToList();
+			// With nothing to parse every per-wordform figure below would be NaN; fail clearly instead.
+			Assert.That(allWordforms, Is.Not.Empty, "Project has no wordforms to benchmark");
+			int cores = Environment.ProcessorCount;
+
+			// Keep the A/B subset small enough that the SERIAL pass finishes quickly.
+			// A missing, unparsable or non-positive FW_BENCH_LIMIT means "no limit".
+			int limit = allWordforms.Count;
+			string limitEnv = Environment.GetEnvironmentVariable("FW_BENCH_LIMIT");
+			if (!string.IsNullOrEmpty(limitEnv) && int.TryParse(limitEnv, out int parsedLimit) && parsedLimit > 0)
+				limit = Math.Min(parsedLimit, allWordforms.Count);
+			var wordforms = allWordforms.Take(limit).ToList();
+
+			Log($"Project wordforms: {allWordforms.Count} (benchmarking {wordforms.Count}) logical cores: {cores}");
+
+			using (var idleQueue = new IdleQueue { IsPaused = true })
+			using (var worker = new ParserWorker(m_cache, null, t => { }, idleQueue, Path.GetTempPath()))
+			{
+				// One-time grammar load + JIT warm-up (parse a single word).
+				var loadSw = Stopwatch.StartNew();
+				worker.ParseAndUpdateWordforms(wordforms.Take(1).ToList(), ParserPriority.Low, false, 1);
+				DrainIdle(idleQueue);
+				loadSw.Stop();
+				Log($"Grammar load + warm-up (1 word): {loadSw.ElapsedMilliseconds} ms");
+
+				// Serial-only mode: just the full-project baseline (the true "pre" number).
+				if (Environment.GetEnvironmentVariable("FW_BENCH_SERIAL_ONLY") == "1")
+				{
+					long s = MeasureParse(worker, wordforms, 1, "maxDop= 1 (serial)");
+					MeasureFile(idleQueue, " ");
+					Log($"==> Full project serial ({wordforms.Count} wordforms): {s / 1000.0:0.0}s");
+					return;
+				}
+
+				// Parallel-only mode: skip the (very slow) full-project serial baseline; just time
+				// the default cap and full-core runs to report real post-change wall-clock.
+				if (Environment.GetEnvironmentVariable("FW_BENCH_PARALLEL_ONLY") == "1")
+				{
+					int cap = Math.Max(1, Math.Min(cores - 1, 4));
+					long capParse = MeasureParse(worker, wordforms, cap, $"maxDop={cap,2} (default cap)");
+					MeasureFile(idleQueue, " ");
+					long fullParse = MeasureParse(worker, wordforms, cores, $"maxDop={cores,2} (all cores)");
+					MeasureFile(idleQueue, " ");
+					Log($"==> Full project ({wordforms.Count} wordforms): {capParse / 1000.0:0.0}s at cap {cap}, {fullParse / 1000.0:0.0}s at {cores} cores");
+					return;
+				}
+
+				long serialParse = MeasureParse(worker, wordforms, 1, "maxDop= 1");
+				long serialFile = MeasureFile(idleQueue, " ");
+
+				// Sweep the degree of parallelism to find where scaling plateaus.
+				var dops = new[] { 2, 4, 8, 12, 16, cores }.Where(d => d <= cores).Distinct().OrderBy(d => d).ToArray();
+				Log("");
+				Log($"{"DOP",4} | {"parse ms",9} | {"speedup",7} | cores-equiv");
+				Log($"{1,4} | {serialParse,9} | {1.0,6:0.00}x | {1.0,4:0.0}");
+				foreach (int dop in dops)
+				{
+					long p = MeasureParseQuiet(worker, wordforms, dop);
+					MeasureFile(idleQueue, null);
+					// Math.Max guards the division when a run reports 0 ms.
+					double sp = serialParse / (double) Math.Max(1, p);
+					Log($"{dop,4} | {p,9} | {sp,6:0.00}x | {sp,4:0.0}");
+				}
+
+				long bulk = serialParse + serialFile;
+				Log("");
+				Log($"==> Serial split over {wordforms.Count} wordforms: parse {serialParse} ms ({100.0 * serialParse / Math.Max(1, bulk):0}%) " +
+					$"vs file {serialFile} ms ({100.0 * serialFile / Math.Max(1, bulk):0}%)");
+				Log($"==> Per-wordform serial parse {serialParse / (double) wordforms.Count:0.0} ms, file {serialFile / (double) wordforms.Count:0.0} ms");
+			}
+		}
+
+		/// <summary>
+		/// Times one ParseAndUpdateWordforms call and logs it together with the HCParser
+		/// diagnostic tick counters, which are reset to zero before the call.
+		/// </summary>
+		/// <returns>Elapsed wall-clock milliseconds of the call.</returns>
+		private long MeasureParse(ParserWorker worker, List<IWfiWordform> wfs, int dop, string label)
+		{
+			HCParser.DiagMorpherParseTicks = 0;
+			HCParser.DiagGetMorphsTicks = 0;
+			long elapsedMs = MeasureParseQuiet(worker, wfs, dop);
+			long morpherMs = HCParser.DiagMorpherParseTicks / TimeSpan.TicksPerMillisecond;
+			long getMorphsMs = HCParser.DiagGetMorphsTicks / TimeSpan.TicksPerMillisecond;
+			Log($"{label}: parse {elapsedMs,7} ms (summed across threads: morpher {morpherMs} ms, GetMorphs/readlock {getMorphsMs} ms)");
+			return elapsedMs;
+		}
+
+		/// <summary>Times one ParseAndUpdateWordforms call without logging anything.</summary>
+		/// <returns>Elapsed wall-clock milliseconds of the call.</returns>
+		private long MeasureParseQuiet(ParserWorker worker, List<IWfiWordform> wfs, int dop)
+		{
+			var sw = Stopwatch.StartNew();
+			worker.ParseAndUpdateWordforms(wfs, ParserPriority.Low, false, dop);
+			sw.Stop();
+			return sw.ElapsedMilliseconds;
+		}
+
+		/// <summary>
+		/// Times <see cref="DrainIdle"/> and logs the result unless <paramref name="label"/> is null.
+		/// </summary>
+		/// <returns>Elapsed wall-clock milliseconds of the drain.</returns>
+		private long MeasureFile(IdleQueue q, string label)
+		{
+			var sw = Stopwatch.StartNew();
+			DrainIdle(q);
+			sw.Stop();
+			if (label != null)
+				Log($"{label} : file {sw.ElapsedMilliseconds,7} ms");
+			return sw.ElapsedMilliseconds;
+		}
+
+		/// <summary>
+		/// Runs every task currently in the queue once, then clears the queue.
+		/// </summary>
+		private void DrainIdle(IdleQueue q)
+		{
+			// This is a single pass over a snapshot, not a loop until empty. UpdateWordforms
+			// re-queues itself if it can't complete; in this single-threaded benchmark it is
+			// expected to complete on the first pass, and anything queued during the pass is
+			// discarded by Clear().
+			foreach (IdleQueueTask task in q.ToArray())
+				task.Delegate(task.Parameter);
+			q.Clear();
+		}
+
+		/// <summary>Writes <paramref name="msg"/> to both NUnit progress output and the console.</summary>
+		private static void Log(string msg)
+		{
+			TestContext.Progress.WriteLine(msg);
+			Console.WriteLine(msg);
+		}
+	}
+}