diff --git a/Src/LexText/Interlinear/BIRDInterlinearImporter.cs b/Src/LexText/Interlinear/BIRDInterlinearImporter.cs index e71052d0c6..a7322cea47 100644 --- a/Src/LexText/Interlinear/BIRDInterlinearImporter.cs +++ b/Src/LexText/Interlinear/BIRDInterlinearImporter.cs @@ -769,48 +769,164 @@ private static IAnalysis CreateWordformWithWfiAnalysis(LcmCache cache, Word word if (itemDict.ContainsKey("cf")) // Lex. Entries { + // NB: "cf" records the lexeme, not the headword/citation form (in spite of the name). int ws_cf = GetWsEngine(wsFact, itemDict["cf"].Item1).Handle; ILexEntry entry = null; var entries = lex_entry_repo.AllInstances().Where( - m => StringServices.CitationFormWithAffixTypeStaticForWs(m, ws_cf, string.Empty) == itemDict["cf"].Item2); - if (entries.Count() == 1) + m => DecorateFormWithAffixMarkers(m.LexemeFormOA?.MorphTypeRA, m.LexemeFormOA?.Form?.get_String(ws_cf)?.Text) == itemDict["cf"].Item2); + + // Filter entries by homograph number. + // If the lexeme and the headword are different, + // then there may be more than one entry with the given homograph number. + // This is because homograph numbers distinguish headwords rather than lexemes. + // If there is no "hn" entry, then the hn is 0. + string hn = "0"; + if (itemDict.ContainsKey("hn")) // Homograph Number { - entry = entries.First(); + hn = itemDict["hn"].Item2; } - else if (itemDict.ContainsKey("hn")) // Homograph Number + var hnEntries = entries.Where(m => m.HomographNumber.ToString() == hn); + if (hnEntries.Count() > 0) { - entry = entries.FirstOrDefault(m => m.HomographNumber.ToString() == itemDict["hn"].Item2); + entries = hnEntries; } - if (entry != null) - { - bundle.MorphRA = entry.LexemeFormOA; - if (itemDict.ContainsKey("gls")) // Lex. Gloss + if (itemDict.ContainsKey("gls")) // Lex. Gloss + { + // Filter senses by gloss. + int ws_gls = GetWsEngine(wsFact, itemDict["gls"].Item1).Handle; + IList senses = new List(); + foreach (var e in entries) + { + senses.AddRange(e.SensesOS.Where(s => s.Gloss.get_String(ws_gls).Text == itemDict["gls"].Item2)); + } + if (senses.Count() > 1 && itemDict.ContainsKey("msa")) { - int ws_gls = GetWsEngine(wsFact, itemDict["gls"].Item1).Handle; - ILexSense sense = entry.SensesOS.FirstOrDefault(s => s.Gloss.get_String(ws_gls).Text == itemDict["gls"].Item2); - if (sense != null) + // Filter senses by MSA. + IList msaSenses = senses.Where(s => s.MorphoSyntaxAnalysisRA?.InterlinearAbbr == itemDict["msa"].Item2).ToList(); + if (msaSenses.Count() > 0) { - bundle.SenseRA = sense; + senses = msaSenses; } } + // Record sense. + if (senses.Count() > 0) + { + bundle.SenseRA = senses.FirstOrDefault(); + entry = bundle.SenseRA.Entry; + } + } + + if (entry == null && entries.Count() > 0) + { + entry = entries.First(); + } + + // Record morpheme. + if (entry != null) + { + if (itemDict.ContainsKey("txt")) + { + // Try allomorph first. + var ws_txt = GetWsEngine(wsFact, itemDict["txt"].Item1).Handle; + bundle.MorphRA = entry.AllAllomorphs.Where( + m => DecorateFormWithAffixMarkers(m.MorphTypeRA, m.Form.get_String(ws_txt).Text) == itemDict["txt"].Item2).FirstOrDefault(); + } + if (bundle.MorphRA == null) + { + bundle.MorphRA = entry.LexemeFormOA; + } } } if (itemDict.ContainsKey("msa")) // Lex. Gram. Info { - IMoMorphSynAnalysis match = msa_repo.AllInstances().FirstOrDefault(m => m.InterlinearAbbr == itemDict["msa"].Item2); - if (match != null) + if (bundle.SenseRA != null && bundle.SenseRA.MorphoSyntaxAnalysisRA?.InterlinearAbbr == itemDict["msa"].Item2) + { + bundle.MsaRA = bundle.SenseRA.MorphoSyntaxAnalysisRA; + } + else + { + IMoMorphSynAnalysis match = msa_repo.AllInstances().FirstOrDefault(m => m.InterlinearAbbr == itemDict["msa"].Item2); + if (match != null) + { + bundle.MsaRA = match; + } + } + } + } + } + + // Try to fill in category. + if (word.Items != null && wordForm.Analysis != null) + { + // Look for an existing category that matches a "pos". + bool hasPOS = false; + foreach (var item in word.Items) + { + if (wordForm.Analysis.CategoryRA != null) + { + // Category filled in. + break; + } + if (item.type == "pos") + { + hasPOS = true; + ILgWritingSystem writingSystem = GetWsEngine(cache.WritingSystemFactory, item.lang); + if (writingSystem != null) + { + foreach (var cat in cache.LanguageProject.AllPartsOfSpeech) + { + if (MatchesCatNameOrAbbreviation(writingSystem.Handle, item.Value, cat)) + { + wordForm.Analysis.CategoryRA = cat; + break; + } + } + } + } + } + if (hasPOS && wordForm.Analysis.CategoryRA == null) + { + // Create a new category. + IPartOfSpeech cat = cache.ServiceLocator.GetInstance().Create(); + cache.LanguageProject.PartsOfSpeechOA.PossibilitiesOS.Add(cat); + foreach (var item in word.Items) + { + if (item.type == "pos") { - bundle.MsaRA = match; + ILgWritingSystem writingSystem = GetWsEngine(cache.WritingSystemFactory, item.lang); + if (writingSystem != null) + { + cat.Name.set_String(writingSystem.Handle, item.Value); + cat.Abbreviation.set_String(writingSystem.Handle, item.Value); + } } } + wordForm.Analysis.CategoryRA = cat; } } return wordForm; } + // Based on StringServices.DecorateFormWithAffixMarkers. + private static string DecorateFormWithAffixMarkers(IMoMorphType mmt, string form) + { + if (mmt == null || form == null) + return form; + // Add pre- post markers, if any. + if (!String.IsNullOrEmpty(mmt.Prefix)) + { + form = mmt.Prefix + form; + } + if (!String.IsNullOrEmpty(mmt.Postfix)) + { + form = form + mmt.Postfix; + } + return form; + } + private static bool FindOrCreateWfiAnalysis(LcmCache cache, Word word, int mainWritingSystem, out IAnalysis analysis) @@ -820,6 +936,7 @@ private static bool FindOrCreateWfiAnalysis(LcmCache cache, Word word, // First, collect all expected forms and glosses from the Word var expectedForms = new Dictionary(); // wsHandle -> expected value var expectedGlosses = new Dictionary(); // wsHandle -> expected gloss + var expectedCats = new Dictionary(); // wsHandle -> expected cat IAnalysis candidateForm = null; ITsString wordForm = null; ITsString punctForm = null; @@ -871,6 +988,10 @@ private static bool FindOrCreateWfiAnalysis(LcmCache cache, Word word, expectedGlosses[ws.Handle] = wordItem.Value; break; + + case "pos": + expectedCats[ws.Handle] = wordItem.Value; + break; } } @@ -896,23 +1017,57 @@ private static bool FindOrCreateWfiAnalysis(LcmCache cache, Word word, return true; } + analysis = FindMatchingAnalysis(cache, candidateWordform, word, expectedGlosses, expectedCats); + if (analysis != null) + { + return true; + } + + if (wordForm.Text.ToLower() != wordForm.Text) + { + // Try lowercase. + var lcCandidateForm = cache.ServiceLocator + .GetInstance() + .GetMatchingWordform(wordForm.get_WritingSystemAt(0), wordForm.Text.ToLower()); + if (lcCandidateForm is IWfiWordform lcCandidateWordform) + { + analysis = FindMatchingAnalysis(cache, lcCandidateWordform, word, expectedGlosses, expectedCats); + if (analysis != null) + { + return true; + } + } + } + + // No matching analysis found with all expected gloss and morpheme data + analysis = AddEmptyAnalysisToWordform(cache, candidateWordform); + return false; + } + + private static IAnalysis FindMatchingAnalysis(LcmCache cache, IWfiWordform candidateWordform, Word word, + Dictionary expectedGlosses, Dictionary expectedCats) + { + IAnalysis analysis = null; + var wsFact = cache.WritingSystemFactory; // Look for an analysis that has the correct morphemes and a matching gloss foreach (var wfiAnalysis in candidateWordform.AnalysesOC) { var morphemeMatch = true; // verify that the analysis has a Morph Bundle with the expected morphemes from the import - if (word.morphemes != null && wfiAnalysis.MorphBundlesOS.Count == word.morphemes?.morphs.Length) + if (word.morphemes != null && wfiAnalysis.MorphBundlesOS.Count == word.morphemes?.morphs.Length && + word.morphemes.analysisStatus == analysisStatusTypes.humanApproved) { analysis = GetMostSpecificAnalysisForWordForm(wfiAnalysis); - for(var i = 0; i < wfiAnalysis.MorphBundlesOS.Count; ++i) + for (var i = 0; i < wfiAnalysis.MorphBundlesOS.Count; ++i) { - var extantMorphForm = wfiAnalysis.MorphBundlesOS[i].Form; + var morphBundle = wfiAnalysis.MorphBundlesOS[i]; + var extantMorphForm = morphBundle.Form; var importMorphForm = word.morphemes.morphs[i].items.FirstOrDefault(item => item.type == "txt"); var importFormWs = GetWsEngine(wsFact, importMorphForm?.lang); // compare the import item to the extant morph form if (importMorphForm == null || extantMorphForm == null || TsStringUtils.IsNullOrEmpty(extantMorphForm.get_String(importFormWs.Handle)) || - !extantMorphForm.get_String(importFormWs.Handle).Text.Normalize() + !DecorateFormWithAffixMarkers(morphBundle.MorphRA?.MorphTypeRA, extantMorphForm.get_String(importFormWs.Handle).Text).Normalize() .Equals(importMorphForm.Value?.Normalize())) { morphemeMatch = false; @@ -923,18 +1078,14 @@ private static bool FindOrCreateWfiAnalysis(LcmCache cache, Word word, if (morphemeMatch) { - var matchingGloss = wfiAnalysis.MeaningsOC.FirstOrDefault(g => VerifyGlossesMatch(g, expectedGlosses)); + var matchingGloss = wfiAnalysis.MeaningsOC.FirstOrDefault(g => VerifyGlossesMatch(g, expectedGlosses, expectedCats)); if (matchingGloss != null) { - analysis = matchingGloss; - return true; + return matchingGloss; } } } - - // No matching analysis found with all expected gloss and morpheme data - analysis = AddEmptyAnalysisToWordform(cache, candidateWordform); - return false; + return null; } private static IAnalysis GetMostSpecificAnalysisForWordForm(IAnalysis candidateWordform) @@ -1031,7 +1182,8 @@ private static bool MatchPrimaryFormAndAddMissingAlternatives(IAnalysis wordForm // Helper method to verify that all expected glosses match the stored glosses private static bool VerifyGlossesMatch(IWfiGloss wfiGloss, - Dictionary expectedGlosses) + Dictionary expectedGlosses, + Dictionary expectedCats) { foreach (var expectedGloss in expectedGlosses) { @@ -1042,10 +1194,28 @@ private static bool VerifyGlossesMatch(IWfiGloss wfiGloss, if (storedGloss == null || storedGloss.Text != expectedValue) return false; // Mismatch found } + foreach (var expectedCat in expectedCats) + { + if (!MatchesCatNameOrAbbreviation(expectedCat.Key, expectedCat.Value, wfiGloss.Analysis?.CategoryRA)) + return false; + } return true; } + private static bool MatchesCatNameOrAbbreviation(int ws, string text, IPartOfSpeech cat) + { + if (cat == null) + return false; + ITsString name = cat.Name.get_String(ws); + if (name != null && name.Text == text) + return true; + ITsString abbr = cat.Abbreviation.get_String(ws); + if (abbr != null && abbr.Text == text) + return true; + return false; + } + /// /// /// The word Gloss. If multiple glosses, returns the last one created. diff --git a/Src/LexText/Interlinear/ITextDllTests/BIRDFormatImportTests.cs b/Src/LexText/Interlinear/ITextDllTests/BIRDFormatImportTests.cs index 4d1f0f27dd..c29f252cdc 100644 --- a/Src/LexText/Interlinear/ITextDllTests/BIRDFormatImportTests.cs +++ b/Src/LexText/Interlinear/ITextDllTests/BIRDFormatImportTests.cs @@ -860,6 +860,78 @@ public void TestGenres() } } + [Test] + public void TestExistingWordCategory() + { + string title = "atrocious"; + string abbr = "atroc"; + //an interliner text example xml string + string xml = "" + + "" + + "1 Musical" + + "origem: mary poppins" + + "supercalifragilisticexpialidocious" + + "absurdo" + + "N" + + ""; + + // Create a category to find. + IPartOfSpeech cat = null; + NonUndoableUnitOfWorkHelper.Do(Cache.ActionHandlerAccessor, + () => + { + cat = Cache.ServiceLocator.GetInstance().Create(); + Cache.LanguageProject.PartsOfSpeechOA.PossibilitiesOS.Add(cat); + cat.Name.set_String(Cache.DefaultAnalWs, "N"); + }); + LinguaLinksImport li = new LinguaLinksImport(Cache, null, null); + LCModel.IText text = null; + using (var stream = new MemoryStream(Encoding.ASCII.GetBytes(xml.ToCharArray()))) + { + li.ImportInterlinear(new DummyProgressDlg(), stream, 0, ref text); + using (var firstEntry = Cache.LanguageProject.Texts.GetEnumerator()) + { + firstEntry.MoveNext(); + var imported = firstEntry.Current; + ISegment segment = imported.ContentsOA[0].SegmentsOS[0]; + // Verify that we found the category. + Assert.That(segment.AnalysesRS[0].Analysis.CategoryRA, Is.EqualTo(cat)); + } + } + } + + [Test] + public void TestNewWordCategory() + { + string title = "atrocious"; + string abbr = "atroc"; + //an interliner text example xml string + string xml = "" + + "" + + "1 Musical" + + "origem: mary poppins" + + "supercalifragilisticexpialidocious" + + "absurdo" + + "X" + + ""; + + LinguaLinksImport li = new LinguaLinksImport(Cache, null, null); + LCModel.IText text = null; + using (var stream = new MemoryStream(Encoding.ASCII.GetBytes(xml.ToCharArray()))) + { + li.ImportInterlinear(new DummyProgressDlg(), stream, 0, ref text); + using (var firstEntry = Cache.LanguageProject.Texts.GetEnumerator()) + { + firstEntry.MoveNext(); + var imported = firstEntry.Current; + ISegment segment = imported.ContentsOA[0].SegmentsOS[0]; + // Verify that we created a category. + Assert.True(segment.AnalysesRS[0].Analysis.CategoryRA.Name.BestAnalysisAlternative.Text.Equals("X")); + Assert.True(segment.AnalysesRS[0].Analysis.CategoryRA.Abbreviation.BestAnalysisAlternative.Text.Equals("X")); + } + } + } + [Test] public void TestSpacesAroundPunct() {