From d70813ce013f2f1c47e34fce944d6b3b6a38d93c Mon Sep 17 00:00:00 2001 From: Damien Daspit Date: Tue, 30 Jan 2024 07:42:25 -0500 Subject: [PATCH] Add UsfmVerseTextUpdater class (#160) --- .../FileParatextProjectSettingsParser.cs | 42 +++ .../Corpora/ParatextBackupTermsCorpus.cs | 47 ++-- .../Corpora/ParatextBackupTextCorpus.cs | 105 +------ .../Corpora/ParatextProjectSettings.cs | 47 ++++ .../ParatextProjectSettingsParserBase.cs | 108 +++++++ src/SIL.Machine/Corpora/ParatextTextCorpus.cs | 86 +----- src/SIL.Machine/Corpora/UsfmParser.cs | 60 ++-- src/SIL.Machine/Corpora/UsfmParserState.cs | 6 + src/SIL.Machine/Corpora/UsfmStylesheet.cs | 13 +- src/SIL.Machine/Corpora/UsfmTextBase.cs | 13 +- src/SIL.Machine/Corpora/UsfmTokenizer.cs | 8 +- .../Corpora/UsfmVerseTextUpdater.cs | 263 ++++++++++++++++++ .../ZipParatextProjectSettingsParser.cs | 70 +++++ .../Corpora/ParatextBackupTermsCorpusTests.cs | 2 - .../Corpora/ParatextBackupTextCorpusTests.cs | 4 +- .../Corpora/TestData/usfm/Tes/41MATTes.SFM | 14 +- .../Corpora/UsfmTokenizerTests.cs | 2 +- .../Corpora/UsfmVerseTextUpdaterTests.cs | 208 ++++++++++++++ 18 files changed, 852 insertions(+), 246 deletions(-) create mode 100644 src/SIL.Machine/Corpora/FileParatextProjectSettingsParser.cs create mode 100644 src/SIL.Machine/Corpora/ParatextProjectSettings.cs create mode 100644 src/SIL.Machine/Corpora/ParatextProjectSettingsParserBase.cs create mode 100644 src/SIL.Machine/Corpora/UsfmVerseTextUpdater.cs create mode 100644 src/SIL.Machine/Corpora/ZipParatextProjectSettingsParser.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/UsfmVerseTextUpdaterTests.cs diff --git a/src/SIL.Machine/Corpora/FileParatextProjectSettingsParser.cs b/src/SIL.Machine/Corpora/FileParatextProjectSettingsParser.cs new file mode 100644 index 00000000..09f7b28a --- /dev/null +++ b/src/SIL.Machine/Corpora/FileParatextProjectSettingsParser.cs @@ -0,0 +1,42 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; + +namespace SIL.Machine.Corpora +{ + public class FileParatextProjectSettingsParser : ParatextProjectSettingsParserBase + { + private readonly string _projectDir; + + public FileParatextProjectSettingsParser(string projectDir) + { + _projectDir = projectDir; + } + + protected override UsfmStylesheet CreateStylesheet(string fileName) + { + string customStylesheetFileName = Path.Combine(_projectDir, fileName); + return new UsfmStylesheet( + fileName, + File.Exists(customStylesheetFileName) ? customStylesheetFileName : null + ); + } + + protected override bool Exists(string fileName) + { + return File.Exists(Path.Combine(_projectDir, fileName)); + } + + protected override string Find(string extension) + { + return Directory.EnumerateFiles(_projectDir, "*" + extension).FirstOrDefault(); + } + + protected override Stream Open(string fileName) + { + return File.OpenRead(Path.Combine(_projectDir, fileName)); + } + } +} diff --git a/src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs b/src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs index 39fd5341..8202e31f 100644 --- a/src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs +++ b/src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs @@ -2,16 +2,22 @@ using System.Collections.Generic; using System.IO; using System.IO.Compression; -using System.Text.RegularExpressions; -using System.Xml.Linq; using System.Linq; using System.Reflection; +using System.Text.RegularExpressions; +using System.Xml.Linq; namespace SIL.Machine.Corpora { public class ParatextBackupTermsCorpus : DictionaryTextCorpus { - private static List PREDEFINED_TERMS_LIST_TYPES = new List() { "Major", "All", "SilNt", "Pt6" }; + private static readonly List PredefinedTermsListTypes = new List() + { + "Major", + "All", + "SilNt", + "Pt6" + }; public ParatextBackupTermsCorpus(string fileName, IEnumerable termCategories) { @@ -22,39 +28,25 @@ public ParatextBackupTermsCorpus(string fileName, IEnumerable termCatego if (termsFileEntry is null) return; - ZipArchiveEntry settingsEntry = archive.GetEntry("Settings.xml"); - if (settingsEntry == null) - settingsEntry = archive.Entries.FirstOrDefault(e => e.FullName.EndsWith(".ssf")); - if (settingsEntry == null) - { - throw new ArgumentException( - "The project backup does not contain a settings file.", - nameof(fileName) - ); - } - XDocument settingsDoc; - using (Stream stream = settingsEntry.Open()) - { - settingsDoc = XDocument.Load(stream); - } - string textId = settingsDoc.Root.Element("BiblicalTermsListSetting").Value; + var settingsParser = new ZipParatextProjectSettingsParser(archive); + ParatextProjectSettings settings = settingsParser.Parse(); XDocument termRenderingsDoc; - using (var keyTermsFile = termsFileEntry.Open()) + using (Stream keyTermsFile = termsFileEntry.Open()) { termRenderingsDoc = XDocument.Load(keyTermsFile); } - ZipArchiveEntry biblicalTermsFileEntry = archive.GetEntry(textId.Split(':').Last()); + ZipArchiveEntry biblicalTermsFileEntry = archive.GetEntry(settings.BiblicalTermsFileName); XDocument biblicalTermsDoc; IDictionary termIdToCategoryDictionary; - if (PREDEFINED_TERMS_LIST_TYPES.Contains(textId.Split(':').First())) + if (PredefinedTermsListTypes.Contains(settings.BiblicalTermsListType)) { using ( - var keyTermsFile = Assembly + Stream keyTermsFile = Assembly .GetExecutingAssembly() - .GetManifestResourceStream("SIL.Machine.Corpora." + textId.Split(':').Last()) + .GetManifestResourceStream("SIL.Machine.Corpora." + settings.BiblicalTermsFileName) ) { biblicalTermsDoc = XDocument.Load(keyTermsFile); @@ -62,11 +54,10 @@ public ParatextBackupTermsCorpus(string fileName, IEnumerable termCatego } } else if ( - textId.Split(':').First() == "Project" - && textId.Split(':')[1] == settingsDoc.Root.Element("Name").Value + settings.BiblicalTermsListType == "Project" && settings.BiblicalTermsProjectName == settings.Name ) { - using (var keyTermsFile = biblicalTermsFileEntry.Open()) + using (Stream keyTermsFile = biblicalTermsFileEntry.Open()) { biblicalTermsDoc = XDocument.Load(keyTermsFile); termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc); @@ -81,6 +72,8 @@ public ParatextBackupTermsCorpus(string fileName, IEnumerable termCatego .Descendants() .Where(n => n.Name.LocalName == "TermRendering"); + string textId = + $"{settings.BiblicalTermsListType}:{settings.BiblicalTermsProjectName}:{settings.BiblicalTermsFileName}"; foreach (XElement element in termsElements) { string id = element.Attribute("Id").Value; diff --git a/src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs b/src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs index 86fbde0b..2279c542 100644 --- a/src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs @@ -1,12 +1,6 @@ -using System; -using System.IO; -using System.IO.Compression; +using System.IO.Compression; using System.Linq; -using System.Text; using System.Text.RegularExpressions; -using System.Xml.Linq; -using SIL.IO; -using SIL.Scripture; namespace SIL.Machine.Corpora { @@ -14,104 +8,23 @@ public class ParatextBackupTextCorpus : ScriptureTextCorpus { public ParatextBackupTextCorpus(string fileName, bool includeMarkers = false) { - Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); using (ZipArchive archive = ZipFile.OpenRead(fileName)) { - ZipArchiveEntry settingsEntry = archive.GetEntry("Settings.xml"); - if (settingsEntry == null) - settingsEntry = archive.Entries.FirstOrDefault(e => e.FullName.EndsWith(".ssf")); - if (settingsEntry == null) - { - throw new ArgumentException( - "The project backup does not contain a settings file.", - nameof(fileName) - ); - } - XDocument settingsDoc; - using (Stream stream = settingsEntry.Open()) - { - settingsDoc = XDocument.Load(stream); - } - var encodingStr = (string)settingsDoc.Root.Element("Encoding") ?? "65001"; - if (!int.TryParse(encodingStr, out int codePage)) - { - throw new NotImplementedException( - $"The project uses a legacy encoding that requires TECKit, map file: {encodingStr}." - ); - } - var encoding = Encoding.GetEncoding(codePage); + var parser = new ZipParatextProjectSettingsParser(archive); + ParatextProjectSettings settings = parser.Parse(); - var scrVersType = (int?)settingsDoc.Root.Element("Versification") ?? (int)ScrVersType.English; - Versification = new ScrVers((ScrVersType)scrVersType); - ZipArchiveEntry customVersEntry = archive.GetEntry("custom.vrs"); - if (customVersEntry != null) - { - var guid = (string)settingsDoc.Root.Element("Guid"); - string versName = ((ScrVersType)scrVersType).ToString() + "-" + guid; - if (Scripture.Versification.Table.Implementation.Exists(versName)) - { - Versification = new ScrVers(versName); - } - else - { - using (var reader = new StreamReader(customVersEntry.Open())) - { - Versification = Scripture.Versification.Table.Implementation.Load( - reader, - "custom.vrs", - Versification, - versName - ); - } - } - } - - var stylesheetName = (string)settingsDoc.Root.Element("StyleSheet") ?? "usfm.sty"; - ZipArchiveEntry stylesheetEntry = archive.GetEntry(stylesheetName); - if (stylesheetEntry == null && stylesheetName != "usfm_sb.sty") - stylesheetEntry = archive.GetEntry("usfm.sty"); - ZipArchiveEntry customStylesheetEntry = archive.GetEntry("custom.sty"); - - UsfmStylesheet stylesheet; - using (var stylesheetTempFile = TempFile.CreateAndGetPathButDontMakeTheFile()) - using (var customStylesheetTempFile = TempFile.CreateAndGetPathButDontMakeTheFile()) - { - string stylesheetPath = "usfm.sty"; - if (stylesheetEntry != null) - { - stylesheetEntry.ExtractToFile(stylesheetTempFile.Path); - stylesheetPath = stylesheetTempFile.Path; - } - string customStylesheetPath = null; - if (customStylesheetEntry != null) - { - customStylesheetEntry.ExtractToFile(customStylesheetTempFile.Path); - customStylesheetPath = customStylesheetTempFile.Path; - } - stylesheet = new UsfmStylesheet(stylesheetPath, customStylesheetPath); - } - - string prefix = ""; - string suffix = ".SFM"; - XElement namingElem = settingsDoc.Root.Element("Naming"); - if (namingElem != null) - { - var prePart = (string)namingElem.Attribute("PrePart"); - if (!string.IsNullOrEmpty(prePart)) - prefix = prePart; - var postPart = (string)namingElem.Attribute("PostPart"); - if (!string.IsNullOrEmpty(postPart)) - suffix = postPart; - } + Versification = settings.Versification; - var regex = new Regex($"^{Regex.Escape(prefix)}.*{Regex.Escape(suffix)}$"); + var regex = new Regex( + $"^{Regex.Escape(settings.FileNamePrefix)}.*{Regex.Escape(settings.FileNameSuffix)}$" + ); foreach (ZipArchiveEntry sfmEntry in archive.Entries.Where(e => regex.IsMatch(e.FullName))) { AddText( new UsfmZipText( - stylesheet, - encoding, + settings.Stylesheet, + settings.Encoding, fileName, sfmEntry.FullName, Versification, diff --git a/src/SIL.Machine/Corpora/ParatextProjectSettings.cs b/src/SIL.Machine/Corpora/ParatextProjectSettings.cs new file mode 100644 index 00000000..866ab8e7 --- /dev/null +++ b/src/SIL.Machine/Corpora/ParatextProjectSettings.cs @@ -0,0 +1,47 @@ +using System.Text; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public class ParatextProjectSettings + { + public ParatextProjectSettings( + string name, + string fullName, + Encoding encoding, + ScrVers versification, + UsfmStylesheet stylesheet, + string fileNamePrefix, + string fileNameForm, + string fileNameSuffix, + string biblicalTermsListType, + string biblicalTermsProjectName, + string biblicalTermsFileName + ) + { + Name = name; + FullName = fullName; + Encoding = encoding; + Versification = versification; + Stylesheet = stylesheet; + FileNamePrefix = fileNamePrefix; + FileNameForm = fileNameForm; + FileNameSuffix = fileNameSuffix; + BiblicalTermsListType = biblicalTermsListType; + BiblicalTermsProjectName = biblicalTermsProjectName; + BiblicalTermsFileName = biblicalTermsFileName; + } + + public string Name { get; } + public string FullName { get; } + public Encoding Encoding { get; } + public ScrVers Versification { get; } + public UsfmStylesheet Stylesheet { get; } + public string FileNamePrefix { get; } + public string FileNameForm { get; } + public string FileNameSuffix { get; } + public string BiblicalTermsListType { get; } + public string BiblicalTermsProjectName { get; } + public string BiblicalTermsFileName { get; } + } +} diff --git a/src/SIL.Machine/Corpora/ParatextProjectSettingsParserBase.cs b/src/SIL.Machine/Corpora/ParatextProjectSettingsParserBase.cs new file mode 100644 index 00000000..d683f4b5 --- /dev/null +++ b/src/SIL.Machine/Corpora/ParatextProjectSettingsParserBase.cs @@ -0,0 +1,108 @@ +using System; +using System.IO; +using System.Text; +using System.Xml.Linq; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public abstract class ParatextProjectSettingsParserBase + { + public ParatextProjectSettings Parse() + { + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + string settingsFileName = "Settings.xml"; + if (!Exists(settingsFileName)) + settingsFileName = Find(".ssf"); + if (string.IsNullOrEmpty(settingsFileName)) + throw new InvalidOperationException("The project does not contain a settings file."); + XDocument settingsDoc; + using (Stream stream = Open(settingsFileName)) + { + settingsDoc = XDocument.Load(stream); + } + + string name = settingsDoc.Root.Element("Name").Value; + string fullName = settingsDoc.Root.Element("FullName").Value; + + var encodingStr = (string)settingsDoc.Root.Element("Encoding") ?? "65001"; + if (!int.TryParse(encodingStr, out int codePage)) + { + throw new NotImplementedException( + $"The project uses a legacy encoding that requires TECKit, map file: {encodingStr}." + ); + } + var encoding = Encoding.GetEncoding(codePage); + + var scrVersType = (int?)settingsDoc.Root.Element("Versification") ?? (int)ScrVersType.English; + var versification = new ScrVers((ScrVersType)scrVersType); + if (Exists("custom.vrs")) + { + var guid = (string)settingsDoc.Root.Element("Guid"); + string versName = ((ScrVersType)scrVersType).ToString() + "-" + guid; + if (Versification.Table.Implementation.Exists(versName)) + { + versification = new ScrVers(versName); + } + else + { + using (var reader = new StreamReader(Open("custom.vrs"))) + { + versification = Versification.Table.Implementation.Load( + reader, + "custom.vrs", + versification, + versName + ); + } + } + } + + var stylesheetFileName = (string)settingsDoc.Root.Element("StyleSheet") ?? "usfm.sty"; + if (!Exists(stylesheetFileName) && stylesheetFileName != "usfm_sb.sty") + stylesheetFileName = "usfm.sty"; + UsfmStylesheet stylesheet = CreateStylesheet(stylesheetFileName); + + string prefix = ""; + string form = "41MAT"; + string suffix = ".SFM"; + XElement namingElem = settingsDoc.Root.Element("Naming"); + if (namingElem != null) + { + var prePart = (string)namingElem.Attribute("PrePart"); + if (!string.IsNullOrEmpty(prePart)) + prefix = prePart; + + var bookNameForm = (string)namingElem.Attribute("BookNameForm"); + if (!string.IsNullOrEmpty(bookNameForm)) + form = bookNameForm; + + var postPart = (string)namingElem.Attribute("PostPart"); + if (!string.IsNullOrEmpty(postPart)) + suffix = postPart; + } + + string biblicalTerms = settingsDoc.Root.Element("BiblicalTermsListSetting").Value; + string[] parts = biblicalTerms.Split(new[] { ':' }, 3); + + return new ParatextProjectSettings( + name, + fullName, + encoding, + versification, + stylesheet, + prefix, + form, + suffix, + parts[0], + parts[1], + parts[2] + ); + } + + protected abstract bool Exists(string fileName); + protected abstract string Find(string extension); + protected abstract Stream Open(string fileName); + protected abstract UsfmStylesheet CreateStylesheet(string fileName); + } +} diff --git a/src/SIL.Machine/Corpora/ParatextTextCorpus.cs b/src/SIL.Machine/Corpora/ParatextTextCorpus.cs index b471b79d..aec48df7 100644 --- a/src/SIL.Machine/Corpora/ParatextTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParatextTextCorpus.cs @@ -1,9 +1,4 @@ -using System; -using System.IO; -using System.Linq; -using System.Text; -using System.Xml.Linq; -using SIL.Scripture; +using System.IO; namespace SIL.Machine.Corpora { @@ -11,77 +6,22 @@ public class ParatextTextCorpus : ScriptureTextCorpus { public ParatextTextCorpus(string projectDir, bool includeMarkers = false) { - Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); - string settingsFileName = Path.Combine(projectDir, "Settings.xml"); - if (!File.Exists(settingsFileName)) - settingsFileName = Directory.EnumerateFiles(projectDir, "*.ssf").FirstOrDefault(); - if (string.IsNullOrEmpty(settingsFileName)) - { - throw new ArgumentException( - "The project directory does not contain a settings file.", - nameof(projectDir) - ); - } - var settingsDoc = XDocument.Load(settingsFileName); - var encodingStr = (string)settingsDoc.Root.Element("Encoding") ?? "65001"; - if (!int.TryParse(encodingStr, out int codePage)) - { - throw new NotImplementedException( - $"The project uses a legacy encoding that requires TECKit, map file: {encodingStr}." - ); - } - var encoding = Encoding.GetEncoding(codePage); + var parser = new FileParatextProjectSettingsParser(projectDir); + ParatextProjectSettings settings = parser.Parse(); - var scrVersType = (int?)settingsDoc.Root.Element("Versification") ?? (int)ScrVersType.English; - Versification = new ScrVers((ScrVersType)scrVersType); - string customVersPath = Path.Combine(projectDir, "custom.vrs"); - if (File.Exists(customVersPath)) - { - var guid = (string)settingsDoc.Root.Element("Guid"); - string versName = ((ScrVersType)scrVersType).ToString() + "-" + guid; - if (Scripture.Versification.Table.Implementation.Exists(versName)) - { - Versification = new ScrVers(versName); - } - else - { - using (var reader = new StreamReader(customVersPath)) - { - Versification = Scripture.Versification.Table.Implementation.Load( - reader, - customVersPath, - Versification, - versName - ); - } - } - } - - var stylesheetName = (string)settingsDoc.Root.Element("StyleSheet") ?? "usfm.sty"; - string stylesheetFileName = Path.Combine(projectDir, stylesheetName); - if (!File.Exists(stylesheetFileName) && stylesheetName != "usfm_sb.sty") - stylesheetFileName = Path.Combine(projectDir, "usfm.sty"); - string customStylesheetPath = Path.Combine(projectDir, "custom.sty"); - var stylesheet = new UsfmStylesheet( - stylesheetFileName, - File.Exists(customStylesheetPath) ? customStylesheetPath : null - ); + Versification = settings.Versification; - string prefix = ""; - string suffix = ".SFM"; - XElement namingElem = settingsDoc.Root.Element("Naming"); - if (namingElem != null) + foreach ( + string sfmFileName in Directory.EnumerateFiles( + projectDir, + $"{settings.FileNamePrefix}*{settings.FileNameSuffix}" + ) + ) { - var prePart = (string)namingElem.Attribute("PrePart"); - if (!string.IsNullOrEmpty(prePart)) - prefix = prePart; - var postPart = (string)namingElem.Attribute("PostPart"); - if (!string.IsNullOrEmpty(postPart)) - suffix = postPart; + AddText( + new UsfmFileText(settings.Stylesheet, settings.Encoding, sfmFileName, Versification, includeMarkers) + ); } - - foreach (string sfmFileName in Directory.EnumerateFiles(projectDir, $"{prefix}*{suffix}")) - AddText(new UsfmFileText(stylesheet, encoding, sfmFileName, Versification, includeMarkers)); } } } diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index 32a463fe..47ac30ae 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -29,7 +29,7 @@ public static void Parse( public static void Parse( string usfm, IUsfmParserHandler handler, - UsfmStylesheet stylesheet = null, + UsfmStylesheet stylesheet, ScrVers versification = null, bool preserveWhitespace = false ) @@ -46,12 +46,6 @@ public static void Parse( private static readonly Regex OptBreakSplitter = new Regex("(//)", RegexOptions.Compiled); - /// - /// Number of tokens to skip over because have been processed in advance - /// (i.e. for figures which are three tokens, or links, or chapter/verse alternates) - /// - private int _skip = 0; - public UsfmParser( IReadOnlyList tokens, IUsfmParserHandler handler = null, @@ -63,8 +57,8 @@ public UsfmParser( public UsfmParser( IReadOnlyList tokens, - IUsfmParserHandler handler = null, - UsfmStylesheet stylesheet = null, + IUsfmParserHandler handler, + UsfmStylesheet stylesheet, ScrVers versification = null, bool tokensPreserveWhitespace = false ) @@ -89,8 +83,8 @@ public UsfmParser( public UsfmParser( string usfm, - IUsfmParserHandler handler = null, - UsfmStylesheet stylesheet = null, + IUsfmParserHandler handler, + UsfmStylesheet stylesheet, ScrVers versification = null, bool tokensPreserveWhitespace = false ) @@ -164,9 +158,9 @@ public bool ProcessToken() // Skip over tokens that are to be skipped, ensuring that // SpecialToken state is true. - if (_skip > 0) + if (State.SpecialTokenCount > 0) { - _skip--; + State.SpecialTokenCount--; State.SpecialToken = true; return true; } @@ -327,26 +321,26 @@ public bool ProcessToken() ) { altChapter = State.Tokens[State.Index + 2].Text.Trim(); - _skip += 3; + State.SpecialTokenCount += 3; // Skip blank space after if present if ( - State.Index + _skip < State.Tokens.Count - 1 - && State.Tokens[State.Index + _skip + 1].Text != null - && State.Tokens[State.Index + _skip + 1].Text.Trim().Length == 0 + State.Index + State.SpecialTokenCount < State.Tokens.Count - 1 + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text != null + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text.Trim().Length == 0 ) - _skip++; + State.SpecialTokenCount++; } // Get publishable chapter number if ( - State.Index + _skip < State.Tokens.Count - 2 - && State.Tokens[State.Index + _skip + 1].Marker == "cp" - && State.Tokens[State.Index + _skip + 2].Text != null + State.Index + State.SpecialTokenCount < State.Tokens.Count - 2 + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "cp" + && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null ) { - pubChapter = State.Tokens[State.Index + _skip + 2].Text.Trim(); - _skip += 2; + pubChapter = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); + State.SpecialTokenCount += 2; } // Chapter @@ -373,18 +367,18 @@ public bool ProcessToken() { // Get alternate verse number altVerse = State.Tokens[State.Index + 2].Text.Trim(); - _skip += 3; + State.SpecialTokenCount += 3; } if ( - State.Index + _skip < State.Tokens.Count - 3 - && State.Tokens[State.Index + _skip + 1].Marker == "vp" - && State.Tokens[State.Index + _skip + 2].Text != null - && State.Tokens[State.Index + _skip + 3].Marker == "vp*" + State.Index + State.SpecialTokenCount < State.Tokens.Count - 3 + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "vp" + && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null + && State.Tokens[State.Index + State.SpecialTokenCount + 3].Marker == "vp*" ) { // Get publishable verse number - pubVerse = State.Tokens[State.Index + _skip + 2].Text.Trim(); - _skip += 3; + pubVerse = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); + State.SpecialTokenCount += 3; } // Verse @@ -432,7 +426,7 @@ public bool ProcessToken() { // Get category sidebarCategory = State.Tokens[State.Index + 2].Text.Trim(); - _skip += 3; + State.SpecialTokenCount += 3; } if (Handler != null) @@ -486,7 +480,7 @@ public bool ProcessToken() ParseDisplayAndTarget(out string display, out string target); - _skip += 2; + State.SpecialTokenCount += 2; if (Handler != null) Handler.Ref(State, token.Marker, display, target); @@ -528,7 +522,7 @@ public bool ProcessToken() { // Get category noteCategory = State.Tokens[State.Index + 2].Text.Trim(); - _skip += 3; + State.SpecialTokenCount += 3; } State.Push(new UsfmParserElement(UsfmElementType.Note, token.Marker)); diff --git a/src/SIL.Machine/Corpora/UsfmParserState.cs b/src/SIL.Machine/Corpora/UsfmParserState.cs index 9603d07b..aa286c0d 100644 --- a/src/SIL.Machine/Corpora/UsfmParserState.cs +++ b/src/SIL.Machine/Corpora/UsfmParserState.cs @@ -65,6 +65,12 @@ public UsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOn /// public bool SpecialToken { get; internal set; } + /// + /// Number of tokens to skip over because have been processed in advance + /// (i.e. for figures which are three tokens, or links, or chapter/verse alternates) + /// + public int SpecialTokenCount { get; internal set; } + /// /// True if the token processed is a figure. /// diff --git a/src/SIL.Machine/Corpora/UsfmStylesheet.cs b/src/SIL.Machine/Corpora/UsfmStylesheet.cs index 0419a0de..3fdb71ff 100644 --- a/src/SIL.Machine/Corpora/UsfmStylesheet.cs +++ b/src/SIL.Machine/Corpora/UsfmStylesheet.cs @@ -29,7 +29,8 @@ public class UsfmStylesheet { { "character", UsfmStyleType.Character }, { "paragraph", UsfmStyleType.Paragraph }, - { "note", UsfmStyleType.Note } + { "note", UsfmStyleType.Note }, + { "milestone", UsfmStyleType.Milestone } }; private static readonly Dictionary TextTypeMappings = new Dictionary< @@ -375,6 +376,16 @@ private static UsfmTag ParseTagEntry(UsfmTag tag, List styleshe endTag = MakeEndTag(endMarkerStr); tag.EndMarker = endMarkerStr; } + else if (tag.StyleType == UsfmStyleType.Milestone) + { + if (endTag != null) + { + endTag.StyleType = UsfmStyleType.MilestoneEnd; + // eid is always an optional attribute for the end marker + tag.Attributes.Add(new UsfmStyleAttribute("eid", isRequired: false)); + endTag.Name = tag.Name; + } + } // Special cases if ( diff --git a/src/SIL.Machine/Corpora/UsfmTextBase.cs b/src/SIL.Machine/Corpora/UsfmTextBase.cs index 3180a3eb..956fbd2a 100644 --- a/src/SIL.Machine/Corpora/UsfmTextBase.cs +++ b/src/SIL.Machine/Corpora/UsfmTextBase.cs @@ -1,5 +1,6 @@ using System.Collections.Generic; using System.IO; +using System.Linq; using System.Text; using SIL.Machine.Utils; using SIL.Scripture; @@ -183,6 +184,12 @@ public override void EndNote(UsfmParserState state, string marker, bool closed) OutputMarker(state); } + public override void OptBreak(UsfmParserState state) + { + if (!_text._includeMarkers) + _verseText.TrimEnd(); + } + public override void Text(UsfmParserState state, string text) { if (_verseRef.IsDefault || !state.IsVersePara) @@ -191,7 +198,7 @@ public override void Text(UsfmParserState state, string text) if (_text._includeMarkers) { text = text.TrimEnd('\r', '\n'); - if (text.Length > 0) + if (text.Length > 0 && !state.Stack.Any(e => e.Type == UsfmElementType.Sidebar)) { if (!text.IsWhiteSpace()) { @@ -200,6 +207,10 @@ public override void Text(UsfmParserState state, string text) _nextParaTokens.Clear(); _nextParaTextStarted = true; } + if (_verseText.Length == 0 || char.IsWhiteSpace(_verseText[_verseText.Length - 1])) + { + text = text.TrimStart(); + } _verseText.Append(text); } } diff --git a/src/SIL.Machine/Corpora/UsfmTokenizer.cs b/src/SIL.Machine/Corpora/UsfmTokenizer.cs index 564c4b4e..2f6164fb 100644 --- a/src/SIL.Machine/Corpora/UsfmTokenizer.cs +++ b/src/SIL.Machine/Corpora/UsfmTokenizer.cs @@ -313,8 +313,8 @@ public string Detokenize(IEnumerable tokens, bool tokensHaveWhitespac if (usfm.Length > 0) { if ( - usfm[usfm.Length - 1] == ' ' && (prevToken != null && prevToken.ToUsfm().Trim() != "") - || !tokensHaveWhitespace + usfm[usfm.Length - 1] == ' ' + && ((prevToken != null && prevToken.ToUsfm().Trim() != "") || !tokensHaveWhitespace) ) { usfm.Length--; @@ -329,8 +329,8 @@ public string Detokenize(IEnumerable tokens, bool tokensHaveWhitespac if (usfm.Length > 0 && usfm[usfm.Length - 1] != '[' && usfm[usfm.Length - 1] != '(') { if ( - usfm[usfm.Length - 1] == ' ' && (prevToken != null && prevToken.ToUsfm().Trim() != "") - || !tokensHaveWhitespace + usfm[usfm.Length - 1] == ' ' + && ((prevToken != null && prevToken.ToUsfm().Trim() != "") || !tokensHaveWhitespace) ) { usfm.Length--; diff --git a/src/SIL.Machine/Corpora/UsfmVerseTextUpdater.cs b/src/SIL.Machine/Corpora/UsfmVerseTextUpdater.cs new file mode 100644 index 00000000..b50b2cb9 --- /dev/null +++ b/src/SIL.Machine/Corpora/UsfmVerseTextUpdater.cs @@ -0,0 +1,263 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + /*** + * This is a USFM parser handler that can be used to replace the existing text in a USFM file with the specified + * text. + */ + public class UsfmVerseTextUpdater : UsfmParserHandlerBase + { + private readonly IReadOnlyList<(IReadOnlyList, string)> _rows; + private readonly List _tokens; + private readonly string _idText; + private readonly bool _stripAllText; + private int _rowIndex; + private int _tokenIndex; + private bool _replaceText; + + public UsfmVerseTextUpdater( + IReadOnlyList<(IReadOnlyList, string)> rows = null, + string idText = null, + bool stripAllText = false + ) + { + _rows = rows ?? Array.Empty<(IReadOnlyList, string)>(); + _tokens = new List(); + _idText = idText; + _stripAllText = stripAllText; + } + + public IReadOnlyList Tokens => _tokens; + + public override void StartBook(UsfmParserState state, string marker, string code) + { + CollectTokens(state); + if (_idText != null) + { + _tokens.Add(new UsfmToken(_idText + " ")); + _replaceText = true; + } + } + + public override void EndBook(UsfmParserState state, string marker) + { + _replaceText = false; + } + + public override void StartPara( + UsfmParserState state, + string marker, + bool unknown, + IReadOnlyList attributes + ) + { + if (!state.IsVersePara) + _replaceText = false; + CollectTokens(state); + } + + public override void StartRow(UsfmParserState state, string marker) + { + CollectTokens(state); + } + + public override void StartCell(UsfmParserState state, string marker, string align, int colspan) + { + CollectTokens(state); + } + + public override void EndCell(UsfmParserState state, string marker) + { + CollectTokens(state); + } + + public override void StartSidebar(UsfmParserState state, string marker, string category) + { + _replaceText = false; + CollectTokens(state); + } + + public override void EndSidebar(UsfmParserState state, string marker, bool closed) + { + _replaceText = false; + if (closed) + CollectTokens(state); + } + + public override void Chapter( + UsfmParserState state, + string number, + string marker, + string altNumber, + string pubNumber + ) + { + _replaceText = false; + CollectTokens(state); + } + + public override void Milestone( + UsfmParserState state, + string marker, + bool startMilestone, + IReadOnlyList attributes + ) + { + CollectTokens(state); + } + + public override void Verse( + UsfmParserState state, + string number, + string marker, + string altNumber, + string pubNumber + ) + { + _replaceText = false; + CollectTokens(state); + + while (_rowIndex < _rows.Count) + { + var (verseRefs, text) = _rows[_rowIndex]; + bool stop = false; + foreach (VerseRef verseRef in verseRefs) + { + int compare = verseRef.CompareTo(state.VerseRef, compareAllVerses: true); + if (compare == 0) + { + _tokens.Add(new UsfmToken(text + " ")); + _replaceText = true; + break; + } + else + { + if (state.VerseRef.AllVerses().Any(v => v.Equals(verseRef))) + { + _tokens.Add(new UsfmToken(text + " ")); + _replaceText = true; + break; + } + if (compare > 0) + { + stop = true; + break; + } + } + } + + if (stop) + break; + else + _rowIndex++; + } + } + + public override void StartChar( + UsfmParserState state, + string markerWithoutPlus, + bool unknown, + IReadOnlyList attributes + ) + { + // strip out char-style markers in verses that are being replaced + if (_stripAllText || (_replaceText && state.IsVersePara)) + SkipTokens(state); + else + CollectTokens(state); + } + + public override void EndChar( + UsfmParserState state, + string marker, + IReadOnlyList attributes, + bool closed + ) + { + // strip out char-style markers in verses that are being replaced + if (closed && (_stripAllText || (_replaceText && state.IsVersePara))) + SkipTokens(state); + } + + public override void StartNote(UsfmParserState state, string marker, string caller, string category) + { + // strip out notes in verses that are being replaced + if (_stripAllText || (_replaceText && state.IsVersePara)) + SkipTokens(state); + else + CollectTokens(state); + } + + public override void EndNote(UsfmParserState state, string marker, bool closed) + { + // strip out notes in verses that are being replaced + if (closed && (_stripAllText || (_replaceText && state.IsVersePara))) + SkipTokens(state); + } + + public override void Ref(UsfmParserState state, string marker, string display, string target) + { + // strip out ref in verses that are being replaced + if (_stripAllText || (_replaceText && state.IsVersePara)) + SkipTokens(state); + else + CollectTokens(state); + } + + public override void Text(UsfmParserState state, string text) + { + // strip out text in verses that are being replaced + if (_stripAllText || (_replaceText && (state.IsVersePara || state.ParaTag.Marker == "id"))) + SkipTokens(state); + else + CollectTokens(state); + } + + public override void OptBreak(UsfmParserState state) + { + // strip out optbreaks in verses that are being replaced + if (_stripAllText || (_replaceText && state.IsVersePara)) + SkipTokens(state); + else + CollectTokens(state); + } + + public override void Unmatched(UsfmParserState state, string marker) + { + // strip out unmatched end markers in verses that are being replaced + if (_stripAllText || (_replaceText && state.IsVersePara)) + SkipTokens(state); + else + CollectTokens(state); + } + + public string GetUsfm(string stylesheetFileName = "usfm.sty") + { + return GetUsfm(new UsfmStylesheet(stylesheetFileName)); + } + + public string GetUsfm(UsfmStylesheet stylesheet) + { + var tokenizer = new UsfmTokenizer(stylesheet); + return tokenizer.Detokenize(_tokens); + } + + private void CollectTokens(UsfmParserState state) + { + while (_tokenIndex <= state.Index + state.SpecialTokenCount) + { + _tokens.Add(state.Tokens[_tokenIndex]); + _tokenIndex++; + } + } + + private void SkipTokens(UsfmParserState state) + { + _tokenIndex = state.Index + 1 + state.SpecialTokenCount; + } + } +} diff --git a/src/SIL.Machine/Corpora/ZipParatextProjectSettingsParser.cs b/src/SIL.Machine/Corpora/ZipParatextProjectSettingsParser.cs new file mode 100644 index 00000000..0eeee8f3 --- /dev/null +++ b/src/SIL.Machine/Corpora/ZipParatextProjectSettingsParser.cs @@ -0,0 +1,70 @@ +using System.IO; +using System.IO.Compression; +using System.Linq; +using SIL.IO; + +namespace SIL.Machine.Corpora +{ + public class ZipParatextProjectSettingsParser : ParatextProjectSettingsParserBase + { + private readonly ZipArchive _archive; + + public ZipParatextProjectSettingsParser(ZipArchive archive) + { + _archive = archive; + } + + protected override UsfmStylesheet CreateStylesheet(string fileName) + { + TempFile stylesheetTempFile = null; + TempFile customStylesheetTempFile = null; + try + { + string stylesheetPath = fileName; + ZipArchiveEntry stylesheetEntry = _archive.GetEntry(fileName); + if (stylesheetEntry != null) + { + stylesheetTempFile = TempFile.CreateAndGetPathButDontMakeTheFile(); + stylesheetEntry.ExtractToFile(stylesheetTempFile.Path); + stylesheetPath = stylesheetTempFile.Path; + } + + string customStylesheetPath = null; + ZipArchiveEntry customStylesheetEntry = _archive.GetEntry("custom.sty"); + if (customStylesheetEntry != null) + { + customStylesheetTempFile = TempFile.CreateAndGetPathButDontMakeTheFile(); + customStylesheetEntry.ExtractToFile(customStylesheetTempFile.Path); + customStylesheetPath = customStylesheetTempFile.Path; + } + return new UsfmStylesheet(stylesheetPath, customStylesheetPath); + } + finally + { + stylesheetTempFile?.Dispose(); + customStylesheetTempFile?.Dispose(); + } + } + + protected override bool Exists(string fileName) + { + return _archive.GetEntry(fileName) != null; + } + + protected override string Find(string extension) + { + ZipArchiveEntry entry = _archive.Entries.FirstOrDefault(e => e.FullName.EndsWith(extension)); + if (entry == null) + return null; + return entry.FullName; + } + + protected override Stream Open(string fileName) + { + ZipArchiveEntry entry = _archive.GetEntry(fileName); + if (entry == null) + return null; + return entry.Open(); + } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/ParatextBackupTermsCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/ParatextBackupTermsCorpusTests.cs index 5a9c014a..99d4a11a 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParatextBackupTermsCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParatextBackupTermsCorpusTests.cs @@ -1,5 +1,3 @@ -using System.Reflection; -using System.Xml.Linq; using NUnit.Framework; using SIL.ObjectModel; diff --git a/tests/SIL.Machine.Tests/Corpora/ParatextBackupTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/ParatextBackupTextCorpusTests.cs index 6486779b..9664827d 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParatextBackupTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParatextBackupTextCorpusTests.cs @@ -1,6 +1,4 @@ -using System.IO; -using System.Linq; -using NUnit.Framework; +using NUnit.Framework; using SIL.ObjectModel; namespace SIL.Machine.Corpora diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM index 8cb81d7a..2c77542e 100644 --- a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM +++ b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM @@ -8,7 +8,7 @@ \li1 \v 2 \bd C\bd*hapter one, \li2 verse\f + \fr 1:2: \ft This is a footnote.\f* two. -\v 3 Chapter one, +\v 3 Chapter one \w*, \li2 verse three. \v 4 Chapter one,  \li2 verse four, @@ -18,7 +18,11 @@ \s1 Chapter Two \p \v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one. -\v 2-3 Chapter two, verse \fm ∆\fm*two. +\v 2-3 Chapter two, // verse \fm ∆\fm*two. +\esb +\ms This is a sidebar +\p Here is some sidebar content. +\esbe \v 3-4a Chapter two, verse \w three|lemma\w*. \v 4b Chapter two, verse four. \p @@ -26,15 +30,15 @@ \v 6 Bad verse. \v 5 Chapter two, verse five \rq (MAT 3:1)\rq*. \v 7a Chapter two, verse seven A, -\s Section header +\s Section header \ts-s\* \p \v 7b verse seven B. \p -\v 8 This is a list: +\v 8 This is a list: \ts-e\* \b \tr \tc1 \v 9 Chapter\tcr2 2\tc3 verse\tcr4 9 \tr \tc1-2 -\v 10 \tc3-4 Chapter 2 verse 10 +\v 10 \tc3-4 \qt-s |Jesus\*Chapter 2 verse 10\qt-e\* \v 11-12 \restore restore information diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs index ff6aaf30..118cb181 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs @@ -11,7 +11,7 @@ public void Tokenize() string usfm = ReadUsfm(); var tokenizer = new UsfmTokenizer(); IReadOnlyList tokens = tokenizer.Tokenize(usfm); - Assert.That(tokens, Has.Count.EqualTo(136)); + Assert.That(tokens, Has.Count.EqualTo(151)); Assert.That(tokens[0].Type, Is.EqualTo(UsfmTokenType.Book)); Assert.That(tokens[0].Marker, Is.EqualTo("id")); diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmVerseTextUpdaterTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmVerseTextUpdaterTests.cs new file mode 100644 index 00000000..8186e063 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/UsfmVerseTextUpdaterTests.cs @@ -0,0 +1,208 @@ +using NUnit.Framework; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + [TestFixture] + public class UsfmVerseTextUpdaterTests + { + [Test] + public void GetUsfm_CharStyle() + { + var rows = new List<(IReadOnlyList, string)> + { + (new[] { new VerseRef("MAT 1:1", ScrVers.English) }, "First verse of the first chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\id MAT - Test\r\n")); + Assert.That(target, Contains.Substring("\\v 1 First verse of the first chapter.\r\n")); + } + + [Test] + public void GetUsfm_IdText() + { + string target = UpdateUsfm(idText: "- Updated"); + Assert.That(target, Contains.Substring("\\id MAT - Updated\r\n")); + } + + [Test] + public void GetUsfm_StripAllText() + { + string target = UpdateUsfm(stripAllText: true); + Assert.That(target, Contains.Substring("\\id MAT\r\n")); + Assert.That(target, Contains.Substring("\\v 1\r\n")); + Assert.That(target, Contains.Substring("\\s\r\n")); + } + + [Test] + public void GetUsfm_Notes() + { + var rows = new List<(IReadOnlyList, string)> + { + (new[] { new VerseRef("MAT 2:1", ScrVers.English) }, "First verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 1 First verse of the second chapter.\r\n")); + } + + [Test] + public void GetUsfm_RowVerseSegment() + { + var rows = new List<(IReadOnlyList, string)> + { + (new[] { new VerseRef("MAT 2:1a", ScrVers.English) }, "First verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 1 First verse of the second chapter.\r\n")); + } + + [Test] + public void GetUsfm_UsfmVerseSegment() + { + var rows = new List<(IReadOnlyList, string)> + { + (new[] { new VerseRef("MAT 2:7", ScrVers.English) }, "Seventh verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 7a Seventh verse of the second chapter.\r\n")); + } + + [Test] + public void GetUsfm_MultipleParas() + { + var rows = new List<(IReadOnlyList, string)> + { + (new[] { new VerseRef("MAT 1:2", ScrVers.English) }, "Second verse of the first chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 2 Second verse of the first chapter.\r\n\\li2\r\n")); + } + + [Test] + public void GetUsfm_Table() + { + var rows = new List<(IReadOnlyList, string)> + { + (new[] { new VerseRef("MAT 2:9", ScrVers.English) }, "Ninth verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 9 Ninth verse of the second chapter. \\tcr2 \\tc3 \\tcr4\r\n")); + } + + [Test] + public void GetUsfm_RangeSingleRowMultipleVerses() + { + var rows = new List<(IReadOnlyList, string)> + { + ( + new[] { new VerseRef("MAT 2:11", ScrVers.English), new VerseRef("MAT 2:12", ScrVers.English) }, + "Eleventh verse of the second chapter. Twelfth verse of the second chapter." + ) + }; + + string target = UpdateUsfm(rows); + Assert.That( + target, + Contains.Substring( + "\\v 11-12 Eleventh verse of the second chapter. Twelfth verse of the second chapter.\r\n" + ) + ); + } + + [Test] + public void GetUsfm_RangeSingleRowSingleVerse() + { + var rows = new List<(IReadOnlyList, string)> + { + (new[] { new VerseRef("MAT 2:11", ScrVers.English) }, "Eleventh verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 11-12 Eleventh verse of the second chapter.\r\n")); + } + + [Test] + public void GetUsfm_RangeMultipleRowsSingleVerse() + { + var rows = new List<(IReadOnlyList, string)> + { + (new[] { new VerseRef("MAT 2:11", ScrVers.English) }, "Eleventh verse of the second chapter."), + (new[] { new VerseRef("MAT 2:12", ScrVers.English) }, "Twelfth verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That( + target, + Contains.Substring( + "\\v 11-12 Eleventh verse of the second chapter. Twelfth verse of the second chapter.\r\n" + ) + ); + } + + [Test] + public void GetUsfm_OptBreak() + { + var rows = new List<(IReadOnlyList, string)> + { + (new[] { new VerseRef("MAT 2:2", ScrVers.English) }, "Second verse of the second chapter."), + (new[] { new VerseRef("MAT 2:3", ScrVers.English) }, "Third verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That( + target, + Contains.Substring("\\v 2-3 Second verse of the second chapter. Third verse of the second chapter.\r\n") + ); + } + + [Test] + public void GetUsfm_Milestone() + { + var rows = new List<(IReadOnlyList, string)> + { + (new[] { new VerseRef("MAT 2:10", ScrVers.English) }, "Tenth verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That( + target, + Contains.Substring("\\v 10 Tenth verse of the second chapter. \\tc3-4 \\qt-s |Jesus\\*\\qt-e\\*\r\n") + ); + } + + [Test] + public void GetUsfm_Unmatched() + { + var rows = new List<(IReadOnlyList, string)> + { + (new[] { new VerseRef("MAT 1:3", ScrVers.English) }, "Third verse of the first chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 3 Third verse of the first chapter.\r\n")); + } + + private static string UpdateUsfm( + IReadOnlyList<(IReadOnlyList, string)>? rows = null, + string? idText = null, + bool stripAllText = false + ) + { + string source = ReadUsfm(); + var updater = new UsfmVerseTextUpdater(rows, idText, stripAllText); + UsfmParser.Parse(source, updater); + return updater.GetUsfm(); + } + + private static string ReadUsfm() + { + return File.ReadAllText(Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM")); + } + } +}