Skip to content

Commit

Permalink
Add UsfmVerseTextUpdater class
Browse files Browse the repository at this point in the history
  • Loading branch information
ddaspit committed Jan 29, 2024
1 parent 79c0832 commit 267de3c
Show file tree
Hide file tree
Showing 18 changed files with 852 additions and 246 deletions.
42 changes: 42 additions & 0 deletions src/SIL.Machine/Corpora/FileParatextProjectSettingsParser.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

namespace SIL.Machine.Corpora
{
/// <summary>
/// Implements the file-access hooks of <see cref="ParatextProjectSettingsParserBase"/> against a
/// project directory on the local file system. Every file name handed to these hooks is resolved
/// relative to that directory.
/// </summary>
public class FileParatextProjectSettingsParser : ParatextProjectSettingsParserBase
{
    // Name of the project-level stylesheet override file. It is a fixed, separate file and must
    // not be derived from the name of the main stylesheet.
    private const string CustomStylesheetFileName = "custom.sty";

    private readonly string _projectDir;

    /// <summary>
    /// Creates a parser that reads project files from <paramref name="projectDir"/>.
    /// </summary>
    /// <param name="projectDir">Path of the directory that contains the project files.</param>
    public FileParatextProjectSettingsParser(string projectDir)
    {
        _projectDir = projectDir;
    }

    /// <summary>
    /// Creates the stylesheet for the project.
    /// </summary>
    /// <param name="fileName">
    /// Name of the main stylesheet; it is passed to <see cref="UsfmStylesheet"/> unchanged.
    /// </param>
    /// <returns>
    /// A <see cref="UsfmStylesheet"/> built from <paramref name="fileName"/> and, when the project
    /// directory contains a <c>custom.sty</c> file, from that file as well. When no such file
    /// exists, <c>null</c> is passed as the second argument.
    /// </returns>
    protected override UsfmStylesheet CreateStylesheet(string fileName)
    {
        // Look for "custom.sty" specifically. Combining the project directory with fileName would
        // hand the main stylesheet to UsfmStylesheet a second time as its own override and would
        // never pick up the project's actual custom stylesheet.
        string customStylesheetFileName = Path.Combine(_projectDir, CustomStylesheetFileName);
        return new UsfmStylesheet(
            fileName,
            File.Exists(customStylesheetFileName) ? customStylesheetFileName : null
        );
    }

    /// <summary>
    /// Checks whether a file with the given name exists in the project directory.
    /// </summary>
    /// <param name="fileName">File name relative to the project directory.</param>
    /// <returns><c>true</c> if the file exists; otherwise <c>false</c>.</returns>
    protected override bool Exists(string fileName)
    {
        return File.Exists(Path.Combine(_projectDir, fileName));
    }

    /// <summary>
    /// Finds a file in the project directory by extension.
    /// </summary>
    /// <param name="extension">
    /// Extension to search for; it is appended to <c>*</c> to form the search pattern.
    /// </param>
    /// <returns>
    /// The first path returned by the directory enumeration for that pattern, or <c>null</c> when
    /// nothing matches.
    /// </returns>
    protected override string Find(string extension)
    {
        return Directory.EnumerateFiles(_projectDir, "*" + extension).FirstOrDefault();
    }

    /// <summary>
    /// Opens a file in the project directory for reading.
    /// </summary>
    /// <param name="fileName">File name relative to the project directory.</param>
    /// <returns>A read-only stream over the file; the caller is responsible for disposing it.</returns>
    protected override Stream Open(string fileName)
    {
        return File.OpenRead(Path.Combine(_projectDir, fileName));
    }
}
}
47 changes: 20 additions & 27 deletions src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,22 @@
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Linq;
using System.Reflection;
using System.Text.RegularExpressions;
using System.Xml.Linq;

namespace SIL.Machine.Corpora
{
public class ParatextBackupTermsCorpus : DictionaryTextCorpus
{
private static List<string> PREDEFINED_TERMS_LIST_TYPES = new List<string>() { "Major", "All", "SilNt", "Pt6" };
private static readonly List<string> PredefinedTermsListTypes = new List<string>()
{
"Major",
"All",
"SilNt",
"Pt6"
};

public ParatextBackupTermsCorpus(string fileName, IEnumerable<string> termCategories)
{
Expand All @@ -22,51 +28,36 @@ public ParatextBackupTermsCorpus(string fileName, IEnumerable<string> termCatego
if (termsFileEntry is null)
return;

ZipArchiveEntry settingsEntry = archive.GetEntry("Settings.xml");
if (settingsEntry == null)
settingsEntry = archive.Entries.FirstOrDefault(e => e.FullName.EndsWith(".ssf"));
if (settingsEntry == null)
{
throw new ArgumentException(
"The project backup does not contain a settings file.",
nameof(fileName)
);
}
XDocument settingsDoc;
using (Stream stream = settingsEntry.Open())
{
settingsDoc = XDocument.Load(stream);
}
string textId = settingsDoc.Root.Element("BiblicalTermsListSetting").Value;
var settingsParser = new ZipParatextProjectSettingsParser(archive);
ParatextProjectSettings settings = settingsParser.Parse();

XDocument termRenderingsDoc;
using (var keyTermsFile = termsFileEntry.Open())
using (Stream keyTermsFile = termsFileEntry.Open())
{
termRenderingsDoc = XDocument.Load(keyTermsFile);
}

ZipArchiveEntry biblicalTermsFileEntry = archive.GetEntry(textId.Split(':').Last());
ZipArchiveEntry biblicalTermsFileEntry = archive.GetEntry(settings.BiblicalTermsFileName);

XDocument biblicalTermsDoc;
IDictionary<string, string> termIdToCategoryDictionary;
if (PREDEFINED_TERMS_LIST_TYPES.Contains(textId.Split(':').First()))
if (PredefinedTermsListTypes.Contains(settings.BiblicalTermsListType))
{
using (
var keyTermsFile = Assembly
Stream keyTermsFile = Assembly
.GetExecutingAssembly()
.GetManifestResourceStream("SIL.Machine.Corpora." + textId.Split(':').Last())
.GetManifestResourceStream("SIL.Machine.Corpora." + settings.BiblicalTermsFileName)
)
{
biblicalTermsDoc = XDocument.Load(keyTermsFile);
termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc);
}
}
else if (
textId.Split(':').First() == "Project"
&& textId.Split(':')[1] == settingsDoc.Root.Element("Name").Value
settings.BiblicalTermsListType == "Project" && settings.BiblicalTermsProjectName == settings.Name
)
{
using (var keyTermsFile = biblicalTermsFileEntry.Open())
using (Stream keyTermsFile = biblicalTermsFileEntry.Open())
{
biblicalTermsDoc = XDocument.Load(keyTermsFile);
termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc);
Expand All @@ -81,6 +72,8 @@ public ParatextBackupTermsCorpus(string fileName, IEnumerable<string> termCatego
.Descendants()
.Where(n => n.Name.LocalName == "TermRendering");

string textId =
$"{settings.BiblicalTermsListType}:{settings.BiblicalTermsProjectName}:{settings.BiblicalTermsFileName}";
foreach (XElement element in termsElements)
{
string id = element.Attribute("Id").Value;
Expand Down
105 changes: 9 additions & 96 deletions src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs
Original file line number Diff line number Diff line change
@@ -1,117 +1,30 @@
using System;
using System.IO;
using System.IO.Compression;
using System.IO.Compression;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using SIL.IO;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
public class ParatextBackupTextCorpus : ScriptureTextCorpus
{
public ParatextBackupTextCorpus(string fileName, bool includeMarkers = false)
{
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
using (ZipArchive archive = ZipFile.OpenRead(fileName))
{
ZipArchiveEntry settingsEntry = archive.GetEntry("Settings.xml");
if (settingsEntry == null)
settingsEntry = archive.Entries.FirstOrDefault(e => e.FullName.EndsWith(".ssf"));
if (settingsEntry == null)
{
throw new ArgumentException(
"The project backup does not contain a settings file.",
nameof(fileName)
);
}
XDocument settingsDoc;
using (Stream stream = settingsEntry.Open())
{
settingsDoc = XDocument.Load(stream);
}
var encodingStr = (string)settingsDoc.Root.Element("Encoding") ?? "65001";
if (!int.TryParse(encodingStr, out int codePage))
{
throw new NotImplementedException(
$"The project uses a legacy encoding that requires TECKit, map file: {encodingStr}."
);
}
var encoding = Encoding.GetEncoding(codePage);
var parser = new ZipParatextProjectSettingsParser(archive);
ParatextProjectSettings settings = parser.Parse();

var scrVersType = (int?)settingsDoc.Root.Element("Versification") ?? (int)ScrVersType.English;
Versification = new ScrVers((ScrVersType)scrVersType);
ZipArchiveEntry customVersEntry = archive.GetEntry("custom.vrs");
if (customVersEntry != null)
{
var guid = (string)settingsDoc.Root.Element("Guid");
string versName = ((ScrVersType)scrVersType).ToString() + "-" + guid;
if (Scripture.Versification.Table.Implementation.Exists(versName))
{
Versification = new ScrVers(versName);
}
else
{
using (var reader = new StreamReader(customVersEntry.Open()))
{
Versification = Scripture.Versification.Table.Implementation.Load(
reader,
"custom.vrs",
Versification,
versName
);
}
}
}

var stylesheetName = (string)settingsDoc.Root.Element("StyleSheet") ?? "usfm.sty";
ZipArchiveEntry stylesheetEntry = archive.GetEntry(stylesheetName);
if (stylesheetEntry == null && stylesheetName != "usfm_sb.sty")
stylesheetEntry = archive.GetEntry("usfm.sty");
ZipArchiveEntry customStylesheetEntry = archive.GetEntry("custom.sty");

UsfmStylesheet stylesheet;
using (var stylesheetTempFile = TempFile.CreateAndGetPathButDontMakeTheFile())
using (var customStylesheetTempFile = TempFile.CreateAndGetPathButDontMakeTheFile())
{
string stylesheetPath = "usfm.sty";
if (stylesheetEntry != null)
{
stylesheetEntry.ExtractToFile(stylesheetTempFile.Path);
stylesheetPath = stylesheetTempFile.Path;
}
string customStylesheetPath = null;
if (customStylesheetEntry != null)
{
customStylesheetEntry.ExtractToFile(customStylesheetTempFile.Path);
customStylesheetPath = customStylesheetTempFile.Path;
}
stylesheet = new UsfmStylesheet(stylesheetPath, customStylesheetPath);
}

string prefix = "";
string suffix = ".SFM";
XElement namingElem = settingsDoc.Root.Element("Naming");
if (namingElem != null)
{
var prePart = (string)namingElem.Attribute("PrePart");
if (!string.IsNullOrEmpty(prePart))
prefix = prePart;
var postPart = (string)namingElem.Attribute("PostPart");
if (!string.IsNullOrEmpty(postPart))
suffix = postPart;
}
Versification = settings.Versification;

var regex = new Regex($"^{Regex.Escape(prefix)}.*{Regex.Escape(suffix)}$");
var regex = new Regex(
$"^{Regex.Escape(settings.FileNamePrefix)}.*{Regex.Escape(settings.FileNameSuffix)}$"
);

foreach (ZipArchiveEntry sfmEntry in archive.Entries.Where(e => regex.IsMatch(e.FullName)))
{
AddText(
new UsfmZipText(
stylesheet,
encoding,
settings.Stylesheet,
settings.Encoding,
fileName,
sfmEntry.FullName,
Versification,
Expand Down
47 changes: 47 additions & 0 deletions src/SIL.Machine/Corpora/ParatextProjectSettings.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
using System.Text;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
/// <summary>
/// Immutable bundle of the values read from a Paratext project's settings. All properties are
/// assigned once by the constructor and are read-only afterwards.
/// </summary>
public class ParatextProjectSettings
{
    /// <summary>Project name.</summary>
    public string Name { get; }

    /// <summary>Full project name.</summary>
    public string FullName { get; }

    /// <summary>Text encoding to use for the project's files.</summary>
    public Encoding Encoding { get; }

    /// <summary>Versification of the project.</summary>
    public ScrVers Versification { get; }

    /// <summary>USFM stylesheet of the project.</summary>
    public UsfmStylesheet Stylesheet { get; }

    /// <summary>Leading part of the project's book file names.</summary>
    public string FileNamePrefix { get; }

    /// <summary>
    /// File-name form setting. NOTE(review): the expected values are not visible from this
    /// class; confirm against the settings parser.
    /// </summary>
    public string FileNameForm { get; }

    /// <summary>Trailing part of the project's book file names.</summary>
    public string FileNameSuffix { get; }

    /// <summary>Type of the biblical terms list.</summary>
    public string BiblicalTermsListType { get; }

    /// <summary>Name of the project associated with the biblical terms list.</summary>
    public string BiblicalTermsProjectName { get; }

    /// <summary>File name of the biblical terms list.</summary>
    public string BiblicalTermsFileName { get; }

    /// <summary>
    /// Initializes every property from the argument of the same name; no validation or
    /// transformation is applied.
    /// </summary>
    public ParatextProjectSettings(
        string name,
        string fullName,
        Encoding encoding,
        ScrVers versification,
        UsfmStylesheet stylesheet,
        string fileNamePrefix,
        string fileNameForm,
        string fileNameSuffix,
        string biblicalTermsListType,
        string biblicalTermsProjectName,
        string biblicalTermsFileName
    )
    {
        // Project identity.
        this.Name = name;
        this.FullName = fullName;

        // How the project's files are read and interpreted.
        this.Encoding = encoding;
        this.Versification = versification;
        this.Stylesheet = stylesheet;

        // Book file naming.
        this.FileNamePrefix = fileNamePrefix;
        this.FileNameForm = fileNameForm;
        this.FileNameSuffix = fileNameSuffix;

        // Biblical terms list.
        this.BiblicalTermsListType = biblicalTermsListType;
        this.BiblicalTermsProjectName = biblicalTermsProjectName;
        this.BiblicalTermsFileName = biblicalTermsFileName;
    }
}
}
Loading

0 comments on commit 267de3c

Please sign in to comment.