Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add UsfmVerseTextUpdater class #160

Merged
merged 1 commit into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions src/SIL.Machine/Corpora/FileParatextProjectSettingsParser.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

namespace SIL.Machine.Corpora
{
/// <summary>
/// Project settings parser whose file access is backed by a Paratext project directory on disk.
/// Every file name handed to the overrides below is resolved relative to that directory.
/// </summary>
public class FileParatextProjectSettingsParser : ParatextProjectSettingsParserBase
{
    // Name of the optional project-level stylesheet that overrides the main stylesheet.
    private const string CustomStylesheetFileName = "custom.sty";

    private readonly string _projectDir;

    /// <summary>
    /// Creates a parser that reads from the specified project directory.
    /// </summary>
    /// <param name="projectDir">Path of the directory that contains the project files.</param>
    public FileParatextProjectSettingsParser(string projectDir)
    {
        _projectDir = projectDir;
    }

    /// <summary>
    /// Builds the stylesheet for the project.
    /// </summary>
    /// <param name="fileName">Name of the main stylesheet; passed through to <c>UsfmStylesheet</c> unchanged.</param>
    /// <returns>
    /// A stylesheet created from <paramref name="fileName"/>, plus the project's <c>custom.sty</c> when that
    /// file exists in the project directory.
    /// </returns>
    protected override UsfmStylesheet CreateStylesheet(string fileName)
    {
        // The custom stylesheet is always "custom.sty" in the project directory; it must not be derived
        // from the main stylesheet name, otherwise the main stylesheet would be applied as its own override.
        string customStylesheetPath = Path.Combine(_projectDir, CustomStylesheetFileName);
        return new UsfmStylesheet(fileName, File.Exists(customStylesheetPath) ? customStylesheetPath : null);
    }

    /// <summary>
    /// Checks whether a file is present in the project directory.
    /// </summary>
    /// <param name="fileName">File name relative to the project directory.</param>
    /// <returns><c>true</c> if the file exists; otherwise <c>false</c>.</returns>
    protected override bool Exists(string fileName)
    {
        return File.Exists(Path.Combine(_projectDir, fileName));
    }

    /// <summary>
    /// Looks in the project directory (top level only) for a file matching <c>"*" + extension</c>.
    /// </summary>
    /// <param name="extension">Suffix to search for, e.g. an extension including its leading dot.</param>
    /// <returns>The path of the first match reported by the file system, or <c>null</c> if nothing matches.</returns>
    protected override string Find(string extension)
    {
        return Directory.EnumerateFiles(_projectDir, "*" + extension).FirstOrDefault();
    }

    /// <summary>
    /// Opens a file from the project directory for reading.
    /// </summary>
    /// <param name="fileName">File name relative to the project directory.</param>
    /// <returns>A read-only stream over the file. NOTE(review): the caller is presumably responsible for disposing it.</returns>
    protected override Stream Open(string fileName)
    {
        return File.OpenRead(Path.Combine(_projectDir, fileName));
    }
}
}
47 changes: 20 additions & 27 deletions src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,22 @@
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Linq;
using System.Reflection;
using System.Text.RegularExpressions;
using System.Xml.Linq;

namespace SIL.Machine.Corpora
{
public class ParatextBackupTermsCorpus : DictionaryTextCorpus
{
private static List<string> PREDEFINED_TERMS_LIST_TYPES = new List<string>() { "Major", "All", "SilNt", "Pt6" };
private static readonly List<string> PredefinedTermsListTypes = new List<string>()
{
"Major",
"All",
"SilNt",
"Pt6"
};

public ParatextBackupTermsCorpus(string fileName, IEnumerable<string> termCategories)
{
Expand All @@ -22,51 +28,36 @@ public ParatextBackupTermsCorpus(string fileName, IEnumerable<string> termCatego
if (termsFileEntry is null)
return;

ZipArchiveEntry settingsEntry = archive.GetEntry("Settings.xml");
if (settingsEntry == null)
settingsEntry = archive.Entries.FirstOrDefault(e => e.FullName.EndsWith(".ssf"));
if (settingsEntry == null)
{
throw new ArgumentException(
"The project backup does not contain a settings file.",
nameof(fileName)
);
}
XDocument settingsDoc;
using (Stream stream = settingsEntry.Open())
{
settingsDoc = XDocument.Load(stream);
}
string textId = settingsDoc.Root.Element("BiblicalTermsListSetting").Value;
var settingsParser = new ZipParatextProjectSettingsParser(archive);
ParatextProjectSettings settings = settingsParser.Parse();

XDocument termRenderingsDoc;
using (var keyTermsFile = termsFileEntry.Open())
using (Stream keyTermsFile = termsFileEntry.Open())
{
termRenderingsDoc = XDocument.Load(keyTermsFile);
}

ZipArchiveEntry biblicalTermsFileEntry = archive.GetEntry(textId.Split(':').Last());
ZipArchiveEntry biblicalTermsFileEntry = archive.GetEntry(settings.BiblicalTermsFileName);

XDocument biblicalTermsDoc;
IDictionary<string, string> termIdToCategoryDictionary;
if (PREDEFINED_TERMS_LIST_TYPES.Contains(textId.Split(':').First()))
if (PredefinedTermsListTypes.Contains(settings.BiblicalTermsListType))
{
using (
var keyTermsFile = Assembly
Stream keyTermsFile = Assembly
.GetExecutingAssembly()
.GetManifestResourceStream("SIL.Machine.Corpora." + textId.Split(':').Last())
.GetManifestResourceStream("SIL.Machine.Corpora." + settings.BiblicalTermsFileName)
)
{
biblicalTermsDoc = XDocument.Load(keyTermsFile);
termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc);
}
}
else if (
textId.Split(':').First() == "Project"
&& textId.Split(':')[1] == settingsDoc.Root.Element("Name").Value
settings.BiblicalTermsListType == "Project" && settings.BiblicalTermsProjectName == settings.Name
)
{
using (var keyTermsFile = biblicalTermsFileEntry.Open())
using (Stream keyTermsFile = biblicalTermsFileEntry.Open())
{
biblicalTermsDoc = XDocument.Load(keyTermsFile);
termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc);
Expand All @@ -81,6 +72,8 @@ public ParatextBackupTermsCorpus(string fileName, IEnumerable<string> termCatego
.Descendants()
.Where(n => n.Name.LocalName == "TermRendering");

string textId =
$"{settings.BiblicalTermsListType}:{settings.BiblicalTermsProjectName}:{settings.BiblicalTermsFileName}";
foreach (XElement element in termsElements)
{
string id = element.Attribute("Id").Value;
Expand Down
105 changes: 9 additions & 96 deletions src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs
Original file line number Diff line number Diff line change
@@ -1,117 +1,30 @@
using System;
using System.IO;
using System.IO.Compression;
using System.IO.Compression;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using SIL.IO;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
public class ParatextBackupTextCorpus : ScriptureTextCorpus
{
public ParatextBackupTextCorpus(string fileName, bool includeMarkers = false)
{
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
using (ZipArchive archive = ZipFile.OpenRead(fileName))
{
ZipArchiveEntry settingsEntry = archive.GetEntry("Settings.xml");
if (settingsEntry == null)
settingsEntry = archive.Entries.FirstOrDefault(e => e.FullName.EndsWith(".ssf"));
if (settingsEntry == null)
{
throw new ArgumentException(
"The project backup does not contain a settings file.",
nameof(fileName)
);
}
XDocument settingsDoc;
using (Stream stream = settingsEntry.Open())
{
settingsDoc = XDocument.Load(stream);
}
var encodingStr = (string)settingsDoc.Root.Element("Encoding") ?? "65001";
if (!int.TryParse(encodingStr, out int codePage))
{
throw new NotImplementedException(
$"The project uses a legacy encoding that requires TECKit, map file: {encodingStr}."
);
}
var encoding = Encoding.GetEncoding(codePage);
var parser = new ZipParatextProjectSettingsParser(archive);
ParatextProjectSettings settings = parser.Parse();

var scrVersType = (int?)settingsDoc.Root.Element("Versification") ?? (int)ScrVersType.English;
Versification = new ScrVers((ScrVersType)scrVersType);
ZipArchiveEntry customVersEntry = archive.GetEntry("custom.vrs");
if (customVersEntry != null)
{
var guid = (string)settingsDoc.Root.Element("Guid");
string versName = ((ScrVersType)scrVersType).ToString() + "-" + guid;
if (Scripture.Versification.Table.Implementation.Exists(versName))
{
Versification = new ScrVers(versName);
}
else
{
using (var reader = new StreamReader(customVersEntry.Open()))
{
Versification = Scripture.Versification.Table.Implementation.Load(
reader,
"custom.vrs",
Versification,
versName
);
}
}
}

var stylesheetName = (string)settingsDoc.Root.Element("StyleSheet") ?? "usfm.sty";
ZipArchiveEntry stylesheetEntry = archive.GetEntry(stylesheetName);
if (stylesheetEntry == null && stylesheetName != "usfm_sb.sty")
stylesheetEntry = archive.GetEntry("usfm.sty");
ZipArchiveEntry customStylesheetEntry = archive.GetEntry("custom.sty");

UsfmStylesheet stylesheet;
using (var stylesheetTempFile = TempFile.CreateAndGetPathButDontMakeTheFile())
using (var customStylesheetTempFile = TempFile.CreateAndGetPathButDontMakeTheFile())
{
string stylesheetPath = "usfm.sty";
if (stylesheetEntry != null)
{
stylesheetEntry.ExtractToFile(stylesheetTempFile.Path);
stylesheetPath = stylesheetTempFile.Path;
}
string customStylesheetPath = null;
if (customStylesheetEntry != null)
{
customStylesheetEntry.ExtractToFile(customStylesheetTempFile.Path);
customStylesheetPath = customStylesheetTempFile.Path;
}
stylesheet = new UsfmStylesheet(stylesheetPath, customStylesheetPath);
}

string prefix = "";
string suffix = ".SFM";
XElement namingElem = settingsDoc.Root.Element("Naming");
if (namingElem != null)
{
var prePart = (string)namingElem.Attribute("PrePart");
if (!string.IsNullOrEmpty(prePart))
prefix = prePart;
var postPart = (string)namingElem.Attribute("PostPart");
if (!string.IsNullOrEmpty(postPart))
suffix = postPart;
}
Versification = settings.Versification;

var regex = new Regex($"^{Regex.Escape(prefix)}.*{Regex.Escape(suffix)}$");
var regex = new Regex(
$"^{Regex.Escape(settings.FileNamePrefix)}.*{Regex.Escape(settings.FileNameSuffix)}$"
);

foreach (ZipArchiveEntry sfmEntry in archive.Entries.Where(e => regex.IsMatch(e.FullName)))
{
AddText(
new UsfmZipText(
stylesheet,
encoding,
settings.Stylesheet,
settings.Encoding,
fileName,
sfmEntry.FullName,
Versification,
Expand Down
47 changes: 47 additions & 0 deletions src/SIL.Machine/Corpora/ParatextProjectSettings.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
using System.Text;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
/// <summary>
/// Holds the settings of a Paratext project. All values are supplied through the constructor and
/// exposed as get-only properties.
/// </summary>
public class ParatextProjectSettings
{
    // --- Project identity ---

    /// <summary>Name of the project.</summary>
    public string Name { get; }

    /// <summary>Full name of the project.</summary>
    public string FullName { get; }

    // --- Text interpretation ---

    /// <summary>Text encoding used for the project's files.</summary>
    public Encoding Encoding { get; }

    /// <summary>Versification of the project.</summary>
    public ScrVers Versification { get; }

    /// <summary>USFM stylesheet of the project.</summary>
    public UsfmStylesheet Stylesheet { get; }

    // --- Book file naming ---

    /// <summary>Leading part of the project's book file names.</summary>
    public string FileNamePrefix { get; }

    /// <summary>Form of the project's book file names.</summary>
    public string FileNameForm { get; }

    /// <summary>Trailing part of the project's book file names.</summary>
    public string FileNameSuffix { get; }

    // --- Biblical terms ---

    /// <summary>Type of the biblical terms list.</summary>
    public string BiblicalTermsListType { get; }

    /// <summary>Name of the project associated with the biblical terms list.</summary>
    public string BiblicalTermsProjectName { get; }

    /// <summary>File name of the biblical terms list.</summary>
    public string BiblicalTermsFileName { get; }

    /// <summary>
    /// Creates a settings object; each argument is stored as-is in the property of the same name.
    /// </summary>
    public ParatextProjectSettings(
        string name,
        string fullName,
        Encoding encoding,
        ScrVers versification,
        UsfmStylesheet stylesheet,
        string fileNamePrefix,
        string fileNameForm,
        string fileNameSuffix,
        string biblicalTermsListType,
        string biblicalTermsProjectName,
        string biblicalTermsFileName
    )
    {
        // Project identity
        this.Name = name;
        this.FullName = fullName;

        // Text interpretation
        this.Encoding = encoding;
        this.Versification = versification;
        this.Stylesheet = stylesheet;

        // Book file naming
        this.FileNamePrefix = fileNamePrefix;
        this.FileNameForm = fileNameForm;
        this.FileNameSuffix = fileNameSuffix;

        // Biblical terms
        this.BiblicalTermsListType = biblicalTermsListType;
        this.BiblicalTermsProjectName = biblicalTermsProjectName;
        this.BiblicalTermsFileName = biblicalTermsFileName;
    }
}
}
Loading
Loading