Skip to content

Commit

Permalink
tei-export: Only allow PC if literal has no alphanumeric chars
Browse files Browse the repository at this point in the history
  • Loading branch information
PrinsINT committed Oct 16, 2024
1 parent 02760a7 commit aed57d4
Show file tree
Hide file tree
Showing 12 changed files with 214 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,17 @@ class LayerToTEIConverter(
}
}
if (punctuationTags.contains(term.pos)) {
// Interpret as pc tag
writer.writeRaw("<pc xml:id=\"${term.targets[0].id}\">${getLiteral()}</pc>")
val alphaNumeric = Regex("""[a-zA-Z0-9]""")
if (!term.literals.contains(alphaNumeric)) {
// Interpret as punctuation only if it doesn't contain any alphanumeric characters
writer.writeRaw("<pc xml:id=\"${term.targets[0].id}\">${getLiteral()}</pc>")
} else {
// Clear the pos and interpret as <w>
val lemma = term.lemmaOrEmpty.escapeXML()
writer.writeRaw("<w lemma=\"$lemma\" pos=\"\" xml:id=\"${term.targets[0].id}\">${getLiteral()}</w>")
}
} else {
// If it is not punctuation, safely assume it can be interpreted as <w>
// Assume it can be interpreted as <w>
val lemma = term.lemmaOrEmpty.escapeXML()
val pos = term.posOrEmpty.escapeXML()
writer.writeRaw("<w lemma=\"$lemma\" pos=\"$pos\" xml:id=\"${term.targets[0].id}\">${getLiteral()}</w>")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,20 @@ import org.ivdnt.galahad.data.layer.WordForm
import org.ivdnt.galahad.evaluation.comparison.LayerComparison.Companion.truncatedPcMatch
import org.ivdnt.galahad.port.folia.export.deepcopy
import org.ivdnt.galahad.port.xml.getPlainTextContent
import org.ivdnt.galahad.taggers.TaggerStore
import org.ivdnt.galahad.tagset.TagsetStore
import org.ivdnt.galahad.util.*
import org.w3c.dom.Document
import org.w3c.dom.Element
import org.w3c.dom.Node
import kotlin.collections.contains

/**
 * Reports whether at least one element of this set equals [s].
 *
 * @param s the string to look for; may be null.
 * @param ignoreCase when true, elements are compared to [s] case-insensitively.
 * @return true if any element equals [s] under the chosen comparison, false otherwise.
 */
fun HashSet<String>.contains(s: String?, ignoreCase: Boolean = false): Boolean =
    any { candidate -> candidate.equals(s, ignoreCase) }

// Matches a single ASCII letter or digit. Used to detect literals that contain
// alphanumeric characters and therefore should not be exported as punctuation.
// NOTE(review): the character class is ASCII-only, so letters with diacritics
// (e.g. "é") and non-ASCII digits do not match — confirm this is intended.
private val alphaNumeric = Regex("""[a-zA-Z0-9]""")

open class TEITextMerger(
var node: Node,
var offset: Int,
Expand Down Expand Up @@ -228,16 +233,23 @@ open class TEITextMerger(

/**
 * Creates a new `<pc>` or `<w>` element for the term that belongs to [wf].
 *
 * A `<pc>` element is created only when the term's pos is one of the tagset's
 * punctuation tags and its literals contain no character matched by [alphaNumeric].
 * In every other case a `<w>` element with a `lemma` attribute is created.
 * When the term is tagged as punctuation but does contain such a character, the
 * pos attribute is written empty, because the punctuation tag cannot be correct.
 *
 * @param wf the word form whose term is looked up in the layer.
 * @return the new element carrying the pos attribute (named by `posType()`) and `xml:id`.
 */
protected open fun createWTag(wf: WordForm): Element {
    val termToAdd = layer.termForWordForm(wf)

    // Evaluate the two conditions once; the literals are only inspected for punctuation-tagged terms.
    val taggedAsPunctuation = layer.tagset.punctuationTags.contains(termToAdd.pos)
    val alphaNumericPunctuation = taggedAsPunctuation && termToAdd.literals.contains(alphaNumeric)

    val wTag = if (taggedAsPunctuation && !alphaNumericPunctuation) {
        // <pc> tags do not get a lemma.
        document.createElement("pc")
    } else {
        val n = document.createElement("w")
        n.setAttribute("lemma", termToAdd.lemmaOrEmpty)
        n
    }

    // Both <w> and <pc> have a pos; it is emptied for punctuation tags on alphanumeric literals.
    val pos = if (alphaNumericPunctuation) "" else termToAdd.posOrEmpty
    wTag.setAttribute(posType(), pos)

    wTag.setAttribute("xml:id", termToAdd.targets.first().id)
    return wTag
}
Expand Down Expand Up @@ -302,11 +314,19 @@ open class TEITextMerger(

private fun mergeWTag(wordFormToAdd: WordForm, element: Element) {
val termToAdd = layer.termForWordForm(wordFormToAdd)

// <pc> tags do not have a lemma.
if (element.tagName == "w") {
element.setAttribute("lemma", termToAdd.lemmaOrEmpty)
}
element.setAttribute(posType(), termToAdd.posOrEmpty)

// Clear the pos if it is a PC, and it contains alphanumeric characters (so it can't be PC anyway).
if (layer.tagset.punctuationTags.contains(termToAdd.pos) && termToAdd.literals.contains(alphaNumeric)) {
element.setAttribute(posType(), "") // Clear the pos
} else {
element.setAttribute(posType(), termToAdd.posOrEmpty)
}

element.removeAttribute("type") // Update legacy formats to TEI p5
// First check if the element has an id already, else add it.
if (element.getAttribute("xml:id").isNullOrBlank()) {
Expand Down
41 changes: 41 additions & 0 deletions server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIExportTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,47 @@ internal class TEIExportTest {
.result()
}

@Test
fun `Merge doc with alphanumeric PC`() {
    // Merges the pie-tdn layer found in [folder] into that folder's input TEI and
    // compares the result with the folder's merged-output.xml.
    fun assertAlphaNumericPC(folder: String) {
        val file = TEIFile(Resource.get("$folder/input.tei.xml"))
        assertPlainText(folder, file)

        val plaintext: String = Resource.get("$folder/plaintext.txt").readText()
        // Fail with a descriptive message instead of a bare NullPointerException.
        val tagset = checkNotNull(TagsetStore().getOrNull("TDN-Core")) {
            "Tagset TDN-Core could not be loaded"
        }
        val layer = LayerBuilder()
            .loadLayerFromTSV("$folder/pie-tdn.tsv", plaintext)
            .setTagset(tagset)
            .build()

        DocTest.builder(corpus)
            .expectingFile("$folder/merged-output.xml")
            .mergeTEI(Resource.get("$folder/input.tei.xml"), layer)
            .ignoreDate()
            .ignoreUUID()
            .result()
    }
    assertAlphaNumericPC("tei/alphanumericpc/with-w-tags")
    assertAlphaNumericPC("tei/alphanumericpc/without-w-tags")
}

@Test
fun `Convert doc with alphanumeric PC`() {
    // Converts the input TEI with the pie-tdn layer and compares the result
    // with the folder's converted-output.xml.
    val folder = "tei/alphanumericpc/with-w-tags"
    val plaintext: String = Resource.get("$folder/plaintext.txt").readText()
    // Fail with a descriptive message instead of a bare NullPointerException.
    val tagset = checkNotNull(TagsetStore().getOrNull("TDN-Core")) {
        "Tagset TDN-Core could not be loaded"
    }
    val layer = LayerBuilder()
        .loadLayerFromTSV("$folder/pie-tdn.tsv", plaintext)
        .setTagset(tagset)
        .build()
    DocTest.builder(corpus)
        .expectingFile("$folder/converted-output.xml")
        .convertToTEI(Resource.get("$folder/input.tei.xml"), layer)
        .ignoreDate()
        .ignoreUUID()
        .result()
}

@Test
fun `Merge a pie-tdn layer with a tei file that only contains plaintext`() {
val file = TEIFile(Resource.get("tei/brieven/input.tei.xml"))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:xs="http://www.w3.org/2001/XMLSchema">
<teiHeader>
<fileDesc>
<titleStmt>
<title>input.tei</title>
<respStmt>
<resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp>
<orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName>
<orgName xml:lang="en">Dutch Language Institute</orgName>
</respStmt>
<respStmt>
<resp>exported as tei-p5 by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp>
<orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName>
<orgName xml:lang="en">Dutch Language Institute</orgName>
</respStmt>
</titleStmt>
<publicationStmt>
<publisher>!Needs to be filled in!</publisher>
<idno type="sourceID">input.tei</idno>
<idno type="GaLAHaDPersistentIdentifier">ec68ad7d-542c-40d6-9195-e21ccadad0d0_tei</idno>
</publicationStmt>
<notesStmt>
<note resp="GaLAHaD" type="corpusName">testCorpus</note>
<note resp="GaLAHaD" type="sourceCollection">source name</note>
<note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note>
</notesStmt>
<sourceDesc>
<ab>
<idno type="sourceID">input.tei</idno>
</ab>
<ab type="date">
<date from="0" to="0"/>
</ab>
</sourceDesc>
</fileDesc>
<encodingDesc>
<appInfo resp="GaLAHaD">
<application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all">
<label>POS-tagger and lemmatiser</label>
<ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/>
</application>
</appInfo>
<editorialDecl resp="GaLAHaD">
<interpretation xml:id="A0001">
<ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation">
<interpGrp type="annotationStyle">
<interp>inline</interp>
</interpGrp>
<interpGrp type="Documentation">
<interp/>
</interpGrp>
<interpGrp type="annotationSet">
<interp>TDN-Core</interp>
</interpGrp>
<interpGrp type="annotationDescription">
<interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp>
</interpGrp>
<interpGrp type="annotationFormat">
<interp>TEI xml</interp>
</interpGrp>
</ab>
<ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation">
<interpGrp type="annotationMode">
<interp>automatically annotated</interp>
</interpGrp>
<interpGrp type="processor">
<interp sameAs="#pie-tdn-all"/>
</interpGrp>
<date from="2024-10-16" to="2024-10-16"/>
</ab>
</interpretation>
</editorialDecl>
</encodingDesc>
<profileDesc>
<langUsage>
<language ident="nl">
Dutch
<interpGrp type="dominantLanguage">
<interp>true</interp>
</interpGrp>
</language>
</langUsage>
</profileDesc>
</teiHeader>
<text>
<body>
<div>
<p>


<w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">&quot;loop&quot;</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc xml:id="w4">!</pc>
</p>
</div>
</body>
</text>
</TEI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<TEI>
<text>
<p>
<w>Ik</w> <w>"loop"</w> <w>naar:</w> <w>school</w><pc>!</pc>
</p>
</text>
</TEI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>input.tei</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">input.tei</idno><idno type="GaLAHaDPersistentIdentifier">af2b5d82-b261-4752-bd5e-cda672d5d196_tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">input.tei</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp type="annotationFormat"><interp>TEI 
xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="2024-10-16" to="2024-10-16"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader>
<text>
<p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc pos="PC" xml:id="w4">!</pc> </p>
</text>
</TEI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
word lemma pos
Ik ik PD
"loop" loop PC
naar: : PC
school school NOU
! school! PC
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@


Ik "loop" naar: school!
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<TEI>
<text>
<p>
Ik "loop" naar: school!
</p>
</text>
</TEI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>input.tei</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">input.tei</idno><idno type="GaLAHaDPersistentIdentifier">37bff7c0-7fd8-4523-9643-309909f0d7c9_tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">input.tei</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp type="annotationFormat"><interp>TEI 
xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="2024-10-16" to="2024-10-16"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader>
<text>
<p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc pos="PC" xml:id="w4">!</pc></p>
</text>
</TEI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
word lemma pos
Ik ik PD
"loop" loop PC
naar: : PC
school school NOU
! school! PC
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@


Ik "loop" naar: school!

0 comments on commit aed57d4

Please sign in to comment.