Skip to content

Commit 9e832fa

Browse files
committed
Fix some tests around pc pos
1 parent 2741f9b commit 9e832fa

File tree

6 files changed: +14 −17 lines changed

server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt

+1-2
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,7 @@ class LayerToTEIConverter(
111111
val alphaNumeric = Regex("""[a-zA-Z0-9]""")
112112
if (!term.literals.contains(alphaNumeric)) {
113113
// Interpret as punctuation only if it doesn't contain any alphanumeric characters
114-
val pos = term.posOrEmpty.escapeXML()
115-
writer.writeRaw("<pc pos=\"$pos\" xml:id=\"${term.targets[0].id}\">${getLiteral()}</pc>")
114+
writer.writeRaw("<pc xml:id=\"${term.targets[0].id}\">${getLiteral()}</pc>")
116115
} else {
117116
// Clear the pos and interpret as <w>
118117
val lemma = term.lemmaOrEmpty.escapeXML()

server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt

+7-9
Original file line numberDiff line numberDiff line change
@@ -238,16 +238,15 @@ open class TEITextMerger(
238238
} else {
239239
val element = document.createElement("w")
240240
element.setAttribute("lemma", termToAdd.lemmaOrEmpty)
241+
// Empty pos if it is a PC and it contains alphanumeric characters (so it can't be PC anyway).
242+
if (layer.tagset.punctuationTags.contains(termToAdd.pos) && termToAdd.literals.contains(alphaNumeric)) {
243+
element.setAttribute(posType(), "") // empty
244+
} else {
245+
element.setAttribute(posType(), termToAdd.posOrEmpty)
246+
}
241247
element
242248
}
243249

244-
// Empty pos if it is a PC and it contains alphanumeric characters (so it can't be PC anyway).
245-
if (layer.tagset.punctuationTags.contains(termToAdd.pos) && termToAdd.literals.contains(alphaNumeric)) {
246-
wTag.setAttribute(posType(), "") // empty
247-
} else {
248-
wTag.setAttribute(posType(), termToAdd.posOrEmpty)
249-
}
250-
251250
wTag.setAttribute("xml:id", termToAdd.targets.first().id)
252251
return wTag
253252
}
@@ -316,13 +315,12 @@ open class TEITextMerger(
316315
// <pc> tags do not have a lemma.
317316
if (element.tagName == "w") {
318317
element.setAttribute("lemma", termToAdd.lemmaOrEmpty)
318+
element.setAttribute(posType(), termToAdd.posOrEmpty)
319319
}
320320

321321
// Clear the pos if it is a PC, and it contains alphanumeric characters (so it can't be PC anyway).
322322
if (layer.tagset.punctuationTags.contains(termToAdd.pos) && termToAdd.literals.contains(alphaNumeric)) {
323323
element.setAttribute(posType(), "") // Clear the pos
324-
} else {
325-
element.setAttribute(posType(), termToAdd.posOrEmpty)
326324
}
327325

328326
element.removeAttribute("type") // Update legacy formats to TEI p5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>input.tei</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">input.tei</idno><idno type="GaLAHaDPersistentIdentifier">af2b5d82-b261-4752-bd5e-cda672d5d196_tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">input.tei</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp type="annotationFormat"><interp>TEI 
xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="2024-10-16" to="2024-10-16"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader>
22
<text>
3-
<p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc pos="PC" xml:id="w4">!</pc> </p>
3+
<p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc xml:id="w4">!</pc> </p>
44
</text>
55
</TEI>
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>input.tei</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">input.tei</idno><idno type="GaLAHaDPersistentIdentifier">37bff7c0-7fd8-4523-9643-309909f0d7c9_tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">input.tei</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp type="annotationFormat"><interp>TEI 
xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="2024-10-16" to="2024-10-16"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader>
22
<text>
3-
<p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc pos="PC" xml:id="w4">!</pc></p>
3+
<p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc xml:id="w4">!</pc></p>
44
</text>
55
</TEI>
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
<?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>punctutation-mixed-tags</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">punctutation-mixed-tags</idno><idno type="GaLAHaDPersistentIdentifier">__UUID_IGNORED_BY_TEST___tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">punctutation-mixed-tags</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp 
type="annotationFormat"><interp>TEI xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="__DATE_IGNORED_BY_TEST__" to="__DATE_IGNORED_BY_TEST__"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader>
2-
<text><p><w lemma="punctuatie" pos="DUM" xml:id="w0">Punctuatie</w><pc pos="PC" xml:id="w1">,</pc> <w lemma="zin" pos="DUM" xml:id="w2">zin</w> <w lemma="zonder" pos="DUM" xml:id="w3">zonder</w> <w lemma="tags" pos="DUM" xml:id="w4">tags</w><pc pos="PC" xml:id="w5">.</pc><w lemma="zin" pos="DUM" xml:id="w6">Zin</w> <w lemma="met" pos="DUM" xml:id="w7">met</w> <w lemma="tags" pos="DUM" xml:id="w8">tags</w><pc pos="PC" xml:id="w9">.</pc></p></text>
2+
<text><p><w lemma="punctuatie" pos="DUM" xml:id="w0">Punctuatie</w><pc xml:id="w1">,</pc> <w lemma="zin" pos="DUM" xml:id="w2">zin</w> <w lemma="zonder" pos="DUM" xml:id="w3">zonder</w> <w lemma="tags" pos="DUM" xml:id="w4">tags</w><pc xml:id="w5">.</pc><w lemma="zin" pos="DUM" xml:id="w6">Zin</w> <w lemma="met" pos="DUM" xml:id="w7">met</w> <w lemma="tags" pos="DUM" xml:id="w8">tags</w><pc xml:id="w9">.</pc></p></text>
33
</TEI>

server/src/test/resources/tei/twine/merged-output.xml

+3-3
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
<body>
66
<p><w lemma="net" pos="AA(degree=pos,position=free)" xml:id="w0">n<hi rend="bold">e</hi>t</w><milestone n="1va" unit="fol"/> <w lemma="zijn" pos="VRB(finiteness=fin,tense=past)" xml:id="w1"><hi rend="bold">w</hi><hi rend="bold">a</hi><hi rend="bold">s</hi></w> <w lemma="ik" pos="PD(type=pers,position=free)" xml:id="w2">ik</w><w lemma="naar" pos="ADP(type=pre)" xml:id="w3"><hi rend="bold">n</hi>aar</w> <w lemma="school" pos="NOU-C(number=sg)" xml:id="w4">sc<hi rend="bold">h</hi><hi rend="bold">o</hi>o<hi rend="bold">l</hi></w> <w lemma="heengaan" pos="ADV(type=reg)" xml:id="w5">heen</w> <w lemma="en" pos="CONJ(type=coor)" xml:id="w6">en</w> <w lemma="terugkomen" pos="VRB(finiteness=fin,tense=past)" xml:id="w7">t<hi rend="bold">e</hi>r<hi rend="bold">u</hi>g</w> <w lemma="wezen" pos="VRB(finiteness=inf)" xml:id="w8">w<hi rend="bold">eze</hi>n</w> <w lemma="lopen" pos="VRB(finiteness=inf)" xml:id="w9">lope<hi rend="bold">n</hi></w> <w lemma="hetzovoor" pos="ADV(type=reg)" xml:id="w10"><hi rend="bold">enz</hi>o</w></p>
77

8-
<pc lemma="onzin" pos="LET" xml:id="w11">.</pc>
9-
<pc pos="LET" xml:id="w12">.</pc>
10-
<pc pos="LET" xml:id="w13">.</pc>
8+
<pc lemma="onzin" xml:id="w11">.</pc>
9+
<pc pos="onzin" xml:id="w12">.</pc>
10+
<pc xml:id="w13">.</pc>
1111
<w lemma="school" pos="NOU-C(number=pl)" xml:id="w14">scholen</w>
1212
<w lemma="school" pos="NOU-C(number=pl)" xml:id="w15">scholen</w>
1313

0 commit comments

Comments
 (0)