Stop setting pos on <pc> elements in TEI export and merge; update test fixtures

PrinsINT · PrinsINT · commit 9e832fa0c7b8 · 2024-10-31T15:16:06.000+01:00
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt
@@ -111,8 +111,7 @@ class LayerToTEIConverter(
             val alphaNumeric = Regex("""[a-zA-Z0-9]""")
             if (!term.literals.contains(alphaNumeric)) {
                 // Interpret as punctuation only if it doesn't contain any alphanumeric characters
-                val pos = term.posOrEmpty.escapeXML()
-                writer.writeRaw("<pc pos=\"$pos\" xml:id=\"${term.targets[0].id}\">${getLiteral()}</pc>")
+                writer.writeRaw("<pc xml:id=\"${term.targets[0].id}\">${getLiteral()}</pc>")
             } else {
                 // Clear the pos and interpret as <w>
                 val lemma = term.lemmaOrEmpty.escapeXML()
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt
@@ -238,16 +238,15 @@ open class TEITextMerger(
         } else {
             val element = document.createElement("w")
             element.setAttribute("lemma", termToAdd.lemmaOrEmpty)
+            // Clear the pos if the term is tagged as punctuation but contains alphanumeric characters (so it cannot be punctuation).
+            if (layer.tagset.punctuationTags.contains(termToAdd.pos) && termToAdd.literals.contains(alphaNumeric)) {
+                element.setAttribute(posType(), "") // empty
+            } else {
+                element.setAttribute(posType(), termToAdd.posOrEmpty)
+            }
             element
         }
 
-        // Empty pos if it is a PC and it contains alphanumeric characters (so it can't be PC anyway).
-        if (layer.tagset.punctuationTags.contains(termToAdd.pos) && termToAdd.literals.contains(alphaNumeric)) {
-            wTag.setAttribute(posType(), "") // empty
-        } else {
-            wTag.setAttribute(posType(), termToAdd.posOrEmpty)
-        }
-
         wTag.setAttribute("xml:id", termToAdd.targets.first().id)
         return wTag
     }
@@ -316,13 +315,12 @@ open class TEITextMerger(
         // <pc> tags do not have a lemma.
         if (element.tagName == "w") {
             element.setAttribute("lemma", termToAdd.lemmaOrEmpty)
+            element.setAttribute(posType(), termToAdd.posOrEmpty)
         }
 
         // Clear the pos if it is a PC, and it contains alphanumeric characters (so it can't be PC anyway).
         if (layer.tagset.punctuationTags.contains(termToAdd.pos) && termToAdd.literals.contains(alphaNumeric)) {
             element.setAttribute(posType(), "") // Clear the pos
-        } else {
-            element.setAttribute(posType(), termToAdd.posOrEmpty)
         }
 
         element.removeAttribute("type") // Update legacy formats to TEI p5
diff --git a/server/src/test/resources/tei/alphanumericpc/with-w-tags/merged-output.xml b/server/src/test/resources/tei/alphanumericpc/with-w-tags/merged-output.xml
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>input.tei</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">input.tei</idno><idno type="GaLAHaDPersistentIdentifier">af2b5d82-b261-4752-bd5e-cda672d5d196_tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">input.tei</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp type="annotationFormat"><interp>TEI 
xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="2024-10-16" to="2024-10-16"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader>
     <text>
-        <p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc pos="PC" xml:id="w4">!</pc> </p>
+        <p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc xml:id="w4">!</pc> </p>
     </text>
 </TEI>
diff --git a/server/src/test/resources/tei/alphanumericpc/without-w-tags/merged-output.xml b/server/src/test/resources/tei/alphanumericpc/without-w-tags/merged-output.xml
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>input.tei</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">input.tei</idno><idno type="GaLAHaDPersistentIdentifier">37bff7c0-7fd8-4523-9643-309909f0d7c9_tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">input.tei</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp type="annotationFormat"><interp>TEI 
xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="2024-10-16" to="2024-10-16"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader>
     <text>
-        <p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc pos="PC" xml:id="w4">!</pc></p>
+        <p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc xml:id="w4">!</pc></p>
     </text>
 </TEI>
diff --git a/server/src/test/resources/tei/export/punctuation-mixed-tags-merge-export-result.xml b/server/src/test/resources/tei/export/punctuation-mixed-tags-merge-export-result.xml
@@ -1,3 +1,3 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>punctutation-mixed-tags</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">punctutation-mixed-tags</idno><idno type="GaLAHaDPersistentIdentifier">__UUID_IGNORED_BY_TEST___tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">punctutation-mixed-tags</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp 
type="annotationFormat"><interp>TEI xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="__DATE_IGNORED_BY_TEST__" to="__DATE_IGNORED_BY_TEST__"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader>
-    <text><p><w lemma="punctuatie" pos="DUM" xml:id="w0">Punctuatie</w><pc pos="PC" xml:id="w1">,</pc> <w lemma="zin" pos="DUM" xml:id="w2">zin</w> <w lemma="zonder" pos="DUM" xml:id="w3">zonder</w> <w lemma="tags" pos="DUM" xml:id="w4">tags</w><pc pos="PC" xml:id="w5">.</pc><w lemma="zin" pos="DUM" xml:id="w6">Zin</w> <w lemma="met" pos="DUM" xml:id="w7">met</w> <w lemma="tags" pos="DUM" xml:id="w8">tags</w><pc pos="PC" xml:id="w9">.</pc></p></text>
+    <text><p><w lemma="punctuatie" pos="DUM" xml:id="w0">Punctuatie</w><pc xml:id="w1">,</pc> <w lemma="zin" pos="DUM" xml:id="w2">zin</w> <w lemma="zonder" pos="DUM" xml:id="w3">zonder</w> <w lemma="tags" pos="DUM" xml:id="w4">tags</w><pc xml:id="w5">.</pc><w lemma="zin" pos="DUM" xml:id="w6">Zin</w> <w lemma="met" pos="DUM" xml:id="w7">met</w> <w lemma="tags" pos="DUM" xml:id="w8">tags</w><pc xml:id="w9">.</pc></p></text>
 </TEI>
diff --git a/server/src/test/resources/tei/twine/merged-output.xml b/server/src/test/resources/tei/twine/merged-output.xml
@@ -5,9 +5,9 @@
         <body>
             <p><w lemma="net" pos="AA(degree=pos,position=free)" xml:id="w0">n<hi rend="bold">e</hi>t</w><milestone n="1va" unit="fol"/> <w lemma="zijn" pos="VRB(finiteness=fin,tense=past)" xml:id="w1"><hi rend="bold">w</hi><hi rend="bold">a</hi><hi rend="bold">s</hi></w> <w lemma="ik" pos="PD(type=pers,position=free)" xml:id="w2">ik</w><w lemma="naar" pos="ADP(type=pre)" xml:id="w3"><hi rend="bold">n</hi>aar</w> <w lemma="school" pos="NOU-C(number=sg)" xml:id="w4">sc<hi rend="bold">h</hi><hi rend="bold">o</hi>o<hi rend="bold">l</hi></w> <w lemma="heengaan" pos="ADV(type=reg)" xml:id="w5">heen</w> <w lemma="en" pos="CONJ(type=coor)" xml:id="w6">en</w> <w lemma="terugkomen" pos="VRB(finiteness=fin,tense=past)" xml:id="w7">t<hi rend="bold">e</hi>r<hi rend="bold">u</hi>g</w> <w lemma="wezen" pos="VRB(finiteness=inf)" xml:id="w8">w<hi rend="bold">eze</hi>n</w> <w lemma="lopen" pos="VRB(finiteness=inf)" xml:id="w9">lope<hi rend="bold">n</hi></w> <w lemma="hetzovoor" pos="ADV(type=reg)" xml:id="w10"><hi rend="bold">enz</hi>o</w></p>
 
-            <pc lemma="onzin" pos="LET" xml:id="w11">.</pc>
-            <pc pos="LET" xml:id="w12">.</pc>
-            <pc pos="LET" xml:id="w13">.</pc>
+            <pc lemma="onzin" xml:id="w11">.</pc>
+            <pc pos="onzin" xml:id="w12">.</pc>
+            <pc xml:id="w13">.</pc>
             <w lemma="school" pos="NOU-C(number=pl)" xml:id="w14">scholen</w>
             <w lemma="school" pos="NOU-C(number=pl)" xml:id="w15">scholen</w>