Skip to content

Commit

Permalink
Fix some tests around pc pos
Browse files (browse the repository at this point in the history)
  • Loading branch information
PrinsINT committed Oct 31, 2024
1 parent 2741f9b commit 9e832fa
Show file tree
Hide file tree
Showing 6 changed files with 14 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,7 @@ class LayerToTEIConverter(
val alphaNumeric = Regex("""[a-zA-Z0-9]""")
if (!term.literals.contains(alphaNumeric)) {
// Interpret as punctuation only if it doesn't contain any alphanumeric characters
val pos = term.posOrEmpty.escapeXML()
writer.writeRaw("<pc pos=\"$pos\" xml:id=\"${term.targets[0].id}\">${getLiteral()}</pc>")
writer.writeRaw("<pc xml:id=\"${term.targets[0].id}\">${getLiteral()}</pc>")
} else {
// Clear the pos and interpret as <w>
val lemma = term.lemmaOrEmpty.escapeXML()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,16 +238,15 @@ open class TEITextMerger(
} else {
val element = document.createElement("w")
element.setAttribute("lemma", termToAdd.lemmaOrEmpty)
// Empty pos if it is a PC and it contains alphanumeric characters (so it can't be PC anyway).
if (layer.tagset.punctuationTags.contains(termToAdd.pos) && termToAdd.literals.contains(alphaNumeric)) {
element.setAttribute(posType(), "") // empty
} else {
element.setAttribute(posType(), termToAdd.posOrEmpty)
}
element
}

// Empty pos if it is a PC and it contains alphanumeric characters (so it can't be PC anyway).
if (layer.tagset.punctuationTags.contains(termToAdd.pos) && termToAdd.literals.contains(alphaNumeric)) {
wTag.setAttribute(posType(), "") // empty
} else {
wTag.setAttribute(posType(), termToAdd.posOrEmpty)
}

wTag.setAttribute("xml:id", termToAdd.targets.first().id)
return wTag
}
Expand Down Expand Up @@ -316,13 +315,12 @@ open class TEITextMerger(
// <pc> tags do not have a lemma.
if (element.tagName == "w") {
element.setAttribute("lemma", termToAdd.lemmaOrEmpty)
element.setAttribute(posType(), termToAdd.posOrEmpty)
}

// Clear the pos if it is a PC, and it contains alphanumeric characters (so it can't be PC anyway).
if (layer.tagset.punctuationTags.contains(termToAdd.pos) && termToAdd.literals.contains(alphaNumeric)) {
element.setAttribute(posType(), "") // Clear the pos
} else {
element.setAttribute(posType(), termToAdd.posOrEmpty)
}

element.removeAttribute("type") // Update legacy formats to TEI p5
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>input.tei</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">input.tei</idno><idno type="GaLAHaDPersistentIdentifier">af2b5d82-b261-4752-bd5e-cda672d5d196_tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">input.tei</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp type="annotationFormat"><interp>TEI 
xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="2024-10-16" to="2024-10-16"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader>
<text>
<p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc pos="PC" xml:id="w4">!</pc> </p>
<p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc xml:id="w4">!</pc> </p>
</text>
</TEI>
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>input.tei</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">input.tei</idno><idno type="GaLAHaDPersistentIdentifier">37bff7c0-7fd8-4523-9643-309909f0d7c9_tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">input.tei</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp type="annotationFormat"><interp>TEI 
xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="2024-10-16" to="2024-10-16"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader>
<text>
<p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc pos="PC" xml:id="w4">!</pc></p>
<p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc xml:id="w4">!</pc></p>
</text>
</TEI>
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>punctutation-mixed-tags</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">punctutation-mixed-tags</idno><idno type="GaLAHaDPersistentIdentifier">__UUID_IGNORED_BY_TEST___tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">punctutation-mixed-tags</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp 
type="annotationFormat"><interp>TEI xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="__DATE_IGNORED_BY_TEST__" to="__DATE_IGNORED_BY_TEST__"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader>
<text><p><w lemma="punctuatie" pos="DUM" xml:id="w0">Punctuatie</w><pc pos="PC" xml:id="w1">,</pc> <w lemma="zin" pos="DUM" xml:id="w2">zin</w> <w lemma="zonder" pos="DUM" xml:id="w3">zonder</w> <w lemma="tags" pos="DUM" xml:id="w4">tags</w><pc pos="PC" xml:id="w5">.</pc><w lemma="zin" pos="DUM" xml:id="w6">Zin</w> <w lemma="met" pos="DUM" xml:id="w7">met</w> <w lemma="tags" pos="DUM" xml:id="w8">tags</w><pc pos="PC" xml:id="w9">.</pc></p></text>
<text><p><w lemma="punctuatie" pos="DUM" xml:id="w0">Punctuatie</w><pc xml:id="w1">,</pc> <w lemma="zin" pos="DUM" xml:id="w2">zin</w> <w lemma="zonder" pos="DUM" xml:id="w3">zonder</w> <w lemma="tags" pos="DUM" xml:id="w4">tags</w><pc xml:id="w5">.</pc><w lemma="zin" pos="DUM" xml:id="w6">Zin</w> <w lemma="met" pos="DUM" xml:id="w7">met</w> <w lemma="tags" pos="DUM" xml:id="w8">tags</w><pc xml:id="w9">.</pc></p></text>
</TEI>
6 changes: 3 additions & 3 deletions server/src/test/resources/tei/twine/merged-output.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
<body>
<p><w lemma="net" pos="AA(degree=pos,position=free)" xml:id="w0">n<hi rend="bold">e</hi>t</w><milestone n="1va" unit="fol"/> <w lemma="zijn" pos="VRB(finiteness=fin,tense=past)" xml:id="w1"><hi rend="bold">w</hi><hi rend="bold">a</hi><hi rend="bold">s</hi></w> <w lemma="ik" pos="PD(type=pers,position=free)" xml:id="w2">ik</w><w lemma="naar" pos="ADP(type=pre)" xml:id="w3"><hi rend="bold">n</hi>aar</w> <w lemma="school" pos="NOU-C(number=sg)" xml:id="w4">sc<hi rend="bold">h</hi><hi rend="bold">o</hi>o<hi rend="bold">l</hi></w> <w lemma="heengaan" pos="ADV(type=reg)" xml:id="w5">heen</w> <w lemma="en" pos="CONJ(type=coor)" xml:id="w6">en</w> <w lemma="terugkomen" pos="VRB(finiteness=fin,tense=past)" xml:id="w7">t<hi rend="bold">e</hi>r<hi rend="bold">u</hi>g</w> <w lemma="wezen" pos="VRB(finiteness=inf)" xml:id="w8">w<hi rend="bold">eze</hi>n</w> <w lemma="lopen" pos="VRB(finiteness=inf)" xml:id="w9">lope<hi rend="bold">n</hi></w> <w lemma="hetzovoor" pos="ADV(type=reg)" xml:id="w10"><hi rend="bold">enz</hi>o</w></p>

<pc lemma="onzin" pos="LET" xml:id="w11">.</pc>
<pc pos="LET" xml:id="w12">.</pc>
<pc pos="LET" xml:id="w13">.</pc>
<pc lemma="onzin" xml:id="w11">.</pc>
<pc pos="onzin" xml:id="w12">.</pc>
<pc xml:id="w13">.</pc>
<w lemma="school" pos="NOU-C(number=pl)" xml:id="w14">scholen</w>
<w lemma="school" pos="NOU-C(number=pl)" xml:id="w15">scholen</w>

Expand Down

0 comments on commit 9e832fa

Please sign in to comment.