-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
tei-export: Only allow PC if literal has no alphanumeric chars
- Loading branch information
Showing
12 changed files
with
214 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
97 changes: 97 additions & 0 deletions
97
server/src/test/resources/tei/alphanumericpc/with-w-tags/converted-output.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
<?xml version="1.0" encoding="UTF-8" standalone="no"?> | ||
<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:xs="http://www.w3.org/2001/XMLSchema"> | ||
<teiHeader> | ||
<fileDesc> | ||
<titleStmt> | ||
<title>input.tei</title> | ||
<respStmt> | ||
<resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp> | ||
<orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName> | ||
<orgName xml:lang="en">Dutch Language Institute</orgName> | ||
</respStmt> | ||
<respStmt> | ||
<resp>exported as tei-p5 by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp> | ||
<orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName> | ||
<orgName xml:lang="en">Dutch Language Institute</orgName> | ||
</respStmt> | ||
</titleStmt> | ||
<publicationStmt> | ||
<publisher>!Needs to be filled in!</publisher> | ||
<idno type="sourceID">input.tei</idno> | ||
<idno type="GaLAHaDPersistentIdentifier">ec68ad7d-542c-40d6-9195-e21ccadad0d0_tei</idno> | ||
</publicationStmt> | ||
<notesStmt> | ||
<note resp="GaLAHaD" type="corpusName">testCorpus</note> | ||
<note resp="GaLAHaD" type="sourceCollection">source name</note> | ||
<note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note> | ||
</notesStmt> | ||
<sourceDesc> | ||
<ab> | ||
<idno type="sourceID">input.tei</idno> | ||
</ab> | ||
<ab type="date"> | ||
<date from="0" to="0"/> | ||
</ab> | ||
</sourceDesc> | ||
</fileDesc> | ||
<encodingDesc> | ||
<appInfo resp="GaLAHaD"> | ||
<application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"> | ||
<label>POS-tagger and lemmatiser</label> | ||
<ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/> | ||
</application> | ||
</appInfo> | ||
<editorialDecl resp="GaLAHaD"> | ||
<interpretation xml:id="A0001"> | ||
<ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"> | ||
<interpGrp type="annotationStyle"> | ||
<interp>inline</interp> | ||
</interpGrp> | ||
<interpGrp type="Documentation"> | ||
<interp/> | ||
</interpGrp> | ||
<interpGrp type="annotationSet"> | ||
<interp>TDN-Core</interp> | ||
</interpGrp> | ||
<interpGrp type="annotationDescription"> | ||
<interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp> | ||
</interpGrp> | ||
<interpGrp type="annotationFormat"> | ||
<interp>TEI xml</interp> | ||
</interpGrp> | ||
</ab> | ||
<ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"> | ||
<interpGrp type="annotationMode"> | ||
<interp>automatically annotated</interp> | ||
</interpGrp> | ||
<interpGrp type="processor"> | ||
<interp sameAs="#pie-tdn-all"/> | ||
</interpGrp> | ||
<date from="2024-10-16" to="2024-10-16"/> | ||
</ab> | ||
</interpretation> | ||
</editorialDecl> | ||
</encodingDesc> | ||
<profileDesc> | ||
<langUsage> | ||
<language ident="nl"> | ||
Dutch | ||
<interpGrp type="dominantLanguage"> | ||
<interp>true</interp> | ||
</interpGrp> | ||
</language> | ||
</langUsage> | ||
</profileDesc> | ||
</teiHeader> | ||
<text> | ||
<body> | ||
<div> | ||
<p> | ||
|
||
|
||
<w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc xml:id="w4">!</pc> | ||
</p> | ||
</div> | ||
</body> | ||
</text> | ||
</TEI> |
7 changes: 7 additions & 0 deletions
7
server/src/test/resources/tei/alphanumericpc/with-w-tags/input.tei.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
<TEI> | ||
<text> | ||
<p> | ||
<w>Ik</w> <w>"loop"</w> <w>naar:</w> <w>school</w><pc>!</pc> | ||
</p> | ||
</text> | ||
</TEI> |
5 changes: 5 additions & 0 deletions
5
server/src/test/resources/tei/alphanumericpc/with-w-tags/merged-output.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
<?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>input.tei</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">input.tei</idno><idno type="GaLAHaDPersistentIdentifier">af2b5d82-b261-4752-bd5e-cda672d5d196_tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">input.tei</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp type="annotationFormat"><interp>TEI 
xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="2024-10-16" to="2024-10-16"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader> | ||
<text> | ||
<p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc pos="PC" xml:id="w4">!</pc> </p> | ||
</text> | ||
</TEI> |
6 changes: 6 additions & 0 deletions
6
server/src/test/resources/tei/alphanumericpc/with-w-tags/pie-tdn.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
word lemma pos | ||
Ik ik PD | ||
"loop" loop PC | ||
naar: : PC | ||
school school NOU | ||
! school! PC |
3 changes: 3 additions & 0 deletions
3
server/src/test/resources/tei/alphanumericpc/with-w-tags/plaintext.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
|
||
|
||
Ik "loop" naar: school! |
7 changes: 7 additions & 0 deletions
7
server/src/test/resources/tei/alphanumericpc/without-w-tags/input.tei.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
<TEI> | ||
<text> | ||
<p> | ||
Ik "loop" naar: school! | ||
</p> | ||
</text> | ||
</TEI> |
5 changes: 5 additions & 0 deletions
5
server/src/test/resources/tei/alphanumericpc/without-w-tags/merged-output.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
<?xml version="1.0" encoding="UTF-8" standalone="no"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc><titleStmt><title>input.tei</title><respStmt><resp>linguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt><respStmt><resp>TEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)</resp><orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName><orgName xml:lang="en">Dutch Language Institute</orgName></respStmt></titleStmt><publicationStmt><publisher>!Needs to be filled in!</publisher><idno type="sourceID">input.tei</idno><idno type="GaLAHaDPersistentIdentifier">37bff7c0-7fd8-4523-9643-309909f0d7c9_tei</idno></publicationStmt><notesStmt><note resp="GaLAHaD" type="corpusName">testCorpus</note><note resp="GaLAHaD" type="sourceCollection">source name</note><note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note></notesStmt><sourceDesc><ab><idno type="sourceID">input.tei</idno></ab><ab type="date"><date from="0" to="0"/></ab></sourceDesc></fileDesc><encodingDesc><appInfo resp="GaLAHaD"><application ident="pie-tdn-all" version="1.0.1" xml:id="pie-tdn-all"><label>POS-tagger and lemmatiser</label><ptr target="https://github.com/INL/galahad-taggers-dockerized/tree/1.0.2/pie/TDN-ALL"/></application></appInfo><editorialDecl resp="GaLAHaD"><interpretation xml:id="A0001"><ab subtype="POS-tagging_lemmatisation" type="linguisticAnnotation"><interpGrp type="annotationStyle"><interp>inline</interp></interpGrp><interpGrp type="Documentation"><interp/></interpGrp><interpGrp type="annotationSet"><interp>TDN-Core</interp></interpGrp><interpGrp type="annotationDescription"><interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp></interpGrp><interpGrp type="annotationFormat"><interp>TEI 
xml</interp></interpGrp></ab><ab subtype="POS-tagging_lemmatisationProvenance1" type="linguisticAnnotation"><interpGrp type="annotationMode"><interp>automatically annotated</interp></interpGrp><interpGrp type="processor"><interp sameAs="#pie-tdn-all"/></interpGrp><date from="2024-10-16" to="2024-10-16"/></ab></interpretation></editorialDecl></encodingDesc><profileDesc><langUsage><language ident="nl">Dutch<interpGrp type="dominantLanguage"><interp>true</interp></interpGrp></language></langUsage></profileDesc></teiHeader> | ||
<text> | ||
<p> <w lemma="ik" pos="PD" xml:id="w0">Ik</w> <w lemma="loop" pos="" xml:id="w1">"loop"</w> <w lemma=":" pos="" xml:id="w2">naar:</w> <w lemma="school" pos="NOU" xml:id="w3">school</w><pc pos="PC" xml:id="w4">!</pc></p> | ||
</text> | ||
</TEI> |
6 changes: 6 additions & 0 deletions
6
server/src/test/resources/tei/alphanumericpc/without-w-tags/pie-tdn.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
word lemma pos | ||
Ik ik PD | ||
"loop" loop PC | ||
naar: : PC | ||
school school NOU | ||
! school! PC |
3 changes: 3 additions & 0 deletions
3
server/src/test/resources/tei/alphanumericpc/without-w-tags/plaintext.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
|
||
|
||
Ik "loop" naar: school! |