Add some more French lemmas

stanfordnlp · Jan 11, 2025 · 37b0416
1 parent 56fc83e
commit 37b0416
Showing 1 changed file with 5 additions and 0 deletions.
diff --git a/stanza/utils/datasets/prepare_tokenizer_treebank.py b/stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -888,6 +888,11 @@ def build_extra_combined_french_dataset(paths, model_type, dataset):
             handparsed_sentences = read_sentences_from_conllu(handparsed_path)
             print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
             sents.extend(handparsed_sentences)
+
+            handparsed_path = os.path.join(handparsed_dir, "french-lemmas", "french1st_6thGrade.conllu")
+            handparsed_sentences = read_sentences_from_conllu(handparsed_path)
+            print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
+            sents.extend(handparsed_sentences)
     return sents
 
 def build_extra_combined_german_dataset(paths, model_type, dataset):