Add the verbs conllu from Prof. Lapalme to the English lemmatizer

stanfordnlp · Jan 10, 2025 · fe2f394
1 parent 37d9cf9
commit fe2f394
Showing 1 changed file with 6 additions and 1 deletion.
diff --git a/stanza/utils/datasets/prepare_tokenizer_treebank.py b/stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -816,7 +816,7 @@ def build_combined_english_dataset(paths, model_type, dataset):
     """
     en_combined is currently EWT, GUM, PUD, Pronouns, and handparsed
     """
-    udbase_dir = paths["UDBASE"]
+    udbase_dir = paths["UDBASE_GIT"]
     check_gum_ready(udbase_dir)
 
     if dataset == 'train':
@@ -911,6 +911,11 @@ def build_extra_combined_english_dataset(paths, model_type, dataset):
             handparsed_sentences = read_sentences_from_conllu(handparsed_path)
             print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
             sents.extend(handparsed_sentences)
+
+            handparsed_path = os.path.join(handparsed_dir, "english-lemmas-verbs", "irregularVerbs-noNnoAdj.conllu")
+            handparsed_sentences = read_sentences_from_conllu(handparsed_path)
+            print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
+            sents.extend(handparsed_sentences)
     return sents
 
 def build_extra_combined_italian_dataset(paths, model_type, dataset):