Skip to content

Commit

Permalink
Add the verbs conllu from Prof. Lapalme to the English lemmatizer
Browse files (browse the repository at this point in the history)
  • Loading branch information
AngledLuffa committed Jan 10, 2025
1 parent 37d9cf9 commit fe2f394
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion stanza/utils/datasets/prepare_tokenizer_treebank.py
Original file line number Diff line number Diff line change
Expand Up @@ -816,7 +816,7 @@ def build_combined_english_dataset(paths, model_type, dataset):
"""
en_combined is currently EWT, GUM, PUD, Pronouns, and handparsed
"""
- udbase_dir = paths["UDBASE"]
+ udbase_dir = paths["UDBASE_GIT"]
check_gum_ready(udbase_dir)

if dataset == 'train':
Expand Down Expand Up @@ -911,6 +911,11 @@ def build_extra_combined_english_dataset(paths, model_type, dataset):
handparsed_sentences = read_sentences_from_conllu(handparsed_path)
print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
sents.extend(handparsed_sentences)

+ handparsed_path = os.path.join(handparsed_dir, "english-lemmas-verbs", "irregularVerbs-noNnoAdj.conllu")
+ handparsed_sentences = read_sentences_from_conllu(handparsed_path)
+ print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
+ sents.extend(handparsed_sentences)
return sents

def build_extra_combined_italian_dataset(paths, model_type, dataset):
Expand Down

0 comments on commit fe2f394

Please sign in to comment.