import nltk
from nltk import ngrams

from utils.metrics.Metrics import Metrics


class RefUniqueGram(Metrics):
    """Fraction of test n-grams that never occur in a reference corpus.

    Both files are read one sentence per line and tokenized with NLTK.
    The score is |unique test n-grams absent from the reference| divided by
    the number of test sentences, so it rewards generation that goes beyond
    the training corpus. Inspired by the evaluation in "BERT has a Mouth,
    and It Must Speak" (https://arxiv.org/abs/1902.04094).
    """

    def __init__(self, test_text='', ref_text='', gram=3):
        super().__init__()
        self.name = 'RefUniqueGram'
        self.test_data = test_text      # path to the generated/test text file
        self.ref_data = ref_text        # path to the reference (training) text file
        self.gram = gram                # n-gram order
        self.sample_size = 500
        self.test_text = None           # cache: tokenized test sentences
        self.reference_text = None      # cache: tokenized reference sentences
        # Kept for backward compatibility; caching in get_reference/get_test
        # makes the old first-call bookkeeping unnecessary.
        self.is_first = True

    def get_score(self, ignore=False):
        """Return the metric score, or 0 when *ignore* is set.

        The old ``is_first`` pre-loading step was redundant: get_ng already
        triggers (cached) loading of both corpora, so the first call used to
        tokenize each file and then immediately hit the cache again.
        """
        if ignore:
            return 0
        return self.get_ng()

    def get_ng(self):
        """Compute |test n-grams not in reference| / number of test sentences.

        Returns 0 for an empty test file instead of raising ZeroDivisionError.
        """
        ref_sentences = self.get_reference()
        test_sentences = self.get_test()
        if not test_sentences:
            return 0  # empty test corpus: avoid division by zero
        # Build the sets directly instead of materializing intermediate lists.
        ref_grams = set()
        for sentence in ref_sentences:
            ref_grams.update(self.get_gram(sentence))
        test_grams = set()
        for sentence in test_sentences:
            test_grams.update(self.get_gram(sentence))
        return len(test_grams - ref_grams) / len(test_sentences)

    def get_gram(self, tokens):
        """Return the list of ``self.gram``-gram tuples of *tokens*.

        Returns an empty list when the sentence is shorter than the n-gram
        order (nltk.ngrams would simply yield nothing, but the guard keeps
        the intent explicit).
        """
        if len(tokens) < self.gram:
            return []
        return list(ngrams(tokens, self.gram))

    def get_reference(self):
        """Tokenize and cache the reference file (one sentence per line)."""
        if self.reference_text is None:
            # Explicit encoding: the old open() depended on the platform default.
            with open(self.ref_data, encoding='utf-8') as ref_file:
                self.reference_text = [nltk.word_tokenize(line) for line in ref_file]
        return self.reference_text

    def get_test(self):
        """Tokenize and cache the test file (one sentence per line)."""
        if self.test_text is None:
            with open(self.test_data, encoding='utf-8') as test_file:
                self.test_text = [nltk.word_tokenize(line) for line in test_file]
        return self.test_text