From c0940b03243f2d3bf1aca4097ceb315a36f58430 Mon Sep 17 00:00:00 2001 From: Tushaar Gangavarapu Date: Thu, 1 Dec 2022 06:01:16 -0500 Subject: [PATCH 1/9] Decoupling surprise computations from the Surprise transformer. This commit attempts to decouple the perplexity computes out of the transformer, so as to enable more flexibility in choosing several other language models, including KenLM. Furthermore, this change also implements two perplexity modules, including modifying the existing cross-entropy function to the CrossEntropy class and the KenlmPerplexity class that uses the KenLM model. Furthermore, the computation of surprise has been thread-parallelized, and the perplexity functions themselves have also been thread-parallelized for runtime efficiency. The surprise_demo has been updated to reflect the changes made. To ensure most possible backward compatibility, only the 'smooth' argument has been removed from the Surprise class signature; the fit and transform methods still run with the same arguments (use kwargs to input arguments pertaining to added functionality). Use Perplexity.config to view the config of individual perplexity classes, and Perplexity.type to view the perplexity type. Other changes include code modularization, utils addition, and other code formatting. TQDM imports have been modified to enable notebook rendering when using notebooks, and additional TQDM (disappearing) bars have been included to track the progress of surprise computations. --- convokit/surprise/__init__.py | 3 + convokit/surprise/cross_entropy.py | 57 + convokit/surprise/demos/surprise_demo.ipynb | 1743 +++++++++++++++++-- convokit/surprise/kenlm_perplexity.py | 94 + convokit/surprise/perplexity.py | 51 + convokit/surprise/surprise.py | 413 ++--- convokit/surprise/utils.py | 49 + 7 files changed, 2044 insertions(+), 366 deletions(-) create mode 100644 convokit/surprise/cross_entropy.py create mode 100644 convokit/surprise/kenlm_perplexity.py create mode 100644 convokit/surprise/perplexity.py create mode 100644 convokit/surprise/utils.py diff --git a/convokit/surprise/__init__.py b/convokit/surprise/__init__.py index d6d19d0c..0e41609b 100644 --- a/convokit/surprise/__init__.py +++ b/convokit/surprise/__init__.py @@ -1 +1,4 @@ +from .cross_entropy import CrossEntropy +from .kenlm_perplexity import KenlmPerplexity +from .perplexity import Perplexity from .surprise import * diff --git a/convokit/surprise/cross_entropy.py b/convokit/surprise/cross_entropy.py new file mode 100644 index 00000000..549f0c27 --- /dev/null +++ b/convokit/surprise/cross_entropy.py @@ -0,0 +1,57 @@ +import multiprocessing +from collections import Counter + +import numpy as np +from joblib import Parallel, delayed + +from .perplexity import Perplexity + + +class CrossEntropy(Perplexity): + """ + + :param perplexity_type: + :param kwargs: + """ + + def __init__(self, perplexity_type='convokit_cross_entropy', **kwargs): + super().__init__(perplexity_type, **kwargs) + + self._smooth = kwargs['smooth'] if 'smooth' in kwargs else True + self._n_jobs = kwargs['n_jobs'] if 'n_jobs' in kwargs else multiprocessing.cpu_count() + + @staticmethod + def __cross_entropy(target, context, smooth): + """ + + :param target: + :param context: + :param smooth: + :return: + """ + n_target, n_context = len(target), len(context) + if min(n_target, n_context) == 0: + return np.nan + + context_counts = Counter(context) + smooth_v = len(context_counts) + 1 if smooth else 0 + smooth_k = 1 if smooth else 0 + value = 0 if smooth else 1 + + return 
sum(-np.log((context_counts.get(token, value) + smooth_k) / (n_context + smooth_v)) for token in + target) / n_target + + def perplexity_fn(self, target_samples, context_samples, **kwargs): + """ + + :param target_samples: + :param context_samples: + :param kwargs: + :return: + """ + self.overwrite_args(kwargs.keys(), kwargs) + + model_scores = Parallel(n_jobs=self._n_jobs, backend='threading')( + delayed(self.__cross_entropy)(target_sample, context_sample, smooth=self._smooth) for + target_sample, context_sample in zip(target_samples, context_samples)) + return np.nanmean(model_scores) diff --git a/convokit/surprise/demos/surprise_demo.ipynb b/convokit/surprise/demos/surprise_demo.ipynb index 92946d6c..8c576272 100644 --- a/convokit/surprise/demos/surprise_demo.ipynb +++ b/convokit/surprise/demos/surprise_demo.ipynb @@ -6,22 +6,31 @@ "source": [ "Computing Surprise With ConvoKit\n", "=====================\n", - "This notebook provides a demo of how to use the Surprise transformer to compute surprise across a corpus. In this demo, we will use the Surprise transformer to compute Speaker Convo Diversity, a measure of how surprising a speaker's participation in one conversation is compared to their participation in all other conversations." + "This notebook provides a demo of how to use the Surprise transformer to compute surprise across a corpus. In this demo, we will use the Surprise transformer to compute Speaker Convo Diversity, a measure of how surprising a speaker's participation in one conversation is compared to their participation in all other conversations.\n", + "\n", + "" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "import convokit\n", "import itertools\n", + "\n", "import numpy as np\n", "import spacy\n", - "from convokit import Corpus, download, Surprise\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "import convokit\n", + "from convokit import Corpus, download\n", + "from convokit import Surprise, CrossEntropy, KenlmPerplexity\n", "from convokit.text_processing import TextProcessor, TextParser\n", - "from sklearn.feature_extraction.text import CountVectorizer" + "\n", + "from tqdm.notebook import tqdm\n", + "import pprint as pp" ] }, { @@ -35,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": { "tags": [] }, @@ -44,7 +53,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Dataset already exists at /home/axl4/.convokit/downloads/subreddit-Cornell\n" + "Dataset already exists at /Users/tushaar/.convokit/downloads/subreddit-Cornell\n" ] } ], @@ -54,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": { "tags": [] }, @@ -82,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -93,27 +102,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/axl4/ConvoKit/convokit/model/corpus.py:1213: FutureWarning: set_info() is deprecated and will be removed in a future release. Use add_meta() instead.\n", - "/home/axl4/ConvoKit/convokit/model/corpus.py:1219: FutureWarning: set_info() is deprecated and will be removed in a future release. 
Use add_meta() instead.\n" - ] - } - ], + "outputs": [], "source": [ "corpus.organize_speaker_convo_history(utterance_filter=utterance_is_valid)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -122,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -213,7 +213,7 @@ "Fencerman2 298.0" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -224,7 +224,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -233,19 +233,20 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import itertools\n", "\n", - "subset_utts = [list(corpus.get_speaker(speaker).iter_utterances(selector=utterance_is_valid)) for speaker in top_speakers]\n", + "subset_utts = [list(corpus.get_speaker(speaker).iter_utterances(selector=utterance_is_valid)) \n", + " for speaker in top_speakers]\n", "subset_corpus = Corpus(utterances=list(itertools.chain(*subset_utts)))" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": { "tags": [] }, @@ -276,47 +277,87 @@ "\n", "The transformer also has an optional `tokenizer` parameter to customize tokenization. Here we will tokenize the text outside of the surprise transformer, so our tokenizer will be an identity function.\n", "\n", - "The `smooth` parameter determines whether the transformer uses +1 laplace smoothing (`smooth = True`) or naively replaces 0 counts with 1's as the SpeakerConvoDiversity transformer does (`smooth = False`)." + "" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ef06bdb009d04568901dfdabfdaf0636", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import spacy\n", "\n", "spacy_nlp = spacy.load('en_core_web_sm', disable=['ner','parser', 'tagger', 'lemmatizer'])\n", - "for utt in subset_corpus.iter_utterances():\n", + "for utt in tqdm(subset_corpus.iter_utterances()):\n", " utt.meta['joined_tokens'] = [t.text.lower() for t in spacy_nlp(utt.text)]" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "surp = Surprise(tokenizer=lambda x: x, model_key_selector=lambda utt: '_'.join([utt.speaker.id, utt.conversation_id]), target_sample_size=100, context_sample_size=1000, n_samples=50, smooth=True)" + "surp = Surprise(tokenizer=lambda x: x, \n", + " model_key_selector=lambda utt: '_'.join([utt.speaker.id, utt.conversation_id]), \n", + " target_sample_size=100, context_sample_size=1000, n_samples=50)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "fit1: 20550it [00:16, 1283.44it/s]\n", - "fit2: 100%|██████████| 15394/15394 [00:00<00:00, 1032033.56it/s]\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7a51d74586aa4f5986d889dd7354ad65", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "fit: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "5cd7adb40a7d4d168ebf7f5611a2edc7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "fit: 0%| | 0/15394 [00:00= sample_size]) - if tokens_list.shape[0] == 0: - return None - rng = np.random.default_rng() - sample_idxes = rng.integers(0, tokens_list.shape[0], size=(n_samples)) - return np.array([rng.choice(tokens_list[i], sample_size) for i in sample_idxes]) + def __init__(self, model_key_selector, tokenizer=word_tokenize, surprise_attr_name='surprise', + target_sample_size=100, context_sample_size=100, n_samples=50, sampling_fn=random_sampler, + n_jobs=None): + self._model_key_selector = model_key_selector + self._tokenizer = tokenizer + self._surprise_attr_name = surprise_attr_name + self._target_sample_size = target_sample_size + self._context_sample_size = context_sample_size + self._n_samples = n_samples + self._sampling_fn = sampling_fn + self._n_jobs = n_jobs if n_jobs is not None else multiprocessing.cpu_count() + self._model_groups = None + + def fit(self, corpus: Corpus, text_func=None, selector=lambda utt: True): + """ + :param corpus: + :param text_func: + :param selector: + :return: + """ + self._model_groups = defaultdict(list) -class Surprise(Transformer): - """ - Computes how surprising a target (an utterance or group of utterances) is based on some context. - The measure for surprise used is cross entropy. Uses fixed size samples from target and context text - to mitigate effects of length on cross entropy. + for utt in tqdm(corpus.iter_utterances(selector=selector), desc='fit'): + key = self._model_key_selector(utt) + if text_func is not None: + if key not in self._model_groups: + self._model_groups[key] = text_func(utt) + else: + self._model_groups[key].append(utt.text) - :param model_key_selector: function that defines how utterances should be mapped to models. - Takes in an utterance and returns the key to use for mapping the utterance to a corresponding model. - :param tokenize: optional function that takes in a string and returns a list of tokens in that string. - default: nltk's word_tokenize - :param surprise_attr_name: the name for the metadata attribute to add to objects. - default: surprise - :param target_sample_size: number of tokens to sample from each target (test text). If `None`, then the entire target will be used. - :param context_sample_size: number of tokens to sample from each context (training text). If `None`, then the entire context will be used. - :param n_samples: number of samples to take for each target-context pair. - :param sampling_fn: function for generating samples of tokens. - :param smooth: whether to use laplace smoothing when calculating surprise. - """ + for key in tqdm(self._model_groups, desc='fit'): + if text_func is None: + self._model_groups[key] = [' '.join(self._model_groups[key])] + # Using `map()` with a `lambda` function is (microscopically) costlier than a list comprehension. + # Reference: https://stackoverflow.com/a/1247490/6907625. 
+ self._model_groups[key] = [self._tokenizer(utt_text) for utt_text in self._model_groups[key]] - def __init__( - self, - model_key_selector: Callable[[Utterance], str], - tokenizer: Callable[[str], List[str]] = word_tokenize, - surprise_attr_name="surprise", - target_sample_size=100, - context_sample_size=100, - n_samples=50, - sampling_fn: Callable[[np.ndarray, int], np.ndarray] = sample, - smooth: bool = True, - ): - self.model_key_selector = model_key_selector - self.tokenizer = tokenizer - self.surprise_attr_name = surprise_attr_name - self.target_sample_size = target_sample_size - self.context_sample_size = context_sample_size - self.n_samples = n_samples - self.sampling_fn = sampling_fn - self.smooth = smooth + return self - def fit( - self, - corpus: Corpus, - text_func: Callable[[Utterance], List[str]] = None, - selector: Callable[[Utterance], bool] = lambda utt: True, - ): + def _compute_surprise(self, target, context, perplexity_fn, **kwargs): """ - Fits a model for each group of utterances in a corpus. The group that an - utterance belongs to is determined by the `model_key_selector` parameter in - the transformer's constructor. - :param corpus: corpus to create models from. - :param text_func: optional function to define how the text a model is trained - on should be selected. Takes an utterance as input and returns a list of - strings to train the model corresponding to that utterance on. The model - corresponding to the utterance is determined by `self.model_key_selector`. - For every utterance corresponding to the same model key, this function - should return the same result. - If `text_func` is `None`, a model will be trained on the text from all - the utterances that belong to its group. - :param selector: determines which utterances in the corpus to train models for. + :param target: + :param context: + :param perplexity_fn: + :param kwargs: + :return: """ - self.model_groups = defaultdict(list) - for utt in tqdm(corpus.iter_utterances(selector=selector), desc="fit1"): - key = self.model_key_selector(utt) - if text_func: - if key not in self.model_groups: - self.model_groups[key] = text_func(utt) - else: - self.model_groups[key].append(utt.text) - for key in tqdm(self.model_groups, desc="fit2"): - if not text_func: - self.model_groups[key] = [" ".join(self.model_groups[key])] - self.model_groups[key] = list(map(lambda x: self.tokenizer(x), self.model_groups[key])) - return self + target_tokens = np.array(target) + context_tokens = [np.array(text) for text in context] + target_samples = self._sampling_fn([target_tokens], self._target_sample_size, self._n_samples) + context_samples = self._sampling_fn(context_tokens, self._context_sample_size, self._n_samples) + + if target_samples is None or context_samples is None: + return np.nan + return perplexity_fn(target_samples, context_samples, **kwargs) - def transform( - self, - corpus: Corpus, - obj_type: str, - group_and_models: Callable[[Utterance], Tuple[str, List[str]]] = None, - group_model_attr_key: Callable[[str, str], str] = None, - selector: Callable[[CorpusComponent], bool] = lambda _: True, - target_text_func: Callable[[Utterance], List[str]] = None, - ): + def _transform(self, corpus, obj_type, group_and_models=None, target_text_func=None, selector=lambda _: True, + group_model_attr_key=None, **kwargs): """ - Annotates `obj_type` components in a corpus with surprise scores. Should be - called after fit(). - :param corpus: corpus to compute surprise for. - :param obj_type: the type of corpus components to annotate. 
Should be either - 'utterance', 'speaker', 'conversation', or 'corpus'. - :param group_and_models: optional function that defines how an utterance should - be grouped to form a target text and what models (contexts) the group should - be compared to when calculating surprise. Takes in an utterance and returns - a tuple containing the name of the group the utterance belongs to and a - list of models to calculate how surprising that group is against. Objects - will be annotated with a metadata field `self.surprise_attr_name` that is - maps a key corresponding to the `groupname` and `modelkey` to the surprise - score for utterances in the group when compared to the model. The key used - is defined by the `group_model_attr_key` parameter. - If `group_and_models` is `None`, `self.model_key_selector` will be used - to select the group that an utterance belongs to. The surprise score will - be calculated for each group of utterances compared to the model in - `self.models` corresponding to the group. - :param group_model_attr_key: optional function to define what key should be used - for a given `groupname` and `modelkey`. - If `group_model_attr_key` is `None`, the default key used will be - "GROUP_groupname_MODEL_modelkey" unless `groupname` and `modelkey` are equal - in which case just "modelkey" will be used as the key. - :param selector: function to select objects to annotate. if function returns true, object will be annotated. - :param target_text_func: optional function to define what the target text corresponding to an utterance should be. - takes in an utterance and returns a list of string tokens + :param corpus: + :param obj_type: + :param group_and_models: + :param target_text_func: + :param selector: + :param group_model_attr_key: + :param kwargs: + :return: """ - if obj_type == "corpus": - utt_groups = defaultdict(list) - group_models = defaultdict(set) - for utt in corpus.iter_utterances(): - if group_and_models: - group_name, models = group_and_models(utt) - else: - group_name = self.model_key_selector(utt) - models = {group_name} - if target_text_func: - if group_name not in utt_groups: - utt_groups[group_name] = [target_text_func(utt)] - else: - utt_groups[group_name].append(self.tokenizer(utt.text)) - group_models[group_name].update(models) - surprise_scores = {} - for group_name in tqdm(utt_groups, desc="transform"): - for model_key in group_models[group_name]: - context = self.model_groups[model_key] - target = list(chain(*utt_groups[group_name])) - surprise_scores[ - Surprise._format_attr_key(group_name, model_key, group_model_attr_key) - ] = self._compute_surprise(target, context) - corpus.add_meta(self.surprise_attr_name, surprise_scores) - elif obj_type == "utterance": - for utt in tqdm(corpus.iter_utterances(selector=selector), desc="transform"): - if group_and_models: - group_name, models = group_and_models(utt) - surprise_scores = {} - for model_key in models: - context = self.model_groups[model_key] - target = ( - target_text_func(utt) if target_text_func else self.tokenizer(utt.text) - ) - surprise_scores[ - Surprise._format_attr_key(group_name, model_key, group_model_attr_key) - ] = self._compute_surprise(target, context) - utt.add_meta(self.surprise_attr_name, surprise_scores) - else: - group_name = self.model_key_selector(utt) - context = self.model_groups[group_name] - target = target_text_func(utt) if target_text_func else self.tokenizer(utt.text) - utt.add_meta(self.surprise_attr_name, self._compute_surprise(target, context)) + + def _update_groups_models(utt_, 
utt_groups_, group_models_): + """ + + :param utt_: + :param utt_groups_: + :param group_models_: + :return: + """ + group_name, models = group_and_models(utt_) if group_and_models else self._model_key_selector(utt_), None + models = {group_name} if models is None else models + if target_text_func: + if group_name not in utt_groups_: + utt_groups_[group_name] = [target_text_func(utt_)] + else: + utt_groups_[group_name].append(self._tokenizer(utt_.text)) + group_models_[group_name].update(models) + + def _format_attr_key(group_name, model_key, format_fn=None): + """ + + :param group_name: + :param model_key: + :param format_fn: + :return: + """ + if format_fn: + return format_fn(group_name, model_key) + if group_name == model_key: + return model_key + return f'GROUP_{group_name}__MODEL_{model_key}' + + def __surprise_score_helper(group_name, utt_group, group_models_, surprise_scores_, perplexity_fn): + """ + + :param group_name: + :param utt_group: + :param group_models_: + :param surprise_scores_: + :param perplexity_fn: + :return: + """ + for model_key in group_models_[group_name]: + assert model_key in self._model_groups, 'invalid model key' + surprise_key = _format_attr_key(group_name, model_key, group_model_attr_key) + context = self._model_groups[model_key] + target = list(chain(*utt_group)) + surprise_scores_[surprise_key] = self._compute_surprise(target, context, perplexity_fn, **kwargs) + + def _get_surprise_scores(utt_groups_, group_models_, surprise_scores_, perplexity_fn): + """ + + :param utt_groups_: + :param group_models_: + :param surprise_scores_: + :param perplexity_fn: + :return: + """ + Parallel(n_jobs=self._n_jobs, backend='threading')( + delayed(__surprise_score_helper)(group_name, utt_groups_[group_name], group_models_, surprise_scores_, + perplexity_fn) for group_name in + tqdm(utt_groups_, leave=False, desc='surprise')) + + surprise_scores = {} + perplexity = kwargs['perplexity'] if 'perplexity' in kwargs else CrossEntropy(**kwargs) + + if obj_type == 'corpus': + utt_groups, group_models = defaultdict(list), defaultdict(set) + for utt in tqdm(corpus.iter_utterances(), desc='transform'): + _update_groups_models(utt, utt_groups, group_models) + _get_surprise_scores(utt_groups, group_models, surprise_scores, perplexity.perplexity_fn) + corpus.add_meta(self._surprise_attr_name, surprise_scores) + elif obj_type == 'utterance': + for utt in tqdm(corpus.iter_utterances(selector=selector), desc='transform'): + utt_groups, group_models = defaultdict(list), defaultdict(set) + _update_groups_models(utt, utt_groups, group_models) + _get_surprise_scores(utt_groups, group_models, surprise_scores, perplexity.perplexity_fn) + utt.add_meta(self._surprise_attr_name, surprise_scores) else: - for obj in tqdm(corpus.iter_objs(obj_type, selector=selector), desc="transform"): - utt_groups = defaultdict(list) - group_models = defaultdict(set) + for obj in tqdm(corpus.iter_objs(obj_type, selector=selector), desc='transform'): + utt_groups, group_models = defaultdict(list), defaultdict(set) for utt in obj.iter_utterances(): - if group_and_models: - group_name, models = group_and_models(utt) - else: - group_name = self.model_key_selector(utt) - models = {group_name} - if target_text_func: - if group_name not in utt_groups: - utt_groups[group_name] = [target_text_func(utt)] - else: - utt_groups[group_name].append(self.tokenizer(utt.text)) - group_models[group_name].update(models) - surprise_scores = {} - for group_name in utt_groups: - for model_key in group_models[group_name]: - assert 
model_key in self.model_groups, "invalid model key" - if not self.model_groups[model_key]: - continue - context = self.model_groups[model_key] - target = list(chain(*utt_groups[group_name])) - surprise_scores[ - Surprise._format_attr_key(group_name, model_key, group_model_attr_key) - ] = self._compute_surprise(target, context) - obj.add_meta(self.surprise_attr_name, surprise_scores) + _update_groups_models(utt, utt_groups, group_models) + _get_surprise_scores(utt_groups, group_models, surprise_scores, perplexity.perplexity_fn) + obj.add_meta(self._surprise_attr_name, surprise_scores) return corpus - def _compute_surprise(self, target: List[str], context: List[List[str]]): + def transform(self, corpus: Corpus, **kwargs) -> Corpus: """ - Computes how surprising a target text is based on a context. Surprise scores are calculated using cross entropy. - To mitigate length based effects on cross entropy, several random sample of fixed sizes are taken from the traget and context. - Returns the average of the cross entropies for all pairs of samples. - :param target: a list of tokens in the target - :param context: a list of lists of tokens in each group of the context - - :return: surprise score + :param corpus: + :param kwargs: + :return: """ - target_tokens = np.array(target) - context_tokens = [np.array(text) for text in context] - target_samples = self.sampling_fn([target_tokens], self.target_sample_size, self.n_samples) - context_samples = self.sampling_fn(context_tokens, self.context_sample_size, self.n_samples) - if target_samples is None or context_samples is None: - return np.nan - return np.nanmean( - [ - _cross_entropy(target_sample, context_sample, self.smooth) - for target_sample, context_sample in zip(target_samples, context_samples) - ] - ) - - @staticmethod - def _format_attr_key(group_name, model_key, format_fn=None): - if format_fn: - return format_fn(group_name, model_key) - if group_name == model_key: - return model_key - return f"GROUP_{group_name}__MODEL_{model_key}" + return self._transform(corpus=corpus, **kwargs) diff --git a/convokit/surprise/utils.py b/convokit/surprise/utils.py new file mode 100644 index 00000000..4d230609 --- /dev/null +++ b/convokit/surprise/utils.py @@ -0,0 +1,49 @@ +import tempfile + +import numpy as np + + +def random_sampler(tokens, sample_size, n_samples): + """ + + :param tokens: + :param sample_size: + :param n_samples: + :return: + """ + if not sample_size: + assert len(tokens) == 1 + return np.tile(tokens[0], (n_samples, 1)) + + tokens_list = np.array([tokens_ for tokens_ in tokens if len(tokens_) >= sample_size]) + if tokens_list.shape[0] == 0: + return None + + rng = np.random.default_rng() + sample_idxs = rng.integers(0, tokens_list.shape[0], size=n_samples) + return np.array([rng.choice(tokens_list[idx], sample_size) for idx in sample_idxs]) + + +def create_tmp_files(num_files): + """ + + :param num_files: + :return: + """ + tmp_files = [] + for _ in range(num_files): + tmp_files.append(tempfile.NamedTemporaryFile('w', delete=True)) + return tmp_files + + +def delete_tmp_files(tmp_files): + """ + + :param tmp_files: + :return: + """ + for tmp_file in tmp_files: + try: + tmp_file.close() + except FileNotFoundError: + pass From 878a7abadf0a0ad34c93b0babf6840fa1bc6bbdb Mon Sep 17 00:00:00 2001 From: Tushaar Gangavarapu Date: Thu, 1 Dec 2022 11:37:21 -0500 Subject: [PATCH 2/9] Include typing hints for all functions in the Surprise transformer. This change adds typing hints to all the functions in the Surprise transformer. 
Further, it also includes a minor correction of having varied surprise_scores dictionary per object type. --- convokit/surprise/cross_entropy.py | 10 +- convokit/surprise/demos/surprise_demo.ipynb | 102 +++++++++----------- convokit/surprise/kenlm_perplexity.py | 12 ++- convokit/surprise/perplexity.py | 10 +- convokit/surprise/surprise.py | 52 ++++++---- convokit/surprise/utils.py | 8 +- 6 files changed, 107 insertions(+), 87 deletions(-) diff --git a/convokit/surprise/cross_entropy.py b/convokit/surprise/cross_entropy.py index 549f0c27..36a3bdf3 100644 --- a/convokit/surprise/cross_entropy.py +++ b/convokit/surprise/cross_entropy.py @@ -1,5 +1,6 @@ import multiprocessing from collections import Counter +from typing import Optional, Any, Union, List import numpy as np from joblib import Parallel, delayed @@ -14,14 +15,14 @@ class CrossEntropy(Perplexity): :param kwargs: """ - def __init__(self, perplexity_type='convokit_cross_entropy', **kwargs): + def __init__(self, perplexity_type: str = 'convokit_cross_entropy', **kwargs: Optional[Any]): super().__init__(perplexity_type, **kwargs) self._smooth = kwargs['smooth'] if 'smooth' in kwargs else True self._n_jobs = kwargs['n_jobs'] if 'n_jobs' in kwargs else multiprocessing.cpu_count() @staticmethod - def __cross_entropy(target, context, smooth): + def __cross_entropy(target: np.ndarray, context: np.ndarray, smooth: bool) -> float: """ :param target: @@ -41,7 +42,8 @@ def __cross_entropy(target, context, smooth): return sum(-np.log((context_counts.get(token, value) + smooth_k) / (n_context + smooth_v)) for token in target) / n_target - def perplexity_fn(self, target_samples, context_samples, **kwargs): + def perplexity_fn(self, target_samples: Union[List[str], np.ndarray], context_samples: Union[List[str], np.ndarray], + **kwargs: Optional[Any]) -> np.ndarray: """ :param target_samples: @@ -49,7 +51,7 @@ def perplexity_fn(self, target_samples, context_samples, **kwargs): :param kwargs: :return: """ - self.overwrite_args(kwargs.keys(), kwargs) + self.overwrite_args(list(kwargs.keys()), kwargs) model_scores = Parallel(n_jobs=self._n_jobs, backend='threading')( delayed(self.__cross_entropy)(target_sample, context_sample, smooth=self._smooth) for diff --git a/convokit/surprise/demos/surprise_demo.ipynb b/convokit/surprise/demos/surprise_demo.ipynb index 8c576272..75d680ca 100644 --- a/convokit/surprise/demos/surprise_demo.ipynb +++ b/convokit/surprise/demos/surprise_demo.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "tags": [] }, @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "tags": [] }, @@ -91,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -102,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "tags": [] }, @@ -113,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -122,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -213,7 +213,7 @@ "Fencerman2 298.0" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -224,7 +224,7 @@ }, { "cell_type": "code", - 
"execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -233,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -246,7 +246,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": { "tags": [] }, @@ -282,13 +282,13 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ef06bdb009d04568901dfdabfdaf0636", + "model_id": "06b2d18624844e8daaaca64ec131a41a", "version_major": 2, "version_minor": 0 }, @@ -310,24 +310,24 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "surp = Surprise(tokenizer=lambda x: x, \n", " model_key_selector=lambda utt: '_'.join([utt.speaker.id, utt.conversation_id]), \n", - " target_sample_size=100, context_sample_size=1000, n_samples=50)" + " target_sample_size=100, context_sample_size=1000, n_samples=50, n_jobs=16)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7a51d74586aa4f5986d889dd7354ad65", + "model_id": "b5766f2f6f6941e9a234b3a2b9194885", "version_major": 2, "version_minor": 0 }, @@ -341,7 +341,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5cd7adb40a7d4d168ebf7f5611a2edc7", + "model_id": "f920be9ba7f14ea2898f230e26242336", "version_major": 2, "version_minor": 0 }, @@ -371,7 +371,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -389,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -401,14 +401,6 @@ " 'ngram_order': 2,\n", " 'perplexity_type': 'kenlm_perplexity'}\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tushaar/Downloads/Cornell/Research/ConvoKit/convokit/surprise/kenlm_perplexity.py:28: UserWarning: the kenlm_path is unspecified, setting it to /Users/tushaar/kenlm\n", - " warnings.warn(f'the kenlm_path is unspecified, setting it to {self._kenlm_path}')\n" - ] } ], "source": [ @@ -418,7 +410,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 21, "metadata": { "tags": [] }, @@ -426,7 +418,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "28c1c21e330f4673978115fb7071e5a1", + "model_id": "71a012f861fe4208bc52e8e6058e2bac", "version_major": 2, "version_minor": 0 }, @@ -1852,7 +1844,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -1881,26 +1873,26 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "EQUASHNZRKUL_815y6t 7.242685\n", - "SwissWatchesOnly_8g5q88 7.214482\n", - "SwissWatchesOnly_67cljd 7.133236\n", - "EQUASHNZRKUL_73xuw6 7.108042\n", - "Straight_Derpin_5kst5l 7.099197\n", - "ClawofBeta_52u1nu 7.066957\n", - "syntheticity_97zg9z 7.056638\n", - "Udontlikecake_7rj6a0 7.051979\n", - "CornellMan333_9iwucv 7.040724\n", - "t3hasiangod_42k6wa 7.035798\n", + "EQUASHNZRKUL_815y6t 7.233201\n", + "SwissWatchesOnly_8g5q88 7.222188\n", + "SwissWatchesOnly_67cljd 7.140040\n", + "EQUASHNZRKUL_73xuw6 7.089441\n", + "Udontlikecake_7rj6a0 7.083908\n", + "ClawofBeta_52u1nu 
7.080123\n", + "Straight_Derpin_5kst5l 7.068058\n", + "CornellMan333_9iwucv 7.066687\n", + "laveritecestla_6v4ysm 7.053263\n", + "dedicateddan_3qq6il 7.046629\n", "dtype: float64" ] }, - "execution_count": 26, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1919,26 +1911,26 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Unga_Bunga_30ac0l 5.846026\n", - "crash_over-ride_6bjxnm 5.942989\n", - "crash_over-ride_8f7b0y 5.951988\n", - "omgdonerkebab_v4a3p 5.956038\n", - "crash_over-ride_30zba1 5.976818\n", - "Bisphosphate_7r8nu1 5.986193\n", - "crash_over-ride_7owfvv 5.990212\n", - "crash_over-ride_v4j70 5.990862\n", - "crash_over-ride_t6w01 5.992889\n", - "crash_over-ride_9b132c 6.005226\n", + "Unga_Bunga_30ac0l 5.876776\n", + "crash_over-ride_6bjxnm 5.927376\n", + "crash_over-ride_7owfvv 5.947449\n", + "crash_over-ride_8f7b0y 5.950245\n", + "crash_over-ride_t6w01 5.961168\n", + "Bisphosphate_7r8nu1 5.962373\n", + "Bisphosphate_8mbpdu 5.971744\n", + "crash_over-ride_9ghfjc 5.991398\n", + "crash_over-ride_6fyzu8 5.994424\n", + "Bisphosphate_6w00br 5.995339\n", "dtype: float64" ] }, - "execution_count": 27, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } diff --git a/convokit/surprise/kenlm_perplexity.py b/convokit/surprise/kenlm_perplexity.py index 0c7ffac7..9d9b153d 100644 --- a/convokit/surprise/kenlm_perplexity.py +++ b/convokit/surprise/kenlm_perplexity.py @@ -4,6 +4,7 @@ import sys import warnings from pathlib import Path +from typing import Optional, Any, Union, List import kenlm import numpy as np @@ -20,7 +21,7 @@ class KenlmPerplexity(Perplexity): :param kwargs: """ - def __init__(self, perplexity_type='kenlm_perplexity', **kwargs): + def __init__(self, perplexity_type: str = 'kenlm_perplexity', **kwargs: Optional[Any]): super().__init__(perplexity_type, **kwargs) self._ngram_order = kwargs['ngram_order'] if 'ngram_order' in kwargs else 2 @@ -40,7 +41,7 @@ def __init__(self, perplexity_type='kenlm_perplexity', **kwargs): self._n_jobs = kwargs['n_jobs'] if 'n_jobs' in kwargs else multiprocessing.cpu_count() @staticmethod - def __populate_train_file(filepath, samples): + def __populate_train_file(filepath: str, samples: Union[List[str], np.ndarray]): """ :param filepath: @@ -51,7 +52,7 @@ def __populate_train_file(filepath, samples): for sample in samples: f.write(f'{" ".join(sample)}\n') - def _get_kenlm_model(self, context_samples): + def _get_kenlm_model(self, context_samples: Union[List[str], np.ndarray]) -> kenlm.Model: """ :param context_samples: @@ -78,7 +79,8 @@ def _get_kenlm_model(self, context_samples): return kenlm_model - def perplexity_fn(self, target_samples, context_samples, **kwargs): + def perplexity_fn(self, target_samples: Union[List[str], np.ndarray], context_samples: Union[List[str], np.ndarray], + **kwargs: Optional[Any]) -> np.ndarray: """ :param target_samples: @@ -86,7 +88,7 @@ def perplexity_fn(self, target_samples, context_samples, **kwargs): :param kwargs: :return: """ - self.overwrite_args(kwargs.keys(), kwargs) + self.overwrite_args(list(kwargs.keys()), kwargs) kenlm_model = self._get_kenlm_model(context_samples) model_scores = Parallel(n_jobs=self._n_jobs, backend='threading')( diff --git a/convokit/surprise/perplexity.py b/convokit/surprise/perplexity.py index e1f1ed38..ce08cc68 100644 --- a/convokit/surprise/perplexity.py +++ b/convokit/surprise/perplexity.py @@ -1,4 +1,7 @@ from abc import ABC, 
abstractmethod +from typing import Optional, Any, List, Dict, Union + +import numpy as np class Perplexity(ABC): @@ -8,7 +11,7 @@ class Perplexity(ABC): :param kwargs: """ - def __init__(self, perplexity_type='', **kwargs): + def __init__(self, perplexity_type: str = '', **kwargs: Optional[Any]): self._perplexity_type = perplexity_type self.__dict__.update((f'_{arg}', value) for arg, value in kwargs.items()) @@ -29,7 +32,7 @@ def config(self): private_var_prefix = f'_{self.__class__.__name__}' return {arg[1:]: value for arg, value in self.__dict__.items() if not arg.startswith(private_var_prefix)} - def overwrite_args(self, args_to_overwrite, kwargs): + def overwrite_args(self, args_to_overwrite: List[str], kwargs: Dict[str, Any]): """ :param args_to_overwrite: @@ -40,7 +43,8 @@ def overwrite_args(self, args_to_overwrite, kwargs): self.__dict__[f'_{arg}'] = kwargs[arg] if arg in kwargs else self.__dict__[f'_{arg}'] @abstractmethod - def perplexity_fn(self, target_samples, context_samples, **kwargs): + def perplexity_fn(self, target_samples: Union[List[str], np.ndarray], context_samples: Union[List[str], np.ndarray], + **kwargs: Optional[Any]) -> np.ndarray: """ :param target_samples: diff --git a/convokit/surprise/surprise.py b/convokit/surprise/surprise.py index c083e45c..9ca2a60d 100644 --- a/convokit/surprise/surprise.py +++ b/convokit/surprise/surprise.py @@ -1,6 +1,7 @@ import multiprocessing from collections import defaultdict from itertools import chain +from typing import Callable, List, Tuple, Dict, Any, Optional, Union, Set import numpy as np from joblib import Parallel, delayed @@ -8,7 +9,7 @@ from tqdm import tqdm from convokit import Transformer -from convokit.model import Corpus +from convokit.model import Corpus, Utterance, CorpusComponent from .cross_entropy import CrossEntropy from .utils import random_sampler @@ -33,9 +34,11 @@ class Surprise(Transformer): :param n_jobs: """ - def __init__(self, model_key_selector, tokenizer=word_tokenize, surprise_attr_name='surprise', - target_sample_size=100, context_sample_size=100, n_samples=50, sampling_fn=random_sampler, - n_jobs=None): + def __init__(self, model_key_selector: Callable[[Utterance], str], + tokenizer: Callable[[str], List[str]] = word_tokenize, surprise_attr_name: str = 'surprise', + target_sample_size: int = 100, context_sample_size: int = 100, n_samples: int = 50, + sampling_fn: Callable[[List[Union[np.ndarray, List[str]]], int, int], np.ndarray] = random_sampler, + n_jobs: int = multiprocessing.cpu_count()): self._model_key_selector = model_key_selector self._tokenizer = tokenizer self._surprise_attr_name = surprise_attr_name @@ -43,10 +46,11 @@ def __init__(self, model_key_selector, tokenizer=word_tokenize, surprise_attr_na self._context_sample_size = context_sample_size self._n_samples = n_samples self._sampling_fn = sampling_fn - self._n_jobs = n_jobs if n_jobs is not None else multiprocessing.cpu_count() + self._n_jobs = n_jobs self._model_groups = None - def fit(self, corpus: Corpus, text_func=None, selector=lambda utt: True): + def fit(self, corpus: Corpus, text_func: Callable[[Utterance], List[str]] = None, + selector: Callable[[Utterance], bool] = lambda utt: True) -> Transformer: """ :param corpus: @@ -73,7 +77,10 @@ def fit(self, corpus: Corpus, text_func=None, selector=lambda utt: True): return self - def _compute_surprise(self, target, context, perplexity_fn, **kwargs): + def _compute_surprise(self, target: List[str], context: List[List[str]], + perplexity_fn: Callable[[Union[List[str], np.ndarray], 
Union[List[str], np.ndarray], + Optional[Any]], np.ndarray], + **kwargs: Optional[Any]) -> np.ndarray: """ :param target: @@ -91,8 +98,11 @@ def _compute_surprise(self, target, context, perplexity_fn, **kwargs): return np.nan return perplexity_fn(target_samples, context_samples, **kwargs) - def _transform(self, corpus, obj_type, group_and_models=None, target_text_func=None, selector=lambda _: True, - group_model_attr_key=None, **kwargs): + def _transform(self, corpus: Corpus, obj_type: str, + group_and_models: Callable[[Utterance], Tuple[str, List[str]]] = None, + target_text_func: Callable[[Utterance], List[str]] = None, + selector: Callable[[CorpusComponent], bool] = lambda _: True, + group_model_attr_key: Callable[[str, str], str] = None, **kwargs: Optional[Any]) -> Corpus: """ :param corpus: @@ -105,7 +115,8 @@ def _transform(self, corpus, obj_type, group_and_models=None, target_text_func=N :return: """ - def _update_groups_models(utt_, utt_groups_, group_models_): + def _update_groups_models(utt_: Utterance, utt_groups_: Dict[str, List[List[str]]], + group_models_: Dict[str, Set[str]]): """ :param utt_: @@ -122,7 +133,7 @@ def _update_groups_models(utt_, utt_groups_, group_models_): utt_groups_[group_name].append(self._tokenizer(utt_.text)) group_models_[group_name].update(models) - def _format_attr_key(group_name, model_key, format_fn=None): + def _format_attr_key(group_name: str, model_key: str, format_fn: Callable[[str, str], str] = None) -> str: """ :param group_name: @@ -136,7 +147,9 @@ def _format_attr_key(group_name, model_key, format_fn=None): return model_key return f'GROUP_{group_name}__MODEL_{model_key}' - def __surprise_score_helper(group_name, utt_group, group_models_, surprise_scores_, perplexity_fn): + def __surprise_score_helper(group_name: str, utt_group, group_models_, surprise_scores_: Dict, + perplexity_fn: Callable[[Union[List[str], np.ndarray], Union[List[str], np.ndarray], + Optional[Any]], np.ndarray]): """ :param group_name: @@ -153,7 +166,10 @@ def __surprise_score_helper(group_name, utt_group, group_models_, surprise_score target = list(chain(*utt_group)) surprise_scores_[surprise_key] = self._compute_surprise(target, context, perplexity_fn, **kwargs) - def _get_surprise_scores(utt_groups_, group_models_, surprise_scores_, perplexity_fn): + def _update_surprise_scores(utt_groups_: Dict[str, List[List[str]]], group_models_: Dict[str, Set[str]], + surprise_scores_: Dict[str, float], + perplexity_fn: Callable[[Union[List[str], np.ndarray], Union[List[str], np.ndarray], + Optional[Any]], np.ndarray]): """ :param utt_groups_: @@ -167,27 +183,29 @@ def _get_surprise_scores(utt_groups_, group_models_, surprise_scores_, perplexit perplexity_fn) for group_name in tqdm(utt_groups_, leave=False, desc='surprise')) - surprise_scores = {} perplexity = kwargs['perplexity'] if 'perplexity' in kwargs else CrossEntropy(**kwargs) if obj_type == 'corpus': + surprise_scores = {} utt_groups, group_models = defaultdict(list), defaultdict(set) for utt in tqdm(corpus.iter_utterances(), desc='transform'): _update_groups_models(utt, utt_groups, group_models) - _get_surprise_scores(utt_groups, group_models, surprise_scores, perplexity.perplexity_fn) + _update_surprise_scores(utt_groups, group_models, surprise_scores, perplexity.perplexity_fn) corpus.add_meta(self._surprise_attr_name, surprise_scores) elif obj_type == 'utterance': for utt in tqdm(corpus.iter_utterances(selector=selector), desc='transform'): + surprise_scores = {} utt_groups, group_models = defaultdict(list), 
defaultdict(set) _update_groups_models(utt, utt_groups, group_models) - _get_surprise_scores(utt_groups, group_models, surprise_scores, perplexity.perplexity_fn) + _update_surprise_scores(utt_groups, group_models, surprise_scores, perplexity.perplexity_fn) utt.add_meta(self._surprise_attr_name, surprise_scores) else: for obj in tqdm(corpus.iter_objs(obj_type, selector=selector), desc='transform'): + surprise_scores = {} utt_groups, group_models = defaultdict(list), defaultdict(set) for utt in obj.iter_utterances(): _update_groups_models(utt, utt_groups, group_models) - _get_surprise_scores(utt_groups, group_models, surprise_scores, perplexity.perplexity_fn) + _update_surprise_scores(utt_groups, group_models, surprise_scores, perplexity.perplexity_fn) obj.add_meta(self._surprise_attr_name, surprise_scores) return corpus diff --git a/convokit/surprise/utils.py b/convokit/surprise/utils.py index 4d230609..67f4250c 100644 --- a/convokit/surprise/utils.py +++ b/convokit/surprise/utils.py @@ -1,9 +1,11 @@ import tempfile +from typing import List, Union, IO, Optional import numpy as np -def random_sampler(tokens, sample_size, n_samples): +def random_sampler(tokens: List[Union[np.ndarray, List[str]]], sample_size: int, + n_samples: int) -> Optional[np.ndarray]: """ :param tokens: @@ -24,7 +26,7 @@ def random_sampler(tokens, sample_size, n_samples): return np.array([rng.choice(tokens_list[idx], sample_size) for idx in sample_idxs]) -def create_tmp_files(num_files): +def create_tmp_files(num_files: int) -> List[IO]: """ :param num_files: @@ -36,7 +38,7 @@ def create_tmp_files(num_files): return tmp_files -def delete_tmp_files(tmp_files): +def delete_tmp_files(tmp_files: List[IO]): """ :param tmp_files: From 859b2d7044a13584ab2b8af4858d3da27bdbde42 Mon Sep 17 00:00:00 2001 From: Tushaar Gangavarapu Date: Fri, 9 Dec 2022 02:40:46 -0500 Subject: [PATCH 3/9] Naming and testing language model refactoring changes. The change has been tested by comparing to an earlier KenLM implementation, and with the same preprocessing, the same results were observed on the tennis dataset (works on utterance-level). Further, the cross-entropy refactoring has been tested using the old ConvoKit surprise demo, and (with some randomness in choosing the context and target) the outputs seem to be about the same, i.e., almost the same ordering in most and least surprising involvements. In the tennis demo, the values remained the same. 
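For reference, a minimal usage sketch of the refactored interface, assembled from the
updated demo notebooks in this patch (the constructor and transform() arguments mirror
those notebook diffs; treat the exact names and values as illustrative, and assume
`corpus` is a ConvoKit Corpus already filtered to the utterances of interest):

    from convokit import Surprise, ConvoKitLanguageModel, Kenlm

    surp = Surprise(tokenizer=lambda x: x,
                    model_key_selector=lambda utt: '_'.join([utt.speaker.id, utt.conversation_id]),
                    target_sample_size=100, context_sample_size=1000, n_samples=50)
    surp = surp.fit(corpus)

    # Score with the smoothed ConvoKit language model; a Kenlm instance can be
    # passed as language_model instead to score with a KenLM n-gram model.
    convokit_lm = ConvoKitLanguageModel(smooth=True)
    corpus = surp.transform(corpus, obj_type='utterance',
                            language_model=convokit_lm, eval_type='cross_entropy')
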
--- .../speakerConvoDiversity2.py | 4 +- convokit/surprise/__init__.py | 6 +- convokit/surprise/convokit_lm.py | 38 + convokit/surprise/cross_entropy.py | 59 - convokit/surprise/demos/surprise_demo.ipynb | 1646 ++--------------- convokit/surprise/demos/tennis_demo.ipynb | 349 ++-- convokit/surprise/file_utils.py | 32 + convokit/surprise/kenlm.py | 129 ++ convokit/surprise/kenlm_perplexity.py | 96 - convokit/surprise/language_model.py | 78 + convokit/surprise/perplexity.py | 55 - convokit/surprise/surprise.py | 60 +- convokit/surprise/utils.py | 28 +- 13 files changed, 649 insertions(+), 1931 deletions(-) create mode 100644 convokit/surprise/convokit_lm.py delete mode 100644 convokit/surprise/cross_entropy.py create mode 100644 convokit/surprise/file_utils.py create mode 100644 convokit/surprise/kenlm.py delete mode 100644 convokit/surprise/kenlm_perplexity.py create mode 100644 convokit/surprise/language_model.py delete mode 100644 convokit/surprise/perplexity.py diff --git a/convokit/speakerConvoDiversity/speakerConvoDiversity2.py b/convokit/speakerConvoDiversity/speakerConvoDiversity2.py index 34a8054a..f0d8fd01 100644 --- a/convokit/speakerConvoDiversity/speakerConvoDiversity2.py +++ b/convokit/speakerConvoDiversity/speakerConvoDiversity2.py @@ -126,6 +126,7 @@ def transform(self, corpus): corpus, "speaker", target_text_func=lambda utt: self._get_utt_row(utt, input_table).tokens, + smooth=False ) self._set_output(corpus, input_table) return corpus @@ -163,8 +164,7 @@ def _init_surprise(self, model_key_selector): surprise_attr_name=self.surprise_attr_name, target_sample_size=target_sample_size, context_sample_size=context_sample_size, - n_samples=n_samples, - smooth=False, + n_samples=n_samples ) def _get_text_func(self, utt: Utterance, df: pd.DataFrame): diff --git a/convokit/surprise/__init__.py b/convokit/surprise/__init__.py index 0e41609b..49805c28 100644 --- a/convokit/surprise/__init__.py +++ b/convokit/surprise/__init__.py @@ -1,4 +1,4 @@ -from .cross_entropy import CrossEntropy -from .kenlm_perplexity import KenlmPerplexity -from .perplexity import Perplexity +from .convokit_lm import ConvoKitLanguageModel +from .kenlm import Kenlm +from .language_model import LanguageModel from .surprise import * diff --git a/convokit/surprise/convokit_lm.py b/convokit/surprise/convokit_lm.py new file mode 100644 index 00000000..75531e18 --- /dev/null +++ b/convokit/surprise/convokit_lm.py @@ -0,0 +1,38 @@ +from collections import Counter +from typing import Optional, Any, Union, List + +import numpy as np + +from .language_model import LanguageModel + + +class ConvoKitLanguageModel(LanguageModel): + """ + + :param model_type: + :param kwargs: + """ + + def __init__(self, model_type: str = 'convokit_lm', **kwargs: Optional[Any]): + super().__init__(model_type, **kwargs) + + self._smooth = kwargs['smooth'] if 'smooth' in kwargs else True + + def cross_entropy(self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray]) -> float: + """ + + :param target: + :param context: + :return: + """ + n_target, n_context = len(target), len(context) + if min(n_target, n_context) == 0: + return np.nan + + context_counts = Counter(context) + smooth_v = len(context_counts) + 1 if self._smooth else 0 + smooth_k = 1 if self._smooth else 0 + value = 0 if self._smooth else 1 + + return sum(-np.log((context_counts.get(token, value) + smooth_k) / (n_context + smooth_v)) for token in + target) / n_target diff --git a/convokit/surprise/cross_entropy.py b/convokit/surprise/cross_entropy.py deleted 
file mode 100644 index 36a3bdf3..00000000 --- a/convokit/surprise/cross_entropy.py +++ /dev/null @@ -1,59 +0,0 @@ -import multiprocessing -from collections import Counter -from typing import Optional, Any, Union, List - -import numpy as np -from joblib import Parallel, delayed - -from .perplexity import Perplexity - - -class CrossEntropy(Perplexity): - """ - - :param perplexity_type: - :param kwargs: - """ - - def __init__(self, perplexity_type: str = 'convokit_cross_entropy', **kwargs: Optional[Any]): - super().__init__(perplexity_type, **kwargs) - - self._smooth = kwargs['smooth'] if 'smooth' in kwargs else True - self._n_jobs = kwargs['n_jobs'] if 'n_jobs' in kwargs else multiprocessing.cpu_count() - - @staticmethod - def __cross_entropy(target: np.ndarray, context: np.ndarray, smooth: bool) -> float: - """ - - :param target: - :param context: - :param smooth: - :return: - """ - n_target, n_context = len(target), len(context) - if min(n_target, n_context) == 0: - return np.nan - - context_counts = Counter(context) - smooth_v = len(context_counts) + 1 if smooth else 0 - smooth_k = 1 if smooth else 0 - value = 0 if smooth else 1 - - return sum(-np.log((context_counts.get(token, value) + smooth_k) / (n_context + smooth_v)) for token in - target) / n_target - - def perplexity_fn(self, target_samples: Union[List[str], np.ndarray], context_samples: Union[List[str], np.ndarray], - **kwargs: Optional[Any]) -> np.ndarray: - """ - - :param target_samples: - :param context_samples: - :param kwargs: - :return: - """ - self.overwrite_args(list(kwargs.keys()), kwargs) - - model_scores = Parallel(n_jobs=self._n_jobs, backend='threading')( - delayed(self.__cross_entropy)(target_sample, context_sample, smooth=self._smooth) for - target_sample, context_sample in zip(target_samples, context_samples)) - return np.nanmean(model_scores) diff --git a/convokit/surprise/demos/surprise_demo.ipynb b/convokit/surprise/demos/surprise_demo.ipynb index 75d680ca..183ed5df 100644 --- a/convokit/surprise/demos/surprise_demo.ipynb +++ b/convokit/surprise/demos/surprise_demo.ipynb @@ -26,7 +26,7 @@ "\n", "import convokit\n", "from convokit import Corpus, download\n", - "from convokit import Surprise, CrossEntropy, KenlmPerplexity\n", + "from convokit import Surprise, ConvoKitLanguageModel, Kenlm\n", "from convokit.text_processing import TextProcessor, TextParser\n", "\n", "from tqdm.notebook import tqdm\n", @@ -288,7 +288,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "06b2d18624844e8daaaca64ec131a41a", + "model_id": "341d4b05ecde4202933ef88881e95dc0", "version_major": 2, "version_minor": 0 }, @@ -310,1528 +310,112 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "surp = Surprise(tokenizer=lambda x: x, \n", - " model_key_selector=lambda utt: '_'.join([utt.speaker.id, utt.conversation_id]), \n", - " target_sample_size=100, context_sample_size=1000, n_samples=50, n_jobs=16)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b5766f2f6f6941e9a234b3a2b9194885", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "fit: 0it [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f920be9ba7f14ea2898f230e26242336", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "fit: 0%| | 0/15394 [00:00" 
+ "fit: 0%| | 0/1 [00:00" + "transform: 0it [00:00, ?it/s]" ] }, - "execution_count": 13, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "surp.transform(subset_corpus, obj_type='utterance', selector=lambda utt: utt.meta['is_question'])" + "subset_corpus = surp.transform(subset_corpus, obj_type='utterance',\n", + " selector=lambda utt: utt.meta['is_question'], \n", + " language_model=kenlm, eval_type='cross_entropy')" ] }, { @@ -276,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -285,45 +355,46 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "7.1372781396723255" + "36.54281806945801" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", + "get_scores = lambda utterances: pd.Series([score['corpus']for score in utterances], index=utterances.index)\n", "\n", - "female_qs = pd.to_numeric(utterances[utterances['meta.player_gender'] == 'F']['meta.surprise']).dropna()\n", + "female_qs = get_scores(utterances[utterances['meta.player_gender'] == 'F']['meta.surprise']).dropna()\n", "female_qs.median()" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "7.147981123495766" + "36.73420365651448" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "male_qs = pd.to_numeric(utterances[utterances['meta.player_gender'] == 'M']['meta.surprise']).dropna()\n", + "male_qs = get_scores(utterances[utterances['meta.player_gender'] == 'M']['meta.surprise']).dropna()\n", "male_qs.median()" ] }, @@ -343,7 +414,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -353,18 +424,18 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "And when was that in the match? The first set? Second set?\n", - "When she broke you in the eighth game of the third set, she did a backhand off the net and it kind of clipped the net and you kind of netted the next one. Was that just a tough break?\n", - "You started 3Love down in the first set. You came back and won it 64. What was the turnaround for you in the opening set and on through the match?\n", - "Would you give her a good chance against Stosur in the next round?\n", - "Do you enjoy the balance of the life as a tour player and then back home in and the ability to serve your country in the military?\n" + "Does she give you a program of what you need to do?\n", + "How did you feel today? Sleepy? Awake? Energetic?\n", + "Congratulations. That seemed like a very strong win for you. How did you feel about your performance?\n", + "That could be against Sabine. I don't know the score right now.\n", + "That was a highlevel match today. You start off pretty well and fall off. I saw the entire match. What happened then? 
Because you had more troubles for the serve, I think.\n" ] } ], @@ -375,18 +446,18 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "And the second serve on the set point in the fourth set, just another day at the office?\n", - "Was it a big advantage to serve first in the third set?\n", - "But at the start of the third set again you had a little bit of a...\n", - "Speaking of the mental game, much is made of being the hunter or the hunted. For so long you were the hunted. This is the first week in a long time being the hunter. Is there a change at all in you?\n", - "How big of a deal was it get that break in the first game of the second set?\n" + "You had a bit of a slow start. Was that nerves out there?\n", + "Is it fair to say that last year you had this opponent who beat you, is that in your mind or is it old or do you want to beat him more?\n", + "Rafael Nadal said something about a special tax regime in UK tournaments yesterday. What is your feeling about that issue?\n", + "Is that a sign of lack of confidence?\n", + "When you're remembering it, are you seeing it, too?\n" ] } ], @@ -397,18 +468,18 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "No yoga, you prefer to dance? Some players do yoga.\n", - "What aspects of the match do you think were decisive, technically speaking?\n", - "Did you hear the birds? They were really crying. They were trapped and --\n", - "Did Sasha get an invitation to Kris Humphries' wedding this weekend?\n", - "Are you primarily based in Southern California or South Florida now?\n" + "With this win and also the win at the US Open, wondering if you think that you have an edge if this match goes to three sets against her? Just physically and mentally seems you've been able to outlast her in these tough, grueling matches.\n", + "He didn't say Carlos sent an email or anything?\n", + "Gulbis recently talked about top players in general?\n", + "This is only your third Grand Slam main draw match. Do you think even without all the difficulties you've had this year you would still consider that a reasonably good success rate, to win your third main draw Grand Slam match?\n", + "Your ambitions for the year have been enhanced by this tournament? What had you hoped to do first of the year coming into here?\n" ] } ], @@ -419,18 +490,18 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Are you planning to play tactically against James or Mathieu tomorrow?\n", - "Did you consider yourself a streaky player even in college?\n", - "You said you watched Scream last night to relax. Do you normally watch horror films to relax?\n", - "How do you view your secondround matchup with Bernard Tomic?\n", - "Just talk us through the messages on your kit bag.\n" + "Janko Tipsarevic came through after being down two sets. He said he hates the idea of being called a top10 player because you have this expectation that you're going to walk in and crush everybody. He felt that his energy level was down. 
Do you have to worry at all in these early rounds that you're overconfident going into matches?\n", + "Did you take anything from John Millman's performance last night?\n", + "One of the TV guys told me you said you had blistered hands.\n", + "Based on today's performance, do you think you can repeat your good result here last year?\n", + "Can you give us an assessment of what you've been doing differently this year regarding previous years at Wimbledon?\n" ] } ], @@ -450,39 +521,52 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "gender_models_surp = Surprise(model_key_selector=lambda utt: utt.meta['player_gender'], target_sample_size=10, context_sample_size=5000, surprise_attr_name='surprise_gender_model')" + "gender_models_surp = Surprise(model_key_selector=lambda utt: utt.meta['player_gender'],\n", + " target_sample_size=10, context_sample_size=5000,\n", + " surprise_attr_name='surprise_gender_model')" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "fit1: 81974it [00:00, 302952.81it/s]\n", - "fit2: 100%|██████████| 2/2 [00:12<00:00, 6.31s/it]\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ad9e3f7901064d84ac246ca0ed036d43", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "fit: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0a754d05ce3849649a81756bad73c32c", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ - "" + "fit: 0%| | 0/2 [00:00" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "gender_models_surp.transform(subset_corpus, obj_type='utterance', group_and_models=lambda utt: (utt.id, ['M', 'F']), group_model_attr_key=lambda _, m: m, selector=lambda utt: utt.meta['is_question'])" + "convokit_lm = ConvoKitLanguageModel(smooth=True)\n", + "pp.pprint(convokit_lm.config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subset_corpus = \\\n", + " gender_models_surp.transform(subset_corpus, obj_type='utterance', \n", + " group_and_models=lambda utt: (utt.id, ['M', 'F']), \n", + " group_model_attr_key=lambda _, m: m,\n", + " selector=lambda utt: utt.meta['is_question'], \n", + " language_model=convokit_lm, eval_type='cross_entropy')" ] }, { @@ -531,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -540,82 +621,86 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5.78670861966856" + "5.820603795831438" ] }, - "execution_count": 26, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "utterances[utterances['meta.player_gender'] == 'F']['meta.surprise_gender_model'].map(lambda x: x['M']).dropna().mean()" + "utterances[utterances['meta.player_gender'] == 'F'] \\\n", + " ['meta.surprise_gender_model'].map(lambda x: x['M']).dropna().mean()" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5.7477053372750335" + "5.773426120991382" ] }, - "execution_count": 27, + "execution_count": 37, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ - "utterances[utterances['meta.player_gender'] == 'F']['meta.surprise_gender_model'].map(lambda x: x['F']).dropna().mean()" + "utterances[utterances['meta.player_gender'] == 'F'] \\\n", + " ['meta.surprise_gender_model'].map(lambda x: x['F']).dropna().mean()" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5.784562889828235" + "5.79686635301196" ] }, - "execution_count": 28, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "utterances[utterances['meta.player_gender'] == 'M']['meta.surprise_gender_model'].map(lambda x: x['M']).dropna().mean()" + "utterances[utterances['meta.player_gender'] == 'M'] \\\n", + " ['meta.surprise_gender_model'].map(lambda x: x['M']).dropna().mean()" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5.81045743833415" + "5.830856237083713" ] }, - "execution_count": 29, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "utterances[utterances['meta.player_gender'] == 'M']['meta.surprise_gender_model'].map(lambda x: x['F']).dropna().mean()" + "utterances[utterances['meta.player_gender'] == 'M'] \\\n", + " ['meta.surprise_gender_model'].map(lambda x: x['F']).dropna().mean()" ] }, { @@ -628,7 +713,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -642,7 +727,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.15" } }, "nbformat": 4, diff --git a/convokit/surprise/file_utils.py b/convokit/surprise/file_utils.py new file mode 100644 index 00000000..9e86792c --- /dev/null +++ b/convokit/surprise/file_utils.py @@ -0,0 +1,32 @@ +import tempfile +from pathlib import Path +from typing import List, IO + + +def create_tmp_files(num_files: int) -> List[IO]: + """ + + :param num_files: + :return: + """ + tmp_files = [] + for _ in range(num_files): + tmp_files.append(tempfile.NamedTemporaryFile('w', delete=True)) + return tmp_files + + +def delete_files(tmp_filenames: List[str], remove_parent_dir=True): + """ + + :param tmp_filenames: + :param remove_parent_dir: + :return: + """ + tmp_filepaths = [Path(tmp_filename) for tmp_filename in tmp_filenames] + parent_dir = tmp_filepaths[0].parents[0] + + for tmp_filepath in tmp_filepaths: + Path.unlink(tmp_filepath, missing_ok=True) + + if remove_parent_dir and len(list(parent_dir.glob('*'))) == 0: + Path.rmdir(parent_dir) diff --git a/convokit/surprise/kenlm.py b/convokit/surprise/kenlm.py new file mode 100644 index 00000000..00252dc6 --- /dev/null +++ b/convokit/surprise/kenlm.py @@ -0,0 +1,129 @@ +import os +import subprocess +import time +import warnings +from pathlib import Path +from typing import Optional, Any, Union, List + +import kenlm +import numpy as np + +from .file_utils import create_tmp_files, delete_files +from .language_model import LanguageModel + + +class Kenlm(LanguageModel): + """ + + :param model_type: + :param kwargs: + """ + + def __init__(self, model_type: str = 'kenlm', **kwargs: Optional[Any]): + super().__init__(model_type, **kwargs) + + self._ngram_order = kwargs['ngram_order'] if 'ngram_order' in kwargs else 2 + if self._ngram_order < 2: + warnings.warn(f'kenlm does not support n-gram order below 2; setting 
n-gram order to 2. ' + f'See: https://github.com/kpu/kenlm/issues/171 for specifics.') + self._ngram_order = 2 + + self._is_persistent = kwargs['is_persistent'] if 'is_persistent' in kwargs else False + if self._is_persistent or 'trained_model_filepath' in kwargs: + self._is_persistent = True + self.__kenlm_model = Kenlm.load_kenlm_from_file( + kwargs['trained_model_filepath']) if 'trained_model_filepath' in kwargs else None + + if 'kenlm_path' not in kwargs: + self._kenlm_path = os.path.join(str(Path.home()), 'kenlm') + warnings.warn(f'the kenlm_path is unspecified, setting it to {self._kenlm_path}') + self.__kenlm_bin_path = os.path.join(self._kenlm_path, 'build/bin') + if not os.path.isdir(self.__kenlm_bin_path): + raise FileNotFoundError(f'the build directory for kenlm does not exist at: {self.__kenlm_bin_path}; ' + f'build kenlm {self._kenlm_path} before computing surprise scores') + + self._models_dir = kwargs['models_dir'] if 'models_dir' in kwargs else None + if self._models_dir and not os.path.exists(self._models_dir): + warnings.warn(f'creating the folder: {self._models_dir} as it does not exist') + os.makedirs(self._models_dir) + self._model_filename = kwargs['model_filename'] if 'model_filename' in kwargs else self._model_type + + @staticmethod + def load_kenlm_from_file(trained_model_filepath): + """ + + :param trained_model_filepath: + :return: + """ + kenlm_model = kenlm.Model(trained_model_filepath) + return kenlm_model + + def __make_files(self): + """ + + :return: + """ + if self._models_dir: + epoch = str(int(time.time())) + os.makedirs(os.path.join(self._models_dir, epoch)) + + train_filename = os.path.join(self._models_dir, epoch, f'{self._model_filename}.txt') + arpa_filename = os.path.join(self._models_dir, epoch, f'{self._model_filename}.arpa') + model_filename = os.path.join(self._models_dir, epoch, f'{self._model_filename}.bin') + else: + train_file, arpa_file, model_file = create_tmp_files(num_files=3) + train_filename, arpa_filename, model_filename = train_file.name, arpa_file.name, model_file.name + return train_filename, arpa_filename, model_filename + + @staticmethod + def __populate_train_file(filepath: str, samples: Union[List[List[str]], np.ndarray]): + """ + + :param filepath: + :param samples: + :return: + """ + with open(filepath, 'w', encoding='utf-8') as f: + for sample in samples: + f.write(f'{" ".join(sample).strip()}\n') + + def _get_kenlm_model(self, context_samples: Union[List[List[str]], np.ndarray]) -> kenlm.Model: + """ + + :param context_samples: + :return: + """ + train_filename, arpa_filename, model_filename = self.__make_files() + + self.__populate_train_file(train_filename, samples=context_samples) + kenlm_args = [os.path.join(self.__kenlm_bin_path, 'lmplz'), '-o', f'{self._ngram_order}', '--text', + train_filename, '--arpa', arpa_filename, '--discount_fallback'] + cmd_return = subprocess.run(kenlm_args, capture_output=False, text=True, stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT) + if cmd_return.returncode != 0: + delete_files([model_filename, arpa_filename, train_filename]) + raise RuntimeError('the kenlm model training was unsuccessful') + + kenlm_args = [os.path.join(self.__kenlm_bin_path, 'build_binary'), 'trie', arpa_filename, model_filename] + cmd_return = subprocess.run(kenlm_args, capture_output=False, text=True, stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT) + if cmd_return.returncode != 0: + delete_files([model_filename, arpa_filename, train_filename]) + raise RuntimeError('the kenlm model (binary) building 
was unsuccessful') + + kenlm_model = kenlm.Model(model_filename) + if not self._models_dir: + delete_files([model_filename, arpa_filename, train_filename]) + + return kenlm_model + + def cross_entropy(self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray]) -> float: + """ + + :param target: + :param context: + :return: + """ + if self.__kenlm_model is None or not self._is_persistent: + self.__kenlm_model = self._get_kenlm_model([context]) + return -self.__kenlm_model.score(' '.join(target)) diff --git a/convokit/surprise/kenlm_perplexity.py b/convokit/surprise/kenlm_perplexity.py deleted file mode 100644 index 9d9b153d..00000000 --- a/convokit/surprise/kenlm_perplexity.py +++ /dev/null @@ -1,96 +0,0 @@ -import multiprocessing -import os -import subprocess -import sys -import warnings -from pathlib import Path -from typing import Optional, Any, Union, List - -import kenlm -import numpy as np -from joblib import Parallel, delayed - -from .perplexity import Perplexity -from .utils import create_tmp_files, delete_tmp_files - - -class KenlmPerplexity(Perplexity): - """ - - :param perplexity_type: - :param kwargs: - """ - - def __init__(self, perplexity_type: str = 'kenlm_perplexity', **kwargs: Optional[Any]): - super().__init__(perplexity_type, **kwargs) - - self._ngram_order = kwargs['ngram_order'] if 'ngram_order' in kwargs else 2 - if self._ngram_order < 2: - warnings.warn(f'kenlm does not support n-gram order below 2; setting n-gram order to 2. ' - f'See: https://github.com/kpu/kenlm/issues/171 for specifics.') - self._ngram_order = 2 - - if 'kenlm_path' not in kwargs: - self._kenlm_path = os.path.join(str(Path.home()), 'kenlm') - warnings.warn(f'the kenlm_path is unspecified, setting it to {self._kenlm_path}') - self.__kenlm_bin_path = os.path.join(self._kenlm_path, 'build/bin') - if not os.path.isdir(self.__kenlm_bin_path): - raise FileNotFoundError(f'the build directory for kenlm does not exist at: {self.__kenlm_bin_path}; ' - f'build kenlm {self._kenlm_path} before computing surprise scores') - - self._n_jobs = kwargs['n_jobs'] if 'n_jobs' in kwargs else multiprocessing.cpu_count() - - @staticmethod - def __populate_train_file(filepath: str, samples: Union[List[str], np.ndarray]): - """ - - :param filepath: - :param samples: - :return: - """ - with open(filepath, 'w', encoding='utf-8') as f: - for sample in samples: - f.write(f'{" ".join(sample)}\n') - - def _get_kenlm_model(self, context_samples: Union[List[str], np.ndarray]) -> kenlm.Model: - """ - - :param context_samples: - :return: - """ - train_file, arpa_file, model_file = create_tmp_files(num_files=3) - - self.__populate_train_file(train_file.name, samples=context_samples) - kenlm_args = [os.path.join(self.__kenlm_bin_path, 'lmplz'), '-o', f'{self._ngram_order}', '--text', - train_file.name, '--arpa', arpa_file.name, '--discount_fallback'] - cmd_return = subprocess.run(kenlm_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr) - if cmd_return.returncode != 0: - delete_tmp_files([model_file, arpa_file, train_file]) - raise RuntimeError('the kenlm model training was unsuccessful') - - kenlm_args = [os.path.join(self.__kenlm_bin_path, 'build_binary'), 'trie', arpa_file.name, model_file.name] - cmd_return = subprocess.run(kenlm_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr) - if cmd_return.returncode != 0: - delete_tmp_files([model_file, arpa_file, train_file]) - raise RuntimeError('the kenlm model (binary) building was unsuccessful') - - kenlm_model = 
kenlm.Model(model_file.name) - delete_tmp_files([model_file, arpa_file, train_file]) - - return kenlm_model - - def perplexity_fn(self, target_samples: Union[List[str], np.ndarray], context_samples: Union[List[str], np.ndarray], - **kwargs: Optional[Any]) -> np.ndarray: - """ - - :param target_samples: - :param context_samples: - :param kwargs: - :return: - """ - self.overwrite_args(list(kwargs.keys()), kwargs) - - kenlm_model = self._get_kenlm_model(context_samples) - model_scores = Parallel(n_jobs=self._n_jobs, backend='threading')( - delayed(kenlm_model.score)(' '.join(target_sample)) for target_sample in target_samples) - return np.nanmean(model_scores) diff --git a/convokit/surprise/language_model.py b/convokit/surprise/language_model.py new file mode 100644 index 00000000..72eaf5b9 --- /dev/null +++ b/convokit/surprise/language_model.py @@ -0,0 +1,78 @@ +from abc import ABC +from typing import Optional, Any, List, Dict, Union + +import numpy as np +from joblib import Parallel, delayed + + +class LanguageModel(ABC): + """ + + :param model_type: + :param kwargs: + """ + + def __init__(self, model_type: str = 'language_model', **kwargs: Optional[Any]): + self._model_type = model_type + self._n_jobs = kwargs['n_jobs'] if 'n_jobs' in kwargs else 1 + + self.__dict__.update((f'_{arg}', value) for arg, value in kwargs.items()) + + @property + def type(self): + """ + + :return: + """ + return self._model_type + + @property + def config(self): + """ + + :return: + """ + private_var_prefix = f'_{self.__class__.__name__}' + return {arg[1:]: value for arg, value in self.__dict__.items() if not arg.startswith(private_var_prefix)} + + def _overwrite_args(self, args_to_overwrite: List[str], kwargs: Dict[str, Any]): + """ + + :param args_to_overwrite: + :param kwargs: + :return: + """ + for arg in args_to_overwrite: + self.__dict__[f'_{arg}'] = kwargs[arg] if arg in kwargs else self.__dict__[f'_{arg}'] + + def cross_entropy(self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray]) -> float: + """ + + :param target: + :param context: + :return: + """ + raise RuntimeError('cross entropy is not implemented') + + def evaluate(self, target_samples: Union[List[List[str]], np.ndarray], + context_samples: Union[List[List[str]], np.ndarray], eval_type: str = 'cross_entropy', + **kwargs: Optional[Any]) -> np.ndarray: + """ + + :param target_samples: + :param context_samples: + :param eval_type: + :param kwargs: + :return: + """ + self._overwrite_args(list(kwargs.keys()), kwargs) + eval_fn = getattr(self, eval_type) + + if self._n_jobs == 1: + model_scores = [eval_fn(target_sample, context_sample) for target_sample, context_sample in + zip(target_samples, context_samples)] + else: + model_scores = Parallel(n_jobs=self._n_jobs, backend='threading')( + delayed(eval_fn)(target_sample, context_sample) for target_sample, context_sample in + zip(target_samples, context_samples)) + return np.nanmean(model_scores) diff --git a/convokit/surprise/perplexity.py b/convokit/surprise/perplexity.py deleted file mode 100644 index ce08cc68..00000000 --- a/convokit/surprise/perplexity.py +++ /dev/null @@ -1,55 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Optional, Any, List, Dict, Union - -import numpy as np - - -class Perplexity(ABC): - """ - - :param perplexity_type: - :param kwargs: - """ - - def __init__(self, perplexity_type: str = '', **kwargs: Optional[Any]): - self._perplexity_type = perplexity_type - self.__dict__.update((f'_{arg}', value) for arg, value in kwargs.items()) - 
- @property - def type(self): - """ - - :return: - """ - return self._perplexity_type - - @property - def config(self): - """ - - :return: - """ - private_var_prefix = f'_{self.__class__.__name__}' - return {arg[1:]: value for arg, value in self.__dict__.items() if not arg.startswith(private_var_prefix)} - - def overwrite_args(self, args_to_overwrite: List[str], kwargs: Dict[str, Any]): - """ - - :param args_to_overwrite: - :param kwargs: - :return: - """ - for arg in args_to_overwrite: - self.__dict__[f'_{arg}'] = kwargs[arg] if arg in kwargs else self.__dict__[f'_{arg}'] - - @abstractmethod - def perplexity_fn(self, target_samples: Union[List[str], np.ndarray], context_samples: Union[List[str], np.ndarray], - **kwargs: Optional[Any]) -> np.ndarray: - """ - - :param target_samples: - :param context_samples: - :param kwargs: - :return: - """ - raise NotImplementedError('the subclass needs to implement it\'s own perplexity function') diff --git a/convokit/surprise/surprise.py b/convokit/surprise/surprise.py index 9ca2a60d..5949e407 100644 --- a/convokit/surprise/surprise.py +++ b/convokit/surprise/surprise.py @@ -1,4 +1,3 @@ -import multiprocessing from collections import defaultdict from itertools import chain from typing import Callable, List, Tuple, Dict, Any, Optional, Union, Set @@ -10,7 +9,7 @@ from convokit import Transformer from convokit.model import Corpus, Utterance, CorpusComponent -from .cross_entropy import CrossEntropy +from .convokit_lm import ConvoKitLanguageModel from .utils import random_sampler try: @@ -38,7 +37,7 @@ def __init__(self, model_key_selector: Callable[[Utterance], str], tokenizer: Callable[[str], List[str]] = word_tokenize, surprise_attr_name: str = 'surprise', target_sample_size: int = 100, context_sample_size: int = 100, n_samples: int = 50, sampling_fn: Callable[[List[Union[np.ndarray, List[str]]], int, int], np.ndarray] = random_sampler, - n_jobs: int = multiprocessing.cpu_count()): + n_jobs: int = 1): self._model_key_selector = model_key_selector self._tokenizer = tokenizer self._surprise_attr_name = surprise_attr_name @@ -78,14 +77,14 @@ def fit(self, corpus: Corpus, text_func: Callable[[Utterance], List[str]] = None return self def _compute_surprise(self, target: List[str], context: List[List[str]], - perplexity_fn: Callable[[Union[List[str], np.ndarray], Union[List[str], np.ndarray], - Optional[Any]], np.ndarray], + lm_evaluation_fn: Callable[[Union[List[str], np.ndarray], Union[List[str], np.ndarray], + Optional[Any]], np.ndarray], **kwargs: Optional[Any]) -> np.ndarray: """ :param target: :param context: - :param perplexity_fn: + :param lm_evaluation_fn: :param kwargs: :return: """ @@ -96,7 +95,7 @@ def _compute_surprise(self, target: List[str], context: List[List[str]], if target_samples is None or context_samples is None: return np.nan - return perplexity_fn(target_samples, context_samples, **kwargs) + return lm_evaluation_fn(target_samples, context_samples, **kwargs) def _transform(self, corpus: Corpus, obj_type: str, group_and_models: Callable[[Utterance], Tuple[str, List[str]]] = None, @@ -124,8 +123,8 @@ def _update_groups_models(utt_: Utterance, utt_groups_: Dict[str, List[List[str] :param group_models_: :return: """ - group_name, models = group_and_models(utt_) if group_and_models else self._model_key_selector(utt_), None - models = {group_name} if models is None else models + group_name, models = group_and_models(utt_) if group_and_models else (self._model_key_selector(utt_), None) + models = {group_name} if not models else models if 
target_text_func: if group_name not in utt_groups_: utt_groups_[group_name] = [target_text_func(utt_)] @@ -147,16 +146,18 @@ def _format_attr_key(group_name: str, model_key: str, format_fn: Callable[[str, return model_key return f'GROUP_{group_name}__MODEL_{model_key}' - def __surprise_score_helper(group_name: str, utt_group, group_models_, surprise_scores_: Dict, - perplexity_fn: Callable[[Union[List[str], np.ndarray], Union[List[str], np.ndarray], - Optional[Any]], np.ndarray]): + def __surprise_score_helper(group_name: str, utt_group: List[List[str]], group_models_: Dict[str, Set[str]], + surprise_scores_: Dict[str, np.ndarray], + lm_evaluation_fn: Callable[ + [Union[List[str], np.ndarray], Union[List[str], np.ndarray], + Optional[Any]], np.ndarray]): """ :param group_name: :param utt_group: :param group_models_: :param surprise_scores_: - :param perplexity_fn: + :param lm_evaluation_fn: :return: """ for model_key in group_models_[group_name]: @@ -164,40 +165,47 @@ def __surprise_score_helper(group_name: str, utt_group, group_models_, surprise_ surprise_key = _format_attr_key(group_name, model_key, group_model_attr_key) context = self._model_groups[model_key] target = list(chain(*utt_group)) - surprise_scores_[surprise_key] = self._compute_surprise(target, context, perplexity_fn, **kwargs) + surprise_scores_[surprise_key] = self._compute_surprise(target, context, lm_evaluation_fn, **kwargs) def _update_surprise_scores(utt_groups_: Dict[str, List[List[str]]], group_models_: Dict[str, Set[str]], - surprise_scores_: Dict[str, float], - perplexity_fn: Callable[[Union[List[str], np.ndarray], Union[List[str], np.ndarray], - Optional[Any]], np.ndarray]): + surprise_scores_: Dict[str, np.ndarray], + lm_evaluation_fn: Callable[ + [Union[List[str], np.ndarray], Union[List[str], np.ndarray], + Optional[Any]], np.ndarray]): """ :param utt_groups_: :param group_models_: :param surprise_scores_: - :param perplexity_fn: + :param lm_evaluation_fn: :return: """ - Parallel(n_jobs=self._n_jobs, backend='threading')( - delayed(__surprise_score_helper)(group_name, utt_groups_[group_name], group_models_, surprise_scores_, - perplexity_fn) for group_name in - tqdm(utt_groups_, leave=False, desc='surprise')) + if self._n_jobs == 1: + for group_name in tqdm(utt_groups_, leave=False, desc='surprise', delay=2): + __surprise_score_helper(group_name, utt_groups_[group_name], group_models_, surprise_scores_, + lm_evaluation_fn) + else: + Parallel(n_jobs=self._n_jobs, backend='threading')( + delayed(__surprise_score_helper)(group_name, utt_groups_[group_name], group_models_, + surprise_scores_, lm_evaluation_fn) for group_name in + tqdm(utt_groups_, leave=False, desc='surprise', delay=2)) - perplexity = kwargs['perplexity'] if 'perplexity' in kwargs else CrossEntropy(**kwargs) + language_model = kwargs['language_model'] if 'language_model' in kwargs else ConvoKitLanguageModel( + n_jobs=self._n_jobs, **kwargs) if obj_type == 'corpus': surprise_scores = {} utt_groups, group_models = defaultdict(list), defaultdict(set) for utt in tqdm(corpus.iter_utterances(), desc='transform'): _update_groups_models(utt, utt_groups, group_models) - _update_surprise_scores(utt_groups, group_models, surprise_scores, perplexity.perplexity_fn) + _update_surprise_scores(utt_groups, group_models, surprise_scores, language_model.evaluate) corpus.add_meta(self._surprise_attr_name, surprise_scores) elif obj_type == 'utterance': for utt in tqdm(corpus.iter_utterances(selector=selector), desc='transform'): surprise_scores = {} utt_groups, 
group_models = defaultdict(list), defaultdict(set) _update_groups_models(utt, utt_groups, group_models) - _update_surprise_scores(utt_groups, group_models, surprise_scores, perplexity.perplexity_fn) + _update_surprise_scores(utt_groups, group_models, surprise_scores, language_model.evaluate) utt.add_meta(self._surprise_attr_name, surprise_scores) else: for obj in tqdm(corpus.iter_objs(obj_type, selector=selector), desc='transform'): @@ -205,7 +213,7 @@ def _update_surprise_scores(utt_groups_: Dict[str, List[List[str]]], group_model utt_groups, group_models = defaultdict(list), defaultdict(set) for utt in obj.iter_utterances(): _update_groups_models(utt, utt_groups, group_models) - _update_surprise_scores(utt_groups, group_models, surprise_scores, perplexity.perplexity_fn) + _update_surprise_scores(utt_groups, group_models, surprise_scores, language_model.evaluate) obj.add_meta(self._surprise_attr_name, surprise_scores) return corpus diff --git a/convokit/surprise/utils.py b/convokit/surprise/utils.py index 67f4250c..a11e6852 100644 --- a/convokit/surprise/utils.py +++ b/convokit/surprise/utils.py @@ -1,5 +1,4 @@ -import tempfile -from typing import List, Union, IO, Optional +from typing import List, Union, Optional import numpy as np @@ -24,28 +23,3 @@ def random_sampler(tokens: List[Union[np.ndarray, List[str]]], sample_size: int, rng = np.random.default_rng() sample_idxs = rng.integers(0, tokens_list.shape[0], size=n_samples) return np.array([rng.choice(tokens_list[idx], sample_size) for idx in sample_idxs]) - - -def create_tmp_files(num_files: int) -> List[IO]: - """ - - :param num_files: - :return: - """ - tmp_files = [] - for _ in range(num_files): - tmp_files.append(tempfile.NamedTemporaryFile('w', delete=True)) - return tmp_files - - -def delete_tmp_files(tmp_files: List[IO]): - """ - - :param tmp_files: - :return: - """ - for tmp_file in tmp_files: - try: - tmp_file.close() - except FileNotFoundError: - pass From 636a85120712c0383d74f502370350b2d3065496 Mon Sep 17 00:00:00 2001 From: Tushaar Gangavarapu Date: Fri, 9 Dec 2022 03:40:50 -0500 Subject: [PATCH 4/9] Fix typing issues for newly added functions. The change make minor changes to the types of specific functions that were added after the first typing commit. 
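For reference, the functions being annotated in this patch sit on the extension point introduced earlier in the series: any LanguageModel subclass that implements cross_entropy(target, context) can be dispatched through evaluate(..., eval_type='cross_entropy'). Below is a minimal, hypothetical sketch (the class and its toy scoring function are invented for illustration, and the module import path is assumed from this patch's file layout; it is not itself part of the change):

    from typing import List, Union

    import numpy as np

    # Import path assumed from the file layout used in this patch series.
    from convokit.surprise.language_model import LanguageModel


    class UnigramNoveltyModel(LanguageModel):
        # Hypothetical toy model, shown only to illustrate the plug-in contract:
        # a subclass implementing cross_entropy(target, context) is dispatched
        # through LanguageModel.evaluate(..., eval_type='cross_entropy').
        def __init__(self, **kwargs):
            super().__init__('unigram_novelty', **kwargs)

        def cross_entropy(self, target: Union[List[str], np.ndarray],
                          context: Union[List[str], np.ndarray]) -> float:
            # Toy score: fraction of target tokens never seen in the context sample.
            context_vocab = set(context)
            return sum(token not in context_vocab for token in target) / len(target)


    toy_lm = UnigramNoveltyModel(n_jobs=1)
    score = toy_lm.evaluate(
        target_samples=[['a', 'b'], ['c']],
        context_samples=[['a', 'x'], ['c', 'c']],
        eval_type='cross_entropy',
    )  # evaluate() averages the per-sample scores with np.nanmean.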
--- convokit/surprise/demos/tennis_demo.ipynb | 12 +----------- convokit/surprise/file_utils.py | 4 ++-- convokit/surprise/kenlm.py | 10 +++++----- convokit/surprise/language_model.py | 4 ++-- convokit/surprise/surprise.py | 1 + 5 files changed, 11 insertions(+), 20 deletions(-) diff --git a/convokit/surprise/demos/tennis_demo.ipynb b/convokit/surprise/demos/tennis_demo.ipynb index 4b4ae589..ef3469fb 100644 --- a/convokit/surprise/demos/tennis_demo.ipynb +++ b/convokit/surprise/demos/tennis_demo.ipynb @@ -10,17 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.insert(0, '/Users/tushaar/Downloads/Cornell/Research/ConvoKit/')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ diff --git a/convokit/surprise/file_utils.py b/convokit/surprise/file_utils.py index 9e86792c..221a6c9a 100644 --- a/convokit/surprise/file_utils.py +++ b/convokit/surprise/file_utils.py @@ -3,7 +3,7 @@ from typing import List, IO -def create_tmp_files(num_files: int) -> List[IO]: +def create_temp_files(num_files: int) -> List[IO]: """ :param num_files: @@ -15,7 +15,7 @@ def create_tmp_files(num_files: int) -> List[IO]: return tmp_files -def delete_files(tmp_filenames: List[str], remove_parent_dir=True): +def delete_files(tmp_filenames: List[str], remove_parent_dir: bool = True): """ :param tmp_filenames: diff --git a/convokit/surprise/kenlm.py b/convokit/surprise/kenlm.py index 00252dc6..40f1cff1 100644 --- a/convokit/surprise/kenlm.py +++ b/convokit/surprise/kenlm.py @@ -3,12 +3,12 @@ import time import warnings from pathlib import Path -from typing import Optional, Any, Union, List +from typing import Optional, Any, Union, List, Tuple import kenlm import numpy as np -from .file_utils import create_tmp_files, delete_files +from .file_utils import create_temp_files, delete_files from .language_model import LanguageModel @@ -49,7 +49,7 @@ def __init__(self, model_type: str = 'kenlm', **kwargs: Optional[Any]): self._model_filename = kwargs['model_filename'] if 'model_filename' in kwargs else self._model_type @staticmethod - def load_kenlm_from_file(trained_model_filepath): + def load_kenlm_from_file(trained_model_filepath: str) -> kenlm.Model: """ :param trained_model_filepath: @@ -58,7 +58,7 @@ def load_kenlm_from_file(trained_model_filepath): kenlm_model = kenlm.Model(trained_model_filepath) return kenlm_model - def __make_files(self): + def __make_files(self) -> Tuple[str, str, str]: """ :return: @@ -71,7 +71,7 @@ def __make_files(self): arpa_filename = os.path.join(self._models_dir, epoch, f'{self._model_filename}.arpa') model_filename = os.path.join(self._models_dir, epoch, f'{self._model_filename}.bin') else: - train_file, arpa_file, model_file = create_tmp_files(num_files=3) + train_file, arpa_file, model_file = create_temp_files(num_files=3) train_filename, arpa_filename, model_filename = train_file.name, arpa_file.name, model_file.name return train_filename, arpa_filename, model_filename diff --git a/convokit/surprise/language_model.py b/convokit/surprise/language_model.py index 72eaf5b9..a721003d 100644 --- a/convokit/surprise/language_model.py +++ b/convokit/surprise/language_model.py @@ -19,7 +19,7 @@ def __init__(self, model_type: str = 'language_model', **kwargs: Optional[Any]): self.__dict__.update((f'_{arg}', value) for arg, value in kwargs.items()) @property - def type(self): + def type(self) -> str: """ :return: @@ -27,7 +27,7 @@ def 
type(self): return self._model_type @property - def config(self): + def config(self) -> Dict[str, Any]: """ :return: diff --git a/convokit/surprise/surprise.py b/convokit/surprise/surprise.py index 5949e407..bf49d071 100644 --- a/convokit/surprise/surprise.py +++ b/convokit/surprise/surprise.py @@ -3,6 +3,7 @@ from typing import Callable, List, Tuple, Dict, Any, Optional, Union, Set import numpy as np +from IPython import get_ipython from joblib import Parallel, delayed from nltk.tokenize import word_tokenize from tqdm import tqdm From 66fde3beb4dfe4df35dbbd034e232708c40f0906 Mon Sep 17 00:00:00 2001 From: Tushaar Gangavarapu Date: Fri, 16 Dec 2022 01:08:27 -0500 Subject: [PATCH 5/9] Format code and unify all the utility functions. This change formats the code in accordance with the existing black formatter, and it deletes the utility files created for the surprise transformer and includes them in the main utils file. --- convokit/surprise/convokit_lm.py | 19 ++- convokit/surprise/file_utils.py | 32 ----- convokit/surprise/kenlm.py | 122 ++++++++++++------ convokit/surprise/language_model.py | 47 ++++--- convokit/surprise/surprise.py | 193 +++++++++++++++++++--------- convokit/surprise/utils.py | 25 ---- convokit/util.py | 81 +++++++++++- setup.py | 1 + 8 files changed, 339 insertions(+), 181 deletions(-) delete mode 100644 convokit/surprise/file_utils.py delete mode 100644 convokit/surprise/utils.py diff --git a/convokit/surprise/convokit_lm.py b/convokit/surprise/convokit_lm.py index 75531e18..a7f5358d 100644 --- a/convokit/surprise/convokit_lm.py +++ b/convokit/surprise/convokit_lm.py @@ -13,12 +13,16 @@ class ConvoKitLanguageModel(LanguageModel): :param kwargs: """ - def __init__(self, model_type: str = 'convokit_lm', **kwargs: Optional[Any]): + def __init__(self, model_type: str = "convokit_lm", **kwargs: Optional[Any]): super().__init__(model_type, **kwargs) - self._smooth = kwargs['smooth'] if 'smooth' in kwargs else True + self._smooth = kwargs["smooth"] if "smooth" in kwargs else True - def cross_entropy(self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray]) -> float: + def cross_entropy( + self, + target: Union[List[str], np.ndarray], + context: Union[List[str], np.ndarray], + ) -> float: """ :param target: @@ -34,5 +38,10 @@ def cross_entropy(self, target: Union[List[str], np.ndarray], context: Union[Lis smooth_k = 1 if self._smooth else 0 value = 0 if self._smooth else 1 - return sum(-np.log((context_counts.get(token, value) + smooth_k) / (n_context + smooth_v)) for token in - target) / n_target + return ( + sum( + -np.log((context_counts.get(token, value) + smooth_k) / (n_context + smooth_v)) + for token in target + ) + / n_target + ) diff --git a/convokit/surprise/file_utils.py b/convokit/surprise/file_utils.py deleted file mode 100644 index 221a6c9a..00000000 --- a/convokit/surprise/file_utils.py +++ /dev/null @@ -1,32 +0,0 @@ -import tempfile -from pathlib import Path -from typing import List, IO - - -def create_temp_files(num_files: int) -> List[IO]: - """ - - :param num_files: - :return: - """ - tmp_files = [] - for _ in range(num_files): - tmp_files.append(tempfile.NamedTemporaryFile('w', delete=True)) - return tmp_files - - -def delete_files(tmp_filenames: List[str], remove_parent_dir: bool = True): - """ - - :param tmp_filenames: - :param remove_parent_dir: - :return: - """ - tmp_filepaths = [Path(tmp_filename) for tmp_filename in tmp_filenames] - parent_dir = tmp_filepaths[0].parents[0] - - for tmp_filepath in tmp_filepaths: - 
Path.unlink(tmp_filepath, missing_ok=True) - - if remove_parent_dir and len(list(parent_dir.glob('*'))) == 0: - Path.rmdir(parent_dir) diff --git a/convokit/surprise/kenlm.py b/convokit/surprise/kenlm.py index 40f1cff1..9830edfb 100644 --- a/convokit/surprise/kenlm.py +++ b/convokit/surprise/kenlm.py @@ -5,12 +5,20 @@ from pathlib import Path from typing import Optional, Any, Union, List, Tuple -import kenlm import numpy as np -from .file_utils import create_temp_files, delete_files +from convokit.util import create_temp_files, delete_files from .language_model import LanguageModel +try: + import kenlm +except (ModuleNotFoundError, ImportError): + raise ModuleNotFoundError( + "kenlm is not currently installed; run `pip install convokit[kenlm]` if you " + "would like to use the Kenlm language model. If kenlm installation fails, please " + "follow: https://github.com/kpu/kenlm/issues/57 to install kenlm." + ) + class Kenlm(LanguageModel): """ @@ -19,34 +27,43 @@ class Kenlm(LanguageModel): :param kwargs: """ - def __init__(self, model_type: str = 'kenlm', **kwargs: Optional[Any]): + def __init__(self, model_type: str = "kenlm", **kwargs: Optional[Any]): super().__init__(model_type, **kwargs) - self._ngram_order = kwargs['ngram_order'] if 'ngram_order' in kwargs else 2 + self._ngram_order = kwargs["ngram_order"] if "ngram_order" in kwargs else 2 if self._ngram_order < 2: - warnings.warn(f'kenlm does not support n-gram order below 2; setting n-gram order to 2. ' - f'See: https://github.com/kpu/kenlm/issues/171 for specifics.') + warnings.warn( + f"kenlm does not support n-gram order below 2; setting n-gram order to 2. " + f"See: https://github.com/kpu/kenlm/issues/171 for specifics." + ) self._ngram_order = 2 - self._is_persistent = kwargs['is_persistent'] if 'is_persistent' in kwargs else False - if self._is_persistent or 'trained_model_filepath' in kwargs: + self._is_persistent = kwargs["is_persistent"] if "is_persistent" in kwargs else False + if self._is_persistent or "trained_model_filepath" in kwargs: self._is_persistent = True - self.__kenlm_model = Kenlm.load_kenlm_from_file( - kwargs['trained_model_filepath']) if 'trained_model_filepath' in kwargs else None - - if 'kenlm_path' not in kwargs: - self._kenlm_path = os.path.join(str(Path.home()), 'kenlm') - warnings.warn(f'the kenlm_path is unspecified, setting it to {self._kenlm_path}') - self.__kenlm_bin_path = os.path.join(self._kenlm_path, 'build/bin') + self.__kenlm_model = ( + Kenlm.load_kenlm_from_file(kwargs["trained_model_filepath"]) + if "trained_model_filepath" in kwargs + else None + ) + + if "kenlm_path" not in kwargs: + self._kenlm_path = os.path.join(str(Path.home()), "kenlm") + warnings.warn(f"the kenlm_path is unspecified, setting it to {self._kenlm_path}") + self.__kenlm_bin_path = os.path.join(self._kenlm_path, "build/bin") if not os.path.isdir(self.__kenlm_bin_path): - raise FileNotFoundError(f'the build directory for kenlm does not exist at: {self.__kenlm_bin_path}; ' - f'build kenlm {self._kenlm_path} before computing surprise scores') + raise FileNotFoundError( + f"the build directory for kenlm does not exist at: {self.__kenlm_bin_path}; " + f"build kenlm {self._kenlm_path} before computing surprise scores" + ) - self._models_dir = kwargs['models_dir'] if 'models_dir' in kwargs else None + self._models_dir = kwargs["models_dir"] if "models_dir" in kwargs else None if self._models_dir and not os.path.exists(self._models_dir): - warnings.warn(f'creating the folder: {self._models_dir} as it does not exist') + 
warnings.warn(f"creating the folder: {self._models_dir} as it does not exist") os.makedirs(self._models_dir) - self._model_filename = kwargs['model_filename'] if 'model_filename' in kwargs else self._model_type + self._model_filename = ( + kwargs["model_filename"] if "model_filename" in kwargs else self._model_type + ) @staticmethod def load_kenlm_from_file(trained_model_filepath: str) -> kenlm.Model: @@ -67,12 +84,16 @@ def __make_files(self) -> Tuple[str, str, str]: epoch = str(int(time.time())) os.makedirs(os.path.join(self._models_dir, epoch)) - train_filename = os.path.join(self._models_dir, epoch, f'{self._model_filename}.txt') - arpa_filename = os.path.join(self._models_dir, epoch, f'{self._model_filename}.arpa') - model_filename = os.path.join(self._models_dir, epoch, f'{self._model_filename}.bin') + train_filename = os.path.join(self._models_dir, epoch, f"{self._model_filename}.txt") + arpa_filename = os.path.join(self._models_dir, epoch, f"{self._model_filename}.arpa") + model_filename = os.path.join(self._models_dir, epoch, f"{self._model_filename}.bin") else: train_file, arpa_file, model_file = create_temp_files(num_files=3) - train_filename, arpa_filename, model_filename = train_file.name, arpa_file.name, model_file.name + train_filename, arpa_filename, model_filename = ( + train_file.name, + arpa_file.name, + model_file.name, + ) return train_filename, arpa_filename, model_filename @staticmethod @@ -83,7 +104,7 @@ def __populate_train_file(filepath: str, samples: Union[List[List[str]], np.ndar :param samples: :return: """ - with open(filepath, 'w', encoding='utf-8') as f: + with open(filepath, "w", encoding="utf-8") as f: for sample in samples: f.write(f'{" ".join(sample).strip()}\n') @@ -96,20 +117,43 @@ def _get_kenlm_model(self, context_samples: Union[List[List[str]], np.ndarray]) train_filename, arpa_filename, model_filename = self.__make_files() self.__populate_train_file(train_filename, samples=context_samples) - kenlm_args = [os.path.join(self.__kenlm_bin_path, 'lmplz'), '-o', f'{self._ngram_order}', '--text', - train_filename, '--arpa', arpa_filename, '--discount_fallback'] - cmd_return = subprocess.run(kenlm_args, capture_output=False, text=True, stdout=subprocess.DEVNULL, - stderr=subprocess.STDOUT) + kenlm_args = [ + os.path.join(self.__kenlm_bin_path, "lmplz"), + "-o", + f"{self._ngram_order}", + "--text", + train_filename, + "--arpa", + arpa_filename, + "--discount_fallback", + ] + cmd_return = subprocess.run( + kenlm_args, + capture_output=False, + text=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) if cmd_return.returncode != 0: delete_files([model_filename, arpa_filename, train_filename]) - raise RuntimeError('the kenlm model training was unsuccessful') - - kenlm_args = [os.path.join(self.__kenlm_bin_path, 'build_binary'), 'trie', arpa_filename, model_filename] - cmd_return = subprocess.run(kenlm_args, capture_output=False, text=True, stdout=subprocess.DEVNULL, - stderr=subprocess.STDOUT) + raise RuntimeError("the kenlm model training was unsuccessful") + + kenlm_args = [ + os.path.join(self.__kenlm_bin_path, "build_binary"), + "trie", + arpa_filename, + model_filename, + ] + cmd_return = subprocess.run( + kenlm_args, + capture_output=False, + text=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) if cmd_return.returncode != 0: delete_files([model_filename, arpa_filename, train_filename]) - raise RuntimeError('the kenlm model (binary) building was unsuccessful') + raise RuntimeError("the kenlm model (binary) building was 
unsuccessful") kenlm_model = kenlm.Model(model_filename) if not self._models_dir: @@ -117,7 +161,11 @@ def _get_kenlm_model(self, context_samples: Union[List[List[str]], np.ndarray]) return kenlm_model - def cross_entropy(self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray]) -> float: + def cross_entropy( + self, + target: Union[List[str], np.ndarray], + context: Union[List[str], np.ndarray], + ) -> float: """ :param target: @@ -126,4 +174,4 @@ def cross_entropy(self, target: Union[List[str], np.ndarray], context: Union[Lis """ if self.__kenlm_model is None or not self._is_persistent: self.__kenlm_model = self._get_kenlm_model([context]) - return -self.__kenlm_model.score(' '.join(target)) + return -self.__kenlm_model.score(" ".join(target).strip()) diff --git a/convokit/surprise/language_model.py b/convokit/surprise/language_model.py index a721003d..06af4ac8 100644 --- a/convokit/surprise/language_model.py +++ b/convokit/surprise/language_model.py @@ -12,11 +12,11 @@ class LanguageModel(ABC): :param kwargs: """ - def __init__(self, model_type: str = 'language_model', **kwargs: Optional[Any]): + def __init__(self, model_type: str = "language_model", **kwargs: Optional[Any]): self._model_type = model_type - self._n_jobs = kwargs['n_jobs'] if 'n_jobs' in kwargs else 1 + self._n_jobs = kwargs["n_jobs"] if "n_jobs" in kwargs else 1 - self.__dict__.update((f'_{arg}', value) for arg, value in kwargs.items()) + self.__dict__.update((f"_{arg}", value) for arg, value in kwargs.items()) @property def type(self) -> str: @@ -32,8 +32,12 @@ def config(self) -> Dict[str, Any]: :return: """ - private_var_prefix = f'_{self.__class__.__name__}' - return {arg[1:]: value for arg, value in self.__dict__.items() if not arg.startswith(private_var_prefix)} + private_var_prefix = f"_{self.__class__.__name__}" + return { + arg[1:]: value + for arg, value in self.__dict__.items() + if not arg.startswith(private_var_prefix) + } def _overwrite_args(self, args_to_overwrite: List[str], kwargs: Dict[str, Any]): """ @@ -43,20 +47,26 @@ def _overwrite_args(self, args_to_overwrite: List[str], kwargs: Dict[str, Any]): :return: """ for arg in args_to_overwrite: - self.__dict__[f'_{arg}'] = kwargs[arg] if arg in kwargs else self.__dict__[f'_{arg}'] + self.__dict__[f"_{arg}"] = kwargs[arg] if arg in kwargs else self.__dict__[f"_{arg}"] - def cross_entropy(self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray]) -> float: + def cross_entropy( + self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] + ) -> float: """ :param target: :param context: :return: """ - raise RuntimeError('cross entropy is not implemented') - - def evaluate(self, target_samples: Union[List[List[str]], np.ndarray], - context_samples: Union[List[List[str]], np.ndarray], eval_type: str = 'cross_entropy', - **kwargs: Optional[Any]) -> np.ndarray: + raise RuntimeError("cross entropy is not implemented") + + def evaluate( + self, + target_samples: Union[List[List[str]], np.ndarray], + context_samples: Union[List[List[str]], np.ndarray], + eval_type: str = "cross_entropy", + **kwargs: Optional[Any], + ) -> np.ndarray: """ :param target_samples: @@ -69,10 +79,13 @@ def evaluate(self, target_samples: Union[List[List[str]], np.ndarray], eval_fn = getattr(self, eval_type) if self._n_jobs == 1: - model_scores = [eval_fn(target_sample, context_sample) for target_sample, context_sample in - zip(target_samples, context_samples)] + model_scores = [ + eval_fn(target_sample, context_sample) 
+ for target_sample, context_sample in zip(target_samples, context_samples) + ] else: - model_scores = Parallel(n_jobs=self._n_jobs, backend='threading')( - delayed(eval_fn)(target_sample, context_sample) for target_sample, context_sample in - zip(target_samples, context_samples)) + model_scores = Parallel(n_jobs=self._n_jobs, backend="threading")( + delayed(eval_fn)(target_sample, context_sample) + for target_sample, context_sample in zip(target_samples, context_samples) + ) return np.nanmean(model_scores) diff --git a/convokit/surprise/surprise.py b/convokit/surprise/surprise.py index bf49d071..b2fbc6bf 100644 --- a/convokit/surprise/surprise.py +++ b/convokit/surprise/surprise.py @@ -10,12 +10,12 @@ from convokit import Transformer from convokit.model import Corpus, Utterance, CorpusComponent +from convokit.util import random_sampler from .convokit_lm import ConvoKitLanguageModel -from .utils import random_sampler try: shell = get_ipython().__class__.__name__ - if shell == 'ZMQInteractiveShell' or shell == 'TerminalInteractiveShell': + if shell == "ZMQInteractiveShell" or shell == "TerminalInteractiveShell": from tqdm.notebook import tqdm except NameError: pass @@ -34,11 +34,19 @@ class Surprise(Transformer): :param n_jobs: """ - def __init__(self, model_key_selector: Callable[[Utterance], str], - tokenizer: Callable[[str], List[str]] = word_tokenize, surprise_attr_name: str = 'surprise', - target_sample_size: int = 100, context_sample_size: int = 100, n_samples: int = 50, - sampling_fn: Callable[[List[Union[np.ndarray, List[str]]], int, int], np.ndarray] = random_sampler, - n_jobs: int = 1): + def __init__( + self, + model_key_selector: Callable[[Utterance], str], + tokenizer: Callable[[str], List[str]] = word_tokenize, + surprise_attr_name: str = "surprise", + target_sample_size: int = 100, + context_sample_size: int = 100, + n_samples: int = 50, + sampling_fn: Callable[ + [List[Union[np.ndarray, List[str]]], int, int], np.ndarray + ] = random_sampler, + n_jobs: int = 1, + ): self._model_key_selector = model_key_selector self._tokenizer = tokenizer self._surprise_attr_name = surprise_attr_name @@ -49,8 +57,12 @@ def __init__(self, model_key_selector: Callable[[Utterance], str], self._n_jobs = n_jobs self._model_groups = None - def fit(self, corpus: Corpus, text_func: Callable[[Utterance], List[str]] = None, - selector: Callable[[Utterance], bool] = lambda utt: True) -> Transformer: + def fit( + self, + corpus: Corpus, + text_func: Callable[[Utterance], List[str]] = None, + selector: Callable[[Utterance], bool] = lambda utt: True, + ) -> Transformer: """ :param corpus: @@ -60,7 +72,7 @@ def fit(self, corpus: Corpus, text_func: Callable[[Utterance], List[str]] = None """ self._model_groups = defaultdict(list) - for utt in tqdm(corpus.iter_utterances(selector=selector), desc='fit'): + for utt in tqdm(corpus.iter_utterances(selector=selector), desc="fit"): key = self._model_key_selector(utt) if text_func is not None: if key not in self._model_groups: @@ -68,19 +80,27 @@ def fit(self, corpus: Corpus, text_func: Callable[[Utterance], List[str]] = None else: self._model_groups[key].append(utt.text) - for key in tqdm(self._model_groups, desc='fit'): + for key in tqdm(self._model_groups, desc="fit"): if text_func is None: - self._model_groups[key] = [' '.join(self._model_groups[key])] + self._model_groups[key] = [" ".join(self._model_groups[key])] # Using `map()` with a `lambda` function is (microscopically) costlier than a list comprehension. 
# Reference: https://stackoverflow.com/a/1247490/6907625. - self._model_groups[key] = [self._tokenizer(utt_text) for utt_text in self._model_groups[key]] + self._model_groups[key] = [ + self._tokenizer(utt_text) for utt_text in self._model_groups[key] + ] return self - def _compute_surprise(self, target: List[str], context: List[List[str]], - lm_evaluation_fn: Callable[[Union[List[str], np.ndarray], Union[List[str], np.ndarray], - Optional[Any]], np.ndarray], - **kwargs: Optional[Any]) -> np.ndarray: + def _compute_surprise( + self, + target: List[str], + context: List[List[str]], + lm_evaluation_fn: Callable[ + [Union[List[str], np.ndarray], Union[List[str], np.ndarray], Optional[Any]], + np.ndarray, + ], + **kwargs: Optional[Any], + ) -> np.ndarray: """ :param target: @@ -91,18 +111,27 @@ def _compute_surprise(self, target: List[str], context: List[List[str]], """ target_tokens = np.array(target) context_tokens = [np.array(text) for text in context] - target_samples = self._sampling_fn([target_tokens], self._target_sample_size, self._n_samples) - context_samples = self._sampling_fn(context_tokens, self._context_sample_size, self._n_samples) + target_samples = self._sampling_fn( + [target_tokens], self._target_sample_size, self._n_samples + ) + context_samples = self._sampling_fn( + context_tokens, self._context_sample_size, self._n_samples + ) if target_samples is None or context_samples is None: return np.nan return lm_evaluation_fn(target_samples, context_samples, **kwargs) - def _transform(self, corpus: Corpus, obj_type: str, - group_and_models: Callable[[Utterance], Tuple[str, List[str]]] = None, - target_text_func: Callable[[Utterance], List[str]] = None, - selector: Callable[[CorpusComponent], bool] = lambda _: True, - group_model_attr_key: Callable[[str, str], str] = None, **kwargs: Optional[Any]) -> Corpus: + def _transform( + self, + corpus: Corpus, + obj_type: str, + group_and_models: Callable[[Utterance], Tuple[str, List[str]]] = None, + target_text_func: Callable[[Utterance], List[str]] = None, + selector: Callable[[CorpusComponent], bool] = lambda _: True, + group_model_attr_key: Callable[[str, str], str] = None, + **kwargs: Optional[Any], + ) -> Corpus: """ :param corpus: @@ -115,8 +144,11 @@ def _transform(self, corpus: Corpus, obj_type: str, :return: """ - def _update_groups_models(utt_: Utterance, utt_groups_: Dict[str, List[List[str]]], - group_models_: Dict[str, Set[str]]): + def _update_groups_models( + utt_: Utterance, + utt_groups_: Dict[str, List[List[str]]], + group_models_: Dict[str, Set[str]], + ): """ :param utt_: @@ -124,7 +156,11 @@ def _update_groups_models(utt_: Utterance, utt_groups_: Dict[str, List[List[str] :param group_models_: :return: """ - group_name, models = group_and_models(utt_) if group_and_models else (self._model_key_selector(utt_), None) + group_name, models = ( + group_and_models(utt_) + if group_and_models + else (self._model_key_selector(utt_), None) + ) models = {group_name} if not models else models if target_text_func: if group_name not in utt_groups_: @@ -133,7 +169,9 @@ def _update_groups_models(utt_: Utterance, utt_groups_: Dict[str, List[List[str] utt_groups_[group_name].append(self._tokenizer(utt_.text)) group_models_[group_name].update(models) - def _format_attr_key(group_name: str, model_key: str, format_fn: Callable[[str, str], str] = None) -> str: + def _format_attr_key( + group_name: str, model_key: str, format_fn: Callable[[str, str], str] = None + ) -> str: """ :param group_name: @@ -145,13 +183,22 @@ def 
_format_attr_key(group_name: str, model_key: str, format_fn: Callable[[str, return format_fn(group_name, model_key) if group_name == model_key: return model_key - return f'GROUP_{group_name}__MODEL_{model_key}' + return f"GROUP_{group_name}__MODEL_{model_key}" - def __surprise_score_helper(group_name: str, utt_group: List[List[str]], group_models_: Dict[str, Set[str]], - surprise_scores_: Dict[str, np.ndarray], - lm_evaluation_fn: Callable[ - [Union[List[str], np.ndarray], Union[List[str], np.ndarray], - Optional[Any]], np.ndarray]): + def __surprise_score_helper( + group_name: str, + utt_group: List[List[str]], + group_models_: Dict[str, Set[str]], + surprise_scores_: Dict[str, np.ndarray], + lm_evaluation_fn: Callable[ + [ + Union[List[str], np.ndarray], + Union[List[str], np.ndarray], + Optional[Any], + ], + np.ndarray, + ], + ): """ :param group_name: @@ -162,17 +209,27 @@ def __surprise_score_helper(group_name: str, utt_group: List[List[str]], group_m :return: """ for model_key in group_models_[group_name]: - assert model_key in self._model_groups, 'invalid model key' + assert model_key in self._model_groups, "invalid model key" surprise_key = _format_attr_key(group_name, model_key, group_model_attr_key) context = self._model_groups[model_key] target = list(chain(*utt_group)) - surprise_scores_[surprise_key] = self._compute_surprise(target, context, lm_evaluation_fn, **kwargs) + surprise_scores_[surprise_key] = self._compute_surprise( + target, context, lm_evaluation_fn, **kwargs + ) - def _update_surprise_scores(utt_groups_: Dict[str, List[List[str]]], group_models_: Dict[str, Set[str]], - surprise_scores_: Dict[str, np.ndarray], - lm_evaluation_fn: Callable[ - [Union[List[str], np.ndarray], Union[List[str], np.ndarray], - Optional[Any]], np.ndarray]): + def _update_surprise_scores( + utt_groups_: Dict[str, List[List[str]]], + group_models_: Dict[str, Set[str]], + surprise_scores_: Dict[str, np.ndarray], + lm_evaluation_fn: Callable[ + [ + Union[List[str], np.ndarray], + Union[List[str], np.ndarray], + Optional[Any], + ], + np.ndarray, + ], + ): """ :param utt_groups_: @@ -182,39 +239,59 @@ def _update_surprise_scores(utt_groups_: Dict[str, List[List[str]]], group_model :return: """ if self._n_jobs == 1: - for group_name in tqdm(utt_groups_, leave=False, desc='surprise', delay=2): - __surprise_score_helper(group_name, utt_groups_[group_name], group_models_, surprise_scores_, - lm_evaluation_fn) + for group_name in tqdm(utt_groups_, leave=False, desc="surprise", delay=2): + __surprise_score_helper( + group_name, + utt_groups_[group_name], + group_models_, + surprise_scores_, + lm_evaluation_fn, + ) else: - Parallel(n_jobs=self._n_jobs, backend='threading')( - delayed(__surprise_score_helper)(group_name, utt_groups_[group_name], group_models_, - surprise_scores_, lm_evaluation_fn) for group_name in - tqdm(utt_groups_, leave=False, desc='surprise', delay=2)) + Parallel(n_jobs=self._n_jobs, backend="threading")( + delayed(__surprise_score_helper)( + group_name, + utt_groups_[group_name], + group_models_, + surprise_scores_, + lm_evaluation_fn, + ) + for group_name in tqdm(utt_groups_, leave=False, desc="surprise", delay=2) + ) - language_model = kwargs['language_model'] if 'language_model' in kwargs else ConvoKitLanguageModel( - n_jobs=self._n_jobs, **kwargs) + language_model = ( + kwargs["language_model"] + if "language_model" in kwargs + else ConvoKitLanguageModel(n_jobs=self._n_jobs, **kwargs) + ) - if obj_type == 'corpus': + if obj_type == "corpus": surprise_scores = {} utt_groups, 
group_models = defaultdict(list), defaultdict(set) - for utt in tqdm(corpus.iter_utterances(), desc='transform'): + for utt in tqdm(corpus.iter_utterances(), desc="transform"): _update_groups_models(utt, utt_groups, group_models) - _update_surprise_scores(utt_groups, group_models, surprise_scores, language_model.evaluate) + _update_surprise_scores( + utt_groups, group_models, surprise_scores, language_model.evaluate + ) corpus.add_meta(self._surprise_attr_name, surprise_scores) - elif obj_type == 'utterance': - for utt in tqdm(corpus.iter_utterances(selector=selector), desc='transform'): + elif obj_type == "utterance": + for utt in tqdm(corpus.iter_utterances(selector=selector), desc="transform"): surprise_scores = {} utt_groups, group_models = defaultdict(list), defaultdict(set) _update_groups_models(utt, utt_groups, group_models) - _update_surprise_scores(utt_groups, group_models, surprise_scores, language_model.evaluate) + _update_surprise_scores( + utt_groups, group_models, surprise_scores, language_model.evaluate + ) utt.add_meta(self._surprise_attr_name, surprise_scores) else: - for obj in tqdm(corpus.iter_objs(obj_type, selector=selector), desc='transform'): + for obj in tqdm(corpus.iter_objs(obj_type, selector=selector), desc="transform"): surprise_scores = {} utt_groups, group_models = defaultdict(list), defaultdict(set) for utt in obj.iter_utterances(): _update_groups_models(utt, utt_groups, group_models) - _update_surprise_scores(utt_groups, group_models, surprise_scores, language_model.evaluate) + _update_surprise_scores( + utt_groups, group_models, surprise_scores, language_model.evaluate + ) obj.add_meta(self._surprise_attr_name, surprise_scores) return corpus diff --git a/convokit/surprise/utils.py b/convokit/surprise/utils.py deleted file mode 100644 index a11e6852..00000000 --- a/convokit/surprise/utils.py +++ /dev/null @@ -1,25 +0,0 @@ -from typing import List, Union, Optional - -import numpy as np - - -def random_sampler(tokens: List[Union[np.ndarray, List[str]]], sample_size: int, - n_samples: int) -> Optional[np.ndarray]: - """ - - :param tokens: - :param sample_size: - :param n_samples: - :return: - """ - if not sample_size: - assert len(tokens) == 1 - return np.tile(tokens[0], (n_samples, 1)) - - tokens_list = np.array([tokens_ for tokens_ in tokens if len(tokens_) >= sample_size]) - if tokens_list.shape[0] == 0: - return None - - rng = np.random.default_rng() - sample_idxs = rng.integers(0, tokens_list.shape[0], size=n_samples) - return np.array([rng.choice(tokens_list[idx], sample_size) for idx in sample_idxs]) diff --git a/convokit/util.py b/convokit/util.py index 6be3f65d..97a57514 100644 --- a/convokit/util.py +++ b/convokit/util.py @@ -1,12 +1,15 @@ import json import os import shutil +import tempfile import urllib.request import uuid import warnings import zipfile -from typing import Dict +from pathlib import Path +from typing import Dict, Union, Optional, List, IO +import numpy as np import requests @@ -238,7 +241,6 @@ def download_local(name: str, data_dir: str): def _download_helper( dataset_path: str, url: str, verbose: bool, name: str, downloadeds_path: str ) -> None: - if ( url.lower().endswith(".corpus") or url.lower().endswith(".corpus.zip") @@ -254,7 +256,15 @@ def _download_helper( if length > 1e6 else str(round(length / 1e3, 1)) + "KB" ) - print("Downloading", name, "from", url, "(" + length + ")...", end=" ", flush=True) + print( + "Downloading", + name, + "from", + url, + "(" + length + ")...", + end=" ", + flush=True, + ) 
shutil.copyfileobj(response, out_file) # post-process (extract) corpora @@ -278,7 +288,9 @@ def _download_helper( ) # os.path.join(os.path.dirname(data), name) f.write( "{}$#${}$#${}\n".format( - name, os.path.realpath(os.path.dirname(dataset_path) + "/"), corpus_version(fn) + name, + os.path.realpath(os.path.dirname(dataset_path) + "/"), + corpus_version(fn), ) ) # f.write(name + "\n") @@ -292,7 +304,6 @@ def corpus_version(filename: str) -> int: # retrieve grouping and completes the download link for subreddit def get_subreddit_info(subreddit_name: str) -> str: - # base directory of subreddit corpuses subreddit_base = "http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/" data_dir = subreddit_base + "corpus-zipped/" @@ -335,13 +346,17 @@ def _get_wikiconv_year_info(year: str) -> str: def _get_supreme_info(year: str) -> str: - supreme_base = "http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/" return supreme_base + "supreme-" + year + ".zip" def meta_index(corpus=None, filename: str = None) -> Dict: - keys = ["utterances-index", "conversations-index", "speakers-index", "overall-index"] + keys = [ + "utterances-index", + "conversations-index", + "speakers-index", + "overall-index", + ] if corpus is not None: return {k: v for k, v in corpus.meta_index.items() if k in keys} if filename is not None: @@ -379,3 +394,55 @@ def deprecation(prev_name: str, new_name: str, stacklevel: int = 3): def create_safe_id(): return "_" + uuid.uuid4().hex + + +def random_sampler( + tokens: List[Union[np.ndarray, List[str]]], sample_size: int, n_samples: int +) -> Optional[np.ndarray]: + """ + + :param tokens: + :param sample_size: + :param n_samples: + :return: + """ + if not sample_size: + assert len(tokens) == 1 + return np.tile(tokens[0], (n_samples, 1)) + + tokens_list = np.array([tokens_ for tokens_ in tokens if len(tokens_) >= sample_size]) + if tokens_list.shape[0] == 0: + return None + + rng = np.random.default_rng() + sample_idxs = rng.integers(0, tokens_list.shape[0], size=n_samples) + return np.array([rng.choice(tokens_list[idx], sample_size) for idx in sample_idxs]) + + +def create_temp_files(num_files: int) -> List[IO]: + """ + + :param num_files: + :return: + """ + tmp_files = [] + for _ in range(num_files): + tmp_files.append(tempfile.NamedTemporaryFile("w", delete=True)) + return tmp_files + + +def delete_files(tmp_filenames: List[str], remove_parent_dir: bool = True): + """ + + :param tmp_filenames: + :param remove_parent_dir: + :return: + """ + tmp_filepaths = [Path(tmp_filename) for tmp_filename in tmp_filenames] + parent_dir = tmp_filepaths[0].parents[0] + + for tmp_filepath in tmp_filepaths: + Path.unlink(tmp_filepath, missing_ok=True) + + if remove_parent_dir and len(list(parent_dir.glob("*"))) == 0: + Path.rmdir(parent_dir) diff --git a/setup.py b/setup.py index c7b41e28..d6cfff81 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,7 @@ ], extras_require={ "craft": ["torch>=0.12"], + "kenlm": ["kenlm>=0.0.0"], }, classifiers=[ "Programming Language :: Python", From 312c435a4381fc318ea90b9e3382a07fba660192 Mon Sep 17 00:00:00 2001 From: Tushaar Gangavarapu Date: Sun, 18 Dec 2022 02:18:01 -0500 Subject: [PATCH 6/9] Add documentation to Surprise and related classes. This commit includes extensive documentation of all the modules concerning the Surprise transformer, LanguageModel, Kenlm, and ConvoKitLanguageModel classes, and the newly added util functions. 
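As a quick orientation for the `random_sampler` helper relocated into `convokit/util.py` above, the sketch below exercises it on made-up token lists; only the signature visible in the hunk is assumed.

from convokit.util import random_sampler

# Illustrative token lists (not from any corpus): the two six-token lists are long
# enough to be sampled from, while the three-token list is filtered out.
token_lists = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["a", "language", "model", "scores", "target", "text"],
    ["too", "short", "here"],
]

samples = random_sampler(token_lists, sample_size=5, n_samples=4)
if samples is not None:
    print(samples.shape)  # (4, 5): four random five-token samples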
--- convokit/surprise/convokit_lm.py | 50 +++- convokit/surprise/demos/surprise_demo.ipynb | 113 +++----- convokit/surprise/demos/tennis_demo.ipynb | 282 +++++++++++++------- convokit/surprise/kenlm.py | 107 ++++++-- convokit/surprise/language_model.py | 117 ++++++-- convokit/surprise/surprise.py | 254 +++++++++++++----- convokit/util.py | 24 +- 7 files changed, 653 insertions(+), 294 deletions(-) diff --git a/convokit/surprise/convokit_lm.py b/convokit/surprise/convokit_lm.py index a7f5358d..594bc78e 100644 --- a/convokit/surprise/convokit_lm.py +++ b/convokit/surprise/convokit_lm.py @@ -7,10 +7,21 @@ class ConvoKitLanguageModel(LanguageModel): - """ + """A simple language model to compute the deviation of target from context. + + This language model implements cross-entropy and perplexity language model evaluation functions, + to be used in evaluating the average deviation of target from the specified context. - :param model_type: - :param kwargs: + :param model_type: The name of the `convokit.ConvoKitLanguageModel`, defaults to "convokit_lm". + Note that the `model_type` can be accessed using the `type` property (e.g., `lm.type`). + :param kwargs: Any additional keyword arguments needed in the language model evaluations. This + language model currently uses the following keyword arguments: + - `smooth`: Indicator of using Laplace smoothing in the computation of cross-entropy scores, + defaults to `True`. + - `n_jobs`: The number of concurrent threads to be used for routines that are parallelized + with `joblib`, defaults to 1. + The language model configuration can be retrieved using the `config` property of the model + class object (e.g., `lm.config`). """ def __init__(self, model_type: str = "convokit_lm", **kwargs: Optional[Any]): @@ -23,11 +34,18 @@ def cross_entropy( target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray], ) -> float: - """ + """Implements the base class method to compute the cross-entropy. - :param target: - :param context: - :return: + Calculates :math:`H(P, Q) = -\sum_{x \in X}P(x) \times \ln(Q(x))`. Note that we use the + natural logarithm; however, any base and corresponding exponent can be employed. For + instance, KenLM uses base-10 (see `convokit.Kenlm` for reference). + + The smoothing boolean argument, `smooth`, is accessed from the setting in the language model + constructor (defaults to `True` when unspecified). + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q). + :return: The cross-entropy score computed as :math:`H(P, Q)`. """ n_target, n_context = len(target), len(context) if min(n_target, n_context) == 0: @@ -45,3 +63,21 @@ def cross_entropy( ) / n_target ) + + def perplexity( + self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] + ) -> float: + """Implements the base class method to compute perplexity. + + Calculates :math:`\text{PPL}(P, Q) = \exp(-\sum_{x \in X}P(x) \times \ln(Q(x)))`. Note that + we use the natural logarithm; however, any base and corresponding exponent can be employed. + For instance, KenLM uses base-10 (see `convokit.Kenlm` for reference). + + For convenience, the perplexity score is computed as the exponentiation of the cross-entropy + calculated using the `cross_entropy()` method. + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q). + :return: The perplexity score computed as :math:`\text{PPL}(P, Q)`. 
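To make the smoothing behaviour documented here concrete, the following self-contained sketch computes an add-one-smoothed cross-entropy (and the perplexity obtained by exponentiating it); it mirrors the formula in these docstrings but is not the library implementation.

from collections import Counter

import numpy as np


def smoothed_cross_entropy(target, context):
    # Add-one (Laplace) smoothed unigram cross-entropy, in the spirit of
    # ConvoKitLanguageModel.cross_entropy(); illustrative only.
    n_target, n_context = len(target), len(context)
    if min(n_target, n_context) == 0:
        return np.nan
    counts = Counter(context)
    smooth_v = len(counts) + 1  # smoothed denominator term
    return (
        sum(-np.log((counts.get(tok, 0) + 1) / (n_context + smooth_v)) for tok in target)
        / n_target
    )


target = "the cat sat on the mat".split()
context = "the dog sat on the rug near the old mat".split()
h = smoothed_cross_entropy(target, context)
print(h, np.exp(h))  # cross-entropy and the corresponding perplexity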
+ """ + return np.exp(self.cross_entropy(target, context)) diff --git a/convokit/surprise/demos/surprise_demo.ipynb b/convokit/surprise/demos/surprise_demo.ipynb index 183ed5df..036fcdb5 100644 --- a/convokit/surprise/demos/surprise_demo.ipynb +++ b/convokit/surprise/demos/surprise_demo.ipynb @@ -6,10 +6,7 @@ "source": [ "Computing Surprise With ConvoKit\n", "=====================\n", - "This notebook provides a demo of how to use the Surprise transformer to compute surprise across a corpus. In this demo, we will use the Surprise transformer to compute Speaker Convo Diversity, a measure of how surprising a speaker's participation in one conversation is compared to their participation in all other conversations.\n", - "\n", - "" + "This notebook provides a demo of how to use the Surprise transformer to compute surprise across a corpus. In this demo, we will use the Surprise transformer to compute Speaker Convo Diversity, a measure of how surprising a speaker's participation in one conversation is compared to their participation in all other conversations." ] }, { @@ -282,24 +279,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "341d4b05ecde4202933ef88881e95dc0", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "0it [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import spacy\n", "\n", @@ -321,24 +303,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8b1109c726864d8e8e9304b29f67fd71", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "fit: 146it [00:02, 68.33it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "surp = surp.fit(subset_corpus, \n", " text_func=lambda utt: [list(itertools.chain(*[u.meta['joined_tokens'] \n", @@ -357,7 +324,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -375,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -390,19 +357,15 @@ " 'n_jobs': 8,\n", " 'ngram_order': 2}\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tushaar/Downloads/Cornell/Research/ConvoKit/convokit/surprise/kenlm.py:45: UserWarning: creating the folder: /Users/tushaar/Desktop/kenlm_models as it does not exist\n", - " warnings.warn(f'creating the folder: {self._models_dir} as it does not exist')\n" - ] } ], "source": [ - "kenlm = Kenlm(kenlm_path='/Users/tushaar/kenlm', models_dir='/Users/tushaar/Desktop/kenlm_models', \n", - " model_filename='kenlm_surprise', n_jobs=8)\n", + "# Replace with appropriate paths to your kenlm directory\n", + "# and the folder to save the models.\n", + "kenlm = Kenlm(kenlm_path='/Users/tushaar/kenlm', \n", + " models_dir='/Users/tushaar/Desktop/kenlm_models', \n", + " model_filename='kenlm_surprise', \n", + " n_jobs=8)\n", "pp.pprint(kenlm.config)" ] }, @@ -428,7 +391,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -457,26 +420,26 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "EQUASHNZRKUL_815y6t 
7.232845\n", - "SwissWatchesOnly_8g5q88 7.213589\n", - "SwissWatchesOnly_67cljd 7.119290\n", - "EQUASHNZRKUL_73xuw6 7.105761\n", - "Udontlikecake_7rj6a0 7.077323\n", - "Straight_Derpin_5kst5l 7.075608\n", - "ClawofBeta_52u1nu 7.071615\n", - "Enyo287_3s4yj4 7.055865\n", - "Enyo287_48in7h 7.045419\n", - "syntheticity_97zg9z 7.044554\n", + "EQUASHNZRKUL_815y6t 7.258089\n", + "SwissWatchesOnly_8g5q88 7.199374\n", + "SwissWatchesOnly_67cljd 7.125212\n", + "EQUASHNZRKUL_73xuw6 7.100619\n", + "Udontlikecake_7rj6a0 7.083575\n", + "ClawofBeta_52u1nu 7.081842\n", + "Straight_Derpin_5kst5l 7.080008\n", + "syntheticity_97zg9z 7.055642\n", + "CornellMan333_9iwucv 7.043682\n", + "t3hasiangod_42k6wa 7.040483\n", "dtype: float64" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -495,26 +458,26 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Unga_Bunga_30ac0l 5.835202\n", - "crash_over-ride_8f7b0y 5.932225\n", - "crash_over-ride_6bjxnm 5.941219\n", - "omgdonerkebab_v4a3p 5.949654\n", - "Bisphosphate_7r8nu1 5.955274\n", - "crash_over-ride_v4j70 5.961225\n", - "crash_over-ride_9b132c 5.972829\n", - "crash_over-ride_2vhtzx 5.977145\n", - "crash_over-ride_llc0q 5.987742\n", - "crash_over-ride_2vtgvc 5.996319\n", + "Unga_Bunga_30ac0l 5.849274\n", + "crash_over-ride_30zba1 5.937072\n", + "omgdonerkebab_v4a3p 5.944469\n", + "Bisphosphate_7r8nu1 5.960513\n", + "crash_over-ride_t6w01 5.962633\n", + "crash_over-ride_6bjxnm 5.967824\n", + "crash_over-ride_v4j70 5.980576\n", + "crash_over-ride_2vhtzx 5.982879\n", + "crash_over-ride_8f7b0y 5.990480\n", + "crash_over-ride_9b132c 6.002238\n", "dtype: float64" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } diff --git a/convokit/surprise/demos/tennis_demo.ipynb b/convokit/surprise/demos/tennis_demo.ipynb index ef3469fb..f352404c 100644 --- a/convokit/surprise/demos/tennis_demo.ipynb +++ b/convokit/surprise/demos/tennis_demo.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5a63826376804976a9c7dcc38fa6233a", + "model_id": "7ef5f528cab74142aca4e45705e3e631", "version_major": 2, "version_minor": 0 }, @@ -206,38 +206,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a481a41699d04c3fb7cf54f7220d140f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "fit: 0it [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d33520f924994f748e07c934092a5efd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "fit: 0%| | 0/1 [00:00 kenlm.Model: - """ + """Loads the pre-trained KenLM model from the specified filepath. - :param trained_model_filepath: - :return: + :param trained_model_filepath: The path to the pre-trained KenLM model. + :return: The loaded KenLM model. """ kenlm_model = kenlm.Model(trained_model_filepath) return kenlm_model def __make_files(self) -> Tuple[str, str, str]: - """ + """Create (if needed) and return the filenames of intermittent files. 
+ + KenLM language model needs the training data filename, .arpa filename, and the binary model + filename to generate a KenLM model. If the models are not stored (specified through the + argument `models_dir` in the constructor), `tempfile` files are used, else, all the files + are generated in the `models_dir/current_timestamp` folder, using the filename specified in + the constructor. - :return: + :return: A tuple of filenames of all the intermittent files needed. """ if self._models_dir: epoch = str(int(time.time())) @@ -98,21 +130,26 @@ def __make_files(self) -> Tuple[str, str, str]: @staticmethod def __populate_train_file(filepath: str, samples: Union[List[List[str]], np.ndarray]): - """ + """Writes the specified samples to a file, to be used in KenLM training. - :param filepath: - :param samples: - :return: + :param filepath: The filepath to write the samples to. + :param samples: The samples that are to be written to the file. Each list of samples is + delimited using a newline (`\n`). """ with open(filepath, "w", encoding="utf-8") as f: for sample in samples: f.write(f'{" ".join(sample).strip()}\n') def _get_kenlm_model(self, context_samples: Union[List[List[str]], np.ndarray]) -> kenlm.Model: - """ + """Retrieve the KenLM model trained using the specified `context_samples`. + + This method generates the training file using the `context_samples`, which is then used in + the generation of the .arpa and a binary KenLM trained model files. These intermittent files + are deleted, unless the specified value of `models_dir` is not `None`, indicating that the + models are to be stored. - :param context_samples: - :return: + :param context_samples: The context samples to be used in training the KenLM model. + :return: The KenLM model trained on the specified `context_samples`. """ train_filename, arpa_filename, model_filename = self.__make_files() @@ -166,12 +203,42 @@ def cross_entropy( target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray], ) -> float: - """ + """Implements the base class method to compute the cross-entropy. + + A KenLM model is trained using the specified `context`, and is used to evaluate the `target` + text. Note that, if model persistence is indicated in the constructor (using the argument + `is_persistent`), the model generated in the first pass or that loaded from the parameter + value of `trained_model_filepath` is used in all evaluations. (When `trained_model_filepath` + is specified, persistence is automatically implied.) - :param target: - :param context: - :return: + The KenLM library returns a score of log-probabilities (when `score()` method is used), and + the cross-entropy is the negative log-likelihood. + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q), used to train the model. + :return: The cross-entropy score computed using the `kenlm.score()` method. """ if self.__kenlm_model is None or not self._is_persistent: self.__kenlm_model = self._get_kenlm_model([context]) return -self.__kenlm_model.score(" ".join(target).strip()) + + def perplexity( + self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] + ) -> float: + """Implements the base class method to compute perplexity. + + A KenLM model is trained using the specified `context`, and is used to evaluate the `target` + text. 
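For readers unfamiliar with the underlying `kenlm` package, the relationship between `score()` and the cross-entropy used above can be checked directly on a pre-built binary model; the model path below is hypothetical, and the package is only available with the optional `kenlm` extra.

import kenlm

model = kenlm.Model("/path/to/kenlm_surprise.binary")  # hypothetical pre-trained model

sentence = "how surprising is this utterance"
log10_prob = model.score(sentence, bos=True, eos=True)  # total log10 probability
print(-log10_prob)                 # negative log-likelihood, as returned by Kenlm.cross_entropy()
print(model.perplexity(sentence))  # KenLM's perplexity of the sentence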
Note that, if model persistence is indicated in the constructor (using the argument + `is_persistent`), the model generated in the first pass or that loaded from the parameter + value of `trained_model_filepath` is used in all evaluations. (When `trained_model_filepath` + is specified, persistence is automatically implied.) + + The KenLM library returns a perplexity score, with the use of `kenlm.perplexity()` method. + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q), used to train the model. + :return: The perplexity score computed using the `kenlm.perplexity()` method. + """ + if self.__kenlm_model is None or not self._is_persistent: + self.__kenlm_model = self._get_kenlm_model([context]) + return self.__kenlm_model.perplexity(" ".join(target).strip()) diff --git a/convokit/surprise/language_model.py b/convokit/surprise/language_model.py index 06af4ac8..bf937d1f 100644 --- a/convokit/surprise/language_model.py +++ b/convokit/surprise/language_model.py @@ -1,15 +1,40 @@ from abc import ABC -from typing import Optional, Any, List, Dict, Union +from typing import Optional, Any, List, Dict, Union, final import numpy as np from joblib import Parallel, delayed class LanguageModel(ABC): - """ - - :param model_type: - :param kwargs: + """The abstract base class for all language models. + + The language model base class defines the `evaluate()` method, which aims at performing language + model evaluation using the `eval_type` specified as an argument to the `evaluate()` method. Note + that this method must be defined and implemented in the subclass (e.g., if the `eval_type` is + set to `cross_entropy`, the subclass must implement `cross_entropy()` method). The implemented + method takes in a list of target tokens and a list of context tokens, and outputs the language + model evaluation score. + + Since most language models employs cross-entropy and perplexity evaluations, this base class + includes unimplemented designs of `cross_entropy()` and `perplexity()` functions, which may be + implemented (as needed) in the subclasses. For reference, see: `convokit.ConvoKitLanguageModel` + and `convokit.Kenlm` classes, which extend this base class. + + The `evaluate()` method defined in this class is called on a set of context samples and a set of + target samples, and evaluates the target-context distribution deviations using the `eval_type` + language model evaluation function. + + Note: The subclasses cannot override the `evaluate()` method. + + :param model_type: The name of the `convokit.LanguageModel`, defaults to "language_model". Note + that the `model_type` can be accessed using the `type` property (e.g., `lm.type`). + :param kwargs: Any additional keyword arguments needed in the language model evaluations. For + instance, the cross-entropy computes might require smoothing parameter; hence, a `smooth` + parameter can be passed as an additional keyword argument. + Another keyword argument is `n_jobs`, used to specify the number of concurrent threads to be + used for routines that are parallelized with `joblib`, defaults to 1. + The language model configuration can be retrieved using the `config` property of the model + class object (e.g., `lm.config`). 
""" def __init__(self, model_type: str = "language_model", **kwargs: Optional[Any]): @@ -20,17 +45,18 @@ def __init__(self, model_type: str = "language_model", **kwargs: Optional[Any]): @property def type(self) -> str: - """ + """The `model_type` property of the language model. - :return: + :return: The `model_type` specified in the class constructor, defaults to "language_model". """ return self._model_type @property def config(self) -> Dict[str, Any]: - """ + """The configuration (all the class parameters) of the language model. - :return: + :return: The configuration (all the class parameters specified in the class constructor and + elsewhere) of the language model. """ private_var_prefix = f"_{self.__class__.__name__}" return { @@ -40,11 +66,11 @@ def config(self) -> Dict[str, Any]: } def _overwrite_args(self, args_to_overwrite: List[str], kwargs: Dict[str, Any]): - """ + """Overwrites the class variables with the values specified in `kwargs`. - :param args_to_overwrite: - :param kwargs: - :return: + :param args_to_overwrite: The list of arguments (class variable names) whose values are to + be overwritten using the values in the `kwargs`. + :param kwargs: The keyword arguments with updates to the values of the class variables. """ for arg in args_to_overwrite: self.__dict__[f"_{arg}"] = kwargs[arg] if arg in kwargs else self.__dict__[f"_{arg}"] @@ -52,14 +78,42 @@ def _overwrite_args(self, args_to_overwrite: List[str], kwargs: Dict[str, Any]): def cross_entropy( self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] ) -> float: - """ + """An unimplemented base class method to compute the cross-entropy. + + The cross-entropy between a list of target tokens and a list of context tokens is to be + computed by the implementation in the subclass. Note that any variables to be used in this + method (e.g., smoothing value) must be accessed from the class scope. - :param target: - :param context: - :return: + Calculates :math:`H(P, Q) = -\sum_{x \in X}P(x) \times \ln(Q(x))`. Note that we use the + natural logarithm; however, any base and corresponding exponent can be employed. For + instance, KenLM uses base-10 (see `convokit.Kenlm` for reference). + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q). + :raises: Raises a `RuntimeError` if called without implementing it in the subclass. """ raise RuntimeError("cross entropy is not implemented") + def perplexity( + self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] + ) -> float: + """An unimplemented base class method to compute perplexity. + + The perplexity between a list of target tokens and a list of context tokens is to be + computed by the implementation in the subclass. Note that any variables to be used in this + method (e.g., smoothing value) must be accessed from the class scope. + + Calculates :math:`\text{PPL}(P, Q) = \exp(-\sum_{x \in X}P(x) \times \ln(Q(x)))`. Note that + we use the natural logarithm; however, any base and corresponding exponent can be employed. + For instance, KenLM uses base-10 (see `convokit.Kenlm` for reference). + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q). + :raises: Raises a `RuntimeError` if called without implementing it in the subclass. 
+ """ + raise RuntimeError("perplexity is not implemented") + + @final def evaluate( self, target_samples: Union[List[List[str]], np.ndarray], @@ -67,13 +121,28 @@ def evaluate( eval_type: str = "cross_entropy", **kwargs: Optional[Any], ) -> np.ndarray: - """ - - :param target_samples: - :param context_samples: - :param eval_type: - :param kwargs: - :return: + """Computes the average deviation between target and context distributions. + + For a given list of fixed size target and context sample lists, the `evaluate()` method + computes the deviation between each target and corresponding context pair, using `eval_type` + language model evaluation metric. Note that the subclass implementing this abstract base + class must define and implement the `eval_type` evaluation method. The final score output by + this method is an average of all the individual scores. + + Also note that, if specified as keyword arguments, any class variable values are overwritten + from within this method. + + :param target_samples: A list of target sample lists to be used to evaluate against the + corresponding context sample lists. + :param context_samples: A list of context sample lists that are to be used in evaluating the + corresponding target sample lists. + :param eval_type: The language model evaluation function (as `str`), used in evaluating the + language model trained using the context text, evaluated using the target text. Defaults + to "cross_entropy", i.e., calls the `cross_entropy()` method. + :param kwargs: Any additional keyword arguments needed in the language model evaluations. If + any class variables are passed using `kwargs`, the corresponding class variable values + are overwritten using the new values. + :return: The average score that measures the average deviation of target text from context. """ self._overwrite_args(list(kwargs.keys()), kwargs) eval_fn = getattr(self, eval_type) diff --git a/convokit/surprise/surprise.py b/convokit/surprise/surprise.py index b2fbc6bf..ad4cc097 100644 --- a/convokit/surprise/surprise.py +++ b/convokit/surprise/surprise.py @@ -1,3 +1,4 @@ +import warnings from collections import defaultdict from itertools import chain from typing import Callable, List, Tuple, Dict, Any, Optional, Union, Set @@ -17,21 +18,34 @@ shell = get_ipython().__class__.__name__ if shell == "ZMQInteractiveShell" or shell == "TerminalInteractiveShell": from tqdm.notebook import tqdm -except NameError: +except (NameError, ModuleNotFoundError, ImportError): pass class Surprise(Transformer): - """ + """Measures the amount of "surprise" between target and context utterance(s). + + This transformer computes how surprising a target utterance or group of utterances is, when + compared to some context. The amount of "surprise" is measured by comparing the deviation + of the target distribution from the context distribution (e.g., cross-entropy, perplexity). + Furthermore, to mitigate the effects of text length on language model evaluation, the surprise + transformer uses several random fixed length samples from target and context text. - :param model_key_selector: - :param tokenizer: - :param surprise_attr_name: - :param target_sample_size: - :param context_sample_size: - :param n_samples: - :param sampling_fn: - :param n_jobs: + :param model_key_selector: A function that specifies how utterances are to be mapped to models. + The function takes in an utterance and returns the key to use in mapping the utterance to a + corresponding model. 
+ :param tokenizer: A function that returns a list of tokens in a given string, defaults to + `nltk.word_tokenize`. + :param surprise_attr_name: The name for the metadata attribute to add to the objects, defaults + to "surprise". + :param target_sample_size: The number of tokens to sample from each target (test text); when + specified as `None`, then the entire target will be used, defaults to 100. + :param context_sample_size: The number of tokens to sample from each context (training text); + when specified as `None`, then the entire context will be used, defaults to 100. + :param n_samples: The number of samples to take for each target-context pair, defaults to 50. + :param sampling_fn: A function to generate samples of tokens, defaults to a random sampler. + :param n_jobs: The number of concurrent threads to be used for routines that are parallelized + with `joblib`, defaults to 1. """ def __init__( @@ -63,12 +77,25 @@ def fit( text_func: Callable[[Utterance], List[str]] = None, selector: Callable[[Utterance], bool] = lambda utt: True, ) -> Transformer: - """ + """Populate models for each group of utterances in a corpus. + + For each group of utterances in the corpus, a specific model is populated. The group that an + utterance belongs to is determined by the `model_key_selector` parameter in the constructor. + Furthermore, based on the `tokenizer` specified in the constructor, the text corresponding + to the model key is tokenized. - :param corpus: - :param text_func: - :param selector: - :return: + :param corpus: The corpus to create models from. + :param text_func: The function used to define how the text a model is trained on should be + selected. Takes an utterance as input and returns a list of strings to train the model + corresponding to that utterance on. The model corresponding to the utterance is + determined by the `model_key_selector` parameter specified in the constructor. For each + utterance corresponding to the same model key, this function should return the same + result. + Defaults to `None`; when the value is `None`, a model will be trained on the text from + all the utterances that belong to its group. + :param selector: A function to specify which utterances in the corpus to train models for. + Defaults to choosing all utterances, `lambda utt: True`. + :return: An instance of the Surprise transformer with the corresponding models populated. """ self._model_groups = defaultdict(list) @@ -83,7 +110,7 @@ def fit( for key in tqdm(self._model_groups, desc="fit"): if text_func is None: self._model_groups[key] = [" ".join(self._model_groups[key])] - # Using `map()` with a `lambda` function is (microscopically) costlier than a list comprehension. + # Using `map()` with `lambda` is (microscopically) costlier than a list comprehension. # Reference: https://stackoverflow.com/a/1247490/6907625. self._model_groups[key] = [ self._tokenizer(utt_text) for utt_text in self._model_groups[key] @@ -101,13 +128,58 @@ def _compute_surprise( ], **kwargs: Optional[Any], ) -> np.ndarray: - """ + """Compute the amount of "surprise" between target and context utterance(s). + + This method computes how surprising a target text is, when compared to some context. The + amount of "surprise" is measured by comparing the deviation of the target distribution from + the context distribution (e.g., cross-entropy, perplexity). Furthermore, to mitigate the + effects of text length on language model evaluation, several random samples of fixed sizes + are taken from the target and context. 
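Putting the constructor and `fit()` parameters documented above together, a minimal speaker-level setup might look like the sketch below; the corpus, the grouping by speaker id, and the selector are illustrative choices rather than requirements of the transformer.

from convokit import Corpus, Surprise, download

corpus = Corpus(download("subreddit-Cornell"))  # any ConvoKit corpus works here

surprise = Surprise(
    model_key_selector=lambda utt: utt.speaker.id,  # one context model per speaker
    surprise_attr_name="surprise",
    target_sample_size=100,
    context_sample_size=100,
    n_samples=50,
    n_jobs=1,
)
surprise = surprise.fit(
    corpus,
    selector=lambda utt: len(utt.text.split()) > 5,  # skip very short utterances
)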
- :param target: - :param context: - :param lm_evaluation_fn: - :param kwargs: - :return: + :param target: A list of tokens in the target. + :param context: A list of lists of tokens in each group of the context. + :param lm_evaluation_fn: The language model evaluation function. If using an instance of + `convokit.LanguageModel`, the `evaluate` function is to be used here. To see examples of + `convokit.LanguageModel`, see: `convokit.ConvoKitLanguageModel` and `convokit.Kenlm`. + The function takes in a list of target samples and corresponding context samples, and + returns the amount of surprise using some underlying language model evaluation metric. + :param kwargs: Additional keyword arguments to be passed to the language model evaluation + function: + - When using `convokit.LanguageModel`, the following keywords are relevant: + - `eval_type`: The language model evaluation metric, defaults to `cross_entropy`. + - The following arguments, if specified, overwrite the existing class values: + - `n_jobs`: The number of concurrent threads to be used for routines that are + parallelized with `joblib`, defaults to 1. + - `model_type`: Name of `convokit.LanguageModel`, defaults to "language_model". + - When using `convokit.ConvoKitLanguageModel`, the following keywords are relevant: + - `eval_type`: The language model evaluation metric, defaults to `cross_entropy`. + - The following arguments, if specified, overwrite the existing class values: + - `smooth`: Indicator of using Laplace smoothing in the computation of surprise + scores, defaults to `True`. + - The following arguments, inherited from `convokit.LanguageModel`, if specified, + overwrite the existing class values: + - `n_jobs`: The number of concurrent threads to be used for routines that are + parallelized with `joblib`, defaults to 1. + - `model_type`: Name of `convokit.LanguageModel`, defaults to "convokit_lm". + - When using `convokit.Kenlm`, the following keywords are relevant: + - `eval_type`: The language model evaluation metric, defaults to `cross_entropy`. + - The following arguments, if specified, overwrite the existing class values: + - `ngram_order`: The order of n-gram language model. + - `trained_model_filepath`: The filepath to a pre-trained language model that is + to be persistently used. + - `is_persistent`: Indicator of model persistence, i.e., the model generated + in the first pass or that loaded from `trained_model_filepath` is used in all + evaluations. When `trained_model_filepath` is specified, persistence is + automatically implied. + - `kenlm_path`: The folder path to the folder of KenLM library. + - `models_dir`: The folder path to store the (trained) binary KenLM models. + - `model_filename`: The filename used in storing the KenLM model artefacts. + - The following arguments, inherited from `convokit.LanguageModel`, if specified, + overwrite the existing class values: + - `n_jobs`: The number of concurrent threads to be used for routines that are + parallelized with `joblib`, defaults to 1. + - `model_type`: Name of `convokit.LanguageModel`, defaults to "kenlm". + :return: The surprise score output by the language model evaluation function. """ target_tokens = np.array(target) context_tokens = [np.array(text) for text in context] @@ -132,16 +204,47 @@ def _transform( group_model_attr_key: Callable[[str, str], str] = None, **kwargs: Optional[Any], ) -> Corpus: - """ + """Annotates `obj_type` components in a corpus with surprise scores. 
+ + The transform function adds surprise score metadata to the `obj_type` components in the + given corpus. - :param corpus: - :param obj_type: - :param group_and_models: - :param target_text_func: - :param selector: - :param group_model_attr_key: - :param kwargs: - :return: + :param corpus: The corpus to compute surprise for. + :param obj_type: The type of corpus components to annotate. Should be one of "utterance", + "speaker", "conversation", or "corpus". + :param group_and_models: A function that defines how an utterance should be grouped to form + a target text and what models (contexts) the group should be compared to in calculating + surprise scores. Takes in an utterance and returns a tuple containing the name of the + group the utterance belongs to and a list of models to calculate how surprising that + group is against. Objects will be annotated with a metadata field `surprise_attr_name` + (specified in the constructor) that maps a key corresponding to the `group_name` and + `model_key` to the surprise score for the utterances in the group when compared to the + model. The key used is defined by the `group_model_attr_key` parameter. + Defaults to `None`; if `group_and_models` is `None`, `model_key_selector` specified in + the constructor will be used to select the group that an utterance belongs to. The + surprise score will be calculated for each group of utterances compared to the model in + `self.models` corresponding to the group. + :param target_text_func: A function to define what the target text corresponding to an + utterance should be; takes in an utterance and returns a list of string tokens. + Defaults to `None`. + :param selector: A function to specify which objects in the corpus to train models for, + defaults to choosing all `obj_type` objects, `lambda _: True`. + :param group_model_attr_key: A function that defines what key is to be used for a given + `group_name` and `model_key`, defaults to `None`. If `group_model_attr_key` is `None`, + the default key used will be "GROUP_group_name_MODEL_model_key" unless `group_name` and + `model_key` are equal, in which case just "model_key" will be used as the key. + :param kwargs: Additional keyword arguments to be passed for surprise computations (see + the documentation for `Surprise._compute_surprise()` for these arguments), and in + creating the language model (if needed): + - `language_model`: An instance of `convokit.LanguageModel` to be used in computing the + surprise scores, defaults to `convokit.ConvoKitLanguageModel` and the arguments to the + `convokit.ConvoKitLanguageModel` can be specified here as: + - `smooth`: Indicator of using Laplace smoothing in the computation of surprise + scores, defaults to `True`. + - `n_jobs`: The number of concurrent threads to be used for routines that are + parallelized with `joblib`, defaults to 1. + - `model_type`: Name of `convokit.LanguageModel`, defaults to "convokit_lm". + :return: A modified version of the input corpus with the surprise scores. """ def _update_groups_models( @@ -149,12 +252,13 @@ def _update_groups_models( utt_groups_: Dict[str, List[List[str]]], group_models_: Dict[str, Set[str]], ): - """ + """Updates the utterance groups and models based on `groups_and_models`. - :param utt_: - :param utt_groups_: - :param group_models_: - :return: + :param utt_: The utterance whose groups and models are to be populated (updated). + :param utt_groups_: Update utterance groups based on `groups_and_models` parameter. The + dictionary is modified in place. 
+ :param group_models_: Update utterance models based on `groups_and_models` parameter. + The dictionary is modified in place. """ group_name, models = ( group_and_models(utt_) @@ -172,12 +276,16 @@ def _update_groups_models( def _format_attr_key( group_name: str, model_key: str, format_fn: Callable[[str, str], str] = None ) -> str: - """ + """Formats the surprise score attribute key, given model name and key. - :param group_name: - :param model_key: - :param format_fn: - :return: + :param group_name: The group name to be included in the surprise score attribute key. + :param model_key: The model key to be included in the surprise score attribute key. + :param format_fn: A function that takes in the `group_name` and `model_key` and outputs + the formatted attribute key, defaults to `None`. When `group_model_attr_key` is + `None`, the default key used will be "GROUP_group_name_MODEL_model_key" unless + `group_name` and `model_key` are equal, in which case just "model_key" will be used + as the key. + :return: The formatted surprise score attribute key. """ if format_fn: return format_fn(group_name, model_key) @@ -199,14 +307,19 @@ def __surprise_score_helper( np.ndarray, ], ): - """ + """A helper function to aid in the computation of surprise scores. - :param group_name: - :param utt_group: - :param group_models_: - :param surprise_scores_: - :param lm_evaluation_fn: - :return: + :param group_name: The group name corresponding to the group model to be used. + :param utt_group: The utterance group from those populated using `groups_and_models`. + :param group_models_: The group models that were populated using `groups_and_models`. + :param surprise_scores_: The surprise score (dictionary value) that is to be updated for + the corresponding utterance group and model. The dictionary is modified in place. + :param lm_evaluation_fn: The language model evaluation function. If using an instance + of `convokit.LanguageModel`, the `evaluate` function is to be used here. To see the + examples of `convokit.LanguageModel`, see: `convokit.ConvoKitLanguageModel` and + `convokit.Kenlm`. The function takes in a list of target samples and corresponding + context samples, and returns the amount of surprise using some underlying language + model evaluation metric. """ for model_key in group_models_[group_name]: assert model_key in self._model_groups, "invalid model key" @@ -230,13 +343,19 @@ def _update_surprise_scores( np.ndarray, ], ): - """ + """Populate (update) the surprise score for utterance groups and models. - :param utt_groups_: - :param group_models_: - :param surprise_scores_: - :param lm_evaluation_fn: - :return: + :param utt_groups_: The utterance groups that were populated using `groups_and_models`. + :param group_models_: The group models that were populated using `groups_and_models`. + :param surprise_scores_: The surprise scores (dictionary values) that are to be updated + for the corresponding utterance groups and models. The surprise scores dictionary is + modified in place. + :param lm_evaluation_fn: The language model evaluation function. If using an instance + of `convokit.LanguageModel`, the `evaluate` function is to be used here. To see the + examples of `convokit.LanguageModel`, see: `convokit.ConvoKitLanguageModel` and + `convokit.Kenlm`. The function takes in a list of target samples and corresponding + context samples, and returns the amount of surprise using some underlying language + model evaluation metric. 
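Continuing the `fit()` sketch above, annotating a corpus then reduces to a single `transform()` call; `surprise` and `corpus` are assumed to be the fitted transformer and its corpus, and the explicit `ConvoKitLanguageModel` simply mirrors the documented default.

from convokit.surprise import ConvoKitLanguageModel

lm = ConvoKitLanguageModel(smooth=True, n_jobs=1)
corpus = surprise.transform(
    corpus,
    obj_type="speaker",         # or "utterance", "conversation", "corpus"
    language_model=lm,          # a ConvoKitLanguageModel is created when omitted
    eval_type="cross_entropy",  # or "perplexity"
)
for speaker in corpus.iter_objs("speaker"):
    print(speaker.id, dict(speaker.meta["surprise"]))
    break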
""" if self._n_jobs == 1: for group_name in tqdm(utt_groups_, leave=False, desc="surprise", delay=2): @@ -259,14 +378,20 @@ def _update_surprise_scores( for group_name in tqdm(utt_groups_, leave=False, desc="surprise", delay=2) ) + if "n_jobs" in kwargs and kwargs["n_jobs"] != self._n_jobs: + warnings.warn( + f"specified n_jobs={kwargs['n_jobs']}; however, the surprise transformer was " + f"initialized with {self._n_jobs}, so defaulting to {self._n_jobs} jobs." + ) + kwargs["n_jobs"] = self._n_jobs language_model = ( kwargs["language_model"] if "language_model" in kwargs - else ConvoKitLanguageModel(n_jobs=self._n_jobs, **kwargs) + else ConvoKitLanguageModel(**kwargs) ) if obj_type == "corpus": - surprise_scores = {} + surprise_scores = defaultdict() utt_groups, group_models = defaultdict(list), defaultdict(set) for utt in tqdm(corpus.iter_utterances(), desc="transform"): _update_groups_models(utt, utt_groups, group_models) @@ -276,7 +401,7 @@ def _update_surprise_scores( corpus.add_meta(self._surprise_attr_name, surprise_scores) elif obj_type == "utterance": for utt in tqdm(corpus.iter_utterances(selector=selector), desc="transform"): - surprise_scores = {} + surprise_scores = defaultdict() utt_groups, group_models = defaultdict(list), defaultdict(set) _update_groups_models(utt, utt_groups, group_models) _update_surprise_scores( @@ -285,7 +410,7 @@ def _update_surprise_scores( utt.add_meta(self._surprise_attr_name, surprise_scores) else: for obj in tqdm(corpus.iter_objs(obj_type, selector=selector), desc="transform"): - surprise_scores = {} + surprise_scores = defaultdict() utt_groups, group_models = defaultdict(list), defaultdict(set) for utt in obj.iter_utterances(): _update_groups_models(utt, utt_groups, group_models) @@ -296,10 +421,15 @@ def _update_surprise_scores( return corpus def transform(self, corpus: Corpus, **kwargs) -> Corpus: - """ + """Wrapper over the `transform` function of the Surprise transformer. + + Note: Since the transformer's `fit()` function populates the model groups, the `transform` + function is to be called after `fit()`. - :param corpus: - :param kwargs: - :return: + :param corpus: The corpus to transform. + :param kwargs: Any keyword arguments to be passed to the `transform` function of the + Surprise transformer. Refer to the documentation of `Surprise._transform()` for specific + keyword arguments. + :return: A modified version of the input corpus with the surprise scores. """ return self._transform(corpus=corpus, **kwargs) diff --git a/convokit/util.py b/convokit/util.py index 97a57514..d9f99f7d 100644 --- a/convokit/util.py +++ b/convokit/util.py @@ -399,12 +399,12 @@ def create_safe_id(): def random_sampler( tokens: List[Union[np.ndarray, List[str]]], sample_size: int, n_samples: int ) -> Optional[np.ndarray]: - """ + """Generates random samples from a list of lists of tokens. - :param tokens: - :param sample_size: - :param n_samples: - :return: + :param tokens: A list of lists of tokens to sample from. + :param sample_size: The number of tokens to include in each sample. + :param n_samples: The number of samples to take. + :return: A `numpy.array`, where each row is a sample of tokens. """ if not sample_size: assert len(tokens) == 1 @@ -420,10 +420,10 @@ def random_sampler( def create_temp_files(num_files: int) -> List[IO]: - """ + """Creates a specified number of `tempfile` files. - :param num_files: - :return: + :param num_files: The number of `tempfile` files to be created. + :return: A list of `tempfile.NamedTemporaryFile` files. 
""" tmp_files = [] for _ in range(num_files): @@ -432,11 +432,11 @@ def create_temp_files(num_files: int) -> List[IO]: def delete_files(tmp_filenames: List[str], remove_parent_dir: bool = True): - """ + """Delete temporary files generated intermittently. - :param tmp_filenames: - :param remove_parent_dir: - :return: + :param tmp_filenames: The filenames of all the files to be deleted. + :param remove_parent_dir: Indicator of whether the parent directory is to be deleted, if it is + empty after deleting all the temporary files, defaults to True. """ tmp_filepaths = [Path(tmp_filename) for tmp_filename in tmp_filenames] parent_dir = tmp_filepaths[0].parents[0] From bd9ca58905a79bbb77d76c69d19b7a40b7b63668 Mon Sep 17 00:00:00 2001 From: Tushaar Gangavarapu Date: Sun, 18 Dec 2022 03:01:30 -0500 Subject: [PATCH 7/9] Only load `Kenlm` if such a module exists, else ignore it. --- convokit/surprise/__init__.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/convokit/surprise/__init__.py b/convokit/surprise/__init__.py index 49805c28..57568901 100644 --- a/convokit/surprise/__init__.py +++ b/convokit/surprise/__init__.py @@ -1,4 +1,15 @@ -from .convokit_lm import ConvoKitLanguageModel -from .kenlm import Kenlm -from .language_model import LanguageModel +import importlib.util +import sys + +from .convokit_lm import * +from .language_model import * from .surprise import * + +if "kenlm" in sys.modules: + from .kenlm import * +elif (spec := importlib.util.find_spec("kenlm")) is not None: + module = importlib.util.module_from_spec(spec) + sys.modules["kenlm"] = module + spec.loader.exec_module(module) + + from .kenlm import * From 4e60aef7859afe5ef2d0f4f8184441fc484f0b07 Mon Sep 17 00:00:00 2001 From: Tushaar Gangavarapu Date: Sun, 18 Dec 2022 12:01:05 -0500 Subject: [PATCH 8/9] Add .rst documentation files for Surprise transformer. --- convokit/surprise/convokit_lm.py | 19 ++-- convokit/surprise/kenlm.py | 20 +++-- convokit/surprise/language_model.py | 63 ++++++++------ convokit/surprise/surprise.py | 129 ++++++++++++++++------------ docs/source/analysis.rst | 1 + docs/source/language_model.rst | 21 +++++ docs/source/surprise.rst | 24 ++++++ 7 files changed, 179 insertions(+), 98 deletions(-) create mode 100644 docs/source/language_model.rst create mode 100644 docs/source/surprise.rst diff --git a/convokit/surprise/convokit_lm.py b/convokit/surprise/convokit_lm.py index 594bc78e..1b7ee5e5 100644 --- a/convokit/surprise/convokit_lm.py +++ b/convokit/surprise/convokit_lm.py @@ -12,14 +12,17 @@ class ConvoKitLanguageModel(LanguageModel): This language model implements cross-entropy and perplexity language model evaluation functions, to be used in evaluating the average deviation of target from the specified context. - :param model_type: The name of the `convokit.ConvoKitLanguageModel`, defaults to "convokit_lm". - Note that the `model_type` can be accessed using the `type` property (e.g., `lm.type`). + :param model_type: The name (identifier) of the :py:class:`~convokit.ConvoKitLanguageModel`, + defaults to "convokit_lm". Note that the `model_type` can be accessed using the `type` + property (e.g., `lm.type`). :param kwargs: Any additional keyword arguments needed in the language model evaluations. 
This language model currently uses the following keyword arguments: - - `smooth`: Indicator of using Laplace smoothing in the computation of cross-entropy scores, + + * `smooth`: Indicator of using Laplace smoothing in the computation of cross-entropy scores, defaults to `True`. - - `n_jobs`: The number of concurrent threads to be used for routines that are parallelized + * `n_jobs`: The number of concurrent threads to be used for routines that are parallelized with `joblib`, defaults to 1. + The language model configuration can be retrieved using the `config` property of the model class object (e.g., `lm.config`). """ @@ -34,11 +37,11 @@ def cross_entropy( target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray], ) -> float: - """Implements the base class method to compute the cross-entropy. + r"""Implements the base class method to compute the cross-entropy. Calculates :math:`H(P, Q) = -\sum_{x \in X}P(x) \times \ln(Q(x))`. Note that we use the natural logarithm; however, any base and corresponding exponent can be employed. For - instance, KenLM uses base-10 (see `convokit.Kenlm` for reference). + instance, KenLM uses base-10 (see :py:class:`~convokit.Kenlm` for reference). The smoothing boolean argument, `smooth`, is accessed from the setting in the language model constructor (defaults to `True` when unspecified). @@ -67,11 +70,11 @@ def cross_entropy( def perplexity( self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] ) -> float: - """Implements the base class method to compute perplexity. + r"""Implements the base class method to compute perplexity. Calculates :math:`\text{PPL}(P, Q) = \exp(-\sum_{x \in X}P(x) \times \ln(Q(x)))`. Note that we use the natural logarithm; however, any base and corresponding exponent can be employed. - For instance, KenLM uses base-10 (see `convokit.Kenlm` for reference). + For instance, KenLM uses base-10 (see :py:class:`~convokit.Kenlm` for reference). For convenience, the perplexity score is computed as the exponentiation of the cross-entropy calculated using the `cross_entropy()` method. diff --git a/convokit/surprise/kenlm.py b/convokit/surprise/kenlm.py index a8d3ed48..23e71cfc 100644 --- a/convokit/surprise/kenlm.py +++ b/convokit/surprise/kenlm.py @@ -31,24 +31,26 @@ class Kenlm(LanguageModel): class. If kenlm installation fails, please follow: https://github.com/kpu/kenlm/issues/57 to install the KenLM library. - :param model_type: The name of the `convokit.Kenlm`, defaults to "kenlm". Note that the - `model_type` can be accessed using the `type` property (e.g., `lm.type`). + :param model_type: The name of the :py:class:`~convokit.Kenlm`, defaults to "kenlm". Note that + the `model_type` can be accessed using the `type` property (e.g., `lm.type`). :param kwargs: Any additional keyword arguments needed in the language model evaluations. This language model currently uses the following keyword arguments: - - `ngram_order`: The order of n-gram language model, when the specified `ngram_order` is + + * `ngram_order`: The order of n-gram language model, when the specified `ngram_order` is less than 2 (or unspecified), the `ngram_order` is set to 2, since the KenLM library does not support n-gram order below 2 (see: https://github.com/kpu/kenlm/issues/171). - - `trained_model_filepath`: The filepath to a pre-trained language model that is to be + * `trained_model_filepath`: The filepath to a pre-trained language model that is to be persistently used. 
- - `is_persistent`: Indicator of model persistence, i.e., the model generated in the first + * `is_persistent`: Indicator of model persistence, i.e., the model generated in the first pass or that loaded from `trained_model_filepath` is used in all evaluations. When the `trained_model_filepath` is specified, persistence is implied. Defaults to `False`. - - `kenlm_path`: The path to the KenLM library, defaults to the user's home directory. - - `models_dir`: The folder path to store the (trained) binary KenLM models, defaults to + * `kenlm_path`: The path to the KenLM library, defaults to the user's home directory. + * `models_dir`: The folder path to store the (trained) binary KenLM models, defaults to `None`, indicating that the trained KenLM models need not be stored. - - `model_filename`: The filename used in storing model artefacts, defaults to `model_type`. - - `n_jobs`: The number of concurrent threads to be used for routines that are parallelized + * `model_filename`: The filename used in storing model artefacts, defaults to `model_type`. + * `n_jobs`: The number of concurrent threads to be used for routines that are parallelized with `joblib`, defaults to 1. + The language model configuration can be retrieved using the `config` property of the model class object (e.g., `lm.config`). """ diff --git a/convokit/surprise/language_model.py b/convokit/surprise/language_model.py index bf937d1f..d931cce4 100644 --- a/convokit/surprise/language_model.py +++ b/convokit/surprise/language_model.py @@ -8,26 +8,29 @@ class LanguageModel(ABC): """The abstract base class for all language models. - The language model base class defines the `evaluate()` method, which aims at performing language - model evaluation using the `eval_type` specified as an argument to the `evaluate()` method. Note - that this method must be defined and implemented in the subclass (e.g., if the `eval_type` is - set to `cross_entropy`, the subclass must implement `cross_entropy()` method). The implemented - method takes in a list of target tokens and a list of context tokens, and outputs the language + The language model base class defines the :py:meth:`~convokit.LanguageModel.evaluate` method, + which performs language model evaluation using the `eval_type` specified as an argument to the + :py:meth:`~convokit.LanguageModel.evaluate` method. Note that this method must be defined and + implemented in the subclass (e.g., if the `eval_type` is set to "cross_entropy", the subclass + must implement :py:meth:`~convokit.LanguageModel.cross_entropy` method). The implemented method + should take in a list of target tokens and a list of context tokens, and output the language model evaluation score. Since most language models employs cross-entropy and perplexity evaluations, this base class - includes unimplemented designs of `cross_entropy()` and `perplexity()` functions, which may be - implemented (as needed) in the subclasses. For reference, see: `convokit.ConvoKitLanguageModel` - and `convokit.Kenlm` classes, which extend this base class. + includes unimplemented designs of :py:meth:`~convokit.LanguageModel.cross_entropy` and + :py:meth:`~convokit.LanguageModel.perplexity` functions, which may be implemented (as needed) in + the subclasses. See the subclass implementations: :py:class:`~convokit.ConvoKitLanguageModel` + and :py:class:`~convokit.Kenlm` classes, which extend this base class. 
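Because of the guarded import added to `convokit/surprise/__init__.py` above, user code should not assume `Kenlm` is importable; a defensive pattern such as the sketch below (with illustrative constructor arguments) falls back to the pure-Python model when the optional `kenlm` extra is not installed.

import importlib.util

if importlib.util.find_spec("kenlm") is not None:  # e.g., pip install convokit[kenlm]
    from convokit.surprise import Kenlm

    lm = Kenlm(ngram_order=2, n_jobs=1)
else:
    from convokit.surprise import ConvoKitLanguageModel

    lm = ConvoKitLanguageModel(smooth=True, n_jobs=1)
print(lm.type)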
- The `evaluate()` method defined in this class is called on a set of context samples and a set of - target samples, and evaluates the target-context distribution deviations using the `eval_type` - language model evaluation function. + The :py:meth:`~convokit.LanguageModel.evaluate` method defined in this class is called on a set + of context samples and a set of target samples, and evaluates the target-context distribution + deviations using the `eval_type` language model evaluation function. - Note: The subclasses cannot override the `evaluate()` method. + Note: The subclasses cannot override the :py:meth:`~convokit.LanguageModel.evaluate` method. - :param model_type: The name of the `convokit.LanguageModel`, defaults to "language_model". Note - that the `model_type` can be accessed using the `type` property (e.g., `lm.type`). + :param model_type: The name (identifier) of :py:class:`~convokit.LanguageModel`, defaults to + "language_model". Note that the `model_type` can be accessed using the `type` property + (e.g., `lm.type`). :param kwargs: Any additional keyword arguments needed in the language model evaluations. For instance, the cross-entropy computes might require smoothing parameter; hence, a `smooth` parameter can be passed as an additional keyword argument. @@ -78,15 +81,16 @@ def _overwrite_args(self, args_to_overwrite: List[str], kwargs: Dict[str, Any]): def cross_entropy( self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] ) -> float: - """An unimplemented base class method to compute the cross-entropy. + r"""An unimplemented base class method to compute the cross-entropy. The cross-entropy between a list of target tokens and a list of context tokens is to be computed by the implementation in the subclass. Note that any variables to be used in this method (e.g., smoothing value) must be accessed from the class scope. - Calculates :math:`H(P, Q) = -\sum_{x \in X}P(x) \times \ln(Q(x))`. Note that we use the - natural logarithm; however, any base and corresponding exponent can be employed. For - instance, KenLM uses base-10 (see `convokit.Kenlm` for reference). + Calculates :math:`H(P, Q) = -\sum_{x \in X}P(x) \times \ln(Q(x))`. + + Note that we use the natural logarithm; however, any base and corresponding exponent can be + employed. For instance, KenLM uses base-10 (see :py:class:`~convokit.Kenlm` for reference). :param target: A list of tokens that make up the target text (P). :param context: A list of tokens that make up the context text (Q). @@ -97,15 +101,16 @@ def cross_entropy( def perplexity( self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] ) -> float: - """An unimplemented base class method to compute perplexity. + r"""An unimplemented base class method to compute perplexity. The perplexity between a list of target tokens and a list of context tokens is to be computed by the implementation in the subclass. Note that any variables to be used in this method (e.g., smoothing value) must be accessed from the class scope. - Calculates :math:`\text{PPL}(P, Q) = \exp(-\sum_{x \in X}P(x) \times \ln(Q(x)))`. Note that - we use the natural logarithm; however, any base and corresponding exponent can be employed. - For instance, KenLM uses base-10 (see `convokit.Kenlm` for reference). + Calculates :math:`\text{PPL}(P, Q) = \exp(-\sum_{x \in X}P(x) \times \ln(Q(x)))`. + + Note that we use the natural logarithm; however, any base and corresponding exponent can be + employed. 
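As a standalone numerical illustration of the two quantities above (not ConvoKit code; it mirrors the Laplace-smoothed cross-entropy used by the ConvoKit language model, averaged over target tokens with the natural logarithm):

    from collections import Counter

    import numpy as np


    def smoothed_cross_entropy(target, context):
        # H(P, Q) with add-one smoothing over the context vocabulary.
        counts = Counter(context)
        denominator = len(context) + len(counts) + 1
        return float(np.mean([-np.log((counts[tok] + 1) / denominator) for tok in target]))


    target = ["this", "is", "test"]
    context = ["this", "is", "a", "test"]
    entropy = smoothed_cross_entropy(target, context)  # ~1.50 nats
    perplexity = np.exp(entropy)                       # ~4.50, i.e., exp(H(P, Q))
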
For instance, KenLM uses base-10 (see :py:class:`~convokit.Kenlm` for reference). :param target: A list of tokens that make up the target text (P). :param context: A list of tokens that make up the context text (Q). @@ -123,11 +128,12 @@ def evaluate( ) -> np.ndarray: """Computes the average deviation between target and context distributions. - For a given list of fixed size target and context sample lists, the `evaluate()` method - computes the deviation between each target and corresponding context pair, using `eval_type` - language model evaluation metric. Note that the subclass implementing this abstract base - class must define and implement the `eval_type` evaluation method. The final score output by - this method is an average of all the individual scores. + For a given list of (fixed size) target sample lists and (fixed size) context sample lists, + the :py:meth:`~convokit.LanguageModel.evaluate` method computes the deviation between each + target and corresponding context pair, using `eval_type` language model evaluation metric. + Note that the subclass implementing this abstract base class must define and implement the + `eval_type` evaluation method. The final score output by this method is an average of all + the individual scores. Also note that, if specified as keyword arguments, any class variable values are overwritten from within this method. @@ -138,7 +144,8 @@ class must define and implement the `eval_type` evaluation method. The final sco corresponding target sample lists. :param eval_type: The language model evaluation function (as `str`), used in evaluating the language model trained using the context text, evaluated using the target text. Defaults - to "cross_entropy", i.e., calls the `cross_entropy()` method. + to "cross_entropy", i.e., calls the :py:meth:`~convokit.LanguageModel.cross_entropy` + method. :param kwargs: Any additional keyword arguments needed in the language model evaluations. If any class variables are passed using `kwargs`, the corresponding class variable values are overwritten using the new values. diff --git a/convokit/surprise/surprise.py b/convokit/surprise/surprise.py index ad4cc097..73e187d1 100644 --- a/convokit/surprise/surprise.py +++ b/convokit/surprise/surprise.py @@ -139,46 +139,64 @@ def _compute_surprise( :param target: A list of tokens in the target. :param context: A list of lists of tokens in each group of the context. :param lm_evaluation_fn: The language model evaluation function. If using an instance of - `convokit.LanguageModel`, the `evaluate` function is to be used here. To see examples of - `convokit.LanguageModel`, see: `convokit.ConvoKitLanguageModel` and `convokit.Kenlm`. - The function takes in a list of target samples and corresponding context samples, and + :py:class:`~convokit.LanguageModel`, the :py:meth:`~convokit.LanguageModel.evaluate` + function is to be used here. To see examples of :py:class:`~convokit.LanguageModel`, + see: :py:class:`~convokit.ConvoKitLanguageModel` and :py:class:`~convokit.Kenlm`. This + function takes in a list of target samples and corresponding context samples, and returns the amount of surprise using some underlying language model evaluation metric. :param kwargs: Additional keyword arguments to be passed to the language model evaluation function: - - When using `convokit.LanguageModel`, the following keywords are relevant: - - `eval_type`: The language model evaluation metric, defaults to `cross_entropy`. 
- - The following arguments, if specified, overwrite the existing class values: - - `n_jobs`: The number of concurrent threads to be used for routines that are + + * When using :py:class:`~convokit.LanguageModel`, the following keywords are relevant: + + * `eval_type`: The language model evaluation metric, defaults to "cross_entropy". + * The following arguments, if specified, overwrite the existing class values: + + * `n_jobs`: The number of concurrent threads to be used for routines that are parallelized with `joblib`, defaults to 1. - - `model_type`: Name of `convokit.LanguageModel`, defaults to "language_model". - - When using `convokit.ConvoKitLanguageModel`, the following keywords are relevant: - - `eval_type`: The language model evaluation metric, defaults to `cross_entropy`. - - The following arguments, if specified, overwrite the existing class values: - - `smooth`: Indicator of using Laplace smoothing in the computation of surprise + * `model_type`: The name of :py:class:`~convokit.LanguageModel`, defaults to + "language_model". + + * When using :py:class:`~convokit.ConvoKitLanguageModel`, the following keywords are + relevant: + + * `eval_type`: The language model evaluation metric, defaults to "cross_entropy". + * The following arguments, if specified, overwrite the existing class values: + + * `smooth`: Indicator of using Laplace smoothing in the computation of surprise scores, defaults to `True`. - - The following arguments, inherited from `convokit.LanguageModel`, if specified, - overwrite the existing class values: - - `n_jobs`: The number of concurrent threads to be used for routines that are + + * The following arguments, inherited from :py:class:`~convokit.LanguageModel`, if + specified, overwrite the existing class values: + + * `n_jobs`: The number of concurrent threads to be used for routines that are parallelized with `joblib`, defaults to 1. - - `model_type`: Name of `convokit.LanguageModel`, defaults to "convokit_lm". - - When using `convokit.Kenlm`, the following keywords are relevant: - - `eval_type`: The language model evaluation metric, defaults to `cross_entropy`. - - The following arguments, if specified, overwrite the existing class values: - - `ngram_order`: The order of n-gram language model. - - `trained_model_filepath`: The filepath to a pre-trained language model that is + * `model_type`: The name of :py:class:`~convokit.LanguageModel`, defaults to + "convokit_lm". + + * When using :py:class:`~convokit.Kenlm`, the following keywords are relevant: + + * `eval_type`: The language model evaluation metric, defaults to "cross_entropy". + * The following arguments, if specified, overwrite the existing class values: + + * `ngram_order`: The order of n-gram language model. + * `trained_model_filepath`: The filepath to a pre-trained language model that is to be persistently used. - - `is_persistent`: Indicator of model persistence, i.e., the model generated + * `is_persistent`: Indicator of model persistence, i.e., the model generated in the first pass or that loaded from `trained_model_filepath` is used in all evaluations. When `trained_model_filepath` is specified, persistence is automatically implied. - - `kenlm_path`: The folder path to the folder of KenLM library. - - `models_dir`: The folder path to store the (trained) binary KenLM models. - - `model_filename`: The filename used in storing the KenLM model artefacts. 
- - The following arguments, inherited from `convokit.LanguageModel`, if specified, - overwrite the existing class values: - - `n_jobs`: The number of concurrent threads to be used for routines that are + * `kenlm_path`: The folder path to the folder of KenLM library. + * `models_dir`: The folder path to store the (trained) binary KenLM models. + * `model_filename`: The filename used in storing the KenLM model artefacts. + + * The following arguments, inherited from :py:class:`~convokit.LanguageModel`, if + specified, overwrite the existing class values: + + * `n_jobs`: The number of concurrent threads to be used for routines that are parallelized with `joblib`, defaults to 1. - - `model_type`: Name of `convokit.LanguageModel`, defaults to "kenlm". + * `model_type`: The name of :py:class:`~convokit.LanguageModel`, defaults to + "kenlm". :return: The surprise score output by the language model evaluation function. """ target_tokens = np.array(target) @@ -234,16 +252,20 @@ def _transform( the default key used will be "GROUP_group_name_MODEL_model_key" unless `group_name` and `model_key` are equal, in which case just "model_key" will be used as the key. :param kwargs: Additional keyword arguments to be passed for surprise computations (see - the documentation for `Surprise._compute_surprise()` for these arguments), and in - creating the language model (if needed): - - `language_model`: An instance of `convokit.LanguageModel` to be used in computing the - surprise scores, defaults to `convokit.ConvoKitLanguageModel` and the arguments to the - `convokit.ConvoKitLanguageModel` can be specified here as: - - `smooth`: Indicator of using Laplace smoothing in the computation of surprise + the documentation for :py:meth:`~Surprise._compute_surprise()` for these arguments), and + in creating the language model (if needed): + + * `language_model`: An instance of :py:class:`~convokit.LanguageModel` to be used in + computing the surprise scores, defaults to :py:class:`~convokit.ConvoKitLanguageModel` + and the arguments to the :py:class:`~convokit.ConvoKitLanguageModel` can be specified + here as: + + * `smooth`: Indicator of using Laplace smoothing in the computation of surprise scores, defaults to `True`. - - `n_jobs`: The number of concurrent threads to be used for routines that are + * `n_jobs`: The number of concurrent threads to be used for routines that are parallelized with `joblib`, defaults to 1. - - `model_type`: Name of `convokit.LanguageModel`, defaults to "convokit_lm". + * `model_type`: The name of :py:class:`~convokit.LanguageModel`, defaults to + "convokit_lm". :return: A modified version of the input corpus with the surprise scores. """ @@ -315,11 +337,11 @@ def __surprise_score_helper( :param surprise_scores_: The surprise score (dictionary value) that is to be updated for the corresponding utterance group and model. The dictionary is modified in place. :param lm_evaluation_fn: The language model evaluation function. If using an instance - of `convokit.LanguageModel`, the `evaluate` function is to be used here. To see the - examples of `convokit.LanguageModel`, see: `convokit.ConvoKitLanguageModel` and - `convokit.Kenlm`. The function takes in a list of target samples and corresponding - context samples, and returns the amount of surprise using some underlying language - model evaluation metric. + of :py:class:`~convokit.LanguageModel`, :py:meth:`~convokit.LanguageModel.evaluate` + function is to be used here. 
To see examples of :py:class:`~convokit.LanguageModel`, + see: :py:class:`~convokit.ConvoKitLanguageModel` and :py:class:`~convokit.Kenlm`. + The function takes in a list of target samples and corresponding context samples, + and returns the amount of surprise using some underlying model evaluation metric. """ for model_key in group_models_[group_name]: assert model_key in self._model_groups, "invalid model key" @@ -351,11 +373,11 @@ def _update_surprise_scores( for the corresponding utterance groups and models. The surprise scores dictionary is modified in place. :param lm_evaluation_fn: The language model evaluation function. If using an instance - of `convokit.LanguageModel`, the `evaluate` function is to be used here. To see the - examples of `convokit.LanguageModel`, see: `convokit.ConvoKitLanguageModel` and - `convokit.Kenlm`. The function takes in a list of target samples and corresponding - context samples, and returns the amount of surprise using some underlying language - model evaluation metric. + of :py:class:`~convokit.LanguageModel`, the `evaluate` function is to be used here. + To see the subclass implementations of :py:class:`~convokit.LanguageModel`, see: + :py:class:`~convokit.ConvoKitLanguageModel` and :py:class:`~convokit.Kenlm`. The + function takes in a list of target samples and corresponding context samples, and + returns the amount of surprise using some underlying model evaluation metric. """ if self._n_jobs == 1: for group_name in tqdm(utt_groups_, leave=False, desc="surprise", delay=2): @@ -421,15 +443,16 @@ def _update_surprise_scores( return corpus def transform(self, corpus: Corpus, **kwargs) -> Corpus: - """Wrapper over the `transform` function of the Surprise transformer. + """A wrapper over :py:meth:`~convokit.Surprise._transform` of the Surprise transformer. - Note: Since the transformer's `fit()` function populates the model groups, the `transform` - function is to be called after `fit()`. + Note: Since the transformer's :py:meth:`~convokit.Surprise.fit` method populates the model + groups, the :py:meth:`~convokit.Surprise.transform` function is to be called after calling + :py:meth:`~convokit.Surprise.fit`. :param corpus: The corpus to transform. - :param kwargs: Any keyword arguments to be passed to the `transform` function of the - Surprise transformer. Refer to the documentation of `Surprise._transform()` for specific - keyword arguments. + :param kwargs: Any keyword arguments to be passed to :py:meth:`~convokit.Surprise.transform` + function of the Surprise transformer (e.g., `eval_type`). Refer to the documentation of + :py:meth:`~convokit.Surprise._transform()` for specific keyword arguments. :return: A modified version of the input corpus with the surprise scores. """ return self._transform(corpus=corpus, **kwargs) diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst index f92d29cc..0a001512 100644 --- a/docs/source/analysis.rst +++ b/docs/source/analysis.rst @@ -16,4 +16,5 @@ These are the transformers related to generating some analysis of the Corpus. 
Pairer PairedPrediction Ranker + Surprise SpeakerConvoDiversity diff --git a/docs/source/language_model.rst b/docs/source/language_model.rst new file mode 100644 index 00000000..f3b56c13 --- /dev/null +++ b/docs/source/language_model.rst @@ -0,0 +1,21 @@ +Language model +============== + +Implements a language model and defines the `evaluate()` method, to perform +language model evaluation by comparing the deviation of the target distribution +from the context distribution (e.g., cross-entropy, perplexity). + +Base class +---------- + +.. automodule:: convokit.surprise.language_model + :members: + +Subclasses +---------- + +.. automodule:: convokit.surprise.convokit_lm + :members: + +.. automodule:: convokit.surprise.kenlm + :members: diff --git a/docs/source/surprise.rst b/docs/source/surprise.rst new file mode 100644 index 00000000..18bb23ad --- /dev/null +++ b/docs/source/surprise.rst @@ -0,0 +1,24 @@ +Surprise +======== + +Implements the measure of how "surprising" conversations are (e.g., across users +or within user conversations), thereby measuring users' language evolutions over +time. For reference, see the `tie-breaker paper +`_. + +Example usage: `surprise demo +`_, +`tennis demo +`_. + +.. automodule:: convokit.surprise.surprise + :members: + :private-members: _transform, _compute_surprise + +References +---------- + +.. toctree:: + :maxdepth: 3 + + LanguageModel From a49d8e4a3992346d2f87ca42bda4dc645a183154 Mon Sep 17 00:00:00 2001 From: Tushaar Gangavarapu Date: Fri, 23 Dec 2022 14:22:33 -0500 Subject: [PATCH 9/9] Include unit tests to test Surprise transformer. This change includes unit tests to test the functionality of the Surprise transformer, LanguageModel (using nltk.lm), and the added ConvoKitLanguageModel. The newly added tests were verified to ensure that they run as intended. 
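For orientation while reading these tests, a typical end-to-end use of the refactored transformer looks roughly like the sketch below; the sample sizes, the `smooth`/`n_jobs` settings, and the choice of corpus are illustrative, not defaults asserted by this patch:

    from convokit import ConvoKitLanguageModel, Corpus, Surprise, download

    corpus = Corpus(filename=download("subreddit-Cornell"))  # any ConvoKit corpus works here

    # One model per (speaker, conversation) pair.
    surprise = Surprise(
        model_key_selector=lambda utt: "_".join([utt.speaker.id, utt.conversation_id]),
        target_sample_size=100,      # illustrative sizes
        context_sample_size=1000,
    )
    surprise = surprise.fit(corpus)

    language_model = ConvoKitLanguageModel(smooth=True, n_jobs=4)
    corpus = surprise.transform(
        corpus,
        obj_type="utterance",
        language_model=language_model,
        eval_type="cross_entropy",
    )

    # Scores land in the "surprise" metadata field, keyed by model key.
    surprise_scores = corpus.get_utterances_dataframe()["meta.surprise"]
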
--- convokit/tests/surprise/__init__.py | 0 convokit/tests/surprise/test_convokit_lm.py | 75 +++++++ .../tests/surprise/test_language_model.py | 139 +++++++++++++ convokit/tests/surprise/test_surprise.py | 191 ++++++++++++++++++ convokit/tests/test_utils.py | 30 ++- 5 files changed, 434 insertions(+), 1 deletion(-) create mode 100644 convokit/tests/surprise/__init__.py create mode 100644 convokit/tests/surprise/test_convokit_lm.py create mode 100644 convokit/tests/surprise/test_language_model.py create mode 100644 convokit/tests/surprise/test_surprise.py diff --git a/convokit/tests/surprise/__init__.py b/convokit/tests/surprise/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/convokit/tests/surprise/test_convokit_lm.py b/convokit/tests/surprise/test_convokit_lm.py new file mode 100644 index 00000000..7ee02180 --- /dev/null +++ b/convokit/tests/surprise/test_convokit_lm.py @@ -0,0 +1,75 @@ +import unittest + +from convokit import ConvoKitLanguageModel + + +class TestConvoKitLanguageModel(unittest.TestCase): + def _init(self, target_samples, context_samples): + self._target_samples = target_samples + self._context_samples = context_samples + + def test_cross_entropy_smooth(self): + convokit_lm = ConvoKitLanguageModel(model_type="test_convokit_lm", smooth=True) + score = convokit_lm.evaluate( + self._target_samples, self._context_samples, eval_type="cross_entropy" + ) + self.assertEqual(round(float(score), 2), 1.38) + + def test_cross_entropy_no_smooth(self): + convokit_lm = ConvoKitLanguageModel(model_type="test_convokit_lm", smooth=False) + score = convokit_lm.evaluate( + self._target_samples, self._context_samples, eval_type="cross_entropy" + ) + self.assertEqual(round(float(score), 2), 1.04) + + def test_perplexity_smooth(self): + convokit_lm = ConvoKitLanguageModel(model_type="test_convokit_lm", smooth=True) + score = convokit_lm.evaluate( + self._target_samples, self._context_samples, eval_type="perplexity" + ) + self.assertEqual(round(float(score), 2), 4.02) + + def test_perplexity_no_smooth(self): + convokit_lm = ConvoKitLanguageModel(model_type="test_convokit_lm", smooth=False) + score = convokit_lm.evaluate( + self._target_samples, self._context_samples, eval_type="perplexity" + ) + self.assertEqual(round(float(score), 2), 3.00) + + +class TestWithMemory(TestConvoKitLanguageModel): + def setUp(self) -> None: + self._target_samples = [["this", "is", "test"], ["is", "test"]] + self._context_samples = [["this", "is", "a", "test"], ["this", "test"]] + super()._init(self._target_samples, self._context_samples) + + def test_cross_entropy_smooth(self): + super().test_cross_entropy_smooth() + + def test_cross_entropy_no_smooth(self): + super().test_cross_entropy_no_smooth() + + def test_perplexity_smooth(self): + super().test_perplexity_smooth() + + def test_perplexity_no_smooth(self): + super().test_perplexity_no_smooth() + + +class TestWithDb(TestConvoKitLanguageModel): + def setUp(self) -> None: + self._target_samples = [["this", "is", "test"], ["is", "test"]] + self._context_samples = [["this", "is", "a", "test"], ["this", "test"]] + super()._init(self._target_samples, self._context_samples) + + def test_cross_entropy_smooth(self): + super().test_cross_entropy_smooth() + + def test_cross_entropy_no_smooth(self): + super().test_cross_entropy_no_smooth() + + def test_perplexity_smooth(self): + super().test_perplexity_smooth() + + def test_perplexity_no_smooth(self): + super().test_perplexity_no_smooth() diff --git a/convokit/tests/surprise/test_language_model.py 
b/convokit/tests/surprise/test_language_model.py new file mode 100644 index 00000000..45161b49 --- /dev/null +++ b/convokit/tests/surprise/test_language_model.py @@ -0,0 +1,139 @@ +import unittest + +import nltk.lm as nltk_lm +from nltk.util import ngrams, everygrams + +from convokit.surprise import language_model + + +class TestLm(language_model.LanguageModel): + def __init__(self): + super().__init__("test_language_model") + + @staticmethod + def eval_func(target, context): + return abs(len(context) - len(target)) + + +class TestNltkLm(language_model.LanguageModel): + def __init__(self, ngram_order=2): + super().__init__("test_nltk_language_model") + self._ngram_order = ngram_order + + def eval_func(self, target, context): + kneser_ney_lm = nltk_lm.models.KneserNeyInterpolated( + order=self._ngram_order, vocabulary=nltk_lm.Vocabulary(target + context) + ) + kneser_ney_lm.fit([everygrams(context, max_len=self._ngram_order)]) + return kneser_ney_lm.entropy(ngrams(target, n=self._ngram_order)) + + +class TestLanguageModel(unittest.TestCase): + def _init(self, target_samples, context_samples): + self._target_samples = target_samples + self._context_samples = context_samples + + def test_model_type(self): + test_lm = language_model.LanguageModel(model_type="test_language_model") + self.assertEqual(test_lm.type, "test_language_model") + + def test_model_config(self): + test_lm = language_model.LanguageModel(model_type="test_language_model", smooth=True) + expected_config = {"model_type": "test_language_model", "n_jobs": 1, "smooth": True} + self.assertEqual(test_lm.config, expected_config) + + def test_overwrite_args(self): + test_lm = language_model.LanguageModel(model_type="test_language_model", smooth=True) + try: + test_lm.evaluate(self._target_samples, self._context_samples, smooth=False) + except RuntimeError: + pass + expected_config = {"model_type": "test_language_model", "n_jobs": 1, "smooth": False} + self.assertEqual(test_lm.config, expected_config) + + def test_evaluate_cross_entropy_runtime_error(self): + test_lm = language_model.LanguageModel(model_type="test_language_model") + with self.assertRaises(RuntimeError): + test_lm.evaluate(self._target_samples, self._context_samples, "cross_entropy") + + def test_evaluate_perplexity_runtime_error(self): + test_lm = language_model.LanguageModel(model_type="test_language_model") + with self.assertRaises(RuntimeError): + test_lm.evaluate(self._target_samples, self._context_samples, "perplexity") + + def test_evaluate_unimplemented_attribute_error(self): + test_lm = language_model.LanguageModel(model_type="test_language_model") + with self.assertRaises(AttributeError): + test_lm.evaluate(self._target_samples, self._context_samples, "unimplemented") + + def test_evaluate(self): + test_lm = TestLm() + score = test_lm.evaluate(self._target_samples, self._context_samples, "eval_func") + self.assertEqual(score, 0.5) + + def test_evaluate_nltk(self): + test_lm = TestNltkLm() + score = test_lm.evaluate(self._target_samples, self._context_samples, "eval_func") + self.assertEqual(round(float(score), 2), 1.25) + + +class TestWithMemory(TestLanguageModel): + def setUp(self) -> None: + self._target_samples = [["this", "is", "test"], ["is", "test"]] + self._context_samples = [["this", "is", "a", "test"], ["this", "test"]] + super()._init(self._target_samples, self._context_samples) + + def test_model_type(self): + super().test_model_type() + + def test_model_config(self): + super().test_model_config() + + def test_overwrite_args(self): + 
super().test_overwrite_args() + + def test_evaluate_cross_entropy_runtime_error(self): + super().test_evaluate_cross_entropy_runtime_error() + + def test_evaluate_perplexity_runtime_error(self): + super().test_evaluate_perplexity_runtime_error() + + def test_evaluate_unimplemented_attribute_error(self): + super().test_evaluate_unimplemented_attribute_error() + + def test_evaluate(self): + super().test_evaluate() + + def test_evaluate_nltk(self): + super().test_evaluate_nltk() + + +class TestWithDb(TestLanguageModel): + def setUp(self) -> None: + self._target_samples = [["this", "is", "test"], ["is", "test"]] + self._context_samples = [["this", "is", "a", "test"], ["this", "test"]] + super()._init(self._target_samples, self._context_samples) + + def test_model_type(self): + super().test_model_type() + + def test_model_config(self): + super().test_model_config() + + def test_overwrite_args(self): + super().test_overwrite_args() + + def test_evaluate_cross_entropy_runtime_error(self): + super().test_evaluate_cross_entropy_runtime_error() + + def test_evaluate_perplexity_runtime_error(self): + super().test_evaluate_perplexity_runtime_error() + + def test_evaluate_unimplemented_attribute_error(self): + super().test_evaluate_unimplemented_attribute_error() + + def test_evaluate(self): + super().test_evaluate() + + def test_evaluate_nltk(self): + super().test_evaluate_nltk() diff --git a/convokit/tests/surprise/test_surprise.py b/convokit/tests/surprise/test_surprise.py new file mode 100644 index 00000000..243f960d --- /dev/null +++ b/convokit/tests/surprise/test_surprise.py @@ -0,0 +1,191 @@ +import random +import unittest + +import numpy as np + +from convokit.surprise import Surprise, ConvoKitLanguageModel +from convokit.tests.test_utils import small_burr_conv_corpus + + +class TestSurprise(unittest.TestCase): + def _init(self, corpus) -> None: + self._corpus = corpus + + def test_fit_model_groups(self): + surprise = Surprise( + model_key_selector=lambda utt: "_".join([utt.speaker.id, utt.conversation_id]) + ) + surprise = surprise.fit(self._corpus) + expected_model_groups = { + "hamilton_0": [["Pardon", "me", "."]], + "hamilton_1": [["Who", "'s", "asking", "?"]], + "hamilton_2": [["Are", "you", "Aaron", "Burr", ",", "sir", "?"]], + "burr_0": [["Are", "you", "Aaron", "Burr", ",", "sir", "?"]], + "burr_1": [["That", "depends", ".", "Pardon", "me", "."]], + "burr_2": [["That", "depends", "."]], + } + self.assertEqual(surprise._model_groups, expected_model_groups) + + def test_fit_model_groups_text_func_selector(self): + surprise = Surprise( + model_key_selector=lambda utt: "_".join([utt.speaker.id, utt.conversation_id]) + ) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join( + [ + speaker_utt.text + for speaker_utt in utt.speaker.iter_utterances() + if speaker_utt.conversation_id != utt.conversation_id + ] + ) + ], + selector=lambda utt: utt.conversation_id == "0", + ) + expected_model_groups = { + "hamilton_0": [ + ["Who", "'s", "asking", "?", "Are", "you", "Aaron", "Burr", ",", "sir", "?"] + ], + "burr_0": [["That", "depends", ".", "Pardon", "me", ".", "That", "depends", "."]], + } + self.assertEqual(surprise._model_groups, expected_model_groups) + + def test_transform_large_context_target_size(self): + surprise = Surprise(model_key_selector=lambda utt: "corpus") + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + transformed_corpus = 
surprise.transform(self._corpus, obj_type="utterance") + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.array([score["corpus"] for score in utts]) + self.assertTrue(np.isnan(surprise_scores).all()) + + def test_transform_multiple_jobs(self): + surprise = Surprise(model_key_selector=lambda utt: "corpus", n_jobs=2) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + transformed_corpus = surprise.transform(self._corpus, obj_type="utterance", n_jobs=2) + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.array([score["corpus"] for score in utts]) + self.assertTrue(np.isnan(surprise_scores).all()) + + def test_transform_convokit_language_model(self): + random.Random(42) + surprise = Surprise( + model_key_selector=lambda utt: "corpus", target_sample_size=3, context_sample_size=3 + ) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + language_model = ConvoKitLanguageModel(smooth=False) + transformed_corpus = surprise.transform( + self._corpus, obj_type="utterance", language_model=language_model + ) + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.round(np.array([score["corpus"] for score in utts]), 1) + expected_scores = np.array([1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]) + self.assertTrue(np.allclose(surprise_scores, expected_scores, atol=1e-01)) + + def test_transform_language_model_parameters(self): + random.Random(42) + surprise = Surprise( + model_key_selector=lambda utt: "corpus", target_sample_size=3, context_sample_size=3 + ) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + transformed_corpus = surprise.transform(self._corpus, obj_type="utterance", smooth=False) + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.round(np.array([score["corpus"] for score in utts]), 1) + expected_scores = np.array([1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]) + self.assertTrue(np.allclose(surprise_scores, expected_scores, atol=1e-01)) + + def test_transform(self): + random.Random(42) + surprise = Surprise( + model_key_selector=lambda utt: "corpus", target_sample_size=3, context_sample_size=3 + ) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + transformed_corpus = surprise.transform(self._corpus, obj_type="utterance") + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.round(np.array([score["corpus"] for score in utts]), 1) + expected_scores = np.array([1.8, 1.7, 1.7, 1.8, 1.7, 1.8, 1.8]) + self.assertTrue(np.allclose(surprise_scores, expected_scores, atol=1e-01)) + + +class TestWithMemory(TestSurprise): + def setUp(self) -> None: + self._small_burr_corpus = small_burr_conv_corpus() + super()._init(self._small_burr_corpus) + + def test_fit_model_groups(self): + super().test_fit_model_groups() + + def test_fit_model_groups_text_func_selector(self): + super().test_fit_model_groups_text_func_selector() + + def test_transform_large_context_target_size(self): + super().test_transform_large_context_target_size() + + def test_transform_multiple_jobs(self): + 
super().test_transform_multiple_jobs() + + def test_transform_convokit_language_model(self): + super().test_transform_convokit_language_model() + + def test_transform_language_model_parameters(self): + super().test_transform_language_model_parameters() + + def test_transform(self): + super().test_transform() + + +class TestWithDb(TestSurprise): + def setUp(self) -> None: + self._small_burr_corpus = small_burr_conv_corpus() + super()._init(self._small_burr_corpus) + + def test_fit_model_groups(self): + super().test_fit_model_groups() + + def test_fit_model_groups_text_func_selector(self): + super().test_fit_model_groups_text_func_selector() + + def test_transform_large_context_target_size(self): + super().test_transform_large_context_target_size() + + def test_transform_multiple_jobs(self): + super().test_transform_multiple_jobs() + + def test_transform_convokit_language_model(self): + super().test_transform_convokit_language_model() + + def test_transform_language_model_parameters(self): + super().test_transform_language_model_parameters() + + def test_transform(self): + super().test_transform() diff --git a/convokit/tests/test_utils.py b/convokit/tests/test_utils.py index 42bd5759..d9a210ab 100644 --- a/convokit/tests/test_utils.py +++ b/convokit/tests/test_utils.py @@ -9,7 +9,10 @@ FOX_TEXT = "A quick brown fox jumps over the lazy dog." BUFFALO_TEXT = "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo" -FOX_BUFFALO_TEXT = "A quick brown fox jumps over the lazy dog. Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo" +FOX_BUFFALO_TEXT = ( + "A quick brown fox jumps over the lazy dog. Buffalo buffalo Buffalo buffalo " + "buffalo buffalo Buffalo buffalo" +) BURR_SIR_TEXT_1 = "Pardon me. Are you Aaron Burr, sir?" BURR_SIR_TEXT_2 = "That depends. Who's asking?" BURR_SIR_SENTENCE_1 = "Pardon me." @@ -129,6 +132,31 @@ def small_burr_corpus(): return Corpus(utterances=utterances) +def small_burr_conv_corpus(): + hamilton = Speaker(id="hamilton") + burr = Speaker(id="burr") + + utterances = [ + Utterance(id="0", text=BURR_SIR_SENTENCE_1, conversation_id="0", speaker=hamilton), + Utterance( + id="1", text=BURR_SIR_SENTENCE_2, conversation_id="0", reply_to="0", speaker=burr + ), + Utterance(id="2", text=BURR_SIR_SENTENCE_3, conversation_id="1", speaker=burr), + Utterance( + id="3", text=BURR_SIR_SENTENCE_4, conversation_id="1", reply_to="2", speaker=hamilton + ), + Utterance( + id="4", text=BURR_SIR_SENTENCE_1, conversation_id="1", reply_to="3", speaker=burr + ), + Utterance(id="5", text=BURR_SIR_SENTENCE_2, conversation_id="2", speaker=hamilton), + Utterance( + id="6", text=BURR_SIR_SENTENCE_3, conversation_id="2", reply_to="5", speaker=burr + ), + ] + + return Corpus(utterances=utterances) + + def small_burr_corpus_parsed(): corpus = small_burr_corpus() utterance_infos = [