forked from joshualoehr/ngram-language-model
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
51 lines (37 loc) · 1.39 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/bin/env python
import nltk
# Start-of-sentence marker. The trailing space is intentional: the marker is
# repeated via string multiplication and placed directly in front of the
# sentence text, so it must carry its own separator.
SOS = "<s> "
# End-of-sentence marker, appended after a single space.
EOS = "</s>"
# Placeholder substituted for tokens that occur only once in the corpus.
UNK = "<UNK>"
def add_sentence_tokens(sentences, n):
    """Surround every sentence with start- and end-of-sentence markers.

    A model of order n >= 2 gets n-1 leading SOS markers per sentence;
    any lower order gets exactly one.

    Args:
        sentences (list of str): the sentences to wrap.
        n (int): order of the n-gram model which will use these sentences.

    Returns:
        A new list with one wrapped string per input sentence, in order.

    """
    # SOS already ends with a space, so repeating it keeps markers separated.
    if n > 1:
        prefix = SOS * (n - 1)
    else:
        prefix = SOS

    wrapped = []
    for sentence in sentences:
        wrapped.append('{}{} {}'.format(prefix, sentence, EOS))
    return wrapped
def replace_singletons(tokens):
    """Swap every token that occurs exactly once in the corpus for <UNK>.

    Args:
        tokens (list of str): the tokens comprising the corpus.

    Returns:
        A new list of the same length and order, where tokens seen more
        than once are kept and all others are replaced by UNK.

    """
    counts = nltk.FreqDist(tokens)

    replaced = []
    for tok in tokens:
        if counts[tok] > 1:
            replaced.append(tok)
        else:
            replaced.append(UNK)
    return replaced
def preprocess(sentences, n):
    """Add SOS/EOS/UNK tokens to given sentences and tokenize.

    Each sentence is wrapped with sentence markers, the wrapped sentences
    are joined into one corpus string and split on spaces, and tokens that
    occur only once are replaced by the unknown-word placeholder.

    Args:
        sentences (list of str): the sentences to preprocess.
        n (int): order of the n-gram model which will use these sentences.

    Returns:
        A flat list of word tokens for the whole corpus. Empty-string
        tokens are never included; an empty corpus yields an empty list.

    """
    wrapped = add_sentence_tokens(sentences, n)

    # Splitting on a literal ' ' produces '' for every pair of adjacent
    # spaces (blank sentences, leading/trailing or doubled whitespace) and
    # [''] for an empty corpus. Drop those so they are neither counted as
    # vocabulary nor turned into a spurious <UNK>.
    tokens = [tok for tok in ' '.join(wrapped).split(' ') if tok]

    return replace_singletons(tokens)