10 changes: 3 additions & 7 deletions scripts/rnnlm/choose_features.py
@@ -10,12 +10,8 @@
from collections import defaultdict
sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)

-# because this script splits inside words, we cannot use latin-1; we actually need to know
-# what the encoding is. By default we make this utf-8; to handle encodings that are not compatible
-# with utf-8 (e.g. gbk), we'll eventually have to make the encoding an option to this script.

import re
-tab_or_space = re.compile('[ \t]+')


parser = argparse.ArgumentParser(description="This script chooses the sparse feature representation of words. "
                                 "To be more specific, it chooses the set of features-- you compute "
@@ -92,7 +88,7 @@ def read_vocab(vocab_file):
    vocab = {}
    with open(vocab_file, 'r', encoding="utf-8") as f:
        for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
            assert len(fields) == 2
            if fields[0] in vocab:
                sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}"
@@ -121,7 +117,7 @@ def read_unigram_probs(unigram_probs_file):
    unigram_probs = []
    with open(unigram_probs_file, 'r', encoding="utf-8") as f:
        for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
            assert len(fields) == 2
            idx = int(fields[0])
            if idx >= len(unigram_probs):
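Note on the `re.split(tab_or_space, line)` → `line.split()` change above: the two are not equivalent, and the difference is exactly what motivates the patch. A minimal sketch of the behavior (illustrative values, not taken from the patch):

```python
import re

tab_or_space = re.compile('[ \t]+')
line = " word\t42\n"

# The regex split keeps the trailing newline glued to the last field and
# emits an empty leading field when the line starts with a space or tab:
print(re.split(tab_or_space, line))  # ['', 'word', '42\n']

# str.split() with no argument splits on runs of ANY whitespace and drops
# leading/trailing whitespace, so the fields come out clean:
print(line.split())                  # ['word', '42']
```

With the regex version, a stray `\n` rides along inside the last field (`int()` happens to tolerate it), and any line with leading whitespace breaks the `assert len(fields) == 2` checks; `line.split()` removes both hazards and needs no module-level compiled pattern.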
4 changes: 2 additions & 2 deletions scripts/rnnlm/get_best_model.py
@@ -21,7 +21,7 @@

num_iters = None
try:
-    with open(args.rnnlm_dir + "/info.txt", encoding="latin-1") as f:
+    with open(args.rnnlm_dir + "/info.txt", encoding="utf-8") as f:
        for line in f:
            a = line.split("=")
            if a[0] == "num_iters":
@@ -40,7 +40,7 @@
for i in range(1, num_iters):
    this_logfile = "{0}/log/compute_prob.{1}.log".format(args.rnnlm_dir, i)
    try:
-        f = open(this_logfile, 'r', encoding='latin-1')
+        f = open(this_logfile, 'r', encoding='utf-8')
    except:
        sys.exit(sys.argv[0] + ": could not open log-file {0}".format(this_logfile))
    this_objf = -1000
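Context for the latin-1 → utf-8 switch here and throughout the PR: latin-1 assigns a character to every possible byte value, so reads under it never fail, while utf-8 is strict and rejects byte sequences that are not valid UTF-8. A small sketch of the difference, using made-up bytes rather than anything from a real Kaldi log:

```python
valid = "objf 你".encode("utf-8")  # b'objf \xe4\xbd\xa0'
junk = b"objf \xff\xfe"            # not valid UTF-8

print(valid.decode("latin-1"))     # 'objf ä½\xa0' -- silent mojibake
print(valid.decode("utf-8"))       # 'objf 你' -- the intended text

try:
    junk.decode("utf-8")
except UnicodeDecodeError as e:
    print("utf-8 is strict:", e)   # latin-1 would have decoded this silently
```

So the change trades "never crashes, sometimes garbles" for "fails loudly on data that is not actually UTF-8", which is the safer default once the whole pipeline agrees on one encoding.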
2 changes: 1 addition & 1 deletion scripts/rnnlm/get_embedding_dim.py
@@ -45,7 +45,7 @@
left_context=0
right_context=0
for line in out_lines:
-    line = line.decode('latin-1')
+    line = line.decode('utf-8')
    m = re.search(r'input-node name=input dim=(\d+)', line)
    if m is not None:
        try:
6 changes: 3 additions & 3 deletions scripts/rnnlm/get_special_symbol_opts.py
@@ -9,7 +9,7 @@
import sys

import re
-tab_or_space = re.compile('[ \t]+')


parser = argparse.ArgumentParser(description="This script checks whether the special symbols "
                                 "appear in words.txt with expected values, if not, it will "
@@ -28,9 +28,9 @@

lower_ids = {}
upper_ids = {}
-input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1')
+input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
for line in input_stream:
-    fields = re.split(tab_or_space, line)
+    fields = line.split()
    assert(len(fields) == 2)
    sym = fields[0]
    if sym in special_symbols:
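The `io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')` line above is the stdin-side counterpart of the stdout reopening used elsewhere in this PR: it pins the decoding of piped input instead of trusting the locale. A standalone sketch of the pattern (a hypothetical filter script, not from the patch):

```python
import io
import sys

# Decode stdin as UTF-8 regardless of LC_ALL/LANG; bad bytes surface as
# UnicodeDecodeError instead of being silently misread.
input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
for line in input_stream:
    fields = line.split()
    print(len(fields), fields)
```

For example, `echo '<s> 1' | python3 filter.py` would print `2 ['<s>', '1']` even under a C locale.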
14 changes: 7 additions & 7 deletions scripts/rnnlm/get_unigram_probs.py
@@ -8,7 +8,7 @@
import sys

import re
-tab_or_space = re.compile('[ \t]+')


parser = argparse.ArgumentParser(description="This script gets the unigram probabilities of words.",
                                 epilog="E.g. " + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt "
@@ -77,10 +77,10 @@ def get_all_data_sources_except_dev(text_dir):
# value is a tuple (repeated_times_per_epoch, weight)
def read_data_weights(weights_file, data_sources):
    data_weights = {}
-    with open(weights_file, 'r', encoding="latin-1") as f:
+    with open(weights_file, 'r', encoding="utf-8") as f:
        for line in f:
            try:
-                fields = re.split(tab_or_space, line)
+                fields = line.split()
                assert len(fields) == 3
                if fields[0] in data_weights:
                    raise Exception("duplicated data source({0}) specified in "
@@ -102,9 +102,9 @@ def read_data_weights(weights_file, data_sources):
# return the vocab, which is a dict mapping the word to an integer id.
def read_vocab(vocab_file):
    vocab = {}
-    with open(vocab_file, 'r', encoding="latin-1") as f:
+    with open(vocab_file, 'r', encoding="utf-8") as f:
        for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
            assert len(fields) == 2
            if fields[0] in vocab:
                sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}"
@@ -131,9 +131,9 @@ def get_counts(data_sources, data_weights, vocab):
        if weight == 0.0:
            continue

-        with open(counts_file, 'r', encoding="latin-1") as f:
+        with open(counts_file, 'r', encoding="utf-8") as f:
            for line in f:
-                fields = re.split(tab_or_space, line)
+                fields = line.split()
                if len(fields) != 2:
                    print("Warning, should be 2 cols:", fields, line, file=sys.stderr)
                assert(len(fields) == 2)
                word = fields[0]
8 changes: 4 additions & 4 deletions scripts/rnnlm/get_vocab.py
@@ -6,10 +6,10 @@
import os
import argparse
import sys
-sys.stdout = open(1, 'w', encoding='latin-1', closefd=False)
+sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)

import re
-tab_or_space = re.compile('[ \t]+')


parser = argparse.ArgumentParser(description="This script gets a vocab from unigram counts "
                                 "of words produced by get_unigram_counts.sh",
@@ -28,10 +28,10 @@
# Add the count for every word in counts_file
# the result is written into word_counts
def add_counts(word_counts, counts_file):
-    with open(counts_file, 'r', encoding="latin-1") as f:
+    with open(counts_file, 'r', encoding="utf-8") as f:
        for line in f:
            line = line.strip(" \t\r\n")
-            word_and_count = re.split(tab_or_space, line)
+            word_and_count = line.split()
            assert len(word_and_count) == 2
            if word_and_count[0] in word_counts:
                word_counts[word_and_count[0]] += int(word_and_count[1])
14 changes: 7 additions & 7 deletions scripts/rnnlm/get_word_features.py
@@ -10,7 +10,7 @@
from collections import defaultdict

import re
-tab_or_space = re.compile('[ \t]+')


parser = argparse.ArgumentParser(description="This script turns the words into the sparse feature representation, "
                                 "using features from rnnlm/choose_features.py.",
@@ -41,9 +41,9 @@
# return the vocab, which is a dict mapping the word to an integer id.
def read_vocab(vocab_file):
    vocab = {}
-    with open(vocab_file, 'r', encoding="latin-1") as f:
+    with open(vocab_file, 'r', encoding="utf-8") as f:
        for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
            assert len(fields) == 2
            if fields[0] in vocab:
                sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}"
@@ -62,9 +62,9 @@ def read_vocab(vocab_file):
# return a list of unigram_probs, indexed by word id
def read_unigram_probs(unigram_probs_file):
    unigram_probs = []
-    with open(unigram_probs_file, 'r', encoding="latin-1") as f:
+    with open(unigram_probs_file, 'r', encoding="utf-8") as f:
        for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
            assert len(fields) == 2
            idx = int(fields[0])
            if idx >= len(unigram_probs):
@@ -103,9 +103,9 @@ def read_features(features_file):
    feats['min_ngram_order'] = 10000
    feats['max_ngram_order'] = -1

-    with open(features_file, 'r', encoding="latin-1") as f:
+    with open(features_file, 'r', encoding="utf-8") as f:
        for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
            assert(len(fields) in [3, 4, 5])

            feat_id = int(fields[0])
12 changes: 6 additions & 6 deletions scripts/rnnlm/prepare_split_data.py
@@ -9,7 +9,7 @@
import sys

import re
-tab_or_space = re.compile('[ \t]+')


parser = argparse.ArgumentParser(description="This script prepares files containing integerized text, "
                                 "for consumption by nnet3-get-egs.",
@@ -66,10 +66,10 @@ def get_all_data_sources_except_dev(text_dir):
# value is a tuple (repeated_times_per_epoch, weight)
def read_data_weights(weights_file, data_sources):
    data_weights = {}
-    with open(weights_file, 'r', encoding="latin-1") as f:
+    with open(weights_file, 'r', encoding="utf-8") as f:
        for line in f:
            try:
-                fields = re.split(tab_or_space, line)
+                fields = line.split()
                assert len(fields) == 3
                if fields[0] in data_weights:
                    raise Exception("duplicated data source({0}) specified in "
@@ -97,7 +97,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles):
    num_outputs = len(output_filehandles)
    n = 0
    try:
-        f = open(source_filename, 'r', encoding="latin-1")
+        f = open(source_filename, 'r', encoding="utf-8")
    except Exception as e:
        sys.exit(sys.argv[0] + ": failed to open file {0} for reading: {1} ".format(
            source_filename, str(e)))
@@ -124,7 +124,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles):
    os.makedirs(args.split_dir + "/info")

# set up the 'num_splits' file, which contains an integer.
-with open("{0}/info/num_splits".format(args.split_dir), 'w', encoding="latin-1") as f:
+with open("{0}/info/num_splits".format(args.split_dir), 'w', encoding="utf-8") as f:
    print(args.num_splits, file=f)

# e.g. set temp_files = [ 'foo/1.tmp', 'foo/2.tmp', ..., 'foo/5.tmp' ]
@@ -136,7 +136,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles):
temp_filehandles = []
for fname in temp_files:
    try:
-        temp_filehandles.append(open(fname, 'w', encoding="latin-1"))
+        temp_filehandles.append(open(fname, 'w', encoding="utf-8"))
    except Exception as e:
        sys.exit(sys.argv[0] + ": failed to open file: " + str(e) +
                 ".. if this is a max-open-filehandles limitation, you may "
2 changes: 1 addition & 1 deletion scripts/rnnlm/rnnlm_cleanup.py
@@ -69,7 +69,7 @@ def get_compute_prob_info(log_file):
    compute_prob_done = False
    # roughly based on code in get_best_model.py
    try:
-        f = open(log_file, "r", encoding="latin-1")
+        f = open(log_file, "r", encoding="utf-8")
    except:
        print(script_name + ": warning: compute_prob log not found for iteration " +
              str(iter) + ". Skipping",
17 changes: 6 additions & 11 deletions scripts/rnnlm/show_word_features.py
@@ -7,15 +7,10 @@
import argparse
import sys

-# The use of latin-1 encoding does not preclude reading utf-8. latin-1 encoding
-# means "treat words as sequences of bytes", and it is compatible with utf-8
-# encoding as well as other encodings such as gbk, as long as the spaces are
-# also spaces in ascii (which we check). It is basically how we emulate the
-# behavior of python before python3.
-sys.stdout = open(1, 'w', encoding='latin-1', closefd=False)
+sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)

import re
-tab_or_space = re.compile('[ \t]+')


parser = argparse.ArgumentParser(description="This script turns the word features to a human readable format.",
                                 epilog="E.g. " + sys.argv[0] + " exp/rnnlm/word_feats.txt exp/rnnlm/features.txt "
@@ -36,9 +31,9 @@
def read_feature_type_and_key(features_file):
    feat_types = {}

-    with open(features_file, 'r', encoding="latin-1") as f:
+    with open(features_file, 'r', encoding="utf-8") as f:
        for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
            assert(len(fields) in [2, 3, 4])

            feat_id = int(fields[0])
@@ -53,9 +48,9 @@ def read_feature_type_and_key(features_file):
feat_type_and_key = read_feature_type_and_key(args.features_file)

num_word_feats = 0
-with open(args.word_features_file, 'r', encoding="latin-1") as f:
+with open(args.word_features_file, 'r', encoding="utf-8") as f:
    for line in f:
-        fields = re.split(tab_or_space, line)
+        fields = line.split()
        assert len(fields) % 2 == 1

        print(int(fields[0]), end='\t')
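The comment block deleted at the top of this file documented the old latin-1 "bytes in, bytes out" trick; with the whole pipeline now standardized on utf-8 it no longer applies. What remains is the fd-1 reopening idiom, which is worth spelling out. A sketch (assuming a UTF-8-capable terminal or pipe downstream):

```python
import sys

# Rebind sys.stdout to file descriptor 1 with an explicit encoding, so
# print() emits UTF-8 regardless of the locale. closefd=False keeps the
# underlying descriptor open when this wrapper object is torn down at
# interpreter shutdown.
sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
print("словарь 你好")  # written to fd 1 as UTF-8 bytes
```

This pins the output encoding per-script instead of relying on PYTHONIOENCODING or the environment, which is why the scripts set it right after the imports.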
6 changes: 3 additions & 3 deletions scripts/rnnlm/validate_features.py
@@ -8,7 +8,7 @@
import sys

import re
-tab_or_space = re.compile('[ \t]+')


parser = argparse.ArgumentParser(description="Validates features file, produced by rnnlm/choose_features.py.",
                                 epilog="E.g. " + sys.argv[0] + " exp/rnnlm/features.txt",
@@ -24,7 +24,7 @@
if not os.path.isfile(args.features_file):
    sys.exit(sys.argv[0] + ": Expected file {0} to exist".format(args.features_file))

-with open(args.features_file, 'r', encoding="latin-1") as f:
+with open(args.features_file, 'r', encoding="utf-8") as f:
    has_unigram = False
    has_length = False
    idx = 0
@@ -33,7 +33,7 @@
    final_feats = {}
    word_feats = {}
    for line in f:
-        fields = re.split(tab_or_space, line)
+        fields = line.split()
        assert(len(fields) in [3, 4, 5])

        assert idx == int(fields[0])
10 changes: 5 additions & 5 deletions scripts/rnnlm/validate_text_dir.py
@@ -8,7 +8,7 @@
import sys

import re
-tab_or_space = re.compile('[ \t]+')


parser = argparse.ArgumentParser(description="Validates data directory containing text "
                                 "files from one or more data sources, including dev.txt.",
@@ -40,7 +40,7 @@


def check_text_file(text_file):
-    with open(text_file, 'r', encoding="latin-1") as f:
+    with open(text_file, 'r', encoding="utf-8") as f:
        found_nonempty_line = False
        lineno = 0
        if args.allow_internal_eos == 'true':
@@ -54,7 +54,7 @@ def check_text_file(text_file):
            lineno += 1
            if args.spot_check == 'true' and lineno > 10:
                break
-            words = re.split(tab_or_space, line)
+            words = line.split()
            if len(words) != 0:
                found_nonempty_line = True
                for word in words:
@@ -76,9 +76,9 @@ def check_text_file(text_file):
    # with some kind of utterance-id
    first_field_set = set()
    other_fields_set = set()
-    with open(text_file, 'r', encoding="latin-1") as f:
+    with open(text_file, 'r', encoding="utf-8") as f:
        for line in f:
-            array = re.split(tab_or_space, line)
+            array = line.split()
            if len(array) > 0:
                first_word = array[0]
                if first_word in first_field_set or first_word in other_fields_set:
10 changes: 5 additions & 5 deletions scripts/rnnlm/validate_word_features.py
@@ -8,7 +8,7 @@
import sys

import re
-tab_or_space = re.compile('[ \t]+')


parser = argparse.ArgumentParser(description="Validates word features file, produced by rnnlm/get_word_features.py.",
                                 epilog="E.g. " + sys.argv[0] + " --features-file=exp/rnnlm/features.txt "
@@ -28,9 +28,9 @@
unigram_feat_id = -1
length_feat_id = -1
max_feat_id = -1
-with open(args.features_file, 'r', encoding="latin-1") as f:
+with open(args.features_file, 'r', encoding="utf-8") as f:
    for line in f:
-        fields = re.split(tab_or_space, line)
+        fields = line.split()
        assert(len(fields) in [3, 4, 5])

        feat_id = int(fields[0])
@@ -52,9 +52,9 @@
        if feat_id > max_feat_id:
            max_feat_id = feat_id

-with open(args.word_features_file, 'r', encoding="latin-1") as f:
+with open(args.word_features_file, 'r', encoding="utf-8") as f:
    for line in f:
-        fields = re.split(tab_or_space, line)
+        fields = line.split()
        assert len(fields) > 0 and len(fields) % 2 == 1
        word_id = int(fields[0])
