diff --git a/scripts/rnnlm/choose_features.py b/scripts/rnnlm/choose_features.py
index c6621e04494..595c1d85bc1 100755
--- a/scripts/rnnlm/choose_features.py
+++ b/scripts/rnnlm/choose_features.py
@@ -10,12 +10,8 @@
 from collections import defaultdict
 
 sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
-# because this script splits inside words, we cannot use latin-1; we actually need to know what
-# what the encoding is. By default we make this utf-8; to handle encodings that are not compatible
-# with utf-8 (e.g. gbk), we'll eventually have to make the encoding an option to this script.
-
 import re
-tab_or_space = re.compile('[ \t]+')
+
 
 parser = argparse.ArgumentParser(description="This script chooses the sparse feature representation of words. "
                                  "To be more specific, it chooses the set of features-- you compute "
@@ -92,7 +88,7 @@ def read_vocab(vocab_file):
     vocab = {}
     with open(vocab_file, 'r', encoding="utf-8") as f:
         for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
             assert len(fields) == 2
             if fields[0] in vocab:
                 sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}"
@@ -121,7 +117,7 @@ def read_unigram_probs(unigram_probs_file):
     unigram_probs = []
     with open(unigram_probs_file, 'r', encoding="utf-8") as f:
         for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
             assert len(fields) == 2
             idx = int(fields[0])
             if idx >= len(unigram_probs):
diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py
index 333ed8dbfc7..ed266346e06 100755
--- a/scripts/rnnlm/get_best_model.py
+++ b/scripts/rnnlm/get_best_model.py
@@ -21,7 +21,7 @@
 
 num_iters = None
 try:
-    with open(args.rnnlm_dir + "/info.txt", encoding="latin-1") as f:
+    with open(args.rnnlm_dir + "/info.txt", encoding="utf-8") as f:
         for line in f:
             a = line.split("=")
             if a[0] == "num_iters":
@@ -40,7 +40,7 @@
 for i in range(1, num_iters):
     this_logfile = "{0}/log/compute_prob.{1}.log".format(args.rnnlm_dir, i)
     try:
-        f = open(this_logfile, 'r', encoding='latin-1')
+        f = open(this_logfile, 'r', encoding='utf-8')
     except:
         sys.exit(sys.argv[0] + ": could not open log-file {0}".format(this_logfile))
     this_objf = -1000
diff --git a/scripts/rnnlm/get_embedding_dim.py b/scripts/rnnlm/get_embedding_dim.py
index 63eaf307498..1d516e0edf5 100755
--- a/scripts/rnnlm/get_embedding_dim.py
+++ b/scripts/rnnlm/get_embedding_dim.py
@@ -45,7 +45,7 @@
 left_context=0
 right_context=0
 for line in out_lines:
-    line = line.decode('latin-1')
+    line = line.decode('utf-8')
     m = re.search(r'input-node name=input dim=(\d+)', line)
     if m is not None:
         try:
diff --git a/scripts/rnnlm/get_special_symbol_opts.py b/scripts/rnnlm/get_special_symbol_opts.py
index 4310b116ad7..7ee0ca54c9a 100755
--- a/scripts/rnnlm/get_special_symbol_opts.py
+++ b/scripts/rnnlm/get_special_symbol_opts.py
@@ -9,7 +9,7 @@
 import sys
 
 import re
-tab_or_space = re.compile('[ \t]+')
+
 
 parser = argparse.ArgumentParser(description="This script checks whether the special symbols "
                                  "appear in words.txt with expected values, if not, it will "
@@ -28,9 +28,9 @@
 lower_ids = {}
 upper_ids = {}
 
-input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1')
+input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
 for line in input_stream:
-    fields = re.split(tab_or_space, line)
+    fields = line.split()
     assert(len(fields) == 2)
     sym = fields[0]
     if sym in special_symbols:
diff --git a/scripts/rnnlm/get_unigram_probs.py b/scripts/rnnlm/get_unigram_probs.py
index ab3f9bb382f..e3189b26a92 100755
--- a/scripts/rnnlm/get_unigram_probs.py
+++ b/scripts/rnnlm/get_unigram_probs.py
@@ -8,7 +8,7 @@
 import sys
 
 import re
-tab_or_space = re.compile('[ \t]+')
+
 
 parser = argparse.ArgumentParser(description="This script gets the unigram probabilities of words.",
                                  epilog="E.g. " + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt "
@@ -77,10 +77,10 @@ def get_all_data_sources_except_dev(text_dir):
 # value is a tuple (repeated_times_per_epoch, weight)
 def read_data_weights(weights_file, data_sources):
     data_weights = {}
-    with open(weights_file, 'r', encoding="latin-1") as f:
+    with open(weights_file, 'r', encoding="utf-8") as f:
         for line in f:
             try:
-                fields = re.split(tab_or_space, line)
+                fields = line.split()
                 assert len(fields) == 3
                 if fields[0] in data_weights:
                     raise Exception("duplicated data source({0}) specified in "
@@ -102,9 +102,9 @@
 # return the vocab, which is a dict mapping the word to a integer id.
 def read_vocab(vocab_file):
     vocab = {}
-    with open(vocab_file, 'r', encoding="latin-1") as f:
+    with open(vocab_file, 'r', encoding="utf-8") as f:
         for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
             assert len(fields) == 2
             if fields[0] in vocab:
                 sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}"
@@ -131,9 +131,9 @@ def get_counts(data_sources, data_weights, vocab):
         if weight == 0.0:
             continue
 
-        with open(counts_file, 'r', encoding="latin-1") as f:
+        with open(counts_file, 'r', encoding="utf-8") as f:
             for line in f:
-                fields = re.split(tab_or_space, line)
+                fields = line.split()
                 if len(fields) != 2: print("Warning, should be 2 cols:", fields, line, file=sys.stderr);
                 assert(len(fields) == 2)
                 word = fields[0]
diff --git a/scripts/rnnlm/get_vocab.py b/scripts/rnnlm/get_vocab.py
index 1502e915f9c..baafcb3a131 100755
--- a/scripts/rnnlm/get_vocab.py
+++ b/scripts/rnnlm/get_vocab.py
@@ -6,10 +6,10 @@
 import os
 import argparse
 import sys
-sys.stdout = open(1, 'w', encoding='latin-1', closefd=False)
+sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
 
 import re
-tab_or_space = re.compile('[ \t]+')
+
 
 parser = argparse.ArgumentParser(description="This script get a vocab from unigram counts "
                                  "of words produced by get_unigram_counts.sh",
@@ -28,10 +28,10 @@
 # Add the count for every word in counts_file
 # the result is written into word_counts
 def add_counts(word_counts, counts_file):
-    with open(counts_file, 'r', encoding="latin-1") as f:
+    with open(counts_file, 'r', encoding="utf-8") as f:
         for line in f:
             line = line.strip(" \t\r\n")
-            word_and_count = re.split(tab_or_space, line)
+            word_and_count = line.split()
             assert len(word_and_count) == 2
             if word_and_count[0] in word_counts:
                 word_counts[word_and_count[0]] += int(word_and_count[1])
diff --git a/scripts/rnnlm/get_word_features.py b/scripts/rnnlm/get_word_features.py
index aeb7a3ec6ae..cdcc0a77734 100755
--- a/scripts/rnnlm/get_word_features.py
+++ b/scripts/rnnlm/get_word_features.py
@@ -10,7 +10,7 @@
 from collections import defaultdict
 
 import re
-tab_or_space = re.compile('[ \t]+')
+
 
 parser = argparse.ArgumentParser(description="This script turns the words into the sparse feature representation, "
                                  "using features from rnnlm/choose_features.py.",
@@ -41,9 +41,9 @@
 # return the vocab, which is a dict mapping the word to a integer id.
 def read_vocab(vocab_file):
     vocab = {}
-    with open(vocab_file, 'r', encoding="latin-1") as f:
+    with open(vocab_file, 'r', encoding="utf-8") as f:
         for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
             assert len(fields) == 2
             if fields[0] in vocab:
                 sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}"
@@ -62,9 +62,9 @@ def read_vocab(vocab_file):
 # return a list of unigram_probs, indexed by word id
 def read_unigram_probs(unigram_probs_file):
     unigram_probs = []
-    with open(unigram_probs_file, 'r', encoding="latin-1") as f:
+    with open(unigram_probs_file, 'r', encoding="utf-8") as f:
         for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
             assert len(fields) == 2
             idx = int(fields[0])
             if idx >= len(unigram_probs):
@@ -103,9 +103,9 @@ def read_features(features_file):
     feats['min_ngram_order'] = 10000
     feats['max_ngram_order'] = -1
 
-    with open(features_file, 'r', encoding="latin-1") as f:
+    with open(features_file, 'r', encoding="utf-8") as f:
         for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
             assert(len(fields) in [3, 4, 5])
 
             feat_id = int(fields[0])
diff --git a/scripts/rnnlm/prepare_split_data.py b/scripts/rnnlm/prepare_split_data.py
index cceac48313e..427f043df98 100755
--- a/scripts/rnnlm/prepare_split_data.py
+++ b/scripts/rnnlm/prepare_split_data.py
@@ -9,7 +9,7 @@
 import sys
 
 import re
-tab_or_space = re.compile('[ \t]+')
+
 
 parser = argparse.ArgumentParser(description="This script prepares files containing integerized text, "
                                  "for consumption by nnet3-get-egs.",
@@ -66,10 +66,10 @@ def get_all_data_sources_except_dev(text_dir):
 # value is a tuple (repeated_times_per_epoch, weight)
 def read_data_weights(weights_file, data_sources):
     data_weights = {}
-    with open(weights_file, 'r', encoding="latin-1") as f:
+    with open(weights_file, 'r', encoding="utf-8") as f:
         for line in f:
             try:
-                fields = re.split(tab_or_space, line)
+                fields = line.split()
                 assert len(fields) == 3
                 if fields[0] in data_weights:
                     raise Exception("duplicated data source({0}) specified in "
@@ -97,7 +97,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles):
     num_outputs = len(output_filehandles)
     n = 0
    try:
-        f = open(source_filename, 'r', encoding="latin-1")
+        f = open(source_filename, 'r', encoding="utf-8")
     except Exception as e:
         sys.exit(sys.argv[0] + ": failed to open file {0} for reading: {1} ".format(
             source_filename, str(e)))
@@ -124,7 +124,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles):
     os.makedirs(args.split_dir + "/info")
 
 # set up the 'num_splits' file, which contains an integer.
-with open("{0}/info/num_splits".format(args.split_dir), 'w', encoding="latin-1") as f:
+with open("{0}/info/num_splits".format(args.split_dir), 'w', encoding="utf-8") as f:
     print(args.num_splits, file=f)
 
 # e.g. set temp_files = [ 'foo/1.tmp', 'foo/2.tmp', ..., 'foo/5.tmp' ]
@@ -136,7 +136,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles):
 temp_filehandles = []
 for fname in temp_files:
     try:
-        temp_filehandles.append(open(fname, 'w', encoding="latin-1"))
+        temp_filehandles.append(open(fname, 'w', encoding="utf-8"))
     except Exception as e:
         sys.exit(sys.argv[0] + ": failed to open file: " + str(e) +
                  ".. if this is a max-open-filehandles limitation, you may "
diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 40cbee7a496..6a304f7f4cb 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -69,7 +69,7 @@ def get_compute_prob_info(log_file):
 
     compute_prob_done = False
     # roughly based on code in get_best_model.py
     try:
-        f = open(log_file, "r", encoding="latin-1")
+        f = open(log_file, "r", encoding="utf-8")
     except:
         print(script_name + ": warning: compute_prob log not found for iteration " + str(iter) + ". Skipping",
diff --git a/scripts/rnnlm/show_word_features.py b/scripts/rnnlm/show_word_features.py
index 89b134adaf9..4335caed5d8 100755
--- a/scripts/rnnlm/show_word_features.py
+++ b/scripts/rnnlm/show_word_features.py
@@ -7,15 +7,10 @@
 import argparse
 import sys
 
-# The use of latin-1 encoding does not preclude reading utf-8. latin-1 encoding
-# means "treat words as sequences of bytes", and it is compatible with utf-8
-# encoding as well as other encodings such as gbk, as long as the spaces are
-# also spaces in ascii (which we check). It is basically how we emulate the
-# behavior of python before python3.
-sys.stdout = open(1, 'w', encoding='latin-1', closefd=False)
+sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
 
 import re
-tab_or_space = re.compile('[ \t]+')
+
 
 parser = argparse.ArgumentParser(description="This script turns the word features to a human readable format.",
                                  epilog="E.g. " + sys.argv[0] + "exp/rnnlm/word_feats.txt exp/rnnlm/features.txt "
@@ -36,9 +31,9 @@
 def read_feature_type_and_key(features_file):
     feat_types = {}
 
-    with open(features_file, 'r', encoding="latin-1") as f:
+    with open(features_file, 'r', encoding="utf-8") as f:
         for line in f:
-            fields = re.split(tab_or_space, line)
+            fields = line.split()
             assert(len(fields) in [2, 3, 4])
 
             feat_id = int(fields[0])
@@ -53,9 +48,9 @@ def read_feature_type_and_key(features_file):
 feat_type_and_key = read_feature_type_and_key(args.features_file)
 
 num_word_feats = 0
-with open(args.word_features_file, 'r', encoding="latin-1") as f:
+with open(args.word_features_file, 'r', encoding="utf-8") as f:
     for line in f:
-        fields = re.split(tab_or_space, line)
+        fields = line.split()
         assert len(fields) % 2 == 1
 
         print(int(fields[0]), end='\t')
diff --git a/scripts/rnnlm/validate_features.py b/scripts/rnnlm/validate_features.py
index 2a077da4758..e67f03207bb 100755
--- a/scripts/rnnlm/validate_features.py
+++ b/scripts/rnnlm/validate_features.py
@@ -8,7 +8,7 @@
 import sys
 
 import re
-tab_or_space = re.compile('[ \t]+')
+
 
 parser = argparse.ArgumentParser(description="Validates features file, produced by rnnlm/choose_features.py.",
                                  epilog="E.g. " + sys.argv[0] + " exp/rnnlm/features.txt",
@@ -24,7 +24,7 @@
 if not os.path.isfile(args.features_file):
     sys.exit(sys.argv[0] + ": Expected file {0} to exist".format(args.features_file))
 
-with open(args.features_file, 'r', encoding="latin-1") as f:
+with open(args.features_file, 'r', encoding="utf-8") as f:
     has_unigram = False
     has_length = False
     idx = 0
@@ -33,7 +33,7 @@
     final_feats = {}
     word_feats = {}
     for line in f:
-        fields = re.split(tab_or_space, line)
+        fields = line.split()
         assert(len(fields) in [3, 4, 5])
 
         assert idx == int(fields[0])
diff --git a/scripts/rnnlm/validate_text_dir.py b/scripts/rnnlm/validate_text_dir.py
index 903e720bdf4..1f250d4c2f8 100755
--- a/scripts/rnnlm/validate_text_dir.py
+++ b/scripts/rnnlm/validate_text_dir.py
@@ -8,7 +8,7 @@
 import sys
 
 import re
-tab_or_space = re.compile('[ \t]+')
+
 
 parser = argparse.ArgumentParser(description="Validates data directory containing text "
                                  "files from one or more data sources, including dev.txt.",
@@ -40,7 +40,7 @@
 
 
 def check_text_file(text_file):
-    with open(text_file, 'r', encoding="latin-1") as f:
+    with open(text_file, 'r', encoding="utf-8") as f:
         found_nonempty_line = False
         lineno = 0
         if args.allow_internal_eos == 'true':
@@ -54,7 +54,7 @@ def check_text_file(text_file):
             lineno += 1
             if args.spot_check == 'true' and lineno > 10:
                 break
-            words = re.split(tab_or_space, line)
+            words = line.split()
             if len(words) != 0:
                 found_nonempty_line = True
             for word in words:
@@ -76,9 +76,9 @@ def check_text_file(text_file):
     # with some kind of utterance-id
     first_field_set = set()
     other_fields_set = set()
-    with open(text_file, 'r', encoding="latin-1") as f:
+    with open(text_file, 'r', encoding="utf-8") as f:
         for line in f:
-            array = re.split(tab_or_space, line)
+            array = line.split()
             if len(array) > 0:
                 first_word = array[0]
                 if first_word in first_field_set or first_word in other_fields_set:
diff --git a/scripts/rnnlm/validate_word_features.py b/scripts/rnnlm/validate_word_features.py
index 205b934ae1b..372286d8d12 100755
--- a/scripts/rnnlm/validate_word_features.py
+++ b/scripts/rnnlm/validate_word_features.py
@@ -8,7 +8,7 @@
 import sys
 
 import re
-tab_or_space = re.compile('[ \t]+')
+
 
 parser = argparse.ArgumentParser(description="Validates word features file, produced by rnnlm/get_word_features.py.",
                                  epilog="E.g. " + sys.argv[0] + " --features-file=exp/rnnlm/features.txt "
@@ -28,9 +28,9 @@
 unigram_feat_id = -1
 length_feat_id = -1
 max_feat_id = -1
-with open(args.features_file, 'r', encoding="latin-1") as f:
+with open(args.features_file, 'r', encoding="utf-8") as f:
     for line in f:
-        fields = re.split(tab_or_space, line)
+        fields = line.split()
         assert(len(fields) in [3, 4, 5])
 
         feat_id = int(fields[0])
@@ -52,9 +52,9 @@
         if feat_id > max_feat_id:
             max_feat_id = feat_id
 
-with open(args.word_features_file, 'r', encoding="latin-1") as f:
+with open(args.word_features_file, 'r', encoding="utf-8") as f:
     for line in f:
-        fields = re.split(tab_or_space, line)
+        fields = line.split()
         assert len(fields) > 0 and len(fields) % 2 == 1
 
         word_id = int(fields[0])