kaldi-asr · danpovey · Jan 21, 2019 · Oct 30, 2018 · Dec 8, 2018 · Jan 17, 2019
diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py
@@ -1,11 +1,9 @@
 #!/usr/bin/env python
-# Copyright 2014  Gaurav Kumar.   Apache 2.0
 # -*- coding: utf-8 -*-
 #
 # Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon
 
-from __future__ import print_function
-import sys
+import sys, re
 import json
 import codecs
 import operator
@@ -17,6 +15,7 @@
 uw_gigaword = tmpdir + "/es_wordlist.json"
 uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences"
 
+filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]')
 merged_lexicon = []
 # All three lexicons are in different formats
 # First add the data from lexicon_fisher (A) into the dictionary
@@ -25,7 +24,8 @@
     merged_lexicon.append(line.strip())
 fisher.close()
 
-print("After adding the fisher data, the lexicon contains {} entries".format(len(merged_lexicon)))
+print "After adding the fisher data, the lexicon contains " \
+      + str(len(merged_lexicon)) + " entries."
 
 # Now add data from the LDC lexicon
 ldc = codecs.open(uw_LDC, encoding='iso-8859-1')
@@ -34,11 +34,12 @@
     if entries[0].lower() not in merged_lexicon:
         merged_lexicon.append(entries[0].lower())
 
-print("After adding the LDC data, the lexicon contains {} entries".format(len(merged_lexicon)))
+print "After adding the LDC data, the lexicon contains " \
+      + str(len(merged_lexicon)) + " entries."
 
 # Finally add the gigaword data
 gigaword = json.load(open(uw_gigaword))
-gigaword = reversed(sorted(gigaword.items(), key=operator.itemgetter(1)))
+gigaword = reversed(sorted(gigaword.iteritems(), key=operator.itemgetter(1)))
 
 for item in gigaword:
     # We need a maximum of wordlimit words in the lexicon
@@ -48,15 +49,17 @@
     if item[0].lower() not in merged_lexicon:
         merged_lexicon.append(item[0].lower())
 
-print("After adding the Gigaword data, the lexicon contains {} entries".format(len(merged_lexicon)))
+print "After adding the Gigaword data, the lexicon contains " \
+      + str(len(merged_lexicon)) + " entries."
 
 # Now write the uniquewords to a file
 lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+')
 ltuples = sorted(merged_lexicon)
 
 for item in ltuples:
-    lf.write(item + "\n")
+    if not item==u'ñ' and not re.search(filtered_letters, item):
+        lf.write(item + "\n")
 
 lf.close()
 
-print("Finshed writing unique words")
+print "Finshed writing unique words"