21 changes: 12 additions & 9 deletions egs/fisher_callhome_spanish/s5/local/merge_lexicons.py
@@ -1,11 +1,9 @@
#!/usr/bin/env python
# Copyright 2014 Gaurav Kumar. Apache 2.0
# -*- coding: utf-8 -*-
#
# Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon

-from __future__ import print_function

Contributor:
this import is still needed (for python2 compatibility)
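
A minimal standalone sketch of the point (the sample list and message are illustrative, not taken from the script): with the __future__ import, function-style print() calls behave identically under Python 2 and Python 3, while the statement form being reintroduced in this diff is a SyntaxError under Python 3.

    # Runnable under python2 and python3 alike; values are made up.
    from __future__ import print_function

    merged_lexicon = ["uno", "dos", "tres"]

    # With the __future__ import, print(...) is a real function call in Python 2,
    # so multiple arguments behave exactly as in Python 3. Without the import,
    # Python 2 would print the tuple ('After adding ...', 3, 'entries') instead.
    print("After adding the fisher data,", len(merged_lexicon), "entries")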

-import sys
+import sys, re
import json
import codecs
import operator
@@ -17,6 +15,7 @@
uw_gigaword = tmpdir + "/es_wordlist.json"
uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences"

+filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]')
merged_lexicon = []
# All three lexicons are in different formats
# First add the data from lexicon_fisher (A) into the dictionary
@@ -25,7 +24,8 @@
merged_lexicon.append(line.strip())
fisher.close()

print("After adding the fisher data, the lexicon contains {} entries".format(len(merged_lexicon)))
print "After adding the fisher data, the lexicon contains " \
+ str(len(merged_lexicon)) + " entries."

# Now add data from the LDC lexicon
ldc = codecs.open(uw_LDC, encoding='iso-8859-1')
@@ -34,11 +34,12 @@
if entries[0].lower() not in merged_lexicon:
merged_lexicon.append(entries[0].lower())

print("After adding the LDC data, the lexicon contains {} entries".format(len(merged_lexicon)))
print "After adding the LDC data, the lexicon contains " \
+ str(len(merged_lexicon)) + " entries."

# Finally add the gigaword data
gigaword = json.load(open(uw_gigaword))
-gigaword = reversed(sorted(gigaword.items(), key=operator.itemgetter(1)))
+gigaword = reversed(sorted(gigaword.iteritems(), key=operator.itemgetter(1)))

Contributor (@danpovey, Jan 18, 2019):
you seem to have reversed the change of iteritems to items-- this breaks in python3.

Contributor Author:
Thanks for pointing that out. It's reverted back now.
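
A short sketch of the version-neutral form (the dict contents are made up, standing in for the Gigaword word counts): dict.items() exists in both Python 2 and Python 3, so the line this commit reverts runs under either interpreter, whereas iteritems() was removed in Python 3.

    # Hypothetical word->count dict standing in for the es_wordlist.json data.
    from __future__ import print_function
    import operator

    gigaword = {u"casa": 120, u"perro": 45, u"gato": 300}

    # items() returns a list in Python 2 and a view in Python 3; sorted()
    # accepts either. iteritems() raises AttributeError under Python 3.
    by_count = reversed(sorted(gigaword.items(), key=operator.itemgetter(1)))

    for word, count in by_count:
        print(word, count)   # gato 300 / casa 120 / perro 45

Using sorted(..., reverse=True) would avoid the extra reversed() wrapper; either spelling is compatible with both interpreters.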


for item in gigaword:
# We need a maximum of wordlimit words in the lexicon
@@ -48,15 +49,17 @@
if item[0].lower() not in merged_lexicon:
merged_lexicon.append(item[0].lower())

print("After adding the Gigaword data, the lexicon contains {} entries".format(len(merged_lexicon)))
print "After adding the Gigaword data, the lexicon contains " \
+ str(len(merged_lexicon)) + " entries."

# Now write the uniquewords to a file
lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+')
ltuples = sorted(merged_lexicon)

for item in ltuples:
-lf.write(item + "\n")
+if not item==u'ñ' and not re.search(filtered_letters, item):
+    lf.write(item + "\n")

lf.close()

print("Finshed writing unique words")
print "Finshed writing unique words"

Contributor:
looks like this was also changed (would break python3).

Contributor Author:
Thanks. Fixed.
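
Putting the two review points together, a standalone sketch of the new filtering step in a form that runs under both Python 2 and Python 3 (the output path and the sample words are hypothetical; the regex is the one added in this diff):

    # Standalone sketch, not the script itself: path and sample data are made up.
    from __future__ import print_function
    import re
    import codecs

    filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]')
    merged_lexicon = [u'casa', u'ñ', u'año2019', u'perro']

    lf = codecs.open('uniquewords64k', encoding='utf-8', mode='w+')
    for item in sorted(merged_lexicon):
        # Skip the bare letter ñ and any word containing digits or the
        # characters listed in filtered_letters.
        if not item == u'ñ' and not re.search(filtered_letters, item):
            lf.write(item + "\n")
    lf.close()

    print("Finished writing unique words")   # only casa and perro are written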