-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Spanish lexicon simplification #2999
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
801ab93
04c4a03
ea699b0
e898f93
6a30fed
3979576
3277332
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,11 +1,9 @@ | ||
| #!/usr/bin/env python | ||
| # Copyright 2014 Gaurav Kumar. Apache 2.0 | ||
| # -*- coding: utf-8 -*- | ||
| # | ||
| # Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon | ||
|
|
||
| from __future__ import print_function | ||
| import sys | ||
| import sys, re | ||
| import json | ||
| import codecs | ||
| import operator | ||
|
|
@@ -17,6 +15,7 @@ | |
| uw_gigaword = tmpdir + "/es_wordlist.json" | ||
| uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" | ||
|
|
||
| filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]') | ||
| merged_lexicon = [] | ||
| # All three lexicons are in different formats | ||
| # First add the data from lexicon_fisher (A) into the dictionary | ||
|
|
@@ -25,7 +24,8 @@ | |
| merged_lexicon.append(line.strip()) | ||
| fisher.close() | ||
|
|
||
| print("After adding the fisher data, the lexicon contains {} entries".format(len(merged_lexicon))) | ||
| print "After adding the fisher data, the lexicon contains " \ | ||
| + str(len(merged_lexicon)) + " entries." | ||
|
|
||
| # Now add data from the LDC lexicon | ||
| ldc = codecs.open(uw_LDC, encoding='iso-8859-1') | ||
|
|
@@ -34,11 +34,12 @@ | |
| if entries[0].lower() not in merged_lexicon: | ||
| merged_lexicon.append(entries[0].lower()) | ||
|
|
||
| print("After adding the LDC data, the lexicon contains {} entries".format(len(merged_lexicon))) | ||
| print "After adding the LDC data, the lexicon contains " \ | ||
| + str(len(merged_lexicon)) + " entries." | ||
|
|
||
| # Finally add the gigaword data | ||
| gigaword = json.load(open(uw_gigaword)) | ||
| gigaword = reversed(sorted(gigaword.items(), key=operator.itemgetter(1))) | ||
| gigaword = reversed(sorted(gigaword.iteritems(), key=operator.itemgetter(1))) | ||
|
||
|
|
||
| for item in gigaword: | ||
| # We need a maximum of wordlimit words in the lexicon | ||
|
|
@@ -48,15 +49,17 @@ | |
| if item[0].lower() not in merged_lexicon: | ||
| merged_lexicon.append(item[0].lower()) | ||
|
|
||
| print("After adding the Gigaword data, the lexicon contains {} entries".format(len(merged_lexicon))) | ||
| print "After adding the Gigaword data, the lexicon contains " \ | ||
| + str(len(merged_lexicon)) + " entries." | ||
|
|
||
| # Now write the uniquewords to a file | ||
| lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') | ||
| ltuples = sorted(merged_lexicon) | ||
|
|
||
| for item in ltuples: | ||
| lf.write(item + "\n") | ||
| if not item==u'ñ' and not re.search(filtered_letters, item): | ||
| lf.write(item + "\n") | ||
|
|
||
| lf.close() | ||
|
|
||
| print("Finshed writing unique words") | ||
| print "Finshed writing unique words" | ||
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This import is still needed (for Python 2 compatibility).