Skip to content

Commit 3bf5a6c

Browse files
committed
skip non-ASCII top passwords in xato. (this only skips one pw currently)
1 parent 0e8c7eb commit 3bf5a6c

File tree

2 files changed

+42966
-28214
lines changed

2 files changed

+42966
-28214
lines changed

data-scripts/count_xato.coffee

+9-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ check_usage = () ->
3838
process.exit(0)
3939

4040
# after all passwords are counted, discard pws with counts <= CUTOFF
41-
CUTOFF = 15
41+
CUTOFF = 10
4242

4343
# to save memory, after every batch of size BATCH_SIZE, go through counts and delete
4444
# long tail of entries with only one count.
@@ -52,6 +52,14 @@ normalize = (token) ->
5252
token.toLowerCase()
5353

5454
should_include = (password, xato_rank) ->
55+
for i in [0...password.length]
56+
if password.charCodeAt(i) > 127
57+
# xato mostly contains ascii-only passwords, so in practice
58+
# this will only skip one or two things. were that not the case /
59+
# were this used on a different data source, consider using
60+
# a unidecode-like library instead, similar to count_wikipedia / count_wiktionary
61+
console.log 'SKIPPING non-ascii password=#{password}, rank=#{xato_rank}'
62+
return false
5563
matches = []
5664
for matcher in [
5765
matching.spatial_match

0 commit comments

Comments
 (0)