doc tweak: make usage in data-scripts consistent with filenames in data/ (also adds an entry-count sanity assert to parse_wiki_tokens)

lowe · lowe · commit 92f5ce5e2969 · 2015-11-09T22:53:36.000-08:00
diff --git a/data-scripts/count_wikipedia.py b/data-scripts/count_wikipedia.py
@@ -16,7 +16,7 @@ def usage():
 tokenize a directory of text and count unigrams.
 
 usage:
-%s input_dir ../data/written_english.txt
+%s input_dir ../data/english_wikipedia.txt
 
 input_dir is the root directory where sentence files live. Each file should contain
 one sentence per line, with punctuation. This script will walk the directory recursively,
diff --git a/data-scripts/count_wiktionary.py b/data-scripts/count_wiktionary.py
@@ -17,7 +17,7 @@ def usage():
 
 Put those into a single directory and point it to this script:
 
-%s wiktionary_html_dir ../data/spoken_english.txt
+%s wiktionary_html_dir ../data/us_tv_and_film.txt
 
 output.txt will include one line per word in the study, ordered by rank, of the form:
 
@@ -31,6 +31,7 @@ def parse_wiki_tokens(html_doc_str):
     results = []
     last3 = ['', '', '']
     header = True
+    skipped = 0
     for line in html_doc_str.split('\n'):
         last3.pop(0)
         last3.append(line.strip())
@@ -49,9 +50,12 @@ def parse_wiki_tokens(html_doc_str):
             #
             # otherwise end up with a bunch of duplicates eg victor / victor's
             if token.endswith("'s") and rank > 1000:
+                skipped += 1
                 continue
             count = int(count)
             results.append((rank, token, count))
+    # early docs have 1k entries, later 2k, last 1284
+    assert len(results) + skipped in [1000, 2000, 1284]
     return results
 
 def normalize(token):
diff --git a/data-scripts/count_xato.coffee b/data-scripts/count_xato.coffee
@@ -9,7 +9,7 @@ sprintf = require('sprintf-js').sprintf
 check_usage = () ->
   usage = '''
 
-  Run a frequency count on the raw 10M xato password set and keep the top 40k by
+  Run a frequency count on the raw 10M xato password set and keep counts over CUTOFF in
   descending frequency. That file can be found by googling around for:
   "xato 10-million-combos.txt"