From 9268e1280d088d8979c6a5b5503290854863f9ae Mon Sep 17 00:00:00 2001
From: Tai D Nguyen
Date: Sun, 19 Sep 2021 22:09:50 -0400
Subject: [PATCH] GH-485: edit DEVELOPER_GUIDES.md (#499)

---
 .../dictionary/DEVELOPER_GUIDES.md            | 29 +++++++++++--------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/datasets/UD_Vietnamese-COL/dictionary/DEVELOPER_GUIDES.md b/datasets/UD_Vietnamese-COL/dictionary/DEVELOPER_GUIDES.md
index 77902d1d..2430a887 100644
--- a/datasets/UD_Vietnamese-COL/dictionary/DEVELOPER_GUIDES.md
+++ b/datasets/UD_Vietnamese-COL/dictionary/DEVELOPER_GUIDES.md
@@ -1,27 +1,32 @@
 # Developer Guides
 
-Step 1: Download wikipedia dump
+Step 1: Download Wikipedia dump
 
 ```
 export TS=20210720
 mkdir -p ~/.underthesea/data/viwiki-$TS
-cd ~/.underthesea/data/viwiki-TS
-wget https://dumps.wikimedia.org/viwiki/20210820/viwiki-20210820-pages-articles.xml.bz2
+cd ~/.underthesea/data/viwiki-$TS
+wget https://dumps.wikimedia.org/viwiki/$TS/viwiki-$TS-pages-articles.xml.bz2
 wget https://raw.githubusercontent.com/NTT123/viwik18/master/WikiExtractor.py
-bzip2 -d viwiki-20210820-pages-articles.xml.bz2
-python WikiExtractor.py -s --lists viwiki-20210820-pages-articles.xml -q -o - | perl -CSAD -Mutf8 cleaner.pl > viwik18.txt
+bzip2 -d viwiki-$TS-pages-articles.xml.bz2
+python WikiExtractor.py --no-templates -b 10M -s --lists viwiki-$TS-pages-articles.xml
 ```
 
-Step 2: Run
+Step 2: Clean data
 
 ```
-python utils/col_wiki_clean.py
-python utils/col_wiki_ud.py
+python underthesea/utils/col_wiki_clean.py
+python underthesea/utils/col_wiki_ud.py
 ```
 
-Step 3: Run
+Step 3: Run 
 
 ```
-python utils/col_dictionary.py
-python utils/col_dictionary_import.py
-```
\ No newline at end of file
+python underthesea/utils/col_dictionary.py
+python underthesea/utils/col_dictionary_import.py
+```
+For Mac OS >= Mojave, alternatively run
+```
+OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES python underthesea/utils/col_dictionary.py
+python underthesea/utils/col_dictionary_import.py
+```