GH-485: add more head words

undertheseanlp · Sep 6, 2021 · 695cc65 · 695cc65
1 parent e4971ed
commit 695cc65
Show file tree

Hide file tree

Showing 7 changed files with 2,682 additions and 2 deletions.
diff --git a/apps/README.md b/apps/README.md
@@ -0,0 +1,22 @@
+# Continual Learning App
+
+## Installation
+
+Step 1: Install dependencies
+
+```
+pip install streamlit
+```
+
+Step 2: Download and start Elasticsearch
+
+```
+docker run elasticsearch:7.14.1
+```
+
+Step 3: Run the application
+
+```
+streamlit run col_streamlit.py 
+```
+
diff --git a/apps/col_streamlit.py b/apps/col_streamlit.py
@@ -0,0 +1,52 @@
+import os
+from os.path import dirname, join
+import streamlit as st
+from underthesea.utils.col_dictionary import Dictionary
+from underthesea.file_utils import UNDERTHESEA_FOLDER
+from underthesea.utils.col_script import UDDataset
+
+# Two levels up from this script, i.e. the parent of the folder containing it.
+PROJECT_FOLDER = dirname(dirname(__file__))
+# NOTE(review): this append runs after the `underthesea` imports at the top of
+# the file, so it cannot affect how those imports are resolved — confirm
+# whether it is still needed.
+os.sys.path.append(PROJECT_FOLDER)
+
+# Load the dictionary once at script start and remember how many entries it has.
+DICTIONARY_FILE = join(PROJECT_FOLDER, "datasets", "dictionary", "202108.yaml")
+dictionary = Dictionary.load(dictionary_file=DICTIONARY_FILE)
+dictionary_n_words = len(dictionary.data)
+# Page configuration is set before any other Streamlit output is written.
+st.set_page_config(
+    page_title="Open Dictionary",
+    # layout="wide",
+)
+
+# Page heading, rendered as Markdown.
+st.write("""
+# Open Dictionary
+""")
+# Corpus file to search: a single hard-coded file under UNDERTHESEA_FOLDER.
+file = "wiki_00"
+ud_file = join(UNDERTHESEA_FOLDER, "data", "viwiki-20210720", "ud", "AA", file)
+ud_dataset = UDDataset.load(ud_file)
+
+
+def find_word(ud_dataset, word, max_samples=10) -> list:
+    """Return sentences from ``ud_dataset`` whose text contains ``word``.
+
+    Matching is a case-insensitive substring test: both the query and the
+    sentence text are lowercased before comparison. Sentences are returned
+    with their original casing, in dataset order.
+
+    Args:
+        ud_dataset: Iterable of sentences; each item must expose
+            ``headers['text']`` holding the raw sentence string.
+        word: Substring to look for. An empty string matches every sentence.
+        max_samples: Upper bound on the number of sentences returned.
+
+    Returns:
+        A list of at most ``max_samples`` matching sentence strings.
+    """
+    sentences = []
+    if max_samples <= 0:
+        return sentences
+
+    # Lowercase the query once so that it is compared like-for-like with the
+    # lowercased sentence text (otherwise "Hà" could never match).
+    needle = word.lower()
+    for s in ud_dataset:
+        text_original = s.headers['text']
+        if needle in text_original.lower():
+            sentences.append(text_original)
+            # Stop as soon as the cap is reached, without reading further.
+            if len(sentences) >= max_samples:
+                break
+    return sentences
+
+
+# Status lines describing what was loaded at startup.
+st.write('Loaded corpus: wiki')
+st.write(f'Loaded dictionary: {dictionary_n_words} words (202108.yaml)')
+
+# Free-text query box; the value is an empty string until the user types.
+word = st.text_input('Word')
+
+# Only search once a non-empty query has been entered.
+if word:
+    sentences = find_word(ud_dataset, word)
+    # Echo the query as a Markdown heading, then list the matching sentences,
+    # one Streamlit element per sentence.
+    st.write(f"## {word}")
+    st.write("Output:")
+    for s in sentences:
+        st.write(s)
diff --git a/apps/col_streamlit_corpus.py b/apps/col_streamlit_corpus.py
@@ -0,0 +1,52 @@
+import streamlit as st
+import pandas as pd
+
+# Static overview page for the corpus. All table values below are hard-coded
+# strings; the "0" entries look like placeholders — TODO confirm.
+st.set_page_config(
+    page_title="Corpus",
+    layout="wide",
+)
+
+# Page heading, rendered as Markdown.
+st.write("""
+# Vietnamese Wiki Corpus
+## Wikidump data August 2021
+""")
+# First row: three columns with equal width weights.
+c1, c2, c3 = st.columns([1, 1, 1])
+
+# Column 1: general corpus information as label/value rows.
+df = pd.DataFrame([
+    {"": "Language", "value": "Vietnamese"},
+    {"": "Corpus description", "value": "Vietnamese Wiki Corpus"},
+    {"": "Tagset", "value": "10"},
+    {"": "Grammar", "value": "0"},
+])
+
+c1.write("""
+# General Info
+""")
+# Replace the default integer index with a column of empty strings before
+# rendering — presumably so no row numbers show in the table.
+c1.dataframe(df.assign(title='').set_index('title'))
+
+# Column 2: corpus size counters.
+df_count = pd.DataFrame([
+    {"": "Tokens", "value": "0"},
+    {"": "Words", "value": "0"},
+    {"": "Sentences", "value": "0"},
+    {"": "Documents", "value": "0"},
+])
+c2.write("""
+# Counts
+""")
+# Same empty-string index replacement as for the first table.
+c2.dataframe(df_count.assign(title='').set_index('title'))
+
+# Column 3: lexicon sizes.
+c3.write("""
+# Lexicon Sizes
+""")
+df_lexicon = pd.DataFrame([
+    {"": "Word", "value": "0"},
+    {"": "Lemma", "value": "0"},
+    {"": "lc", "value": "0"},
+    {"": "lemma_lc", "value": "0"},
+])
+# Same empty-string index replacement as for the first table.
+c3.dataframe(df_lexicon.assign(title='').set_index('title'))
+
+# Second row: two equal-weight columns; only the first gets a heading here and
+# `s2_c2` is not used in the visible part of the script.
+s2_c1, s2_c2 = st.columns([1, 1])
+s2_c1.write("""
+# Common Tags
+""")
diff --git a/apps/step1_index_data.py b/apps/step1_index_data.py
@@ -0,0 +1,30 @@
+import os
+from os.path import join, dirname
+from elasticsearch import Elasticsearch
+
+from underthesea.utils.col_dictionary import Dictionary
+
+# Two levels up from this script, i.e. the parent of the folder containing it.
+PROJECT_FOLDER = dirname(dirname(__file__))
+# NOTE(review): this append runs after the `underthesea` import at the top of
+# the file, so it cannot affect how that import is resolved — confirm whether
+# it is still needed.
+os.sys.path.append(PROJECT_FOLDER)
+
+# The dictionary is loaded at import time, not only when run as a script.
+DICTIONARY_FILE = join(PROJECT_FOLDER, "datasets", "dictionary", "202108.yaml")
+dictionary = Dictionary.load(dictionary_file=DICTIONARY_FILE)
+
+
+def index_dictionary(dictionary, es, index_name):
+    """Index every dictionary entry into Elasticsearch, one document per word.
+
+    Each document has the shape ``{'word': <key>, 'data': <value>}``, where the
+    key/value pairs come from ``dictionary.data``. Progress and failures are
+    printed; a failure on one entry does not stop the remaining entries.
+
+    Args:
+        dictionary: Object whose ``data`` attribute maps words to their data.
+        es: Client exposing ``index(index=..., body=...)``.
+        index_name: Name of the target index.
+    """
+    entries = dictionary.data
+    for headword in entries:
+        print('Index ', headword)
+        try:
+            es.index(
+                index=index_name,
+                body={'word': headword, 'data': entries[headword]},
+            )
+        except Exception as error:
+            # Best effort: report the problem and move on to the next entry.
+            print(error)
+
+
+if __name__ == '__main__':
+    # Client is created with no arguments, i.e. its default connection settings.
+    es = Elasticsearch()
+    # The indexing call below is commented out, so this guard currently only
+    # constructs the client and indexes nothing.
+    # index_dictionary(dictionary, es, index_name="dictionary")
diff --git a/apps/step2_compute_data.py b/apps/step2_compute_data.py