GH-485: update dictionary app
anh.vu2 committed Sep 12, 2021
1 parent 6b1acfe commit cd18f57
Showing 9 changed files with 20,867 additions and 29 deletions.
9 changes: 5 additions & 4 deletions apps/README.md
@@ -8,15 +8,16 @@ Step 1: Install dependencies
pip install streamlit
```

-Step 2: Download Elasticsearch
+Step 2: Run Elasticsearch

```
-docker run elasticsearch:7.14.1
+cd components/elasticsearch
+docker-compose up
```

-Run application
+Step 3: Run application

```
-streamlit run col_streamlit.py
+streamlit run col_dictionary.py
```
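
Not part of this commit: a minimal pre-flight sketch, assuming the Python Elasticsearch client already used in this repository and the default `localhost:9200` port, to confirm that step 2 actually left Elasticsearch reachable before launching the app.

```
# Hypothetical pre-flight check, not from the repository: confirm the
# Elasticsearch container from step 2 is reachable before starting Streamlit.
from elasticsearch import Elasticsearch

es = Elasticsearch()  # defaults to http://localhost:9200
if not es.ping():     # ping() returns False rather than raising on failure
    raise SystemExit("Elasticsearch is not reachable - run docker-compose up first")
print("Elasticsearch is up, ready for streamlit run")
```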

27 changes: 27 additions & 0 deletions apps/col_data.py
@@ -0,0 +1,27 @@
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search


class Dictionary:
    def __init__(self, es_index='dictionary'):
        self.es_index = es_index
        self.es = Elasticsearch()
        s = Search(using=self.es, index=es_index)
        ids = [h.meta.id for h in s.scan()]
        words = sorted(ids)
        self.words = words

    def get_word(self, word):
        if word not in self.words:
            return {}
        print(word)
        body = self.es.get(index=self.es_index, id=word)
        data = body["_source"]
        return data

    def save(self, word, data):
        self.es.index(index=self.es_index, id=word, body=data)


if __name__ == '__main__':
    dictionary = Dictionary(es_index='dictionary')
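
A usage sketch (not included in the commit), assuming a local Elasticsearch whose `dictionary` documents carry the `headword`/`senses` layout created by the import script below:

```
# Illustrative only: exercises Dictionary against an already-populated index.
from apps.col_data import Dictionary

dictionary = Dictionary(es_index='dictionary')
print(len(dictionary.words))           # number of headwords found by the scan

entry = dictionary.get_word('gà')      # returns {} if the word is not indexed
for sense in entry.get('senses', []):  # 'senses' layout assumed from the import script
    print(sense.get('tag'), '-', sense.get('description'))

# Re-indexes the (possibly edited) entry under the same document id
dictionary.save('gà', entry)
```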
64 changes: 39 additions & 25 deletions apps/col_dictionary.py
@@ -1,14 +1,14 @@
import os
from os.path import join

import streamlit as st
import streamlit.components.v1 as components

from apps.col_data import Dictionary

st.set_page_config(
    page_title="Dictionary App",
    layout="wide",
)

json_viewer_build_dir = join(
    os.path.dirname(os.path.abspath(__file__)),
    "components", "json_viewer", "component", "frontend", "build"
@@ -17,29 +17,43 @@
     "json_viewer",
     path=json_viewer_build_dir
 )
-# SIDEBAR
-add_selectbox = st.sidebar.text_input('Word', value="gà")
-
-# MAIN
-st.write('# Dictionary')
-st.text_input('Word', key=1, value="gà")
-data = [
-    {
-        "description": "(Động vật học) Loài chim nuôi (gia cầm) để lấy thịt và trứng, bay kém, mỏ cứng, con trống có cựa và biết gáy.",
-        "tag": "noun",
-        "examples": [
-            "Bán gà ngày gió, bán chó ngày mưa. (tục ngữ)",
-            "Gà người gáy, gà nhà ta sáng. (tục ngữ)"
-        ]
-    },
-    {
-        "description": "Đánh cuộc riêng trong một ván bài tổ tôm hay tài bàn ngoài số tiền góp chính",
-        "tag": "verb",
-        "examples": [
-            "Gà lần nào cũng thua thì đánh làm gì."
-        ]
-    }
-]
-
-output_data = json_viewer(json_object=data, label=0)
+dictionary = Dictionary()
+init_word = 'a'
+
+
+def switch_word(word):
+    st.session_state['current_word'] = word
+    st.session_state['current_word_data'] = dictionary.get_word(word)
+
+
+if __name__ == '__main__':
+    if 'current_word' not in st.session_state:
+        switch_word(init_word)
+
+    # SIDEBAR
+    m = st.markdown("""
+    <style>
+    div.stButton > button {
+        width: 100%;
+    }
+    </style>""", unsafe_allow_html=True)
+
+    placeholder = st.sidebar.empty()
+    search_text_box = placeholder.text_input('Word', value=st.session_state['current_word'], key='sidebar_text_input')
+    if search_text_box:
+        st.session_state['current_word'] = search_text_box
+
+    buttons = {}
+    for word in dictionary.words[:15]:
+        buttons[word] = st.sidebar.button(label=word, key=word)
+        if buttons[word]:
+            switch_word(word)
+
+    st.write('# Dictionary')
+    data = st.session_state['current_word_data']
+
+    output_data = json_viewer(json_object=data, label=0)
+    save_button = st.button('Save')
+    if save_button:
+        dictionary.save(st.session_state['current_word'], output_data)
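
A side note on the sidebar button loop above: newer Streamlit releases (roughly 0.89 and later) also accept a callback, which avoids the extra `buttons` dict. This is only an alternative sketch, not what the commit does:

```
# Alternative sketch: let Streamlit call switch_word when a sidebar button is clicked.
for word in dictionary.words[:15]:
    st.sidebar.button(label=word, key=word, on_click=switch_word, args=(word,))
```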
21 changes: 21 additions & 0 deletions apps/col_dictionary_import_to_elasticsearch.py
@@ -0,0 +1,21 @@
from elasticsearch import Elasticsearch
from apps.utils import DICTIONARY_FILE
import yaml

if __name__ == '__main__':
    with open(DICTIONARY_FILE) as f:
        content = f.read()

    es = Elasticsearch()

    words = yaml.safe_load(content)
    i = 0
    for word_key, value in words.items():
        i += 1
        data = {
            "headword": word_key,
            "senses": value
        }
        es.index(index='dictionary', body=data, id=word_key)
        if i % 1000 == 0 and i > 0:
            print(i)
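
Indexing one document per request works, but for a large dictionary file the bulk helper from the same client library is usually much faster. A hedged alternative sketch (same index name and document layout, not part of the commit):

```
# Alternative import sketch using elasticsearch.helpers.bulk (illustrative only).
import yaml
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

from apps.utils import DICTIONARY_FILE

es = Elasticsearch()
with open(DICTIONARY_FILE) as f:
    words = yaml.safe_load(f)

actions = (
    {"_index": "dictionary", "_id": word_key,
     "_source": {"headword": word_key, "senses": value}}
    for word_key, value in words.items()
)
success, errors = bulk(es, actions)
print(f"indexed {success} documents")
```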
10 changes: 10 additions & 0 deletions apps/components/elasticsearch/README.md
@@ -0,0 +1,10 @@
# Elasticsearch

We use Elasticsearch as our main database; it is kept in sync with the dictionary.yaml dictionary file.

Installation

```
docker volume create esdata1
```
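
One way to spot-check that the index and dictionary.yaml agree — an assumed workflow, not code shipped in this commit — is to compare document counts:

```
# Rough consistency check between dictionary.yaml and the 'dictionary' index.
import yaml
from elasticsearch import Elasticsearch

from apps.utils import DICTIONARY_FILE

es = Elasticsearch()
with open(DICTIONARY_FILE) as f:
    words = yaml.safe_load(f)

indexed = es.count(index='dictionary')['count']
print(f"yaml entries: {len(words)}, indexed documents: {indexed}")
```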
25 changes: 25 additions & 0 deletions apps/components/elasticsearch/docker-compose.yml
@@ -0,0 +1,25 @@
version: '2.2'
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:6.8.18
    container_name: elasticsearch
    environment:
      - cluster.name=docker-cluster
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - esdata1:/usr/share/elasticsearch/data
    ports:
      - 9200:9200
    networks:
      - esnet

volumes:
  esdata1:
    driver: local

networks:
  esnet:
4 changes: 4 additions & 0 deletions apps/utils.py
@@ -0,0 +1,4 @@
from os.path import abspath, dirname, join

PROJECT_FOLDER = dirname(dirname(abspath(__file__)))
DICTIONARY_FILE = join(PROJECT_FOLDER, "datasets", "UD_Vietnamese-COL", "dictionary", "dictionary.yaml")
2 changes: 2 additions & 0 deletions datasets/UD_Vietnamese-COL/dictionary/README.md
@@ -6,6 +6,8 @@ In this project, we try to create an open dictionary for Vietnamese.

This is our attempt to build Vietnamese dictionary from scratch.
We know that building a Vietnamese dictionary is a difficult and challenging job that requires great effort.
But the idea of being able to build an open dictionary for Vietnamese was so compelling that we were crazy enough to try it.
Who knows, we might be lucky enough to succeed.

In the first phase, our goal is simply to make the words and parts of speech in the corpus consistent.
