GH-485: update dictionary app
anh.vu2 committed Sep 12, 2021
1 parent 6b1acfe commit cd18f57
Showing 9 changed files with 20,867 additions and 29 deletions.
9 changes: 5 additions & 4 deletions apps/README.md
@@ -8,15 +8,16 @@ Step 1: Install dependencies
pip install streamlit
```

-Step 2: Download Elasticsearch
+Step 2: Run Elasticsearch

```
-docker run elasticsearch:7.14.1
+cd components/elasticsearch
+docker-compose up
```

-Run application
+Step 3: Run application

```
-streamlit run col_streamlit.py
+streamlit run col_dictionary.py
```
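
Not part of this commit: a minimal pre-flight sketch, assuming the Python Elasticsearch client already used in this repository and the default `localhost:9200` port, to confirm that step 2 actually left Elasticsearch reachable before launching the app.

```
# Hypothetical pre-flight check, not from the repository: confirm the
# Elasticsearch container from step 2 is reachable before starting Streamlit.
from elasticsearch import Elasticsearch

es = Elasticsearch()  # defaults to http://localhost:9200
if not es.ping():     # ping() returns False rather than raising on failure
    raise SystemExit("Elasticsearch is not reachable - run docker-compose up first")
print("Elasticsearch is up, ready for streamlit run")
```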

27 changes: 27 additions & 0 deletions apps/col_data.py
@@ -0,0 +1,27 @@
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search


class Dictionary:
    def __init__(self, es_index='dictionary'):
        self.es_index = es_index
        self.es = Elasticsearch()
        s = Search(using=self.es, index=es_index)
        ids = [h.meta.id for h in s.scan()]
        words = sorted(ids)
        self.words = words

    def get_word(self, word):
        if word not in self.words:
            return {}
        print(word)
        body = self.es.get(index=self.es_index, id=word)
        data = body["_source"]
        return data

    def save(self, word, data):
        self.es.index(index=self.es_index, id=word, body=data)


if __name__ == '__main__':
    dictionary = Dictionary(es_index='dictionary')
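
A usage sketch (not included in the commit), assuming a local Elasticsearch whose `dictionary` documents carry the `headword`/`senses` layout created by the import script below:

```
# Illustrative only: exercises Dictionary against an already-populated index.
from apps.col_data import Dictionary

dictionary = Dictionary(es_index='dictionary')
print(len(dictionary.words))           # number of headwords found by the scan

entry = dictionary.get_word('gà')      # returns {} if the word is not indexed
for sense in entry.get('senses', []):  # 'senses' layout assumed from the import script
    print(sense.get('tag'), '-', sense.get('description'))

# Re-indexes the (possibly edited) entry under the same document id
dictionary.save('gà', entry)
```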
64 changes: 39 additions & 25 deletions apps/col_dictionary.py
@@ -1,14 +1,14 @@
import os
from os.path import join

import streamlit as st
import streamlit.components.v1 as components

from apps.col_data import Dictionary

st.set_page_config(
    page_title="Dictionary App",
    layout="wide",
)

json_viewer_build_dir = join(
    os.path.dirname(os.path.abspath(__file__)),
    "components", "json_viewer", "component", "frontend", "build"
@@ -17,29 +17,43 @@
     "json_viewer",
     path=json_viewer_build_dir
 )
-# SIDEBAR
-add_selectbox = st.sidebar.text_input('Word', value="gà")
-
-# MAIN
-st.write('# Dictionary')
-st.text_input('Word', key=1, value="gà")
-data = [
-    {
-        "description": "(Động vật học) Loài chim nuôi (gia cầm) để lấy thịt và trứng, bay kém, mỏ cứng, con trống có cựa và biết gáy.",
-        "tag": "noun",
-        "examples": [
-            "Bán gà ngày gió, bán chó ngày mưa. (tục ngữ)",
-            "Gà người gáy, gà nhà ta sáng. (tục ngữ)"
-        ]
-    },
-    {
-        "description": "Đánh cuộc riêng trong một ván bài tổ tôm hay tài bàn ngoài số tiền góp chính",
-        "tag": "verb",
-        "examples": [
-            "Gà lần nào cũng thua thì đánh làm gì."
-        ]
-    }
-]
-
-output_data = json_viewer(json_object=data, label=0)
+dictionary = Dictionary()
+init_word = 'a'
+
+
+def switch_word(word):
+    st.session_state['current_word'] = word
+    st.session_state['current_word_data'] = dictionary.get_word(word)
+
+
+if __name__ == '__main__':
+    if 'current_word' not in st.session_state:
+        switch_word(init_word)
+
+    # SIDEBAR
+    m = st.markdown("""
+    <style>
+    div.stButton > button {
+        width: 100%;
+    }
+    </style>""", unsafe_allow_html=True)
+
+    placeholder = st.sidebar.empty()
+    search_text_box = placeholder.text_input('Word', value=st.session_state['current_word'], key='sidebar_text_input')
+    if search_text_box:
+        st.session_state['current_word'] = search_text_box
+
+    buttons = {}
+    for word in dictionary.words[:15]:
+        buttons[word] = st.sidebar.button(label=word, key=word)
+        if buttons[word]:
+            switch_word(word)
+
+    st.write('# Dictionary')
+    data = st.session_state['current_word_data']
+
+    output_data = json_viewer(json_object=data, label=0)
+    save_button = st.button('Save')
+    if save_button:
+        dictionary.save(st.session_state['current_word'], output_data)
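
A side note on the sidebar button loop above: newer Streamlit releases (roughly 0.89 and later) also accept a callback, which avoids the extra `buttons` dict. This is only an alternative sketch, not what the commit does:

```
# Alternative sketch: let Streamlit call switch_word when a sidebar button is clicked.
for word in dictionary.words[:15]:
    st.sidebar.button(label=word, key=word, on_click=switch_word, args=(word,))
```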
21 changes: 21 additions & 0 deletions apps/col_dictionary_import_to_elasticsearch.py
@@ -0,0 +1,21 @@
from elasticsearch import Elasticsearch
from apps.utils import DICTIONARY_FILE
import yaml

if __name__ == '__main__':
    with open(DICTIONARY_FILE) as f:
        content = f.read()

    es = Elasticsearch()

    words = yaml.safe_load(content)
    i = 0
    for word_key, value in words.items():
        i += 1
        data = {
            "headword": word_key,
            "senses": value
        }
        es.index(index='dictionary', body=data, id=word_key)
        if i % 1000 == 0 and i > 0:
            print(i)
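
Indexing one document per request works, but for a large dictionary file the bulk helper from the same client library is usually much faster. A hedged alternative sketch (same index name and document layout, not part of the commit):

```
# Alternative import sketch using elasticsearch.helpers.bulk (illustrative only).
import yaml
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

from apps.utils import DICTIONARY_FILE

es = Elasticsearch()
with open(DICTIONARY_FILE) as f:
    words = yaml.safe_load(f)

actions = (
    {"_index": "dictionary", "_id": word_key,
     "_source": {"headword": word_key, "senses": value}}
    for word_key, value in words.items()
)
success, errors = bulk(es, actions)
print(f"indexed {success} documents")
```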
10 changes: 10 additions & 0 deletions apps/components/elasticsearch/README.md
@@ -0,0 +1,10 @@
# Elasticsearch

We use Elasticsearch as our main database; it is kept in sync with the dictionary.yaml dictionary file.

Installation

```
docker volume create esdata1
```
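
One way to spot-check that the index and dictionary.yaml agree — an assumed workflow, not code shipped in this commit — is to compare document counts:

```
# Rough consistency check between dictionary.yaml and the 'dictionary' index.
import yaml
from elasticsearch import Elasticsearch

from apps.utils import DICTIONARY_FILE

es = Elasticsearch()
with open(DICTIONARY_FILE) as f:
    words = yaml.safe_load(f)

indexed = es.count(index='dictionary')['count']
print(f"yaml entries: {len(words)}, indexed documents: {indexed}")
```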
25 changes: 25 additions & 0 deletions apps/components/elasticsearch/docker-compose.yml
@@ -0,0 +1,25 @@
version: '2.2'
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:6.8.18
    container_name: elasticsearch
    environment:
      - cluster.name=docker-cluster
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - esdata1:/usr/share/elasticsearch/data
    ports:
      - 9200:9200
    networks:
      - esnet

volumes:
  esdata1:
    driver: local

networks:
  esnet:
4 changes: 4 additions & 0 deletions apps/utils.py
@@ -0,0 +1,4 @@
from os.path import abspath, dirname, join

PROJECT_FOLDER = dirname(dirname(abspath(__file__)))
DICTIONARY_FILE = join(PROJECT_FOLDER, "datasets", "UD_Vietnamese-COL", "dictionary", "dictionary.yaml")
2 changes: 2 additions & 0 deletions datasets/UD_Vietnamese-COL/dictionary/README.md
@@ -6,6 +6,8 @@ In this project, we try to create an open dictionary for Vietnamese.

This is our attempt to build Vietnamese dictionary from scratch.
We know that building a Vietnamese dictionary is a difficult and challenging job that requires great effort.
But the idea of being able to build an open dictionary for Vietnamese was so compelling that we were crazy enough to try it.
Who knows, we might be lucky enough to succeed.

In the first phase, our goal is simply to make the words and parts of speech in the corpus consistent.
