Skip to content

Commit

Permalink
GH-485: add more head words
Browse files Browse the repository at this point in the history
  • Loading branch information
anh.vu2 committed Sep 6, 2021
1 parent e4971ed commit 695cc65
Show file tree
Hide file tree
Showing 7 changed files with 2,682 additions and 2 deletions.
22 changes: 22 additions & 0 deletions apps/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Continual Learning App

## Installation

Step 1: Install dependencies

```
pip install streamlit
```

Step 2: Download and start Elasticsearch

```
docker run elasticsearch:7.14.1
```

Step 3: Run the application

```
streamlit run col_streamlit.py
```

52 changes: 52 additions & 0 deletions apps/col_streamlit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Streamlit page that looks up example sentences for a word in a wiki corpus.
import os
import sys
from os.path import dirname, join

import streamlit as st

# The repository root must be on ``sys.path`` *before* the ``underthesea``
# imports below; appending it afterwards cannot help resolve them.
# ``abspath`` keeps the result correct when ``__file__`` is a relative path,
# and the membership test stops repeated executions of this script from
# appending the same entry over and over.
PROJECT_FOLDER = dirname(dirname(os.path.abspath(__file__)))
if PROJECT_FOLDER not in sys.path:
    sys.path.append(PROJECT_FOLDER)

from underthesea.file_utils import UNDERTHESEA_FOLDER  # noqa: E402
from underthesea.utils.col_dictionary import Dictionary  # noqa: E402
from underthesea.utils.col_script import UDDataset  # noqa: E402

# Dictionary shipped inside the repository; only its size is shown on the page.
DICTIONARY_FILE = join(PROJECT_FOLDER, "datasets", "dictionary", "202108.yaml")
dictionary = Dictionary.load(dictionary_file=DICTIONARY_FILE)
dictionary_n_words = len(dictionary.data)

# Page configuration is issued before any other Streamlit output.
st.set_page_config(
    page_title="Open Dictionary",
    # layout="wide",
)

st.write("""
# Open Dictionary
""")

# A single corpus shard is loaded from the underthesea data folder.
file = "wiki_00"
ud_file = join(UNDERTHESEA_FOLDER, "data", "viwiki-20210720", "ud", "AA", file)
ud_dataset = UDDataset.load(ud_file)


def find_word(ud_dataset, word, *, max_samples=10) -> list:
    """Return sentences from ``ud_dataset`` whose text contains ``word``.

    Matching is a case-insensitive substring test: both the query and each
    sentence text are lower-cased before comparison. The sentences are
    returned with their original casing, in dataset order.

    Args:
        ud_dataset: Iterable of items exposing ``headers['text']``.
        word: Text to search for.
        max_samples: Upper bound on the number of sentences returned.

    Returns:
        A list of at most ``max_samples`` matching sentence texts.
    """
    if max_samples <= 0:
        return []

    # Lower-case the query once so it is comparable with the lower-cased text;
    # otherwise any query containing an uppercase letter could never match.
    needle = word.lower()
    sentences = []
    for s in ud_dataset:
        text_original = s.headers['text']
        if needle in text_original.lower():
            sentences.append(text_original)
            # Stop as soon as the limit is reached (not one match later).
            if len(sentences) >= max_samples:
                break
    return sentences


# Status lines describing what was loaded above.
st.write('Loaded corpus: wiki')
st.write(f'Loaded dictionary: {dictionary_n_words} words (202108.yaml)')

word = st.text_input('Word')

# Nothing is searched or rendered until the input box holds a non-empty value.
if word:
    sentences = find_word(ud_dataset, word)
    st.write(f"## {word}")
    st.write("Output:")
    for sentence in sentences:
        st.write(sentence)
52 changes: 52 additions & 0 deletions apps/col_streamlit_corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import streamlit as st
import pandas as pd

st.set_page_config(
    page_title="Corpus",
    layout="wide",
)

st.write("""
# Vietnamese Wiki Corpus
## Wikidump data August 2021
""")

# Every table on this page has one unnamed label column and a "value" column.
_TABLE_COLUMNS = ["", "value"]


def _with_blank_index(frame):
    """Return ``frame`` re-indexed by a 'title' column of empty strings."""
    return frame.assign(title='').set_index('title')


# First row of the page: three equally wide columns.
c1, c2, c3 = st.columns([1, 1, 1])

df = pd.DataFrame(
    [
        ("Language", "Vietnamese"),
        ("Corpus description", "Vietnamese Wiki Corpus"),
        ("Tagset", "10"),
        ("Grammar", "0"),
    ],
    columns=_TABLE_COLUMNS,
)
c1.write("""
# General Info
""")
c1.dataframe(_with_blank_index(df))

df_count = pd.DataFrame(
    [
        ("Tokens", "0"),
        ("Words", "0"),
        ("Sentences", "0"),
        ("Documents", "0"),
    ],
    columns=_TABLE_COLUMNS,
)
c2.write("""
# Counts
""")
c2.dataframe(_with_blank_index(df_count))

c3.write("""
# Lexicon Sizes
""")
df_lexicon = pd.DataFrame(
    [
        ("Word", "0"),
        ("Lemma", "0"),
        ("lc", "0"),
        ("lemma_lc", "0"),
    ],
    columns=_TABLE_COLUMNS,
)
c3.dataframe(_with_blank_index(df_lexicon))

# Second row: two equally wide columns; only the first gets a heading here.
s2_c1, s2_c2 = st.columns([1, 1])
s2_c1.write("""
# Common Tags
""")
30 changes: 30 additions & 0 deletions apps/step1_index_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Script that loads the project dictionary so it can be indexed into Elasticsearch.
import os
import sys
from os.path import join, dirname

from elasticsearch import Elasticsearch

# The repository root must be on ``sys.path`` *before* the ``underthesea``
# import below; appending it afterwards cannot help resolve that import.
# ``abspath`` keeps the result correct when ``__file__`` is a relative path.
PROJECT_FOLDER = dirname(dirname(os.path.abspath(__file__)))
if PROJECT_FOLDER not in sys.path:
    sys.path.append(PROJECT_FOLDER)

from underthesea.utils.col_dictionary import Dictionary  # noqa: E402

# Dictionary shipped inside the repository; loaded once at module import.
DICTIONARY_FILE = join(PROJECT_FOLDER, "datasets", "dictionary", "202108.yaml")
dictionary = Dictionary.load(dictionary_file=DICTIONARY_FILE)


def index_dictionary(dictionary, es, index_name):
    """Send every entry of ``dictionary.data`` to ``es`` as its own document.

    For each key, a document ``{'word': <key>, 'data': <value>}`` is passed to
    ``es.index`` under ``index_name``. Progress is printed per entry. Any
    exception raised while handling an entry is printed and the loop moves on
    to the next entry.
    """
    entries = dictionary.data
    for word in entries:
        print('Index ', word)
        try:
            es.index(
                index=index_name,
                body={'word': word, 'data': entries[word]},
            )
        except Exception as error:
            # Best effort: report the failure and keep indexing the rest.
            print(error)


if __name__ == '__main__':
    # The client is created with no arguments, i.e. with the library's
    # default connection settings.
    es = Elasticsearch()
    # The indexing call is currently disabled, so running this script only
    # loads the dictionary and constructs the client. Uncomment to index:
    # index_dictionary(dictionary, es, index_name="dictionary")
Empty file added apps/step2_compute_data.py
Empty file.
Loading

0 comments on commit 695cc65

Please sign in to comment.