-
-
Notifications
You must be signed in to change notification settings - Fork 273
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
anh.vu2
committed
Sep 6, 2021
1 parent
e4971ed
commit 695cc65
Showing
7 changed files
with
2,682 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Continual Learning App | ||
|
||
## Installation | ||
|
||
Step 1: Install dependencies | ||
|
||
``` | ||
pip install streamlit elasticsearch pandas
``` | ||
|
||
Step 2: Download Elasticsearch | ||
|
||
``` | ||
docker run -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.14.1
``` | ||
|
||
Run application | ||
|
||
``` | ||
streamlit run col_streamlit.py | ||
``` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import os | ||
from os.path import dirname, join | ||
import streamlit as st | ||
from underthesea.utils.col_dictionary import Dictionary | ||
from underthesea.file_utils import UNDERTHESEA_FOLDER | ||
from underthesea.utils.col_script import UDDataset | ||
|
||
# Module-level setup for the Open Dictionary Streamlit page.
PROJECT_FOLDER = dirname(dirname(__file__))
os.sys.path.append(PROJECT_FOLDER)  # make project-local modules importable when run as a script

# Dictionary snapshot shipped with the project (YAML, keyed by headword).
DICTIONARY_FILE = join(PROJECT_FOLDER, "datasets", "dictionary", "202108.yaml")
dictionary = Dictionary.load(dictionary_file=DICTIONARY_FILE)
dictionary_n_words = len(dictionary.data)
# NOTE: Streamlit requires set_page_config() before any other st.* call.
st.set_page_config(
    page_title="Open Dictionary",
    # layout="wide",
)

st.write("""
# Open Dictionary
""")
# One UD-formatted slice of the Vietnamese wiki dump (assumes it was
# pre-downloaded into UNDERTHESEA_FOLDER — TODO confirm provisioning step).
file = "wiki_00"
ud_file = join(UNDERTHESEA_FOLDER, "data", "viwiki-20210720", "ud", "AA", file)
ud_dataset = UDDataset.load(ud_file)
|
||
|
||
def find_word(ud_dataset, word, max_samples=10) -> list:
    """Return up to ``max_samples`` sentences containing *word*.

    Performs a case-insensitive substring search over the sentence texts
    of *ud_dataset* and returns the original (non-lowercased) texts.

    Args:
        ud_dataset: iterable of sentences, each exposing ``headers['text']``.
        word: substring to search for.
        max_samples: cap on the number of matches returned (default 10).

    Returns:
        List of original sentence texts, at most ``max_samples`` long.
    """
    # Lowercase the needle too: the haystack is lowered, so an uppercase
    # query previously could never match anything.
    needle = word.lower()
    sentences = []
    for sentence in ud_dataset:
        text_original = sentence.headers['text']
        if needle in text_original.lower():
            sentences.append(text_original)
            # Was `> max_samples` after the increment, which let an 11th
            # match through; stop exactly at the cap.
            if len(sentences) >= max_samples:
                break
    return sentences
|
||
|
||
# Status lines so the user can see which corpus/dictionary back the search.
st.write('Loaded corpus: wiki')
st.write(f'Loaded dictionary: {dictionary_n_words} words (202108.yaml)')

word = st.text_input('Word')

# Only search once the user has entered a non-empty query.
if word:
    sentences = find_word(ud_dataset, word)
    st.write(f"## {word}")
    st.write("Output:")
    for s in sentences:
        st.write(s)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import streamlit as st | ||
import pandas as pd | ||
|
||
# Streamlit page: three-column overview of the Vietnamese wiki corpus.
st.set_page_config(
    page_title="Corpus",
    layout="wide",
)

st.write("""
# Vietnamese Wiki Corpus
## Wikidump data August 2021
""")
c1, c2, c3 = st.columns([1, 1, 1])


def _stat_table(rows):
    """Build a two-column (name, value) frame; the blank 'title' index
    hides the numeric row labels in the rendered table."""
    frame = pd.DataFrame([{"": name, "value": value} for name, value in rows])
    return frame.assign(title='').set_index('title')


c1.write("""
# General Info
""")
c1.dataframe(_stat_table([
    ("Language", "Vietnamese"),
    ("Corpus description", "Vietnamese Wiki Corpus"),
    ("Tagset", "10"),
    ("Grammar", "0"),
]))

c2.write("""
# Counts
""")
c2.dataframe(_stat_table([
    ("Tokens", "0"),
    ("Words", "0"),
    ("Sentences", "0"),
    ("Documents", "0"),
]))

c3.write("""
# Lexicon Sizes
""")
c3.dataframe(_stat_table([
    ("Word", "0"),
    ("Lemma", "0"),
    ("lc", "0"),
    ("lemma_lc", "0"),
]))

s2_c1, s2_c2 = st.columns([1, 1])
s2_c1.write("""
# Common Tags
""")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import os | ||
from os.path import join, dirname | ||
from elasticsearch import Elasticsearch | ||
|
||
from underthesea.utils.col_dictionary import Dictionary | ||
|
||
# Module-level setup: locate the project root and load the dictionary
# snapshot that index_dictionary() pushes into Elasticsearch.
PROJECT_FOLDER = dirname(dirname(__file__))
os.sys.path.append(PROJECT_FOLDER)  # make project-local modules importable when run as a script

DICTIONARY_FILE = join(PROJECT_FOLDER, "datasets", "dictionary", "202108.yaml")
dictionary = Dictionary.load(dictionary_file=DICTIONARY_FILE)
|
||
|
||
def index_dictionary(dictionary, es, index_name):
    """Index every dictionary entry into Elasticsearch.

    Each headword becomes one document of the form
    ``{'word': <headword>, 'data': <entry payload>}`` in *index_name*.
    Per-entry failures are printed and skipped so a single bad record
    does not abort the whole run (best-effort bulk load).
    """
    for word, entry in dictionary.data.items():
        print('Index ', word)
        try:
            es.index(index=index_name, body={'word': word, 'data': entry})
        except Exception as error:  # deliberate: report and keep going
            print(error)
|
||
|
||
if __name__ == '__main__':
    # Default client — connects to Elasticsearch on localhost:9200.
    es = Elasticsearch()
    # One-shot bootstrap: uncomment to (re)build the dictionary index.
    # index_dictionary(dictionary, es, index_name="dictionary")
Empty file.
Oops, something went wrong.