Skip to content

Commit

Permalink
GH-244: update UTS2017_BANK_SA model (#258)
Browse files Browse the repository at this point in the history
  • Loading branch information
quang-ph authored and rain1024 committed Jun 15, 2019
1 parent 862f59d commit 6b3b061
Show file tree
Hide file tree
Showing 12 changed files with 158 additions and 68 deletions.
13 changes: 8 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,10 @@ Install dependencies

.. code-block:: bash
$ pip install future scipy numpy scikit-learn==0.19.2 joblib
$ pip install git+https://github.com/facebookresearch/fastText.git
$ pip install unidecode
$ underthesea download sa_bank
Usage

Expand All @@ -245,10 +248,10 @@ Usage
>>> # -*- coding: utf-8 -*-
>>> from underthesea import sentiment
>>> sentiment('Gọi mấy lần mà lúc nào cũng là các chuyên viên đang bận hết ạ', domain='bank')
('CUSTOMER SUPPORT#NEGATIVE',)
>>> sentiment('bidv cho vay hay ko phu thuoc y thich cua thang tham dinh, ko co quy dinh ro rang', domain='bank')
('LOAN#NEGATIVE',)
>>> sentiment('Đky qua đường link ở bài viết này từ thứ 6 mà giờ chưa thấy ai lhe hết', domain='bank')
['CUSTOMER_SUPPORT#negative']
>>> sentiment('Xem lại vẫn thấy xúc động và tự hào về BIDV của mình', domain='bank')
['TRADEMARK#positive']
Upcoming Features
----------------------------------------
Expand Down
54 changes: 22 additions & 32 deletions tests/sentiment/test_sentiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,44 +4,34 @@


class TestSentiment(TestCase):
def test_sentiment(self):
def test_no_text(self):
text = ""
actual = sentiment(text)
expected = None
self.assertEqual(actual, expected)

def test_sentiment_1(self):
text = "Gọi mấy lần mà lúc nào cũng là các chuyên viên đang bận hết ạ"
actual = sentiment(text, domain="bank")
expected = ('CUSTOMER SUPPORT#NEGATIVE',)
def test_one_label_1(self):
text = "Xem lại vẫn thấy xúc động và tự hào về BIDV của mình!"
actual = [str(label) for label in sentiment(text, domain="bank")]
expected = ['TRADEMARK#positive']
self.assertEqual(actual, expected)

def test_sentiment_2(self):
text = "bidv cho vay hay ko phu thuoc y thich cua thang tham dinh, ko co quy dinh ro rang"
actual = sentiment(text, domain="bank")
expected = ('LOAN#NEGATIVE',)
def test_one_label_2(self):
text = "Đky qua đường link ở bài viết này từ thứ 6 mà giờ chưa thấy ai lhe hết"
actual = [str(label) for label in sentiment(text, domain="bank")]
expected = ['CUSTOMER_SUPPORT#negative']
self.assertEqual(actual, expected)

def test_sentiment_3(self):
text = "Vừa smartbidv, vừa bidv online mà lại k dùng chung 1 tài khoản đăng nhập, rắc rối!"
actual = sentiment(text, domain="bank")
expected = ('INTERNET BANKING#NEGATIVE',)
self.assertEqual(actual, expected)

def test_sentiment_4(self):
text = "Không tin tưởng vào ngân hàng BIDV"
actual = sentiment(text, domain="bank")
expected = ('TRADEMARK#NEGATIVE',)
self.assertEqual(actual, expected)

def test_sentiment_5(self):
text = "Chương trình này của BIDV thật ý nghĩa"
actual = sentiment(text, domain="bank")
expected = ('PROMOTION#POSITIVE',)
self.assertEqual(actual, expected)

def test_sentiment_6(self):
text = "mình cũng vui vì tiết kệm được thời gian"
actual = sentiment(text, domain="bank")
expected = ('PAYMENT#POSITIVE',)
self.assertEqual(actual, expected)
def test_multi_label_1(self):
text = "Dkm t chuyển vẫn bị mất phí"
actual = [str(label) for label in sentiment(text, domain="bank")]
expected = ['INTEREST_RATE#negative', 'MONEY_TRANSFER#negative']
self.assertEqual(sorted(actual), sorted(expected))

def test_multi_label_2(self):
text = '''TUI cũng bó tay với BIDV Cần Thơ.
Cả quận NK mà chỉ được lèo tèo mấy thùng ATM và luôn trong tình trạng nhìn thấy chữ Sorry cũng nh.ư hết tiền.
Chán ko buồn nói. Qd có khác '''
actual = [str(label) for label in sentiment(text, domain="bank")]
expected = ['CARD#negative', 'CUSTOMER_SUPPORT#negative']
self.assertEqual(sorted(actual), sorted(expected))
1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ commands =
python -m unittest discover tests.classification

; sentiment module
underthesea download sa_bank
python -m unittest discover tests.sentiment

; If you want to make tox run the tests with the same versions, create a
Expand Down
6 changes: 0 additions & 6 deletions underthesea/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,6 @@ def main(args=None):
pass


# @main.command()
# @click.argument('component')
# def download(component):
# download_component(component)


@main.command()
@click.argument('model', required=True)
def download(model):
Expand Down
17 changes: 17 additions & 0 deletions underthesea/model_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
class UTSModel(Enum):
tc_bank = "tc_bank"
tc_general = "tc_general"
sa_bank = "sa_bank"


class ModelFetcher:
Expand Down Expand Up @@ -58,6 +59,19 @@ def download_model(model_name):
)
os.remove(model_path)

if model_name == "sa_bank":
url = "https://www.dropbox.com/s/yo6sf6ofpdb3hlh/sa_svm_uts2017_bank_20190611.zip?dl=1"
cached_path(url, cache_dir=cache_dir)
model_path = Path(CACHE_ROOT) / cache_dir / "sa_svm_uts2017_bank_20190611.zip?dl=1"
cache_folder = Path(CACHE_ROOT) / cache_dir
zip = zipfile.ZipFile(model_path)
zip.extractall(cache_folder)
os.rename(
Path(CACHE_ROOT) / cache_dir / "sa_svm_uts2017_bank_20190611",
Path(CACHE_ROOT) / cache_dir / "sa_bank",
)
os.remove(model_path)

@staticmethod
def list(all):
models = []
Expand Down Expand Up @@ -99,3 +113,6 @@ def get_model_path(model):

if model == UTSModel.tc_general:
return Path(CACHE_ROOT) / "models" / "tc_general"

if model == UTSModel.sa_bank:
return Path(CACHE_ROOT) / "models" / "sa_bank"
7 changes: 7 additions & 0 deletions underthesea/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,11 @@
"year": "2019",
"model_path": "tc_general"
},
"sa_bank": {
"cache_dir": "models",
"type": "Sentiment",
"license": "Open",
"year": "2019",
"model_path": "sa_bank"
},
}
9 changes: 5 additions & 4 deletions underthesea/sentiment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,18 @@ def sentiment(X, domain=None):
* bank: bank domain
Returns
=======
tokens: list
sentiment of sentence
Text: Text of input sentence
Labels: Sentiment of sentence
Examples
--------
>>> # -*- coding: utf-8 -*-
>>> from underthesea import sentiment
>>> sentence = "Vừa smartbidv, vừa bidv online mà lại k dùng chung 1 tài khoản đăng nhập, rắc rối!"
>>> sentence = "Chuyen tiền k nhận Dc tiên"
>>> sentiment(sentence, domain='bank')
('INTERNET BANKING#NEGATIVE',)
[MONEY_TRANSFER#negative (1.0)]
"""
if X == "":
return None
Expand Down
49 changes: 28 additions & 21 deletions underthesea/sentiment/bank/__init__.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,33 @@
import joblib
from os.path import join, dirname
import logging
import os
from os.path import dirname
import sys
from languageflow.data import Sentence
from languageflow.models.text_classifier import TextClassifier
from underthesea.model_fetcher import ModelFetcher, UTSModel

sys.path.insert(0, dirname(__file__))

bank_sentiment = {}
FORMAT = '%(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger('underthesea')

sys.path.insert(0, dirname(dirname(__file__)))
model_path = ModelFetcher.get_model_path(UTSModel.sa_bank)
classifier = None

def sentiment(X):
    """Predict bank-domain sentiment labels with the joblib-serialised model.

    The three artefacts (input transformer, label transformer, estimator)
    are loaded from files next to this module the first time they are
    needed and kept in the module-level ``bank_sentiment`` dict afterwards.

    Parameters
    ----------
    X : str or list
        A single text, or a list of texts.

    Returns
    -------
    For a list input, the inverse-transformed predictions for every item;
    for any other input, the first (only) entry of the predictions made on
    the one-element list ``[X]``.
    """
    global bank_sentiment

    # (cache key, file name) pairs, in the order they are loaded.
    artefacts = (
        ("x_transform", "count.transformer.bin"),
        ("y_transform", "label.transformer.bin"),
        ("estimator", "model.bin"),
    )
    for cache_key, file_name in artefacts:
        if cache_key not in bank_sentiment:
            bank_sentiment[cache_key] = joblib.load(join(dirname(__file__), file_name))

    vectorizer = bank_sentiment["x_transform"]
    label_codec = bank_sentiment["y_transform"]
    model = bank_sentiment["estimator"]

    if isinstance(X, list):
        predictions = model.predict(vectorizer.transform(X))
        return label_codec.inverse_transform(predictions)

    # Single input: wrap it in a list for the pipeline, then unwrap the result.
    predictions = model.predict(vectorizer.transform([X]))
    return label_codec.inverse_transform(predictions)[0]

def sentiment(text):
    """Predict bank-domain sentiment labels for ``text``.

    The classifier is loaded from ``model_path`` on first use and cached in
    the module-level ``classifier`` variable.  If nothing exists at
    ``model_path``, an error explaining how to download the model is logged
    and the interpreter exits with status 1.

    Parameters
    ----------
    text : str
        Input sentence; it is wrapped in a ``Sentence`` before prediction.

    Returns
    -------
    list
        The ``value`` attribute of every label the classifier attached to
        the sentence.
    """
    global classifier

    if not classifier:
        if not os.path.exists(model_path):
            # NOTE(review): this terminates the whole process from inside
            # library code; raising an exception may be friendlier to callers.
            logger.error(
                f"Could not load model at {model_path}.\n"
                f"Download model with \"underthesea download {UTSModel.sa_bank.value}\".")
            sys.exit(1)
        classifier = TextClassifier.load(model_path)

    sentence = Sentence(text)
    # predict() is called for its side effect: the labels are read back
    # from the sentence object afterwards.
    classifier.predict(sentence)
    return [label.value for label in sentence.labels]
Binary file removed underthesea/sentiment/bank/count.transformer.bin
Binary file not shown.
Binary file removed underthesea/sentiment/bank/label.transformer.bin
Binary file not shown.
Binary file removed underthesea/sentiment/bank/model.bin
Binary file not shown.
70 changes: 70 additions & 0 deletions underthesea/sentiment/text_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import unidecode
from sklearn.base import BaseEstimator, TransformerMixin
import string
from underthesea.word_tokenize.regex_tokenize import tokenize


# Emoticon / emoji lexicons used by CountEmoticons in this module, which
# counts occurrences of each entry with ``str.count`` (substring matching).
#
# Entries treated as negative sentiment markers.
# NOTE: ':(' is listed twice below; harmless because this is a set literal.
negative_emoticons = {':(', '☹', '❌', '👎', '👹', '💀', '🔥', '🤔', '😏', '😐', '😑', '😒', '😓', '😔', '😕', '😖',
'😞', '😟', '😠', '😡', '😢', '😣', '😤', '😥', '😧', '😨', '😩', '😪', '😫', '😭', '😰', '😱',
'😳', '😵', '😶', '😾', '🙁', '🙏', '🚫', '>:[', ':-(', ':(', ':-c', ':c', ':-<', ':っC', ':<',
':-[', ':[', ':{'}

# Entries treated as positive sentiment markers.
# NOTE(review): the bare letter 'v' is in this set, so with substring
# counting every 'v' in a text adds to the positive count. Presumably any
# trained model depends on these exact features, so confirm before changing.
positive_emoticons = {'=))', 'v', ';)', '^^', '<3', '☀', '☺', '♡', '♥', '✌', '✨', '❣', '❤', '🌝', '🌷', '🌸',
'🌺', '🌼', '🍓', '🎈', '🐅', '🐶', '🐾', '👉', '👌', '👍', '👏', '👻', '💃', '💄', '💋',
'💌', '💎', '💐', '💓', '💕', '💖', '💗', '💙', '💚', '💛', '💜', '💞', ':-)', ':)', ':D', ':o)',
':]', ':3', ':c)', ':>', '=]', '8)'}


class Lowercase(BaseEstimator, TransformerMixin):
    """Stateless transformer that lower-cases every item of its input."""

    def fit(self, x, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, x):
        """Return a new list holding ``item.lower()`` for each item of ``x``."""
        lowered = []
        for item in x:
            lowered.append(item.lower())
        return lowered


class RemoveTone(BaseEstimator, TransformerMixin):
    """Stateless transformer that passes every input string through unidecode.

    Judging by the class name, the intent is to strip Vietnamese tone marks.
    """

    def fit(self, x, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def remove_tone(self, s):
        """Return ``unidecode.unidecode(s)`` for a single string ``s``."""
        stripped = unidecode.unidecode(s)
        return stripped

    def transform(self, x):
        """Return a list with ``remove_tone`` applied to each item of ``x``."""
        converted = []
        for item in x:
            converted.append(self.remove_tone(item))
        return converted


class CountEmoticons(BaseEstimator, TransformerMixin):
    """Stateless transformer turning each text into emoticon counts.

    Each input string becomes a ``(positive_count, negative_count)`` tuple,
    using the module-level ``positive_emoticons`` and ``negative_emoticons``
    sets.
    """

    def fit(self, x, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def count_emoticon(self, s):
        """Count emoticon occurrences in ``s``.

        Counting uses ``str.count``, so every non-overlapping substring
        occurrence of every lexicon entry contributes to the total.

        Returns a ``(positive_count, negative_count)`` tuple.
        """
        positive_total = sum(s.count(marker) for marker in positive_emoticons)
        negative_total = sum(s.count(marker) for marker in negative_emoticons)
        return positive_total, negative_total

    def transform(self, x):
        """Return a list with ``count_emoticon`` applied to each item of ``x``."""
        counts = []
        for item in x:
            counts.append(self.count_emoticon(item))
        return counts


class Tokenize(BaseEstimator, TransformerMixin):
    """Stateless transformer that tokenizes text and masks punctuation tokens."""

    def fit(self, x, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def pun_num(self, s):
        """Replace punctuation tokens in ``s`` with the literal ``'punc'``.

        ``s`` is split on whitespace once, up front.  Every token other than
        ``'.'`` that is found in ``string.punctuation`` is replaced.

        Two details worth knowing:

        * ``token in string.punctuation`` is a substring test, so a
          multi-character token matches when it appears contiguously in
          ``string.punctuation``.
        * ``str.replace`` rewrites every occurrence of the token in the
          whole string, not only the stand-alone one.
        """
        for piece in s.split():
            if piece != '.' and piece in string.punctuation:
                s = s.replace(piece, 'punc')
        return s

    def transform(self, x):
        """Tokenize each item of ``x`` and mask its punctuation.

        Each item goes through ``tokenize(item, format='text')`` and then
        ``pun_num``; the results are returned as a list.
        """
        masked = []
        for item in x:
            # presumably yields one whitespace-separated string per input
            # (pun_num calls .split() on it) -- TODO confirm
            tokenized = tokenize(item, format='text')
            masked.append(self.pun_num(tokenized))
        return masked

0 comments on commit 6b3b061

Please sign in to comment.