diff --git a/README.rst b/README.rst index 759cbcdd..d9eb0229 100644 --- a/README.rst +++ b/README.rst @@ -236,7 +236,10 @@ Install dependencies .. code-block:: bash - $ pip install future scipy numpy scikit-learn==0.19.2 joblib + $ pip install git+https://github.com/facebookresearch/fastText.git + $ pip install unidecode + $ underthesea download sa_bank + Usage @@ -245,10 +248,10 @@ Usage >>> # -*- coding: utf-8 -*- >>> from underthesea import sentiment - >>> sentiment('Gọi mấy lần mà lúc nào cũng là các chuyên viên đang bận hết ạ', domain='bank') - ('CUSTOMER SUPPORT#NEGATIVE',) - >>> sentiment('bidv cho vay hay ko phu thuoc y thich cua thang tham dinh, ko co quy dinh ro rang', domain='bank') - ('LOAN#NEGATIVE',) + >>> sentiment('Đky qua đường link ở bài viết này từ thứ 6 mà giờ chưa thấy ai lhe hết', domain='bank') + ['CUSTOMER_SUPPORT#negative'] + >>> sentiment('Xem lại vẫn thấy xúc động và tự hào về BIDV của mình', domain='bank') + ['TRADEMARK#positive'] Up Coming Features ---------------------------------------- diff --git a/tests/sentiment/test_sentiment.py b/tests/sentiment/test_sentiment.py index 8e6b5c9a..6db5c24e 100644 --- a/tests/sentiment/test_sentiment.py +++ b/tests/sentiment/test_sentiment.py @@ -4,44 +4,34 @@ class TestSentiment(TestCase): - def test_sentiment(self): + def test_no_text(self): text = "" actual = sentiment(text) expected = None self.assertEqual(actual, expected) - def test_sentiment_1(self): - text = "Gọi mấy lần mà lúc nào cũng là các chuyên viên đang bận hết ạ" - actual = sentiment(text, domain="bank") - expected = ('CUSTOMER SUPPORT#NEGATIVE',) + def test_one_label_1(self): + text = "Xem lại vẫn thấy xúc động và tự hào về BIDV của mình!" 
+ actual = [str(label) for label in sentiment(text, domain="bank")] + expected = ['TRADEMARK#positive'] self.assertEqual(actual, expected) - def test_sentiment_2(self): - text = "bidv cho vay hay ko phu thuoc y thich cua thang tham dinh, ko co quy dinh ro rang" - actual = sentiment(text, domain="bank") - expected = ('LOAN#NEGATIVE',) + def test_one_label_2(self): + text = "Đky qua đường link ở bài viết này từ thứ 6 mà giờ chưa thấy ai lhe hết" + actual = [str(label) for label in sentiment(text, domain="bank")] + expected = ['CUSTOMER_SUPPORT#negative'] self.assertEqual(actual, expected) - def test_sentiment_3(self): - text = "Vừa smartbidv, vừa bidv online mà lại k dùng chung 1 tài khoản đăng nhập, rắc rối!" - actual = sentiment(text, domain="bank") - expected = ('INTERNET BANKING#NEGATIVE',) - self.assertEqual(actual, expected) - - def test_sentiment_4(self): - text = "Không tin tưởng vào ngân hàng BIDV" - actual = sentiment(text, domain="bank") - expected = ('TRADEMARK#NEGATIVE',) - self.assertEqual(actual, expected) - - def test_sentiment_5(self): - text = "Chương trình này của BIDV thật ý nghĩa" - actual = sentiment(text, domain="bank") - expected = ('PROMOTION#POSITIVE',) - self.assertEqual(actual, expected) - - def test_sentiment_6(self): - text = "mình cũng vui vì tiết kệm được thời gian" - actual = sentiment(text, domain="bank") - expected = ('PAYMENT#POSITIVE',) - self.assertEqual(actual, expected) + def test_multi_label_1(self): + text = "Dkm t chuyển vẫn bị mất phí" + actual = [str(label) for label in sentiment(text, domain="bank")] + expected = ['INTEREST_RATE#negative', 'MONEY_TRANSFER#negative'] + self.assertEqual(sorted(actual), sorted(expected)) + + def test_multi_label_2(self): + text = '''TUI cũng bó tay với BIDV Cần Thơ. + Cả quận NK mà chỉ được lèo tèo mấy thùng ATM và luôn trong tình trạng nhìn thấy chữ Sorry cũng nh.ư hết tiền. + Chán ko buồn nói. 
Qd có khác ''' + actual = [str(label) for label in sentiment(text, domain="bank")] + expected = ['CARD#negative', 'CUSTOMER_SUPPORT#negative'] + self.assertEqual(sorted(actual), sorted(expected)) diff --git a/tox.ini b/tox.ini index a1b147f5..94f7fc8e 100644 --- a/tox.ini +++ b/tox.ini @@ -27,6 +27,7 @@ commands = python -m unittest discover tests.classification ; sentiment module + underthesea download sa_bank python -m unittest discover tests.sentiment ; If you want to make tox run the tests with the same versions, create a diff --git a/underthesea/cli.py b/underthesea/cli.py index b38a0711..5964a782 100644 --- a/underthesea/cli.py +++ b/underthesea/cli.py @@ -13,12 +13,6 @@ def main(args=None): pass -# @main.command() -# @click.argument('component') -# def download(component): -# download_component(component) - - @main.command() @click.argument('model', required=True) def download(model): diff --git a/underthesea/model_fetcher.py b/underthesea/model_fetcher.py index 2d362a4a..85ff0d91 100644 --- a/underthesea/model_fetcher.py +++ b/underthesea/model_fetcher.py @@ -15,6 +15,7 @@ class UTSModel(Enum): tc_bank = "tc_bank" tc_general = "tc_general" + sa_bank = "sa_bank" class ModelFetcher: @@ -58,6 +59,19 @@ def download_model(model_name): ) os.remove(model_path) + if model_name == "sa_bank": + url = "https://www.dropbox.com/s/yo6sf6ofpdb3hlh/sa_svm_uts2017_bank_20190611.zip?dl=1" + cached_path(url, cache_dir=cache_dir) + model_path = Path(CACHE_ROOT) / cache_dir / "sa_svm_uts2017_bank_20190611.zip?dl=1" + cache_folder = Path(CACHE_ROOT) / cache_dir + zip = zipfile.ZipFile(model_path) + zip.extractall(cache_folder) + os.rename( + Path(CACHE_ROOT) / cache_dir / "sa_svm_uts2017_bank_20190611", + Path(CACHE_ROOT) / cache_dir / "sa_bank", + ) + os.remove(model_path) + @staticmethod def list(all): models = [] @@ -99,3 +113,6 @@ def get_model_path(model): if model == UTSModel.tc_general: return Path(CACHE_ROOT) / "models" / "tc_general" + + if model == UTSModel.sa_bank: + 
return Path(CACHE_ROOT) / "models" / "sa_bank" diff --git a/underthesea/models.py b/underthesea/models.py index 7a7e5c1a..0d0da6c9 100644 --- a/underthesea/models.py +++ b/underthesea/models.py @@ -13,4 +13,11 @@ "year": "2019", "model_path": "tc_general" }, + "sa_bank": { + "cache_dir": "models", + "type": "Sentiment", + "license": "Open", + "year": "2019", + "model_path": "sa_bank" + }, } diff --git a/underthesea/sentiment/__init__.py b/underthesea/sentiment/__init__.py index 8f5803ef..6e7299ec 100644 --- a/underthesea/sentiment/__init__.py +++ b/underthesea/sentiment/__init__.py @@ -16,17 +16,18 @@ def sentiment(X, domain=None): * bank: bank domain Returns ======= - tokens: list - sentiment of sentence + labels: list of str, or None for empty input + sentiment labels predicted for the sentence Examples -------- >>> # -*- coding: utf-8 -*- >>> from underthesea import sentiment - >>> sentence = "Vừa smartbidv, vừa bidv online mà lại k dùng chung 1 tài khoản đăng nhập, rắc rối!" + >>> sentence = "Chuyen tiền k nhận Dc tiên" >>> sentiment(sentence, domain='bank') - ('INTERNET BANKING#NEGATIVE',) + + ['MONEY_TRANSFER#negative'] """ if X == "": return None diff --git a/underthesea/sentiment/bank/__init__.py b/underthesea/sentiment/bank/__init__.py index a55f5c0e..3f6767d6 100644 --- a/underthesea/sentiment/bank/__init__.py +++ b/underthesea/sentiment/bank/__init__.py @@ -1,26 +1,33 @@ -import joblib -from os.path import join, dirname +import logging +import os +from os.path import dirname import sys +from languageflow.data import Sentence +from languageflow.models.text_classifier import TextClassifier +from underthesea.model_fetcher import ModelFetcher, UTSModel -sys.path.insert(0, dirname(__file__)) -bank_sentiment = {} +FORMAT = '%(message)s' +logging.basicConfig(format=FORMAT) +logger = logging.getLogger('underthesea') +sys.path.insert(0, dirname(dirname(__file__))) +model_path = ModelFetcher.get_model_path(UTSModel.sa_bank) +classifier = None -def sentiment(X): - global bank_sentiment - if 
"x_transform" not in bank_sentiment: - bank_sentiment["x_transform"] = joblib.load(join(dirname(__file__), "count.transformer.bin")) - if "y_transform" not in bank_sentiment: - bank_sentiment["y_transform"] = joblib.load(join(dirname(__file__), "label.transformer.bin")) - if "estimator" not in bank_sentiment: - bank_sentiment["estimator"] = joblib.load(join(dirname(__file__), "model.bin")) - x_transform = bank_sentiment["x_transform"] - y_transform = bank_sentiment["y_transform"] - estimator = bank_sentiment["estimator"] - if isinstance(X, list): - return y_transform.inverse_transform( - estimator.predict(x_transform.transform(X))) - else: - return y_transform.inverse_transform( - estimator.predict(x_transform.transform([X])))[0] + +def sentiment(text): + global classifier + + if not classifier: + if os.path.exists(model_path): + classifier = TextClassifier.load(model_path) + else: + logger.error( + f"Could not load model at {model_path}.\n" + f"Download model with \"underthesea download {UTSModel.sa_bank.value}\".") + sys.exit(1) + sentence = Sentence(text) + classifier.predict(sentence) + labels = sentence.labels + return [label.value for label in labels] diff --git a/underthesea/sentiment/bank/count.transformer.bin b/underthesea/sentiment/bank/count.transformer.bin deleted file mode 100644 index dbe0b2fd..00000000 Binary files a/underthesea/sentiment/bank/count.transformer.bin and /dev/null differ diff --git a/underthesea/sentiment/bank/label.transformer.bin b/underthesea/sentiment/bank/label.transformer.bin deleted file mode 100644 index 34c96b68..00000000 Binary files a/underthesea/sentiment/bank/label.transformer.bin and /dev/null differ diff --git a/underthesea/sentiment/bank/model.bin b/underthesea/sentiment/bank/model.bin deleted file mode 100644 index 570ea213..00000000 Binary files a/underthesea/sentiment/bank/model.bin and /dev/null differ diff --git a/underthesea/sentiment/text_features.py b/underthesea/sentiment/text_features.py new file mode 100644 
"""Text feature transformers for the sentiment module.

Every class here implements ``fit``/``transform`` on top of scikit-learn's
``BaseEstimator`` and ``TransformerMixin``. None of them learns anything:
``fit`` always returns ``self`` unchanged.

NOTE(review): these classes are presumably referenced by serialized
sentiment pipelines, so module, class and method names (and the exact
features they compute) should stay stable -- confirm before changing.
"""
import string

import unidecode
from sklearn.base import BaseEstimator, TransformerMixin

from underthesea.word_tokenize.regex_tokenize import tokenize


# Substrings counted as negative emoticons by CountEmoticons.
negative_emoticons = {':(', '☹', '❌', '👎', '👹', '💀', '🔥', '🤔', '😏', '😐', '😑', '😒', '😓', '😔', '😕', '😖',
                      '😞', '😟', '😠', '😡', '😢', '😣', '😤', '😥', '😧', '😨', '😩', '😪', '😫', '😭', '😰', '😱',
                      '😳', '😵', '😶', '😾', '🙁', '🙏', '🚫', '>:[', ':-(', ':-c', ':c', ':-<', ':っC', ':<',
                      ':-[', ':[', ':{'}

# Substrings counted as positive emoticons by CountEmoticons.
# NOTE(review): 'v' is a single letter, so it is also counted inside ordinary
# words; likewise ':c)' contains the negative ':c'. Kept as-is because the
# counts feed existing features -- confirm whether this is intended.
positive_emoticons = {'=))', 'v', ';)', '^^', '<3', '☀', '☺', '♡', '♥', '✌', '✨', '❣', '❤', '🌝', '🌷', '🌸',
                      '🌺', '🌼', '🍓', '🎈', '🐅', '🐶', '🐾', '👉', '👌', '👍', '👏', '👻', '💃', '💄', '💋',
                      '💌', '💎', '💐', '💓', '💕', '💖', '💗', '💙', '💚', '💛', '💜', '💞', ':-)', ':)', ':D', ':o)',
                      ':]', ':3', ':c)', ':>', '=]', '8)'}


class Lowercase(BaseEstimator, TransformerMixin):
    """Lower-case every string of the input."""

    def transform(self, x):
        """Return a list with ``s.lower()`` for each string ``s`` in ``x``."""
        return [s.lower() for s in x]

    def fit(self, x, y=None):
        """Do nothing and return ``self``."""
        return self


class RemoveTone(BaseEstimator, TransformerMixin):
    """Pass every string of the input through ``unidecode``."""

    def remove_tone(self, s):
        """Return ``unidecode.unidecode(s)`` (ASCII transliteration of ``s``)."""
        return unidecode.unidecode(s)

    def transform(self, x):
        """Return a list with ``remove_tone(s)`` for each string ``s`` in ``x``."""
        return [self.remove_tone(s) for s in x]

    def fit(self, x, y=None):
        """Do nothing and return ``self``."""
        return self


class CountEmoticons(BaseEstimator, TransformerMixin):
    """Turn every string into a ``(positive_count, negative_count)`` pair."""

    def count_emoticon(self, s):
        """Count emoticon occurrences in ``s``.

        Each emoticon is counted independently with ``str.count``, so
        emoticons that contain one another contribute to both totals.

        Returns
        -------
        tuple
            ``(positive_count, negative_count)``.
        """
        positive_count = sum(s.count(emoticon) for emoticon in positive_emoticons)
        negative_count = sum(s.count(emoticon) for emoticon in negative_emoticons)
        return positive_count, negative_count

    def transform(self, x):
        """Return a list with ``count_emoticon(s)`` for each string ``s`` in ``x``."""
        return [self.count_emoticon(s) for s in x]

    def fit(self, x, y=None):
        """Do nothing and return ``self``."""
        return self


class Tokenize(BaseEstimator, TransformerMixin):
    """Tokenize every string and mask punctuation tokens as ``'punc'``."""

    def pun_num(self, s):
        """Replace punctuation tokens of ``s`` (except ``'.'``) with ``'punc'``.

        Tokens come from a whitespace split of the original ``s``. A token is
        treated as punctuation when it is a substring of
        ``string.punctuation``. The replacement uses ``str.replace`` on the
        whole string, so the same characters are also replaced where they
        occur inside other tokens.
        """
        # s.split() is evaluated once, so rebinding ``s`` inside the loop
        # does not affect which tokens are visited.
        for token in s.split():
            if token != '.' and token in string.punctuation:
                s = s.replace(token, 'punc')
        return s

    def transform(self, x):
        """Return ``pun_num(tokenize(s, format='text'))`` for each ``s`` in ``x``."""
        return [self.pun_num(tokenize(s, format='text')) for s in x]

    def fit(self, x, y=None):
        """Do nothing and return ``self``."""
        return self