GH-244: update UTS2017_BANK_SA model (#258)

undertheseanlp · Jun 15, 2019 · 6b3b061 · 6b3b061
1 parent 862f59d
commit 6b3b061
Show file tree

Hide file tree

Showing 12 changed files with 158 additions and 68 deletions.
diff --git a/README.rst b/README.rst
@@ -236,7 +236,10 @@ Install dependencies
 
 .. code-block:: bash
 
-    $ pip install future scipy numpy scikit-learn==0.19.2 joblib
+    $ pip install git+https://github.com/facebookresearch/fastText.git
+    $ pip install unidecode
+    $ underthesea download sa_bank
+
 
 Usage
 
@@ -245,10 +248,10 @@ Usage
 
     >>> # -*- coding: utf-8 -*-
     >>> from underthesea import sentiment
-    >>> sentiment('Gọi mấy lần mà lúc nào cũng là các chuyên viên đang bận hết ạ', domain='bank')
-    ('CUSTOMER SUPPORT#NEGATIVE',)
-    >>> sentiment('bidv cho vay hay ko phu thuoc y thich cua thang tham dinh, ko co quy dinh ro rang', domain='bank')
-    ('LOAN#NEGATIVE',)
+    >>> sentiment('Đky qua đường link ở bài viết này từ thứ 6 mà giờ chưa thấy ai lhe hết', domain='bank')
+    ['CUSTOMER_SUPPORT#negative']
+    >>> sentiment('Xem lại vẫn thấy xúc động và tự hào về BIDV của mình', domain='bank')
+    ['TRADEMARK#positive']
 
 Up Coming Features
 ----------------------------------------

diff --git a/tests/sentiment/test_sentiment.py b/tests/sentiment/test_sentiment.py
@@ -4,44 +4,34 @@
 
 
 class TestSentiment(TestCase):
-    def test_sentiment(self):
+    def test_no_text(self):
         text = ""
         actual = sentiment(text)
         expected = None
         self.assertEqual(actual, expected)
 
-    def test_sentiment_1(self):
-        text = "Gọi mấy lần mà lúc nào cũng là các chuyên viên đang bận hết ạ"
-        actual = sentiment(text, domain="bank")
-        expected = ('CUSTOMER SUPPORT#NEGATIVE',)
+    def test_one_label_1(self):
+        text = "Xem lại vẫn thấy xúc động và tự hào về BIDV của mình!"
+        actual = [str(label) for label in sentiment(text, domain="bank")]
+        expected = ['TRADEMARK#positive']
         self.assertEqual(actual, expected)
 
-    def test_sentiment_2(self):
-        text = "bidv cho vay hay ko phu thuoc y thich cua thang tham dinh, ko co quy dinh ro rang"
-        actual = sentiment(text, domain="bank")
-        expected = ('LOAN#NEGATIVE',)
+    def test_one_label_2(self):
+        text = "Đky qua đường link ở bài viết này từ thứ 6 mà giờ chưa thấy ai lhe hết"
+        actual = [str(label) for label in sentiment(text, domain="bank")]
+        expected = ['CUSTOMER_SUPPORT#negative']
         self.assertEqual(actual, expected)
 
-    def test_sentiment_3(self):
-        text = "Vừa smartbidv, vừa bidv online mà lại k dùng chung 1 tài khoản đăng nhập, rắc rối!"
-        actual = sentiment(text, domain="bank")
-        expected = ('INTERNET BANKING#NEGATIVE',)
-        self.assertEqual(actual, expected)
-
-    def test_sentiment_4(self):
-        text = "Không tin tưởng vào ngân hàng BIDV"
-        actual = sentiment(text, domain="bank")
-        expected = ('TRADEMARK#NEGATIVE',)
-        self.assertEqual(actual, expected)
-
-    def test_sentiment_5(self):
-        text = "Chương trình này của BIDV thật ý nghĩa"
-        actual = sentiment(text, domain="bank")
-        expected = ('PROMOTION#POSITIVE',)
-        self.assertEqual(actual, expected)
-
-    def test_sentiment_6(self):
-        text = "mình cũng vui vì tiết kệm được thời gian"
-        actual = sentiment(text, domain="bank")
-        expected = ('PAYMENT#POSITIVE',)
-        self.assertEqual(actual, expected)
+    def test_multi_label_1(self):
+        text = "Dkm t chuyển vẫn bị mất phí"
+        actual = [str(label) for label in sentiment(text, domain="bank")]
+        expected = ['INTEREST_RATE#negative', 'MONEY_TRANSFER#negative']
+        self.assertEqual(sorted(actual), sorted(expected))
+
+    def test_multi_label_2(self):
+        text = '''TUI cũng bó tay với BIDV Cần Thơ.
+                Cả quận NK mà chỉ được lèo tèo mấy thùng ATM và luôn trong tình trạng nhìn thấy chữ Sorry cũng nh.ư hết tiền.
+                Chán ko buồn nói. Qd có khác '''
+        actual = [str(label) for label in sentiment(text, domain="bank")]
+        expected = ['CARD#negative', 'CUSTOMER_SUPPORT#negative']
+        self.assertEqual(sorted(actual), sorted(expected))
diff --git a/tox.ini b/tox.ini
@@ -27,6 +27,7 @@ commands =
     python -m unittest discover tests.classification
 
     ; sentiment module
+    underthesea download sa_bank
     python -m unittest discover tests.sentiment
 
 ; If you want to make tox run the tests with the same versions, create a

diff --git a/underthesea/cli.py b/underthesea/cli.py
@@ -13,12 +13,6 @@ def main(args=None):
     pass
 
 
-# @main.command()
-# @click.argument('component')
-# def download(component):
-#     download_component(component)
-
-
 @main.command()
 @click.argument('model', required=True)
 def download(model):

diff --git a/underthesea/model_fetcher.py b/underthesea/model_fetcher.py
@@ -15,6 +15,7 @@
 class UTSModel(Enum):
     tc_bank = "tc_bank"
     tc_general = "tc_general"
+    sa_bank = "sa_bank"
 
 
 class ModelFetcher:
@@ -58,6 +59,19 @@ def download_model(model_name):
             )
             os.remove(model_path)
 
+        if model_name == "sa_bank":
+            url = "https://www.dropbox.com/s/yo6sf6ofpdb3hlh/sa_svm_uts2017_bank_20190611.zip?dl=1"
+            cached_path(url, cache_dir=cache_dir)
+            model_path = Path(CACHE_ROOT) / cache_dir / "sa_svm_uts2017_bank_20190611.zip?dl=1"
+            cache_folder = Path(CACHE_ROOT) / cache_dir
+            zip = zipfile.ZipFile(model_path)
+            zip.extractall(cache_folder)
+            os.rename(
+                Path(CACHE_ROOT) / cache_dir / "sa_svm_uts2017_bank_20190611",
+                Path(CACHE_ROOT) / cache_dir / "sa_bank",
+            )
+            os.remove(model_path)
+
     @staticmethod
     def list(all):
         models = []
@@ -99,3 +113,6 @@ def get_model_path(model):
 
         if model == UTSModel.tc_general:
             return Path(CACHE_ROOT) / "models" / "tc_general"
+
+        if model == UTSModel.sa_bank:
+            return Path(CACHE_ROOT) / "models" / "sa_bank"
diff --git a/underthesea/models.py b/underthesea/models.py
@@ -13,4 +13,11 @@
         "year": "2019",
         "model_path": "tc_general"
     },
+    "sa_bank": {
+        "cache_dir": "models",
+        "type": "Sentiment",
+        "license": "Open",
+        "year": "2019",
+        "model_path": "sa_bank"
+    },
 }
diff --git a/underthesea/sentiment/__init__.py b/underthesea/sentiment/__init__.py
@@ -16,17 +16,18 @@ def sentiment(X, domain=None):
             * bank: bank domain
     Returns
     =======
-    tokens: list
-        sentiment of sentence
+    labels: list
+        sentiment labels of the sentence (``None`` when the input is an empty string)
 
     Examples
     --------
 
     >>> # -*- coding: utf-8 -*-
     >>> from underthesea import sentiment
-    >>> sentence = "Vừa smartbidv, vừa bidv online mà lại k dùng chung 1 tài khoản đăng nhập, rắc rối!"
+    >>> sentence = "Chuyen tiền k nhận Dc tiên"
     >>> sentiment(sentence, domain='bank')
-    ('INTERNET BANKING#NEGATIVE',)
+    [MONEY_TRANSFER#negative (1.0)]
+
     """
     if X == "":
         return None

diff --git a/underthesea/sentiment/bank/__init__.py b/underthesea/sentiment/bank/__init__.py
@@ -1,26 +1,33 @@
-import joblib
-from os.path import join, dirname
+import logging
+import os
+from os.path import dirname
 import sys
+from languageflow.data import Sentence
+from languageflow.models.text_classifier import TextClassifier
+from underthesea.model_fetcher import ModelFetcher, UTSModel
 
-sys.path.insert(0, dirname(__file__))
 
-bank_sentiment = {}
+FORMAT = '%(message)s'
+logging.basicConfig(format=FORMAT)
+logger = logging.getLogger('underthesea')
 
+sys.path.insert(0, dirname(dirname(__file__)))
+model_path = ModelFetcher.get_model_path(UTSModel.sa_bank)
+classifier = None
 
-def sentiment(X):
-    global bank_sentiment
-    if "x_transform" not in bank_sentiment:
-        bank_sentiment["x_transform"] = joblib.load(join(dirname(__file__), "count.transformer.bin"))
-    if "y_transform" not in bank_sentiment:
-        bank_sentiment["y_transform"] = joblib.load(join(dirname(__file__), "label.transformer.bin"))
-    if "estimator" not in bank_sentiment:
-        bank_sentiment["estimator"] = joblib.load(join(dirname(__file__), "model.bin"))
-    x_transform = bank_sentiment["x_transform"]
-    y_transform = bank_sentiment["y_transform"]
-    estimator = bank_sentiment["estimator"]
-    if isinstance(X, list):
-        return y_transform.inverse_transform(
-            estimator.predict(x_transform.transform(X)))
-    else:
-        return y_transform.inverse_transform(
-            estimator.predict(x_transform.transform([X])))[0]
+
+def sentiment(text):
+    global classifier
+
+    if not classifier:
+        if os.path.exists(model_path):
+            classifier = TextClassifier.load(model_path)
+        else:
+            logger.error(
+                f"Could not load model at {model_path}.\n"
+                f"Download model with \"underthesea download {UTSModel.sa_bank.value}\".")
+            sys.exit(1)
+    sentence = Sentence(text)
+    classifier.predict(sentence)
+    labels = sentence.labels
+    return [label.value for label in labels]
diff --git a/underthesea/sentiment/bank/count.transformer.bin b/underthesea/sentiment/bank/count.transformer.bin
diff --git a/underthesea/sentiment/bank/label.transformer.bin b/underthesea/sentiment/bank/label.transformer.bin
diff --git a/underthesea/sentiment/bank/model.bin b/underthesea/sentiment/bank/model.bin
diff --git a/underthesea/sentiment/text_features.py b/underthesea/sentiment/text_features.py
@@ -0,0 +1,70 @@
+import unidecode
+from sklearn.base import BaseEstimator, TransformerMixin
+import string
+from underthesea.word_tokenize.regex_tokenize import tokenize
+
+
+negative_emoticons = {':(', '☹', '❌', '👎', '👹', '💀', '🔥', '🤔', '😏', '😐', '😑', '😒', '😓', '😔', '😕', '😖',
+                      '😞', '😟', '😠', '😡', '😢', '😣', '😤', '😥', '😧', '😨', '😩', '😪', '😫', '😭', '😰', '😱',
+                      '😳', '😵', '😶', '😾', '🙁', '🙏', '🚫', '>:[', ':-(', ':(', ':-c', ':c', ':-<', ':っC', ':<',
+                      ':-[', ':[', ':{'}
+
+positive_emoticons = {'=))', 'v', ';)', '^^', '<3', '☀', '☺', '♡', '♥', '✌', '✨', '❣', '❤', '🌝', '🌷', '🌸',
+                      '🌺', '🌼', '🍓', '🎈', '🐅', '🐶', '🐾', '👉', '👌', '👍', '👏', '👻', '💃', '💄', '💋',
+                      '💌', '💎', '💐', '💓', '💕', '💖', '💗', '💙', '💚', '💛', '💜', '💞', ':-)', ':)', ':D', ':o)',
+                      ':]', ':3', ':c)', ':>', '=]', '8)'}
+
+
+class Lowercase(BaseEstimator, TransformerMixin):
+    def transform(self, x):
+        return [s.lower() for s in x]
+
+    def fit(self, x, y=None):
+        return self
+
+
+class RemoveTone(BaseEstimator, TransformerMixin):
+    def remove_tone(self, s):
+        return unidecode.unidecode(s)
+
+    def transform(self, x):
+        return [self.remove_tone(s) for s in x]
+
+    def fit(self, x, y=None):
+        return self
+
+
+class CountEmoticons(BaseEstimator, TransformerMixin):
+    def count_emoticon(self, s):
+        positive_count = 0
+        negative_count = 0
+        for emoticon in positive_emoticons:
+            positive_count += s.count(emoticon)
+        for emoticon in negative_emoticons:
+            negative_count += s.count(emoticon)
+        return positive_count, negative_count
+
+    def transform(self, x):
+        return [self.count_emoticon(s) for s in x]
+
+    def fit(self, x, y=None):
+        return self
+
+
+class Tokenize(BaseEstimator, TransformerMixin):
+    def pun_num(self, s):
+        for token in s.split():
+            if token in string.punctuation:
+                if token == '.':
+                    s = s
+                else:
+                    s = s.replace(token, 'punc')
+            else:
+                s = s
+        return s
+
+    def transform(self, x):
+        return [self.pun_num(tokenize(s, format='text')) for s in x]
+
+    def fit(self, x, y=None):
+        return self