Commit 5ba95ea

Release language identification APIs which can recognize 176 languages
1 parent 125b2b0 commit 5ba95ea

13 files changed: +271 -6 lines changed

README.md

+1-1
````diff
@@ -69,7 +69,7 @@ the [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) licens
 <dependency>
   <groupId>com.hankcs.hanlp.restful</groupId>
   <artifactId>hanlp-restful</artifactId>
-  <version>0.0.11</version>
+  <version>0.0.12</version>
 </dependency>
 ```
````

hanlp/components/classifiers/fasttext_classifier.py (new file; path inferred from the registered classpath)

+95

```python
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 13:31
import os
import sys
from typing import List, Union

import fasttext
from fasttext.FastText import _FastText

import hanlp
from hanlp.common.component import Component
from hanlp.utils.io_util import get_resource, stdout_redirected
from hanlp_common.io import load_json
from hanlp_common.reflection import classpath_of
from hanlp_common.structure import SerializableDict


class FastTextClassifier(Component):

    def __init__(self) -> None:
        super().__init__()
        self._model: _FastText = None
        self.config = SerializableDict({
            'classpath': classpath_of(self),
            'hanlp_version': hanlp.__version__,
        })

    def load(self, save_dir, model_path=None, **kwargs):
        config_path = os.path.join(save_dir, 'config.json')
        if os.path.isfile(config_path):
            self.config: dict = load_json(config_path)
            model_path = self.config.get('model_path', model_path)
        else:
            model_path = model_path or save_dir
        self.config['model_path'] = model_path
        filepath = get_resource(model_path)
        with stdout_redirected(to=os.devnull, stdout=sys.stderr):
            self._model = fasttext.load_model(filepath)

    def predict(self, text: Union[str, List[str]], topk=False, prob=False, max_len=None, **kwargs):
        """
        Classify text.

        Args:
            text: A document or a list of documents.
            topk: ``True`` or ``int`` to return the top-k labels.
            prob: Also return probabilities.
            max_len: Truncate long documents to ``max_len`` characters for faster prediction.
            **kwargs: Not used.

        Returns:
            Classification results.
        """
        num_labels = len(self._model.get_labels())
        flat = isinstance(text, str)
        if flat:
            text = [text]
        if not isinstance(topk, list):
            topk = [topk] * len(text)
        if not isinstance(prob, list):
            prob = [prob] * len(text)
        if max_len:
            text = [x[:max_len] for x in text]
        text = [x.replace('\n', ' ') for x in text]
        batch_labels, batch_probs = self._model.predict(text, k=num_labels)
        results = []
        for labels, probs, k, p in zip(batch_labels, batch_probs, topk, prob):
            labels = [self._strip_prefix(x) for x in labels]
            if k is False:
                labels = labels[0]
            elif k is True:
                pass
            elif k:
                labels = labels[:k]
            if p:
                probs = probs.tolist()
                if k is False:
                    result = labels, probs[0]
                else:
                    result = dict(zip(labels, probs))
            else:
                result = labels
            results.append(result)
        if flat:
            results = results[0]
        return results

    @property
    def labels(self):
        return [self._strip_prefix(x) for x in self._model.get_labels()]

    @staticmethod
    def _strip_prefix(label: str):
        return label[len('__label__'):]
```

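Besides going through `hanlp.load` (see the demo further down), the component can also be driven directly. Below is a minimal sketch, not part of this commit, assuming the optional fastText dependency is installed and that the raw model URL may be passed as `save_dir`, which the fallback in `load()` above permits; the import path follows the classpath registered in component_util.py, and the inputs and commented outputs are illustrative only.

```python
import hanlp
from hanlp.components.classifiers.fasttext_classifier import FastTextClassifier

lid = FastTextClassifier()
# The URL is not a local directory containing config.json, so load() falls back to
# treating it as model_path and get_resource() downloads/caches the ~917kB .ftz model.
lid.load(hanlp.pretrained.classifiers.LID_176_FASTTEXT_SMALL)

print(lid('HanLP est une bibliothèque de traitement du langage naturel.'))             # single code, e.g. 'fr'
print(lid('HanLP est une bibliothèque de traitement du langage naturel.', prob=True))  # (label, probability) tuple
print(lid('HanLP is a multilingual NLP toolkit.', topk=3, prob=True))                  # dict of the top-3 labels
```

The return shapes follow `predict()` above: a plain string by default, a `(label, prob)` tuple with `prob=True`, and a label-to-probability dict when `topk` is combined with `prob`.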
hanlp/pretrained/classifiers.py

+9
```diff
@@ -6,4 +6,13 @@
 CHNSENTICORP_BERT_BASE_ZH = HANLP_URL + 'classification/chnsenticorp_bert_base_20211228_163210.zip'
 SST2_ALBERT_BASE_EN = HANLP_URL + 'classification/sst2_albert_base_20211228_164917.zip'
 
+LID_176_FASTTEXT_BASE = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
+'''
+126MB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes.
+'''
+LID_176_FASTTEXT_SMALL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'
+'''
+917kB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes.
+'''
+
 ALL = {}
```

hanlp/utils/component_util.py

+7
```diff
@@ -65,6 +65,13 @@ def load_from_meta_file(save_dir: str, meta_filename='meta.json', transform_only
                        'embed': {'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbedding',
                                  'filepath': identifier, 'src': 'token'},
                        'hanlp_version': version.__version__}, metapath)
+        elif identifier in {pretrained.classifiers.LID_176_FASTTEXT_SMALL,
+                            pretrained.classifiers.LID_176_FASTTEXT_BASE}:
+            save_dir = os.path.dirname(save_dir)
+            metapath = os.path.join(save_dir, 'config.json')
+            save_json({'classpath': 'hanlp.components.classifiers.fasttext_classifier.FastTextClassifier',
+                       'model_path': identifier,
+                       'hanlp_version': version.__version__}, metapath)
         else:
             raise FileNotFoundError(f'The identifier {save_dir} resolves to a nonexistent meta file {metapath}. {tips}')
     meta: dict = load_json(metapath)
```
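
The new `elif` branch is what lets the two raw fastText URLs behave like ordinary pretrained identifiers: when `hanlp.load` receives one of them, a `config.json` pointing at `FastTextClassifier` is synthesized next to the cached model and loading proceeds as usual. A minimal sketch of the resulting call path, assuming the small model can be downloaded; the commented output is illustrative:

```python
import hanlp
from hanlp.pretrained.classifiers import LID_176_FASTTEXT_SMALL

# load_from_meta_file() recognizes the URL, writes a config.json with the
# FastTextClassifier classpath and the URL as model_path, then loads the component.
lid = hanlp.load(LID_176_FASTTEXT_SMALL)
print(lid('Hallo Welt, wie geht es euch?'))  # e.g. 'de'
```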

hanlp/version.py

+1-1
```diff
@@ -2,7 +2,7 @@
 # Author: hankcs
 # Date: 2019-12-28 19:26
 
-__version__ = '2.1.0-beta.41'
+__version__ = '2.1.0-beta.42'
 """HanLP version"""
 
 
```
88

New demo file (native language-identification API)

+19

```python
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 16:49
import hanlp

lid = hanlp.load(hanlp.pretrained.classifiers.LID_176_FASTTEXT_BASE)

print(lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.'))
lang, prob = lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)
print(f'{lang} language identified with probability {prob:.3%}')
print(lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2))

# For a combination of languages, predict top-k languages with probabilities:
text = '''
2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。
In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.
'''

print(lid(text, topk=3, prob=True))
```

New demo file (RESTful language-identification API)

+12

```python
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 16:49
from hanlp_restful import HanLPClient

HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul')

print(HanLP.language_identification([
    'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
    '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
    '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
]))
```

plugins/hanlp_restful/hanlp_restful/__init__.py

+50
```diff
@@ -468,3 +468,53 @@ def grammatical_error_correction(self, text: Union[str, List[str]], language: st
                                         {'text': text,
                                          'language': language or self._language})
         return response
+
+    def text_classification(self, text: Union[str, List[str]], model, topk=False, prob=False) -> Union[
+        str, Dict[str, float], List[Union[str, Dict[str, float]]]]:
+        """
+        Text classification is the task of assigning a sentence or document an appropriate category.
+        The set of categories depends on the chosen model.
+
+        Args:
+            text: A document or a list of documents.
+            model: The model to use for prediction.
+            topk: ``True`` or ``int`` to return the top-k labels.
+            prob: Also return probabilities.
+
+        Returns:
+
+            Classification results.
+        """
+        response = self._send_post_json(self._url + '/text_classification',
+                                        {'text': text, 'model': model, 'topk': topk, 'prob': prob})
+        return response
+
+    def language_identification(self, text: Union[str, List[str]], topk=False, prob=False) -> Union[
+        str, Dict[str, float], List[Union[str, Dict[str, float]]]]:
+        """
+        Recognize the language of a given text.
+
+        Args:
+            text: A document or a list of documents.
+            topk: ``True`` or ``int`` to return the top-k languages.
+            prob: Also return probabilities.
+
+        Returns:
+
+            Identified language in `ISO 639-1 codes`_.
+
+        Examples::
+
+            lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.')
+            'en'
+            lang, prob = lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)
+            ('ja', 0.9976244568824768)
+            lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)
+            ['zh', 'ja']
+            lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2, prob=True)
+            {'zh': 0.3952908217906952, 'en': 0.37189167737960815, 'ja': 0.056213412433862686}
+
+        .. _ISO 639-1 codes:
+            https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+        """
+        return self.text_classification(text, 'lid', topk, prob)
```
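
Putting the new client method together with `topk` and `prob`, a short sketch against the public endpoint; anonymous access (`auth=None`) is rate-limited, and the commented outputs are illustrative rather than recorded responses:

```python
from hanlp_restful import HanLPClient

HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul')

print(HanLP.language_identification('La vie est belle.'))                     # single code, e.g. 'fr'
print(HanLP.language_identification('La vie est belle.', prob=True))          # (language, probability)
print(HanLP.language_identification('La vie est belle.', topk=3, prob=True))  # dict of the top-3 languages
```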

plugins/hanlp_restful/setup.py

+1-1
```diff
@@ -10,7 +10,7 @@
 
 setup(
     name='hanlp_restful',
-    version='0.0.20',
+    version='0.0.21',
     description='HanLP: Han Language Processing',
     long_description=long_description,
     long_description_content_type="text/markdown",
```

plugins/hanlp_restful_java/pom.xml

+1-1
```diff
@@ -6,7 +6,7 @@
 
     <groupId>com.hankcs.hanlp.restful</groupId>
     <artifactId>hanlp-restful</artifactId>
-    <version>0.0.11</version>
+    <version>0.0.12</version>
 
     <name>HanLP RESTful Client in Java</name>
     <url>https://github.com/hankcs/HanLP</url>
```

plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/HanLPClient.java

+61
```diff
@@ -459,6 +459,67 @@ public Map<String, Double> extractiveSummarization(String text, int topk) throws
         return mapper.readValue(post("/extractive_summarization", input), LinkedHashMap.class);
     }
 
+    /**
+     * Text classification is the task of assigning a sentence or document an appropriate category.
+     * The set of categories depends on the chosen model.
+     *
+     * @param text  The text content of the document.
+     * @param model The model to use for prediction.
+     * @return Classification results.
+     * @throws IOException HTTP errors.
+     */
+    public String textClassification(String text, String model) throws IOException
+    {
+        return (String) textClassification(text, model, false, false);
+    }
+
+
+    /**
+     * Text classification is the task of assigning a sentence or document an appropriate category.
+     * The set of categories depends on the chosen model.
+     *
+     * @param text  A document or a list of documents.
+     * @param model The model to use for prediction.
+     * @param topk  `true` or an integer to return the top-k labels.
+     * @param prob  Also return probabilities.
+     * @return Classification results.
+     * @throws IOException HTTP errors.
+     */
+    public Object textClassification(Object text, String model, Object topk, boolean prob) throws IOException
+    {
+        Map<String, Object> input = new HashMap<>();
+        input.put("text", text);
+        input.put("model", model);
+        input.put("topk", topk);
+        input.put("prob", prob);
+        //noinspection unchecked
+        return mapper.readValue(post("/text_classification", input), Object.class);
+    }
+
+    /**
+     * Recognize the language of a given text.
+     *
+     * @param text The text content of the document.
+     * @return Identified language in <a href="https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes">ISO 639-1 codes</a>.
+     * @throws IOException HTTP errors.
+     */
+    public String languageIdentification(String text) throws IOException
+    {
+        return textClassification(text, "lid");
+    }
+
+    /**
+     * Recognize the languages of the given texts.
+     *
+     * @param text A list of documents.
+     * @return Identified languages in <a href="https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes">ISO 639-1 codes</a>.
+     * @throws IOException HTTP errors.
+     */
+    public List<String> languageIdentification(String[] text) throws IOException
+    {
+        return (List<String>) textClassification(text, "lid", false, false);
+    }
+
     /**
      * Keyphrase extraction aims to identify keywords or phrases reflecting the main topics of a document.
      *
```

plugins/hanlp_restful_java/src/test/java/com/hankcs/hanlp/restful/HanLPClientTest.java

+10
```diff
@@ -149,6 +149,16 @@ void grammaticalErrorCorrection() throws IOException
         prettyPrint(client.grammaticalErrorCorrection(new String[]{"每个青年都应当有远大的报复。", "有的同学对语言很兴趣。"}));
     }
 
+    @Test
+    void languageIdentification() throws IOException
+    {
+        prettyPrint(client.languageIdentification(new String[]{
+                "In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.",
+                "2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。",
+                "2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。",
+        }));
+    }
+
     void prettyPrint(Object object) throws JsonProcessingException
     {
         ObjectMapper mapper = new ObjectMapper();
```

setup.py

+4-2
```diff
@@ -11,19 +11,21 @@
 with open(join(this_dir, "hanlp", "version.py")) as fp:
     exec(fp.read(), version)
 
+FASTTEXT = 'fasttext-wheel==0.9.2'
 extras_require = {
     'amr': [
         'penman==1.2.1',
         'networkx>=2.5.1',
         'perin-parser>=0.0.12',
     ],
+    'fasttext': [FASTTEXT],
     'tf': [
-        'fasttext-wheel==0.9.2',
+        FASTTEXT,
         'tensorflow==2.6.0',
         'keras==2.6.0',
     ]
 }
-extras_require['full'] = sum(extras_require.values(), [])
+extras_require['full'] = list(set(sum(extras_require.values(), [])))
 
 setup(
     name='hanlp',
```
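
Because `fasttext-wheel` is now shared by the new `fasttext` extra and the existing `tf` extra, the `full` extra is de-duplicated with `list(set(...))` so the shared pin is listed only once. A tiny sketch of that behaviour, with illustrative values rather than the real dependency lists:

```python
# sum() alone would repeat the shared pin; wrapping it in set() removes the duplicate.
extras_require = {'fasttext': ['fasttext-wheel==0.9.2'],
                  'tf': ['fasttext-wheel==0.9.2', 'tensorflow==2.6.0', 'keras==2.6.0']}
full = list(set(sum(extras_require.values(), [])))
assert full.count('fasttext-wheel==0.9.2') == 1
```

The new extra itself would presumably be installed with `pip install hanlp[fasttext]`.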
