Commit 5ba95ea

Release language identification APIs which can recognize 176 languages
1 parent 125b2b0 commit 5ba95ea

13 files changed: +271 -6 lines changed

README.md

+1-1
````diff
@@ -69,7 +69,7 @@ the [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) licens
 <dependency>
   <groupId>com.hankcs.hanlp.restful</groupId>
   <artifactId>hanlp-restful</artifactId>
-  <version>0.0.11</version>
+  <version>0.0.12</version>
 </dependency>
 ```
````

hanlp/components/classifiers/fasttext_classifier.py (new file; path inferred from the registered classpath)

+95

```python
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 13:31
import os
import sys
from typing import List, Union

import fasttext
from fasttext.FastText import _FastText

import hanlp
from hanlp.common.component import Component
from hanlp.utils.io_util import get_resource, stdout_redirected
from hanlp_common.io import load_json
from hanlp_common.reflection import classpath_of
from hanlp_common.structure import SerializableDict


class FastTextClassifier(Component):

    def __init__(self) -> None:
        super().__init__()
        self._model: _FastText = None
        self.config = SerializableDict({
            'classpath': classpath_of(self),
            'hanlp_version': hanlp.__version__,
        })

    def load(self, save_dir, model_path=None, **kwargs):
        config_path = os.path.join(save_dir, 'config.json')
        if os.path.isfile(config_path):
            self.config: dict = load_json(config_path)
            model_path = self.config.get('model_path', model_path)
        else:
            model_path = model_path or save_dir
        self.config['model_path'] = model_path
        filepath = get_resource(model_path)
        with stdout_redirected(to=os.devnull, stdout=sys.stderr):
            self._model = fasttext.load_model(filepath)

    def predict(self, text: Union[str, List[str]], topk=False, prob=False, max_len=None, **kwargs):
        """
        Classify text.

        Args:
            text: A document or a list of documents.
            topk: ``True`` or ``int`` to return the top-k labels.
            prob: Also return probabilities.
            max_len: Truncate long documents to ``max_len`` characters for faster prediction.
            **kwargs: Not used.

        Returns:
            Classification results.
        """
        num_labels = len(self._model.get_labels())
        flat = isinstance(text, str)
        if flat:
            text = [text]
        if not isinstance(topk, list):
            topk = [topk] * len(text)
        if not isinstance(prob, list):
            prob = [prob] * len(text)
        if max_len:
            text = [x[:max_len] for x in text]
        text = [x.replace('\n', ' ') for x in text]
        batch_labels, batch_probs = self._model.predict(text, k=num_labels)
        results = []
        for labels, probs, k, p in zip(batch_labels, batch_probs, topk, prob):
            labels = [self._strip_prefix(x) for x in labels]
            if k is False:
                labels = labels[0]
            elif k is True:
                pass
            elif k:
                labels = labels[:k]
            if p:
                probs = probs.tolist()
                if k is False:
                    result = labels, probs[0]
                else:
                    result = dict(zip(labels, probs))
            else:
                result = labels
            results.append(result)
        if flat:
            results = results[0]
        return results

    @property
    def labels(self):
        return [self._strip_prefix(x) for x in self._model.get_labels()]

    @staticmethod
    def _strip_prefix(label: str):
        return label[len('__label__'):]
```

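Besides going through `hanlp.load` (see the demo further down), the component can also be driven directly. Below is a minimal sketch, not part of this commit, assuming the optional fastText dependency is installed and that the raw model URL may be passed as `save_dir`, which the fallback in `load()` above permits; the import path follows the classpath registered in component_util.py, and the inputs and commented outputs are illustrative only.

```python
import hanlp
from hanlp.components.classifiers.fasttext_classifier import FastTextClassifier

lid = FastTextClassifier()
# The URL is not a local directory containing config.json, so load() falls back to
# treating it as model_path and get_resource() downloads/caches the ~917kB .ftz model.
lid.load(hanlp.pretrained.classifiers.LID_176_FASTTEXT_SMALL)

print(lid('HanLP est une bibliothèque de traitement du langage naturel.'))             # single code, e.g. 'fr'
print(lid('HanLP est une bibliothèque de traitement du langage naturel.', prob=True))  # (label, probability) tuple
print(lid('HanLP is a multilingual NLP toolkit.', topk=3, prob=True))                  # dict of the top-3 labels
```

The return shapes follow `predict()` above: a plain string by default, a `(label, prob)` tuple with `prob=True`, and a label-to-probability dict when `topk` is combined with `prob`.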
hanlp/pretrained/classifiers.py

+9
```diff
@@ -6,4 +6,13 @@
 CHNSENTICORP_BERT_BASE_ZH = HANLP_URL + 'classification/chnsenticorp_bert_base_20211228_163210.zip'
 SST2_ALBERT_BASE_EN = HANLP_URL + 'classification/sst2_albert_base_20211228_164917.zip'
 
+LID_176_FASTTEXT_BASE = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
+'''
+126MB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes.
+'''
+LID_176_FASTTEXT_SMALL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'
+'''
+917kB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes.
+'''
+
 ALL = {}
```

hanlp/utils/component_util.py

+7
```diff
@@ -65,6 +65,13 @@ def load_from_meta_file(save_dir: str, meta_filename='meta.json', transform_only
                        'embed': {'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbedding',
                                  'filepath': identifier, 'src': 'token'},
                        'hanlp_version': version.__version__}, metapath)
+        elif identifier in {pretrained.classifiers.LID_176_FASTTEXT_SMALL,
+                            pretrained.classifiers.LID_176_FASTTEXT_BASE}:
+            save_dir = os.path.dirname(save_dir)
+            metapath = os.path.join(save_dir, 'config.json')
+            save_json({'classpath': 'hanlp.components.classifiers.fasttext_classifier.FastTextClassifier',
+                       'model_path': identifier,
+                       'hanlp_version': version.__version__}, metapath)
         else:
             raise FileNotFoundError(f'The identifier {save_dir} resolves to a nonexistent meta file {metapath}. {tips}')
     meta: dict = load_json(metapath)
```
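
The new `elif` branch is what lets the two raw fastText URLs behave like ordinary pretrained identifiers: when `hanlp.load` receives one of them, a `config.json` pointing at `FastTextClassifier` is synthesized next to the cached model and loading proceeds as usual. A minimal sketch of the resulting call path, assuming the small model can be downloaded; the commented output is illustrative:

```python
import hanlp
from hanlp.pretrained.classifiers import LID_176_FASTTEXT_SMALL

# load_from_meta_file() recognizes the URL, writes a config.json with the
# FastTextClassifier classpath and the URL as model_path, then loads the component.
lid = hanlp.load(LID_176_FASTTEXT_SMALL)
print(lid('Hallo Welt, wie geht es euch?'))  # e.g. 'de'
```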

hanlp/version.py

+1-1
```diff
@@ -2,7 +2,7 @@
 # Author: hankcs
 # Date: 2019-12-28 19:26
 
-__version__ = '2.1.0-beta.41'
+__version__ = '2.1.0-beta.42'
 """HanLP version"""
 
 
```
88

New demo file (native language-identification API)

+19

```python
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 16:49
import hanlp

lid = hanlp.load(hanlp.pretrained.classifiers.LID_176_FASTTEXT_BASE)

print(lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.'))
lang, prob = lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)
print(f'{lang} language identified with probability {prob:.3%}')
print(lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2))

# For a combination of languages, predict top-k languages with probabilities:
text = '''
2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。
In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.
'''

print(lid(text, topk=3, prob=True))
```

New demo file (RESTful language-identification API)

+12

```python
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 16:49
from hanlp_restful import HanLPClient

HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul')

print(HanLP.language_identification([
    'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
    '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
    '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
]))
```

plugins/hanlp_restful/hanlp_restful/__init__.py

+50
```diff
@@ -468,3 +468,53 @@ def grammatical_error_correction(self, text: Union[str, List[str]], language: st
                                         {'text': text,
                                          'language': language or self._language})
         return response
+
+    def text_classification(self, text: Union[str, List[str]], model, topk=False, prob=False) -> Union[
+        str, Dict[str, float], List[Union[str, Dict[str, float]]]]:
+        """
+        Text classification is the task of assigning a sentence or document an appropriate category.
+        The set of categories depends on the chosen model.
+
+        Args:
+            text: A document or a list of documents.
+            model: The model to use for prediction.
+            topk: ``True`` or ``int`` to return the top-k labels.
+            prob: Also return probabilities.
+
+        Returns:
+
+            Classification results.
+        """
+        response = self._send_post_json(self._url + '/text_classification',
+                                        {'text': text, 'model': model, 'topk': topk, 'prob': prob})
+        return response
+
+    def language_identification(self, text: Union[str, List[str]], topk=False, prob=False) -> Union[
+        str, Dict[str, float], List[Union[str, Dict[str, float]]]]:
+        """
+        Recognize the language of a given text.
+
+        Args:
+            text: A document or a list of documents.
+            topk: ``True`` or ``int`` to return the top-k languages.
+            prob: Also return probabilities.
+
+        Returns:
+
+            Identified language in `ISO 639-1 codes`_.
+
+        Examples::
+
+            lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.')
+            'en'
+            lang, prob = lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)
+            ('ja', 0.9976244568824768)
+            lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)
+            ['zh', 'ja']
+            lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2, prob=True)
+            {'zh': 0.3952908217906952, 'en': 0.37189167737960815, 'ja': 0.056213412433862686}
+
+        .. _ISO 639-1 codes:
+            https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+        """
+        return self.text_classification(text, 'lid', topk, prob)
```
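
Putting the new client method together with `topk` and `prob`, a short sketch against the public endpoint; anonymous access (`auth=None`) is rate-limited, and the commented outputs are illustrative rather than recorded responses:

```python
from hanlp_restful import HanLPClient

HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul')

print(HanLP.language_identification('La vie est belle.'))                     # single code, e.g. 'fr'
print(HanLP.language_identification('La vie est belle.', prob=True))          # (language, probability)
print(HanLP.language_identification('La vie est belle.', topk=3, prob=True))  # dict of the top-3 languages
```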

plugins/hanlp_restful/setup.py

+1-1
```diff
@@ -10,7 +10,7 @@
 
 setup(
     name='hanlp_restful',
-    version='0.0.20',
+    version='0.0.21',
     description='HanLP: Han Language Processing',
     long_description=long_description,
     long_description_content_type="text/markdown",
```

plugins/hanlp_restful_java/pom.xml

+1-1
```diff
@@ -6,7 +6,7 @@
 
     <groupId>com.hankcs.hanlp.restful</groupId>
     <artifactId>hanlp-restful</artifactId>
-    <version>0.0.11</version>
+    <version>0.0.12</version>
 
     <name>HanLP RESTful Client in Java</name>
     <url>https://github.com/hankcs/HanLP</url>
```

plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/HanLPClient.java

+61
```diff
@@ -459,6 +459,67 @@ public Map<String, Double> extractiveSummarization(String text, int topk) throws
         return mapper.readValue(post("/extractive_summarization", input), LinkedHashMap.class);
     }
 
+    /**
+     * Text classification is the task of assigning a sentence or document an appropriate category.
+     * The set of categories depends on the chosen model.
+     *
+     * @param text  The text content of the document.
+     * @param model The model to use for prediction.
+     * @return Classification results.
+     * @throws IOException HTTP errors.
+     */
+    public String textClassification(String text, String model) throws IOException
+    {
+        return (String) textClassification(text, model, false, false);
+    }
+
+
+    /**
+     * Text classification is the task of assigning a sentence or document an appropriate category.
+     * The set of categories depends on the chosen model.
+     *
+     * @param text  A document or a list of documents.
+     * @param model The model to use for prediction.
+     * @param topk  `true` or an integer to return the top-k labels.
+     * @param prob  Also return probabilities.
+     * @return Classification results.
+     * @throws IOException HTTP errors.
+     */
+    public Object textClassification(Object text, String model, Object topk, boolean prob) throws IOException
+    {
+        Map<String, Object> input = new HashMap<>();
+        input.put("text", text);
+        input.put("model", model);
+        input.put("topk", topk);
+        input.put("prob", prob);
+        //noinspection unchecked
+        return mapper.readValue(post("/text_classification", input), Object.class);
+    }
+
+    /**
+     * Recognize the language of a given text.
+     *
+     * @param text The text content of the document.
+     * @return Identified language in <a href="https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes">ISO 639-1 codes</a>.
+     * @throws IOException HTTP errors.
+     */
+    public String languageIdentification(String text) throws IOException
+    {
+        return textClassification(text, "lid");
+    }
+
+    /**
+     * Recognize the languages of the given texts.
+     *
+     * @param text A list of documents.
+     * @return Identified languages in <a href="https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes">ISO 639-1 codes</a>.
+     * @throws IOException HTTP errors.
+     */
+    public List<String> languageIdentification(String[] text) throws IOException
+    {
+        return (List<String>) textClassification(text, "lid", false, false);
+    }
+
     /**
      * Keyphrase extraction aims to identify keywords or phrases reflecting the main topics of a document.
      *
```

plugins/hanlp_restful_java/src/test/java/com/hankcs/hanlp/restful/HanLPClientTest.java

+10
```diff
@@ -149,6 +149,16 @@ void grammaticalErrorCorrection() throws IOException
         prettyPrint(client.grammaticalErrorCorrection(new String[]{"每个青年都应当有远大的报复。", "有的同学对语言很兴趣。"}));
     }
 
+    @Test
+    void languageIdentification() throws IOException
+    {
+        prettyPrint(client.languageIdentification(new String[]{
+                "In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.",
+                "2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。",
+                "2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。",
+        }));
+    }
+
     void prettyPrint(Object object) throws JsonProcessingException
     {
         ObjectMapper mapper = new ObjectMapper();
```

setup.py

+4-2
```diff
@@ -11,19 +11,21 @@
 with open(join(this_dir, "hanlp", "version.py")) as fp:
     exec(fp.read(), version)
 
+FASTTEXT = 'fasttext-wheel==0.9.2'
 extras_require = {
     'amr': [
         'penman==1.2.1',
         'networkx>=2.5.1',
         'perin-parser>=0.0.12',
     ],
+    'fasttext': [FASTTEXT],
     'tf': [
-        'fasttext-wheel==0.9.2',
+        FASTTEXT,
         'tensorflow==2.6.0',
         'keras==2.6.0',
     ]
 }
-extras_require['full'] = sum(extras_require.values(), [])
+extras_require['full'] = list(set(sum(extras_require.values(), [])))
 
 setup(
     name='hanlp',
```
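
Because `fasttext-wheel` is now shared by the new `fasttext` extra and the existing `tf` extra, the `full` extra is de-duplicated with `list(set(...))` so the shared pin is listed only once. A tiny sketch of that behaviour, with illustrative values rather than the real dependency lists:

```python
# sum() alone would repeat the shared pin; wrapping it in set() removes the duplicate.
extras_require = {'fasttext': ['fasttext-wheel==0.9.2'],
                  'tf': ['fasttext-wheel==0.9.2', 'tensorflow==2.6.0', 'keras==2.6.0']}
full = list(set(sum(extras_require.values(), [])))
assert full.count('fasttext-wheel==0.9.2') == 1
```

The new extra itself would presumably be installed with `pip install hanlp[fasttext]`.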
