From 2010d17a27029930a7bd004ac7743fa368222184 Mon Sep 17 00:00:00 2001
From: Vu Anh
Date: Thu, 18 Nov 2021 00:28:50 +0700
Subject: [PATCH] GH-185: fix error in pyproject.toml (#509)

---
 .github/workflows/release-pypi-core.yml       |   4 +-
 extensions/underthesea_core/Cargo.toml        |  25 +-
 extensions/underthesea_core/benches/lab.rs    |  41 +++
 extensions/underthesea_core/pyproject.toml    |   3 +-
 .../underthesea_core/src/featurizers.rs       | 258 ++++++------------
 extensions/underthesea_core/src/lib.rs        |   9 +-
 tests/benchmark/benchmark_word_tokenize.py    |  26 +-
 .../benchmark_word_tokenize_compare.py        |  79 ++++++
 8 files changed, 229 insertions(+), 216 deletions(-)
 create mode 100644 extensions/underthesea_core/benches/lab.rs
 create mode 100644 tests/benchmark/benchmark_word_tokenize_compare.py

diff --git a/.github/workflows/release-pypi-core.yml b/.github/workflows/release-pypi-core.yml
index ec1b45e9..dc79de99 100644
--- a/.github/workflows/release-pypi-core.yml
+++ b/.github/workflows/release-pypi-core.yml
@@ -37,7 +37,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7]
+        python-version: [3.6, 3.7, 3.8, 3.9]
         os: [ubuntu-latest, macos-latest]
     steps:
       - uses: actions/checkout@v1
@@ -68,4 +68,4 @@ jobs:
         working-directory: ./extensions/underthesea_core
         env:
           MATURIN_PASSWORD: ${{ secrets.PYPI_UNDERTHESEA_CORE_API_TOKEN }}
-        run: poetry run maturin publish --username __token__
\ No newline at end of file
+        run: poetry run maturin publish --username __token__ --interpreter python${{ matrix.python-version }}
\ No newline at end of file
diff --git a/extensions/underthesea_core/Cargo.toml b/extensions/underthesea_core/Cargo.toml
index d8b106c6..c56fc841 100644
--- a/extensions/underthesea_core/Cargo.toml
+++ b/extensions/underthesea_core/Cargo.toml
@@ -1,24 +1,33 @@
 [package]
 name = "underthesea_core"
-version = "0.0.4-alpha.6"
+version = "0.0.4-alpha.8"
+homepage = "https://github.com/undertheseanlp/underthesea/"
+repository = "https://github.com/undertheseanlp/underthesea/"
 authors = ["Vu Anh"]
-description = "Underthesea Core (fast & fun)"
 license = "GPL-3.0"
 edition = "2021"
 readme = "README.md"
-repository = "https://github.com/undertheseanlp/underthesea/"
-homepage = "https://github.com/undertheseanlp/underthesea/"
+description = "Underthesea Core (fast & fun)"
 
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+exclude = ["rust-toolchain", "target/*", "Cargo.lock", "benches/*.txt"]
+
+[lib]
+name = "underthesea_core"
+path = "src/lib.rs"
+crate-type = ["cdylib"]
+
+[[bench]]
+name = "lab"
+harness = false
 
 [dependencies]
 serde = { version = "1.0", features = [ "derive" ] }
 regex = "1"
+rayon = "1.5"
 
 [dependencies.pyo3]
 version = "0.15.0"
 features = ["extension-module"]
 
-[lib]
-name = "underthesea_core"
-crate-type = ["cdylib"]
\ No newline at end of file
+[dev-dependencies]
+criterion = "0.3.5"
\ No newline at end of file
diff --git a/extensions/underthesea_core/benches/lab.rs b/extensions/underthesea_core/benches/lab.rs
new file mode 100644
index 00000000..edd20667
--- /dev/null
+++ b/extensions/underthesea_core/benches/lab.rs
@@ -0,0 +1,41 @@
+extern crate rayon;
+extern crate criterion;
+
+use std::time::Duration;
+use rayon::prelude::*;
+use criterion::*;
+
+fn fibonacci(n: u32) -> u32 {
+    match n {
+        0 => 1,
+        1 => 1,
+        _ => fibonacci(n - 1) + fibonacci(n - 2),
+    }
+}
+
+fn f1() {
+    let mut arr: [u32; 2] = [10, 100];
+    arr.par_iter_mut().map(|n| fibonacci(*n));
+    return;
+}
+
+fn f2() {
+    let mut arr: [u32; 2] = [10, 100];
+    arr.map(|n| fibonacci(n));
+    return;
+}
+
+
+fn criterion_benchmark(c: &mut Criterion){
+    let mut group = c.benchmark_group("abc");
+    group.bench_function("my-function", |b| b.iter(|| f1()));
+    group.finish();
+}
+
+criterion_group! {
+    name = benches;
+    config = Criterion::default().sample_size(10);
+    targets = criterion_benchmark
+}
+
+criterion_main!(benches);
\ No newline at end of file
diff --git a/extensions/underthesea_core/pyproject.toml b/extensions/underthesea_core/pyproject.toml
index 83bbcf0b..87f0ac8e 100644
--- a/extensions/underthesea_core/pyproject.toml
+++ b/extensions/underthesea_core/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "underthesea_core"
-version = "0.0.4-alpha.6"
+version = "0.0.4-alpha.8"
 authors = ["Vu Anh"]
 description = "Underthesea Core (fast & fun)"
 license = "GPL-3.0"
@@ -15,5 +15,4 @@ python = "^3.5"
 maturin = "==0.9.4"
 
 [build-system]
-requires = ["==0.9.4"]
 build-backend = "maturin"
\ No newline at end of file
diff --git a/extensions/underthesea_core/src/featurizers.rs b/extensions/underthesea_core/src/featurizers.rs
index c03e6b6b..1a9318b3 100644
--- a/extensions/underthesea_core/src/featurizers.rs
+++ b/extensions/underthesea_core/src/featurizers.rs
@@ -87,209 +87,109 @@ impl CRFFeaturizer {
         }
     }
 
-    pub fn process(&self, sentences: Vec<Vec<Vec<String>>>) -> Vec<Vec<Vec<String>>> {
-        let mut sentences_features = Vec::new();
-        for sentence in sentences {
-            // generate features for each sentence
-            let mut sentence_features = Vec::new();
-            for position in 0..sentence.len() {
-                let token_features = generate_token_features(&sentence, position, &self.feature_templates, &self.dictionary);
-                sentence_features.push(token_features);
+    /// generate features for each token -th in sentence
+    /// Sentence Example
+    ///     Messi   X
+    ///     giành   X
+    ///     quả     X
+    pub fn generate_token_features(&self, sentence: &Vec<Vec<String>>, position: usize) -> Vec<String> {
+        let mut features = Vec::new();
+        for feature_template in &self.feature_templates {
+            let index1 = position as isize + feature_template.offset1;
+            let bos_value = String::from(&feature_template.syntax) + "=" + "BOS";
+            let eos_value = String::from(&feature_template.syntax) + "=" + "EOS";
+            let column = feature_template.column;
+            let n = sentence.len() as isize;
+            let mut text: String;
+            if index1 < 0 {
+                features.push(bos_value);
+                continue;
+            } else if index1 >= n {
+                features.push(eos_value);
+                continue;
+            } else {
+                text = String::from(&sentence[index1 as usize][column as usize]);
             }
-            sentences_features.push(sentence_features);
-        }
-        return sentences_features;
-    }
-}
-
-/// generate features for each token -th in sentence
-/// Sentence Example
-///     Messi   X
-///     giành   X
-///     quả     X
-pub fn generate_token_features(sentence: &Vec<Vec<String>>, position: usize, feature_templates: &Vec<FeatureTemplate>, dictionary: &HashSet<String>) -> Vec<String> {
-    let mut features = Vec::new();
-    for feature_template in feature_templates {
-        let index1 = position as isize + feature_template.offset1;
-        let bos_value = String::from(&feature_template.syntax) + "=" + "BOS";
-        let eos_value = String::from(&feature_template.syntax) + "=" + "EOS";
-        let column = feature_template.column;
-        let n = sentence.len() as isize;
-        let mut text: String;
-        if index1 < 0 {
-            features.push(bos_value);
-            continue;
-        } else if index1 >= n {
-            features.push(eos_value);
-            continue;
-        } else {
-            text = String::from(&sentence[index1 as usize][column as usize]);
-        }
-        match feature_template.offset2 {
-            None => {}
-            Some(offset2) => {
-                let index2 = position as isize + offset2;
-                if index2 < 0 {
-                    features.push(bos_value);
-                    continue;
-                } else if index2 >= n {
-                    features.push(eos_value);
-                    continue;
-                } else {
-                    for i in index1 + 1..index2 + 1 {
-                        text = text + " " + &sentence[i as usize][column as usize];
+            match feature_template.offset2 {
+                None => {}
+                Some(offset2) => {
+                    let index2 = position as isize + offset2;
+                    if index2 < 0 {
+                        features.push(bos_value);
+                        continue;
+                    } else if index2 >= n {
+                        features.push(eos_value);
+                        continue;
+                    } else {
+                        for i in index1 + 1..index2 + 1 {
+                            text = text + " " + &sentence[i as usize][column as usize];
+                        }
                     }
                 }
             }
-        }
-        // apply function
-        match feature_template.function.as_ref() {
-            None => {}
-            Some(function_name) => {
-                match function_name.as_ref() {
-                    "lower" => {
-                        text = text.to_lowercase();
-                    }
-                    "isdigit" => {
-                        let is_digit = text.parse::<f64>();
-                        match is_digit {
-                            Ok(_) => { text = String::from("True") }
-                            Err(_) => { text = String::from("False") }
+            // apply function
+            match feature_template.function.as_ref() {
+                None => {}
+                Some(function_name) => {
+                    match function_name.as_ref() {
+                        "lower" => {
+                            text = text.to_lowercase();
                         }
-                    }
-                    "istitle" => {
-                        let mut is_title = "True";
-                        for part in text.split(" ") {
-                            let first_char = String::from(part.chars().nth(0).unwrap());
-                            if first_char != first_char.to_uppercase() {
-                                is_title = "False";
-                                break;
+                        "isdigit" => {
+                            let is_digit = text.parse::<f64>();
+                            match is_digit {
+                                Ok(_) => { text = String::from("True") }
+                                Err(_) => { text = String::from("False") }
                             }
                         }
-                        text = String::from(is_title);
-                    }
-                    "is_in_dict" => {
-                        if dictionary.contains(text.to_lowercase().as_str()) {
-                            text = String::from("True");
-                        } else {
-                            text = String::from("False");
+                        "istitle" => {
+                            let mut is_title = "True";
+                            for part in text.split(" ") {
+                                let first_char = String::from(part.chars().nth(0).unwrap());
+                                if first_char != first_char.to_uppercase() {
+                                    is_title = "False";
+                                    break;
+                                }
+                            }
+                            text = String::from(is_title);
                         }
+                        "is_in_dict" => {
+                            if self.dictionary.contains(text.to_lowercase().as_str()) {
+                                text = String::from("True");
+                            } else {
+                                text = String::from("False");
+                            }
+                        }
+                        _ => {}
                     }
-                    _ => {}
                 }
             }
+            let value = String::from(&feature_template.syntax) + "=" + text.as_str();
+            features.push(value);
         }
-        let value = String::from(&feature_template.syntax) + "=" + text.as_str();
-        features.push(value);
+        return features;
     }
-    return features;
-}
-
-
-pub fn featurizer(sentences: Vec<Vec<Vec<String>>>, feature_configs: Vec<String>, dictionary: HashSet<String>) -> Vec<Vec<Vec<String>>> {
-    // Step 1: Parse FeatureTemplates
-    let re = Regex::new(
-        r"T\[(?P<index1>-?\d+)(,(?P<index2>-?\d+))?](\[(?P<column>.*)])?(\.(?P<function>.*))?"
-    ).unwrap();
-    let mut feature_templates: Vec<FeatureTemplate> = Vec::new();
-    for feature_config in feature_configs {
-        let mut feature_template = FeatureTemplate {
-            syntax: String::from(""),
-            offset1: 0,
-            offset2: None,
-            column: 0,
-            function: None,
-        };
-        feature_template.syntax = String::from(&feature_config);
-
-        for cap in re.captures_iter(feature_config.as_str()) {
-            match cap.name("index1") {
-                Some(s) => {
-                    feature_template.offset1 = s.as_str().parse::<isize>().unwrap();
-                }
-                _ => ()
-            }
-            match cap.name("index2") {
-                Some(s) => {
-                    feature_template.offset2 = Option::from(s.as_str().parse::<isize>().unwrap());
-                }
-                _ => ()
-            }
-
-            // match cap.name("column") {
-            //     Some(s) => {
-            //         feature_template.column = s.as_str().parse::<i32>().unwrap();
-            //     }
-            //     _ => ()
-            // }
-
-            match cap.name("function") {
-                Some(s) => {
-                    feature_template.function = Option::from(String::from(s.as_str()));
-                }
-                _ => ()
+    pub fn process(&self, sentences: Vec<Vec<Vec<String>>>) -> Vec<Vec<Vec<String>>> {
+        let mut sentences_features = Vec::new();
+        for sentence in sentences {
+            // generate features for each sentence
+            let mut sentence_features = Vec::new();
+            for position in 0..sentence.len() {
+                let token_features = self.generate_token_features(&sentence, position);
+                sentence_features.push(token_features);
             }
+            sentences_features.push(sentence_features);
         }
-
-        feature_templates.push(feature_template);
-    }
-
-    // Step 2: Generate features
-    let mut sentences_features = Vec::new();
-    for sentence in sentences {
-        // generate features for each sentence
-        let mut sentence_features = Vec::new();
-        for position in 0..sentence.len() {
-            let token_features = generate_token_features(&sentence, position, &feature_templates, &dictionary);
-            sentence_features.push(token_features);
-        }
-        sentences_features.push(sentence_features);
+        return sentences_features;
     }
-    return sentences_features;
 }
 
 #[cfg(test)]
 mod tests {
     use std::collections::HashSet;
 
-    #[test]
-    fn test_featurizer() {
-        let sentences = vec![
-            vec![
-                vec!["Messi".to_string(), "X".to_string()],
-                vec!["giành".to_string(), "X".to_string()],
-                vec!["quả".to_string(), "X".to_string()],
-                vec!["Bóng".to_string(), "X".to_string()],
-                vec!["Đá".to_string(), "X".to_string()],
-                vec!["1".to_string(), "X".to_string()],
-            ]
-        ];
-        let features = vec![
-            "T[0]".to_string(),
-            "T[0].is_in_dict".to_string()
-        ];
-
-        let mut dictionary = HashSet::new();
-        dictionary.insert("giành".to_string());
-        dictionary.insert("quả".to_string());
-        dictionary.insert("bóng".to_string());
-
-        let output = super::featurizer(sentences, features, dictionary);
-        let expected: Vec<Vec<Vec<String>>> = vec![
-            vec![
-                vec!["T[0]=Messi".to_string(), "T[0].is_in_dict=False".to_string()],
-                vec!["T[0]=giành".to_string(), "T[0].is_in_dict=True".to_string()],
-                vec!["T[0]=quả".to_string(), "T[0].is_in_dict=True".to_string()],
-                vec!["T[0]=Bóng".to_string(), "T[0].is_in_dict=True".to_string()],
-                vec!["T[0]=Đá".to_string(), "T[0].is_in_dict=False".to_string()],
-                vec!["T[0]=1".to_string(), "T[0].is_in_dict=False".to_string()]
-            ],
-        ];
-        assert_eq!(output, expected);
-    }
-
     #[test]
     fn test_crf_featurizer() {
         let sentences = vec![
diff --git a/extensions/underthesea_core/src/lib.rs b/extensions/underthesea_core/src/lib.rs
index e31a0369..8a291d06 100644
--- a/extensions/underthesea_core/src/lib.rs
+++ b/extensions/underthesea_core/src/lib.rs
@@ -4,12 +4,8 @@ extern crate pyo3;
 use pyo3::prelude::*;
 use std::collections::HashSet;
 
-mod featurizers;
-
-#[pyfunction]
-fn featurizer(sentences: Vec<Vec<Vec<String>>>, feature_configs: Vec<String>, dictionary: HashSet<String>) -> PyResult<Vec<Vec<Vec<String>>>> {
-    Ok(featurizers::featurizer(sentences, feature_configs, dictionary))
-}
+#[macro_use]
+pub mod featurizers;
 
 #[pyclass]
 pub struct CRFFeaturizer {
@@ -38,7 +34,6 @@ impl CRFFeaturizer {
 
 #[pymodule]
 fn underthesea_core(_py: Python, m: &PyModule) -> PyResult<()> {
-    m.add_function(wrap_pyfunction!(featurizer, m)?)?;
     m.add_class::<CRFFeaturizer>()?;
     Ok(())
 }
\ No newline at end of file
diff --git a/tests/benchmark/benchmark_word_tokenize.py b/tests/benchmark/benchmark_word_tokenize.py
index 9047105b..8d922bff 100644
--- a/tests/benchmark/benchmark_word_tokenize.py
+++ b/tests/benchmark/benchmark_word_tokenize.py
@@ -54,26 +54,16 @@ def word_tokenize_new():
         word_tokenize_nightly(s)
 
 
-old_profiler = cProfile.Profile()
-old_profiler.enable()
+profiler = cProfile.Profile()
+profiler.enable()
 word_tokenize_old()
-old_profiler.disable()
-old_stats = pstats.Stats(old_profiler).sort_stats('tottime')
-old_stats.print_stats()
-
-new_profiler = cProfile.Profile()
-new_profiler.enable()
-word_tokenize_new()
-new_profiler.disable()
-new_stats = pstats.Stats(new_profiler).sort_stats('tottime')
-new_stats.print_stats()
-
-old_time = old_stats.total_tt
-new_time = new_stats.total_tt
-print('Ratio', old_time / new_time, "(", old_time, '->', new_time, ")")
+profiler.disable()
+stats = pstats.Stats(profiler).sort_stats('tottime')
+stats.print_stats()
+run_time = stats.total_tt
 
 print('Current Speed')
-sentences_per_sec = total_sentence / new_time
-tokens_per_sec = total_tokens / new_time
+sentences_per_sec = total_sentence / run_time
+tokens_per_sec = total_tokens / run_time
 print(f'{sentences_per_sec:06.2f} sentences/sec')
 print(f'{tokens_per_sec:06.2f} tokens/sec')
diff --git a/tests/benchmark/benchmark_word_tokenize_compare.py b/tests/benchmark/benchmark_word_tokenize_compare.py
new file mode 100644
index 00000000..9047105b
--- /dev/null
+++ b/tests/benchmark/benchmark_word_tokenize_compare.py
@@ -0,0 +1,79 @@
+import cProfile
+import pstats
+from os.path import join
+
+from underthesea.file_utils import DATASETS_FOLDER
+from underthesea.pipeline.word_tokenize import tokenize
+
+# ======================================================
+# PERFORMANCE
+# ======================================================
+# DATA
+# 1000 sentences, 50k tokens
+# ======================================================
+# v1.1.3     : 3.771s
+# Nightly    : 1.545s !!! >.<
+# Expected 1 : 0.025s (speed: 2M tokens/s)
+# Expected 2 : 0.002s (speed: 20M tokens/s)
+# ======================================================
+
+total_sentence = 0
+total_tokens = 0
+
+
+def get_sentences():
+    global total_sentence
+    global total_tokens
+    sentences = []
+    with open(join(DATASETS_FOLDER, "LTA", "VNESEScorpus.txt")) as f:
+        for i, line in enumerate(f):
+            sentences.append(line)
+            tokens = tokenize(line)
+            total_tokens += len(tokens)
+            total_sentence += 1
+            if i > 1000:
+                break
+
+    print(f"Load {total_sentence} sentences, {total_tokens} tokens")
+    print("=========================================")
+    return sentences
+
+
+sentences = get_sentences()
+
+
+def word_tokenize_old():
+    from underthesea import word_tokenize
+    for s in sentences:
+        word_tokenize(s)
+
+
+def word_tokenize_new():
+    from underthesea.pipeline.word_tokenize.nightly import word_tokenize as word_tokenize_nightly
+    for s in sentences:
+        word_tokenize_nightly(s)
+
+
+old_profiler = cProfile.Profile()
+old_profiler.enable()
+word_tokenize_old()
+old_profiler.disable()
+old_stats = pstats.Stats(old_profiler).sort_stats('tottime')
+old_stats.print_stats()
+
+new_profiler = cProfile.Profile()
+new_profiler.enable()
+word_tokenize_new()
+new_profiler.disable()
+new_stats = pstats.Stats(new_profiler).sort_stats('tottime')
+new_stats.print_stats()
+
+old_time = old_stats.total_tt
+new_time = new_stats.total_tt
+print('Ratio', old_time / new_time, "(", old_time, '->', new_time, ")")
+
+print('Current Speed')
+sentences_per_sec = total_sentence / new_time
+tokens_per_sec = total_tokens / new_time
+print(f'{sentences_per_sec:06.2f} sentences/sec')
+print(f'{tokens_per_sec:06.2f} tokens/sec')
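
Usage note: with the standalone featurizer pyfunction dropped from lib.rs, Python callers go
through the CRFFeaturizer class that the module now exports. A minimal sketch, assuming the
constructor takes the feature templates and the dictionary in that order (mirroring the
arguments of the removed featurizer(sentences, feature_configs, dictionary) function); the
sample data is taken from the removed test_featurizer test above:

    from underthesea_core import CRFFeaturizer

    # feature templates and dictionary, as in the removed Rust test fixtures
    features = ["T[0]", "T[0].is_in_dict"]
    dictionary = {"giành", "quả", "bóng"}
    featurizer = CRFFeaturizer(features, dictionary)

    # one sentence: each token is a [text, label] row
    sentences = [[["Messi", "X"], ["giành", "X"], ["quả", "X"]]]
    print(featurizer.process(sentences))
    # per the removed test, each token yields one feature list,
    # e.g. ["T[0]=Messi", "T[0].is_in_dict=False"]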