From 2010d17a27029930a7bd004ac7743fa368222184 Mon Sep 17 00:00:00 2001
From: Vu Anh
Date: Thu, 18 Nov 2021 00:28:50 +0700
Subject: [PATCH] GH-185: fix error in pyproject.toml (#509)

---
 .github/workflows/release-pypi-core.yml       |   4 +-
 extensions/underthesea_core/Cargo.toml        |  25 +-
 extensions/underthesea_core/benches/lab.rs    |  41 +++
 extensions/underthesea_core/pyproject.toml    |   3 +-
 .../underthesea_core/src/featurizers.rs       | 258 ++++++------------
 extensions/underthesea_core/src/lib.rs        |   9 +-
 tests/benchmark/benchmark_word_tokenize.py    |  26 +-
 .../benchmark_word_tokenize_compare.py        |  79 ++++++
 8 files changed, 229 insertions(+), 216 deletions(-)
 create mode 100644 extensions/underthesea_core/benches/lab.rs
 create mode 100644 tests/benchmark/benchmark_word_tokenize_compare.py

diff --git a/.github/workflows/release-pypi-core.yml b/.github/workflows/release-pypi-core.yml
index ec1b45e9..dc79de99 100644
--- a/.github/workflows/release-pypi-core.yml
+++ b/.github/workflows/release-pypi-core.yml
@@ -37,7 +37,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7]
+        python-version: [3.6, 3.7, 3.8, 3.9]
         os: [ubuntu-latest, macos-latest]
     steps:
       - uses: actions/checkout@v1
@@ -68,4 +68,4 @@ jobs:
         working-directory: ./extensions/underthesea_core
         env:
           MATURIN_PASSWORD: ${{ secrets.PYPI_UNDERTHESEA_CORE_API_TOKEN }}
-        run: poetry run maturin publish --username __token__
\ No newline at end of file
+        run: poetry run maturin publish --username __token__ --interpreter python${{ matrix.python-version }}
\ No newline at end of file
diff --git a/extensions/underthesea_core/Cargo.toml b/extensions/underthesea_core/Cargo.toml
index d8b106c6..c56fc841 100644
--- a/extensions/underthesea_core/Cargo.toml
+++ b/extensions/underthesea_core/Cargo.toml
@@ -1,24 +1,33 @@
 [package]
 name = "underthesea_core"
-version = "0.0.4-alpha.6"
+version = "0.0.4-alpha.8"
+homepage = "https://github.com/undertheseanlp/underthesea/"
+repository = "https://github.com/undertheseanlp/underthesea/"
 authors = ["Vu Anh"]
-description = "Underthesea Core (fast & fun)"
 license = "GPL-3.0"
 edition = "2021"
 readme = "README.md"
-repository = "https://github.com/undertheseanlp/underthesea/"
-homepage = "https://github.com/undertheseanlp/underthesea/"
+description = "Underthesea Core (fast & fun)"
 
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+exclude = ["rust-toolchain", "target/*", "Cargo.lock", "benches/*.txt"]
+
+[lib]
+name = "underthesea_core"
+path = "src/lib.rs"
+crate-type = ["cdylib"]
+
+[[bench]]
+name = "lab"
+harness = false
 
 [dependencies]
 serde = { version = "1.0", features = [ "derive" ] }
 regex = "1"
+rayon = "1.5"
 
 [dependencies.pyo3]
 version = "0.15.0"
 features = ["extension-module"]
 
-[lib]
-name = "underthesea_core"
-crate-type = ["cdylib"]
\ No newline at end of file
+[dev-dependencies]
+criterion = "0.3.5"
\ No newline at end of file
diff --git a/extensions/underthesea_core/benches/lab.rs b/extensions/underthesea_core/benches/lab.rs
new file mode 100644
index 00000000..edd20667
--- /dev/null
+++ b/extensions/underthesea_core/benches/lab.rs
@@ -0,0 +1,41 @@
+extern crate rayon;
+extern crate criterion;
+
+use std::time::Duration;
+use rayon::prelude::*;
+use criterion::*;
+
+fn fibonacci(n: u32) -> u32 {
+    match n {
+        0 => 1,
+        1 => 1,
+        _ => fibonacci(n - 1) + fibonacci(n - 2),
+    }
+}
+
+fn f1() {
+    let mut arr: [u32; 2] = [10, 100];
+    arr.par_iter_mut().map(|n| fibonacci(*n));
+    return;
+}
+
+fn f2() {
+    let mut arr: [u32; 2] = [10, 100];
+    arr.map(|n| fibonacci(n));
+    return;
+}
+
+
+fn criterion_benchmark(c: &mut Criterion){
+    let mut group = c.benchmark_group("abc");
+    group.bench_function("my-function", |b| b.iter(|| f1()));
+    group.finish();
+}
+
+criterion_group! {
+    name = benches;
+    config = Criterion::default().sample_size(10);
+    targets = criterion_benchmark
+}
+
+criterion_main!(benches);
\ No newline at end of file
diff --git a/extensions/underthesea_core/pyproject.toml b/extensions/underthesea_core/pyproject.toml
index 83bbcf0b..87f0ac8e 100644
--- a/extensions/underthesea_core/pyproject.toml
+++ b/extensions/underthesea_core/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "underthesea_core"
-version = "0.0.4-alpha.6"
+version = "0.0.4-alpha.8"
 authors = ["Vu Anh"]
 description = "Underthesea Core (fast & fun)"
 license = "GPL-3.0"
@@ -15,5 +15,4 @@ python = "^3.5"
 maturin = "==0.9.4"
 
 [build-system]
-requires = ["==0.9.4"]
 build-backend = "maturin"
\ No newline at end of file
diff --git a/extensions/underthesea_core/src/featurizers.rs b/extensions/underthesea_core/src/featurizers.rs
index c03e6b6b..1a9318b3 100644
--- a/extensions/underthesea_core/src/featurizers.rs
+++ b/extensions/underthesea_core/src/featurizers.rs
@@ -87,209 +87,109 @@ impl CRFFeaturizer {
         }
     }
 
-    pub fn process(&self, sentences: Vec<Vec<Vec<String>>>) -> Vec<Vec<Vec<String>>> {
-        let mut sentences_features = Vec::new();
-        for sentence in sentences {
-            // generate features for each sentence
-            let mut sentence_features = Vec::new();
-            for position in 0..sentence.len() {
-                let token_features = generate_token_features(&sentence, position, &self.feature_templates, &self.dictionary);
-                sentence_features.push(token_features);
+    /// generate features for each token -th in sentence
+    /// Sentence Example
+    ///     Messi   X
+    ///     giành   X
+    ///     quả     X
+    pub fn generate_token_features(&self, sentence: &Vec<Vec<String>>, position: usize) -> Vec<String> {
+        let mut features = Vec::new();
+        for feature_template in &self.feature_templates {
+            let index1 = position as isize + feature_template.offset1;
+            let bos_value = String::from(&feature_template.syntax) + "=" + "BOS";
+            let eos_value = String::from(&feature_template.syntax) + "=" + "EOS";
+            let column = feature_template.column;
+            let n = sentence.len() as isize;
+            let mut text: String;
+            if index1 < 0 {
+                features.push(bos_value);
+                continue;
+            } else if index1 >= n {
+                features.push(eos_value);
+                continue;
+            } else {
+                text = String::from(&sentence[index1 as usize][column as usize]);
             }
-            sentences_features.push(sentence_features);
-        }
-        return sentences_features;
-    }
-}
-
-/// generate features for each token -th in sentence
-/// Sentence Example
-///     Messi   X
-///     giành   X
-///     quả     X
-pub fn generate_token_features(sentence: &Vec<Vec<String>>, position: usize, feature_templates: &Vec<FeatureTemplate>, dictionary: &HashSet<String>) -> Vec<String> {
-    let mut features = Vec::new();
-    for feature_template in feature_templates {
-        let index1 = position as isize + feature_template.offset1;
-        let bos_value = String::from(&feature_template.syntax) + "=" + "BOS";
-        let eos_value = String::from(&feature_template.syntax) + "=" + "EOS";
-        let column = feature_template.column;
-        let n = sentence.len() as isize;
-        let mut text: String;
-        if index1 < 0 {
-            features.push(bos_value);
-            continue;
-        } else if index1 >= n {
-            features.push(eos_value);
-            continue;
-        } else {
-            text = String::from(&sentence[index1 as usize][column as usize]);
-        }
-        match feature_template.offset2 {
-            None => {}
-            Some(offset2) => {
-                let index2 = position as isize + offset2;
-                if index2 < 0 {
-                    features.push(bos_value);
-                    continue;
-                } else if index2 >= n {
-                    features.push(eos_value);
-                    continue;
-                } else {
-                    for i in index1 + 1..index2 + 1 {
-                        text = text + " " + &sentence[i as usize][column as usize];
+            match feature_template.offset2 {
+                None => {}
+                Some(offset2) => {
+                    let index2 = position as isize + offset2;
+                    if index2 < 0 {
+                        features.push(bos_value);
+                        continue;
+                    } else if index2 >= n {
+                        features.push(eos_value);
+                        continue;
+                    } else {
+                        for i in index1 + 1..index2 + 1 {
+                            text = text + " " + &sentence[i as usize][column as usize];
+                        }
                     }
                 }
             }
-        }
-        // apply function
-        match feature_template.function.as_ref() {
-            None => {}
-            Some(function_name) => {
-                match function_name.as_ref() {
-                    "lower" => {
-                        text = text.to_lowercase();
-                    }
-                    "isdigit" => {
-                        let is_digit = text.parse::<f64>();
-                        match is_digit {
-                            Ok(_) => { text = String::from("True") }
-                            Err(_) => { text = String::from("False") }
+            // apply function
+            match feature_template.function.as_ref() {
+                None => {}
+                Some(function_name) => {
+                    match function_name.as_ref() {
+                        "lower" => {
+                            text = text.to_lowercase();
                         }
-                    }
-                    "istitle" => {
-                        let mut is_title = "True";
-                        for part in text.split(" ") {
-                            let first_char = String::from(part.chars().nth(0).unwrap());
-                            if first_char != first_char.to_uppercase() {
-                                is_title = "False";
-                                break;
+                        "isdigit" => {
+                            let is_digit = text.parse::<f64>();
+                            match is_digit {
+                                Ok(_) => { text = String::from("True") }
+                                Err(_) => { text = String::from("False") }
                             }
                         }
-                        text = String::from(is_title);
-                    }
-                    "is_in_dict" => {
-                        if dictionary.contains(text.to_lowercase().as_str()) {
-                            text = String::from("True");
-                        } else {
-                            text = String::from("False");
+                        "istitle" => {
+                            let mut is_title = "True";
+                            for part in text.split(" ") {
+                                let first_char = String::from(part.chars().nth(0).unwrap());
+                                if first_char != first_char.to_uppercase() {
+                                    is_title = "False";
+                                    break;
+                                }
+                            }
+                            text = String::from(is_title);
                         }
+                        "is_in_dict" => {
+                            if self.dictionary.contains(text.to_lowercase().as_str()) {
+                                text = String::from("True");
+                            } else {
+                                text = String::from("False");
+                            }
+                        }
+                        _ => {}
                     }
-                    _ => {}
                 }
             }
+            let value = String::from(&feature_template.syntax) + "=" + text.as_str();
+            features.push(value);
         }
-        let value = String::from(&feature_template.syntax) + "=" + text.as_str();
-        features.push(value);
+        return features;
     }
-    return features;
-}
-
-
-pub fn featurizer(sentences: Vec<Vec<Vec<String>>>, feature_configs: Vec<String>, dictionary: HashSet<String>) -> Vec<Vec<Vec<String>>> {
-    // Step 1: Parse FeatureTemplates
-    let re = Regex::new(
-        r"T\[(?P<index1>-?\d+)(,(?P<index2>-?\d+))?](\[(?P<column>.*)])?(\.(?P<function>.*))?"
-    ).unwrap();
-    let mut feature_templates: Vec<FeatureTemplate> = Vec::new();
-    for feature_config in feature_configs {
-        let mut feature_template = FeatureTemplate {
-            syntax: String::from(""),
-            offset1: 0,
-            offset2: None,
-            column: 0,
-            function: None,
-        };
-        feature_template.syntax = String::from(&feature_config);
-
-        for cap in re.captures_iter(feature_config.as_str()) {
-            match cap.name("index1") {
-                Some(s) => {
-                    feature_template.offset1 = s.as_str().parse::<isize>().unwrap();
-                }
-                _ => ()
-            }
-            match cap.name("index2") {
-                Some(s) => {
-                    feature_template.offset2 = Option::from(s.as_str().parse::<isize>().unwrap());
-                }
-                _ => ()
-            }
-
-            // match cap.name("column") {
-            //     Some(s) => {
-            //         feature_template.column = s.as_str().parse::<i32>().unwrap();
-            //     }
-            //     _ => ()
-            // }
-
-            match cap.name("function") {
-                Some(s) => {
-                    feature_template.function = Option::from(String::from(s.as_str()));
-                }
-                _ => ()
+    pub fn process(&self, sentences: Vec<Vec<Vec<String>>>) -> Vec<Vec<Vec<String>>> {
+        let mut sentences_features = Vec::new();
+        for sentence in sentences {
+            // generate features for each sentence
+            let mut sentence_features = Vec::new();
+            for position in 0..sentence.len() {
+                let token_features = self.generate_token_features(&sentence, position);
+                sentence_features.push(token_features);
             }
+            sentences_features.push(sentence_features);
         }
-
-        feature_templates.push(feature_template);
-    }
-
-    // Step 2: Generate features
-    let mut sentences_features = Vec::new();
-    for sentence in sentences {
-        // generate features for each sentence
-        let mut sentence_features = Vec::new();
-        for position in 0..sentence.len() {
-            let token_features = generate_token_features(&sentence, position, &feature_templates, &dictionary);
-            sentence_features.push(token_features);
-        }
-        sentences_features.push(sentence_features);
+        return sentences_features;
     }
-    return sentences_features;
 }
 
 #[cfg(test)]
 mod tests {
     use std::collections::HashSet;
 
-    #[test]
-    fn test_featurizer() {
-        let sentences = vec![
-            vec![
-                vec!["Messi".to_string(), "X".to_string()],
-                vec!["giành".to_string(), "X".to_string()],
-                vec!["quả".to_string(), "X".to_string()],
-                vec!["Bóng".to_string(), "X".to_string()],
-                vec!["Đá".to_string(), "X".to_string()],
-                vec!["1".to_string(), "X".to_string()],
-            ]
-        ];
-        let features = vec![
-            "T[0]".to_string(),
-            "T[0].is_in_dict".to_string()
-        ];
-
-        let mut dictionary = HashSet::new();
-        dictionary.insert("giành".to_string());
-        dictionary.insert("quả".to_string());
-        dictionary.insert("bóng".to_string());
-
-        let output = super::featurizer(sentences, features, dictionary);
-        let expected: Vec<Vec<Vec<String>>> = vec![
-            vec![
-                vec!["T[0]=Messi".to_string(), "T[0].is_in_dict=False".to_string()],
-                vec!["T[0]=giành".to_string(), "T[0].is_in_dict=True".to_string()],
-                vec!["T[0]=quả".to_string(), "T[0].is_in_dict=True".to_string()],
-                vec!["T[0]=Bóng".to_string(), "T[0].is_in_dict=True".to_string()],
-                vec!["T[0]=Đá".to_string(), "T[0].is_in_dict=False".to_string()],
-                vec!["T[0]=1".to_string(), "T[0].is_in_dict=False".to_string()]
-            ],
-        ];
-        assert_eq!(output, expected);
-    }
-
     #[test]
     fn test_crf_featurizer() {
         let sentences = vec![
diff --git a/extensions/underthesea_core/src/lib.rs b/extensions/underthesea_core/src/lib.rs
index e31a0369..8a291d06 100644
--- a/extensions/underthesea_core/src/lib.rs
+++ b/extensions/underthesea_core/src/lib.rs
@@ -4,12 +4,8 @@ extern crate pyo3;
 use pyo3::prelude::*;
 use std::collections::HashSet;
 
-mod featurizers;
-
-#[pyfunction]
-fn featurizer(sentences: Vec<Vec<Vec<String>>>, feature_configs: Vec<String>, dictionary: HashSet<String>) -> PyResult<Vec<Vec<Vec<String>>>> {
-    Ok(featurizers::featurizer(sentences, feature_configs, dictionary))
-}
+#[macro_use]
+pub mod featurizers;
 
 #[pyclass]
 pub struct CRFFeaturizer {
@@ -38,7 +34,6 @@ impl CRFFeaturizer {
 
 #[pymodule]
 fn underthesea_core(_py: Python, m: &PyModule) -> PyResult<()> {
-    m.add_function(wrap_pyfunction!(featurizer, m)?)?;
     m.add_class::<CRFFeaturizer>()?;
     Ok(())
 }
\ No newline at end of file
diff --git a/tests/benchmark/benchmark_word_tokenize.py b/tests/benchmark/benchmark_word_tokenize.py
index 9047105b..8d922bff 100644
--- a/tests/benchmark/benchmark_word_tokenize.py
+++ b/tests/benchmark/benchmark_word_tokenize.py
@@ -54,26 +54,16 @@ def word_tokenize_new():
         word_tokenize_nightly(s)
 
 
-old_profiler = cProfile.Profile()
-old_profiler.enable()
+profiler = cProfile.Profile()
+profiler.enable()
 word_tokenize_old()
-old_profiler.disable()
-old_stats = pstats.Stats(old_profiler).sort_stats('tottime')
-old_stats.print_stats()
-
-new_profiler = cProfile.Profile()
-new_profiler.enable()
-word_tokenize_new()
-new_profiler.disable()
-new_stats = pstats.Stats(new_profiler).sort_stats('tottime')
-new_stats.print_stats()
-
-old_time = old_stats.total_tt
-new_time = new_stats.total_tt
-print('Ratio', old_time / new_time, "(", old_time, '->', new_time, ")")
+profiler.disable()
+stats = pstats.Stats(profiler).sort_stats('tottime')
+stats.print_stats()
+run_time = stats.total_tt
 
 print('Current Speed')
-sentences_per_sec = total_sentence / new_time
-tokens_per_sec = total_tokens / new_time
+sentences_per_sec = total_sentence / run_time
+tokens_per_sec = total_tokens / run_time
 print(f'{sentences_per_sec:06.2f} sentences/sec')
 print(f'{tokens_per_sec:06.2f} tokens/sec')
diff --git a/tests/benchmark/benchmark_word_tokenize_compare.py b/tests/benchmark/benchmark_word_tokenize_compare.py
new file mode 100644
index 00000000..9047105b
--- /dev/null
+++ b/tests/benchmark/benchmark_word_tokenize_compare.py
@@ -0,0 +1,79 @@
+import cProfile
+import pstats
+from os.path import join
+
+from underthesea.file_utils import DATASETS_FOLDER
+from underthesea.pipeline.word_tokenize import tokenize
+
+# ======================================================
+# PERFORMANCE
+# ======================================================
+# DATA
+# 1000 sentences, 50k tokens
+# ======================================================
+# v1.1.3     : 3.771s
+# Nightly    : 1.545s !!! >.<
+# Expected 1 : 0.025s (speed: 2M tokens/s)
+# Expected 2 : 0.002s (speed: 20M tokens/s)
+# ======================================================
+
+total_sentence = 0
+total_tokens = 0
+
+
+def get_sentences():
+    global total_sentence
+    global total_tokens
+    sentences = []
+    with open(join(DATASETS_FOLDER, "LTA", "VNESEScorpus.txt")) as f:
+        for i, line in enumerate(f):
+            sentences.append(line)
+            tokens = tokenize(line)
+            total_tokens += len(tokens)
+            total_sentence += 1
+            if i > 1000:
+                break
+
+    print(f"Load {total_sentence} sentences, {total_tokens} tokens")
+    print("=========================================")
+    return sentences
+
+
+sentences = get_sentences()
+
+
+def word_tokenize_old():
+    from underthesea import word_tokenize
+    for s in sentences:
+        word_tokenize(s)
+
+
+def word_tokenize_new():
+    from underthesea.pipeline.word_tokenize.nightly import word_tokenize as word_tokenize_nightly
+    for s in sentences:
+        word_tokenize_nightly(s)
+
+
+old_profiler = cProfile.Profile()
+old_profiler.enable()
+word_tokenize_old()
+old_profiler.disable()
+old_stats = pstats.Stats(old_profiler).sort_stats('tottime')
+old_stats.print_stats()
+
+new_profiler = cProfile.Profile()
+new_profiler.enable()
+word_tokenize_new()
+new_profiler.disable()
+new_stats = pstats.Stats(new_profiler).sort_stats('tottime')
+new_stats.print_stats()
+
+old_time = old_stats.total_tt
+new_time = new_stats.total_tt
+print('Ratio', old_time / new_time, "(", old_time, '->', new_time, ")")
+
+print('Current Speed')
+sentences_per_sec = total_sentence / new_time
+tokens_per_sec = total_tokens / new_time
+print(f'{sentences_per_sec:06.2f} sentences/sec')
+print(f'{tokens_per_sec:06.2f} tokens/sec')
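
Usage note: with the standalone featurizer pyfunction dropped from lib.rs, Python callers go
through the CRFFeaturizer class that the module now exports. A minimal sketch, assuming the
constructor takes the feature templates and the dictionary in that order (mirroring the
arguments of the removed featurizer(sentences, feature_configs, dictionary) function); the
sample data is taken from the removed test_featurizer test above:

    from underthesea_core import CRFFeaturizer

    # feature templates and dictionary, as in the removed Rust test fixtures
    features = ["T[0]", "T[0].is_in_dict"]
    dictionary = {"giành", "quả", "bóng"}
    featurizer = CRFFeaturizer(features, dictionary)

    # one sentence: each token is a [text, label] row
    sentences = [[["Messi", "X"], ["giành", "X"], ["quả", "X"]]]
    print(featurizer.process(sentences))
    # per the removed test, each token yields one feature list,
    # e.g. ["T[0]=Messi", "T[0].is_in_dict=False"]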