diff --git a/.gitignore b/.gitignore
index 45ca6fa4..1c981aee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -347,3 +347,4 @@
 _doc_report.txt
 data.csv
 data.txt
+/build/TestCoverageReport
diff --git a/build.cmd b/build.cmd
index 5fdbff68..38ea5b6e 100644
--- a/build.cmd
+++ b/build.cmd
@@ -299,9 +299,13 @@
 set TestsPath1=%PackagePath%\tests
 set TestsPath2=%__currentScriptDir%src\python\tests
 set ReportPath=%__currentScriptDir%build\TestCoverageReport
 call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath1%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
+if errorlevel 1 (
+    goto :Exit_Error
+)
 call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath2%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
-goto :Exit_Success
-
+if errorlevel 1 (
+    goto :Exit_Error
+)
 :Exit_Success
 endlocal
diff --git a/src/python/docs/sphinx/installationguide.rst b/src/python/docs/sphinx/installationguide.rst
index 0fd2ea9d..6a6042a1 100644
--- a/src/python/docs/sphinx/installationguide.rst
+++ b/src/python/docs/sphinx/installationguide.rst
@@ -8,7 +8,7 @@ Installation Guide
 Supported Platforms
 -------------------
 
-Release 0.6.0:
+Release 0.6:
 
 * Windows 10, Ubuntu 14.04, Ubuntu 16.04, CentOS 7, RHEL 7, Mac OS 10.11, 10.12, 10.13
 
diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py
index 943abf68..1403e7b4 100644
--- a/src/python/nimbusml/__init__.py
+++ b/src/python/nimbusml/__init__.py
@@ -2,7 +2,7 @@
 Microsoft Machine Learning for Python
 """
 
-__version__ = '0.6.0'
+__version__ = '0.6.1'
 
 # CoreCLR version of MicrosoftML is built on Windows.
 # But file permissions are not preserved when it's copied to Linux.
diff --git a/src/python/nimbusml/feature_extraction/text/ngramfeaturizer.py b/src/python/nimbusml/feature_extraction/text/ngramfeaturizer.py
index 4f726eee..7b19e916 100644
--- a/src/python/nimbusml/feature_extraction/text/ngramfeaturizer.py
+++ b/src/python/nimbusml/feature_extraction/text/ngramfeaturizer.py
@@ -227,9 +227,12 @@ def __init__(
             dictionary=None,
             word_feature_extractor=Ngram(
                 max_num_terms=[10000000]),
-            char_feature_extractor=None,
-            vector_normalizer='L2',
-            columns=None,
+            char_feature_extractor=Ngram(
+                ngram_length=3,
+                all_lengths=False,
+                max_num_terms=[10000000]),
+            vector_normalizer='L2',
+            columns=None,
             **params):
 
         if columns:
diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/ngramfeaturizer.py b/src/python/nimbusml/internal/core/feature_extraction/text/ngramfeaturizer.py
index ff3262ef..9452ee7d 100644
--- a/src/python/nimbusml/internal/core/feature_extraction/text/ngramfeaturizer.py
+++ b/src/python/nimbusml/internal/core/feature_extraction/text/ngramfeaturizer.py
@@ -206,8 +206,11 @@ def __init__(
             dictionary=None,
             word_feature_extractor=n_gram(
                 max_num_terms=[10000000]),
-            char_feature_extractor=None,
-            vector_normalizer='L2',
+            char_feature_extractor=n_gram(
+                ngram_length=3,
+                all_lengths=False,
+                max_num_terms=[10000000]),
+            vector_normalizer='L2',
             **params):
         BasePipelineItem.__init__(
             self, type='transform', **params)
diff --git a/src/python/nimbusml/tests/data_type/test_text.py b/src/python/nimbusml/tests/data_type/test_text.py
index 65b8adc4..802459d0 100644
--- a/src/python/nimbusml/tests/data_type/test_text.py
+++ b/src/python/nimbusml/tests/data_type/test_text.py
@@ -9,9 +9,9 @@
 from nimbusml import Pipeline
 from nimbusml.ensemble import LightGbmClassifier
 from nimbusml.feature_extraction.text import NGramFeaturizer
-from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_array_equal
+from sklearn.utils.testing import assert_array_almost_equal
 
 
 def transform_data(data=None, datatype=None):
@@ -34,7 +34,7 @@ def train_data_type_single(
         "Talk about second",
         "Thrid one",
         "Final example."]
-    model = NGramFeaturizer(word_feature_extractor=n_gram())
+    model = NGramFeaturizer()
     data_with_new_type = transform_data(data, fit_X_type)
     model.fit(data_with_new_type)
     test_data_with_new_type = transform_data(data, predict_X_type)
@@ -49,7 +49,7 @@ def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None):
         "Final example."]
     label = [1, 0, 1, 1]
     model = Pipeline([
-        NGramFeaturizer(word_feature_extractor=n_gram()),
+        NGramFeaturizer(),
         LightGbmClassifier(min_data_per_leaf=1, n_thread=1)
     ])
     data_with_new_type = transform_data(data, fit_X_type)
@@ -66,127 +66,127 @@ class TestTextDataType(unittest.TestCase):
 
     def test_check_text_datatype_single_list_list_series(self):
         result = train_data_type_single("list", "list", "series")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_series_list_series(self):
         result = train_data_type_single("series", "list", "series")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_series_list_list(self):
         result = train_data_type_single("series", "list", "list")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_array_list_series(self):
         result = train_data_type_single("array", "list", "series")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_series_array_dataframe(self):
         result = train_data_type_single("series", "array", "dataframe")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_array_series_series(self):
         result = train_data_type_single("array", "series", "series")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_dataframe_list_series(self):
         result = train_data_type_single("dataframe", "list", "series")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_series_series_dataframe(self):
         result = train_data_type_single("series", "series", "dataframe")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_dataframe_series_list(self):
         result = train_data_type_single("dataframe", "series", "list")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_ppl_series_list_array(self):
         result, scores, metrics = train_data_type_ppl(
             "series", "list", "array")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_list_series_dataframe(self):
         result, scores, metrics = train_data_type_ppl(
             "list", "series", "dataframe")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_list_list_series(self):
         result, scores, metrics = train_data_type_ppl("list", "list", "series")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_array_series_array(self):
         result, scores, metrics = train_data_type_ppl(
             "array", "series", "array")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_series_array_dataframe(self):
         result, scores, metrics = train_data_type_ppl(
             "series", "array", "dataframe")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_array_series_list(self):
         result, scores, metrics = train_data_type_ppl(
             "array", "series", "list")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_dataframe_list_series(self):
         result, scores, metrics = train_data_type_ppl(
             "dataframe", "list", "series")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_series_series_dataframe(self):
         result, scores, metrics = train_data_type_ppl(
             "series", "series", "dataframe")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_dataframe_series_series(self):
         result, scores, metrics = train_data_type_ppl(
             "dataframe", "series", "series")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
 
 if __name__ == '__main__':
diff --git a/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py b/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py
index 39528871..6b183b91 100644
--- a/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py
+++ b/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py
@@ -38,7 +38,7 @@ def test_ngramfeaturizer(self):
         X_train = texttransform.fit_transform(X_train[:100])
         sum = X_train.iloc[:].sum().sum()
         print(sum)
-        assert_equal(sum, 4594, "sum of all features is incorrect!")
+        assert_equal(sum, 30513, "sum of all features is incorrect!")
 
 
 if __name__ == '__main__':
diff --git a/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py b/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py
index fa31e671..4e66a667 100644
--- a/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py
+++ b/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py
@@ -91,7 +91,7 @@ def test_word_embedding_example(self):
         ])
 
         features = pipeline.fit_transform(data)
-        assert features.shape == (248, 409)
+        assert features.shape == (248, 802)
 
     # TODO: fix ssl issue on test centos7 & ubuntu14 boxes.
     # Test works on ubuntu16.
@@ -127,7 +127,7 @@ def test_word_embedding_example2(self):
         ])
 
         features = pipeline.fit_transform(data)
-        assert features.shape == (248, 409)
+        assert features.shape == (248, 802)
         assert 'features_TransformedText.94' in list(features.columns)
 
     # TODO: fix ssl issue on test centos7 & ubuntu14 boxes.
@@ -166,7 +166,7 @@ def test_word_embedding_example_dict_same_name(self):
         ])
 
         features = pipeline.fit_transform(data)
-        assert features.shape == (248, 409)
+        assert features.shape == (248, 802)
 
     @unittest.skip('System.ArgumentOutOfRangeException')
     def test_word_embedding_example_dict_newname(self):
diff --git a/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py b/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py
index c2c2cb22..42543f88 100644
--- a/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py
+++ b/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py
@@ -91,7 +91,7 @@ def test_ngramfeaturizer(self):
         textt = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
         X = textt.fit_transform(X)
 
-        assert X.shape == (25, 21)
+        assert X.shape == (25, 116)
         mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
 
         X_test = textt.transform(test_reviews)
@@ -180,7 +180,7 @@ def test_ngramfeaturizer_syntax_dict(self):
             'outg': ['review']}
 
         X = textt.fit_transform(X)
-        assert X.shape == (25, 22)
+        assert X.shape == (25, 117)
         # columns ordering changed between 0.22 and 0.23
         assert 'review' in (X.columns[0], X.columns[-1])
         X = X.drop('review', axis=1)
@@ -204,7 +204,7 @@ def test_ngramfeaturizer_single(self):
             columns={'features': ['id', 'education']})
 
         features = xf.fit_transform(data)
-        assert features.shape == (248, 259)
+        assert features.shape == (248, 652)
 
     def test_ngramfeaturizer_multi(self):
diff --git a/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py b/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py
index 4345afa6..b48cf7a4 100644
--- a/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py
+++ b/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py
@@ -146,4 +146,4 @@ def test_syntax9_multiple_inputs(self):
         ng4 = NGramFeaturizer(word_feature_extractor=n_gram()) << {
             'out1': ['education1', 'education2']}
         output4 = ng4.fit_transform(X)
-        assert output4.shape == (5, 7)
+        assert output4.shape == (5, 13)
diff --git a/src/python/setup.py b/src/python/setup.py
index c58ddf0d..5736a546 100644
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -40,7 +40,7 @@
     # Versions should comply with PEP440. For a discussion on
     # single-sourcing the version across setup.py and the project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='0.6.0',
+    version='0.6.1',
 
     description='NimbusML',
     long_description=long_description,
diff --git a/src/python/tools/code_fixer.py b/src/python/tools/code_fixer.py
index 0f259ee4..3bd5d71c 100644
--- a/src/python/tools/code_fixer.py
+++ b/src/python/tools/code_fixer.py
@@ -87,7 +87,9 @@
                    ('name=name,', 'output=output,')],
    'NGramFeaturizer': [(NG_1, NG_1_correct),
                        ('word_feature_extractor = n_gram',
-                        'word_feature_extractor = Ngram')],
+                        'word_feature_extractor = Ngram'),
+                       ('char_feature_extractor = n_gram',
+                        'char_feature_extractor = Ngram')],
    'CountSelector': ('count = 0,', 'count = 1.0,'),
    'OneClassSvmAnomalyDetector': (
        'label_column=label_column,', 'label_column=None,'),
diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json
index a518c821..0ec2eb23 100644
--- a/src/python/tools/manifest_diff.json
+++ b/src/python/tools/manifest_diff.json
@@ -682,13 +682,7 @@
             "Name": "Transforms.TextFeaturizer",
             "NewName": "NGramFeaturizer",
             "Module": "feature_extraction.text",
-            "Type": "Transform",
-            "Inputs": [
-                {
-                    "Name": "CharFeatureExtractor",
-                    "Default": null
-                }
-            ]
+            "Type": "Transform"
         },
         {
             "Name": "Transforms.WordEmbeddings",
diff --git a/version.txt b/version.txt
index 09a3acfa..7ceb0404 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.6.0
\ No newline at end of file
+0.6.1
\ No newline at end of file
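
The behavioral change behind the updated test expectations in this patch is that NGramFeaturizer now enables a character tri-gram extractor by default instead of char_feature_extractor=None, which is why the asserted column counts, feature sums, and scores all grow. Below is a minimal usage sketch of that difference, assuming the nimbusml 0.6.1 API as modified here; the sample corpus, variable names, and the Ngram import path (as used elsewhere in nimbusml's public API) are illustrative assumptions, not part of the patch.

# Illustrative sketch only (not part of the patch): compares the old and new
# defaults of NGramFeaturizer's char_feature_extractor parameter.
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram

# Hypothetical corpus, chosen only to produce a small feature space.
corpus = ["This is a short example.",
          "Another short sentence.",
          "A third one."]

# Old default (char_feature_extractor=None): word n-gram features only.
word_only = NGramFeaturizer(char_feature_extractor=None).fit_transform(corpus)

# New default: word n-grams plus character tri-grams, equivalent to passing
# the extractor explicitly, as the changed __init__ signatures above do.
word_and_char = NGramFeaturizer(
    char_feature_extractor=Ngram(
        ngram_length=3,
        all_lengths=False,
        max_num_terms=[10000000])).fit_transform(corpus)

# The extra char-gram columns are what account for the larger shapes asserted
# in the updated tests (e.g. 11 -> 66 columns on their four-sentence corpus).
print(word_only.shape, word_and_char.shape)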