[Numpy] Numpy version of GluonNLP (#1225)

* numpy version * Enable Github Actions * Update unittests.yml * Update unittests.yml * Update setup.py * fix test * Update README.md * Update test_models_bert.py * Update tmpdir * Enable codecov * fix a commit id * Separate codecov per platform * Revert "Update tmpdir" This reverts commit 6625af9. pytest-dev/pytest#1120 * Remove files * add symlinks * update Merge conversion toolkits update unittests by fixing the version update datasets add scripts Delete __init__.py add src update Update setup.py Update setup.py update all tests revise test cases Update unittests.yml Update initializer.py Create preprocessing.py Update __init__.py Update attention_cell.py Update prepare_wmt.py move ubuntu + windows to TODO * Update unittests.yml * fix alpha in sentencepiece * fix bug * update * fix README * Update unittests.yml * Update README.md * update Co-authored-by: Leonard Lausen <[email protected]>
dmlc · Jun 10, 2020 · 01122db · 01122db
1 parent de7b23d
commit 01122db
Show file tree

Hide file tree

Showing 146 changed files with 28,464 additions and 11 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,4 @@
+[flake8]
+max-line-length = 100
+max-complexity = 18
+exclude = tests,__init__.py
diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
@@ -0,0 +1,46 @@
+# Continuous-integration workflow: installs the package with its extras and runs
+# the pytest suite with coverage on every push and pull request.
+name: continuous build
+
+on: [push, pull_request]
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  unittest:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      # Let the remaining matrix entries finish even if one of them fails.
+      fail-fast: false
+      matrix:
+        # TODO Add ubuntu test by "ubuntu-latest", Add windows test by using "windows-latest"
+        os: [macos-latest]
+        python-version: [ '3.6', '3.7', '3.8' ]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      # Install OS specific dependencies
+      - name: Install Linux dependencies
+        if: matrix.os == 'ubuntu-latest'
+        # TODO https://github.com/apache/incubator-mxnet/issues/18293
+        # -y: CI runners are non-interactive, so apt-get must not wait for confirmation.
+        run: sudo apt-get install -y libopenblas-dev
+
+      - name: Setup python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: x64
+      - name: Install Other Dependencies
+        run: |
+          python -m pip install --user --upgrade pip
+          python -m pip install --user setuptools pytest pytest-cov
+          python -m pip install --upgrade cython
+          python -m pip install --pre --user mxnet==2.0.0b20200604 -f https://dist.mxnet.io/python
+          python -m pip install --user -e .[extras]
+      - name: Test project
+        run: |
+          python -m pytest --cov=./ --cov-report=xml --durations=50 tests/
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v1
+        env:
+          # `env_vars` below lists environment variable NAMES, so the variables have
+          # to be defined here for the upload to be tagged per platform.
+          OS: ${{ matrix.os }}
+          PYTHON: ${{ matrix.python-version }}
+        with:
+          env_vars: OS,PYTHON
diff --git a/.gitmodules b/.gitmodules
diff --git a/.pytype.cfg b/.pytype.cfg
@@ -5,4 +5,4 @@ inputs =
     src/gluonnlp
 
 # Python version (major.minor) of the target code.
-python_version = 3.5
+python_version = 3.6
diff --git a/README.md b/README.md
@@ -0,0 +1,72 @@
+# GluonNLP + Numpy
+
+Implementing NLP algorithms using the new numpy-like interface of MXNet. It's also a testbed for the next-generation release of GluonNLP.
+
+This is a work-in-progress.
+
+
+# Features
+
+- Data Pipeline for NLP
+- AutoML support (TODO)
+- Pretrained Model Zoo
+- Fast Deployment
+    - [TVM](https://tvm.apache.org/) (TODO)
+- AWS Integration
+
+
+# Installation
+First of all, install the latest MXNet. You may use the following commands:
+
+```bash
+
+# Install the version with CUDA 10.1
+pip install -U --pre mxnet-cu101==2.0.0b20200604 -f https://dist.mxnet.io/python
+
+# Install the cpu-only version
+pip install -U --pre mxnet==2.0.0b20200604 -f https://dist.mxnet.io/python
+```
+
+
+To install, use
+
+```bash
+pip install -U -e .
+
+# Also, you may install all the extra requirements via
+pip install -U -e .[extras]
+
+# In case you are using zsh, try to use the following command for installing
+pip install -U -e ."[extras]" 
+```
+
+If you do not have permission to install system-wide, you can also install to the user folder:
+
+```bash
+pip install -U -e . --user
+```
+
+For Windows users, we recommend using the [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/about).
+
+
+# Access the Command-line Toolkits
+
+To help researchers and engineers, we provide command-line toolkits for
+downloading and preprocessing NLP datasets. For more details, you may refer to
+ [GluonNLP Datasets](./scripts/datasets) and [GluonNLP Preprocessing Tools](./scripts/preprocess).
+
+```bash
+# CLI for downloading / preparing the dataset
+nlp_data help
+
+# CLI for accessing some common data preprocessing scripts
+nlp_preprocess help
+
+# Also, you can use `python -m` to access the toolkits
+python -m gluonnlp.cli.data help
+python -m gluonnlp.cli.preprocess help
+
+```
+
+# Run Unittests
+You may go to [tests](tests) to see how to run the unit tests.
diff --git a/conftest.py b/conftest.py
@@ -0,0 +1,208 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""conftest.py contains configuration for pytest.
+
+Configuration file for tests in tests/ and scripts/ folders.
+
+Note that fixtures of higher-scoped fixtures (such as ``session``) are
+instantiated before lower-scoped fixtures (such as ``function``).
+
+"""
+
+import logging
+import os
+import random
+
+import numpy as np
+import mxnet as mx
+import gluonnlp
+import pytest
+
+
+def pytest_sessionfinish(session, exitstatus):
+    if exitstatus == 5:  # Don't fail if no tests were run
+        session.exitstatus = 0
+
+
+# * Random seed setup
+def pytest_configure():
+    """Pytest configuration hook to help reproduce test segfaults
+
+    Sets and outputs rng seeds.
+
+    The segfault-debug procedure on a module called test_module.py is:
+
+    1. run "pytest --verbose test_module.py".  A seg-faulting output might be:
+
+       [INFO] np, mx and python random seeds = 4018804151
+       test_module.test1 ... ok
+       test_module.test2 ... Illegal instruction (core dumped)
+
+    2. Copy the module-starting seed into the next command, then run:
+
+       MXNET_MODULE_SEED=4018804151 pytest --log-level=DEBUG --verbose test_module.py
+
+       Output might be:
+
+       [WARNING] **** module-level seed is set: all tests running deterministically ****
+       [INFO] np, mx and python random seeds = 4018804151
+       test_module.test1 ... [DEBUG] np and mx random seeds = 3935862516
+       ok
+       test_module.test2 ... [DEBUG] np and mx random seeds = 1435005594
+       Illegal instruction (core dumped)
+
+    3. Copy the segfaulting-test seed into the command:
+       MXNET_TEST_SEED=1435005594 pytest --log-level=DEBUG --verbose test_module.py:test2
+       Output might be:
+
+       [INFO] np, mx and python random seeds = 2481884723
+       test_module.test2 ... [DEBUG] np and mx random seeds = 1435005594
+       Illegal instruction (core dumped)
+
+    3. Finally reproduce the segfault directly under gdb (might need additional os packages)
+       by editing the bottom of test_module.py to be
+
+       if __name__ == '__main__':
+           logging.getLogger().setLevel(logging.DEBUG)
+           test2()
+
+       MXNET_TEST_SEED=1435005594 gdb -ex r --args python test_module.py
+
+    4. When finished debugging the segfault, remember to unset any exported MXNET_ seed
+       variables in the environment to return to non-deterministic testing (a good thing).
+    """
+
+    module_seed_str = os.getenv('MXNET_MODULE_SEED')
+    if module_seed_str is None:
+        seed = np.random.randint(0, np.iinfo(np.int32).max)
+    else:
+        seed = int(module_seed_str)
+        logging.warning('*** module-level seed is set: '
+                        'all tests running deterministically ***')
+    print('Setting module np/mx/python random seeds, '
+          'use MXNET_MODULE_SEED={} to reproduce.'.format(seed))
+
+    np.random.seed(seed)
+    mx.npx.random.seed(seed)
+    random.seed(seed)
+
+    # The MXNET_TEST_SEED environment variable will override MXNET_MODULE_SEED for tests with
+    #  the 'with_seed()' decoration.  Inform the user of this once here at the module level.
+    if os.getenv('MXNET_TEST_SEED') is not None:
+        logging.warning('*** test-level seed set: all "@with_seed()" '
+                        'tests run deterministically ***')
+
+
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    """Make test outcome available to fixture.
+
+    https://docs.pytest.org/en/latest/example/simple.html#making-test-result-information-available-in-fixtures
+    """
+    # execute all other hooks to obtain the report object
+    outcome = yield
+    rep = outcome.get_result()
+
+    # set a report attribute for each phase of a call, which can
+    # be "setup", "call", "teardown"
+    setattr(item, "rep_" + rep.when, rep)
+
+
+@pytest.fixture(scope='function', autouse=True)
+def function_scope_seed(request):
+    """A function scope fixture that manages rng seeds.
+
+    This fixture automatically initializes the python, numpy and mxnet random
+    number generators randomly on every test run.
+
+    def test_ok_with_random_data():
+        ...
+
+    To fix the seed used for a test case mark the test function with the
+    desired seed:
+
+    @pytest.mark.seed(1)
+    def test_not_ok_with_random_data():
+        '''This testcase actually works.'''
+        assert 17 == random.randint(0, 100)
+
+    When a test fails, the fixture outputs the seed used. The user can then set
+    the environment variable MXNET_TEST_SEED to the value reported, then rerun
+    the test with:
+
+        pytest --verbose -s <test_module_name.py> -k <failing_test>
+
+    To run a test repeatedly, install pytest-repeat and add the --count argument:
+
+        pip install pytest-repeat
+        pytest --verbose -s <test_module_name.py> -k <failing_test> --count 1000
+
+    """
+
+    seed = request.node.get_closest_marker('seed')
+    env_seed_str = os.getenv('MXNET_TEST_SEED')
+
+    if seed is not None:
+        seed = seed.args[0]
+        assert isinstance(seed, int)
+    elif env_seed_str is not None:
+        seed = int(env_seed_str)
+    else:
+        seed = np.random.randint(0, np.iinfo(np.int32).max)
+
+    post_test_state = np.random.get_state()
+    np.random.seed(seed)
+    mx.random.seed(seed)
+    random.seed(seed)
+
+    seed_message = ('np/mx/python random seeds are set to '
+                    '{}, use MXNET_TEST_SEED={} to reproduce.')
+    seed_message = seed_message.format(seed, seed)
+
+    # Always log seed on DEBUG log level. This makes sure we can find out the
+    # value of the seed even if the test case causes a segfault and subsequent
+    # teardown code is not run.
+    logging.debug(seed_message)
+
+    yield  # run the test
+
+    if request.node.rep_setup.failed:
+        logging.info("Setting up a test failed: {}", request.node.nodeid)
+    elif request.node.rep_call.outcome == 'failed':
+        # Either request.node.rep_setup.failed or request.node.rep_setup.passed
+        # should be True
+        assert request.node.rep_setup.passed
+        # On failure also log seed on INFO log level
+        logging.info(seed_message)
+
+    np.random.set_state(post_test_state)
+
+
+# * Shared test fixtures
+@pytest.fixture(params=[True, False])
+def hybridize(request):
+    return request.param
+
+
+@pytest.fixture(autouse=True)
+def doctest(doctest_namespace):
+    doctest_namespace['np'] = np
+    doctest_namespace['gluonnlp'] = gluonnlp
+    doctest_namespace['mx'] = mx
+    doctest_namespace['gluon'] = mx.gluon
+    import doctest
+    doctest.ELLIPSIS_MARKER = '-etc-'
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,8 @@
+[pytest]
+markers =
+    seed: set the python, numpy and mxnet random seeds to a specified value for test reproducibility
+    serial: mark a test that requires more resources to run and is thus only suitable for serial runs.
+    remote_required: mark a test that requires internet access.
+    gpu: mark a test that requires GPU.
+    integration: mark an integration test
+    skip_master: mark a test that is temporarily skipped for mxnet master validation.
diff --git a/scripts/__init__.py b/scripts/__init__.py