diff --git a/.azure-pipelines/code-scan.yml b/.azure-pipelines/code-scan.yml index 3c8a8c8daae..27d742b4d5b 100644 --- a/.azure-pipelines/code-scan.yml +++ b/.azure-pipelines/code-scan.yml @@ -9,6 +9,7 @@ pr: paths: include: - neural_compressor + - setup.py pool: ICX-16C @@ -16,6 +17,18 @@ variables: CODE_SCAN_LOG_PATH: ".azure-pipelines/scripts/codeScan/scanLog" stages: + - stage: DocStyleCodeScan + displayName: DocStyle Code Scan + dependsOn: [] + jobs: + - job: DocStyle + displayName: DocStyle + steps: + - template: template/code-scan-template.yml + parameters: + codeScanFileName: "pydocstyle" + uploadPath: "pydocstyle.log" + - stage: BanditCodeScan displayName: Bandit Code Scan dependsOn: [] @@ -26,7 +39,7 @@ stages: - template: template/code-scan-template.yml parameters: codeScanFileName: "bandit" - uploadPath: "lpot-bandit.log" + uploadPath: "bandit.log" - stage: PylintCodeScan displayName: Pylint Code Scan @@ -38,7 +51,7 @@ stages: - template: template/code-scan-template.yml parameters: codeScanFileName: "pylint" - uploadPath: "lpot-pylint.json" + uploadPath: "pylint.json" - stage: CopyRight displayName: CopyRight Code Scan diff --git a/.azure-pipelines/docker/DockerfileCodeScan.devel b/.azure-pipelines/docker/DockerfileCodeScan.devel index 93321aa0f14..8c33984f23d 100644 --- a/.azure-pipelines/docker/DockerfileCodeScan.devel +++ b/.azure-pipelines/docker/DockerfileCodeScan.devel @@ -38,6 +38,7 @@ RUN python -m pip install --no-cache-dir pylint==2.12.1\ tf_slim\ transformers\ horovod\ - flask==2.1.3 + flask==2.1.3 \ + pydocstyle WORKDIR / diff --git a/.azure-pipelines/model-test.yml b/.azure-pipelines/model-test.yml index 4f4ce12c680..8ff2c10cc50 100644 --- a/.azure-pipelines/model-test.yml +++ b/.azure-pipelines/model-test.yml @@ -9,6 +9,7 @@ pr: paths: include: - neural_compressor + - setup.py exclude: - neural_compressor/ux diff --git a/.azure-pipelines/scripts/codeScan/bandit/bandit.sh b/.azure-pipelines/scripts/codeScan/bandit/bandit.sh index a23f2f3000d..b8238ef5f92 100644 --- a/.azure-pipelines/scripts/codeScan/bandit/bandit.sh +++ b/.azure-pipelines/scripts/codeScan/bandit/bandit.sh @@ -1,17 +1,21 @@ #!/bin/bash source /neural-compressor/.azure-pipelines/scripts/change_color.sh -mkdir -p /neural-compressor/.azure-pipelines/scripts/codeScan/scanLog -bandit_log_dir="/neural-compressor/.azure-pipelines/scripts/codeScan/scanLog" +RESET="echo -en \\E[0m \\n" # close color -python -m bandit -r -lll -iii /neural-compressor/neural_compressor > $bandit_log_dir/lpot-bandit.log +log_dir="/neural-compressor/.azure-pipelines/scripts/codeScan/scanLog" +mkdir -p $log_dir + +python -m bandit -r -lll -iii /neural-compressor/neural_compressor > $log_dir/bandit.log exit_code=$? 
-# code-scan close -RESET="echo -en \\E[0m \\n" + +$BOLD_YELLOW && echo " ----------------- Current bandit cmd start --------------------------" && $RESET +echo "python -m bandit -r -lll -iii /neural-compressor/neural_compressor > $log_dir/bandit.log" +$BOLD_YELLOW && echo " ----------------- Current bandit cmd end --------------------------" && $RESET $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" -cat $bandit_log_dir/lpot-bandit.log +cat $log_dir/bandit.log $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET diff --git a/.azure-pipelines/scripts/codeScan/pydocstyle/pydocstyle.sh b/.azure-pipelines/scripts/codeScan/pydocstyle/pydocstyle.sh new file mode 100644 index 00000000000..8b8a09939e8 --- /dev/null +++ b/.azure-pipelines/scripts/codeScan/pydocstyle/pydocstyle.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +source /neural-compressor/.azure-pipelines/scripts/change_color.sh +RESET="echo -en \\E[0m \\n" # close color + +log_dir="/neural-compressor/.azure-pipelines/scripts/codeScan/scanLog" +mkdir -p $log_dir + +pydocstyle --convention=google /neural-compressor/neural_compressor/experimental > $log_dir/pydocstyle.log +exit_code=$? + + +$BOLD_YELLOW && echo " ----------------- Current pydocstyle cmd start --------------------------" && $RESET +echo "python pydocstyle --convention=google /neural-compressor/neural_compressor/experimental > $log_dir/pydocstyle.log" +$BOLD_YELLOW && echo " ----------------- Current pydocstyle cmd end --------------------------" && $RESET + +$BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" +cat $log_dir/pydocstyle.log +$BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET + + +if [ ${exit_code} -ne 0 ] ; then + $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view DocStyle error details." && $RESET; exit 1 +fi +$BOLD_PURPLE && echo "Congratulations, DocStyle check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET; exit 0 \ No newline at end of file diff --git a/.azure-pipelines/scripts/codeScan/pylint/pylint.sh b/.azure-pipelines/scripts/codeScan/pylint/pylint.sh index de55395e5ce..b15da8c91b3 100644 --- a/.azure-pipelines/scripts/codeScan/pylint/pylint.sh +++ b/.azure-pipelines/scripts/codeScan/pylint/pylint.sh @@ -1,24 +1,26 @@ #!/bin/bash source /neural-compressor/.azure-pipelines/scripts/change_color.sh -mkdir -p /neural-compressor/.azure-pipelines/scripts/codeScan/scanLog -pylint_log_dir="/neural-compressor/.azure-pipelines/scripts/codeScan/scanLog" +RESET="echo -en \\E[0m \\n" # close color + +log_dir="/neural-compressor/.azure-pipelines/scripts/codeScan/scanLog" +mkdir -p $log_dir pip install -r /neural-compressor/requirements.txt pip install torch==1.12.0 -python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,intel_extension_for_pytorch /neural-compressor/neural_compressor > $pylint_log_dir/lpot-pylint.json -exit_code=$? 
+python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto \ +--ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,intel_extension_for_pytorch /neural-compressor/neural_compressor \ +> $log_dir/pylint.json -# code-scan close -RESET="echo -en \\E[0m \\n" +exit_code=$? $BOLD_YELLOW && echo " ----------------- Current pylint cmd start --------------------------" && $RESET -echo "python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,intel_extension_for_pytorch /neural-compressor/neural_compressor > $pylint_log_dir/lpot-pylint.json" +echo "python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,intel_extension_for_pytorch /neural-compressor/neural_compressor > $log_dir/pylint.json" $BOLD_YELLOW && echo " ----------------- Current pylint cmd end --------------------------" && $RESET $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" && $RESET -cat $pylint_log_dir/lpot-pylint.json +cat $log_dir/pylint.json $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET diff --git a/.azure-pipelines/scripts/codeScan/pyspelling/lpot_dict.txt b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt similarity index 99% rename from .azure-pipelines/scripts/codeScan/pyspelling/lpot_dict.txt rename to .azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt index ec5c0321b29..4601c3ab69e 100644 --- a/.azure-pipelines/scripts/codeScan/pyspelling/lpot_dict.txt +++ b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt @@ -696,6 +696,7 @@ Goyal gpg GPG gpt +GPTJ gpu gpus GPUs @@ -2378,3 +2379,9 @@ constfold grappler amsgrad qoperator +apis +CPz +Nsh +UmK +fe +vmware diff --git a/.azure-pipelines/scripts/codeScan/pyspelling/pyspelling_conf.yaml b/.azure-pipelines/scripts/codeScan/pyspelling/pyspelling_conf.yaml index 07fe82c07f3..3cf19530020 100644 --- a/.azure-pipelines/scripts/codeScan/pyspelling/pyspelling_conf.yaml +++ b/.azure-pipelines/scripts/codeScan/pyspelling/pyspelling_conf.yaml @@ -4,10 +4,10 @@ matrix: d: en_US.ISO8859-15 dictionary: wordlists: - - ${DICT_DIR}/lpot_dict.txt - output: ${DICT_DIR}/lpot_dict.dic + - ${DICT_DIR}/inc_dict.txt + output: ${DICT_DIR}/inc_dict.dic sources: - - ${REPO_DIR}/docs/* + - ${REPO_DIR}/docs/source/*.md - ${REPO_DIR}/*.md - ${REPO_DIR}/examples/**/*.md|!${REPO_DIR}/examples/pytorch/**/huggingface_models/**/*.md - ${REPO_DIR}/neural_compressor/**/*.md diff --git a/.azure-pipelines/scripts/models/generate_report.sh b/.azure-pipelines/scripts/models/generate_report.sh index 568799ebbc1..9271008d2e8 100644 --- a/.azure-pipelines/scripts/models/generate_report.sh +++ b/.azure-pipelines/scripts/models/generate_report.sh @@ -237,6 +237,7 @@ function generate_html_core { status_png = "background-color:#90EE90"; } else { status_png = "background-color:#FFD2D2"; + job_status = "fail" } printf("%.2f", status_png, target); } diff --git a/.azure-pipelines/scripts/ut/env_setup.sh 
b/.azure-pipelines/scripts/ut/env_setup.sh index 07fa00a8d35..6a9fd879fad 100644 --- a/.azure-pipelines/scripts/ut/env_setup.sh +++ b/.azure-pipelines/scripts/ut/env_setup.sh @@ -20,7 +20,7 @@ echo "mxnet version is $mxnet_version" if [[ "${tensorflow_version}" == *"-official" ]]; then pip install tensorflow==${tensorflow_version%-official} elif [[ "${tensorflow_version}" == "spr-base" ]]; then - pip install /tf_dataset/tf_binary/tensorflow*.whl + pip install /tf_dataset/tf_binary/221125/tensorflow*.whl if [[ $? -ne 0 ]]; then exit 1 fi diff --git a/.azure-pipelines/template/model-template.yml b/.azure-pipelines/template/model-template.yml index 8bde0d72a89..cc6f58f9c60 100644 --- a/.azure-pipelines/template/model-template.yml +++ b/.azure-pipelines/template/model-template.yml @@ -46,13 +46,13 @@ steps: - script: | docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ && bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='int8_benchmark' --USE_TUNE_ACC=$(USE_TUNE_ACC) --PERF_STABLE_CHECK=$(PERF_STABLE_CHECK)" - continueOnError: true + condition: succeededOrFailed() displayName: INT8 Benchmark - script: | docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ && bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='fp32_benchmark' --USE_TUNE_ACC=$(USE_TUNE_ACC) --PERF_STABLE_CHECK=$(PERF_STABLE_CHECK)" - continueOnError: true + condition: succeededOrFailed() displayName: FP32 Benchmark - task: Bash@3 diff --git a/.azure-pipelines/ut-basic.yml b/.azure-pipelines/ut-basic.yml index e69a1ae12fc..bdeb6dd6d6f 100644 --- a/.azure-pipelines/ut-basic.yml +++ b/.azure-pipelines/ut-basic.yml @@ -10,6 +10,7 @@ pr: include: - neural_compressor - test + - setup.py exclude: - neural_compressor/ux - test/ux diff --git a/.azure-pipelines/ut-ncoder.yml b/.azure-pipelines/ut-ncoder.yml index d26c4a78c7e..16ee8cdb6ba 100644 --- a/.azure-pipelines/ut-ncoder.yml +++ b/.azure-pipelines/ut-ncoder.yml @@ -10,6 +10,7 @@ pr: include: - neural_coder - test/neural_coder + - setup.py pool: ICX-16C diff --git a/.azure-pipelines/ut-ux.yml b/.azure-pipelines/ut-ux.yml index 0c76f4d5519..2b79fb4b9b4 100644 --- a/.azure-pipelines/ut-ux.yml +++ b/.azure-pipelines/ut-ux.yml @@ -10,6 +10,7 @@ pr: include: - neural_compressor/ux - test/ux + - setup.py pool: ICX-16C diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index e4e37f8fc72..69348ebd344 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -14,16 +14,17 @@ jobs: - uses: actions/checkout@v1 - name: Install dependencies run: | - export PATH="$HOME/.local/bin:$PATH" + export PATH="$HOME/.local/bin:$PATH/docs" sudo apt-get install -y python3-setuptools - pip3 install --user -r sphinx-requirements.txt + pip3 install --user -r docs/sphinx-requirements.txt - name: Build the docs run: | export PATH="$HOME/.local/bin:$PATH" + cd docs/ make html - name: Push the docs uses: peaceiris/actions-gh-pages@v3 with: github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: _build/html + publish_dir: docs/_build/html publish_branch: latestHTML \ No newline at end of file diff --git a/Makefile b/Makefile deleted file mode 100644 index 7b308f858cc..00000000000 --- a/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. 
-SPHINXOPTS = -SPHINXBUILD = sphinx-build -SPHINXPROJ = ProjectnameIntelLowPrecisionOptimizationTool -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - - -html: - $(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O) - cp _static/index.html $(BUILDDIR)/html/index.html - mkdir "$(BUILDDIR)/html/docs/imgs" - cp docs/imgs/architecture.png "$(BUILDDIR)/html/docs/imgs/architecture.png" - cp docs/imgs/workflow.png "$(BUILDDIR)/html/docs/imgs/workflow.png" - cp docs/imgs/INC_GUI.gif "$(BUILDDIR)/html/docs/imgs/INC_GUI.gif" - cp docs/imgs/release_data.png "$(BUILDDIR)/html/docs/imgs/release_data.png" - cp "$(BUILDDIR)/html/README.html" "$(BUILDDIR)/html/README.html.tmp" - sed 's/.md/.html/g' "$(BUILDDIR)/html/README.html.tmp" > "$(BUILDDIR)/html/README.html" - rm -f "$(BUILDDIR)/html/README.html.tmp" - - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/README.md b/README.md index dc76479528d..0cd8878222e 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Python version: 3.7, 3.8, 3.9, 3.10 # Or install nightly full version from pip (including GUI) pip install -i https://test.pypi.org/simple/ neural-compressor-full ``` -More installation methods can be found at [Installation Guide](./docs/installation_guide.md). Please check out our [FAQ](./docs/faq.md) for more details. +More installation methods can be found at [Installation Guide](./docs/source/installation_guide.md). Please check out our [FAQ](./docs/source/faq.md) for more details. ## Getting Started ### Quantization with Python API @@ -71,7 +71,7 @@ Search for ```jupyter-lab-neural-compressor``` in the Extension Manager in Jupyt Extension -### Quantization with [GUI](./docs/bench.md) +### Quantization with [GUI](./docs/source/bench.md) ```shell # An ONNX Example pip install onnx==1.12.0 onnxruntime==1.12.1 onnxruntime-extensions @@ -80,8 +80,8 @@ wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/ # Start GUI inc_bench ``` - - Architecture + + Architecture ## System Requirements @@ -98,7 +98,7 @@ inc_bench #### Intel® Neural Compressor quantized ONNX models support multiple hardware vendors through ONNX Runtime: -* Intel CPU, AMD/ARM CPU, and NVidia GPU. Please refer to the validated model [list](./docs/validated_model_list.md#Validated-ONNX-QDQ-INT8-models-on-multiple-hardware-through-ONNX-Runtime). +* Intel CPU, AMD/ARM CPU, and NVidia GPU. Please refer to the validated model [list](./docs/source/validated_model_list.md#Validated-ONNX-QDQ-INT8-models-on-multiple-hardware-through-ONNX-Runtime). ### Validated Software Environment @@ -146,11 +146,11 @@ inc_bench > Set the environment variable ``TF_ENABLE_ONEDNN_OPTS=1`` to enable oneDNN optimizations if you are using TensorFlow v2.6 to v2.8. oneDNN is the default for TensorFlow v2.9. ### Validated Models -Intel® Neural Compressor validated 420+ [examples](./examples) for quantization with a performance speedup geomean of 2.2x and up to 4.2x on VNNI while minimizing accuracy loss. Over 30 pruning and knowledge distillation samples are also available. More details for validated models are available [here](docs/validated_model_list.md). 
+Intel® Neural Compressor validated 420+ [examples](./examples) for quantization with a performance speedup geomean of 2.2x and up to 4.2x on VNNI while minimizing accuracy loss. Over 30 pruning and knowledge distillation samples are also available. More details for validated models are available [here](./docs/source/validated_model_list.md).
- - Architecture + + Architecture
@@ -164,10 +164,10 @@ Intel® Neural Compressor validated 420+ [examples](./examples) for quantization - Architecture + Architecture Examples - GUI - APIs + GUI + APIs Intel oneAPI AI Analytics Toolkit @@ -181,10 +181,10 @@ Intel® Neural Compressor validated 420+ [examples](./examples) for quantization - Transform - Dataset - Metric - Objective + Transform + Dataset + Metric + Objective @@ -194,21 +194,21 @@ Intel® Neural Compressor validated 420+ [examples](./examples) for quantization - Quantization - Pruning(Sparsity) - Knowledge Distillation - Mixed Precision - Orchestration + Quantization + Pruning(Sparsity) + Knowledge Distillation + Mixed Precision + Orchestration - Benchmarking - Distributed Training - Model Conversion - TensorBoard + Benchmarking + Distributed Training + Model Conversion + TensorBoard - Distillation for Quantization - Neural Coder + Distillation for Quantization + Neural Coder @@ -219,14 +219,15 @@ Intel® Neural Compressor validated 420+ [examples](./examples) for quantization - Adaptor - Strategy - Reference Example + Adaptor + Strategy ## Selected Publications/Events +* [Intel together with Tencent deepens the cooperation to build a cloud foundation for digital and intelligent industry](https://mp.weixin.qq.com/s/CPz9-5Nsh-5N9Q8-UmK--w) (Dec 2022) +* [Intel Neural Compressor for TF Virtual Appliance packaged by Bitnami](https://marketplace.cloud.vmware.com/services/details/e9c3d891-ca51-4f07-a5aa-3fe6394f15ae) (Nov 2022) * [Neural Compressor: an open-source Python library for network compression](https://cloud.tencent.com/developer/article/2165895) (Nov 2022) * [Running Fast Transformers on CPUs: Intel Approach Achieves Significant Speed Ups and SOTA Performance](https://medium.com/syncedreview/running-fast-transformers-on-cpus-intel-approach-achieves-significant-speed-ups-and-sota-448521704c5e) (Nov 2022) * [Personalized Stable Diffusion with Few-Shot Fine-Tuning](https://medium.com/intel-analytics-software/personalized-stable-diffusion-with-few-shot-fine-tuning-on-a-single-cpu-f01a3316b13) (Nov 2022) @@ -235,13 +236,13 @@ Intel® Neural Compressor validated 420+ [examples](./examples) for quantization * Neural Coder, a new plug-in for Intel Neural Compressor was covered by [Twitter](https://twitter.com/IntelDevTools/status/1583629213697212416), [LinkedIn](https://www.linkedin.com/posts/intel-software_oneapi-ai-deeplearning-activity-6989377309917007872-Dbzg?utm_source=share&utm_medium=member_desktop), and [Intel Developer Zone](https://mp.weixin.qq.com/s/LL-4eD-R0YagFgODM23oQA) from Intel, and [Twitter](https://twitter.com/IntelDevTools/status/1583629213697212416/retweets) and [LinkedIn](https://www.linkedin.com/feed/update/urn:li:share:6990377841435574272/) from Hugging Face. (Oct 2022) * Intel Neural Compressor successfully landed on [GCP](https://console.cloud.google.com/marketplace/product/bitnami-launchpad/inc-tensorflow-intel?project=verdant-sensor-286207), [AWS](https://aws.amazon.com/marketplace/pp/prodview-yjyh2xmggbmga#pdp-support), and [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/bitnami.inc-tensorflow-intel) marketplace. (Oct 2022) -> View our [full publication list](docs/publication_list.md). +> View our [full publication list](./docs/source/publication_list.md). 
## Additional Content -* [Release Information](docs/releases_info.md) -* [Contribution Guidelines](docs/contributions.md) -* [Legal Information](docs/legal_information.md) +* [Release Information](./docs/source/releases_info.md) +* [Contribution Guidelines](./docs/source/contributions.md) +* [Legal Information](./docs/source/legal_information.md) * [Security Policy](SECURITY.md) * [Intel® Neural Compressor Website](https://intel.github.io/neural-compressor) diff --git a/_static/custom.css b/_static/custom.css deleted file mode 100755 index b2d7a2ec6c2..00000000000 --- a/_static/custom.css +++ /dev/null @@ -1,18 +0,0 @@ -/* make the page 1000px */ -.wy-nav-content { - max-width: 1000px; -} - -/* code block highlight color in rtd changed to lime green, no no no */ - -.rst-content tt.literal, .rst-content code.literal, .highlight { - background: #f0f0f0; -} -.rst-content tt.literal, .rst-content code.literal { - color: #000000; -} - -table.docutils th { - text-align: center; - vertical-align: middle; -} \ No newline at end of file diff --git a/_static/index.html b/_static/index.html deleted file mode 100644 index 5f62e3d9bef..00000000000 --- a/_static/index.html +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/api-documentation/api-reference.rst b/api-documentation/api-reference.rst deleted file mode 100755 index 784f6bae5eb..00000000000 --- a/api-documentation/api-reference.rst +++ /dev/null @@ -1,16 +0,0 @@ -API Reference -############# - -Read an `introduction to Intel Neural Compressor APIs <../docs/api-introduction.md>`__. - -The following APIs are available: - -.. toctree:: - :maxdepth: 1 - - benchmark-api - objective-api - pruning-api - quantization-api - - \ No newline at end of file diff --git a/api-documentation/benchmark-api.rst b/api-documentation/benchmark-api.rst deleted file mode 100755 index c6f3da1e87b..00000000000 --- a/api-documentation/benchmark-api.rst +++ /dev/null @@ -1,10 +0,0 @@ -.. _benchmark-api - -Benchmark -######### - -.. automodule:: neural_compressor.benchmark - :members: - -.. autoclass:: neural_compressor.benchmark.Benchmark - :members: \ No newline at end of file diff --git a/api-documentation/objective-api.rst b/api-documentation/objective-api.rst deleted file mode 100755 index de63375c256..00000000000 --- a/api-documentation/objective-api.rst +++ /dev/null @@ -1,13 +0,0 @@ -.. _objective-api - -Objective -######### - -.. automodule:: neural_compressor.objective - :members: - -.. autoclass:: neural_compressor.objective.Measurer - :members: - -.. autoclass:: neural_compressor.objective.FootprintMeasure - :members: \ No newline at end of file diff --git a/api-documentation/pruning-api.rst b/api-documentation/pruning-api.rst deleted file mode 100755 index ff2fd584546..00000000000 --- a/api-documentation/pruning-api.rst +++ /dev/null @@ -1,10 +0,0 @@ -.. _pruning-api - -Pruning -####### - -.. automodule:: neural_compressor.pruning - :members: - -.. autoclass:: neural_compressor.pruning.Pruning - :members: \ No newline at end of file diff --git a/api-documentation/quantization-api.rst b/api-documentation/quantization-api.rst deleted file mode 100755 index af1017d0ef1..00000000000 --- a/api-documentation/quantization-api.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _quantization-api - -Quantization -############ - -.. 
automodule:: neural_compressor.quantization - :members: \ No newline at end of file diff --git a/conda_meta/basic/meta.yaml b/conda_meta/basic/meta.yaml index ca5af8c4ec6..69787c5815d 100644 --- a/conda_meta/basic/meta.yaml +++ b/conda_meta/basic/meta.yaml @@ -21,17 +21,13 @@ requirements: - scikit-learn - schema - py-cpuinfo - - hyperopt - pandas - pycocotools - opencv - psutil - Pillow - requests - - sigopt - prettytable - - cryptography - - cython - packaging test: imports: diff --git a/conda_meta/full/meta.yaml b/conda_meta/full/meta.yaml index ac5eedc1a23..1ecf15214b3 100644 --- a/conda_meta/full/meta.yaml +++ b/conda_meta/full/meta.yaml @@ -23,7 +23,6 @@ requirements: - scikit-learn - schema - py-cpuinfo - - hyperopt - pandas - pycocotools - opencv @@ -35,12 +34,10 @@ requirements: - psutil - Pillow - requests - - sigopt - prettytable - cryptography - sqlalchemy==1.4.27 - alembic==1.7.7 - - cython - pywin32 # [win] - packaging test: diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000000..7653b4d006c --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,45 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = source +BUILDDIR = _build +IMGDIR = source/_static/imgs +BUILDIMGDIR = _build/html/imgs +CODEIMGDIR = _build/html/_static + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + + +html: + # cp README.md to docs, modify response-link + cp -f "../README.md" "./source/Welcome.md" + cp -f "../SECURITY.md" "./source/SECURITY.md" + cp -f "./source/Welcome.md" "./source/Welcome.md.tmp" + sed 's/.md/.html/g; s/.\/docs\/source\//.\//g; s/.\/neural_coder\/extensions\/screenshots/imgs/g; s/.\/docs\/source\/_static/..\/\/_static/g; ' "./source/Welcome.md.tmp" > "./source/Welcome.md" + rm -f "./source/Welcome.md.tmp" + + # make sure other png can display normal + $(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O) + + cp source/_static/index.html $(BUILDDIR)/html/index.html + mkdir -p "$(BUILDIMGDIR)" + # common svg + cp -f "$(CODEIMGDIR)/imgs/common/code.svg" "$(CODEIMGDIR)/images/view-page-source-icon.svg" + cp -f "$(CODEIMGDIR)/imgs/common/right.svg" "$(CODEIMGDIR)/images/chevron-right-orange.svg" + + cp "../neural_coder/extensions/screenshots/extmanager.png" "$(BUILDIMGDIR)/extmanager.png" + cp "$(IMGDIR)/INC_GUI.gif" "$(BUILDIMGDIR)/INC_GUI.gif" + cp "$(IMGDIR)/release_data.png" "$(BUILDIMGDIR)/release_data.png" + + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/api-introduction.md b/docs/api-introduction.md deleted file mode 100644 index e23fbaf8d62..00000000000 --- a/docs/api-introduction.md +++ /dev/null @@ -1,210 +0,0 @@ -API Documentation -================= - -## Introduction - -Intel® Neural Compressor is an open-source Python library designed to help users quickly deploy low-precision inference solutions on popular deep learning (DL) frameworks such as TensorFlow*, PyTorch*, MXNet, and ONNX Runtime. 
It automatically optimizes low-precision recipes for deep learning models in order to achieve optimal product objectives, such as inference performance and memory usage, with expected accuracy criteria. - - -## User-facing APIs - -These APIs are intended to unify low-precision quantization interfaces cross multiple DL frameworks for the best out-of-the-box experiences. - -> **Note** -> -> Neural Compressor is continuously improving user-facing APIs to create a better user experience. - -> Two sets of user-facing APIs exist. One is the default one supported from Neural Compressor v1.0 for backwards compatibility. The other set consists of new APIs in -the `neural_compressor.experimental` package. - -> We recommend that you use the APIs located in neural_compressor.experimental. All examples have been updated to use the experimental APIs. - -The major differences between the default user-facing APIs and the experimental APIs are: - -1. The experimental APIs abstract the `neural_compressor.experimental.common.Model` concept to cover those cases whose weight and graph files are stored separately. -2. The experimental APIs unify the calling style of the `Quantization`, `Pruning`, and `Benchmark` classes by setting model, calibration dataloader, evaluation dataloader, and metric through class attributes rather than passing them as function inputs. -3. The experimental APIs refine Neural Compressor built-in transforms/datasets/metrics by unifying the APIs cross different framework backends. - -## Experimental user-facing APIs - -Experimental user-facing APIs consist of the following components: - -### Quantization-related APIs - -```python -# neural_compressor.experimental.Quantization -class Quantization(object): - def __init__(self, conf_fname_or_obj): - ... - - def __call__(self): - ... - - @property - def calib_dataloader(self): - ... - - @property - def eval_dataloader(self): - ... - - @property - def model(self): - ... - - @property - def metric(self): - ... - - @property - def postprocess(self, user_postprocess): - ... - - @property - def q_func(self): - ... - - @property - def eval_func(self): - ... - -``` -The `conf_fname_or_obj` parameter used in the class initialization is the path to the user yaml configuration file or Quantization_Conf class. This yaml file is used to control the entire tuning behavior on the model. - -**Neural Compressor User YAML Syntax** - -> Intel® Neural Compressor provides template yaml files for [Post-Training Quantization](../neural_compressor/template/ptq.yaml), [Quantization-Aware Training](../neural_compressor/template/qat.yaml), and [Pruning](../neural_compressor/template/pruning.yaml) scenarios. Refer to these template files to understand the meaning of each field. - -> Note that most fields in the yaml templates are optional. View the [HelloWorld Yaml](../examples/helloworld/tf_example2/conf.yaml) example for reference. - -```python -# Typical Launcher code -from neural_compressor.experimental import Quantization, common - -# optional if Neural Compressor built-in dataset could be used as model input in yaml -class dataset(object): - def __init__(self, *args): - ... - - def __getitem__(self, idx): - # return single sample and label tuple without collate. label should be 0 for label-free case - ... - - def len(self): - ... - -# optional if Neural Compressor built-in metric could be used to do accuracy evaluation on model output in yaml -class custom_metric(object): - def __init__(self): - ... 
- - def update(self, predict, label): - # metric update per mini-batch - ... - - def result(self): - # final metric calculation invoked only once after all mini-batch are evaluated - # return a scalar to neural_compressor for accuracy-driven tuning. - # by default the scalar is higher-is-better. if not, set tuning.accuracy_criterion.higher_is_better to false in yaml. - ... - -quantizer = Quantization(conf.yaml) -quantizer.model = '/path/to/model' -# below two lines are optional if Neural Compressor built-in dataset is used as model calibration input in yaml -cal_dl = dataset('/path/to/calibration/dataset') -quantizer.calib_dataloader = common.DataLoader(cal_dl, batch_size=32) -# below two lines are optional if Neural Compressor built-in dataset is used as model evaluation input in yaml -dl = dataset('/path/to/evaluation/dataset') -quantizer.eval_dataloader = common.DataLoader(dl, batch_size=32) -# optional if Neural Compressor built-in metric could be used to do accuracy evaluation in yaml -quantizer.metric = common.Metric(custom_metric) -q_model = quantizer.fit() -q_model.save('/path/to/output/dir') -``` - -`model` attribute in `Quantization` class is an abstraction of model formats across different frameworks. Neural Compressor supports passing the path of `keras model`, `frozen pb`, `checkpoint`, `saved model`, `torch.nn.model`, `mxnet.symbol.Symbol`, `gluon.HybirdBlock`, and `onnx model` to instantiate a `neural_compressor.experimental.` class and set to `quantizer.model`. - -`calib_dataloader` and `eval_dataloader` attribute in `Quantization` class is used to set up a calibration dataloader by code. It is optional to set if the user sets corresponding fields in yaml. - -`metric` attribute in `Quantization` class is used to set up a custom metric by code. It is optional to set if user finds Neural Compressor built-in metric could be used with their model and sets corresponding fields in yaml. - -`postprocess` attribute in `Quantization` class is not necessary in most of the use cases. It is only needed when the user wants to use the built-in metric but the model output can not directly be handled by Neural Compressor built-in metrics. In this case, the user can register a transform to convert the model output to the expected one required by the built-in metric. - -`q_func` attribute in `Quantization` class is only for `Quantization Aware Training` case, in which the user needs to register a function that takes `model` as the input parameter and executes the entire training process with self-contained training hyper-parameters. - -`eval_func` attribute in `Quantization` class is reserved for special cases. If the user had an evaluation function when train a model, the user must implement a `calib_dataloader` and leave `eval_dataloader` as None. Then, modify this evaluation function to take `model` as the input parameter and return a higher-is-better scaler. In some scenarios, it may reduce development effort. - - -### Pruning-related APIs (POC) - -```python -class Pruning(object): - def __init__(self, conf_fname_or_obj): - ... - - def on_epoch_begin(self, epoch): - ... - - def on_step_begin(self, batch_id): - ... - - def on_step_end(self): - ... - - def on_epoch_end(self): - ... - - def __call__(self): - ... - - @property - def model(self): - ... - - @property - def q_func(self): - ... - -``` - -This API is used to do sparsity pruning. Currently, it is a Proof of Concept; Neural Compressor only supports `magnitude pruning` on PyTorch. 
- -To learn how to use this API, refer to the [pruning document](../docs/pruning.md). - -### Benchmarking-related APIs -```python -class Benchmark(object): - def __init__(self, conf_fname_or_obj): - ... - - def __call__(self): - ... - - @property - def model(self): - ... - - @property - def metric(self): - ... - - @property - def b_dataloader(self): - ... - - @property - def postprocess(self, user_postprocess): - ... -``` - -This API is used to measure model performance and accuracy. - -To learn how to use this API, refer to the [benchmarking document](../docs/benchmark.md). - -## Default user-facing APIs - -The default user-facing APIs exist for backwards compatibility from the v1.0 release. Refer to [v1.1 API](https://github.com/intel/neural-compressor/blob/v1.1/docs/introduction.md) to understand how the default user-facing APIs work. - -View the [HelloWorld example](/examples/helloworld/tf_example6) that uses default user-facing APIs for user reference. - -Full examples using default user-facing APIs can be found [here](https://github.com/intel/neural-compressor/tree/v1.1/examples). diff --git a/docs/design.md b/docs/design.md deleted file mode 100644 index bee2fa124b8..00000000000 --- a/docs/design.md +++ /dev/null @@ -1,15 +0,0 @@ -Design -===== -Intel® Neural Compressor features an architecture and workflow that aids in increasing performance and faster deployments across infrastructures. - -## Architecture - - - Architecture - - -## Workflow - - - Workflow - diff --git a/docs/doclist.rst b/docs/doclist.rst deleted file mode 100644 index d5be5857470..00000000000 --- a/docs/doclist.rst +++ /dev/null @@ -1,68 +0,0 @@ -Developer Documentation -####################### - -Read the following material as you learn how to use Neural Compressor. - -Get Started -=========== - -* `Transform `__ introduces how to utilize Neural Compressor's built-in data processing and how to develop a custom data processing method. -* `Dataset `__ introduces how to utilize Neural Compressor's built-in dataset and how to develop a custom dataset. -* `Metrics `__ introduces how to utilize Neural Compressor's built-in metrics and how to develop a custom metric. -* `UX `__ is a web-based system used to simplify Neural Compressor usage. -* `Intel oneAPI AI Analytics Toolkit Get Started Guide `__ explains the AI Kit components, installation and configuration guides, and instructions for building and running sample apps. -* `AI and Analytics Samples `__ includes code samples for Intel oneAPI libraries. - -.. toctree:: - :maxdepth: 1 - :hidden: - - transform.md - dataset.md - metric.md - ux.md - Intel oneAPI AI Analytics Toolkit Get Started Guide - AI and Analytics Samples - - -Deep Dive -========= - -* `Quantization `__ are processes that enable inference and training by performing computations at low-precision data types, such as fixed-point integers. Neural Compressor supports Post-Training Quantization (`PTQ `__) and Quantization-Aware Training (`QAT `__). Note that `Dynamic Quantization `__ currently has limited support. -* `Pruning `__ provides a common method for introducing sparsity in weights and activations. -* `Benchmarking `__ introduces how to utilize the benchmark interface of Neural Compressor. -* `Mixed precision `__ introduces how to enable mixed precision, including BFP16 and int8 and FP32, on Intel platforms during tuning. -* `Graph Optimization `__ introduces how to enable graph optimization for FP32 and auto-mixed precision. 
-* `Model Conversion ` introduces how to convert TensorFlow QAT model to quantized model running on Intel platforms. -* `TensorBoard `__ provides tensor histograms and execution graphs for tuning debugging purposes. - - -.. toctree:: - :maxdepth: 1 - :hidden: - - Quantization.md - PTQ.md - QAT.md - dynamic_quantization.md - pruning.md - benchmark.md - mixed_precision.md - graph_optimization.md - model_conversion.md - tensorboard.md - - -Advanced Topics -=============== - -* `Adaptor `__ is the interface between Neural Compressor and framework. The method to develop adaptor extension is introduced with ONNX Runtime as example. -* `Tuning strategies `__ can automatically optimized low-precision recipes for deep learning models to achieve optimal product objectives like inference performance and memory usage with expected accuracy criteria. The method to develop a new strategy is introduced. - - -.. toctree:: - :maxdepth: 1 - :hidden: - - adaptor.md - tuning_strategies.md diff --git a/docs/getting_started.md b/docs/getting_started.md deleted file mode 100644 index e320126de94..00000000000 --- a/docs/getting_started.md +++ /dev/null @@ -1,451 +0,0 @@ -Getting Started -=============== - -## Installation - -The Intel® Neural Compressor library is released as part of the -[Intel® oneAPI AI Analytics Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/ai-analytics-toolkit.html) (AI Kit). -The AI Kit provides a consolidated package of Intel's latest deep learning and -machine optimizations all in one place for ease of development. Along with -Neural Compressor, the AI Kit includes Intel-optimized versions of deep learning frameworks -(such as TensorFlow and PyTorch) and high-performing Python libraries to -streamline end-to-end data science and AI workflows on Intel architectures. - - -### Linux Installation - -You can install just the library from binary or source, or you can get -the Intel-optimized framework together with the library by installing the -Intel® oneAPI AI Analytics Toolkit. - -#### Install from binary - - ```Shell - # install from pip - pip install neural-compressor - - # install from conda - conda install neural-compressor -c conda-forge -c intel - ``` - -#### Install from source - - ```Shell - git clone https://github.com/intel/neural-compressor.git - cd neural-compressor - pip install -r requirements.txt - python setup.py install - ``` - -#### Install from AI Kit - -The AI Kit, which includes the -library, is distributed through many common channels, -including from Intel's website, YUM, APT, Anaconda, and more. -Select and [download](https://software.intel.com/content/www/us/en/develop/tools/oneapi/ai-analytics-toolkit/download.html) -the AI Kit distribution package that's best suited for you and follow the -[Get Started Guide](https://software.intel.com/content/www/us/en/develop/documentation/get-started-with-ai-linux/top.html) -for post-installation instructions. - -|[Download AI Kit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/ai-analytics-toolkit/) |[AI Kit Get Started Guide](https://software.intel.com/content/www/us/en/develop/documentation/get-started-with-ai-linux/top.html) | -|---|---| - -### Windows Installation - -**Prerequisites** - -The following prerequisites and requirements must be satisfied for a successful installation: - -- Python version: 3.6 or 3.7 or 3.8 or 3.9 - -- Download and install [anaconda](https://anaconda.org/). 
- -- Create a virtual environment named nc in anaconda: - - ```shell - # Here we install python 3.7 for instance. You can also choose python 3.6, 3.8, or 3.9. - conda create -n nc python=3.7 - conda activate nc - ``` - -#### Install from binary - - ```Shell - # install from pip - pip install neural-compressor - - # install from conda - conda install neural-compressor -c conda-forge -c intel - ``` - -#### Install from source - -```shell -git clone https://github.com/intel/neural-compressor.git -cd neural-compressor -pip install -r requirements.txt -python setup.py install -``` - -## Examples - -[Examples](examples_readme.md) are provided to demonstrate the usage of Intel® Neural Compressor in different frameworks: TensorFlow, PyTorch, MXNet, and ONNX Runtime. Hello World examples are also available. - -## Developer Documentation - -View Neural Compressor [Documentation](doclist.rst) for getting started, deep dive, and advanced resources to help you use and develop Neural Compressor. - -## System Requirements - -Intel® Neural Compressor supports systems based on [Intel 64 architecture or compatible processors](https://en.wikipedia.org/wiki/X86-64), specially optimized for the following CPUs: - -* Intel Xeon Scalable processor (formerly Skylake, Cascade Lake, Cooper Lake, and Icelake) -* future Intel Xeon Scalable processor (code name Sapphire Rapids) - -Intel® Neural Compressor requires installing the Intel-optimized framework version for the supported DL framework you use: TensorFlow, PyTorch, MXNet, or ONNX runtime. - -Note: Intel Neural Compressor supports Intel-optimized and official frameworks for some TensorFlow versions. Refer to [Supported Frameworks](../README.md#Supported-Frameworks) for specifics. - -### Validated Hardware/Software Environment - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| Platform | OS | Python | Framework | Version |
-|----------|----|--------|-----------|---------|
-| Cascade Lake, Cooper Lake, Skylake, Ice Lake | CentOS 8.3, Ubuntu 18.04 | 3.6, 3.7, 3.8, 3.9 | TensorFlow | 2.5.0, 2.4.0, 2.3.0, 2.2.0, 2.1.0, 1.15.0 UP1, 1.15.0 UP2, 1.15.0 UP3, 1.15.2 |
-| | | | PyTorch | 1.5.0+cpu, 1.6.0+cpu, 1.8.0+cpu, IPEX |
-| | | | MXNet | 1.7.0, 1.6.0 |
-| | | | ONNX Runtime | 1.6.0, 1.7.0, 1.8.0 |
- -## Validated Models - -Intel® Neural Compressor provides numerous examples to show promising accuracy loss with the best performance gain. A full quantized model list on various frameworks is available in the [Model List](validated_model_list.md). - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| Framework | Version | Model | Dataset | INT8 Tuning Accuracy | FP32 Accuracy Baseline | Acc Ratio [(INT8-FP32)/FP32] | Realtime Latency Ratio [FP32/INT8] |
-|---|---|---|---|---|---|---|---|
-| tensorflow | 2.4.0 | resnet50v1.5 | ImageNet | 76.70% | 76.50% | 0.26% | 3.23x |
-| tensorflow | 2.4.0 | Resnet101 | ImageNet | 77.20% | 76.40% | 1.05% | 2.42x |
-| tensorflow | 2.4.0 | inception_v1 | ImageNet | 70.10% | 69.70% | 0.57% | 1.88x |
-| tensorflow | 2.4.0 | inception_v2 | ImageNet | 74.10% | 74.00% | 0.14% | 1.96x |
-| tensorflow | 2.4.0 | inception_v3 | ImageNet | 77.20% | 76.70% | 0.65% | 2.36x |
-| tensorflow | 2.4.0 | inception_v4 | ImageNet | 80.00% | 80.30% | -0.37% | 2.59x |
-| tensorflow | 2.4.0 | inception_resnet_v2 | ImageNet | 80.10% | 80.40% | -0.37% | 1.97x |
-| tensorflow | 2.4.0 | Mobilenetv1 | ImageNet | 71.10% | 71.00% | 0.14% | 2.88x |
-| tensorflow | 2.4.0 | ssd_resnet50_v1 | Coco | 37.90% | 38.00% | -0.26% | 2.97x |
-| tensorflow | 2.4.0 | mask_rcnn_inception_v2 | Coco | 28.90% | 29.10% | -0.69% | 2.66x |
-| tensorflow | 2.4.0 | vgg16 | ImageNet | 72.50% | 70.90% | 2.26% | 3.75x |
-| tensorflow | 2.4.0 | vgg19 | ImageNet | 72.40% | 71.00% | 1.97% | 3.79x |
-| Framework | Version | Model | Dataset | INT8 Tuning Accuracy | FP32 Accuracy Baseline | Acc Ratio [(INT8-FP32)/FP32] | Realtime Latency Ratio [FP32/INT8] |
-|---|---|---|---|---|---|---|---|
-| pytorch | 1.5.0+cpu | resnet50 | ImageNet | 75.96% | 76.13% | -0.23% | 2.63x |
-| pytorch | 1.5.0+cpu | resnext101_32x8d | ImageNet | 79.12% | 79.31% | -0.24% | 2.61x |
-| pytorch | 1.6.0a0+24aac32 | bert_base_mrpc | MRPC | 88.90% | 88.73% | 0.19% | 1.98x |
-| pytorch | 1.6.0a0+24aac32 | bert_base_cola | COLA | 59.06% | 58.84% | 0.37% | 2.19x |
-| pytorch | 1.6.0a0+24aac32 | bert_base_sts-b | STS-B | 88.40% | 89.27% | -0.97% | 2.28x |
-| pytorch | 1.6.0a0+24aac32 | bert_base_sst-2 | SST-2 | 91.51% | 91.86% | -0.37% | 2.30x |
-| pytorch | 1.6.0a0+24aac32 | bert_base_rte | RTE | 69.31% | 69.68% | -0.52% | 2.15x |
-| pytorch | 1.6.0a0+24aac32 | bert_large_mrpc | MRPC | 87.45% | 88.33% | -0.99% | 2.73x |
-| pytorch | 1.6.0a0+24aac32 | bert_large_squad | SQUAD | 92.85% | 93.05% | -0.21% | 2.01x |
-| pytorch | 1.6.0a0+24aac32 | bert_large_qnli | QNLI | 91.20% | 91.82% | -0.68% | 2.69x |
diff --git a/make.bat b/docs/make.bat similarity index 95% rename from make.bat rename to docs/make.bat index 695a8b3ecfd..f9a02b02da3 100644 --- a/make.bat +++ b/docs/make.bat @@ -1,36 +1,36 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=_build -set SPHINXPROJ=ProjectnameIntelLowPrecisionOptimizationTool - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% - -:end -popd +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build +set SPHINXPROJ=ProjectnameIntelLowPrecisionOptimizationTool + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/reference_examples.md b/docs/reference_examples.md deleted file mode 100644 index 4fa1dc38a42..00000000000 --- a/docs/reference_examples.md +++ /dev/null @@ -1,149 +0,0 @@ -Reference Examples -=== -## Validated Models - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| Model | INT8 Accuracy | FP32 Accuracy | Acc Ratio [(INT8-FP32)/FP32] | Performance (INT8) | Performance (FP32) | Performance Ratio [INT8/FP32] |
-|---|---|---|---|---|---|---|
-| bert_large_squad_static | 90.78% | 90.87% | -0.11% | 49.08 | 13.48 | 3.64x |
-| bert_base_mrpc_static | 82.35% | 83.09% | -0.89% | 497.28 | 151.16 | 3.29x |
-| bert_base_nli_mean_tokens_stsb_static | 89.23% | 89.55% | -0.36% | 546.97 | 151.77 | 3.60x |
-| bert_base_sparse_mrpc_static | 70.59% | 70.59% | 0.00% | 551.90 | 153.80 | 3.59x |
-| bert_mini_mrpc_static | 78.19% | 78.68% | -0.62% | 6962.58 | 3252.14 | 2.14x |
-| bert_mini_sst2_static | 87.16% | 86.93% | 0.26% | 6850.38 | 3218.98 | 2.13x |
-| distilbert_base_uncased_sst2_static | 90.14% | 90.25% | -0.12% | 1086.13 | 306.45 | 3.54x |
-| distilbert_base_uncased_mrpc_static | 83.82% | 84.07% | -0.30% | 1091.99 | 303.92 | 3.59x |
-| distilbert_base_uncased_emotion_static | 93.90% | 94.20% | -0.32% | 1081.35 | 306.33 | 3.53x |
-| minilm_l6_h384_uncased_sst2_static | 89.33% | 90.14% | -0.90% | 2594.77 | 1083.84 | 2.39x |
-| roberta_base_mrpc_static | 88.24% | 88.97% | -0.82% | 508.14 | 153.37 | 3.31x |
-| distilroberta_base_wnli_static | 56.34% | 56.34% | 0.00% | 1097.22 | 315.94 | 3.47x |
-| paraphrase_xlm_r_multilingual_v1_stsb_static | 86.66% | 87.23% | -0.65% | 552.44 | 153.74 | 3.59x |
-| finbert_financial_phrasebank_static | 82.57% | 82.80% | -0.28% | 999.94 | 292.55 | 3.42x |
-Note: measured by batch size 1, 4 cores/instance, 10 instances on 1 socket of Intel Xeon Platinum 8380 Scalable processor diff --git a/docs/CODE_OF_CONDUCT.md b/docs/source/CODE_OF_CONDUCT.md similarity index 100% rename from docs/CODE_OF_CONDUCT.md rename to docs/source/CODE_OF_CONDUCT.md diff --git a/docs/FX.md b/docs/source/FX.md similarity index 100% rename from docs/FX.md rename to docs/source/FX.md diff --git a/docs/NAS.md b/docs/source/NAS.md similarity index 99% rename from docs/NAS.md rename to docs/source/NAS.md index a2eb0eb456b..98eac4d8217 100644 --- a/docs/NAS.md +++ b/docs/source/NAS.md @@ -136,7 +136,7 @@ Dynamic Neural Architecture Search (DyNAS) is a super-network-based NAS approach
The flow of the DyNAS approach is shown in the following figure. In the first phase of the search, a small population of sub-networks are randomly sampled from the super-network and evaluated (validation measurement) to provide the initial training set for the inner predictor loop. After the predictors are trained, a multi-objective evolutionary search is performed in the predictor objective space. After this extensive search is performed, the best performing sub-network configurations are selected to be the next iteration's validation population. The cycle continues until the search concludes when the user defined evaluation count is met.
-![DyNAS Workflow](./imgs/dynas.png) +![DyNAS Workflow](./_static/imgs/dynas.png)
This class is also registered to the Intel® Neural Compressor as a built-in NAS method through a decorator `nas_registry`, its interface is shown below. diff --git a/docs/PTQ.md b/docs/source/PTQ.md similarity index 100% rename from docs/PTQ.md rename to docs/source/PTQ.md diff --git a/docs/QAT.md b/docs/source/QAT.md similarity index 98% rename from docs/QAT.md rename to docs/source/QAT.md index 7bad1c0fcd0..e899f2157c1 100644 --- a/docs/QAT.md +++ b/docs/source/QAT.md @@ -4,7 +4,7 @@ Quantization-aware training (QAT) simulates low-precision inference-time computation in the forward pass of the training process. With QAT, all weights and activations are "fake quantized" during both the forward and backward passes of training: that is, float values are rounded to mimic int8 values, but all computations are still done with floating point numbers. Thus, all the weight adjustments during training are made while "aware" of the fact that the model will ultimately be quantized; after quantizing, therefore, this method will usually yield higher accuracy than either dynamic quantization or post-training static quantization. -fake quantize +fake quantize ## Usage diff --git a/docs/source/SECURITY.md b/docs/source/SECURITY.md new file mode 100644 index 00000000000..71a71eff1b6 --- /dev/null +++ b/docs/source/SECURITY.md @@ -0,0 +1,13 @@ +Security Policy +=============== + +## Report a Vulnerability + +Please report security issues or vulnerabilities to the [Intel® Security Center]. + +For more information on how Intel® works to resolve security issues, see +[Vulnerability Handling Guidelines]. + +[Intel® Security Center]:https://www.intel.com/security + +[Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html diff --git a/docs/source/Welcome.md b/docs/source/Welcome.md new file mode 100644 index 00000000000..35d9e3841a9 --- /dev/null +++ b/docs/source/Welcome.md @@ -0,0 +1,249 @@ +
+ +Intel® Neural Compressor +=========================== +

An open-source Python library supporting popular model compression techniques on all mainstream deep learning frameworks (TensorFlow, PyTorch, ONNX Runtime, and MXNet)

+ +[![python](https://img.shields.io/badge/python-3.7%2B-blue)](https://github.com/intel/neural-compressor) +[![version](https://img.shields.io/badge/release-1.14-green)](https://github.com/intel/neural-compressor/releases) +[![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/intel/neural-compressor/blob/master/LICENSE) +[![coverage](https://img.shields.io/badge/coverage-90%25-green)](https://github.com/intel/neural-compressor) +[![Downloads](https://static.pepy.tech/personalized-badge/neural-compressor?period=total&units=international_system&left_color=grey&right_color=green&left_text=downloads)](https://pepy.tech/project/neural-compressor) +
+ +--- +
+ +Intel® Neural Compressor, formerly known as Intel® Low Precision Optimization Tool, is an open-source Python library that runs on Intel CPUs and GPUs, which delivers unified interfaces across multiple deep-learning frameworks for popular network compression technologies such as quantization, pruning, and knowledge distillation. This tool supports automatic accuracy-driven tuning strategies to help the user quickly find out the best quantized model. It also implements different weight-pruning algorithms to generate a pruned model with predefined sparsity goal. It also supports knowledge distillation to distill the knowledge from the teacher model to the student model. +Intel® Neural Compressor is a critical AI software component in the [Intel® oneAPI AI Analytics Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/ai-analytics-toolkit.html). + + +**Visit the Intel® Neural Compressor online document website at: .** + +## Installation + +### Prerequisites + +Python version: 3.7, 3.8, 3.9, 3.10 + +### Install on Linux +- Release binary install + ```Shell + # install stable basic version from pip + pip install neural-compressor + # Or install stable full version from pip (including GUI) + pip install neural-compressor-full + ``` +- Nightly binary install + ```Shell + git clone https://github.com/intel/neural-compressor.git + cd neural-compressor + pip install -r requirements.txt + # install nightly basic version from pip + pip install -i https://test.pypi.org/simple/ neural-compressor + # Or install nightly full version from pip (including GUI) + pip install -i https://test.pypi.org/simple/ neural-compressor-full + ``` +More installation methods can be found at [Installation Guide](./installation_guide.html). Please check out our [FAQ](./faq.html) for more details. 
+ +## Getting Started +### Quantization with Python API + +```shell +# A TensorFlow Example +pip install tensorflow +# Prepare fp32 model +wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_6/mobilenet_v1_1.0_224_frozen.pb +``` +```python +import tensorflow as tf +from neural_compressor.experimental import Quantization, common +quantizer = Quantization() +quantizer.model = './mobilenet_v1_1.0_224_frozen.pb' +dataset = quantizer.dataset('dummy', shape=(1, 224, 224, 3)) +quantizer.calib_dataloader = common.DataLoader(dataset) +quantizer.fit() +``` +### Quantization with [JupyterLab Extension](./neural_coder/extensions/neural_compressor_ext_lab/README.html) +Search for ```jupyter-lab-neural-compressor``` in the Extension Manager in JupyterLab and install with one click: + + + Extension + + +### Quantization with [GUI](./bench.html) +```shell +# An ONNX Example +pip install onnx==1.12.0 onnxruntime==1.12.1 onnxruntime-extensions +# Prepare fp32 model +wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v1-12.onnx +# Start GUI +inc_bench +``` + + Architecture + + +## System Requirements + +### Validated Hardware Environment +#### Intel® Neural Compressor supports CPUs based on [Intel 64 architecture or compatible processors](https://en.wikipedia.org/wiki/X86-64): + +* Intel Xeon Scalable processor (formerly Skylake, Cascade Lake, Cooper Lake, and Icelake) +* Future Intel Xeon Scalable processor (code name Sapphire Rapids) + +#### Intel® Neural Compressor supports GPUs built on Intel's Xe architecture: + +* [Intel® Data Center GPU Flex Series](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/data-center-gpu/flex-series/overview.html) + +#### Intel® Neural Compressor quantized ONNX models support multiple hardware vendors through ONNX Runtime: + +* Intel CPU, AMD/ARM CPU, and NVidia GPU. Please refer to the validated model [list](./validated_model_list.html#Validated-ONNX-QDQ-INT8-models-on-multiple-hardware-through-ONNX-Runtime). + +### Validated Software Environment + +* OS version: CentOS 8.4, Ubuntu 20.04 +* Python version: 3.7, 3.8, 3.9, 3.10 + + + + + + + + + + + + + + + + + + + + + + +
+| Framework | TensorFlow | Intel TensorFlow | PyTorch | Intel® Extension for PyTorch* | ONNX Runtime | MXNet |
+|-----------|------------|------------------|---------|-------------------------------|--------------|-------|
+| Version   | 2.10.0<br>2.9.1<br>2.8.2 | 2.10.0<br>2.9.1<br>2.8.0 | 1.12.1+cpu<br>1.11.0+cpu<br>1.10.0+cpu | 1.12.0<br>1.11.0<br>1.10.0 | 1.12.1<br>1.11.0<br>1.10.0 | 1.8.0<br>1.7.0<br>1.6.0 |
+
+> **Note:**
+> Set the environment variable ``TF_ENABLE_ONEDNN_OPTS=1`` to enable oneDNN optimizations if you are using TensorFlow v2.6 to v2.8. oneDNN is the default for TensorFlow v2.9.
+
+### Validated Models
+Intel® Neural Compressor has validated 420+ [examples](./examples) for quantization with a performance speedup geomean of 2.2x and up to 4.2x on VNNI while minimizing accuracy loss. Over 30 pruning and knowledge distillation samples are also available. More details on validated models are available [here](./validated_model_list.html).
+
+## Documentation
+
+| Section | Topics |
+|---------|--------|
+| Overview | Architecture, Examples, GUI, APIs, Intel oneAPI AI Analytics Toolkit, AI and Analytics Samples |
+| Basic API | Transform, Dataset, Metric, Objective |
+| Deep Dive | Quantization, Pruning (Sparsity), Knowledge Distillation, Mixed Precision, Orchestration, Benchmarking, Distributed Training, Model Conversion, TensorBoard, Distillation for Quantization, Neural Coder |
+| Advanced Topics | Adaptor, Strategy |
+ +## Selected Publications/Events +* [Neural Compressor: an open-source Python library for network compression](https://cloud.tencent.com/developer/article/2165895) (Nov 2022) +* [Running Fast Transformers on CPUs: Intel Approach Achieves Significant Speed Ups and SOTA Performance](https://medium.com/syncedreview/running-fast-transformers-on-cpus-intel-approach-achieves-significant-speed-ups-and-sota-448521704c5e) (Nov 2022) +* [Personalized Stable Diffusion with Few-Shot Fine-Tuning](https://medium.com/intel-analytics-software/personalized-stable-diffusion-with-few-shot-fine-tuning-on-a-single-cpu-f01a3316b13) (Nov 2022) +* [Meet the Innovation of Intel AI Software: Intel® Extension for TensorFlow*](https://www.intel.com/content/www/us/en/developer/articles/technical/innovation-of-ai-software-extension-tensorflow.html) (Oct 2022) +* [PyTorch* Inference Acceleration with Intel® Neural Compressor](https://www.intel.com/content/www/us/en/developer/articles/technical/pytorch-inference-with-intel-neural-compressor.html#gs.gnq0cj) (Oct 2022) +* Neural Coder, a new plug-in for Intel Neural Compressor was covered by [Twitter](https://twitter.com/IntelDevTools/status/1583629213697212416), [LinkedIn](https://www.linkedin.com/posts/intel-software_oneapi-ai-deeplearning-activity-6989377309917007872-Dbzg?utm_source=share&utm_medium=member_desktop), and [Intel Developer Zone](https://mp.weixin.qq.com/s/LL-4eD-R0YagFgODM23oQA) from Intel, and [Twitter](https://twitter.com/IntelDevTools/status/1583629213697212416/retweets) and [LinkedIn](https://www.linkedin.com/feed/update/urn:li:share:6990377841435574272/) from Hugging Face. (Oct 2022) +* Intel Neural Compressor successfully landed on [GCP](https://console.cloud.google.com/marketplace/product/bitnami-launchpad/inc-tensorflow-intel?project=verdant-sensor-286207), [AWS](https://aws.amazon.com/marketplace/pp/prodview-yjyh2xmggbmga#pdp-support), and [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/bitnami.inc-tensorflow-intel) marketplace. (Oct 2022) + +> View our [full publication list](./publication_list.html). + +## Additional Content + +* [Release Information](./releases_info.html) +* [Contribution Guidelines](./contributions.html) +* [Legal Information](./legal_information.html) +* [Security Policy](SECURITY.html) +* [Intel® Neural Compressor Website](https://intel.github.io/neural-compressor) + +## Hiring + +We are actively hiring. Send your resume to inc.maintainers@intel.com if you are interested in model compression techniques. 
diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css new file mode 100644 index 00000000000..d802267a2cf --- /dev/null +++ b/docs/source/_static/custom.css @@ -0,0 +1,201 @@ +.site-footer { + display: none; +} + +.container { + padding-left: 0px; +} +.header-holder .main-menu { + left: 65%; +} + +.header-holder .container { + padding-left: 0px; +} + +.headTitleStyle { + font-size: 1.3rem; + font-family: auto; + color: #eee; +} + +.headTitleStyle:before{ + content: "🏠"; +} + +.header-container { + display: flex; +} + +.iconStyle { + width: 100%; +} + +.docs-tutorials-resources { + display: none; +} + + +.navbar-logo { + background: #0068b5; + line-height: 1; + width: 25.7%; + padding: 0; + margin-right: 12px; + position: relative; + display: flex; + justify-content: center; + align-items: center; + height: 70px; +} + +@media screen and (min-width: 1600px) { + .navbar-logo { + width: 26.2rem; + } + } + + .pytorch-page-level-bar { + width: 100%; + } + + .pytorch-content-left { + width: 100%; +} + + +article.pytorch-article h2 { + margin-top: 0; +} + +article.pytorch-article ul li { + line-height: 1.75rem; + margin-bottom: 0; + font-family: FreightSans, Helvetica Neue, Helvetica, Arial, sans-serif; +} + +ul.pytorch-breadcrumbs a, p a:link, p a:visited, p a:hover, +.pytorch-left-menu li.toctree-l1.current > a, .pytorch-right-menu li.toctree-l1.current > a, +.pytorch-left-menu li.toctree-l1.current > a:before, .pytorch-right-menu li.toctree-l1.current > a:before, +.header-holder .main-menu ul li a:hover, +a:link, a:visited, a:hover { + color: #0068b5; +} + +article.pytorch-article .class dt { + border-left: none; + border-top: 3px solid #0068b5; + padding-right: 1.25rem; +} + +article.pytorch-article .class .method dt, article.pytorch-article .class .staticmethod dt { + border-left: 3px solid #0068b5; + border-top: none; +} + + +article.pytorch-article .function dt, article.pytorch-article .attribute dt, article.pytorch-article .class .attribute dt { + border-left: 3px solid #0068b5; +} + +article.pytorch-article table tbody td { + color: #6c6c6d; + white-space: normal; + padding: 0.9375rem; + font-size: 1rem; + line-height: 1.375rem; +} + +article.pytorch-article table tr td:first-of-type { + width: 30%; +} + +article.pytorch-article dl { + margin-bottom: 0rem; +} +ol, ul, dl { + margin-top: revert; + margin-bottom: 0rem; +} + +article.pytorch-article .class dd { + padding-left: 3.75rem; +} + +article.pytorch-article .class dd p { + color: #262626; +} + +article.pytorch-article table tr td:nth-of-type(2) { + text-align: left; +} + +article.pytorch-article table tbody td .pre { + color: #0068b5; + font-size: 1rem; +} + +article.pytorch-article table tr td > p { + margin-bottom: 0rem; +} + +article.pytorch-article p, article.pytorch-article ol li, article.pytorch-article dl dt, article.pytorch-article dl dd, article.pytorch-article blockquote { + font-size: 1rem; + line-height: 1.375rem; + color: #262626; + letter-spacing: 0.01px; + font-weight: 500; +} + +p { + line-height: 24px; + font-size: 16px; + margin: 0 0 24px; +} + +.rst-content p a { + overflow-wrap: anywhere; +} + +.rst-content img { + max-width: 100%; + height: auto; +} + +article.pytorch-article blockquote { + width: 100%; +} + +article.pytorch-article img { + width: auto; +} + +article.pytorch-article hr { + margin: 0px; +} +article.pytorch-article table th, article.pytorch-article table tr, +article.pytorch-article table tr th:first-of-type, article.pytorch-article table tr td:first-of-type { + border: 1px 
solid #eee; + text-align: center; +} + +article.pytorch-article table td { + border: 1px solid #eee; +} + +h1 { + margin-bottom: 1.375rem; + font-size: 1.8rem; +} + +article.pytorch-article table tbody .row-odd { + background-color: #f3f4f7; +} + +article.pytorch-article { + margin: 0; +} + +#validated-mlperf-models table tr td { + text-align: center; +} \ No newline at end of file diff --git a/docs/imgs/Distillation_workflow.png b/docs/source/_static/imgs/Distillation_workflow.png similarity index 100% rename from docs/imgs/Distillation_workflow.png rename to docs/source/_static/imgs/Distillation_workflow.png diff --git a/docs/imgs/INC_GUI.gif b/docs/source/_static/imgs/INC_GUI.gif similarity index 100% rename from docs/imgs/INC_GUI.gif rename to docs/source/_static/imgs/INC_GUI.gif diff --git a/docs/imgs/accuracy_aware_tuning_flow.png b/docs/source/_static/imgs/accuracy_aware_tuning_flow.png similarity index 100% rename from docs/imgs/accuracy_aware_tuning_flow.png rename to docs/source/_static/imgs/accuracy_aware_tuning_flow.png diff --git a/docs/imgs/architecture.png b/docs/source/_static/imgs/architecture.png similarity index 100% rename from docs/imgs/architecture.png rename to docs/source/_static/imgs/architecture.png diff --git a/docs/imgs/bench/benchmark-details.png b/docs/source/_static/imgs/bench/benchmark-details.png similarity index 100% rename from docs/imgs/bench/benchmark-details.png rename to docs/source/_static/imgs/bench/benchmark-details.png diff --git a/docs/imgs/bench/benchmark-edit-wizard.png b/docs/source/_static/imgs/bench/benchmark-edit-wizard.png similarity index 100% rename from docs/imgs/bench/benchmark-edit-wizard.png rename to docs/source/_static/imgs/bench/benchmark-edit-wizard.png diff --git a/docs/imgs/bench/benchmarks-table.png b/docs/source/_static/imgs/bench/benchmarks-table.png similarity index 100% rename from docs/imgs/bench/benchmarks-table.png rename to docs/source/_static/imgs/bench/benchmarks-table.png diff --git a/docs/imgs/bench/benchmarks-wizard.png b/docs/source/_static/imgs/bench/benchmarks-wizard.png similarity index 100% rename from docs/imgs/bench/benchmarks-wizard.png rename to docs/source/_static/imgs/bench/benchmarks-wizard.png diff --git a/docs/imgs/bench/custom-dataset.png b/docs/source/_static/imgs/bench/custom-dataset.png similarity index 100% rename from docs/imgs/bench/custom-dataset.png rename to docs/source/_static/imgs/bench/custom-dataset.png diff --git a/docs/imgs/bench/dataset-details.png b/docs/source/_static/imgs/bench/dataset-details.png similarity index 100% rename from docs/imgs/bench/dataset-details.png rename to docs/source/_static/imgs/bench/dataset-details.png diff --git a/docs/imgs/bench/datasets-table.png b/docs/source/_static/imgs/bench/datasets-table.png similarity index 100% rename from docs/imgs/bench/datasets-table.png rename to docs/source/_static/imgs/bench/datasets-table.png diff --git a/docs/imgs/bench/datasets-wizard.png b/docs/source/_static/imgs/bench/datasets-wizard.png similarity index 100% rename from docs/imgs/bench/datasets-wizard.png rename to docs/source/_static/imgs/bench/datasets-wizard.png diff --git a/docs/imgs/bench/diagnosis-actions.png b/docs/source/_static/imgs/bench/diagnosis-actions.png similarity index 100% rename from docs/imgs/bench/diagnosis-actions.png rename to docs/source/_static/imgs/bench/diagnosis-actions.png diff --git a/docs/imgs/bench/diagnosis-model-wise-wizard.png b/docs/source/_static/imgs/bench/diagnosis-model-wise-wizard.png similarity index 100% rename from 
docs/imgs/bench/diagnosis-model-wise-wizard.png rename to docs/source/_static/imgs/bench/diagnosis-model-wise-wizard.png diff --git a/docs/imgs/bench/diagnosis-tab.png b/docs/source/_static/imgs/bench/diagnosis-tab.png similarity index 100% rename from docs/imgs/bench/diagnosis-tab.png rename to docs/source/_static/imgs/bench/diagnosis-tab.png diff --git a/docs/imgs/bench/graph_bert.png b/docs/source/_static/imgs/bench/graph_bert.png similarity index 100% rename from docs/imgs/bench/graph_bert.png rename to docs/source/_static/imgs/bench/graph_bert.png diff --git a/docs/imgs/bench/home.png b/docs/source/_static/imgs/bench/home.png similarity index 100% rename from docs/imgs/bench/home.png rename to docs/source/_static/imgs/bench/home.png diff --git a/docs/imgs/bench/menu.png b/docs/source/_static/imgs/bench/menu.png similarity index 100% rename from docs/imgs/bench/menu.png rename to docs/source/_static/imgs/bench/menu.png diff --git a/docs/imgs/bench/optimization-details.png b/docs/source/_static/imgs/bench/optimization-details.png similarity index 100% rename from docs/imgs/bench/optimization-details.png rename to docs/source/_static/imgs/bench/optimization-details.png diff --git a/docs/imgs/bench/optimization-tuning-details.png b/docs/source/_static/imgs/bench/optimization-tuning-details.png similarity index 100% rename from docs/imgs/bench/optimization-tuning-details.png rename to docs/source/_static/imgs/bench/optimization-tuning-details.png diff --git a/docs/imgs/bench/optimizations-table.png b/docs/source/_static/imgs/bench/optimizations-table.png similarity index 100% rename from docs/imgs/bench/optimizations-table.png rename to docs/source/_static/imgs/bench/optimizations-table.png diff --git a/docs/imgs/bench/optimizations-wizard.png b/docs/source/_static/imgs/bench/optimizations-wizard.png similarity index 100% rename from docs/imgs/bench/optimizations-wizard.png rename to docs/source/_static/imgs/bench/optimizations-wizard.png diff --git a/docs/imgs/bench/profiling-details.png b/docs/source/_static/imgs/bench/profiling-details.png similarity index 100% rename from docs/imgs/bench/profiling-details.png rename to docs/source/_static/imgs/bench/profiling-details.png diff --git a/docs/imgs/bench/profiling-edit-wizard.png b/docs/source/_static/imgs/bench/profiling-edit-wizard.png similarity index 100% rename from docs/imgs/bench/profiling-edit-wizard.png rename to docs/source/_static/imgs/bench/profiling-edit-wizard.png diff --git a/docs/imgs/bench/profiling-table.png b/docs/source/_static/imgs/bench/profiling-table.png similarity index 100% rename from docs/imgs/bench/profiling-table.png rename to docs/source/_static/imgs/bench/profiling-table.png diff --git a/docs/imgs/bench/profiling-wizard.png b/docs/source/_static/imgs/bench/profiling-wizard.png similarity index 100% rename from docs/imgs/bench/profiling-wizard.png rename to docs/source/_static/imgs/bench/profiling-wizard.png diff --git a/docs/imgs/bench/project-info.png b/docs/source/_static/imgs/bench/project-info.png similarity index 100% rename from docs/imgs/bench/project-info.png rename to docs/source/_static/imgs/bench/project-info.png diff --git a/docs/imgs/bench/project1.png b/docs/source/_static/imgs/bench/project1.png similarity index 100% rename from docs/imgs/bench/project1.png rename to docs/source/_static/imgs/bench/project1.png diff --git a/docs/imgs/bench/project2.png b/docs/source/_static/imgs/bench/project2.png similarity index 100% rename from docs/imgs/bench/project2.png rename to 
docs/source/_static/imgs/bench/project2.png diff --git a/docs/imgs/bench/project3-custom.png b/docs/source/_static/imgs/bench/project3-custom.png similarity index 100% rename from docs/imgs/bench/project3-custom.png rename to docs/source/_static/imgs/bench/project3-custom.png diff --git a/docs/imgs/bench/project3-predefined.png b/docs/source/_static/imgs/bench/project3-predefined.png similarity index 100% rename from docs/imgs/bench/project3-predefined.png rename to docs/source/_static/imgs/bench/project3-predefined.png diff --git a/docs/imgs/bench/remove1.png b/docs/source/_static/imgs/bench/remove1.png similarity index 100% rename from docs/imgs/bench/remove1.png rename to docs/source/_static/imgs/bench/remove1.png diff --git a/docs/imgs/bench/remove2.png b/docs/source/_static/imgs/bench/remove2.png similarity index 100% rename from docs/imgs/bench/remove2.png rename to docs/source/_static/imgs/bench/remove2.png diff --git a/docs/imgs/bench/see_models.png b/docs/source/_static/imgs/bench/see_models.png similarity index 100% rename from docs/imgs/bench/see_models.png rename to docs/source/_static/imgs/bench/see_models.png diff --git a/docs/imgs/bench/show_graph_button.png b/docs/source/_static/imgs/bench/show_graph_button.png similarity index 100% rename from docs/imgs/bench/show_graph_button.png rename to docs/source/_static/imgs/bench/show_graph_button.png diff --git a/docs/imgs/bench/system_info.png b/docs/source/_static/imgs/bench/system_info.png similarity index 100% rename from docs/imgs/bench/system_info.png rename to docs/source/_static/imgs/bench/system_info.png diff --git a/docs/imgs/bench/system_info_table.png b/docs/source/_static/imgs/bench/system_info_table.png similarity index 100% rename from docs/imgs/bench/system_info_table.png rename to docs/source/_static/imgs/bench/system_info_table.png diff --git a/docs/imgs/bf16_convert_pt.png b/docs/source/_static/imgs/bf16_convert_pt.png similarity index 100% rename from docs/imgs/bf16_convert_pt.png rename to docs/source/_static/imgs/bf16_convert_pt.png diff --git a/docs/imgs/bf16_convert_tf.png b/docs/source/_static/imgs/bf16_convert_tf.png similarity index 100% rename from docs/imgs/bf16_convert_tf.png rename to docs/source/_static/imgs/bf16_convert_tf.png diff --git a/docs/source/_static/imgs/common/code.svg b/docs/source/_static/imgs/common/code.svg new file mode 100644 index 00000000000..0fb7f50cd53 --- /dev/null +++ b/docs/source/_static/imgs/common/code.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/source/_static/imgs/common/intel.svg b/docs/source/_static/imgs/common/intel.svg new file mode 100644 index 00000000000..b0265ac5dd8 --- /dev/null +++ b/docs/source/_static/imgs/common/intel.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/source/_static/imgs/common/right.svg b/docs/source/_static/imgs/common/right.svg new file mode 100644 index 00000000000..2ec0a64127a --- /dev/null +++ b/docs/source/_static/imgs/common/right.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/imgs/data_format.png b/docs/source/_static/imgs/data_format.png similarity index 100% rename from docs/imgs/data_format.png rename to docs/source/_static/imgs/data_format.png diff --git a/docs/imgs/dynas.png b/docs/source/_static/imgs/dynas.png similarity index 100% rename from docs/imgs/dynas.png rename to docs/source/_static/imgs/dynas.png diff --git a/docs/imgs/fake_quant.png b/docs/source/_static/imgs/fake_quant.png similarity index 100% rename from docs/imgs/fake_quant.png rename to 
docs/source/_static/imgs/fake_quant.png diff --git a/docs/imgs/metric.jpg b/docs/source/_static/imgs/metric.jpg similarity index 100% rename from docs/imgs/metric.jpg rename to docs/source/_static/imgs/metric.jpg diff --git a/docs/imgs/model.png b/docs/source/_static/imgs/model.png similarity index 100% rename from docs/imgs/model.png rename to docs/source/_static/imgs/model.png diff --git a/docs/imgs/pruning/2in4_sparsity_demo.png b/docs/source/_static/imgs/pruning/2in4_sparsity_demo.png similarity index 100% rename from docs/imgs/pruning/2in4_sparsity_demo.png rename to docs/source/_static/imgs/pruning/2in4_sparsity_demo.png diff --git a/docs/imgs/pruning/pruning_intro.png b/docs/source/_static/imgs/pruning/pruning_intro.png similarity index 100% rename from docs/imgs/pruning/pruning_intro.png rename to docs/source/_static/imgs/pruning/pruning_intro.png diff --git a/docs/imgs/pruning/pruning_patterns.png b/docs/source/_static/imgs/pruning/pruning_patterns.png similarity index 100% rename from docs/imgs/pruning/pruning_patterns.png rename to docs/source/_static/imgs/pruning/pruning_patterns.png diff --git a/docs/imgs/pruning/sparse_dim.png b/docs/source/_static/imgs/pruning/sparse_dim.png similarity index 100% rename from docs/imgs/pruning/sparse_dim.png rename to docs/source/_static/imgs/pruning/sparse_dim.png diff --git a/docs/imgs/pruning/train_for_sparsity.png b/docs/source/_static/imgs/pruning/train_for_sparsity.png similarity index 100% rename from docs/imgs/pruning/train_for_sparsity.png rename to docs/source/_static/imgs/pruning/train_for_sparsity.png diff --git a/docs/imgs/release_data.png b/docs/source/_static/imgs/release_data.png similarity index 100% rename from docs/imgs/release_data.png rename to docs/source/_static/imgs/release_data.png diff --git a/docs/imgs/self-distillation.png b/docs/source/_static/imgs/self-distillation.png similarity index 100% rename from docs/imgs/self-distillation.png rename to docs/source/_static/imgs/self-distillation.png diff --git a/docs/imgs/strategy.png b/docs/source/_static/imgs/strategy.png similarity index 100% rename from docs/imgs/strategy.png rename to docs/source/_static/imgs/strategy.png diff --git a/docs/imgs/tensorboard_baseline_v0_cg_conv0.png b/docs/source/_static/imgs/tensorboard_baseline_v0_cg_conv0.png similarity index 100% rename from docs/imgs/tensorboard_baseline_v0_cg_conv0.png rename to docs/source/_static/imgs/tensorboard_baseline_v0_cg_conv0.png diff --git a/docs/imgs/tensorboard_tune_1_v0_cg_conv0.png b/docs/source/_static/imgs/tensorboard_tune_1_v0_cg_conv0.png similarity index 100% rename from docs/imgs/tensorboard_tune_1_v0_cg_conv0.png rename to docs/source/_static/imgs/tensorboard_tune_1_v0_cg_conv0.png diff --git a/docs/imgs/tensorboard_v0_cg_conv0_histogram.png b/docs/source/_static/imgs/tensorboard_v0_cg_conv0_histogram.png similarity index 100% rename from docs/imgs/tensorboard_v0_cg_conv0_histogram.png rename to docs/source/_static/imgs/tensorboard_v0_cg_conv0_histogram.png diff --git a/docs/imgs/tutorial.png b/docs/source/_static/imgs/tutorial.png similarity index 100% rename from docs/imgs/tutorial.png rename to docs/source/_static/imgs/tutorial.png diff --git a/docs/imgs/workflow.png b/docs/source/_static/imgs/workflow.png similarity index 100% rename from docs/imgs/workflow.png rename to docs/source/_static/imgs/workflow.png diff --git a/docs/source/_static/index.html b/docs/source/_static/index.html new file mode 100644 index 00000000000..eb630c63d1f --- /dev/null +++ b/docs/source/_static/index.html @@ 
-0,0 +1 @@ + \ No newline at end of file diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html new file mode 100644 index 00000000000..8154cad6356 --- /dev/null +++ b/docs/source/_templates/layout.html @@ -0,0 +1,76 @@ +{% extends "!layout.html" %} + + + +{% block extrabody %} +
+ +
+{% endblock %} + +{% block menu %} + +{{ super() }} +{% endblock %} + +{% block sidebartitle %} + +{% endblock %} + + +{%- block footer %} + + + + + +{% endblock %} + + + + diff --git a/docs/adaptor.md b/docs/source/adaptor.md similarity index 100% rename from docs/adaptor.md rename to docs/source/adaptor.md diff --git a/docs/source/api-documentation/adaptor.rst b/docs/source/api-documentation/adaptor.rst new file mode 100644 index 00000000000..83dc04c0181 --- /dev/null +++ b/docs/source/api-documentation/adaptor.rst @@ -0,0 +1,9 @@ +Adaptor +########### + +The adaptor API information is available: + +.. toctree:: + :maxdepth: 1 + + diff --git a/api-documentation/apis.rst b/docs/source/api-documentation/apis.rst old mode 100755 new mode 100644 similarity index 59% rename from api-documentation/apis.rst rename to docs/source/api-documentation/apis.rst index a9bac5c00e4..921dc1beb34 --- a/api-documentation/apis.rst +++ b/docs/source/api-documentation/apis.rst @@ -6,5 +6,8 @@ The following API information is available: .. toctree:: :maxdepth: 1 - api-reference - ../docs/api-introduction.md \ No newline at end of file + component + common + strategy + adaptor + pythonic diff --git a/docs/source/api-documentation/common.rst b/docs/source/api-documentation/common.rst new file mode 100644 index 00000000000..285c9136026 --- /dev/null +++ b/docs/source/api-documentation/common.rst @@ -0,0 +1,15 @@ +Common +########### + +The common API information is available: + +.. toctree:: + :maxdepth: 1 + + common/data + common/metric + common/model + common/criterion + common/benchmark + common/optimizer + diff --git a/docs/source/api-documentation/common/benchmark.rst b/docs/source/api-documentation/common/benchmark.rst new file mode 100644 index 00000000000..0af068b64ba --- /dev/null +++ b/docs/source/api-documentation/common/benchmark.rst @@ -0,0 +1,7 @@ +Benchmark +============== + +.. autoapisummary:: + + neural_compressor.experimental.benchmark + diff --git a/docs/source/api-documentation/common/bleu.rst b/docs/source/api-documentation/common/bleu.rst new file mode 100644 index 00000000000..7b4c04997e4 --- /dev/null +++ b/docs/source/api-documentation/common/bleu.rst @@ -0,0 +1,61 @@ +BLEU +==================================================== + +.. py:module:: neural_compressor.experimental.metric.bleu + + +Module Contents +--------------- + +Classes +~~~~~~~ + +.. autoapisummary:: + + neural_compressor.experimental.metric.bleu.BLEU + +.. py:class:: BLEU + + Bases: :py:obj:`object` + + Computes the BLEU (Bilingual Evaluation Understudy) score. + + BLEU is an algorithm for evaluating the quality of text which has + been machine-translated from one natural language to another. + This implementent approximate the BLEU score since we do not + glue word pieces or decode the ids and tokenize the output. + By default, we use ngram order of 4 and use brevity penalty. + Also, this does not have beam search. + + .. attribute:: predictions + + List of translations to score. + + .. attribute:: labels + + List of the reference corresponding to the prediction result. + + .. py:method:: reset() -> None + + Clear the predictions and labels in the cache. + + + .. py:method:: update(prediction: Sequence[str], label: Sequence[str]) -> None + + Add the prediction and label. + + :param prediction: The prediction result. + :param label: The reference corresponding to the prediction result. + + :raises ValueError: An error occurred when the length of the prediction + :raises and label are different.: + + + .. 
py:method:: result() -> float + + Compute the BLEU score. + + :returns: The approximate BLEU score. + :rtype: bleu_score + + diff --git a/docs/source/api-documentation/common/criterion.rst b/docs/source/api-documentation/common/criterion.rst new file mode 100644 index 00000000000..64d5c053dc6 --- /dev/null +++ b/docs/source/api-documentation/common/criterion.rst @@ -0,0 +1,5 @@ +Criterion +============== + +.. autoapisummary:: + neural_compressor.experimental.common.criterion diff --git a/docs/source/api-documentation/common/data.rst b/docs/source/api-documentation/common/data.rst new file mode 100644 index 00000000000..9f045f42a5c --- /dev/null +++ b/docs/source/api-documentation/common/data.rst @@ -0,0 +1,11 @@ +Data +########### + +The data API information is available: + +.. toctree:: + :maxdepth: 1 + + data/datasets.rst + data/dataloader.rst + data/transforms.rst \ No newline at end of file diff --git a/docs/source/api-documentation/common/data/dataloader.rst b/docs/source/api-documentation/common/data/dataloader.rst new file mode 100644 index 00000000000..38809581d3a --- /dev/null +++ b/docs/source/api-documentation/common/data/dataloader.rst @@ -0,0 +1,19 @@ +Dataloader +============== + +BaseDataLoader +--------------- + +.. autoapisummary:: + + neural_compressor.experimental.data.dataloaders.base_dataloader + +dataloaders +------------ +.. autoapisummary:: + + neural_compressor.experimental.data.dataloaders.default_dataloader + neural_compressor.experimental.data.dataloaders.mxnet_dataloader + neural_compressor.experimental.data.dataloaders.onnxrt_dataloader + neural_compressor.experimental.data.dataloaders.pytorch_dataloader + neural_compressor.experimental.data.dataloaders.tensorflow_dataloader \ No newline at end of file diff --git a/docs/source/api-documentation/common/data/datasets.rst b/docs/source/api-documentation/common/data/datasets.rst new file mode 100644 index 00000000000..f86f41c0040 --- /dev/null +++ b/docs/source/api-documentation/common/data/datasets.rst @@ -0,0 +1,12 @@ +Datasets +============== + +.. autoapisummary:: + + neural_compressor.experimental.data.datasets.bert_dataset + neural_compressor.experimental.data.datasets.coco_dataset + neural_compressor.experimental.data.datasets.dataset + neural_compressor.experimental.data.datasets.dummy_dataset + neural_compressor.experimental.data.datasets.imagenet_dataset + neural_compressor.experimental.data.datasets.dummy_dataset_v2 + neural_compressor.experimental.data.datasets.style_transfer_dataset \ No newline at end of file diff --git a/docs/source/api-documentation/common/data/transforms.rst b/docs/source/api-documentation/common/data/transforms.rst new file mode 100644 index 00000000000..d9e63e1fc3e --- /dev/null +++ b/docs/source/api-documentation/common/data/transforms.rst @@ -0,0 +1,6 @@ +Transforms +============== + +.. autoapisummary:: + + neural_compressor.experimental.data.transforms.transform \ No newline at end of file diff --git a/docs/source/api-documentation/common/metric.rst b/docs/source/api-documentation/common/metric.rst new file mode 100644 index 00000000000..9349942d733 --- /dev/null +++ b/docs/source/api-documentation/common/metric.rst @@ -0,0 +1,6 @@ +Metric +============== + +.. 
autoapisummary:: + neural_compressor.experimental.metric.metric + neural_compressor.experimental.metric.bleu \ No newline at end of file diff --git a/docs/source/api-documentation/common/model.rst b/docs/source/api-documentation/common/model.rst new file mode 100644 index 00000000000..b632b177c65 --- /dev/null +++ b/docs/source/api-documentation/common/model.rst @@ -0,0 +1,6 @@ +Model +============== + +.. autoapisummary:: + + neural_compressor.experimental.common.model diff --git a/docs/source/api-documentation/common/optimizer.rst b/docs/source/api-documentation/common/optimizer.rst new file mode 100644 index 00000000000..e714f856f7e --- /dev/null +++ b/docs/source/api-documentation/common/optimizer.rst @@ -0,0 +1,5 @@ +Optimizer +============== + +.. autoapisummary:: + neural_compressor.experimental.common.optimizer diff --git a/docs/source/api-documentation/component.rst b/docs/source/api-documentation/component.rst new file mode 100644 index 00000000000..b156d207558 --- /dev/null +++ b/docs/source/api-documentation/component.rst @@ -0,0 +1,15 @@ +Component(experiemental API, deprecated in 2.0) +################################################# + +The component API information is available: + +.. toctree:: + :maxdepth: 1 + + component/Quantization + component/Pruning + component/Distillation + component/Scheduler + component/MixedPrecision + component/ModelConversion + component/Nas \ No newline at end of file diff --git a/docs/source/api-documentation/component/Distillation.rst b/docs/source/api-documentation/component/Distillation.rst new file mode 100644 index 00000000000..7cb9766904a --- /dev/null +++ b/docs/source/api-documentation/component/Distillation.rst @@ -0,0 +1,6 @@ +Distillation +============== + +.. autoapisummary:: + + neural_compressor.experimental.distillation diff --git a/docs/source/api-documentation/component/MixedPrecision.rst b/docs/source/api-documentation/component/MixedPrecision.rst new file mode 100644 index 00000000000..6152894ac75 --- /dev/null +++ b/docs/source/api-documentation/component/MixedPrecision.rst @@ -0,0 +1,6 @@ +MixedPrecision +================ + +.. autoapisummary:: + + neural_compressor.experimental.mixed_precision \ No newline at end of file diff --git a/docs/source/api-documentation/component/ModelConversion.rst b/docs/source/api-documentation/component/ModelConversion.rst new file mode 100644 index 00000000000..3ce2e1fb891 --- /dev/null +++ b/docs/source/api-documentation/component/ModelConversion.rst @@ -0,0 +1,6 @@ +ModelConversion +================ + +.. autoapisummary:: + + neural_compressor.experimental.model_conversion diff --git a/docs/source/api-documentation/component/Nas.rst b/docs/source/api-documentation/component/Nas.rst new file mode 100644 index 00000000000..572f1cf21f6 --- /dev/null +++ b/docs/source/api-documentation/component/Nas.rst @@ -0,0 +1,15 @@ +Neural architecture search (NAS) +================================= + +Package Contents +---------------- +.. autoapisummary:: + + neural_compressor.experimental.nas.nas + +Classes +---------------- +.. autoapisummary:: + + neural_compressor.experimental.nas.basic_nas + neural_compressor.experimental.nas.dynas \ No newline at end of file diff --git a/docs/source/api-documentation/component/Pruning.rst b/docs/source/api-documentation/component/Pruning.rst new file mode 100644 index 00000000000..3bec7485947 --- /dev/null +++ b/docs/source/api-documentation/component/Pruning.rst @@ -0,0 +1,7 @@ +Pruning +============== + +.. 
autoapisummary:: + + neural_compressor.experimental.pruning + neural_compressor.experimental.pytorch_pruner.pruning \ No newline at end of file diff --git a/docs/source/api-documentation/component/Quantization.rst b/docs/source/api-documentation/component/Quantization.rst new file mode 100644 index 00000000000..afa6fc3cf75 --- /dev/null +++ b/docs/source/api-documentation/component/Quantization.rst @@ -0,0 +1,6 @@ +Quantization +============== + +.. autoapisummary:: + + neural_compressor.experimental.quantization diff --git a/docs/source/api-documentation/component/Scheduler.rst b/docs/source/api-documentation/component/Scheduler.rst new file mode 100644 index 00000000000..44bc31212ce --- /dev/null +++ b/docs/source/api-documentation/component/Scheduler.rst @@ -0,0 +1,6 @@ +Scheduler +============== + +.. autoapisummary:: + + neural_compressor.experimental.scheduler diff --git a/docs/source/api-documentation/pythonic.rst b/docs/source/api-documentation/pythonic.rst new file mode 100644 index 00000000000..77514870ff7 --- /dev/null +++ b/docs/source/api-documentation/pythonic.rst @@ -0,0 +1,8 @@ +Pythonic +########### + +The Pythonic API information is available: + +.. toctree:: + :maxdepth: 1 + diff --git a/docs/source/api-documentation/strategy.rst b/docs/source/api-documentation/strategy.rst new file mode 100644 index 00000000000..4fc0671d95d --- /dev/null +++ b/docs/source/api-documentation/strategy.rst @@ -0,0 +1,9 @@ +Strategy +########### + +The strategy API information is available: + +.. toctree:: + :maxdepth: 1 + + diff --git a/docs/backend_quant.md b/docs/source/backend_quant.md similarity index 100% rename from docs/backend_quant.md rename to docs/source/backend_quant.md diff --git a/docs/bench.md b/docs/source/bench.md similarity index 79% rename from docs/bench.md rename to docs/source/bench.md index 8938b916357..e0b9802ee7a 100644 --- a/docs/bench.md +++ b/docs/source/bench.md @@ -91,113 +91,113 @@ or specify different port that is already opened, for example 8080: ## Home screen This view shows introduction to Intel® Neural Compressor Bench and a button for creating new project. After clicking this button, pop-up with project wizard will be shown. -![Home](imgs/bench/home.png "Home") +![Home](_static/imgs/bench/home.png "Home") # Create new project To create a new project, in first step you need to choose its name. -![Project1](imgs/bench/project1.png "Project1") +![Project1](_static/imgs/bench/project1.png "Project1") In second step there are 2 possible options to choose from: * *predefined model* - you choose model from predefined examples list, you don't need to set any additional parameters, * *custom model* - in this scenario you can set more parameters and customize your model. -![Project2](imgs/bench/project2.png "Project2") +![Project2](_static/imgs/bench/project2.png "Project2") ## Predefined model First you need to choose domain for the model (image recognition or object detection). For each domain there are few available models to choose from. When you click *Finish* the chosen model will be downloaded. -![Project3-predefined](imgs/bench/project3-predefined.png "Project3-predefined") +![Project3-predefined](_static/imgs/bench/project3-predefined.png "Project3-predefined") ## Custom model First you have to choose the model path. When it is chosen, in most cases all other fields will be completed automatically. You can edit its input and output nodes, see the model graph (if it is available for this model) and set shape for synthetic dataset. 
If model domain was not detected, you need to choose it from the list. Model domain is used to set some default parameters for the model. -![Project3-custom](imgs/bench/project3-custom.png "Project3-custom") +![Project3-custom](_static/imgs/bench/project3-custom.png "Project3-custom") ## Display model graph -For several model types there will be a button available ![Show graph](imgs/bench/show_graph_button.png "Show graph") in the project wizard. It is also possible to see the graph in Diagnosis tab. The graph by default is collapsed, but when you click on plus icon, sections will be unfolded. +For several model types there will be a button available ![Show graph](_static/imgs/bench/show_graph_button.png "Show graph") in the project wizard. It is also possible to see the graph in Diagnosis tab. The graph by default is collapsed, but when you click on plus icon, sections will be unfolded. -![Bert model graph](imgs/bench/graph_bert.png "Bert model graph"). +![Bert model graph](_static/imgs/bench/graph_bert.png "Bert model graph"). # Project list On the left hand side there is a panel with list of created projects. When you click on the project name, you can see its details. "Create new project" button navigates to new project wizard pop-up described in previous section. -![Menu](imgs/bench/menu.png "Menu") +![Menu](_static/imgs/bench/menu.png "Menu") ## Remove project If you want to remove project, you have to click the trash icon next to project name (it is visible when the cursor is on the project name). -![Remove1](imgs/bench/remove1.png "Remove1") +![Remove1](_static/imgs/bench/remove1.png "Remove1") Then you will be prompted to confirm your choice by typing the project name. Project removal is not reversible. -![Remove2](imgs/bench/remove2.png "Remove2") +![Remove2](_static/imgs/bench/remove2.png "Remove2") # Develop the project ## Optimization tab ### Optimization table In Optimizations tab you can see list of optimizations in the project. Currently UI supports three optimization precisions and two types of optimization. -![Optimizations-table](imgs/bench/optimizations-table.png "Optimizations-table") +![Optimizations-table](_static/imgs/bench/optimizations-table.png "Optimizations-table") ### Optimization wizard To add new optimization, click "Add new optimization" button at the bottom of the table and follow the steps. -![Optimizations-wizard](imgs/bench/optimizations-wizard.png "Optimizations-wizard") +![Optimizations-wizard](_static/imgs/bench/optimizations-wizard.png "Optimizations-wizard") ### Editing optimization entries There is a possibility to modify some optimization parameters even after exit from Wizard. If optimization has not been run yet, the pencil icon on the right hand side should be in light blue color. That indicates that it can be modified. After click on that pencil icon you can select different precision or dataset. For Quantization you can also modify Tuning details before optimizing model. -![Optimization-tuning-details](imgs/bench/optimization-tuning-details.png "Optimization-tuning-details") +![Optimization-tuning-details](_static/imgs/bench/optimization-tuning-details.png "Optimization-tuning-details") ### Optimization details To perform optimization click "Run" button. Once process is finished you can click on row with specific optimization to display details about optimization parameters and optimized model. When you click on blue arrow icon in model path line, you can download optimized model. 
-![Optimization-details](imgs/bench/optimization-details.png "Optimization-details") +![Optimization-details](_static/imgs/bench/optimization-details.png "Optimization-details") ## Benchmark tab ### Benchmark table For each optimization and input model you can add benchmark. Benchmark have 2 modes: accuracy and performance. In benchmark tab you can see all your benchmarks. When you check checkboxes in the last column you can choose benchmark you want to compare in the chart (visible after clicking "Compare selected"). -![Benchmarks-table](imgs/bench/benchmarks-table.png "Benchmarks-table") +![Benchmarks-table](_static/imgs/bench/benchmarks-table.png "Benchmarks-table") ### Benchmark wizard To add new benchmark, click "Add new benchmark" button at the bottom of the table and follow the steps. -![Benchmarks-wizard](imgs/bench/benchmarks-wizard.png "Benchmarks-wizard") +![Benchmarks-wizard](_static/imgs/bench/benchmarks-wizard.png "Benchmarks-wizard") ### Editing benchmark entries As for optimizations you can also modify benchmark parameters. You can modify benchmark mode, dataset and benchmark parameters like batch size, number of instances and number of cores per instance. -![Benchmark-edit-wizard](imgs/bench/benchmark-edit-wizard.png "Benchmark-edit-wizard") +![Benchmark-edit-wizard](_static/imgs/bench/benchmark-edit-wizard.png "Benchmark-edit-wizard") ### Benchmark details When the benchmark is added, you can click "Run" button to execute it. Results will be filled in the table and in details view visible after clicking row in the table. You can also see config and output logs when clicking links highlighted in blue. -![Benchmark-details](imgs/bench/benchmark-details.png "Benchmark-details") +![Benchmark-details](_static/imgs/bench/benchmark-details.png "Benchmark-details") ## Profiling tab ### Profiling table It is also possible to do profiling of all Tensorflow frozen models in project. -![Profiling-table](imgs/bench/profiling-table.png "Profiling-table") +![Profiling-table](_static/imgs/bench/profiling-table.png "Profiling-table") ### Profiling wizard To profile model, click "Add new profiling" button at the bottom of the table and follow the steps. -![Profiling-wizard](imgs/bench/profiling-wizard.png "Profiling-wizard") +![Profiling-wizard](_static/imgs/bench/profiling-wizard.png "Profiling-wizard") ### Editing profiling entries In Profiling tab you can edit dataset and number or threads. -![Profiling-edit-wizard](imgs/bench/profiling-edit-wizard.png "Profiling-edit-wizard") +![Profiling-edit-wizard](_static/imgs/bench/profiling-edit-wizard.png "Profiling-edit-wizard") ### Profiling details Once profiling entry is added, you can click "Run" button to execute it. After completing the process, the results will appear in the form of a bar chart and a table with full profiling data. The table is also used to control which operations are included in the chart. Check the box next to the selected row and click "Update chart" button to include it in the bar chart. Click "Download .csv file" button to get profiling data in .csv file. -![Profiling-details](imgs/bench/profiling-details.png "Profiling-details") +![Profiling-details](_static/imgs/bench/profiling-details.png "Profiling-details") # Diagnosis tab @@ -205,43 +205,43 @@ Diagnosis tab offers convenient debug information for optimizations with easy wa To get OP list you need to execute quantization optimization and select optimized model on left hand side. In OP table you can see list of OPs with MSE and min/max activation values. 
Selecting one of OP in table highlights its position in graph. Configuration for currently selected OP can be set in section under OP table. -![Diagnosis-tab](imgs/bench/diagnosis-tab.png "Diagnosis-tab") +![Diagnosis-tab](_static/imgs/bench/diagnosis-tab.png "Diagnosis-tab") You can set model wise parameters that apply to whole model by clicking button with "Model wise". When you set specific configuration you can view summary and generate new optimization config. -![Diagnosis-actions](imgs/bench/diagnosis-actions.png "Diagnosis-actions") +![Diagnosis-actions](_static/imgs/bench/diagnosis-actions.png "Diagnosis-actions") Model wise configuration provides separate settings for weights and activations. -![Diagnosis-model-wise-wizard](imgs/bench/diagnosis-model-wise-wizard.png "Diagnosis-model-wise-wizard") +![Diagnosis-model-wise-wizard](_static/imgs/bench/diagnosis-model-wise-wizard.png "Diagnosis-model-wise-wizard") ## Dataset tab ### Dataset list Dataset tab presents list of datasets assigned to a project. In most cases the "dummy" dataset consisting of synthetic data should be automatically added while creating a project. -![Datasets-table](imgs/bench/datasets-table.png "Datasets-table") +![Datasets-table](_static/imgs/bench/datasets-table.png "Datasets-table") ### Dataset wizard New dataset can be defined by clicking "Add new profiling" button at the bottom of the table and follow the steps. -![Datasets-wizard](imgs/bench/datasets-wizard.png "Datasets-wizard") +![Datasets-wizard](_static/imgs/bench/datasets-wizard.png "Datasets-wizard") ### Dataset details Dataset details can be inspected by clicking specific row. -![Dataset-details](imgs/bench/dataset-details.png "Dataset-details") +![Dataset-details](_static/imgs/bench/dataset-details.png "Dataset-details") ### Custom dataset When adding the dataset, you can choose *custom* in dataloader and metric field. In that case a template file will be created. The path to the template file will be available in dataset details. You should edit this file to add your custom configuration before using this dataset in optimizations or benchmarks. Small yellow warning will remind about it. -![Custom dataset](imgs/bench/custom-dataset.png "Custom dataset") +![Custom dataset](_static/imgs/bench/custom-dataset.png "Custom dataset") ## Project information Last tab is called "Project info". You can find here details about the project, when it was created and modified, what is the framework and some details about input model. It is also possible to add some notes about the project. -![Project info](imgs/bench/project-info.png "Project info") +![Project info](_static/imgs/bench/project-info.png "Project info") ## System information -One can see system information by clicking ![System info](imgs/bench/system_info.png "System info") button. The result is details dialog: +One can see system information by clicking ![System info](_static/imgs/bench/system_info.png "System info") button. 
The result is details dialog: -![System info table](imgs/bench/system_info_table.png "System info table") +![System info table](_static/imgs/bench/system_info_table.png "System info table") ## Security diff --git a/docs/benchmark.md b/docs/source/benchmark.md similarity index 100% rename from docs/benchmark.md rename to docs/source/benchmark.md diff --git a/conf.py b/docs/source/conf.py old mode 100755 new mode 100644 similarity index 85% rename from conf.py rename to docs/source/conf.py index e6316de8938..a721b5d0ba1 --- a/conf.py +++ b/docs/source/conf.py @@ -12,12 +12,15 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # +from os import getenv +import importlib.util import os import sys -sys.path.insert(0, os.path.abspath('.')) -import importlib.util +sys.path.insert(0, os.path.abspath('../../')) moduleName = 'version' -modulePath = os.getcwd() + '/neural_compressor/version.py' +# get version.py +modulePathNeu = os.path.abspath(os.path.join(os.getcwd(), "../..")) +modulePath = modulePathNeu + '/neural_compressor/version.py' spec = importlib.util.spec_from_file_location(moduleName,modulePath) NCversion = importlib.util.module_from_spec(spec) spec.loader.exec_module(NCversion) @@ -44,15 +47,20 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ["recommonmark","sphinx_markdown_tables","sphinx_md", "sphinx.ext.autodoc"] - +extensions = ['recommonmark', 'sphinx_markdown_tables', 'sphinx.ext.coverage', 'sphinx.ext.autosummary', + 'sphinx_md', 'autoapi.extension', 'sphinx.ext.napoleon'] +autoapi_dirs = ['../../neural_compressor'] +autoapi_add_toctree_entry = False +autosummary_generate = True +autoapi_options = ['members', 'show-inheritance', + 'show-module-summary', 'imported-members', ] +autoapi_ignore = [] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # -# source_suffix = ['.rst', '.md'] source_suffix = ['.rst', '.md'] # The master toctree document. @@ -77,13 +85,13 @@ # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. +# a list of builtin themes. # # html_theme = "asteroid_sphinx_theme" # html_theme = "classic" # html_theme = "alabaster" # html_theme = "sphinx_book_theme" -html_theme = "sphinx_rtd_theme" +html_theme = "pytorch_sphinx_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -96,6 +104,17 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +def skip_util_classes(app, what, name, obj, skip, options): + if what == "class" and obj.docstring.startswith("Not displayed in API Docs.") : + skip = True + return skip + + +def setup(app): + app.add_css_file("custom.css") + app.connect("autoapi-skip-member", skip_util_classes) + +html_favicon = '_static/imgs/common/intel.svg' # Custom sidebar templates, must be a dictionary that maps document names # to template names. 
# @@ -163,13 +182,9 @@ 'Miscellaneous'), ] -def setup(app): - app.add_css_file("custom.css") - -from os import getenv sphinx_md_useGitHubURL = True -baseBranch = "master" +baseBranch = "api-docs" commitSHA = getenv('GITHUB_SHA') githubBaseURL = 'https://github.com/' + (getenv('GITHUB_REPOSITORY') or 'intel/neural-compressor') + '/' githubFileURL = githubBaseURL + "blob/" diff --git a/docs/contributions.md b/docs/source/contributions.md similarity index 100% rename from docs/contributions.md rename to docs/source/contributions.md diff --git a/docs/dataloader.md b/docs/source/dataloader.md similarity index 100% rename from docs/dataloader.md rename to docs/source/dataloader.md diff --git a/docs/dataset.md b/docs/source/dataset.md similarity index 100% rename from docs/dataset.md rename to docs/source/dataset.md diff --git a/docs/source/design.md b/docs/source/design.md new file mode 100644 index 00000000000..e75e25df785 --- /dev/null +++ b/docs/source/design.md @@ -0,0 +1,15 @@ +Design +===== +Intel® Neural Compressor features an architecture and workflow that aids in increasing performance and faster deployments across infrastructures. + +## Architecture + + + Architecture + + +## Workflow + + + Workflow + diff --git a/docs/distillation.md b/docs/source/distillation.md similarity index 95% rename from docs/distillation.md rename to docs/source/distillation.md index b5f363a5a67..49cec901185 100644 --- a/docs/distillation.md +++ b/docs/source/distillation.md @@ -1,138 +1,138 @@ -Distillation -============ - -1. [Introduction](#introduction) - - 1.1. [Knowledge Distillation](#knowledge-distillation) - - 1.2. [Intermediate Layer Knowledge Distillation](#intermediate-layer-knowledge-distillation) - - 1.3. [Self Distillation](#self-distillation) - -2. [Distillation Support Matrix](#distillation-support-matrix) -3. [Get Started with Distillation API ](#get-started-with-distillation-api) -4. [Examples](#examples) - -## Introduction - -Distillation is one of popular approaches of network compression, which transfers knowledge from a large model to a smaller one without loss of validity. As smaller models are less expensive to evaluate, they can be deployed on less powerful hardware (such as a mobile device). Graph shown below is the workflow of the distillation, the teacher model will take the same input that feed into the student model to produce the output that contains knowledge of the teacher model to instruct the student model. -
- -Architecture - -Intel® Neural Compressor supports Knowledge Distillation and Intermediate Layer Knowledge Distillation algorithms. - -### Knowledge Distillation -Knowledge distillation is proposed in [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531). It leverages the logits (the input of softmax in the classification tasks) of teacher and student model to minimize the the difference between their predicted class distributions, this can be done by minimizing the below loss function. - -$$L_{KD} = D(z_t, z_s)$$ - -Where $D$ is a distance measurement, e.g. Euclidean distance and Kullback–Leibler divergence, $z_t$ and $z_s$ are the logits of teacher and student model, or predicted distributions from softmax of the logits in case the distance is measured in terms of distribution. - -### Intermediate Layer Knowledge Distillation - -There are more information contained in the teacher model beside its logits, for example, the output features of the teacher model's intermediate layers often been used to guide the student model, as in [Patient Knowledge Distillation for BERT Model Compression](https://arxiv.org/pdf/1908.09355) and [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984). The general loss function for this approach can be summarized as follow. - -$$L_{KD} = \sum\limits_i D(T_t^{n_i}(F_t^{n_i}), T_s^{m_i}(F_s^{m_i}))$$ - -Where $D$ is a distance measurement as before, $F_t^{n_i}$ the output feature of the $n_i$'s layer of the teacher model, $F_s^{m_i}$ the output feature of the $m_i$'s layer of the student model. Since the dimensions of $F_t^{n_i}$ and $F_s^{m_i}$ are usually different, the transformations $T_t^{n_i}$ and $T_s^{m_i}$ are needed to match dimensions of the two features. Specifically, the transformation can take the forms like identity, linear transformation, 1X1 convolution etc. - -### Self Distillation - -Self-distillation ia a one-stage training method where the teacher model and student models can be trained together. It attaches several attention modules and shallow classifiers at different depths of neural networks and distills knowledge from the deepest classifier to the shallower classifiers. Different from the conventional knowledge distillation methods where the knowledge of the teacher model is transferred to another student model, self-distillation can be considered as knowledge transfer in the same model, from the deeper layers to the shallower layers. -The additional classifiers in self-distillation allow the neural network to work in a dynamic manner, which leads to a much higher acceleration. -
- -Architecture - -Architecture from paper [Self-Distillation: Towards Efficient and Compact Neural Networks](https://ieeexplore.ieee.org/document/9381661) - -## Distillation Support Matrix - -|Distillation Algorithm |PyTorch |TensorFlow | -|------------------------------------------------|:--------:|:---------:| -|Knowledge Distillation |✔ |✔ | -|Intermediate Layer Knowledge Distillation |✔ |Will be supported| -|Self Distillation |✔ |✖ | - -## Get Started with Distillation API - -Simplest launcher code if training behavior is defined in user-defined yaml. - -```python -from neural_compressor.experimental import Distillation, common -distiller = Distillation('/path/to/user/yaml') -distiller.student_model = student_model -distiller.teacher_model = teacher_model -model = distiller.fit() -``` -Distillation class also support DistillationConf class as it's argument. - -```python -from neural_compressor.experimental import Distillation, common -from neural_compressor.conf.config import DistillationConf -conf = DistillationConf('/path/to/user/yaml') -distiller = Distillation(conf) -distiller.student_model = student_model -distiller.teacher_model = teacher_model -model = distiller.fit() -``` - -User can pass the customized training/evaluation functions to `Distillation` for flexible scenarios. In this case, distillation process can be done by pre-defined hooks in Neural Compressor. User needs to put those hooks inside the training function. - -Neural Compressor defines several hooks for user pass - -``` -on_train_begin() : Hook executed before training begins -on_after_compute_loss(input, student_output, student_loss) : Hook executed after each batch inference of student model -on_epoch_end() : Hook executed at each epoch end -``` - -Following section shows how to use hooks in user pass-in training function which is part of example from BlendCNN distillation: - -```python -def train_func(model): - distiller.on_train_begin() - for nepoch in range(epochs): - model.train() - cnt = 0 - loss_sum = 0. - iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)') - for batch in iter_bar: - teacher_logits, input_ids, segment_ids, input_mask, target = batch - cnt += 1 - output = model(input_ids, segment_ids, input_mask) - loss = criterion(output, target) - loss = distiller.on_after_compute_loss( - {'input_ids':input_ids, 'segment_ids':segment_ids, 'input_mask':input_mask}, - output, - loss, - teacher_logits) - optimizer.zero_grad() - loss.backward() - optimizer.step() - if cnt >= iters: - break - print('Average Loss: {}'.format(loss_sum / cnt)) - distiller.on_epoch_end() -... -``` - -In this case, the launcher code is like the following: - -```python -from neural_compressor.experimental import Distillation, common -from neural_compressor.experimental.common.criterion import PyTorchKnowledgeDistillationLoss -distiller = Distillation(args.config) -distiller.student_model = model -distiller.teacher_model = teacher -distiller.criterion = PyTorchKnowledgeDistillationLoss() -distiller.train_func = train_func -model = distiller.fit() -``` - -## Examples - -[Distillation Examples](../examples/README.md#distillation) -
-[Distillation Examples Results](./validated_model_list.md#validated-knowledge-distillation-examples)
+Distillation
+============
+
+1. [Introduction](#introduction)
+
+    1.1. [Knowledge Distillation](#knowledge-distillation)
+
+    1.2. [Intermediate Layer Knowledge Distillation](#intermediate-layer-knowledge-distillation)
+
+    1.3. [Self Distillation](#self-distillation)
+
+2. [Distillation Support Matrix](#distillation-support-matrix)
+3. [Get Started with Distillation API ](#get-started-with-distillation-api)
+4. [Examples](#examples)
+
+## Introduction
+
+Distillation is one of the most popular approaches to network compression; it transfers knowledge from a large model to a smaller one without loss of validity. As smaller models are less expensive to evaluate, they can be deployed on less powerful hardware (such as a mobile device). The graph below shows the distillation workflow: the teacher model takes the same input that is fed into the student model and produces output that contains the knowledge of the teacher model to instruct the student model.
+
+Architecture
+
+Intel® Neural Compressor supports Knowledge Distillation and Intermediate Layer Knowledge Distillation algorithms.
+
+### Knowledge Distillation
+Knowledge distillation is proposed in [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531). It leverages the logits (the input of softmax in classification tasks) of the teacher and student model to minimize the difference between their predicted class distributions. This can be done by minimizing the loss function below.
+
+$$L_{KD} = D(z_t, z_s)$$
+
+Where $D$ is a distance measurement, e.g. Euclidean distance or Kullback–Leibler divergence, and $z_t$ and $z_s$ are the logits of the teacher and student model, or the predicted distributions from the softmax of the logits in case the distance is measured in terms of distributions.
+
+### Intermediate Layer Knowledge Distillation
+
+There is more information contained in the teacher model besides its logits; for example, the output features of the teacher model's intermediate layers are often used to guide the student model, as in [Patient Knowledge Distillation for BERT Model Compression](https://arxiv.org/pdf/1908.09355) and [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984). The general loss function for this approach can be summarized as follows.
+
+$$L_{KD} = \sum\limits_i D(T_t^{n_i}(F_t^{n_i}), T_s^{m_i}(F_s^{m_i}))$$
+
+Where $D$ is a distance measurement as before, $F_t^{n_i}$ is the output feature of the $n_i$-th layer of the teacher model, and $F_s^{m_i}$ is the output feature of the $m_i$-th layer of the student model. Since the dimensions of $F_t^{n_i}$ and $F_s^{m_i}$ are usually different, the transformations $T_t^{n_i}$ and $T_s^{m_i}$ are needed to match the dimensions of the two features. Specifically, the transformation can take forms such as identity, linear transformation, 1x1 convolution, etc.
+
+### Self Distillation
+
+Self-distillation is a one-stage training method where the teacher and student models are trained together. It attaches several attention modules and shallow classifiers at different depths of the neural network and distills knowledge from the deepest classifier to the shallower classifiers. Different from conventional knowledge distillation methods, where the knowledge of the teacher model is transferred to another student model, self-distillation can be considered as knowledge transfer within the same model, from the deeper layers to the shallower layers.
+The additional classifiers in self-distillation allow the neural network to work in a dynamic manner, which leads to a much higher acceleration.
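+
+To make the logit-based loss above concrete, below is a minimal illustrative sketch of such a distillation loss in PyTorch. It is not the loss implementation shipped with Neural Compressor; the temperature `T`, the weighting factor `alpha`, and the use of KL divergence as the distance $D$ are assumptions made only for this example.
+
+```python
+import torch.nn.functional as F
+
+def kd_loss(student_logits, teacher_logits, labels, T=2.0, alpha=0.5):
+    """Illustrative sketch of D(z_t, z_s) using KL divergence plus a hard-label term."""
+    # Soften teacher and student predictions with the (assumed) temperature T.
+    soft_teacher = F.softmax(teacher_logits / T, dim=-1)
+    log_soft_student = F.log_softmax(student_logits / T, dim=-1)
+    # KL-divergence distillation term; the T*T factor keeps gradient scales comparable.
+    distill = F.kl_div(log_soft_student, soft_teacher, reduction="batchmean") * (T * T)
+    # Ordinary cross-entropy on the ground-truth labels.
+    ce = F.cross_entropy(student_logits, labels)
+    return alpha * distill + (1.0 - alpha) * ce
+```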
+
+Architecture
+
+Architecture from paper [Self-Distillation: Towards Efficient and Compact Neural Networks](https://ieeexplore.ieee.org/document/9381661)
+
+## Distillation Support Matrix
+
+|Distillation Algorithm                          |PyTorch   |TensorFlow |
+|------------------------------------------------|:--------:|:---------:|
+|Knowledge Distillation                          |✔         |✔          |
+|Intermediate Layer Knowledge Distillation       |✔         |Will be supported|
+|Self Distillation                               |✔         |✖          |
+
+## Get Started with Distillation API
+
+The simplest launcher code, when the training behavior is defined in a user-defined yaml, is as follows.
+
+```python
+from neural_compressor.experimental import Distillation, common
+distiller = Distillation('/path/to/user/yaml')
+distiller.student_model = student_model
+distiller.teacher_model = teacher_model
+model = distiller.fit()
+```
+The Distillation class also supports a DistillationConf object as its argument.
+
+```python
+from neural_compressor.experimental import Distillation, common
+from neural_compressor.conf.config import DistillationConf
+conf = DistillationConf('/path/to/user/yaml')
+distiller = Distillation(conf)
+distiller.student_model = student_model
+distiller.teacher_model = teacher_model
+model = distiller.fit()
+```
+
+Users can pass customized training/evaluation functions to `Distillation` for flexible scenarios. In this case, the distillation process can be driven by the pre-defined hooks in Neural Compressor. Users need to put those hooks inside the training function.
+
+Neural Compressor defines several hooks for users to use:
+
+```
+on_train_begin() : Hook executed before training begins
+on_after_compute_loss(input, student_output, student_loss) : Hook executed after each batch inference of student model
+on_epoch_end() : Hook executed at each epoch end
+```
+
+The following section shows how to use the hooks in a user pass-in training function, which is part of the BlendCNN distillation example:
+
+```python
+def train_func(model):
+    distiller.on_train_begin()
+    for nepoch in range(epochs):
+        model.train()
+        cnt = 0
+        loss_sum = 0.
+        iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)')
+        for batch in iter_bar:
+            teacher_logits, input_ids, segment_ids, input_mask, target = batch
+            cnt += 1
+            output = model(input_ids, segment_ids, input_mask)
+            loss = criterion(output, target)
+            loss = distiller.on_after_compute_loss(
+                {'input_ids':input_ids, 'segment_ids':segment_ids, 'input_mask':input_mask},
+                output,
+                loss,
+                teacher_logits)
+            loss_sum += loss.item()  # accumulate the combined loss for reporting
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            if cnt >= iters:
+                break
+        print('Average Loss: {}'.format(loss_sum / cnt))
+        distiller.on_epoch_end()
+...
+```
+
+In this case, the launcher code is as follows:
+
+```python
+from neural_compressor.experimental import Distillation, common
+from neural_compressor.experimental.common.criterion import PyTorchKnowledgeDistillationLoss
+distiller = Distillation(args.config)
+distiller.student_model = model
+distiller.teacher_model = teacher
+distiller.criterion = PyTorchKnowledgeDistillationLoss()
+distiller.train_func = train_func
+model = distiller.fit()
+```
+
+## Examples
+
+[Distillation Examples](../examples/README.md#distillation)
+[Distillation Examples Results](./validated_model_list.md#validated-knowledge-distillation-examples) diff --git a/docs/distillation_quantization.md b/docs/source/distillation_quantization.md similarity index 100% rename from docs/distillation_quantization.md rename to docs/source/distillation_quantization.md diff --git a/docs/distributed.md b/docs/source/distributed.md similarity index 100% rename from docs/distributed.md rename to docs/source/distributed.md diff --git a/docs/dynamic_quantization.md b/docs/source/dynamic_quantization.md similarity index 100% rename from docs/dynamic_quantization.md rename to docs/source/dynamic_quantization.md diff --git a/docs/examples_readme.md b/docs/source/examples_readme.md similarity index 100% rename from docs/examples_readme.md rename to docs/source/examples_readme.md diff --git a/docs/faq.md b/docs/source/faq.md similarity index 100% rename from docs/faq.md rename to docs/source/faq.md diff --git a/docs/framework_yaml.md b/docs/source/framework_yaml.md similarity index 97% rename from docs/framework_yaml.md rename to docs/source/framework_yaml.md index cb4b72da20a..7e8f6136a4d 100644 --- a/docs/framework_yaml.md +++ b/docs/source/framework_yaml.md @@ -1,194 +1,194 @@ -Framework YAML Configuration Files -==== -1. [Introduction](#introduction) -2. [Supported Feature Matrix](#supported-feature-matrix) -2. [Get Started with Framework YAML Files](#get-started-with-framework-yaml-files) - - - -## Introduction - -Intel® Neural Compressor uses YAML files for quick -and user-friendly configurations. There are two types of YAML files - -user YAML files and framework YAML files, which are used in -running user cases and setting up framework capabilities, respectively. - -Here, we introduce the framework YAML file, which describes the behavior of -a specific framework. There is a corresponding framework YAML file for each framework supported by -Intel® Neural Compressor - TensorFlow -, Intel® Extension for TensorFlow*, PyTorch, Intel® Extension for PyTorch*, ONNX Runtime, and MXNet. - ->**Note**: Before diving to the details, we recommend that the end users do NOT make modifications -unless they have clear requirements that can only be met by modifying the attributes. - -## Supported Feature Matrix - -| Framework | YAML Configuration Files | -|------------|:------------------------:| -| TensorFlow | ✔ | -| PyTorch | ✔ | -| ONNX | ✔ | -| MXNet | ✔ | - - -## Get started with Framework YAML Files - -For the purpose of framework setup, let's take a look at a tensorflow framework YAML file; -other framework YAML files follow same syntax. A framework YAML file specifies following -information and capabilities for current runtime framework. Let's go through -them one by one: - -* ***version***: This specifies the supported versions. -```yaml - version: - name: ['2.1.0', '2.2.0', '2.3.0', '2.4.0', '2.5.0', '2.6.0', '2.6.1', '2.6.2', '2.7.0', '2.8.0', '1.15.0-up1', '1.15.0-up2'] -``` - -* ***precisions***: This defines the supported precisions of specific versions. -```yaml - precisions: - names: int8, uint8, bf16, fp32 - valid_mixed_precisions: [] -``` -* ***op***: This defines a list of valid OP types for each precision. 
-```yaml - ops: - int8: ['Conv2D', 'MatMul', 'ConcatV2', 'MaxPool', 'AvgPool'] - uint8: ['Conv2D', 'DepthwiseConv2dNative', 'MatMul', 'ConcatV2', 'MaxPool', 'AvgPool'] - bf16: ['Conv2D'] - fp32: ['*'] # '*' means all op types -``` -* ***capabilities***: This defines the quantization ability of specific ops, such as -granularity, scheme, and algorithm. The activation assumes that input and output activations -share the same data type by default, which is based on op semantics defined by -frameworks. -```yaml - capabilities: - int8: { - 'Conv2D': { - 'weight': { - 'dtype': ['int8', 'fp32'], - 'scheme': ['sym'], - 'granularity': ['per_channel','per_tensor'], - 'algorithm': ['minmax'] - }, - 'activation': { - 'dtype': ['int8', 'fp32'], - 'scheme': ['sym'], - 'granularity': ['per_tensor'], - 'algorithm': ['minmax', 'kl'] - } - }, - 'MatMul': { - 'weight': { - 'dtype': ['int8', 'fp32'], - 'scheme': ['sym'], - 'granularity': ['per_tensor'], - 'algorithm': ['minmax'] - }, - 'activation': { - 'dtype': ['int8', 'fp32'], - 'scheme': ['asym', 'sym'], - 'granularity': ['per_tensor'], - 'algorithm': ['minmax'] - } - }, - 'default': { - 'activation': { - 'dtype': ['uint8', 'fp32'], - 'algorithm': ['minmax'], - 'scheme': ['sym'], - 'granularity': ['per_tensor'] - } - }, - } - - uint8: { - 'Conv2D': { - 'weight': { - 'dtype': ['int8', 'fp32'], - 'scheme': ['sym'], - 'granularity': ['per_channel','per_tensor'], - 'algorithm': ['minmax'] - }, - 'activation': { - 'dtype': ['uint8', 'fp32'], - 'scheme': ['sym'], - 'granularity': ['per_tensor'], - 'algorithm': ['minmax', 'kl'] - } - }, - 'MatMul': { - 'weight': { - 'dtype': ['int8', 'fp32'], - 'scheme': ['sym'], - 'granularity': ['per_tensor'], - 'algorithm': ['minmax'] - }, - 'activation': { - 'dtype': ['uint8', 'fp32'], - 'scheme': ['asym', 'sym'], - 'granularity': ['per_tensor'], - 'algorithm': ['minmax'] - } - }, - 'default': { - 'activation': { - 'dtype': ['uint8', 'fp32'], - 'algorithm': ['minmax'], - 'scheme': ['sym'], - 'granularity': ['per_tensor'] - } - }, - } -``` -* ***patterns***: This defines the supported fusion sequence for each op. -```yaml - patterns: - fp32: [ - 'Conv2D + Add + Relu', - 'Conv2D + Add + Relu6', - 'Conv2D + Relu', - 'Conv2D + Relu6', - 'Conv2D + BiasAdd' - ] - int8: [ - 'Conv2D + BiasAdd', - 'Conv2D + BiasAdd + Relu', - 'Conv2D + BiasAdd + Relu6' - ] - uint8: [ - 'Conv2D + BiasAdd + AddN + Relu', - 'Conv2D + BiasAdd + AddN + Relu6', - 'Conv2D + BiasAdd + AddV2 + Relu', - 'Conv2D + BiasAdd + AddV2 + Relu6', - 'Conv2D + BiasAdd + Add + Relu', - 'Conv2D + BiasAdd + Add + Relu6', - 'Conv2D + BiasAdd + Relu', - 'Conv2D + BiasAdd + Relu6', - 'Conv2D + Add + Relu', - 'Conv2D + Add + Relu6', - 'Conv2D + Relu', - 'Conv2D + Relu6', - 'Conv2D + BiasAdd', - 'DepthwiseConv2dNative + BiasAdd + Relu6', - 'DepthwiseConv2dNative + BiasAdd + Relu', - 'DepthwiseConv2dNative + Add + Relu6', - 'DepthwiseConv2dNative + BiasAdd', - 'MatMul + BiasAdd + Relu', - 'MatMul + BiasAdd', - ] -``` - -* ***grappler_optimization***: This defines the grappler optimization. -```yaml - grappler_optimization: - pruning: True # optional. grappler pruning optimizer,default value is True. - shape: True # optional. grappler shape optimizer,default value is True. - constfold: False # optional. grappler constant folding optimizer, default value is True. - arithmetic: False # optional. grappler arithmetic optimizer,default value is False. - dependency: True # optional. grappler dependency optimizer,default value is True. - debug_stripper: True # optional. 
grappler debug_stripper optimizer,default value is True. - loop: True # optional. grappler loop optimizer,default value is True. - -``` +Framework YAML Configuration Files +==== +1. [Introduction](#introduction) +2. [Supported Feature Matrix](#supported-feature-matrix) +2. [Get Started with Framework YAML Files](#get-started-with-framework-yaml-files) + + + +## Introduction + +Intel® Neural Compressor uses YAML files for quick +and user-friendly configurations. There are two types of YAML files - +user YAML files and framework YAML files, which are used in +running user cases and setting up framework capabilities, respectively. + +Here, we introduce the framework YAML file, which describes the behavior of +a specific framework. There is a corresponding framework YAML file for each framework supported by +Intel® Neural Compressor - TensorFlow +, Intel® Extension for TensorFlow*, PyTorch, Intel® Extension for PyTorch*, ONNX Runtime, and MXNet. + +>**Note**: Before diving to the details, we recommend that the end users do NOT make modifications +unless they have clear requirements that can only be met by modifying the attributes. + +## Supported Feature Matrix + +| Framework | YAML Configuration Files | +|------------|:------------------------:| +| TensorFlow | ✔ | +| PyTorch | ✔ | +| ONNX | ✔ | +| MXNet | ✔ | + + +## Get started with Framework YAML Files + +For the purpose of framework setup, let's take a look at a tensorflow framework YAML file; +other framework YAML files follow same syntax. A framework YAML file specifies following +information and capabilities for current runtime framework. Let's go through +them one by one: + +* ***version***: This specifies the supported versions. +```yaml + version: + name: ['2.1.0', '2.2.0', '2.3.0', '2.4.0', '2.5.0', '2.6.0', '2.6.1', '2.6.2', '2.7.0', '2.8.0', '1.15.0-up1', '1.15.0-up2'] +``` + +* ***precisions***: This defines the supported precisions of specific versions. +```yaml + precisions: + names: int8, uint8, bf16, fp32 + valid_mixed_precisions: [] +``` +* ***op***: This defines a list of valid OP types for each precision. +```yaml + ops: + int8: ['Conv2D', 'MatMul', 'ConcatV2', 'MaxPool', 'AvgPool'] + uint8: ['Conv2D', 'DepthwiseConv2dNative', 'MatMul', 'ConcatV2', 'MaxPool', 'AvgPool'] + bf16: ['Conv2D'] + fp32: ['*'] # '*' means all op types +``` +* ***capabilities***: This defines the quantization ability of specific ops, such as +granularity, scheme, and algorithm. The activation assumes that input and output activations +share the same data type by default, which is based on op semantics defined by +frameworks. 
+```yaml + capabilities: + int8: { + 'Conv2D': { + 'weight': { + 'dtype': ['int8', 'fp32'], + 'scheme': ['sym'], + 'granularity': ['per_channel','per_tensor'], + 'algorithm': ['minmax'] + }, + 'activation': { + 'dtype': ['int8', 'fp32'], + 'scheme': ['sym'], + 'granularity': ['per_tensor'], + 'algorithm': ['minmax', 'kl'] + } + }, + 'MatMul': { + 'weight': { + 'dtype': ['int8', 'fp32'], + 'scheme': ['sym'], + 'granularity': ['per_tensor'], + 'algorithm': ['minmax'] + }, + 'activation': { + 'dtype': ['int8', 'fp32'], + 'scheme': ['asym', 'sym'], + 'granularity': ['per_tensor'], + 'algorithm': ['minmax'] + } + }, + 'default': { + 'activation': { + 'dtype': ['uint8', 'fp32'], + 'algorithm': ['minmax'], + 'scheme': ['sym'], + 'granularity': ['per_tensor'] + } + }, + } + + uint8: { + 'Conv2D': { + 'weight': { + 'dtype': ['int8', 'fp32'], + 'scheme': ['sym'], + 'granularity': ['per_channel','per_tensor'], + 'algorithm': ['minmax'] + }, + 'activation': { + 'dtype': ['uint8', 'fp32'], + 'scheme': ['sym'], + 'granularity': ['per_tensor'], + 'algorithm': ['minmax', 'kl'] + } + }, + 'MatMul': { + 'weight': { + 'dtype': ['int8', 'fp32'], + 'scheme': ['sym'], + 'granularity': ['per_tensor'], + 'algorithm': ['minmax'] + }, + 'activation': { + 'dtype': ['uint8', 'fp32'], + 'scheme': ['asym', 'sym'], + 'granularity': ['per_tensor'], + 'algorithm': ['minmax'] + } + }, + 'default': { + 'activation': { + 'dtype': ['uint8', 'fp32'], + 'algorithm': ['minmax'], + 'scheme': ['sym'], + 'granularity': ['per_tensor'] + } + }, + } +``` +* ***patterns***: This defines the supported fusion sequence for each op. +```yaml + patterns: + fp32: [ + 'Conv2D + Add + Relu', + 'Conv2D + Add + Relu6', + 'Conv2D + Relu', + 'Conv2D + Relu6', + 'Conv2D + BiasAdd' + ] + int8: [ + 'Conv2D + BiasAdd', + 'Conv2D + BiasAdd + Relu', + 'Conv2D + BiasAdd + Relu6' + ] + uint8: [ + 'Conv2D + BiasAdd + AddN + Relu', + 'Conv2D + BiasAdd + AddN + Relu6', + 'Conv2D + BiasAdd + AddV2 + Relu', + 'Conv2D + BiasAdd + AddV2 + Relu6', + 'Conv2D + BiasAdd + Add + Relu', + 'Conv2D + BiasAdd + Add + Relu6', + 'Conv2D + BiasAdd + Relu', + 'Conv2D + BiasAdd + Relu6', + 'Conv2D + Add + Relu', + 'Conv2D + Add + Relu6', + 'Conv2D + Relu', + 'Conv2D + Relu6', + 'Conv2D + BiasAdd', + 'DepthwiseConv2dNative + BiasAdd + Relu6', + 'DepthwiseConv2dNative + BiasAdd + Relu', + 'DepthwiseConv2dNative + Add + Relu6', + 'DepthwiseConv2dNative + BiasAdd', + 'MatMul + BiasAdd + Relu', + 'MatMul + BiasAdd', + ] +``` + +* ***grappler_optimization***: This defines the grappler optimization. +```yaml + grappler_optimization: + pruning: True # optional. grappler pruning optimizer,default value is True. + shape: True # optional. grappler shape optimizer,default value is True. + constfold: False # optional. grappler constant folding optimizer, default value is True. + arithmetic: False # optional. grappler arithmetic optimizer,default value is False. + dependency: True # optional. grappler dependency optimizer,default value is True. + debug_stripper: True # optional. grappler debug_stripper optimizer,default value is True. + loop: True # optional. grappler loop optimizer,default value is True. 
+ +``` diff --git a/docs/graph_optimization.md b/docs/source/graph_optimization.md similarity index 100% rename from docs/graph_optimization.md rename to docs/source/graph_optimization.md diff --git a/docs/incompatible_changes.md b/docs/source/incompatible_changes.md similarity index 100% rename from docs/incompatible_changes.md rename to docs/source/incompatible_changes.md diff --git a/index.rst b/docs/source/index.rst old mode 100755 new mode 100644 similarity index 59% rename from index.rst rename to docs/source/index.rst index 3b323adc4c8..b742e5360f3 --- a/index.rst +++ b/docs/source/index.rst @@ -10,15 +10,14 @@ Sections .. toctree:: :maxdepth: 1 - README.md - docs/tutorial.md - docs/examples_readme.md + Welcome.md + examples_readme.md api-documentation/apis.rst - docs/doclist.rst - docs/releases_info.md - docs/contributions.md - docs/legal_information.md - docs/security_policy.md + releases_info.md + contributions.md + legal_information.md + security_policy.md + Intel® Neural Compressor repository diff --git a/docs/infrastructure.md b/docs/source/infrastructure.md similarity index 98% rename from docs/infrastructure.md rename to docs/source/infrastructure.md index f09ec322d6a..c2c2aae62e4 100644 --- a/docs/infrastructure.md +++ b/docs/source/infrastructure.md @@ -11,8 +11,8 @@ Neural Coder automatically inserts quantization code snippets on a PyTorch model ## Architecture - - Architecture + + Architecture Intel® Neural Compressor has unified interfaces which dispatch tasks to different frameworks via adaptor layer. The adaptor layer is the bridge between the tuning strategy and vanilla framework quantization APIs. Users can select tuning strategies and the strategy module contains model configs and tuning configs. Model configs define the quantization approach, if it's post-training static quantization, users need to set more parameters like calibration and so on. There are several tuning strategies for users to choose from while the basic strategy is set as default. diff --git a/docs/installation_guide.md b/docs/source/installation_guide.md similarity index 100% rename from docs/installation_guide.md rename to docs/source/installation_guide.md diff --git a/docs/legal_information.md b/docs/source/legal_information.md similarity index 94% rename from docs/legal_information.md rename to docs/source/legal_information.md index 5c595853b8a..511a04b7a58 100644 --- a/docs/legal_information.md +++ b/docs/source/legal_information.md @@ -16,7 +16,7 @@ See the accompanying [license](https://github.com/intel/neural-compressor/tree/m ## Citation -If you use Intel® Neural Compressor in your research or you wish to refer to the tuning results published in the [Validated Models](getting_started.md#validated-models), use the following BibTeX entry. +If you use Intel® Neural Compressor in your research or you wish to refer to the tuning results published in the [Validated Models](validated_model_list.md), use the following BibTeX entry. 
``` @misc{Intel® Neural Compressor, diff --git a/docs/metric.md b/docs/source/metric.md similarity index 100% rename from docs/metric.md rename to docs/source/metric.md diff --git a/docs/mixed_precision.md b/docs/source/mixed_precision.md similarity index 95% rename from docs/mixed_precision.md rename to docs/source/mixed_precision.md index 04b155bb8f1..4a0ff3830fe 100644 --- a/docs/mixed_precision.md +++ b/docs/source/mixed_precision.md @@ -12,9 +12,9 @@ The recent growth of Deep Learning has driven the development of more complex mo The recently launched 3rd Gen Intel® Xeon® Scalable processor (codenamed Cooper Lake), featuring Intel® Deep Learning Boost, is the first general-purpose x86 CPU to support the bfloat16 format. Specifically, three new bfloat16 instructions are added as a part of the AVX512_BF16 extension within Intel Deep Learning Boost: VCVTNE2PS2BF16, VCVTNEPS2BF16, and VDPBF16PS. The first two instructions allow converting to and from bfloat16 data type, while the last one performs a dot product of bfloat16 pairs. Further details can be found in the [hardware numerics document](https://software.intel.com/content/www/us/en/develop/download/bfloat16-hardware-numerics-definition.html) published by Intel. - +
- Architecture + Architecture
diff --git a/docs/model.md b/docs/source/model.md similarity index 95% rename from docs/model.md rename to docs/source/model.md index 3bb4f0e9a52..b0ca55236f0 100644 --- a/docs/model.md +++ b/docs/source/model.md @@ -11,9 +11,9 @@ Model ## Introduction The Neural Compressor Model feature is used to encapsulate the behavior of model building and saving. By simply providing information such as different model formats and framework_specific_info, Neural Compressor performs optimizations and quantization on this model object and returns a Neural Compressor Model object for further model persistence or benchmarking. A Neural Compressor Model helps users to maintain necessary model information which is required during optimization and quantization such as the input/output names, workspace path, and other model format knowledge. This helps unify the features gap brought by different model formats and frameworks. - +
- Architecture + Architecture
diff --git a/docs/model_conversion.md b/docs/source/model_conversion.md similarity index 100% rename from docs/model_conversion.md rename to docs/source/model_conversion.md diff --git a/docs/objective.md b/docs/source/objective.md similarity index 100% rename from docs/objective.md rename to docs/source/objective.md diff --git a/docs/orchestration.md b/docs/source/orchestration.md old mode 100755 new mode 100644 similarity index 97% rename from docs/orchestration.md rename to docs/source/orchestration.md index 9de1d46172b..fb7e4fa4099 --- a/docs/orchestration.md +++ b/docs/source/orchestration.md @@ -1,112 +1,112 @@ -Optimization Orchestration -============ - -1. [Introduction](#introduction) - - 1.1. [One-shot](#one-shot) - - 1.2. [Multi-shot](#multi-shot) - -2. [Orchestration Support Matrix](#orchestration-support-matrix) -3. [Get Started with Orchestration API ](#get-started-with-orchestration-api) -4. [Examples](#examples) - -## Introduction - -Orchestration is the combination of multiple optimization techniques, either applied simultaneously (one-shot) or sequentially (multi-shot). Intel Neural Compressor supports arbitrary meaningful combinations of supported optimization methods under one-shot or multi-shot, such as pruning during quantization-aware training, or pruning and then post-training quantization, pruning and then distillation and then quantization. - -### One-shot -Since quantization-aware training, pruning and distillation all leverage training process for optimization, we can achieve the goal of optimization through one shot training with arbitrary meaningful combinations of these methods, which often gain more benefits in terms of performance and accuracy than just one compression technique applied, and usually are as efficient as applying just one compression technique. The three possible combinations are shown below. -- Pruning during quantization-aware training -- Distillation with pattern lock pruning -- Distillation with pattern lock pruning and quantization-aware training - -### Multi-shot -Of course, besides one-shot, we also support separate execution of each optimization process. -- Pruning and then post-training quantization -- Distillation and then post-training quantization -- Distillation, then pruning and post-training quantization - -## Orchestration Support Matrix - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OrchestrationCombinationsSupported
One-shotPruning + Quantization Aware Training
Distillation + Quantization Aware Training
Distillation + Pruning
Distillation + Pruning + Quantization Aware Training
Multi-shotPruning then Quantization
Distillation then Quantization
Distillation then Pruning
Distillation then Pruning then Quantization
- -## Get Started with Orchestration API - -Neural Compressor defines `Scheduler` class to automatically pipeline execute model optimization with one shot or multiple shots way. - -User instantiates model optimization components, such as quantization, pruning, distillation, separately. After that, user could append -those separate optimization objects into scheduler's pipeline, the scheduler API executes them one by one. - -In following example it executes the pruning and then post-training quantization with two-shot way. - -```python -from neural_compressor.experimental import Quantization, Pruning, Scheduler -prune = Pruning(prune_conf) -quantizer = Quantization(post_training_quantization_conf) -scheduler = Scheduler() -scheduler.model = model -scheduler.append(prune) -scheduler.append(quantizer) -opt_model = scheduler.fit() -``` - -If user wants to execute the pruning and quantization-aware training with one-shot way, the code is like below. - -```python -from neural_compressor.experimental import Quantization, Pruning, Scheduler -prune = Pruning(prune_conf) -quantizer = Quantization(quantization_aware_training_conf) -scheduler = Scheduler() -scheduler.model = model -combination = scheduler.combine(prune, quantizer) -scheduler.append(combination) -opt_model = scheduler.fit() -``` - -## Examples - -[Orchestration Examples](../examples/README.md#orchestration) +Optimization Orchestration +============ + +1. [Introduction](#introduction) + + 1.1. [One-shot](#one-shot) + + 1.2. [Multi-shot](#multi-shot) + +2. [Orchestration Support Matrix](#orchestration-support-matrix) +3. [Get Started with Orchestration API ](#get-started-with-orchestration-api) +4. [Examples](#examples) + +## Introduction + +Orchestration is the combination of multiple optimization techniques, either applied simultaneously (one-shot) or sequentially (multi-shot). Intel Neural Compressor supports arbitrary meaningful combinations of supported optimization methods under one-shot or multi-shot, such as pruning during quantization-aware training, or pruning and then post-training quantization, pruning and then distillation and then quantization. + +### One-shot +Since quantization-aware training, pruning and distillation all leverage training process for optimization, we can achieve the goal of optimization through one shot training with arbitrary meaningful combinations of these methods, which often gain more benefits in terms of performance and accuracy than just one compression technique applied, and usually are as efficient as applying just one compression technique. The three possible combinations are shown below. +- Pruning during quantization-aware training +- Distillation with pattern lock pruning +- Distillation with pattern lock pruning and quantization-aware training + +### Multi-shot +Of course, besides one-shot, we also support separate execution of each optimization process. +- Pruning and then post-training quantization +- Distillation and then post-training quantization +- Distillation, then pruning and post-training quantization + +## Orchestration Support Matrix + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
OrchestrationCombinationsSupported
One-shotPruning + Quantization Aware Training
Distillation + Quantization Aware Training
Distillation + Pruning
Distillation + Pruning + Quantization Aware Training
Multi-shotPruning then Quantization
Distillation then Quantization
Distillation then Pruning
Distillation then Pruning then Quantization
+ +## Get Started with Orchestration API + +Neural Compressor defines `Scheduler` class to automatically pipeline execute model optimization with one shot or multiple shots way. + +User instantiates model optimization components, such as quantization, pruning, distillation, separately. After that, user could append +those separate optimization objects into scheduler's pipeline, the scheduler API executes them one by one. + +In following example it executes the pruning and then post-training quantization with two-shot way. + +```python +from neural_compressor.experimental import Quantization, Pruning, Scheduler +prune = Pruning(prune_conf) +quantizer = Quantization(post_training_quantization_conf) +scheduler = Scheduler() +scheduler.model = model +scheduler.append(prune) +scheduler.append(quantizer) +opt_model = scheduler.fit() +``` + +If user wants to execute the pruning and quantization-aware training with one-shot way, the code is like below. + +```python +from neural_compressor.experimental import Quantization, Pruning, Scheduler +prune = Pruning(prune_conf) +quantizer = Quantization(quantization_aware_training_conf) +scheduler = Scheduler() +scheduler.model = model +combination = scheduler.combine(prune, quantizer) +scheduler.append(combination) +opt_model = scheduler.fit() +``` + +## Examples + +[Orchestration Examples](../examples/README.md#orchestration) diff --git a/docs/platform_configuration.md b/docs/source/platform_configuration.md similarity index 100% rename from docs/platform_configuration.md rename to docs/source/platform_configuration.md diff --git a/docs/pruning.md b/docs/source/pruning.md old mode 100755 new mode 100644 similarity index 89% rename from docs/pruning.md rename to docs/source/pruning.md index 1cf9f570642..89e6567737e --- a/docs/pruning.md +++ b/docs/source/pruning.md @@ -1,234 +1,234 @@ -Pruning -============ - -1. [Introduction](#introduction) - - 1.1. [Neural Network Pruning](#neural-network-pruning) - - 1.2. [Pruning Patterns](#pruning-patterns) - - 1.3. [Pruning Criteria](#pruning-criteria) - - 1.4. [Pruning Schedule](#pruning-schedule) - -2. [Pruning Support Matrix](#pruning-support-matrix) - -3. [Get Started With Pruning API](#get-started-with-pruning-api) - -4. [Examples](#examples) - -## Introduction - -### Neural Network Pruning -Neural network pruning (briefly known as pruning or sparsity) is one of the most promising model compression techniques. It removes the least important parameters in the network and achieves compact architectures with minimal accuracy drop and maximal inference acceleration. As current state-of-the-art models have increasingly more parameters, pruning plays a crucial role in enabling them to run on devices whose memory footprints and computing resources are limited. - - - pruning intro - - - -### Pruning Patterns - -Pruning patterns defines the rules of pruned weights' arrangements in space. - - - Sparsity Pattern - - - -- Unstructured Pruning - -Unstructured pruning means finding and removing the less salient connection in the model where the nonzero patterns are irregular and could be anywhere in the matrix. - -- 2in4 Pruning - -NVIDIA proposed [2:4 sparsity](https://developer.nvidia.com/blog/accelerating-inference-with-sparsity-using-ampere-and-tensorrt/) (or known as "2in4 sparsity") in Ampere architecture, for every 4 continuous elements in a matrix, two of them are zero and others are non-zero. 
- -- Structured Pruning - -Structured pruning means finding parameters in groups, deleting entire blocks, filters, or channels according to some pruning criterions. In general, structured pruning leads to lower accuracy due to restrictive structure than unstructured pruning; However, it can accelerate the model execution significantly because it can fit hardware design better. - -Different from 2:4 sparsity above, we propose the block-wise structured sparsity patterns that we are able to demonstrate the performance benefits on existing Intel hardwares even without the support of hardware sparsity. A block-wise sparsity pattern with block size ```S``` means the contiguous ```S``` elements in this block are all zero values. - -For a typical GEMM, the weight dimension is ```IC``` x ```OC```, where ```IC``` is the number of input channels and ```OC``` is the number of output channels. Note that sometimes ```IC``` is also called dimension ```K```, and ```OC``` is called dimension ```N```. The sparsity dimension is on ```OC``` (or ```N```). - -For a typical Convolution, the weight dimension is ```OC x IC x KH x KW```, where ```OC``` is the number of output channels, ```IC``` is the number of input channels, and ```KH``` and ```KW``` is the kernel height and weight. The sparsity dimension is also on ```OC```. - -Here is a figure showing a matrix with ```IC``` = 32 and ```OC``` = 16 dimension, and a block-wise sparsity pattern with block size 4 on ```OC``` dimension. - - - block sparsity Pattern - - -### Pruning Criteria - -Pruning criteria defines the rules of which weights are least important to be pruned, in order to maintain the model's original accuracy. Most popular criteria examine weights' absolute value and their corresponding gradients. - -- Magnitude - - The algorithm prunes the weight by the lowest absolute value at each layer with given sparsity target. - -- Gradient sensitivity - - The algorithm prunes the head, intermediate layers, and hidden states in NLP model according to importance score calculated by following the paper [FastFormers](https://arxiv.org/abs/2010.13382). - -- Group Lasso - - The algorithm uses Group lasso regularization to prune entire rows, columns or blocks of parameters that result in a smaller dense network. - -- Pattern Lock - - The algorithm locks the sparsity pattern in fine tune phase by freezing those zero values of weight tensor during weight update of training. - -- SNIP - - The algorithm prunes the dense model at its initialization, by analyzing the weights' effect to the loss function when they are masked. Please refer to the original [paper](https://arxiv.org/abs/1810.02340) for details - -- SNIP with momentum - - The algorithm improves original SNIP algorithms and introduces weights' score maps which updates in a momentum way.\ - In the following formula, $n$ is the pruning step and $W$ and $G$ are model's weights and gradients respectively. - $$Score_{n} = 1.0 \times Score_{n-1} + 0.9 \times |W_{n} \times G_{n}|$$ - -### Pruning Schedule - -Pruning schedule defines the way the model reach the target sparsity (the ratio of pruned weights). - -- One-shot Pruning - - One-shot pruning means the model is pruned to its target sparsity with one single step. This pruning method often works at model's initialization step. It can easily cause accuracy drop, but save much training time. - -- Iterative Pruning - - Iterative pruning means the model is gradually pruned to its target sparsity during a training process. 
The pruning process contains several pruning steps, and each step raises model's sparsity to a higher value. In the final pruning step, the model reaches target sparsity and the pruning process ends. - -## Pruning Support Matrix - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Pruning TypePruning GranularityPruning AlgorithmFramework
Unstructured PruningElement-wiseMagnitudePyTorch, TensorFlow
Pattern LockPyTorch
SNIP with momentumPyTorch
Structured PruningFilter/Channel-wiseGradient SensitivityPyTorch
SNIP with momentumPyTorch
Block-wiseGroup LassoPyTorch
SNIP with momentumPyTorch
Element-wisePattern LockPyTorch
SNIP with momentumPyTorch
- -## Get Started with Pruning API - -Neural Compressor `Pruning` API is defined under `neural_compressor.experimental.Pruning`, which takes a user defined yaml file as input. Below is the launcher code of applying the API to execute a pruning process. - -```python -from neural_compressor.experimental import Pruning -prune = Pruning('/path/to/user/pruning/yaml') -prune.model = model -model = prune.fit() -``` - -Users can pass the customized training/evaluation functions to `Pruning` for flexible scenarios. In this case, pruning process can be done by pre-defined hooks in Neural Compressor. Users need to put those hooks inside the training function. - -Neural Compressor defines several hooks for users to use: - -``` -on_epoch_begin(epoch) : Hook executed at each epoch beginning -on_step_begin(batch) : Hook executed at each batch beginning -on_step_end() : Hook executed at each batch end -on_epoch_end() : Hook executed at each epoch end -on_before_optimizer_step() : Hook executed after gradients calculated and before backward -``` - -Following section shows how to use hooks in user pass-in training function which is part of example from BERT training: - -```python -def pruning_func(model): - for epoch in range(int(args.num_train_epochs)): - pbar = ProgressBar(n_total=len(train_dataloader), desc='Training') - model.train() - prune.on_epoch_begin(epoch) - for step, batch in enumerate(train_dataloader): - prune.on_step_begin(step) - batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - #inputs['token_type_ids'] = batch[2] - outputs = model(**inputs) - loss = outputs[0] # model outputs are always tuple in transformers (see doc) - - if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) - - if (step + 1) % args.gradient_accumulation_steps == 0: - prune.on_before_optimizer_step() - optimizer.step() - scheduler.step() # Update learning rate schedule - model.zero_grad() - - prune.on_step_end() -... -``` -In this case, the launcher code is like the following: - -```python -from neural_compressor.experimental import Pruning, common -prune = Pruning(args.config) -prune.model = model -prune.train_func = pruning_func -model = prune.fit() -``` - -## Examples - -We validate the sparsity on typical models across different domains (including CV, NLP, and Recommendation System). [Validated pruning examples](../docs/validated_model_list.md#validated-pruning-examples) shows the sparsity pattern, sparsity ratio, and accuracy of sparse and dense (Reference) model for each model. - -Please refer to pruning examples([TensorFlow](../examples/README.md#Pruning), [PyTorch](../examples/README.md#Pruning-1)) for more information. +Pruning +============ + +1. [Introduction](#introduction) + + 1.1. [Neural Network Pruning](#neural-network-pruning) + + 1.2. [Pruning Patterns](#pruning-patterns) + + 1.3. [Pruning Criteria](#pruning-criteria) + + 1.4. [Pruning Schedule](#pruning-schedule) + +2. [Pruning Support Matrix](#pruning-support-matrix) + +3. [Get Started With Pruning API](#get-started-with-pruning-api) + +4. [Examples](#examples) + +## Introduction + +### Neural Network Pruning +Neural network pruning (briefly known as pruning or sparsity) is one of the most promising model compression techniques. 
It removes the least important parameters in the network and achieves compact architectures with minimal accuracy drop and maximal inference acceleration. As current state-of-the-art models have increasingly more parameters, pruning plays a crucial role in enabling them to run on devices whose memory footprints and computing resources are limited.
+
+
+pruning intro
+
+
+
+### Pruning Patterns
+
+Pruning patterns define the rules of pruned weights' arrangements in space.
+
+
+Sparsity Pattern
+
+
+
+- Unstructured Pruning
+
+Unstructured pruning means finding and removing the less salient connections in the model, where the nonzero patterns are irregular and could be anywhere in the matrix.
+
+- 2in4 Pruning
+
+NVIDIA proposed [2:4 sparsity](https://developer.nvidia.com/blog/accelerating-inference-with-sparsity-using-ampere-and-tensorrt/) (also known as "2in4 sparsity") in the Ampere architecture: for every 4 contiguous elements in a matrix, two of them are zero and the others are non-zero.
+
+- Structured Pruning
+
+Structured pruning means finding parameters in groups, deleting entire blocks, filters, or channels according to some pruning criteria. In general, structured pruning leads to lower accuracy than unstructured pruning due to its restrictive structure; however, it can accelerate the model execution significantly because it fits hardware design better.
+
+Different from the 2:4 sparsity above, we propose block-wise structured sparsity patterns for which we are able to demonstrate the performance benefits on existing Intel hardware even without hardware sparsity support. A block-wise sparsity pattern with block size ```S``` means the contiguous ```S``` elements in this block are all zero values.
+
+For a typical GEMM, the weight dimension is ```IC``` x ```OC```, where ```IC``` is the number of input channels and ```OC``` is the number of output channels. Note that sometimes ```IC``` is also called dimension ```K```, and ```OC``` is called dimension ```N```. The sparsity dimension is on ```OC``` (or ```N```).
+
+For a typical Convolution, the weight dimension is ```OC x IC x KH x KW```, where ```OC``` is the number of output channels, ```IC``` is the number of input channels, and ```KH``` and ```KW``` are the kernel height and width. The sparsity dimension is also on ```OC```.
+
+Here is a figure showing a matrix with ```IC``` = 32 and ```OC``` = 16 dimension, and a block-wise sparsity pattern with block size 4 on the ```OC``` dimension.
+
+
+block sparsity Pattern
+
+
+### Pruning Criteria
+
+Pruning criteria define which weights are least important and can be pruned, with the goal of maintaining the model's original accuracy. The most popular criteria examine weights' absolute values and their corresponding gradients.
+
+- Magnitude
+
+  The algorithm prunes the weights with the lowest absolute values at each layer for a given sparsity target.
+
+- Gradient sensitivity
+
+  The algorithm prunes the heads, intermediate layers, and hidden states in an NLP model according to an importance score calculated by following the paper [FastFormers](https://arxiv.org/abs/2010.13382).
+
+- Group Lasso
+
+  The algorithm uses Group Lasso regularization to prune entire rows, columns or blocks of parameters that result in a smaller dense network.
+
+- Pattern Lock
+
+  The algorithm locks the sparsity pattern in the fine-tuning phase by freezing the zero values of the weight tensor during the weight updates of training.
+ +- SNIP + + The algorithm prunes the dense model at its initialization, by analyzing the weights' effect to the loss function when they are masked. Please refer to the original [paper](https://arxiv.org/abs/1810.02340) for details + +- SNIP with momentum + + The algorithm improves original SNIP algorithms and introduces weights' score maps which updates in a momentum way.\ + In the following formula, $n$ is the pruning step and $W$ and $G$ are model's weights and gradients respectively. + $$Score_{n} = 1.0 \times Score_{n-1} + 0.9 \times |W_{n} \times G_{n}|$$ + +### Pruning Schedule + +Pruning schedule defines the way the model reach the target sparsity (the ratio of pruned weights). + +- One-shot Pruning + + One-shot pruning means the model is pruned to its target sparsity with one single step. This pruning method often works at model's initialization step. It can easily cause accuracy drop, but save much training time. + +- Iterative Pruning + + Iterative pruning means the model is gradually pruned to its target sparsity during a training process. The pruning process contains several pruning steps, and each step raises model's sparsity to a higher value. In the final pruning step, the model reaches target sparsity and the pruning process ends. + +## Pruning Support Matrix + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Pruning TypePruning GranularityPruning AlgorithmFramework
Unstructured PruningElement-wiseMagnitudePyTorch, TensorFlow
Pattern LockPyTorch
SNIP with momentumPyTorch
Structured PruningFilter/Channel-wiseGradient SensitivityPyTorch
SNIP with momentumPyTorch
Block-wiseGroup LassoPyTorch
SNIP with momentumPyTorch
Element-wisePattern LockPyTorch
SNIP with momentumPyTorch
+ +## Get Started with Pruning API + +Neural Compressor `Pruning` API is defined under `neural_compressor.experimental.Pruning`, which takes a user defined yaml file as input. Below is the launcher code of applying the API to execute a pruning process. + +```python +from neural_compressor.experimental import Pruning +prune = Pruning('/path/to/user/pruning/yaml') +prune.model = model +model = prune.fit() +``` + +Users can pass the customized training/evaluation functions to `Pruning` for flexible scenarios. In this case, pruning process can be done by pre-defined hooks in Neural Compressor. Users need to put those hooks inside the training function. + +Neural Compressor defines several hooks for users to use: + +``` +on_epoch_begin(epoch) : Hook executed at each epoch beginning +on_step_begin(batch) : Hook executed at each batch beginning +on_step_end() : Hook executed at each batch end +on_epoch_end() : Hook executed at each epoch end +on_before_optimizer_step() : Hook executed after gradients calculated and before backward +``` + +Following section shows how to use hooks in user pass-in training function which is part of example from BERT training: + +```python +def pruning_func(model): + for epoch in range(int(args.num_train_epochs)): + pbar = ProgressBar(n_total=len(train_dataloader), desc='Training') + model.train() + prune.on_epoch_begin(epoch) + for step, batch in enumerate(train_dataloader): + prune.on_step_begin(step) + batch = tuple(t.to(args.device) for t in batch) + inputs = {'input_ids': batch[0], + 'attention_mask': batch[1], + 'labels': batch[3]} + #inputs['token_type_ids'] = batch[2] + outputs = model(**inputs) + loss = outputs[0] # model outputs are always tuple in transformers (see doc) + + if args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) + + if (step + 1) % args.gradient_accumulation_steps == 0: + prune.on_before_optimizer_step() + optimizer.step() + scheduler.step() # Update learning rate schedule + model.zero_grad() + + prune.on_step_end() +... +``` +In this case, the launcher code is like the following: + +```python +from neural_compressor.experimental import Pruning, common +prune = Pruning(args.config) +prune.model = model +prune.train_func = pruning_func +model = prune.fit() +``` + +## Examples + +We validate the sparsity on typical models across different domains (including CV, NLP, and Recommendation System). [Validated pruning examples](../../docs/source/validated_model_list.md#validated-pruning-examples) shows the sparsity pattern, sparsity ratio, and accuracy of sparse and dense (Reference) model for each model. + +Please refer to pruning examples([TensorFlow](../../examples/README.md#Pruning), [PyTorch](../../examples/README.md#Pruning-1)) for more information. 
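+
+For readers who want a concrete picture of the magnitude criterion described above, below is a minimal, self-contained sketch of how a per-layer magnitude mask could be computed for a given sparsity target. This is only an illustration, not the Neural Compressor implementation; the helper name `magnitude_mask` and the use of plain PyTorch tensors are assumptions made for this example.
+
+```python
+import torch
+
+def magnitude_mask(weight: torch.Tensor, target_sparsity: float) -> torch.Tensor:
+    """Return a 0/1 mask that zeroes out roughly `target_sparsity` of the
+    smallest-magnitude elements of `weight` (illustrative sketch only)."""
+    num_prune = int(weight.numel() * target_sparsity)
+    if num_prune == 0:
+        return torch.ones_like(weight)
+    # The threshold is the num_prune-th smallest absolute value in this layer.
+    threshold = weight.abs().flatten().kthvalue(num_prune).values
+    return (weight.abs() > threshold).to(weight.dtype)
+
+# Usage sketch: prune a random weight matrix to ~90% sparsity.
+weight = torch.randn(256, 512)
+mask = magnitude_mask(weight, target_sparsity=0.9)
+pruned_weight = weight * mask
+```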
diff --git a/docs/publication_list.md b/docs/source/publication_list.md similarity index 95% rename from docs/publication_list.md rename to docs/source/publication_list.md index cabe32daa7a..ef6dc13506e 100644 --- a/docs/publication_list.md +++ b/docs/source/publication_list.md @@ -1,6 +1,8 @@ -Full Publications/Events (45) +Full Publications/Events (47) ========== -## 2022 (27) +## 2022 (29) +* [Intel together with Tencent deepens the cooperation to build a cloud foundation for digital and intelligent industry](https://mp.weixin.qq.com/s/CPz9-5Nsh-5N9Q8-UmK--w) (Dec 2022) +* [Intel Neural Compressor for TF Virtual Appliance packaged by Bitnami](https://marketplace.cloud.vmware.com/services/details/e9c3d891-ca51-4f07-a5aa-3fe6394f15ae) (Nov 2022) * [Neural Compressor: an open-source Python library for network compression](https://cloud.tencent.com/developer/article/2165895) (Nov 2022) * [Running Fast Transformers on CPUs: Intel Approach Achieves Significant Speed Ups and SOTA Performance](https://medium.com/syncedreview/running-fast-transformers-on-cpus-intel-approach-achieves-significant-speed-ups-and-sota-448521704c5e) (Nov 2022) * [Personalized Stable Diffusion with Few-Shot Fine-Tuning](https://medium.com/intel-analytics-software/personalized-stable-diffusion-with-few-shot-fine-tuning-on-a-single-cpu-f01a3316b13) (Nov 2022) diff --git a/docs/pythonic_style.md b/docs/source/pythonic_style.md similarity index 97% rename from docs/pythonic_style.md rename to docs/source/pythonic_style.md index 508f315ff56..3f09059f197 100644 --- a/docs/pythonic_style.md +++ b/docs/source/pythonic_style.md @@ -1,136 +1,136 @@ -Pythonic Style Access for Configurations -==== - -1. [Introduction](#introduction) -2. [Supported Feature Matrix](#supported-feature-matrix) -3. [Get Started with Pythonic API for Configurations](#get-started-with-pythonic-api-for-configurations) - -## Introduction -To meet the variety of needs arising from various circumstances, INC now provides a -pythonic style access - Pythonic API - for same purpose of either user or framework configurations. - -The Pythonic API for Configuration allows users to specify configurations -directly in their python codes without referring to -a separate YAML file. While we support both simultaneously, -the Pythonic API for Configurations has several advantages over YAML files, -which one can tell from usages in the context below. Hence, we recommend -users to use the Pythonic API for Configurations moving forward. - -## Supported Feature Matrix - -### Pythonic API for User Configurations -| Optimization Techniques | Pythonic API | -|-------------------------|:------------:| -| Quantization | ✔ | -| Pruning | ✔ | -| Distillation | ✔ | -| NAS | ✔ | -### Pythonic API for Framework Configurations - -| Framework | Pythonic API | -|------------|:------------:| -| TensorFlow | ✔ | -| PyTorch | ✔ | -| ONNX | ✔ | -| MXNet | ✔ | - -## Get Started with Pythonic API for Configurations - -### Pythonic API for User Configurations -Now, let's go through the Pythonic API for Configurations in the order of -sections similar as in user YAML files. - -#### Quantization - -To specify quantization configurations, users can use the following -Pythonic API step by step. - -* First, load the ***config*** module -```python -from neural_compressor import config -``` -* Next, assign values to the attributes of *config.quantization* to use specific configurations, and pass the config to *Quantization* API. 
-```python -config.quantization.inputs = ['image'] # list of str -config.quantization.outputs = ['out'] # list of str -config.quantization.backend = 'onnxrt_integerops' # support tensorflow, tensorflow_itex, pytorch, pytorch_ipex, pytorch_fx, onnxrt_qlinearops, onnxrt_integerops, onnxrt_qdq, onnxrt_qoperator, mxnet -config.quantization.approach = 'post_training_dynamic_quant' # support post_training_static_quant, post_training_dynamic_quant, quant_aware_training -config.quantization.device = 'cpu' # support cpu, gpu -config.quantization.op_type_list = {'Conv': {'weight': {'dtype': ['fp32']}, 'activation': {'dtype': ['fp32']}}} # dict -config.quantization.strategy = 'mse' # support basic, mse, bayesian, random, exhaustive -config.quantization.objective = 'accuracy' # support performance, accuracy, modelsize, footprint -config.quantization.timeout = 100 # int, default is 0 -config.quantization.accuracy_criterion.relative = 0.5 # float, default is 0.01 -config.quantization.reduce_range = False # bool. default value depends on hardware, True if cpu supports VNNI instruction, otherwise is False -config.quantization.use_bf16 = False # bool -from neural_compressor.experimental import Quantization -quantizer = Quantization(config) -``` - -#### Distillation -To specify distillation configurations, users can assign values to -the corresponding attributes. -```python -from neural_compressor import config -config.distillation.optimizer = {'SGD': {'learning_rate': 0.0001}} - -from neural_compressor.experimental import Distillation -distiller = Distillation(config) -``` -#### Pruning -To specify pruning configurations, users can assign values to the corresponding attributes. -```python -from neural_compressor import config -config.pruning.weight_compression.initial_sparsity = 0.0 -config.pruning.weight_compression.target_sparsity = 0.9 -config.pruning.weight_compression.max_sparsity_ratio_per_layer = 0.98 -config.pruning.weight_compression.prune_type = "basic_magnitude" -config.pruning.weight_compression.start_epoch = 0 -config.pruning.weight_compression.end_epoch = 3 -config.pruning.weight_compression.start_step = 0 -config.pruning.weight_compression.end_step = 0 -config.pruning.weight_compression.update_frequency = 1.0 -config.pruning.weight_compression.update_frequency_on_step = 1 -config.pruning.weight_compression.prune_domain = "global" -config.pruning.weight_compression.pattern = "tile_pattern_1x1" - -from neural_compressor.experimental import Pruning -prune = Pruning(config) -``` -#### NAS -To specify nas configurations, users can assign values to the -corresponding attributes. - -```python -from neural_compressor import config -config.nas.approach = 'dynas' -from neural_compressor.experimental import NAS -nas = NAS(config) -``` - - -#### Benchmark -To specify benchmark configurations, users can assign values to the -corresponding attributes. -```python -from neural_compressor import config -config.benchmark.warmup = 10 -config.benchmark.iteration = 10 -config.benchmark.cores_per_instance = 10 -config.benchmark.num_of_instance = 10 -config.benchmark.inter_num_of_threads = 10 -config.benchmark.intra_num_of_threads = 10 - -from neural_compressor.experimental import Benchmark -benchmark = Benchmark(config) -``` -### Pythonic API for Framework Configurations -Now, let's go through the Pythonic API for Configurations in setting up similar framework -capabilities as in YAML files. Users can specify a framework's (eg. ONNX Runtime) capability by -assigning values to corresponding attributes. 
- -```python -config.onnxruntime.precisions = ['int8', 'uint8'] -config.onnxruntime.graph_optimization_level = 'DISABLE_ALL' # only onnxruntime has graph_optimization_level attribute -``` - +Pythonic Style Access for Configurations +==== + +1. [Introduction](#introduction) +2. [Supported Feature Matrix](#supported-feature-matrix) +3. [Get Started with Pythonic API for Configurations](#get-started-with-pythonic-api-for-configurations) + +## Introduction +To meet the variety of needs arising from various circumstances, INC now provides a +pythonic style access - Pythonic API - for same purpose of either user or framework configurations. + +The Pythonic API for Configuration allows users to specify configurations +directly in their python codes without referring to +a separate YAML file. While we support both simultaneously, +the Pythonic API for Configurations has several advantages over YAML files, +which one can tell from usages in the context below. Hence, we recommend +users to use the Pythonic API for Configurations moving forward. + +## Supported Feature Matrix + +### Pythonic API for User Configurations +| Optimization Techniques | Pythonic API | +|-------------------------|:------------:| +| Quantization | ✔ | +| Pruning | ✔ | +| Distillation | ✔ | +| NAS | ✔ | +### Pythonic API for Framework Configurations + +| Framework | Pythonic API | +|------------|:------------:| +| TensorFlow | ✔ | +| PyTorch | ✔ | +| ONNX | ✔ | +| MXNet | ✔ | + +## Get Started with Pythonic API for Configurations + +### Pythonic API for User Configurations +Now, let's go through the Pythonic API for Configurations in the order of +sections similar as in user YAML files. + +#### Quantization + +To specify quantization configurations, users can use the following +Pythonic API step by step. + +* First, load the ***config*** module +```python +from neural_compressor import config +``` +* Next, assign values to the attributes of *config.quantization* to use specific configurations, and pass the config to *Quantization* API. +```python +config.quantization.inputs = ['image'] # list of str +config.quantization.outputs = ['out'] # list of str +config.quantization.backend = 'onnxrt_integerops' # support tensorflow, tensorflow_itex, pytorch, pytorch_ipex, pytorch_fx, onnxrt_qlinearops, onnxrt_integerops, onnxrt_qdq, onnxrt_qoperator, mxnet +config.quantization.approach = 'post_training_dynamic_quant' # support post_training_static_quant, post_training_dynamic_quant, quant_aware_training +config.quantization.device = 'cpu' # support cpu, gpu +config.quantization.op_type_list = {'Conv': {'weight': {'dtype': ['fp32']}, 'activation': {'dtype': ['fp32']}}} # dict +config.quantization.strategy = 'mse' # support basic, mse, bayesian, random, exhaustive +config.quantization.objective = 'accuracy' # support performance, accuracy, modelsize, footprint +config.quantization.timeout = 100 # int, default is 0 +config.quantization.accuracy_criterion.relative = 0.5 # float, default is 0.01 +config.quantization.reduce_range = False # bool. default value depends on hardware, True if cpu supports VNNI instruction, otherwise is False +config.quantization.use_bf16 = False # bool +from neural_compressor.experimental import Quantization +quantizer = Quantization(config) +``` + +#### Distillation +To specify distillation configurations, users can assign values to +the corresponding attributes. 
+```python +from neural_compressor import config +config.distillation.optimizer = {'SGD': {'learning_rate': 0.0001}} + +from neural_compressor.experimental import Distillation +distiller = Distillation(config) +``` +#### Pruning +To specify pruning configurations, users can assign values to the corresponding attributes. +```python +from neural_compressor import config +config.pruning.weight_compression.initial_sparsity = 0.0 +config.pruning.weight_compression.target_sparsity = 0.9 +config.pruning.weight_compression.max_sparsity_ratio_per_layer = 0.98 +config.pruning.weight_compression.prune_type = "basic_magnitude" +config.pruning.weight_compression.start_epoch = 0 +config.pruning.weight_compression.end_epoch = 3 +config.pruning.weight_compression.start_step = 0 +config.pruning.weight_compression.end_step = 0 +config.pruning.weight_compression.update_frequency = 1.0 +config.pruning.weight_compression.update_frequency_on_step = 1 +config.pruning.weight_compression.prune_domain = "global" +config.pruning.weight_compression.pattern = "tile_pattern_1x1" + +from neural_compressor.experimental import Pruning +prune = Pruning(config) +``` +#### NAS +To specify nas configurations, users can assign values to the +corresponding attributes. + +```python +from neural_compressor import config +config.nas.approach = 'dynas' +from neural_compressor.experimental import NAS +nas = NAS(config) +``` + + +#### Benchmark +To specify benchmark configurations, users can assign values to the +corresponding attributes. +```python +from neural_compressor import config +config.benchmark.warmup = 10 +config.benchmark.iteration = 10 +config.benchmark.cores_per_instance = 10 +config.benchmark.num_of_instance = 10 +config.benchmark.inter_num_of_threads = 10 +config.benchmark.intra_num_of_threads = 10 + +from neural_compressor.experimental import Benchmark +benchmark = Benchmark(config) +``` +### Pythonic API for Framework Configurations +Now, let's go through the Pythonic API for Configurations in setting up similar framework +capabilities as in YAML files. Users can specify a framework's (eg. ONNX Runtime) capability by +assigning values to corresponding attributes. + +```python +config.onnxruntime.precisions = ['int8', 'uint8'] +config.onnxruntime.graph_optimization_level = 'DISABLE_ALL' # only onnxruntime has graph_optimization_level attribute +``` + diff --git a/docs/quantization.md b/docs/source/quantization.md similarity index 98% rename from docs/quantization.md rename to docs/source/quantization.md index 951c6e4e5d1..cae3e0845f8 100644 --- a/docs/quantization.md +++ b/docs/source/quantization.md @@ -80,7 +80,7 @@ Currently `accuracy aware tuning` supports `post training quantization`, `quanti User could refer to below chart to understand the whole tuning flow. -accuracy aware tuning working flow +accuracy aware tuning working flow ## Supported Feature Matrix diff --git a/docs/quantization_mixed_precision.md b/docs/source/quantization_mixed_precision.md similarity index 87% rename from docs/quantization_mixed_precision.md rename to docs/source/quantization_mixed_precision.md index 728c854da5c..9352a81f8cf 100644 --- a/docs/quantization_mixed_precision.md +++ b/docs/source/quantization_mixed_precision.md @@ -1,59 +1,59 @@ -### Turn ON Auto Mixed Precision during Quantization - -BF16 conversion during quantization is default OFF. 
To force enable it, users need to turn on use_bf16 by pythonic config: - -```python -from neural_compressor import config -from neural_compressor.experimental import Quantization - -config.quantization.use_bf16 = True -quantizer = Quantization(config) -``` - -### Tensorflow - -Intel has worked with the TensorFlow development team to enhance TensorFlow to include bfloat16 data support for CPUs. For more information about BF16 in TensorFlow, please read [Accelerating AI performance on 3rd Gen Intel® Xeon® Scalable processors with TensorFlow and Bfloat16](https://blog.tensorflow.org/2020/06/accelerating-ai-performance-on-3rd-gen-processors-with-tensorflow-bfloat16.html). - -- BF16 conversion during quantization in TensorFlow - - -
- Architecture -
-
- - -- Three steps - -1. Convert to a `FP32 + INT8` mixed precision Graph - - In this steps, TF adaptor will regard all fallback datatype as `FP32`. According to the per op datatype in tuning config passed by strategy, TF adaptor will generate a `FP32 + INT8` mixed precision graph. - -2. Convert to a `BF16 + FP32 + INT8` mixed precision Graph - - In this phase, adaptor will convert some `FP32` ops to `BF16` according to `bf16_ops` list in tuning config. - -3. Optimize the `BF16 + FP32 + INT8` mixed precision Graph - - After the mixed precision graph generated, there are still some optimization need to be applied to improved the performance, for example `Cast + Cast` and so on. The `BF16Convert` transformer also apply a depth-first method to make it possible to take the ops use `BF16` which can support `BF16` datatype to reduce the insertion of `Cast` op. - -### PyTorch - -Intel has also worked with the PyTorch development team to enhance PyTorch to include bfloat16 data support for CPUs. - -- BF16 conversion during quantization in PyTorch - - -
- Architecture -
-
- -- Two steps -1. Convert to a `FP32 + INT8` mixed precision Graph or Module - - In this steps, PT adaptor will combine the `INT8` ops and all fallback ops to `FP32 + INT8` mixed precision Graph or Module no matter in Eager mode or Fx Graph mode. - -2. Convert to a `BF16 + FP32 + INT8` mixed precision Graph or Module - - In this phase, adaptor will according to `BF16` op list from strategy tune config to wrapper the `FP32` module with `BF16Wrapper` to realize the `BF16 + FP32 + INT8` mixed precision Graph or Module. adaptor will do retrace the `GraphModule` again if using Fx Graph mode. +### Turn ON Auto Mixed Precision during Quantization + +BF16 conversion during quantization is default OFF. To force enable it, users need to turn on use_bf16 by pythonic config: + +```python +from neural_compressor import config +from neural_compressor.experimental import Quantization + +config.quantization.use_bf16 = True +quantizer = Quantization(config) +``` + +### Tensorflow + +Intel has worked with the TensorFlow development team to enhance TensorFlow to include bfloat16 data support for CPUs. For more information about BF16 in TensorFlow, please read [Accelerating AI performance on 3rd Gen Intel® Xeon® Scalable processors with TensorFlow and Bfloat16](https://blog.tensorflow.org/2020/06/accelerating-ai-performance-on-3rd-gen-processors-with-tensorflow-bfloat16.html). + +- BF16 conversion during quantization in TensorFlow + + +
+ Architecture +
+
+
+
+- Three steps
+
+1. Convert to a `FP32 + INT8` mixed precision Graph
+
+   In this step, the TF adaptor regards all fallback datatypes as `FP32`. According to the per-op datatype in the tuning config passed by the strategy, the TF adaptor generates a `FP32 + INT8` mixed precision graph.
+
+2. Convert to a `BF16 + FP32 + INT8` mixed precision Graph
+
+   In this phase, the adaptor converts some `FP32` ops to `BF16` according to the `bf16_ops` list in the tuning config.
+
+3. Optimize the `BF16 + FP32 + INT8` mixed precision Graph
+
+   After the mixed precision graph is generated, some optimizations still need to be applied to improve performance, for example folding `Cast + Cast` pairs. The `BF16Convert` transformer also applies a depth-first pass so that ops which support the `BF16` datatype are kept in `BF16`, reducing the number of inserted `Cast` ops.
+
+### PyTorch
+
+Intel has also worked with the PyTorch development team to enhance PyTorch to include bfloat16 data support for CPUs.
+
+- BF16 conversion during quantization in PyTorch
+
+
+
+ Architecture +
+
+ +- Two steps +1. Convert to a `FP32 + INT8` mixed precision Graph or Module + + In this steps, PT adaptor will combine the `INT8` ops and all fallback ops to `FP32 + INT8` mixed precision Graph or Module no matter in Eager mode or Fx Graph mode. + +2. Convert to a `BF16 + FP32 + INT8` mixed precision Graph or Module + + In this phase, adaptor will according to `BF16` op list from strategy tune config to wrapper the `FP32` module with `BF16Wrapper` to realize the `BF16 + FP32 + INT8` mixed precision Graph or Module. adaptor will do retrace the `GraphModule` again if using Fx Graph mode. diff --git a/docs/releases_info.md b/docs/source/releases_info.md similarity index 93% rename from docs/releases_info.md rename to docs/source/releases_info.md index 81d078a8229..7367fa284b7 100644 --- a/docs/releases_info.md +++ b/docs/source/releases_info.md @@ -15,6 +15,6 @@ The MSE tuning strategy does not work with the PyTorch adaptor layer. This strat [Neural Compressor v1.2](https://github.com/intel/neural-compressor/tree/v1.2) introduces incompatible changes in user facing APIs. Please refer to [incompatible changes](incompatible_changes.md) to know which incompatible changes are made in v1.2. -[Neural Compressor v1.2.1](https://github.com/intel/neural-compressor/tree/v1.2.1) solves this backward compatible issues introduced in v1.2 by moving new user facing APIs to neural_compressor.experimental package and keep old one as is. Please refer to [API documentation](/api-documentation/api-introduction.md) to know the details of user-facing APIs. +[Neural Compressor v1.2.1](https://github.com/intel/neural-compressor/tree/v1.2.1) solves this backward compatible issues introduced in v1.2 by moving new user facing APIs to neural_compressor.experimental package and keep old one as is. Please refer to [API documentation](./api-documentation/apis.rst) to know the details of user-facing APIs. [Neural Compressor v1.7](https://github.com/intel/neural-compressor/tree/v1.7) renames the pip/conda package name from lpot to neural_compressor. To run old examples on latest software, please replace package name for compatibility with `sed -i "s|lpot|neural_compressor|g" your_script.py` diff --git a/docs/sigopt_strategy.md b/docs/source/sigopt_strategy.md similarity index 100% rename from docs/sigopt_strategy.md rename to docs/source/sigopt_strategy.md diff --git a/docs/tensorboard.md b/docs/source/tensorboard.md similarity index 96% rename from docs/tensorboard.md rename to docs/source/tensorboard.md index ad8965032fc..716e094eb35 100644 --- a/docs/tensorboard.md +++ b/docs/source/tensorboard.md @@ -185,13 +185,13 @@ See the [tensorflow.py](https://github.com/intel/neural-compressor/tree/master/n * From the **GRAPHS** tab, select "baseline/." in the "Run" box and find the first 'Conv2d' op after 'input' op. The op name is "v0/cg/conv0/Relu": -![TensorBoard Baseline](imgs/tensorboard_baseline_v0_cg_conv0.png "TensorBoard Baseline") +![TensorBoard Baseline](./_static/imgs/tensorboard_baseline_v0_cg_conv0.png "TensorBoard Baseline") * From the **GRAPHS** tab, select "tune_1/." in the "Run" box and find the first 'Conv2d' op after 'input' op. The tensor name is 'v0/cg/conv0/conv2d/Conv2D_eightbit_requantize': -![TensorBoard Tuning](imgs/tensorboard_tune_1_v0_cg_conv0.png "TensorBoard Tuning") +![TensorBoard Tuning](./_static/imgs/tensorboard_tune_1_v0_cg_conv0.png "TensorBoard Tuning") * Switch to the **HISTOGRAMS** tab. Click the 'v0/cg/conv0' op name in the search box. 
TensorBoard groups the tensors with the same op name together so you can compare the tensor of baseline 'v0/cg/conv0/Relu' with the tensor of tune_1 'v0/cg/conv0/conv2d/Conv2D_eightbit_requantize_int8.output'. Note that the tensor name can be changed after quantization, so group the tensor by op name and compare. From the chart below, we can see that the histogram of the first conv2d output tensor are different. This is due to a known TensorFlow issue. After filtering the 'v0/cg/conv0/conv2d/Conv2D' op by adding "op_wise" in the yaml file, the issue disappears. -![TensorBoard Histogram](imgs/tensorboard_v0_cg_conv0_histogram.png "TensorBoard Histogram") +![TensorBoard Histogram](./_static/imgs/tensorboard_v0_cg_conv0_histogram.png "TensorBoard Histogram") diff --git a/docs/transform.md b/docs/source/transform.md similarity index 100% rename from docs/transform.md rename to docs/source/transform.md diff --git a/docs/tuning_strategies.md b/docs/source/tuning_strategies.md similarity index 99% rename from docs/tuning_strategies.md rename to docs/source/tuning_strategies.md index 55c5fcae411..6e11941559a 100644 --- a/docs/tuning_strategies.md +++ b/docs/source/tuning_strategies.md @@ -17,7 +17,7 @@ Each strategy generates the next quantization configuration according to its logic and the last quantization result. The function of strategies is shown below: -![Tuning Strategy](imgs/strategy.png "Strategy Framework") +![Tuning Strategy](./_static/imgs/strategy.png "Strategy Framework") Strategies begin with an adaptor layer (Framework Adaptor) where the user passes a framework-specific model to initialize an instance of the diff --git a/docs/user_yaml.md b/docs/source/user_yaml.md similarity index 97% rename from docs/user_yaml.md rename to docs/source/user_yaml.md index 179bf197531..c50d28a0e01 100644 --- a/docs/user_yaml.md +++ b/docs/source/user_yaml.md @@ -1,167 +1,167 @@ -User YAML Configuration Files -===== -1. [Introduction](#introduction) -2. [Supported Feature Matrix](#supported-feature-matrix) -3. [Get Started with User YAML Files](#get-started-with-user-yaml-files) - - -## Introduction - -Intel® Neural Compressor uses YAML files for quick -and user-friendly configurations. There are two types of YAML files - -user YAML files and framework YAML files, which are used in -running user cases and setting up framework capabilities, respectively. - -First, let's take a look at a user YAML file, It defines the model, tuning -strategies, tuning calibrations and evaluations, and performance benchmarking -of the passing model vs. original model. - -## Supported Feature Matrix - -| Optimization Techniques | YAML Configuration Files | -|-------------------------|:------------------------:| -| Quantization | ✔ | -| Pruning | ✔ | -| Distillation | ✔ | - - -## Get started with User YAML Files - - -A complete user YAML file is organized logically into several sections: - -* ***model***: The model specifications define a user model's name, inputs, outputs and framework. - - -```yaml -model: # mandatory. used to specify model specific information. - name: mobilenet_v1 - framework: tensorflow # mandatory. supported values are tensorflow, pytorch, pytorch_ipex, onnxrt_integer, onnxrt_qlinear or mxnet; allow new framework backend extension. - inputs: image_tensor # optional. inputs field is only required in tensorflow. - outputs: num_detections,detection_boxes,detection_scores,detection_classes # optional. outputs field is only required in tensorflow. 
-``` -* ***quantization***: The quantization specifications define quantization tuning space and related calibrations. To calibrate, users can -specify *sampling_size* (optional) and use the subsection *dataloader* to specify -the dataset location using *root* and transformation using *transform*. To -implement tuning space constraints, users can use the subsection *model_wise* and *op_wise* for specific configurations. - -```yaml -quantization: # optional. tuning constraints on model-wise for advance user to reduce tuning space. - calibration: - sampling_size: 20 # optional. default value is 100. used to set how many samples should be used in calibration. - dataloader: - dataset: - ImageRecord: - root: /path/to/imagenet/ # NOTE: modify to calibration dataset location if needed - transform: - BilinearImagenet: - height: 224 - width: 224 - model_wise: # optional. tuning constraints on model-wise for advance user to reduce tuning space. - weight: - granularity: per_channel - scheme: asym - dtype: int8 - algorithm: minmax - activation: - granularity: per_tensor - scheme: asym - dtype: int8, fp32 - algorithm: minmax, kl - op_wise: { # optional. tuning constraints on op-wise for advance user to reduce tuning space. - 'conv1': { - 'activation': {'dtype': ['uint8', 'fp32'], - 'algorithm': ['minmax', 'kl'], - 'scheme':['sym']}, - 'weight': {'dtype': ['int8', 'fp32'], - 'algorithm': ['minmax']} - } - } -``` - -* ***pruning***: The pruning specifications define pruning tuning space. To define the training behavior, uses can -use the subsection *train* to specify the training hyper-parameters and the training dataloader. -To define the pruning approach, users can use the subsection *approach* to specify -pruning target, choose the type of pruning algorithm, and the way to apply it -during training process. - -```yaml -pruning: - train: - dataloader: - ... - epoch: 40 - optimizer: - Adam: - learning_rate: 1e-06 - beta_1: 0.9 - beta_2: 0.999 - epsilon: 1e-07 - criterion: - SparseCategoricalCrossentropy: - reduction: sum_over_batch_size - from_logits: False - approach: - weight_compression: - initial_sparsity: 0.0 - target_sparsity: 0.54 - start_epoch: 0 - end_epoch: 19 - pruners: - - !Pruner - start_epoch: 0 - end_epoch: 19 - prune_type: basic_magnitude -``` -* ***distillation***: The distillation specifications define distillation's tuning -space. Similar to pruning, to define the training behavior, users can use the -subsection *train* to specify the training hyper-parameters and the training -dataloader and it is optional if users implement *train_func* and set the attribute -of distillation instance to *train_func*. For criterion, Intel® Neural Compressor provides a built-in -knowledge distillation loss class to calculate distillation loss. -```yaml -distillation: - train: - start_epoch: 0 - end_epoch: 90 - iteration: 1000 - frequency: 1 - dataloader: - ... - optimizer: - SGD: - learning_rate: 0.001 - momentum: 0.1 - nesterov: True - weight_decay: 0.001 - criterion: - KnowledgeDistillationLoss: - temperature: 1.0 - loss_types: ['CE', 'CE'] - loss_weights: [0.5, 0.5] -``` -* ***evaluation***: The evaluation specifications define the dataloader and metric for accuracy evaluation as well as dataloader -and configurations for performance benchmarking. -```yaml -evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. - accuracy: - metric: - ... - dataloader: - ... -``` -* ***tuning***: The tuning specifications define overall tuning targets. 
Users can -use *accuracy_criterion* to specify the target of accuracy loss percentage and use -*exit_policy* to specify the tuning timeout in seconds. The random -seed can be specified using *random_seed*. - -```yaml -tuning: - accuracy_criterion: - relative: 0.01 # the tuning target of accuracy loss percentage: 1% - higher_is_better: True - exit_policy: - timeout: 0 # tuning timeout (seconds), 0 means early stop - random_seed: 9527 # random seed -``` - +User YAML Configuration Files +===== +1. [Introduction](#introduction) +2. [Supported Feature Matrix](#supported-feature-matrix) +3. [Get Started with User YAML Files](#get-started-with-user-yaml-files) + + +## Introduction + +Intel® Neural Compressor uses YAML files for quick +and user-friendly configurations. There are two types of YAML files - +user YAML files and framework YAML files, which are used in +running user cases and setting up framework capabilities, respectively. + +First, let's take a look at a user YAML file, It defines the model, tuning +strategies, tuning calibrations and evaluations, and performance benchmarking +of the passing model vs. original model. + +## Supported Feature Matrix + +| Optimization Techniques | YAML Configuration Files | +|-------------------------|:------------------------:| +| Quantization | ✔ | +| Pruning | ✔ | +| Distillation | ✔ | + + +## Get started with User YAML Files + + +A complete user YAML file is organized logically into several sections: + +* ***model***: The model specifications define a user model's name, inputs, outputs and framework. + + +```yaml +model: # mandatory. used to specify model specific information. + name: mobilenet_v1 + framework: tensorflow # mandatory. supported values are tensorflow, pytorch, pytorch_ipex, onnxrt_integer, onnxrt_qlinear or mxnet; allow new framework backend extension. + inputs: image_tensor # optional. inputs field is only required in tensorflow. + outputs: num_detections,detection_boxes,detection_scores,detection_classes # optional. outputs field is only required in tensorflow. +``` +* ***quantization***: The quantization specifications define quantization tuning space and related calibrations. To calibrate, users can +specify *sampling_size* (optional) and use the subsection *dataloader* to specify +the dataset location using *root* and transformation using *transform*. To +implement tuning space constraints, users can use the subsection *model_wise* and *op_wise* for specific configurations. + +```yaml +quantization: # optional. tuning constraints on model-wise for advance user to reduce tuning space. + calibration: + sampling_size: 20 # optional. default value is 100. used to set how many samples should be used in calibration. + dataloader: + dataset: + ImageRecord: + root: /path/to/imagenet/ # NOTE: modify to calibration dataset location if needed + transform: + BilinearImagenet: + height: 224 + width: 224 + model_wise: # optional. tuning constraints on model-wise for advance user to reduce tuning space. + weight: + granularity: per_channel + scheme: asym + dtype: int8 + algorithm: minmax + activation: + granularity: per_tensor + scheme: asym + dtype: int8, fp32 + algorithm: minmax, kl + op_wise: { # optional. tuning constraints on op-wise for advance user to reduce tuning space. + 'conv1': { + 'activation': {'dtype': ['uint8', 'fp32'], + 'algorithm': ['minmax', 'kl'], + 'scheme':['sym']}, + 'weight': {'dtype': ['int8', 'fp32'], + 'algorithm': ['minmax']} + } + } +``` + +* ***pruning***: The pruning specifications define pruning tuning space. 
To define the training behavior, uses can +use the subsection *train* to specify the training hyper-parameters and the training dataloader. +To define the pruning approach, users can use the subsection *approach* to specify +pruning target, choose the type of pruning algorithm, and the way to apply it +during training process. + +```yaml +pruning: + train: + dataloader: + ... + epoch: 40 + optimizer: + Adam: + learning_rate: 1e-06 + beta_1: 0.9 + beta_2: 0.999 + epsilon: 1e-07 + criterion: + SparseCategoricalCrossentropy: + reduction: sum_over_batch_size + from_logits: False + approach: + weight_compression: + initial_sparsity: 0.0 + target_sparsity: 0.54 + start_epoch: 0 + end_epoch: 19 + pruners: + - !Pruner + start_epoch: 0 + end_epoch: 19 + prune_type: basic_magnitude +``` +* ***distillation***: The distillation specifications define distillation's tuning +space. Similar to pruning, to define the training behavior, users can use the +subsection *train* to specify the training hyper-parameters and the training +dataloader and it is optional if users implement *train_func* and set the attribute +of distillation instance to *train_func*. For criterion, Intel® Neural Compressor provides a built-in +knowledge distillation loss class to calculate distillation loss. +```yaml +distillation: + train: + start_epoch: 0 + end_epoch: 90 + iteration: 1000 + frequency: 1 + dataloader: + ... + optimizer: + SGD: + learning_rate: 0.001 + momentum: 0.1 + nesterov: True + weight_decay: 0.001 + criterion: + KnowledgeDistillationLoss: + temperature: 1.0 + loss_types: ['CE', 'CE'] + loss_weights: [0.5, 0.5] +``` +* ***evaluation***: The evaluation specifications define the dataloader and metric for accuracy evaluation as well as dataloader +and configurations for performance benchmarking. +```yaml +evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. + accuracy: + metric: + ... + dataloader: + ... +``` +* ***tuning***: The tuning specifications define overall tuning targets. Users can +use *accuracy_criterion* to specify the target of accuracy loss percentage and use +*exit_policy* to specify the tuning timeout in seconds. The random +seed can be specified using *random_seed*. + +```yaml +tuning: + accuracy_criterion: + relative: 0.01 # the tuning target of accuracy loss percentage: 1% + higher_is_better: True + exit_policy: + timeout: 0 # tuning timeout (seconds), 0 means early stop + random_seed: 9527 # random seed +``` + diff --git a/docs/validated_model_list.md b/docs/source/validated_model_list.md similarity index 98% rename from docs/validated_model_list.md rename to docs/source/validated_model_list.md index 5c1c95eb21c..7a8e50fbfbe 100644 --- a/docs/validated_model_list.md +++ b/docs/source/validated_model_list.md @@ -1819,12 +1819,22 @@ Performance varies by use, configuration and other factors. See [platform config + + Bert-Mini + text classification
MRPC + f1=87.52
f1=86.8 + -0.83% + 60%
structured per channel + snip momentum
unbalanced + + + Bert-Mini text classification
SST-2 accuracy=87.61
accuracy=86.92 -0.79% - 90%
Structured 4x1 + 90%
structured 4x1 snip momentum
unbalanced @@ -1839,6 +1849,16 @@ Performance varies by use, configuration and other factors. See [platform config + + Bert-Mini + text classification
SST-2 + accuracy=87.61
accuracy=86.92 + -0.79% + 50%
structured per channel + snip momentum
unbalanced + + + diff --git a/docs/sphinx-requirements.txt b/docs/sphinx-requirements.txt new file mode 100644 index 00000000000..b38e80ab0e9 --- /dev/null +++ b/docs/sphinx-requirements.txt @@ -0,0 +1,6 @@ +sphinx +pytorch_sphinx_theme +recommonmark +sphinx-markdown-tables +sphinx-md +sphinx-autoapi \ No newline at end of file diff --git a/docs/welcome.md b/docs/welcome.md deleted file mode 100644 index 51e12e13a40..00000000000 --- a/docs/welcome.md +++ /dev/null @@ -1,26 +0,0 @@ -Introduction to Intel® Neural Compressor -========================== - -Intel® Neural Compressor (formerly known as Intel® Low Precision Optimization Tool) is an open-source Python library running on Intel CPUs and GPUs, which delivers unified interfaces across multiple deep learning frameworks for popular network compression technologies, such as quantization, pruning, knowledge distillation. This tool supports automatic accuracy-driven tuning strategies to help user quickly find out the best quantized model. It also implements different weight pruning algorithms to generate pruned model with predefined sparsity goal and supports knowledge distillation to distill the knowledge from the teacher model to the student model. - -> **Note**: GPU support is under development. - -| Architecture | Workflow | -| - | - | -| ![Architecture](imgs/architecture.png "Architecture") | ![Workflow](imgs/workflow.png "Workflow") | - -Supported deep learning frameworks are: - -* [TensorFlow\*](https://github.com/Intel-tensorflow/tensorflow), including [1.15.0 UP3](https://github.com/Intel-tensorflow/tensorflow/tree/v1.15.0up3), [1.15.0 UP2](https://github.com/Intel-tensorflow/tensorflow/tree/v1.15.0up2), [1.15.0 UP1](https://github.com/Intel-tensorflow/tensorflow/tree/v1.15.0up1), [2.1.0](https://github.com/Intel-tensorflow/tensorflow/tree/v2.1.0), [2.2.0](https://github.com/Intel-tensorflow/tensorflow/tree/v2.2.0), [2.3.0](https://github.com/Intel-tensorflow/tensorflow/tree/v2.3.0), [2.4.0](https://github.com/Intel-tensorflow/tensorflow/tree/v2.4.0), [2.5.0](https://github.com/Intel-tensorflow/tensorflow/tree/v2.5.0), [Official TensorFlow 2.6.0](https://github.com/tensorflow/tensorflow/tree/v2.6.0) - -> **Note**: Intel Optimized TensorFlow 2.5.0 requires setting environment variable TF_ENABLE_MKL_NATIVE_FORMAT=0 before running quantization process or deploying the quantized model. - -> **Note**: From the official TensorFlow 2.6.0, oneDNN support has been upstreamed. Download the official TensorFlow 2.6.0 binary for the CPU device and set the environment variable TF_ENABLE_ONEDNN_OPTS=1 before running the quantization process or deploying the quantized model. - -* [PyTorch\*](https://pytorch.org/), including [1.5.0+cpu](https://download.pytorch.org/whl/torch_stable.html), [1.6.0+cpu](https://download.pytorch.org/whl/torch_stable.html), [1.8.0+cpu](https://download.pytorch.org/whl/torch_stable.html) -* [Apache\* MXNet](https://mxnet.apache.org), including [1.6.0](https://github.com/apache/incubator-mxnet/tree/1.6.0), [1.7.0](https://github.com/apache/incubator-mxnet/tree/1.7.0), [1.8.0](https://github.com/apache/incubator-mxnet/tree/1.8.0) -* [ONNX\* Runtime](https://github.com/microsoft/onnxruntime), including [1.6.0](https://github.com/microsoft/onnxruntime/tree/v1.6.0), [1.7.0](https://github.com/microsoft/onnxruntime/tree/v1.7.0), [1.8.0](https://github.com/microsoft/onnxruntime/tree/v1.8.0) - -[Get started](getting_started.md) with installation, tutorials, examples, and more! - -View the Intel® Neural Compressor repo at: . 
diff --git a/examples/.config/model_params_pytorch.json b/examples/.config/model_params_pytorch.json index 848c1e9f0c6..df42ff22308 100644 --- a/examples/.config/model_params_pytorch.json +++ b/examples/.config/model_params_pytorch.json @@ -531,6 +531,15 @@ "batch_size": 64, "new_benchmark": false }, + "gpt_j_wikitext":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/ptq_static/fx", + "dataset_location": "", + "input_model": "/tf_dataset2/models/pytorch/gpt-j-6B", + "yaml": "conf.yaml", + "strategy": "basic", + "batch_size": 8, + "new_benchmark": false + }, "xlm-roberta-base_MRPC": { "model_src_dir": "nlp/huggingface_models/text-classification/quantization/ptq_static/eager", "dataset_location": "", diff --git a/examples/README.md b/examples/README.md index 3cbfd9758d9..c2b6cbab811 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,6 +1,6 @@ Examples -=== -Intel® Neural Compressor validated examples with multiple compression techniques, including quantization, pruning, knowledge distillation and orchestration. Part of the validated cases can be found in the example tables, and the release data is available [here](../docs/validated_model_list.md). +========== +Intel® Neural Compressor validated examples with multiple compression techniques, including quantization, pruning, knowledge distillation and orchestration. Part of the validated cases can be found in the example tables, and the release data is available [here](../docs/source/validated_model_list.md). ## Helloworld Examples @@ -519,6 +519,12 @@ Intel® Neural Compressor validated examples with multiple compression technique Post-Training Dynamic Quantization eager + + GPTJ + Natural Language Processing + Post-Training Static Quantization + fx + diff --git a/examples/notebook/usage_example.md b/examples/notebook/usage_example.md index b1454ce04c8..90f910140e6 100644 --- a/examples/notebook/usage_example.md +++ b/examples/notebook/usage_example.md @@ -3,7 +3,7 @@ ## Steps The following diagram shows steps for enabling model with Neural Compressor: -Tutorial +Tutorial ## Example diff --git a/examples/pytorch/image_recognition/CNN-2/distillation/eager/README.md b/examples/pytorch/image_recognition/CNN-2/distillation/eager/README.md index 484ddc0d93a..3180b03d1ac 100644 --- a/examples/pytorch/image_recognition/CNN-2/distillation/eager/README.md +++ b/examples/pytorch/image_recognition/CNN-2/distillation/eager/README.md @@ -9,3 +9,14 @@ python train_without_distillation.py --model_type CNN-10 --epochs 200 --lr 0.1 - # for distillation of the student model CNN-2 with the teacher model CNN-10 python main.py --epochs 200 --lr 0.02 --name CNN-2-distillation --student_type CNN-2 --teacher_type CNN-10 --teacher_model runs/CNN-10/model_best.pth.tar --tensorboard ``` + +We also supported Distributed Data Parallel training on single node and multi nodes settings for distillation. To use Distributed Data Parallel to speedup training, the bash command needs a small adjustment. +
+For example, the bash command follows the template shown below (after the notes), where *``* is the address of the master node (not needed in the single-node case), *``* is the number of processes to launch on the current node (typically the number of GPUs on a GPU node, or 1 when training on CPU), *``* is the number of nodes to use, and *``* is the rank of the current node, an integer from 0 to *``*`-1`.
+
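+As an illustration only (the process count is a made-up value, not a recommendation), a single-node, CPU-only launch with two processes might look like the sketch below; the general multi-node template follows after the notes.
+
+```bash
+# illustrative single-node CPU launch: 1 node, rank 0, 2 processes, CPU training via --no_cuda
+python -m torch.distributed.launch --nproc_per_node=2 --nnodes=1 --node_rank=0 \
+    main.py --epochs 200 --lr 0.02 --name CNN-2-distillation --student_type CNN-2 --teacher_type CNN-10 \
+    --teacher_model runs/CNN-10/model_best.pth.tar --tensorboard --no_cuda
+```
+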
+Also please note that to use CPU for training in each node with multi nodes settings, argument `--no_cuda` is mandatory. In multi nodes setting, following command needs to be launched in each node, and all the commands should be the same except for *``*, which should be integer from 0 to *``*`-1` assigned to each node. + +```bash +python -m torch.distributed.launch --master_addr= --nproc_per_node= --nnodes= --node_rank= \ + main.py --epochs 200 --lr 0.02 --name CNN-2-distillation --student_type CNN-2 --teacher_type CNN-10 --teacher_model runs/CNN-10/model_best.pth.tar --tensorboard +``` \ No newline at end of file diff --git a/examples/pytorch/image_recognition/CNN-2/distillation/eager/main.py b/examples/pytorch/image_recognition/CNN-2/distillation/eager/main.py index e24eb7767ff..685a0109450 100644 --- a/examples/pytorch/image_recognition/CNN-2/distillation/eager/main.py +++ b/examples/pytorch/image_recognition/CNN-2/distillation/eager/main.py @@ -10,6 +10,7 @@ import torchvision.datasets as datasets import torchvision.transforms as transforms +from accelerate import Accelerator from plain_cnn_cifar import ConvNetMaker, plane_cifar100_book # used for logging to TensorBoard @@ -60,6 +61,7 @@ help='loss weights of distillation, should be a list of length 2, ' 'and sum to 1.0, first for student targets loss weight, ' 'second for teacher student loss weight.') +parser.add_argument("--no_cuda", action='store_true', help='use cpu for training.') parser.set_defaults(augment=True) @@ -75,10 +77,13 @@ def set_seed(seed): def main(): global args, best_prec1 args, _ = parser.parse_known_args() + accelerator = Accelerator(cpu=args.no_cuda) + best_prec1 = 0 if args.seed is not None: set_seed(args.seed) - if args.tensorboard: configure("runs/%s" % (args.name)) + with accelerator.local_main_process_first(): + if args.tensorboard: configure("runs/%s"%(args.name)) # Data loading code normalize = transforms.Normalize(mean=[0.5071, 0.4866, 0.4409], std=[0.2675, 0.2565, 0.2761]) @@ -121,9 +126,9 @@ def main(): raise NotImplementedError('Unsupported student model type') # get the number of model parameters - print('Number of teacher model parameters: {}'.format( + accelerator.print('Number of teacher model parameters: {}'.format( sum([p.data.nelement() for p in teacher_model.parameters()]))) - print('Number of student model parameters: {}'.format( + accelerator.print('Number of student model parameters: {}'.format( sum([p.data.nelement() for p in student_model.parameters()]))) kwargs = {'num_workers': 0, 'pin_memory': True} @@ -135,10 +140,10 @@ def main(): if args.loss_weights[1] > 0: from tqdm import tqdm def get_logits(teacher_model, train_dataset): - print("***** Getting logits of teacher model *****") - print(f" Num examples = {len(train_dataset) }") + accelerator.print("***** Getting logits of teacher model *****") + accelerator.print(f" Num examples = {len(train_dataset) }") logits_file = os.path.join(os.path.dirname(args.teacher_model), 'teacher_logits.npy') - if not os.path.exists(logits_file): + if not os.path.exists(logits_file) and accelerator.is_local_main_process: teacher_model.eval() train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, **kwargs) train_dataloader = tqdm(train_dataloader, desc="Evaluating") @@ -147,8 +152,8 @@ def get_logits(teacher_model, train_dataset): outputs = teacher_model(input) teacher_logits += [x for x in outputs.numpy()] np.save(logits_file, np.array(teacher_logits)) - else: - teacher_logits = np.load(logits_file) + 
accelerator.wait_for_everyone() + teacher_logits = np.load(logits_file) train_dataset.targets = [{'labels':l, 'teacher_logits':tl} \ for l, tl in zip(train_dataset.targets, teacher_logits)] return train_dataset @@ -163,15 +168,15 @@ def get_logits(teacher_model, train_dataset): # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): - print("=> loading checkpoint '{}'".format(args.resume)) + accelerator.print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] student_model.load_state_dict(checkpoint['state_dict']) - print("=> loaded checkpoint '{}' (epoch {})" + accelerator.print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) else: - print("=> no checkpoint found at '{}'".format(args.resume)) + accelerator.print("=> no checkpoint found at '{}'".format(args.resume)) # define optimizer optimizer = torch.optim.SGD(student_model.parameters(), args.lr, @@ -179,13 +184,18 @@ def get_logits(teacher_model, train_dataset): weight_decay=args.weight_decay) # cosine learning rate - scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_loader)*args.epochs) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, len(train_loader) * args.epochs // accelerator.num_processes + ) + + student_model, teacher_model, train_loader, val_loader, optimizer = \ + accelerator.prepare(student_model, teacher_model, train_loader, val_loader, optimizer) def train_func(model): - return train(train_loader, model, scheduler, distiller, best_prec1) + return train(train_loader, model, scheduler, distiller, best_prec1, accelerator) def eval_func(model): - return validate(val_loader, model, distiller) + return validate(val_loader, model, distiller, accelerator) from neural_compressor.experimental import Distillation, common from neural_compressor.experimental.common.criterion import PyTorchKnowledgeDistillationLoss @@ -204,11 +214,12 @@ def eval_func(model): directory = "runs/%s/"%(args.name) os.makedirs(directory, exist_ok=True) + model._model = accelerator.unwrap_model(model.model) model.save(directory) # change to framework model for further use model = model.model -def train(train_loader, model, scheduler, distiller, best_prec1): +def train(train_loader, model, scheduler, distiller, best_prec1, accelerator): distiller.on_train_begin() for epoch in range(args.start_epoch, args.epochs): """Train for one epoch on the training set""" @@ -233,13 +244,15 @@ def train(train_loader, model, scheduler, distiller, best_prec1): loss = distiller.on_after_compute_loss(input, output, loss, teacher_logits) # measure accuracy and record loss + output = accelerator.gather(output) + target = accelerator.gather(target) prec1 = accuracy(output.data, target, topk=(1,))[0] - losses.update(loss.data.item(), input.size(0)) - top1.update(prec1.item(), input.size(0)) + losses.update(accelerator.gather(loss).sum().data.item(), input.size(0)*accelerator.num_processes) + top1.update(prec1.item(), input.size(0)*accelerator.num_processes) # compute gradient and do SGD step distiller.optimizer.zero_grad() - loss.backward() + accelerator.backward(loss) # loss.backward() distiller.optimizer.step() scheduler.step() @@ -248,7 +261,7 @@ def train(train_loader, model, scheduler, distiller, best_prec1): end = time.time() if i % args.print_freq == 0: - print('Epoch: [{0}][{1}/{2}]\t' + accelerator.print('Epoch: [{0}][{1}/{2}]\t' 'Time 
{batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' @@ -260,19 +273,20 @@ def train(train_loader, model, scheduler, distiller, best_prec1): # remember best prec@1 and save checkpoint is_best = distiller.best_score > best_prec1 best_prec1 = max(distiller.best_score, best_prec1) - save_checkpoint({ - 'epoch': distiller._epoch_runned + 1, - 'state_dict': model.state_dict(), - 'best_prec1': best_prec1, - }, is_best) - # log to TensorBoard - if args.tensorboard: - log_value('train_loss', losses.avg, epoch) - log_value('train_acc', top1.avg, epoch) - log_value('learning_rate', scheduler._last_lr[0], epoch) + if accelerator.is_local_main_process: + save_checkpoint({ + 'epoch': distiller._epoch_runned + 1, + 'state_dict': model.state_dict(), + 'best_prec1': best_prec1, + }, is_best) + # log to TensorBoard + if args.tensorboard: + log_value('train_loss', losses.avg, epoch) + log_value('train_acc', top1.avg, epoch) + log_value('learning_rate', scheduler._last_lr[0], epoch) -def validate(val_loader, model, distiller): +def validate(val_loader, model, distiller, accelerator): """Perform validation on the validation set""" batch_time = AverageMeter() top1 = AverageMeter() @@ -287,6 +301,8 @@ def validate(val_loader, model, distiller): output = model(input) # measure accuracy + output = accelerator.gather(output) + target = accelerator.gather(target) prec1 = accuracy(output.data, target, topk=(1,))[0] top1.update(prec1.item(), input.size(0)) @@ -295,15 +311,15 @@ def validate(val_loader, model, distiller): end = time.time() if i % args.print_freq == 0: - print('Test: [{0}/{1}]\t' + accelerator.print('Test: [{0}/{1}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format( i, len(val_loader), batch_time=batch_time, top1=top1)) - print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1)) + accelerator.print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1)) # log to TensorBoard - if args.tensorboard: + if accelerator.is_local_main_process and args.tensorboard: log_value('val_acc', top1.avg, distiller._epoch_runned) return top1.avg diff --git a/examples/pytorch/image_recognition/CNN-2/distillation/eager/requirements.txt b/examples/pytorch/image_recognition/CNN-2/distillation/eager/requirements.txt index 8db2f310ef5..71252629880 100644 --- a/examples/pytorch/image_recognition/CNN-2/distillation/eager/requirements.txt +++ b/examples/pytorch/image_recognition/CNN-2/distillation/eager/requirements.txt @@ -2,3 +2,4 @@ torch==1.5.0+cpu torchvision==0.6.0+cpu tensorboard_logger +accelerate \ No newline at end of file diff --git a/examples/pytorch/image_recognition/MobileNetV2-0.35/distillation/eager/README.md b/examples/pytorch/image_recognition/MobileNetV2-0.35/distillation/eager/README.md index 14841061fdc..d449d5f797b 100644 --- a/examples/pytorch/image_recognition/MobileNetV2-0.35/distillation/eager/README.md +++ b/examples/pytorch/image_recognition/MobileNetV2-0.35/distillation/eager/README.md @@ -8,4 +8,15 @@ pip install -r requirements.txt python train_without_distillation.py --epochs 200 --lr 0.1 --layers 40 --widen-factor 2 --name WideResNet-40-2 --tensorboard # for distillation of the teacher model WideResNet40-2 to the student model MobileNetV2-0.35 python main.py --epochs 200 --lr 0.02 --name MobileNetV2-0.35-distillation --teacher_model runs/WideResNet-40-2/model_best.pth.tar --tensorboard --seed 9 +``` + +We also supported Distributed Data Parallel training on single node and multi 
node settings for distillation. To use Distributed Data Parallel to speed up training, the bash command needs a small adjustment.
+
+For example, the bash command follows the template shown below (after the notes), where *``* is the address of the master node (not needed in the single-node case), *``* is the number of processes to launch on the current node (typically the number of GPUs on a GPU node, or 1 when training on CPU), *``* is the number of nodes to use, and *``* is the rank of the current node, an integer from 0 to *``*`-1`.
+
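+As a purely illustrative sketch (the master address, process count, and node count are made-up values, not recommendations), the rank-0 node of a two-node CPU run could be launched as shown below; the general template follows after the notes.
+
+```bash
+# hypothetical two-node CPU run, command for the rank-0 node
+# (the second node would use --node_rank=1; 192.168.0.1 is only a placeholder address)
+python -m torch.distributed.launch --master_addr=192.168.0.1 --nproc_per_node=1 --nnodes=2 --node_rank=0 \
+    main.py --epochs 200 --lr 0.02 --name MobileNetV2-0.35-distillation \
+    --teacher_model runs/WideResNet-40-2/model_best.pth.tar --tensorboard --seed 9 --no_cuda
+```
+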
+Also please note that to use CPU for training in each node with multi nodes settings, argument `--no_cuda` is mandatory. In multi nodes setting, following command needs to be launched in each node, and all the commands should be the same except for *``*, which should be integer from 0 to *``*`-1` assigned to each node. + +```bash +python -m torch.distributed.launch --master_addr= --nproc_per_node= --nnodes= --node_rank= \ + main.py --epochs 200 --lr 0.02 --name MobileNetV2-0.35-distillation --teacher_model runs/WideResNet-40-2/model_best.pth.tar --tensorboard --seed 9 ``` \ No newline at end of file diff --git a/examples/pytorch/image_recognition/MobileNetV2-0.35/distillation/eager/main.py b/examples/pytorch/image_recognition/MobileNetV2-0.35/distillation/eager/main.py index e7f4e56888b..3778162d968 100644 --- a/examples/pytorch/image_recognition/MobileNetV2-0.35/distillation/eager/main.py +++ b/examples/pytorch/image_recognition/MobileNetV2-0.35/distillation/eager/main.py @@ -10,6 +10,7 @@ import torchvision.datasets as datasets import torchvision.transforms as transforms +from accelerate import Accelerator from wideresnet import WideResNet # used for logging to TensorBoard @@ -60,6 +61,7 @@ help='loss weights of distillation, should be a list of length 2, ' 'and sum to 1.0, first for student targets loss weight, ' 'second for teacher student loss weight.') +parser.add_argument("--no_cuda", action='store_true', help='use cpu for training.') parser.set_defaults(augment=True) def set_seed(seed): @@ -73,10 +75,13 @@ def set_seed(seed): def main(): global args, best_prec1 args, _ = parser.parse_known_args() + accelerator = Accelerator(cpu=args.no_cuda) + best_prec1 = 0 if args.seed is not None: set_seed(args.seed) - if args.tensorboard: configure("runs/%s"%(args.name)) + with accelerator.local_main_process_first(): + if args.tensorboard: configure("runs/%s"%(args.name)) # Data loading code normalize = transforms.Normalize(mean=[x/255.0 for x in [125.3, 123.0, 113.9]], @@ -111,9 +116,9 @@ def main(): student_model = mobilenet.MobileNetV2(num_classes=10, width_mult=0.35) # get the number of model parameters - print('Number of teacher model parameters: {}'.format( + accelerator.print('Number of teacher model parameters: {}'.format( sum([p.data.nelement() for p in teacher_model.parameters()]))) - print('Number of student model parameters: {}'.format( + accelerator.print('Number of student model parameters: {}'.format( sum([p.data.nelement() for p in student_model.parameters()]))) kwargs = {'num_workers': 0, 'pin_memory': True} @@ -125,10 +130,10 @@ def main(): if args.loss_weights[1] > 0: from tqdm import tqdm def get_logits(teacher_model, train_dataset): - print("***** Getting logits of teacher model *****") - print(f" Num examples = {len(train_dataset) }") + accelerator.print("***** Getting logits of teacher model *****") + accelerator.print(f" Num examples = {len(train_dataset) }") logits_file = os.path.join(os.path.dirname(args.teacher_model), 'teacher_logits.npy') - if not os.path.exists(logits_file): + if not os.path.exists(logits_file) and accelerator.is_local_main_process: teacher_model.eval() train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, **kwargs) train_dataloader = tqdm(train_dataloader, desc="Evaluating") @@ -137,8 +142,8 @@ def get_logits(teacher_model, train_dataset): outputs = teacher_model(input) teacher_logits += [x for x in outputs.numpy()] np.save(logits_file, np.array(teacher_logits)) - else: - teacher_logits = np.load(logits_file) + 
accelerator.wait_for_everyone() + teacher_logits = np.load(logits_file) train_dataset.targets = [{'labels':l, 'teacher_logits':tl} \ for l, tl in zip(train_dataset.targets, teacher_logits)] return train_dataset @@ -153,15 +158,15 @@ def get_logits(teacher_model, train_dataset): # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): - print("=> loading checkpoint '{}'".format(args.resume)) + accelerator.print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] student_model.load_state_dict(checkpoint['state_dict']) - print("=> loaded checkpoint '{}' (epoch {})" + accelerator.print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) else: - print("=> no checkpoint found at '{}'".format(args.resume)) + accelerator.print("=> no checkpoint found at '{}'".format(args.resume)) # define optimizer optimizer = torch.optim.SGD(student_model.parameters(), args.lr, @@ -169,13 +174,18 @@ def get_logits(teacher_model, train_dataset): weight_decay=args.weight_decay) # cosine learning rate - scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_loader)*args.epochs) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, len(train_loader) * args.epochs // accelerator.num_processes + ) + + student_model, teacher_model, train_loader, val_loader, optimizer = \ + accelerator.prepare(student_model, teacher_model, train_loader, val_loader, optimizer) def train_func(model): - return train(train_loader, model, scheduler, distiller, best_prec1) + return train(train_loader, model, scheduler, distiller, best_prec1, accelerator) def eval_func(model): - return validate(val_loader, model, distiller) + return validate(val_loader, model, distiller, accelerator) from neural_compressor.experimental import Distillation, common from neural_compressor.experimental.common.criterion import PyTorchKnowledgeDistillationLoss @@ -194,11 +204,12 @@ def eval_func(model): directory = "runs/%s/"%(args.name) os.makedirs(directory, exist_ok=True) + model._model = accelerator.unwrap_model(model.model) model.save(directory) # change to framework model for further use model = model.model -def train(train_loader, model, scheduler, distiller, best_prec1): +def train(train_loader, model, scheduler, distiller, best_prec1, accelerator): distiller.on_train_begin() for epoch in range(args.start_epoch, args.epochs): """Train for one epoch on the training set""" @@ -222,13 +233,15 @@ def train(train_loader, model, scheduler, distiller, best_prec1): loss = distiller.on_after_compute_loss(input, output, loss, teacher_logits) # measure accuracy and record loss + output = accelerator.gather(output) + target = accelerator.gather(target) prec1 = accuracy(output.data, target, topk=(1,))[0] - losses.update(loss.data.item(), input.size(0)) - top1.update(prec1.item(), input.size(0)) + losses.update(accelerator.gather(loss).sum().data.item(), input.size(0)*accelerator.num_processes) + top1.update(prec1.item(), input.size(0)*accelerator.num_processes) # compute gradient and do SGD step distiller.optimizer.zero_grad() - loss.backward() + accelerator.backward(loss) # loss.backward() distiller.optimizer.step() scheduler.step() @@ -237,7 +250,7 @@ def train(train_loader, model, scheduler, distiller, best_prec1): end = time.time() if i % args.print_freq == 0: - print('Epoch: [{0}][{1}/{2}]\t' + accelerator.print('Epoch: [{0}][{1}/{2}]\t' 'Time 
{batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' @@ -249,19 +262,20 @@ def train(train_loader, model, scheduler, distiller, best_prec1): # remember best prec@1 and save checkpoint is_best = distiller.best_score > best_prec1 best_prec1 = max(distiller.best_score, best_prec1) - save_checkpoint({ - 'epoch': distiller._epoch_runned + 1, - 'state_dict': model.state_dict(), - 'best_prec1': best_prec1, - }, is_best) - # log to TensorBoard - if args.tensorboard: - log_value('train_loss', losses.avg, epoch) - log_value('train_acc', top1.avg, epoch) - log_value('learning_rate', scheduler._last_lr[0], epoch) + if accelerator.is_local_main_process: + save_checkpoint({ + 'epoch': distiller._epoch_runned + 1, + 'state_dict': model.state_dict(), + 'best_prec1': best_prec1, + }, is_best) + # log to TensorBoard + if args.tensorboard: + log_value('train_loss', losses.avg, epoch) + log_value('train_acc', top1.avg, epoch) + log_value('learning_rate', scheduler._last_lr[0], epoch) -def validate(val_loader, model, distiller): +def validate(val_loader, model, distiller, accelerator): """Perform validation on the validation set""" batch_time = AverageMeter() top1 = AverageMeter() @@ -276,6 +290,8 @@ def validate(val_loader, model, distiller): output = model(input) # measure accuracy + output = accelerator.gather(output) + target = accelerator.gather(target) prec1 = accuracy(output.data, target, topk=(1,))[0] top1.update(prec1.item(), input.size(0)) @@ -284,15 +300,15 @@ def validate(val_loader, model, distiller): end = time.time() if i % args.print_freq == 0: - print('Test: [{0}/{1}]\t' + accelerator.print('Test: [{0}/{1}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format( i, len(val_loader), batch_time=batch_time, top1=top1)) - print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1)) + accelerator.print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1)) # log to TensorBoard - if args.tensorboard: + if accelerator.is_local_main_process and args.tensorboard: log_value('val_acc', top1.avg, distiller._epoch_runned) return top1.avg diff --git a/examples/pytorch/image_recognition/MobileNetV2-0.35/distillation/eager/requirements.txt b/examples/pytorch/image_recognition/MobileNetV2-0.35/distillation/eager/requirements.txt index 8db2f310ef5..71252629880 100644 --- a/examples/pytorch/image_recognition/MobileNetV2-0.35/distillation/eager/requirements.txt +++ b/examples/pytorch/image_recognition/MobileNetV2-0.35/distillation/eager/requirements.txt @@ -2,3 +2,4 @@ torch==1.5.0+cpu torchvision==0.6.0+cpu tensorboard_logger +accelerate \ No newline at end of file diff --git a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/ipex/main.py b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/ipex/main.py index 7186718ab61..a5ce5a9885c 100644 --- a/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/ipex/main.py +++ b/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/ipex/main.py @@ -24,6 +24,7 @@ import torchvision.models as models from neural_compressor.adaptor.pytorch import get_torch_version from packaging.version import Version +import intel_extension_for_pytorch model_names = sorted(name for name in models.__dict__ diff --git a/examples/pytorch/nlp/huggingface_models/common/README.md b/examples/pytorch/nlp/huggingface_models/common/README.md index 17a2b3d22e3..4904434f6b4 100644 --- 
a/examples/pytorch/nlp/huggingface_models/common/README.md +++ b/examples/pytorch/nlp/huggingface_models/common/README.md @@ -16,7 +16,7 @@ limitations under the License.


- +

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/README.md
new file mode 100644
index 00000000000..b3599c59a88
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/README.md
@@ -0,0 +1,38 @@
+Step-by-Step
+============
+
+This document lists the steps to reproduce the PyTorch GPT-J tuning result on the WikiText dataset.
+
+# Prerequisite
+
+## 1. Installation
+
+The dependent packages are listed in requirements.txt; install them as follows.
+
+```
+pip install -r requirements.txt
+```
+
+## 2. Run
+
+If the automatic download from the model hub fails, you can download [EleutherAI/gpt-j-6B](https://huggingface.co/EleutherAI/gpt-j-6B?text=My+name+is+Clara+and+I+am) manually and pass its local path to `--model_name_or_path`.
+
+```shell
+
+python run_clm.py \
+    --model_name_or_path EleutherAI/gpt-j-6B \
+    --dataset_name wikitext \
+    --dataset_config_name wikitext-2-raw-v1 \
+    --do_train \
+    --do_eval \
+    --tune \
+    --output_dir /path/to/checkpoint/dir
+```
+
+
+## 3. Command
+
+```
+bash run_tuning.sh --topology=gpt_j_wikitext
+bash run_benchmark.sh --topology=gpt_j_wikitext --mode=performance --int8=true
+```
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/conf.yaml b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/conf.yaml
new file mode 100644
index 00000000000..0f75f809781
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/conf.yaml
@@ -0,0 +1,31 @@
+#
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+version: 1.0
+
+model:                          # mandatory. used to specify model specific information.
+  name: bert
+  framework: pytorch_fx         # mandatory. possible values are tensorflow, mxnet, pytorch, pytorch_ipex, onnxrt_integerops and onnxrt_qlinearops.
+
+quantization:                   # optional. tuning constraints on model-wise for advanced users to reduce the tuning space.
+  approach: post_training_static_quant
+
+tuning:
+  accuracy_criterion:
+    relative: 0.5               # optional. default criterion is relative, the other option is absolute. this example allows a relative loss increase of 0.5 (i.e. 50%).
+    higher_is_better: False
+  exit_policy:
+    max_trials: 600
+  random_seed: 9527             # optional. random seed for deterministic tuning.
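For orientation, the sketch below condenses how a conf.yaml like the one above is consumed by the run_clm.py added later in this patch. It is only an illustrative sketch; `model`, `trainer`, `eval_func_for_nc`, and `training_args` are assumed to be defined as in that script.

```python
# Illustrative sketch of the yaml-driven tuning flow used by run_clm.py in this patch.
# `model`, `trainer`, `eval_func_for_nc`, and `training_args` are assumed to exist in the caller.
from neural_compressor.experimental import Quantization, common

quantizer = Quantization("./conf.yaml")                      # reads approach, accuracy_criterion and max_trials from the yaml above
quantizer.model = common.Model(model)                        # wrap the FP32 model for tuning
quantizer.calib_dataloader = trainer.get_eval_dataloader()   # calibration data for post_training_static_quant
quantizer.eval_func = eval_func_for_nc                       # metric guarded by the accuracy criterion
q_model = quantizer.fit()                                    # calibrate, quantize, and tune within max_trials
q_model.save(training_args.output_dir)                       # int8 artifacts; reload later with neural_compressor.utils.pytorch.load
```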
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/requirements.txt new file mode 100644 index 00000000000..763bed755a8 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/requirements.txt @@ -0,0 +1,5 @@ +sentencepiece != 0.1.92 +protobuf +evaluate +datasets +transformers >= 4.22.0 diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_benchmark.sh new file mode 100644 index 00000000000..a36507f4fca --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_benchmark.sh @@ -0,0 +1,91 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + iters=100 + batch_size=16 + tuned_checkpoint=saved_results + max_eval_samples=`expr ${iters} \* ${batch_size}` + echo ${max_eval_samples} + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo ${var} |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + --config=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + extra_cmd='' + + if [[ ${mode} == "accuracy" ]]; then + mode_cmd=" --accuracy_only " + elif [[ ${mode} == "benchmark" ]]; then + mode_cmd=" --benchmark " + extra_cmd=$extra_cmd" --max_eval_samples ${max_eval_samples}" + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + if [ "${topology}" = "gpt_j_wikitext" ]; then + TASK_NAME='wikitext' + model_name_or_path=$input_model + extra_cmd='--dataset_config_name=wikitext-2-raw-v1' + fi + + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + echo $extra_cmd + + python -u run_clm.py \ + --model_name_or_path ${model_name_or_path} \ + --dataset_name ${TASK_NAME} \ + --do_eval \ + --per_device_eval_batch_size ${batch_size} \ + --output_dir ${tuned_checkpoint} \ + ${mode_cmd} \ + ${extra_cmd} + +} + +main "$@" diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_clm.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_clm.py new file mode 100644 index 00000000000..17a32f1b57a --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_clm.py @@ -0,0 +1,650 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=text-generation +""" +# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from itertools import chain +from typing import Optional + +import datasets +from datasets import load_dataset + +import evaluate +import transformers +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + Trainer, + TrainingArguments, + default_data_collator, + is_torch_tpu_available, + set_seed, +) +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.22.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": ( + "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." + ) + }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + ) + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": ( + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " + "with private models)." 
+ ) + }, + ) + tune: bool = field( + default=False, metadata={"help": "tune quantized model with Neural Compressor"} + ) + int8: bool = field( + default=False, metadata={"help": "use int8 model to get accuracy or benchmark"} + ) + benchmark: bool = field( + default=False, metadata={"help": "get benchmark instead of accuracy"} + ) + accuracy_only: bool = field( + default=False, metadata={"help": "get accuracy"} + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ) + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + ) + }, + ) + + block_size: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." + ) + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. 
+ # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_clm", model_args, data_args) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
+ raw_datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) + raw_datasets["train"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + data_files = {} + dataset_args = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = data_args.keep_linebreaks + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + **dataset_args, + ) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + **dataset_args, + ) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if model_args.config_overrides is not None: + logger.info(f"Overriding config: {model_args.config_overrides}") + config.update_from_string(model_args.config_overrides) + logger.info(f"New config: {config}") + + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + + if model_args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + model = AutoModelForCausalLM.from_config(config) + n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) + + # Preprocessing the datasets. + # First we tokenize all the texts. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + else: + column_names = raw_datasets["validation"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits" + " before being passed to the model." 
+ ) + return output + + with training_args.main_process_first(desc="dataset map tokenization"): + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with training_args.main_process_first(desc="grouping texts together"): + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + + if training_args.do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = lm_datasets["train"] + if data_args.max_train_samples is not None: + max_train_samples = min(len(train_dataset), data_args.max_train_samples) + train_dataset = train_dataset.select(range(max_train_samples)) + + if training_args.do_eval: + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = lm_datasets["validation"] + if data_args.max_eval_samples is not None: + max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + + def preprocess_logits_for_metrics(logits, labels): + if isinstance(logits, tuple): + # Depending on the model and config, logits may contain extra tensors, + # like past_key_values, but logits always come first + logits = logits[0] + return logits.argmax(dim=-1) + + metric = evaluate.load("accuracy") + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # preds have the same shape as the labels, after the argmax(-1) has been calculated + # by preprocess_logits_for_metrics but we need to shift the labels + labels = labels[:, 1:].reshape(-1) + preds = preds[:, :-1].reshape(-1) + return metric.compute(predictions=preds, references=labels) + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. 
+ data_collator=default_data_collator, + compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, + preprocess_logits_for_metrics=preprocess_logits_for_metrics + if training_args.do_eval and not is_torch_tpu_available() + else None, + ) + + # Tune + if model_args.tune: + def eval_func_for_nc(model_tuned): + trainer.model = model_tuned + eval_output = trainer.evaluate(eval_dataset=eval_dataset) + perplexity = math.exp(eval_output["eval_loss"]) + results = {"perplexity":perplexity,"eval_loss":eval_output["eval_loss"],\ + "eval_samples_per_second":eval_output['eval_samples_per_second']} + clm_task_metrics_keys = ["perplexity","eval_loss"] + for key in clm_task_metrics_keys: + if key in results.keys(): + logger.info("Finally Eval {}:{}".format(key, results[key])) + if key=="eval_loss": + eval_loss = results[key] + break + print("Accuracy: %.5f" % eval_loss) + print('Throughput: %.3f samples/sec' % (results["eval_samples_per_second"])) + print('Latency: %.3f ms' % (1 * 1000 / results["eval_samples_per_second"])) + print('Batch size = %d' % training_args.per_device_eval_batch_size) + + return eval_loss + + from neural_compressor.experimental import Quantization, common + quantizer = Quantization("./conf.yaml") + quantizer.model = common.Model(model) + quantizer.calib_dataloader = trainer.get_eval_dataloader() + quantizer.eval_func = eval_func_for_nc + q_model = quantizer.fit() + q_model.save(training_args.output_dir) + exit(0) + + # Benchmark or accuracy + if model_args.benchmark or model_args.accuracy_only: + if model_args.int8: + from neural_compressor.utils.pytorch import load + new_model = load( + os.path.abspath(os.path.expanduser(training_args.output_dir)), model) + else: + new_model = model + trainer.model = new_model + eval_output = trainer.evaluate(eval_dataset=eval_dataset) + perplexity = math.exp(eval_output["eval_loss"]) + results = {"perplexity":perplexity,"eval_loss":eval_output["eval_loss"],\ + "eval_samples_per_second":eval_output['eval_samples_per_second']} + clm_task_metrics_keys = ["eval_loss"] + for key in clm_task_metrics_keys: + if key in results.keys(): + acc = results[key] + break + print("Accuracy: %.5f" % acc) + print('Throughput: %.3f samples/sec' % (results["eval_samples_per_second"])) + print('Latency: %.3f ms' % (1 * 1000 / results["eval_samples_per_second"])) + print('Batch size = %d' % training_args.per_device_eval_batch_size) + exit(0) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = 
float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + trainer.push_to_hub(**kwargs) + else: + trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_tuning.sh new file mode 100644 index 00000000000..04b16872a59 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_tuning.sh @@ -0,0 +1,63 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + tuned_checkpoint=saved_results + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + extra_cmd='' + batch_size=8 + model_type='bert' + approach='post_training_static_quant' + + if [ "${topology}" = "gpt_j_wikitext" ]; then + TASK_NAME='wikitext' + model_name_or_path=$input_model + extra_cmd='--dataset_config_name=wikitext-2-raw-v1' + fi + + + python -u run_clm.py \ + --model_name_or_path ${model_name_or_path} \ + --dataset_name ${TASK_NAME} \ + --do_eval \ + --per_device_eval_batch_size ${batch_size} \ + --output_dir ${tuned_checkpoint} \ + --tune \ + ${extra_cmd} + +} + +main "$@" diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/README.md b/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/README.md index 610b624d747..4d340c5e466 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/README.md +++ b/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/README.md @@ -144,6 +144,40 @@ python3 ./run_glue_no_trainer.py \ --lr_scheduler_type "constant"\ --do_prune ``` +Also, per-channel pruning is also supported. 
+``` +python3 ./run_glue_no_trainer.py \ + --model_name_or_path "./mrpcbaseline/bert-mini/" \ + --pruning_config "./bert_mini_mrpc_1xchannel.yaml" \ + --task_name "mrpc" \ + --max_length "128" \ + --per_device_train_batch_size "16" \ + --learning_rate "1e-3" \ + --num_train_epochs "15" \ + --weight_decay "1e-3" \ + --cooldown_epochs "5" \ + --sparsity_warm_epochs "1"\ + --lr_scheduler_type "constant"\ + --distill_loss_weight "5"\ + --do_prune +``` +``` +python3 ./run_glue_no_trainer.py \ + --model_name_or_path "./sst2_baseline/bert-mini/" \ + --pruning_config "./bert_mini_sst2_1xchannel.yaml" \ + --task_name "sst2" \ + --max_length "128" \ + --per_device_train_batch_size "16" \ + --learning_rate "5e-5" \ + --distill_loss_weight "2.0" \ + --num_train_epochs "15" \ + --weight_decay "5e-5" \ + --cooldown_epochs "5" \ + --sparsity_warm_epochs "0"\ + --lr_scheduler_type "constant"\ + --do_prune +``` + We can also train a dense model on glue datasets (by setting --do_prune to False): ``` python run_glue_no_trainer.py --model_name_or_path "./bert-mini" --task_name "sst2" --max_length "128" --per_device_train_batch_size "32" --learning_rate "5e-5" --num_train_epochs "10" --output_dir "result/" 2>&1 | tee sst2_orig.log @@ -158,12 +192,14 @@ python3 run_glue_no_trainer.py --model_name_or_path "./bert-mini" --task_name | :----: | :----: | :----: | :----: |:----:|:----:| :----: | :----: | :----: | | Bert-Mini | MRPC | 4x1 |Snip-momentum| 0.8804 | Dense & Finetuned | 0.8619/0.8752 | 0.8610/0.8722 | -0.34% | | Bert-Mini | MRPC | 2:4 |Snip-momentum| 0.4795 | Dense & Finetuned | 0.8619/0.8752| 0.8562/0.8695 | -0.65% | +| Bert-Mini | MRPC | per channel |Snip-momentum| 0.66 | Dense & Finetuned | 0.8619/0.8752| 0.8629/0.8680 | -0.83% | #### SST-2 | Model | Dataset | Sparsity pattern | Pruning methods |Element-wise/matmul, Gemm, conv ratio | Init model | Dense Accuracy (mean/max) | Sparse Accuracy (mean/max)| Relative drop| | :----: | :----: | :----: | :----: |:----:|:----:| :----: | :----: | :----: | | Bert-Mini | SST-2 | 4x1 |Snip-momentum| 0.8815 | Dense & Finetuned | 0.8660/0.8761 | 0.8651/0.8692 | -0.79% | | Bert-Mini | SST-2 | 2:4 |Snip-momentum| 0.4795 | Dense & Finetuned | 0.8660/0.8761 | 0.8609/0.8693| -0.78% | +| Bert-Mini | SST-2 | per channel |Snip-momentum| 0.53 | Dense & Finetuned | 0.8660/0.8761 | 0.8651/0.8692| -0.79% | ## References * [SNIP: Single-shot Network Pruning based on Connection Sensitivity](https://arxiv.org/abs/1810.02340) diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/bert_mini_mrpc_1xchannel.yaml b/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/bert_mini_mrpc_1xchannel.yaml new file mode 100644 index 00000000000..33b29c17c6c --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/bert_mini_mrpc_1xchannel.yaml @@ -0,0 +1,23 @@ +version: 1.0 + +model: + name: "bert-mini" + framework: "pytorch" + +pruning: + approach: + weight_compression_pytorch: + start_step: 0 + end_step: 0 + excluded_names: ["classifier", "pooler", ".*embeddings*"] + prune_layer_type: ["Linear"] + target_sparsity: 0.9 + max_sparsity_ratio_per_layer: 0.98 + + pruners: + - !Pruner + pattern: "1xchannel" + update_frequency_on_step: 50 + prune_domain: "global" + prune_type: "snip_momentum" + sparsity_decay_type: "exp" diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/bert_mini_sst2_1xchannel.yaml 
b/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/bert_mini_sst2_1xchannel.yaml new file mode 100644 index 00000000000..ebb118ecc87 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/bert_mini_sst2_1xchannel.yaml @@ -0,0 +1,25 @@ +version: 1.0 + +model: + name: "bert-mini" + framework: "pytorch" + +pruning: + approach: + weight_compression_pytorch: + start_step: 0 + end_step: 0 + excluded_names: ["classifier", "pooler", ".*embeddings*", "LayerNorm"] + prune_layer_type: ["Linear"] + target_sparsity: 0.9 + update_frequency_on_step: 500 + max_sparsity_ratio_per_layer: 0.98 + prune_domain: "global" + sparsity_decay_type: "exp" + pruners: + - !Pruner + pattern: "ic_pattern_1xchannel" + update_frequency_on_step: 500 + prune_domain: "global" + prune_type: "snip_momentum" + sparsity_decay_type: "exp" diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py index 2da5db448cb..13812b30b4e 100755 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py @@ -432,11 +432,11 @@ def eval_func_for_nc(model_tuned): acc = result[key] break return acc - from neural_compressor.experimental import Quantization, common - quantizer = Quantization("./conf.yaml") - quantizer.model = common.Model(model) - quantizer.eval_func = eval_func_for_nc - q_model = quantizer.fit() + from neural_compressor.quantization import fit + from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion + tuning_criterion = TuningCriterion(max_trials=600) + conf = PostTrainingQuantConfig(approach="dynamic", backend="pytorch", tuning_criterion=tuning_criterion) + q_model = fit(model, conf=conf, eval_func=eval_func_for_nc) from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir) exit(0) diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py index 8ea43ea4a41..717ae91d886 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py @@ -498,13 +498,11 @@ def eval_func(model): # optimize and quantize with Neural Compressor if model_args.tune: - from neural_compressor.experimental import Quantization, common - calib_dataloader = eval_dataloader - quantizer = Quantization('conf.yaml') - quantizer.eval_func = eval_func - quantizer.calib_dataloader = calib_dataloader - quantizer.model = common.Model(model) - model = quantizer.fit() + from neural_compressor.quantization import fit + from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion + tuning_criterion = TuningCriterion(max_trials=600) + conf = PostTrainingQuantConfig(approach="static", backend="pytorch_fx", tuning_criterion=tuning_criterion) + model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func) from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream 
save_for_huggingface_upstream(model, tokenizer, training_args.output_dir) return diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py index 79c785850c0..f5bc771e712 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py @@ -502,12 +502,6 @@ def compute_metrics(p: EvalPrediction): eval_dataloader = trainer.get_eval_dataloader() batch_size = eval_dataloader.batch_size - def train_func(model): - trainer.model_wrapped = model - trainer.model = model - trainer.train() - return trainer.model - def eval_func(model): trainer.model = model result = trainer.evaluate(eval_dataset=eval_dataset) @@ -526,12 +520,17 @@ def benchmark(model): # optimize and quantize with Neural Compressor if model_args.tune: - from neural_compressor.experimental import Quantization, common - quantizer = Quantization('conf_qat.yaml') - quantizer.eval_func = eval_func - quantizer.q_func = train_func - quantizer.model = common.Model(model) - model = quantizer.fit() + from neural_compressor.training import prepare_compression + from neural_compressor.config import QuantizationAwareTrainingConfig + conf = QuantizationAwareTrainingConfig(backend="pytorch_fx") + compression_manager = prepare_compression(model, conf) + compression_manager.callbacks.on_train_begin() + model = compression_manager.model + trainer.model_wrapped = model + trainer.model = model + trainer.train() + compression_manager.callbacks.on_train_end() + from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream save_for_huggingface_upstream(model, tokenizer, training_args.output_dir) return diff --git a/examples/pytorch/object_detection/ssd_resnet34/quantization/ptq/ipex/ssd_r34.py b/examples/pytorch/object_detection/ssd_resnet34/quantization/ptq/ipex/ssd_r34.py index 5edbe2580ad..4e2db16cb99 100644 --- a/examples/pytorch/object_detection/ssd_resnet34/quantization/ptq/ipex/ssd_r34.py +++ b/examples/pytorch/object_detection/ssd_resnet34/quantization/ptq/ipex/ssd_r34.py @@ -24,6 +24,7 @@ from base_model import ResNet34 from typing import List +import intel_extension_for_pytorch Vector = List[torch.Tensor] diff --git a/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet121.yaml b/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet121.yaml index 79da662a36f..b9da893f6da 100644 --- a/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet121.yaml +++ b/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet121.yaml @@ -38,6 +38,14 @@ quantization: # optional. tuning constrai algorithm: minmax weight: granularity: per_channel + op_wise: { + 'densenet121/MaxPool2D/MaxPool': { + 'activation': {'dtype': ['fp32']} + }, + 'densenet121/transition_block[1-3]/AvgPool2D/AvgPool': { + 'activation': {'dtype': ['fp32']}, + } + } evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. accuracy: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. 
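For readers following the QAT change above (replacing the yaml-driven `Quantization('conf_qat.yaml')` flow with `prepare_compression`), the new flow reduces to the sketch below. It is only an illustration; `trainer`, `model`, `tokenizer`, and `training_args` are assumed to come from the patched run_glue_tune.py.

```python
# Sketch of the QAT flow introduced in the qat/fx hunk above; `trainer`, `model`,
# `tokenizer`, and `training_args` are assumed to be defined by the surrounding script.
from neural_compressor.training import prepare_compression
from neural_compressor.config import QuantizationAwareTrainingConfig
from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream

conf = QuantizationAwareTrainingConfig(backend="pytorch_fx")
compression_manager = prepare_compression(model, conf)
compression_manager.callbacks.on_train_begin()       # insert fake-quant observers before training
model = compression_manager.model
trainer.model_wrapped = model                         # hand the prepared model to the HF Trainer
trainer.model = model
trainer.train()                                       # quantization-aware fine-tuning
compression_manager.callbacks.on_train_end()          # convert to the final int8 model
save_for_huggingface_upstream(model, tokenizer, training_args.output_dir)
```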
diff --git a/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet161.yaml b/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet161.yaml index b5629ad649c..5312ed341fa 100644 --- a/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet161.yaml +++ b/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet161.yaml @@ -38,6 +38,14 @@ quantization: # optional. tuning constrai algorithm: minmax weight: granularity: per_channel + op_wise: { + 'densenet161/MaxPool2D/MaxPool': { + 'activation': {'dtype': ['fp32']} + }, + 'densenet161/transition_block[1-3]/AvgPool2D/AvgPool': { + 'activation': {'dtype': ['fp32']}, + } + } evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. accuracy: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. diff --git a/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet169.yaml b/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet169.yaml index 6892b69dc73..b63414d8acf 100644 --- a/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet169.yaml +++ b/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet169.yaml @@ -38,6 +38,14 @@ quantization: # optional. tuning constrai algorithm: minmax weight: granularity: per_channel + op_wise: { + 'densenet169/MaxPool2D/MaxPool': { + 'activation': {'dtype': ['fp32']} + }, + 'densenet169/transition_block[1-3]/AvgPool2D/AvgPool': { + 'activation': {'dtype': ['fp32']}, + } + } evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. accuracy: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. 
diff --git a/examples/tensorflow/oob_models/quantization/ptq/model_detail.py b/examples/tensorflow/oob_models/quantization/ptq/model_detail.py index 07ce37cf892..8c5f2d1770d 100644 --- a/examples/tensorflow/oob_models/quantization/ptq/model_detail.py +++ b/examples/tensorflow/oob_models/quantization/ptq/model_detail.py @@ -385,5 +385,11 @@ 'low': -1.0, 'high': 1.0 }, + # centernet_hg104 + { + 'model_name': 'centernet_hg104', + 'input': {'input_tensor': generate_data([224, 224, 3]),}, + 'output': ['Identity'], + }, ] diff --git a/examples/tensorflow/oob_models/quantization/ptq/run_benchmark.sh b/examples/tensorflow/oob_models/quantization/ptq/run_benchmark.sh index 87d16a45c1e..efd68dde04d 100755 --- a/examples/tensorflow/oob_models/quantization/ptq/run_benchmark.sh +++ b/examples/tensorflow/oob_models/quantization/ptq/run_benchmark.sh @@ -101,6 +101,9 @@ function set_args { NeuMF PRNet DIEN_Deep-Interest-Evolution-Network + EfficientDet-D2-768x768 + EfficientDet-D4-1024x1024 + centernet_hg104 -------- ) diff --git a/examples/tensorflow/oob_models/quantization/ptq/run_tuning.sh b/examples/tensorflow/oob_models/quantization/ptq/run_tuning.sh index 2971bedf7c3..a183dbb52e6 100755 --- a/examples/tensorflow/oob_models/quantization/ptq/run_tuning.sh +++ b/examples/tensorflow/oob_models/quantization/ptq/run_tuning.sh @@ -83,6 +83,7 @@ function set_args { DIEN_Deep-Interest-Evolution-Network EfficientDet-D2-768x768 EfficientDet-D4-1024x1024 + centernet_hg104 -------- ) diff --git a/neural_coder/__main__.py b/neural_coder/__main__.py index f9011e91f8b..8d9da0472c8 100644 --- a/neural_coder/__main__.py +++ b/neural_coder/__main__.py @@ -28,8 +28,8 @@ def parse_args(): parser.add_argument("--opt", type=str, default="", help="optimization feature to enable") - parser.add_argument("--strategy", type=str, default="static", - help="quantization strategy") + parser.add_argument("--approach", type=str, default="static", + help="quantization approach (strategy)") parser.add_argument('--config', type=str, default="", help='quantization configuration file path') @@ -53,11 +53,11 @@ def parse_args(): # optimize on copied script with Neural Coder from neural_coder import enable if args.opt == "": - if args.strategy == "static": + if args.approach == "static": features=["pytorch_inc_static_quant_fx"] - if args.strategy == "static_ipex": + if args.approach == "static_ipex": features=["pytorch_inc_static_quant_ipex"] - if args.strategy == "dynamic": + if args.approach == "dynamic": features=["pytorch_inc_dynamic_quant"] else: features=[args.opt] diff --git a/neural_coder/backends/onnx_inc_dynamic_quant.yaml b/neural_coder/backends/onnx_inc_dynamic_quant.yaml new file mode 100644 index 00000000000..3c50de8da8e --- /dev/null +++ b/neural_coder/backends/onnx_inc_dynamic_quant.yaml @@ -0,0 +1,30 @@ +# Copyright (c) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+transformation:
+  location:
+    - insert_below_model_definition_line
+  content:
+    - |-
+      [+] from neural_compressor.experimental import Quantization, common
+      [+] from neural_compressor import options, conf
+      [+] conf.model.framework = 'onnxrt_integerops'
+      [+] conf.quantization.approach = 'post_training_dynamic_quant'
+      [+] quantizer = Quantization(conf)
+      [+] quantizer.model = common.Model(MODEL_NAME)
+      [+] quantizer.eval_func = EVAL_FUNCTION_NAME
+      [+] MODEL_NAME = quantizer()
+  order:
+    - below:
+      above:
diff --git a/neural_coder/backends/onnx_inc_static_quant_qdq.yaml b/neural_coder/backends/onnx_inc_static_quant_qdq.yaml
new file mode 100644
index 00000000000..730c3220f45
--- /dev/null
+++ b/neural_coder/backends/onnx_inc_static_quant_qdq.yaml
@@ -0,0 +1,31 @@
+# Copyright (c) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+transformation:
+  location:
+    - insert_below_model_definition_line
+  content:
+    - |-
+      [+] from neural_compressor.experimental import Quantization, common
+      [+] from neural_compressor import options, conf
+      [+] conf.model.framework = 'onnxrt_qdqops'
+      [+] conf.quantization.approach = 'post_training_static_quant'
+      [+] quantizer = Quantization(conf)
+      [+] quantizer.model = common.Model(MODEL_NAME)
+      [+] quantizer.calib_dataloader = DATALOADER_NAME
+      [+] quantizer.eval_func = EVAL_FUNCTION_NAME
+      [+] MODEL_NAME = quantizer()
+  order:
+    - below:
+      above:
diff --git a/neural_coder/docs/PythonLauncher.md b/neural_coder/docs/PythonLauncher.md
index 38e3bd5fae7..d73257a3a97 100644
--- a/neural_coder/docs/PythonLauncher.md
+++ b/neural_coder/docs/PythonLauncher.md
@@ -27,7 +27,7 @@ Note: Any modification on the optimized code ```run_glue_optimized.py``` will be
 Users can specify which Deep Learning optimization they want to conduct using ```--opt``` argument. The list of supported Deep Learning optimization features can be found [here](SupportMatrix.md).
 
-Note that if specifically optimizing with INT8 quantization by Intel® Neural Compressor, ```--strategy``` argument can be specified with either ```static```, ```static_ipex``` or ```dynamic```. For example, to run INT8 dynamic quantization by Intel® Neural Compressor instead of the default static quantization:
+Note that when optimizing with INT8 quantization by Intel® Neural Compressor, the quantization approach (strategy) can be selected with the ```--approach``` argument, which accepts ```static```, ```static_ipex``` or ```dynamic```.
For example, to run INT8 dynamic quantization by Intel® Neural Compressor instead of the default static quantization: ```bash -python -m neural_coder --strategy dynamic run_glue.py --model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result +python -m neural_coder --approach dynamic run_glue.py --model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result ``` diff --git a/neural_compressor/__init__.py b/neural_compressor/__init__.py index 6bdf202786c..bc46fdbd916 100644 --- a/neural_compressor/__init__.py +++ b/neural_compressor/__init__.py @@ -24,7 +24,8 @@ from .utils.utility import set_backend from .utils import options from .conf.config import conf -from .conf.pythonic_config import config, DistillationConfig, Options, PostTrainingConfig, \ - PruningConfig, QuantizationAwareTrainingConfig +from .conf.pythonic_config import config +from .config import DistillationConfig, PostTrainingQuantConfig, \ + PruningConfig, QuantizationAwareTrainingConfig -set_backend('NA') +set_backend('NA') \ No newline at end of file diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 833d011f858..2a5f62af196 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -1029,7 +1029,7 @@ def _get_quantizable_ops(self, model): # get bf16 capability - if (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1') and \ + if self.use_bf16 and (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1') and \ (self.version.release >= Version("1.11.0").release): self.bf16_ops = self.query_handler.get_op_types_by_precision("bf16") bf16_ops = [] @@ -1308,19 +1308,34 @@ def _pre_hook_for_qat(self, dataloader=None): qscheme=torch.per_tensor_affine, reduce_range=REDUCE_RANGE), weight=torch.quantization.default_weight_fake_quant) + self.non_quant_dict = self.get_non_quant_modules(self.model.kwargs) + quantizable_ops = [] + self._get_quantizable_ops_recursively(self.model._model, '', quantizable_ops) + self.bf16_ops = self.query_handler.get_op_types_by_precision("bf16") + bf16_ops = [] + if self.version.release >= Version("1.11.0").release and self.use_bf16 and \ + (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1'): # pragma: no cover + self._get_bf16_ops_recursively(self.model._model, '', bf16_ops) + bf16_ops_list = [(op) for op in bf16_ops if op not in quantizable_ops] self.model.model.training = True torch.quantization.prepare_qat(self.model._model, inplace=True) - def _post_hook_for_qat(self): - torch.quantization.convert(self.model._model, inplace=True) # This is a flag for reloading self.model.q_config = { 'is_oneshot': True, 'framework': 'pytorch', 'reduce_range': REDUCE_RANGE, - 'approach': 'quant_aware_training' + 'approach': 'quant_aware_training', + 'bf16_ops_list': bf16_ops_list, } + def _post_hook_for_qat(self): + torch.quantization.convert(self.model._model, inplace=True) + if len(self.model.q_config['bf16_ops_list']) > 0 and \ + self.version.release >= Version("1.11.0").release and self.use_bf16 and \ + (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1'): # pragma: no cover + self.model._model = torch_utils.bf16_convert.Convert(self.model._model, self.model.q_config) + def _pre_hook_for_hvd(self, dataloader=None): # TODO: lazy init here hvd.init() @@ -2220,7 +2235,8 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None): self.model_calibration(q_model, dataloader, iterations, None, tune_cfg.get('calib_sampling_size', 1)) q_model.save_qconf_summary(qconf_summary=self.ipex_config_path) - if 
self.use_bf16: + if self.use_bf16 and (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1') and \ + (self.version.release >= Version("1.11.0").release): with torch.no_grad(): with torch.cpu.amp.autocast(): q_model = ipex.quantization.convert(q_model) @@ -2231,6 +2247,7 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None): q_model = torch.jit.trace(q_model, example_inputs, strict=False) q_model = torch.jit.freeze(q_model.eval()) else: + q_model = ipex.quantization.convert(q_model) with torch.no_grad(): try: q_model = torch.jit.trace(q_model, example_inputs) @@ -2486,7 +2503,7 @@ def _get_quantizable_ops_recursively(self, model, prefix, quantizable_ops): if isinstance(self.q_dataloader, BaseDataLoader): self.q_dataloader.batch(batch_size) logger.info('Recovery `calibration.dataloader.batchsize` {} according \ - to config.yaml'.format(batch_size)) + to config.yaml' .format(batch_size)) del init_model with open(self.ipex_config_path, 'r') as f: self.cfgs = json.load(f) @@ -2661,12 +2678,11 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None): self.tune_cfg = tune_cfg self.tune_cfg["approach"] = self.approach self.tune_cfg["framework"] = "pytorch_fx" - # pragma: no cover - if self.approach != 'post_training_dynamic_quant' and self.version.release >= Version("1.13.0").release: - assert dataloader is not None, "Please pass a dataloader to quantizer!" - example_inputs = get_example_inputs(model._model, dataloader) - else: - example_inputs = None + + # PyTorch 1.13 and above version, need example_inputs for fx trace, but it not realy used, + # so set it to None. + example_inputs = None + if self.default_qconfig is not None: default_qconfig = copy.deepcopy(self.default_qconfig) default_qconfig['activation']['dtype'] = \ @@ -2773,7 +2789,7 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None): q_model._model, prefix='') if len(self.tune_cfg['bf16_ops_list']) > 0 and \ - self.version.release >= Version("1.11.0").release and \ + self.version.release >= Version("1.11.0").release and self.use_bf16 and \ (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1'): # pragma: no cover q_model._model = torch_utils.bf16_convert.Convert(q_model._model, self.tune_cfg) @@ -2843,6 +2859,12 @@ def _pre_hook_for_qat(self, dataloader=None): quantizable_ops = [] tmp_model = self.fuse_fx_model(self.model, is_qat=True) self._get_quantizable_ops_recursively(tmp_model, '', quantizable_ops) + self.bf16_ops = self.query_handler.get_op_types_by_precision("bf16") + bf16_ops = [] + if self.version.release >= Version("1.11.0").release and self.use_bf16 and \ + (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1'): # pragma: no cover + self._get_bf16_ops_recursively(tmp_model, '', bf16_ops) + bf16_ops_list = [(op) for op in bf16_ops if op not in quantizable_ops] quantized_ops = OrderedDict() for op in quantizable_ops: if op[1] in [ @@ -2851,6 +2873,10 @@ def _pre_hook_for_qat(self, dataloader=None): quantized_ops[op[0]] = torch.quantization.default_dynamic_qconfig else: quantized_ops[op[0]] = q_cfgs + # build for fetching scale and zeropoint + op_config_dict = {} + for op in quantizable_ops: + op_config_dict[op] = {'weight': {'dtype': 'int8'}, 'activation': {'dtype': 'uint8'}} if self.version.release < Version("1.11.0").release: quantized_ops["default_qconfig"] = None else: @@ -2861,11 +2887,10 @@ def _pre_hook_for_qat(self, dataloader=None): from torch.quantization.quantize_fx import prepare_qat_fx fx_op_cfgs = _cfgs_to_fx_cfgs(quantized_ops, 'quant_aware_training') self.model._model.train() - if 
self.version.release >= Version("1.13.0").release: # pragma: no cover - assert dataloader is not None, "Please pass dataloader to qat hook!" - example_inputs = get_example_inputs(self.model._model, dataloader) - else: - example_inputs = None + + # PyTorch 1.13 and above version, need example_inputs for fx trace, but it not realy used, + # so set it to None. + example_inputs = None if self.sub_module_list is None: if self.version.release >= Version("1.13.0").release: # pragma: no cover @@ -2893,10 +2918,13 @@ def _pre_hook_for_qat(self, dataloader=None): example_inputs=example_inputs) # This is a flag for reloading self.model.q_config = { + 'calib_sampling_size': 100, # tmp arg for export API 'is_oneshot': True, 'framework': 'pytorch_fx', 'reduce_range': REDUCE_RANGE, 'quantizable_ops': quantizable_ops, + 'bf16_ops_list': bf16_ops_list, + 'op': op_config_dict, 'sub_module_list': self.sub_module_list, 'approach': 'quant_aware_training' } @@ -2919,6 +2947,15 @@ def _post_hook_for_qat(self): PyTorch_FXAdaptor.convert_sub_graph(self.sub_module_list, \ self.model._model, prefix='') + if self.approach != 'post_training_dynamic_quant': + self._get_scale_zeropoint(self.model._model, self.model.q_config) + if len(self.model.q_config['bf16_ops_list']) > 0 and \ + self.version.release >= Version("1.11.0").release and self.use_bf16 and \ + (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1'): # pragma: no cover + self.model._model = torch_utils.bf16_convert.Convert(self.model._model, self.model.q_config) + self._dump_model_op_stats(self.model._model, self.model.q_config, self.approach) + torch_utils.util.get_embedding_contiguous(self.model._model) + def train(self, model, dataloader, optimizer_tuple, criterion_tuple, hooks, **kwargs): """Execute the train process on the specified model. 
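
The PyTorch adaptor hooks above gate BF16 mixed-precision conversion on three conditions: the user-facing `use_bf16` switch, a PyTorch release of 1.11.0 or newer, and either hardware BF16 support (`CpuInfo().bf16`) or the `FORCE_BF16=1` environment override. A minimal standalone sketch of that gating logic follows; the helper name and its arguments are illustrative assumptions and are not part of this patch:

```python
import os
from packaging.version import Version

def bf16_conversion_enabled(use_bf16: bool, torch_version: str, cpu_has_bf16: bool) -> bool:
    # Mirrors the condition repeated in the QAT/PTQ hooks of the PyTorch adaptor.
    if not use_bf16:
        return False
    if Version(torch_version).release < Version("1.11.0").release:
        return False
    return cpu_has_bf16 or os.getenv("FORCE_BF16") == "1"
```
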
@@ -3092,7 +3129,7 @@ def _dump_model_op_stats(self, model, tune_cfg, approach): res = dict() self._get_sub_module_op_stats(model, tune_cfg, approach, res) - if (self.version.release >= Version("1.11.0").release) and \ + if self.use_bf16 and (self.version.release >= Version("1.11.0").release) and \ (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1'): # pragma: no cover bf16_ops_list = tune_cfg['bf16_ops_list'] if len(bf16_ops_list) > 0: diff --git a/neural_compressor/adaptor/tensorflow.py b/neural_compressor/adaptor/tensorflow.py index ccc57341fc6..f9f229e96b2 100644 --- a/neural_compressor/adaptor/tensorflow.py +++ b/neural_compressor/adaptor/tensorflow.py @@ -648,6 +648,8 @@ def _dump_model_op_stats(self, model_graphdef): origin_op_type = 'DepthwiseConv2dNative' if origin_op_type == 'BatchMatMul': origin_op_type = 'BatchMatMulV2' + if origin_op_type == 'FusedBatchMatMulV2': + origin_op_type = '_MklFusedBatchMatMulV2' if origin_op_type == 'Deconv2D': origin_op_type = 'Conv2DBackpropInput' if origin_op_type == 'Deconv3D': diff --git a/neural_compressor/adaptor/tensorflow.yaml b/neural_compressor/adaptor/tensorflow.yaml index 5502158a443..62524f544db 100644 --- a/neural_compressor/adaptor/tensorflow.yaml +++ b/neural_compressor/adaptor/tensorflow.yaml @@ -30,7 +30,7 @@ 'MaxPool', 'MaxPool3D', 'AvgPool', 'Conv2DBackpropInput', 'Conv3DBackpropInputV2'] bf16: ["_MklLayerNorm", "Conv2D", "Conv2DBackpropFilter", "Conv2DBackpropInput", "Conv3D", "Conv3DBackpropFilterV2", "Conv3DBackpropInputV2", "DepthwiseConv2dNative", "DepthwiseConv2dNativeBackpropFilter", "DepthwiseConv2dNativeBackpropInput", "GRUBlockCell", - "AUGRUBlockCell", "MklGRU", "MklAUGRU", "MatMul", "BatchMatMul", "BatchMatMulV2", "Einsum", # allow_list + "AUGRUBlockCell", "MklGRU", "MklAUGRU", "MatMul", "BatchMatMul", "BatchMatMulV2", "_MklFusedBatchMatMulV2", "Einsum", # allow_list "Add", "AddN", "AddV2", "AvgPool", "AvgPool3D", "AvgPool3DGrad", "AvgPoolGrad", "BiasAdd", "BiasAddGrad", "BiasAddV1", "Erf", "FusedBatchNormV2", "FusedBatchNormGradV2", "FusedBatchNormV3", "FusedBatchNormGradV3", "LeakyRelu", "LeakyReluGrad", "Mean", "Mul", "Sub", "Elu", "EluGrad", "FloorDiv", "_FusedBatchNormEx", "Log", "Log1p", "LogSoftmax", "Prod", "RealDiv", @@ -299,6 +299,7 @@ 'Dequantize + DepthwiseConv2dNative + Add + Relu6 + QuantizeV2', 'Dequantize + DepthwiseConv2dNative + BiasAdd + QuantizeV2', 'Dequantize + FusedBatchNormV3 + Relu + QuantizeV2', + 'Dequantize + FusedBatchNormV3 + LeakyRelu + QuantizeV2', 'Dequantize + _MklFusedInstanceNorm + Relu + QuantizeV2', 'Dequantize + _MklFusedInstanceNorm + LeakyRelu + QuantizeV2', 'Dequantize + Conv2DBackpropInput + BiasAdd + QuantizeV2', diff --git a/neural_compressor/adaptor/tf_utils/graph_converter.py b/neural_compressor/adaptor/tf_utils/graph_converter.py index 6e09ae02751..ca6573baf9f 100644 --- a/neural_compressor/adaptor/tf_utils/graph_converter.py +++ b/neural_compressor/adaptor/tf_utils/graph_converter.py @@ -160,6 +160,10 @@ def _inference(self, model): Args: model(TensorflowBaseModel): input TensorflowBaseModel """ + # ITEX optimization has broken INC calibration process. + # INC needs turn off ITEX optimization pass in calibration stage. + # TODO ITEX will provide API to replace setting environment variable. 
+ os.environ["ITEX_REMAPPER"] = "0" sess = model.sess iter_op = model.iter_op input_tensor = model.input_tensor @@ -220,24 +224,25 @@ def check_shape(tensor, data): return True disorder_tensors = [] - disorder_inputs = [] + disorder_inputs = [] for idx, sort_tensor in enumerate(input_tensor): sort_input = inputs[idx] if check_shape(sort_tensor, sort_input): - feed_dict.update({sort_tensor: sort_input}) + feed_dict.update({sort_tensor: sort_input}) else: disorder_tensors.append(sort_tensor) disorder_inputs.append(sort_input) for i, dis_tensor in enumerate(disorder_tensors): - for j, dis_input in enumerate(disorder_inputs): - if check_shape(dis_tensor, dis_input): - feed_dict.update({dis_tensor: dis_input}) - break + for j, dis_input in enumerate(disorder_inputs): + if check_shape(dis_tensor, dis_input): + feed_dict.update({dis_tensor: dis_input}) + break _ = sess.run(output_tensor, feed_dict) if iter_op==[] \ else iterator_sess_run(sess, iter_op, \ feed_dict, output_tensor, self.calib_iteration) if idx + 1 == self.calib_iteration: break + os.environ["ITEX_REMAPPER"] = "1" def _check_tf_version(self): is_supported_version = False @@ -517,6 +522,7 @@ def bf16_convert(self): FP32 + INT8 mixed precision graph. """ try: + logger.info("Start BF16 conversion.") self._tmp_model.graph_def = BF16Convert( self._tmp_model.graph_def, self.fp32_ops, diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/bf16/bf16_convert.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/bf16/bf16_convert.py index 0a79543b409..707bd69c47d 100644 --- a/neural_compressor/adaptor/tf_utils/graph_rewriter/bf16/bf16_convert.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/bf16/bf16_convert.py @@ -243,11 +243,6 @@ def _model_bf16_convert(self): for bf16_node_name in set(self.bf16_ops): if bf16_node_name not in self.cur_graph.node_name_details: self.bf16_ops.remove(bf16_node_name) - continue - else: - if "fused_ops" in self.cur_graph.node_name_details[bf16_node_name].node.attr: - self.bf16_ops.remove(bf16_node_name) - continue for bf16_node_name in sorted(list(set(self.bf16_ops))): self._bf16_convert(bf16_node_name) return self.cur_graph.dump_graph() diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_conv.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_conv.py index 042c89769d9..e5f1da798ca 100644 --- a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_conv.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_conv.py @@ -45,6 +45,7 @@ def do_transformation(self): target_nodes = cur_graph.query_fusion_pattern_nodes( [["Pad"], ["Conv2D", "Conv3D", "DepthwiseConv2dNative"], ('BiasAdd', 'Add', 'AddV2')]) + padding_tensor_dict = {} for node_combination in target_nodes: conv_name = node_combination[1] @@ -70,21 +71,26 @@ def do_transformation(self): continue padding_tensor = None - pad_node = graph_info[node_combination[0]].node - if graph_info[pad_node.input[1]].node.op != 'Const': - input_node = graph_info[pad_node.input[1]].node - if input_node.op == 'DataFormatVecPermute': - parent_input_node = graph_info[input_node.input[0]].node - if parent_input_node.op == 'Const': - padding_tensor = tensor_util.MakeNdarray( \ - parent_input_node.attr["value"].tensor).flatten() + pad_node = None + if node_combination[0] not in padding_tensor_dict: + pad_node = graph_info[node_combination[0]].node + if graph_info[pad_node.input[1]].node.op != 'Const': + input_node = graph_info[pad_node.input[1]].node + if 
input_node.op == 'DataFormatVecPermute': + parent_input_node = graph_info[input_node.input[0]].node + if parent_input_node.op == 'Const': + padding_tensor = tensor_util.MakeNdarray( \ + parent_input_node.attr["value"].tensor).flatten() + else: + continue else: continue else: - continue + padding_tensor = tensor_util.MakeNdarray( + graph_info[pad_node.input[1]].node.attr["value"].tensor).flatten() + padding_tensor_dict[node_combination[0]] = padding_tensor else: - padding_tensor = tensor_util.MakeNdarray( - graph_info[pad_node.input[1]].node.attr["value"].tensor).flatten() + padding_tensor = padding_tensor_dict[node_combination[0]] if self.itex_qdq_mode: enabled_pad_conv2d = bool(tf.version.VERSION == '1.15.0-up3' or \ @@ -95,12 +101,13 @@ def do_transformation(self): if any(padding_tensor) and not enabled_pad_conv2d: # pragma: no cover continue - if graph_info[pad_node.input[1]].node.op != 'Const': - cur_graph.node_name_details[pad_node.name].node.input.remove(pad_node.input[1]) - cur_graph.remove_node_with_single_input_output(pad_node.name) - else: - cur_graph.remove_node_with_single_input_output(pad_node.name) - cur_graph.remove_node(pad_node.input[1]) + if pad_node: + if graph_info[pad_node.input[1]].node.op != 'Const': + cur_graph.node_name_details[pad_node.name].node.input.remove(pad_node.input[1]) + cur_graph.remove_node_with_single_input_output(pad_node.name) + else: + cur_graph.remove_node_with_single_input_output(pad_node.name) + cur_graph.remove_node(pad_node.input[1]) conv_node = graph_info[node_combination[1]].node if self.itex_qdq_mode: if any(padding_tensor) and enabled_pad_conv2d: # pragma: no cover diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_fp32_conv.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_fp32_conv.py index 8b63b17ff31..2866a40ec04 100644 --- a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_fp32_conv.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_fp32_conv.py @@ -46,6 +46,7 @@ def do_transformation(self): target_nodes = cur_graph.query_fusion_pattern_nodes( [["Pad"], ["Conv2D", "DepthwiseConv2dNative"], ('BiasAdd', 'Add', 'AddV2')]) + padding_tensor_dict = {} for node_combination in target_nodes: conv_name = node_combination[1] @@ -71,21 +72,26 @@ def do_transformation(self): continue padding_tensor = None - pad_node = graph_info[node_combination[0]].node - if graph_info[pad_node.input[1]].node.op != 'Const': - input_node = graph_info[pad_node.input[1]].node - if input_node.op == 'DataFormatVecPermute': - parent_input_node = graph_info[input_node.input[0]].node - if parent_input_node.op == 'Const': - padding_tensor = tensor_util.MakeNdarray( \ - parent_input_node.attr["value"].tensor).flatten() + pad_node = None + if node_combination[0] not in padding_tensor_dict: + pad_node = graph_info[node_combination[0]].node + if graph_info[pad_node.input[1]].node.op != 'Const': + input_node = graph_info[pad_node.input[1]].node + if input_node.op == 'DataFormatVecPermute': + parent_input_node = graph_info[input_node.input[0]].node + if parent_input_node.op == 'Const': + padding_tensor = tensor_util.MakeNdarray( \ + parent_input_node.attr["value"].tensor).flatten() + else: + continue else: continue else: - continue + padding_tensor = tensor_util.MakeNdarray( + graph_info[pad_node.input[1]].node.attr["value"].tensor).flatten() + padding_tensor_dict[node_combination[0]] = padding_tensor else: - padding_tensor = tensor_util.MakeNdarray( - 
graph_info[pad_node.input[1]].node.attr["value"].tensor).flatten() + padding_tensor = padding_tensor_dict[node_combination[0]] if self.itex_qdq_mode: enabled_pad_conv2d = bool(tf.version.VERSION == '1.15.0-up3' or \ @@ -95,12 +101,14 @@ def do_transformation(self): if any(padding_tensor) and not enabled_pad_conv2d: # pragma: no cover continue - if graph_info[pad_node.input[1]].node.op != 'Const': - cur_graph.node_name_details[pad_node.name].node.input.remove(pad_node.input[1]) - cur_graph.remove_node_with_single_input_output(pad_node.name) - else: - cur_graph.remove_node_with_single_input_output(pad_node.name) - cur_graph.remove_node(pad_node.input[1]) + + if pad_node: + if graph_info[pad_node.input[1]].node.op != 'Const': + cur_graph.node_name_details[pad_node.name].node.input.remove(pad_node.input[1]) + cur_graph.remove_node_with_single_input_output(pad_node.name) + else: + cur_graph.remove_node_with_single_input_output(pad_node.name) + cur_graph.remove_node(pad_node.input[1]) conv_node = graph_info[node_combination[1]].node # Helper.set_attr_int_list(conv_node, "padding_list", padding_tensor) # only when padding attr is explicit, the explicit_paddings is not empty diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/pre_optimize.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/pre_optimize.py index 4bb1d1a2b04..d7c2e33ca83 100644 --- a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/pre_optimize.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/pre_optimize.py @@ -146,16 +146,16 @@ def get_optimized_model(self, itex_mode=False): self._tmp_graph_def = ConvertPlaceholderToConst(self._tmp_graph_def).do_transformation() - self._tmp_graph_def = RemoveTrainingNodesOptimizer( - self._tmp_graph_def, protected_nodes=input_output_names).do_transformation() - self._tmp_graph_def = SwitchOptimizer(self._tmp_graph_def).do_transformation() + self._tmp_graph_def = GrapplerOptimizer( + self._tmp_graph_def, input_output_names, self.optimization).do_transformation() + self._tmp_graph_def = StripUnusedNodesOptimizer(self._tmp_graph_def, input_node_names, output_node_names).do_transformation() - self._tmp_graph_def = GrapplerOptimizer( - self._tmp_graph_def, input_output_names, self.optimization).do_transformation() + self._tmp_graph_def = RemoveTrainingNodesOptimizer( + self._tmp_graph_def, protected_nodes=input_output_names).do_transformation() self._tmp_graph_def = SplitSharedInputOptimizer(self._tmp_graph_def).do_transformation() @@ -204,7 +204,7 @@ def get_optimized_model(self, itex_mode=False): self._tmp_graph_def = FetchWeightFromReshapeOptimizer( self._tmp_graph_def).do_transformation() - if not self.new_api: + if not self.new_api and not itex_mode: #TODO we need to remove below optimizer once the TF enabled the single # matmul op quantization self._tmp_graph_def = InjectDummyBiasAddOptimizer( @@ -221,7 +221,7 @@ def get_optimized_model(self, itex_mode=False): self._tmp_graph_def = StripEquivalentNodesOptimizer( self._tmp_graph_def, output_node_names).do_transformation() - if self.new_api: + if self.new_api or itex_mode: self._tmp_graph_def = DilatedContraction( self._tmp_graph_def).do_transformation() self._tmp_graph_def.library.CopyFrom(self.model.graph_def.library) diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/int8/fuse_matmul_requantize.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/int8/fuse_matmul_requantize.py index 2060fecbc4e..9647b657d4c 100644 --- 
a/neural_compressor/adaptor/tf_utils/graph_rewriter/int8/fuse_matmul_requantize.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/int8/fuse_matmul_requantize.py @@ -588,10 +588,16 @@ def do_transformation(self): min_filter_node = None # The Min and Max of non-const weight node are from QuantizeV2's output, not valid nodes. # Add check here for excluding this case. - if ":2" not in new_node.input[6]: - max_filter_node = self.graph_info[new_node.input[6]].node - if ":1" not in new_node.input[5]: - min_filter_node = self.graph_info[new_node.input[5]].node + if len(attr_fused_ops) == 0: # single matmul case + if ":2" not in new_node.input[5]: + max_filter_node = self.graph_info[new_node.input[5]].node + if ":1" not in new_node.input[4]: + min_filter_node = self.graph_info[new_node.input[4]].node + else: + if ":2" not in new_node.input[6]: + max_filter_node = self.graph_info[new_node.input[6]].node + if ":1" not in new_node.input[5]: + min_filter_node = self.graph_info[new_node.input[5]].node last_node = self.graph_info[new_node.input[0]].node is_min_first = bool(quantized_node.attr['input_quant_mode'].s == b'MIN_FIRST') weight_node = self.graph_info[new_node.input[1]].node diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/qdq/insert_qdq_pattern.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/qdq/insert_qdq_pattern.py index 863876590a5..091d02add98 100644 --- a/neural_compressor/adaptor/tf_utils/graph_rewriter/qdq/insert_qdq_pattern.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/qdq/insert_qdq_pattern.py @@ -81,6 +81,8 @@ def do_transformation(self): self.g.graph = copy.deepcopy(self.model) self.graph_info = self.g.parse_graph() + self.g.get_frame_info() + # insert QDQ pattern for op's input for op_name in quantizable_op_names: if self._ignore_insert_qdq_pattern(op_name): @@ -115,20 +117,16 @@ def do_transformation(self): computational_node = self.graph_info[computational_node_name].node weight_name = computational_node.input[1] - weight_node = self.graph_info[weight_name].node if re.search(r"\w+:\d+", weight_name): weight_node = self.graph_info[weight_name.rsplit(':', 1)[0]].node else: weight_node = self.graph_info[weight_name].node - enter_node = None if weight_node.op == 'Enter': if self.itex_mode: parent_node = self.graph_info[Helper.node_name_from_input(weight_node.input[0])].node if not parent_node.op == 'Const': continue - else: - enter_node = weight_node - weight_node = parent_node + weight_node = parent_node else: continue @@ -139,10 +137,10 @@ def do_transformation(self): else: per_channel = False weight_bit = 7 - + self._insert_qdq_pattern_for_weight_node(computational_node, weight_node, - enter_node, + weight_name, min_max_values, per_channel, weight_bit, @@ -184,7 +182,7 @@ def _check_op_list(self, node_type): "MaxPool", "MaxPool3D", "FusedBatchNormV3", "Requantize", "RequantizePerChannel", "AvgPool", "Pad", "CropAndResize", "Dequantize", "Mean", "MatMul", "BatchMatMul", "BatchMatMulV2", "FakeQuantWithMinMaxVars", "_MklFusedInstanceNorm", - "Conv2DBackpropInput", "Conv3DBackpropInputV2") + "Conv2DBackpropInput", "Conv3DBackpropInputV2", "Sigmoid", "BiasAdd") return any([node_type.find(i) != -1 for i in op_list]) def _find_relu_node(self, node): @@ -200,7 +198,7 @@ def _find_relu_node(self, node): or len(self.node_name_mapping \ [Helper.node_name_from_input(node.input[0])].output) > 1): return True - elif 'T' in node.attr and node.attr['T'].type in (dtypes.quint8, dtypes.uint8): + elif 'T' in node.attr and dtypes.DType(node.attr['T'].type) in 
(dtypes.quint8, dtypes.uint8): return True elif (node.op.find("QuantizedConv") != -1 or node.op.find("QuantizedDepthwiseConv") != -1 or @@ -414,7 +412,7 @@ def _insert_qdq_pattern_for_each_input(self, op_name, namespace_prefix, def _insert_qdq_pattern_for_weight_node(self, computational_node, weight_node, - enter_node, + weight_name, min_max_values, per_channel, weight_bit=7.0, @@ -504,41 +502,27 @@ def _insert_qdq_pattern_for_weight_node(self, max_node = Helper.create_constant_node(max_name, max_value, dtypes.float32, device="cpu") if "BatchMatMul" in host_op_type and "BatchMatMul" not in weight_node.op: - min_node.input.append("^" + weight_node.name) - max_node.input.append("^" + weight_node.name) + min_node.input.append("^" + weight_name) + max_node.input.append("^" + weight_name) - quant_const_enter_node = None min_enter_node = None max_enter_node = None - if enter_node: - quant_const_enter_node = Helper.create_node('Enter', \ - qint8_const_name + '_enter', [weight_node.name]) - Helper.set_attr_string(quant_const_enter_node, - 'frame_name', enter_node.attr['frame_name'].s) - Helper.set_attr_dtype(quant_const_enter_node, 'T', dtypes.float32) - Helper.set_attr_bool(quant_const_enter_node, 'is_constant', True) - Helper.set_attr_int(quant_const_enter_node, \ - 'parallel_iterations', enter_node.attr['parallel_iterations'].i) + if insert_reshape: + reshape_dims_4to3_name = qint8_const_name + "_reshape_dims_4to3_" + reshape_dims_4to3_node = Helper.create_constant_node( + reshape_dims_4to3_name, shape_convert, dtypes.int32) + reshape_4to3_name = qint8_const_name + "_reshape_4to3_" + reshape_4to3_node = Helper.create_node("Reshape", reshape_4to3_name, + [weight_node.name, reshape_dims_4to3_name]) + reshape_4to3_node.attr["T"].CopyFrom( + attr_value_pb2.AttrValue(type=dtypes.float32.as_datatype_enum)) quant_node = Helper.create_node( "QuantizeV2", qint8_const_name + '_quant', - [quant_const_enter_node.name, min_name, max_name]) + [reshape_4to3_name, min_name, max_name]) else: - if insert_reshape: - reshape_dims_4to3_name = qint8_const_name + "_reshape_dims_4to3_" - reshape_dims_4to3_node = Helper.create_constant_node( - reshape_dims_4to3_name, shape_convert, dtypes.int32) - reshape_4to3_name = qint8_const_name + "_reshape_4to3_" - reshape_4to3_node = Helper.create_node("Reshape", reshape_4to3_name, - [weight_node.name, reshape_dims_4to3_name]) - reshape_4to3_node.attr["T"].CopyFrom( - attr_value_pb2.AttrValue(type=dtypes.float32.as_datatype_enum)) - quant_node = Helper.create_node( - "QuantizeV2", qint8_const_name + '_quant', - [reshape_4to3_name, min_name, max_name]) - else: - quant_node = Helper.create_node( - "QuantizeV2", qint8_const_name + '_quant', - [weight_node.name, min_name, max_name]) + quant_node = Helper.create_node( + "QuantizeV2", qint8_const_name + '_quant', + [weight_node.name, min_name, max_name]) dequant_node = Helper.create_node( "Dequantize", base_name + '_dequant', @@ -549,10 +533,10 @@ def _insert_qdq_pattern_for_weight_node(self, Helper.set_attr_dtype(dequant_node, "T", dtypes.qint8) Helper.set_attr_string(dequant_node, "mode", b"SCALED") if per_channel: - if host_op_type == 'Conv2D' or host_op_type == 'Conv2DBackpropInput': + if host_op_type in ('Conv2D', 'Conv2DBackpropInput'): Helper.set_attr_int(quant_node, 'axis', 3) Helper.set_attr_int(dequant_node, 'axis', 3) - elif host_op_type == 'Conv3D' or host_op_type == 'Conv3DBackpropInputV2': + elif host_op_type in ('Conv3D', 'Conv3DBackpropInputV2'): Helper.set_attr_int(quant_node, 'axis', 4) 
Helper.set_attr_int(dequant_node, 'axis', 4) elif host_op_type == 'MatMul': @@ -584,25 +568,24 @@ def _insert_qdq_pattern_for_weight_node(self, self.g_weight.add_node(reshape_3to4_node, dequant_node.name, [computational_node.name]) computational_node.input[1] = reshape_3to4_node.name else: - if enter_node: + if weight_node.name in self.g.parent_frame_details and self.g.parent_frame_details[weight_node.name]: min_enter_node = Helper.create_node('Enter', min_name + '_enter', [min_name]) - Helper.set_attr_string(min_enter_node, - 'frame_name', enter_node.attr['frame_name'].s) + Helper.set_attr_string(min_enter_node, 'frame_name', + self.g.parent_frame_details[weight_node.name].attr['frame_name'].s) Helper.set_attr_dtype(min_enter_node, 'T', dtypes.float32) Helper.set_attr_bool(min_enter_node, 'is_constant', True) Helper.set_attr_int(min_enter_node, 'parallel_iterations', \ - enter_node.attr['parallel_iterations'].i) + self.g.parent_frame_details[weight_node.name].attr['parallel_iterations'].i) max_enter_node = Helper.create_node('Enter', max_name + '_enter', [max_name]) - Helper.set_attr_string(max_enter_node, - 'frame_name', enter_node.attr['frame_name'].s) + Helper.set_attr_string(max_enter_node, 'frame_name', + self.g.parent_frame_details[weight_node.name].attr['frame_name'].s) Helper.set_attr_dtype(max_enter_node, 'T', dtypes.float32) Helper.set_attr_bool(max_enter_node, 'is_constant', True) Helper.set_attr_int(max_enter_node, 'parallel_iterations',\ - enter_node.attr['parallel_iterations'].i) + self.g.parent_frame_details[weight_node.name].attr['parallel_iterations'].i) - self.g_weight.add_node(quant_const_enter_node, weight_node.name, [quant_node.name]) - self.g_weight.add_node(quant_node, quant_const_enter_node.name, []) + self.g_weight.add_node(quant_node, weight_name, []) self.g_weight.add_node(min_node, None, [min_enter_node.name]) self.g_weight.add_node(max_node, None, [max_enter_node.name]) self.g_weight.add_node(min_enter_node, min_node.name, [quant_node.name]) @@ -610,7 +593,7 @@ def _insert_qdq_pattern_for_weight_node(self, self.g_weight.add_node(dequant_node, quant_node.name, [computational_node.name]) computational_node.input[1] = dequant_node.name else: - self.g_weight.add_node(quant_node, weight_node.name, []) + self.g_weight.add_node(quant_node, weight_name, []) self.g_weight.add_node(min_node, None, [quant_node.name]) self.g_weight.add_node(max_node, None, [quant_node.name]) self.g_weight.add_node(dequant_node, quant_node.name, [computational_node.name]) diff --git a/neural_compressor/adaptor/tf_utils/graph_util.py b/neural_compressor/adaptor/tf_utils/graph_util.py index 77903d4b62c..d810f1d87a1 100644 --- a/neural_compressor/adaptor/tf_utils/graph_util.py +++ b/neural_compressor/adaptor/tf_utils/graph_util.py @@ -918,11 +918,13 @@ def gen_per_iter(data): if i.startswith(first_line): iterations += 1 - step = len(valid_data) / iterations + step = int(len(valid_data) / iterations) final_res = [] for i in range(iterations): final_res.extend(gen_per_iter(valid_data[int(i*step): int(step*( i+ 1))])) + if i + 1 == iterations and int(step*( i+ 1)) < len(valid_data): + final_res.extend(gen_per_iter(valid_data[int(step*( i+ 1)): len(valid_data)])) return final_res diff --git a/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_bn.py b/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_bn.py index 9dbe1c82f0a..f36b02a3e94 100644 --- a/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_bn.py +++ 
b/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_bn.py @@ -31,8 +31,9 @@ def __init__(self, **kwargs): reverse=True) if self.new_api: self.fusion_mapping = { + 'FusedBatchNormV3': self.apply_newly_bn_relu_fusion, 'FusedBatchNormV3Relu': self.apply_newly_bn_relu_fusion, - 'FusedBatchNormV3': self.apply_newly_bn_relu_fusion + 'FusedBatchNormV3LeakyRelu': self.apply_newly_bn_leakyrelu_fusion } else: self.fusion_mapping = {} @@ -75,8 +76,7 @@ def apply_newly_bn_relu_fusion(self, match_node_name): [output_min_node_name] + [output_max_node_name] + control_inputs output_min_node = helper.create_constant_node(output_min_node_name, -1., dtypes.float32) output_max_node = helper.create_constant_node(output_max_node_name, 1., dtypes.float32) - quantized_bn_node = helper.create_node(node_op, quantized_node_name, - quantized_node_input_names) + quantized_bn_node = helper.create_node(node_op, quantized_node_name, quantized_node_input_names) if relu_node_name is not None: helper.set_attr_string(quantized_bn_node, "activation_mode", b'Relu') if self.node_name_mapping[offset_name].node.op == "Const": @@ -141,6 +141,108 @@ def apply_newly_bn_relu_fusion(self, match_node_name): new_node.CopyFrom(node) self.add_output_graph_node(new_node) + def apply_newly_bn_leakyrelu_fusion(self, match_node_name): + matched_node = self.node_name_mapping[match_node_name[0]] + skip_node_name = match_node_name[1:] + control_inputs, normal_inputs = self._get_node_input( + matched_node.node.name) + scale_name = normal_inputs[1] + offset_name = normal_inputs[2] + mean_name = normal_inputs[3] + variance_name = normal_inputs[4] + + all_input_names = self._add_eightbit_prologue_nodes(matched_node.node.name) + all_input_names = [ + all_input_names[0], + scale_name, + offset_name, + mean_name, + variance_name, + all_input_names[1], + all_input_names[2] + ] + + for _, node in enumerate(self.input_graph.node): + if node.name in skip_node_name: + self.logger.debug("skip node {}".format(node.name)) + elif node.name == match_node_name[0]: + self.logger.debug("Matched node {} with input {}.".format(node.name, node.input)) + leakyrelu_node_name = match_node_name[1] + node_op = '_QuantizedFusedBatchNorm' + quantized_node_name = node.name + "_eightbit_quantized_bn" + output_min_node_name = quantized_node_name + "_input7_output_min" + output_max_node_name = quantized_node_name + "_input8_output_max" + quantized_node_input_names = all_input_names + \ + [output_min_node_name] + [output_max_node_name] + control_inputs + output_min_node = helper.create_constant_node(output_min_node_name, -1., dtypes.float32) + output_max_node = helper.create_constant_node(output_max_node_name, 1., dtypes.float32) + quantized_bn_node = helper.create_node(node_op, quantized_node_name, quantized_node_input_names) + + helper.set_attr_string(quantized_bn_node, "activation_mode", b'LeakyRelu') + helper.copy_attr(quantized_bn_node, "alpha", \ + self.node_name_mapping[leakyrelu_node_name].node.attr["alpha"]) + if self.node_name_mapping[offset_name].node.op == "Const": + helper.set_attr_bool(quantized_bn_node, "is_offset_const", True) + else: + helper.set_attr_bool(quantized_bn_node, "is_offset_const", False) + if self.node_name_mapping[mean_name].node.op == "Const": + helper.set_attr_bool(quantized_bn_node, "is_mean_const", True) + else: + helper.set_attr_bool(quantized_bn_node, "is_mean_const", False) + helper.set_attr_dtype(quantized_bn_node, "T", dtypes.qint8) + helper.set_attr_dtype(quantized_bn_node, "U", dtypes.float32) + 
helper.set_attr_dtype(quantized_bn_node, "Tout", dtypes.qint8) + + """ + # 0. x + # 1. scale + # 2. offset + # 3. mean + # 4. variance + # 5. x_min + # 6. x_max + # 7. {output_min} + # 8. {output_max} + """ + helper.set_attr_type_list(quantized_bn_node, 'input_types', [ + dtypes.qint8.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + ]) + + + """ + # 0. output + # 1. output_min + # 2. output_max + """ + helper.set_attr_type_list(quantized_bn_node, 'out_types', [ + dtypes.qint8.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + ]) + self.add_output_graph_node(output_min_node) + self.add_output_graph_node(output_max_node) + self.add_output_graph_node(quantized_bn_node) + self._intel_cpu_add_dequantize_result_node( + quantized_output_name = quantized_node_name, + original_node_name = match_node_name[-1], + dtype = dtypes.qint8, + min_tensor_index = 1, + performance_only=self.performance_only + ) + + else: + new_node = node_def_pb2.NodeDef() + new_node.CopyFrom(node) + self.add_output_graph_node(new_node) + def get_longest_fuse(self): self._get_op_list() real_patterns = [pattern[1 :-1] for pattern in self.sorted_patterns] diff --git a/neural_compressor/adaptor/tf_utils/quantize_graph/quantize_graph_bn.py b/neural_compressor/adaptor/tf_utils/quantize_graph/quantize_graph_bn.py index 5bf86c74e72..9a425505dc1 100644 --- a/neural_compressor/adaptor/tf_utils/quantize_graph/quantize_graph_bn.py +++ b/neural_compressor/adaptor/tf_utils/quantize_graph/quantize_graph_bn.py @@ -31,8 +31,9 @@ def __init__(self, **kwargs): reverse=True) if self.new_api: self.fusion_mapping = { + 'FusedBatchNormV3': self.apply_newly_bn_relu_fusion, 'FusedBatchNormV3Relu': self.apply_newly_bn_relu_fusion, - 'FusedBatchNormV3': self.apply_newly_bn_relu_fusion + 'FusedBatchNormV3LeakyRelu': self.apply_newly_bn_leakyrelu_fusion } else: self.fusion_mapping = {} @@ -75,8 +76,7 @@ def apply_newly_bn_relu_fusion(self, match_node_name): [output_min_node_name] + [output_max_node_name] + control_inputs output_min_node = helper.create_constant_node(output_min_node_name, -1., dtypes.float32) output_max_node = helper.create_constant_node(output_max_node_name, 1., dtypes.float32) - quantized_bn_node = helper.create_node(node_op, quantized_node_name, - quantized_node_input_names) + quantized_bn_node = helper.create_node(node_op, quantized_node_name, quantized_node_input_names) if relu_node_name is not None: helper.set_attr_string(quantized_bn_node, "activation_mode", b'Relu') if self.node_name_mapping[offset_name].node.op == "Const": @@ -140,6 +140,108 @@ def apply_newly_bn_relu_fusion(self, match_node_name): new_node.CopyFrom(node) self.add_output_graph_node(new_node) + def apply_newly_bn_leakyrelu_fusion(self, match_node_name): + matched_node = self.node_name_mapping[match_node_name[0]] + skip_node_name = match_node_name[1:] + control_inputs, normal_inputs = self._get_node_input( + matched_node.node.name) + scale_name = normal_inputs[1] + offset_name = normal_inputs[2] + mean_name = normal_inputs[3] + variance_name = normal_inputs[4] + + all_input_names = self._add_eightbit_prologue_nodes(matched_node.node.name) + all_input_names = [ + all_input_names[0], + scale_name, + offset_name, + mean_name, + variance_name, + all_input_names[1], + 
all_input_names[2] + ] + + for _, node in enumerate(self.input_graph.node): + if node.name in skip_node_name: + self.logger.debug("skip node {}".format(node.name)) + elif node.name == match_node_name[0]: + self.logger.debug("Matched node {} with input {}.".format(node.name, node.input)) + leakyrelu_node_name = match_node_name[1] + node_op = '_QuantizedFusedBatchNorm' + quantized_node_name = node.name + "_eightbit_quantized_bn" + output_min_node_name = quantized_node_name + "_input7_output_min" + output_max_node_name = quantized_node_name + "_input8_output_max" + quantized_node_input_names = all_input_names + \ + [output_min_node_name] + [output_max_node_name] + control_inputs + output_min_node = helper.create_constant_node(output_min_node_name, -1., dtypes.float32) + output_max_node = helper.create_constant_node(output_max_node_name, 1., dtypes.float32) + quantized_bn_node = helper.create_node(node_op, quantized_node_name, quantized_node_input_names) + + helper.set_attr_string(quantized_bn_node, "activation_mode", b'LeakyRelu') + helper.copy_attr(quantized_bn_node, "alpha", \ + self.node_name_mapping[leakyrelu_node_name].node.attr["alpha"]) + if self.node_name_mapping[offset_name].node.op == "Const": + helper.set_attr_bool(quantized_bn_node, "is_offset_const", True) + else: + helper.set_attr_bool(quantized_bn_node, "is_offset_const", False) + if self.node_name_mapping[mean_name].node.op == "Const": + helper.set_attr_bool(quantized_bn_node, "is_mean_const", True) + else: + helper.set_attr_bool(quantized_bn_node, "is_mean_const", False) + helper.set_attr_dtype(quantized_bn_node, "T", dtypes.qint8) + helper.set_attr_dtype(quantized_bn_node, "U", dtypes.float32) + helper.set_attr_dtype(quantized_bn_node, "Tout", dtypes.qint8) + + """ + # 0. x + # 1. scale + # 2. offset + # 3. mean + # 4. variance + # 5. x_min + # 6. x_max + # 7. {output_min} + # 8. {output_max} + """ + helper.set_attr_type_list(quantized_bn_node, 'input_types', [ + dtypes.qint8.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + ]) + + + """ + # 0. output + # 1. output_min + # 2. 
output_max + """ + helper.set_attr_type_list(quantized_bn_node, 'out_types', [ + dtypes.qint8.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + ]) + self.add_output_graph_node(output_min_node) + self.add_output_graph_node(output_max_node) + self.add_output_graph_node(quantized_bn_node) + self._intel_cpu_add_dequantize_result_node( + quantized_output_name = quantized_node_name, + original_node_name = match_node_name[-1], + dtype = dtypes.qint8, + min_tensor_index = 1, + performance_only=self.performance_only + ) + + else: + new_node = node_def_pb2.NodeDef() + new_node.CopyFrom(node) + self.add_output_graph_node(new_node) + def get_longest_fuse(self): self._get_op_list() matched_rule, matched_node_name = self._is_match(self.sorted_patterns) diff --git a/neural_compressor/adaptor/torch_utils/onnx.py b/neural_compressor/adaptor/torch_utils/onnx.py index aadcb80f810..c667281cb66 100644 --- a/neural_compressor/adaptor/torch_utils/onnx.py +++ b/neural_compressor/adaptor/torch_utils/onnx.py @@ -30,17 +30,30 @@ def __init__(self, dataloader, sample_size=100): self.datasize = self.batch_num * self.batch_size self.data = [] - for i, (input, label) in enumerate(self.dataloader): - if i * self.batch_size >= self.datasize: - break - if isinstance(input, dict) or isinstance(input, UserDict): - batch = {k: v.detach().cpu().numpy() for k, v in input.items()} - elif isinstance(input, list) or isinstance(input, tuple): - batch = {'input': [v.detach().cpu().numpy() for v in input]} - else: - batch = {'input': input.detach().cpu().numpy()} - self.data.append(batch) - self.data = iter(self.data) + try: + for i, (input, label) in enumerate(self.dataloader): + if i * self.batch_size >= self.datasize: + break + if isinstance(input, dict) or isinstance(input, UserDict): + batch = {k: v.detach().cpu().numpy() for k, v in input.items()} + elif isinstance(input, list) or isinstance(input, tuple): + batch = {'input': [v.detach().cpu().numpy() for v in input]} + else: + batch = {'input': input.detach().cpu().numpy()} + self.data.append(batch) + self.data = iter(self.data) + except: + for i, input in enumerate(self.dataloader): + if i * self.batch_size >= self.datasize: + break + if isinstance(input, dict) or isinstance(input, UserDict): + batch = {k: v.detach().cpu().numpy() for k, v in input.items()} + elif isinstance(input, list) or isinstance(input, tuple): + batch = {'input': [v.detach().cpu().numpy() for v in input]} + else: + batch = {'input': input.detach().cpu().numpy()} + self.data.append(batch) + self.data = iter(self.data) def get_next(self): return next(self.data, None) diff --git a/neural_compressor/benchmark.py b/neural_compressor/benchmark.py index 30e3bf8aa28..87d425a846b 100644 --- a/neural_compressor/benchmark.py +++ b/neural_compressor/benchmark.py @@ -18,6 +18,8 @@ from .utils import logger from .data import DATALOADERS from .experimental import Benchmark as ExpBenchmark +from .conf.pythonic_config import Config +from .config import BenchmarkConfig class Benchmark(object): """Benchmark class can be used to evaluate the model performance, with the objective @@ -67,9 +69,11 @@ def postprocess(self, name, postprocess_cls, **kwargs): self.exp_benchmarker.postprocess = nc_postprocess -def benchmark( +def fit( model, config=None, b_dataloader=None, b_func=None ): + if isinstance(config, BenchmarkConfig): + config = Config(benchmark=config) benchmarker = ExpBenchmark(config) benchmarker.model = model if b_func is not None: @@ -78,3 +82,6 @@ def benchmark( 
benchmarker.b_dataloader = b_dataloader benchmarker() return benchmarker.results + + +benchmark = fit diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index aef8f695291..86f1cac018b 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -746,6 +746,7 @@ def percent_to_float(data): 'pre_post_process_quantization': True}, 'model_wise': {'weight': {'bit': [7.0]}, 'activation': {}}, + 'optimization_level': 1, }): { Optional('approach', default='post_training_static_quant'): And( str, @@ -839,8 +840,10 @@ def percent_to_float(data): Optional('op_wise', default=None): { str: ops_schema }, + Optional('optimization_level', default=1): And(int, lambda level: level in [0, 1]), }, - Optional('use_bf16', default=False): bool, + Optional('use_bf16', default=True): bool, + Optional('optimization_level', default=1): And(int, lambda level: level in [0, 1]), Optional('graph_optimization'): graph_optimization_schema, Optional('mixed_precision'): mixed_precision_schema, @@ -1111,6 +1114,7 @@ def percent_to_float(data): 'activation': {}}, }): dict, Optional('use_bf16', default=False): bool, + Optional('optimization_level', default=1): int, Optional('tuning', default={ 'strategy': {'name': 'basic'}, 'accuracy_criterion': {'relative': 0.01, 'higher_is_better': True}, @@ -1346,8 +1350,17 @@ def map_pyconfig_to_cfg(self, pythonic_config): 'tuning.exit_policy.max_trials': pythonic_config.quantization.max_trials, 'tuning.exit_policy.performance_only': pythonic_config.quantization.performance_only, 'use_bf16': pythonic_config.quantization.use_bf16, + 'quantization.optimization_level': pythonic_config.quantization.optimization_level, 'reduce_range': pythonic_config.quantization.reduce_range }) + if pythonic_config.quantization.strategy_kwargs: + st_kwargs = pythonic_config.quantization.strategy_kwargs + for st_key in ['sigopt_api_token', 'sigopt_project_id', 'sigopt_experiment_name', \ + 'accuracy_weight', 'latency_weight']: + if st_key in st_kwargs: + st_val = st_kwargs[st_key] + mapping.update({'tuning.strategy.' + st_key: st_val}) + if pythonic_config.distillation is not None: mapping.update({ 'distillation.train.criterion': pythonic_config.distillation.criterion, @@ -1371,6 +1384,10 @@ def map_pyconfig_to_cfg(self, pythonic_config): 'tuning.tensorboard': pythonic_config.options.tensorboard, }) if pythonic_config.benchmark is not None: + if pythonic_config.benchmark.inputs != []: + mapping.update({'model.inputs': pythonic_config.benchmark.inputs}) + if pythonic_config.benchmark.outputs != []: + mapping.update({'model.outputs': pythonic_config.benchmark.outputs}) mapping.update({ 'evaluation.performance.warmup': pythonic_config.benchmark.warmup, 'evaluation.performance.iteration': pythonic_config.benchmark.iteration, diff --git a/neural_compressor/conf/pythonic_config.py b/neural_compressor/conf/pythonic_config.py index 89d0b773d40..c9975a9ebc6 100644 --- a/neural_compressor/conf/pythonic_config.py +++ b/neural_compressor/conf/pythonic_config.py @@ -16,377 +16,12 @@ # limitations under the License. 
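
The benchmark entry point above is renamed to `fit` (with `benchmark` kept as an alias) and now accepts the pythonic `BenchmarkConfig`, wrapping it into a `Config` object before running. A hedged usage sketch, assuming the new `BenchmarkConfig` keeps the `warmup`/`iteration` keyword arguments of the previous pythonic config; the `model` and `eval_dataloader` objects are placeholders from the caller's script, not defined by this patch:

```python
from neural_compressor.benchmark import fit
from neural_compressor.config import BenchmarkConfig

# Run a performance benchmark on an already-loaded model and dataloader.
conf = BenchmarkConfig(warmup=5, iteration=100)
results = fit(model, config=conf, b_dataloader=eval_dataloader)
```
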
import logging -import datetime -from typing import List -from schema import Schema, And, Use, Optional, Or from .dotdict import DotDict -from .config import Pruner +from ..config import _BaseQuantizationConfig, accuracy_criterion, BenchmarkConfig, \ + check_value, DistillationConfig, options, PruningConfig logger = logging.getLogger("neural_compressor") -default_workspace = './nc_workspace/{}/'.format( - datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) - -ops_schema = Schema({ - Optional('weight', default=None): { - Optional('granularity'): And( - list, - lambda s: all(i in ['per_channel', 'per_tensor'] for i in s)), - Optional('scheme'): And( - list, - lambda s: all(i in ['asym', 'sym', 'asym_float'] for i in s)), - Optional('dtype'): And( - list, - lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16'] for i in s)), - Optional('algorithm'): And( - list, - lambda s: all(i in ['minmax'] for i in s))}, - Optional('activation', default=None): { - Optional('granularity'): And( - list, - lambda s: all(i in ['per_channel', 'per_tensor'] for i in s)), - Optional('scheme'): And( - list, - lambda s: all(i in ['asym', 'sym'] for i in s)), - Optional('dtype'): And( - list, - lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16', 'None'] for i in s)), - Optional('algorithm'): And( - list, - lambda s: all(i in ['minmax', 'kl', 'placeholder'] for i in s))}}) - - -def check_value(name, src, supported_type, supported_value=[]): - if isinstance(src, list) and any([not isinstance(i, supported_type) for i in src]): - logger.warning("Type of {} items should be {} but not {}, " \ - "use its default value.".format(name, str(supported_type), [type(i) for i in src])) - return False - elif not isinstance(src, list) and not isinstance(src, supported_type): - logger.warning("Type of {} should be {} but not {}, " \ - "use its default value.".format(name, str(supported_type), type(src))) - return False - - if len(supported_value) > 0: - if isinstance(src, str) and src not in supported_value: - logger.warning("{} is not in supported {}: {}. Skip setting it and" \ - " use default value.".format(src, name, str(supported_value))) - return False - elif isinstance(src, list) and all([isinstance(i, str) for i in src]) and \ - any([i not in supported_value for i in src]): - logger.warning("{} is not in supported {}: {}. 
Skip setting it and" \ - " use default value.".format(src, name, str(supported_value))) - return False - - return True - -class BenchmarkConfig: - def __init__(self, warmup=5, iteration=-1, cores_per_instance=None, num_of_instance=None, - inter_num_of_threads=None, intra_num_of_threads=None): - self._warmup = warmup - self._iteration = iteration - self._cores_per_instance = cores_per_instance - self._num_of_instance = num_of_instance - self._inter_num_of_threads = inter_num_of_threads - self._intra_num_of_threads = intra_num_of_threads - - @property - def warmup(self): - return self._warmup - - @warmup.setter - def warmup(self, warmup): - if check_value('warmup', warmup, int): - self._warmup = warmup - - @property - def iteration(self): - return self._iteration - - @iteration.setter - def iteration(self, iteration): - if check_value('iteration', iteration, int): - self._iteration = iteration - - @property - def cores_per_instance(self): - return self._cores_per_instance - - @cores_per_instance.setter - def cores_per_instance(self, cores_per_instance): - if check_value('cores_per_instance', cores_per_instance, int): - self._cores_per_instance = cores_per_instance - - @property - def num_of_instance(self): - return self._num_of_instance - - @num_of_instance.setter - def num_of_instance(self, num_of_instance): - if check_value('num_of_instance', num_of_instance, int): - self._num_of_instance = num_of_instance - - @property - def inter_num_of_threads(self): - return self._inter_num_of_threads - - @inter_num_of_threads.setter - def inter_num_of_threads(self, inter_num_of_threads): - if check_value('inter_num_of_threads', inter_num_of_threads, int): - self._inter_num_of_threads = inter_num_of_threads - - @property - def intra_num_of_threads(self): - return self._intra_num_of_threads - - @intra_num_of_threads.setter - def intra_num_of_threads(self, intra_num_of_threads): - if check_value('intra_num_of_threads', intra_num_of_threads, int): - self._intra_num_of_threads = intra_num_of_threads - -class AccuracyLoss: - def __init__(self, loss=0.01): - self._loss = loss - - @property - def relative(self): - return self._loss - - @relative.setter - def relative(self, relative): - if check_value('relative tolerable loss', relative, float): - self._loss = relative - - @property - def absolute(self): - return self._loss - - @absolute.setter - def absolute(self, absolute): - if check_value('absolute tolerable loss', absolute, float): - self._loss = absolute - -tolerable_loss = AccuracyLoss() - -class AccuracyCriterion: - def __init__(self, higher_is_better=True, criterion='relative', tolerable_loss=tolerable_loss): - self._higher_is_better = higher_is_better - self._criterion = criterion - self._tolerable_loss = tolerable_loss - - @property - def higher_is_better(self): - return self._higher_is_better - - @higher_is_better.setter - def higher_is_better(self, higher_is_better): - if check_value('higher_is_better', higher_is_better, bool): - self._higher_is_better = higher_is_better - - @property - def relative(self): - if self._criterion != 'relative': - return None - return self._tolerable_loss.relative - - @relative.setter - def relative(self, relative): - self._criterion = 'relative' - self._tolerable_loss.relative = relative - - @property - def absolute(self): - if self._criterion != 'absolute': - return None - return self._tolerable_loss.absolute - - @absolute.setter - def absolute(self, absolute): - self._criterion = 'absolute' - self._tolerable_loss.absolute = absolute - - def __str__(self): - return 
self._criterion - -accuracy_criterion = AccuracyCriterion() - -class _BaseQuantizationConfig: - def __init__(self, - inputs=[], - outputs=[], - backend='NA', - device='cpu', - calibration_sampling_size=[100], - op_type_list=None, - op_name_list=None, - strategy='basic', - objective='performance', - timeout=0, - max_trials=100, - performance_only=False, - reduce_range=None, - use_bf16=False, - accuracy_criterion=accuracy_criterion): - self._inputs = inputs - self._outputs = outputs - self._backend = backend - self._device = device - self._op_type_list = op_type_list - self._op_name_list = op_name_list - self._strategy = strategy - self._objective = objective - self._timeout = timeout - self._max_trials = max_trials - self._performance_only = performance_only - self._reduce_range = reduce_range - self._use_bf16 = use_bf16 - self._accuracy_criterion = accuracy_criterion - self._calibration_sampling_size = calibration_sampling_size - - @property - def accuracy_criterion(self): - return self._accuracy_criterion - - @property - def use_bf16(self): - return self._use_bf16 - - @use_bf16.setter - def use_bf16(self, use_bf16): - if check_value('use_bf16', use_bf16, bool): - self._use_bf16 = use_bf16 - - @property - def reduce_range(self): - return self._reduce_range - - @reduce_range.setter - def reduce_range(self, reduce_range): - if check_value('reduce_range', reduce_range, bool): - self._reduce_range = reduce_range - - @property - def performance_only(self): - return self._performance_only - - @performance_only.setter - def performance_only(self, performance_only): - if check_value('performance_only', performance_only, bool): - self._performance_only = performance_only - - @property - def max_trials(self): - return self._max_trials - - @max_trials.setter - def max_trials(self, max_trials): - if check_value('max_trials', max_trials, int): - self._max_trials = max_trials - - @property - def timeout(self): - return self._timeout - - @timeout.setter - def timeout(self, timeout): - if check_value('timeout', timeout, int): - self._timeout = timeout - - @property - def objective(self): - return self._objective - - @objective.setter - def objective(self, objective): - if check_value('objective', objective, str, - ['performance', 'accuracy', 'modelsize', 'footprint']): - self._objective = objective - - @property - def strategy(self): - return self._strategy - - @strategy.setter - def strategy(self, strategy): - if check_value('strategy', strategy, str, - ['basic', 'mse', 'bayesian', 'random', 'exhaustive']): - self._strategy = strategy - - @property - def op_name_list(self): - return self._op_name_list - - @op_name_list.setter - def op_name_list(self, op_name_list): - if not isinstance(op_name_list, dict): - logger.warning("Type of op_name_list should be dict but not {}, " \ - "use its default value.".format(type(op_name_list))) - else: - for k, v in op_name_list.items(): - ops_schema.validate(v) - self._op_name_list = op_name_list - - @property - def op_type_list(self): - return self._op_type_list - - @op_type_list.setter - def op_type_list(self, op_type_list): - if not isinstance(op_type_list, dict): - logger.warning("Type of op_type_list should be dict but not {}, " \ - "use its default value.".format(type(op_type_list))) - else: - for k, v in op_type_list.items(): - ops_schema.validate(v) - self._op_type_list = op_type_list - - @property - def calibration_sampling_size(self): - return self._calibration_sampling_size - - @calibration_sampling_size.setter - def calibration_sampling_size(self, 
sampling_size): - if check_value('calibration_sampling_size', sampling_size, int): - self._calibration_sampling_size = sampling_size - - @property - def device(self): - return self._device - - @device.setter - def device(self, device): - if check_value('device', device, str, ['cpu', 'gpu']): - self._device = device - - @property - def backend(self): - return self._backend - - @backend.setter - def backend(self, backend): - if check_value('backend', backend, str, [ - 'tensorflow', 'tensorflow_itex', 'pytorch', 'pytorch_ipex', 'pytorch_fx', - 'onnxrt_qlinearops', 'onnxrt_integerops', 'onnxrt_qdq', 'onnxrt_qoperator', 'mxnet' - ]): - self._backend = backend - - @property - def outputs(self): - return self._outputs - - @outputs.setter - def outputs(self, outputs): - if check_value('outputs', outputs, str): - self._outputs = outputs - - @property - def inputs(self): - return self._inputs - - @inputs.setter - def inputs(self, inputs): - if check_value('inputs', inputs, str): - self._inputs = inputs - class QuantizationConfig(_BaseQuantizationConfig): def __init__(self, @@ -399,16 +34,33 @@ def __init__(self, op_type_list=None, op_name_list=None, strategy='basic', + strategy_kwargs=None, objective='performance', timeout=0, max_trials=100, performance_only=False, reduce_range=None, - use_bf16=False, + use_bf16=True, + optimization_level=1, accuracy_criterion=accuracy_criterion): - super().__init__(inputs, outputs, backend, device, calibration_sampling_size, op_type_list, - op_name_list, strategy, objective, timeout, max_trials, performance_only, - reduce_range, use_bf16, accuracy_criterion) + extra_precisions = ["bf16"] if use_bf16 else [] + super().__init__(inputs=inputs, + outputs=outputs, + backend=backend, + device=device, + calibration_sampling_size=calibration_sampling_size, + op_type_list=op_type_list, + op_name_list=op_name_list, + strategy=strategy, + strategy_kwargs=strategy_kwargs, + objective=objective, + timeout=timeout, + max_trials=max_trials, + performance_only=performance_only, + reduce_range=reduce_range, + extra_precisions=extra_precisions, + optimization_level=optimization_level, + accuracy_criterion=accuracy_criterion) self._approach = approach @property @@ -424,112 +76,6 @@ def approach(self, approach): self._approach = approach -class PostTrainingConfig(_BaseQuantizationConfig): - def __init__(self, - inputs=[], - outputs=[], - backend='NA', - device='cpu', - approach='post_training_auto_quant', - calibration_sampling_size=[100], - op_type_list=None, - op_name_list=None, - strategy='basic', - objective='performance', - timeout=0, - max_trials=100, - performance_only=False, - reduce_range=None, - use_bf16=False, - accuracy_criterion=accuracy_criterion): - super().__init__(inputs, outputs, backend, device, calibration_sampling_size, op_type_list, - op_name_list, strategy, objective, timeout, max_trials, performance_only, - reduce_range, use_bf16, accuracy_criterion) - self._approach = approach - - @property - def approach(self): - return self._approach - - @approach.setter - def approach(self, approach): - if check_value("approach", approach, str, [ - "post_training_static_quant", "post_training_dynamic_quant", - "post_training_auto_quant" - ]): - self._approach = approach - - -class QuantizationAwareTrainingConfig(_BaseQuantizationConfig): - def __init__(self, - inputs=[], - outputs=[], - backend='NA', - device='cpu', - op_type_list=None, - op_name_list=None, - reduce_range=None, - use_bf16=False): - super().__init__(inputs=inputs, outputs=outputs, backend=backend, 
device=device, - op_type_list=op_type_list, op_name_list=op_name_list, - reduce_range=reduce_range, use_bf16=use_bf16) - self._approach = 'quant_aware_training' - - @property - def approach(self): - return self._approach - - @approach.setter - def approach(self, approach): - if check_value('approach', approach, str, - ['quant_aware_training']): - self._approach = approach - - -class Options: - def __init__(self, random_seed=1978, workspace=default_workspace, - resume_from=None, tensorboard=False): - self._random_seed = random_seed - self._workspace = workspace - self._resume_from = resume_from - self._tensorboard = tensorboard - - @property - def random_seed(self): - return self._random_seed - - @random_seed.setter - def random_seed(self, random_seed): - if check_value('random_seed', random_seed, int): - self._random_seed = random_seed - - @property - def workspace(self): - return self._workspace - - @workspace.setter - def workspace(self, workspace): - if check_value('workspace', workspace, str): - self._workspace = workspace - - @property - def resume_from(self): - return self._resume_from - - @resume_from.setter - def resume_from(self, resume_from): - if check_value('resume_from', resume_from, str): - self._resume_from = resume_from - - @property - def tensorboard(self): - return self._tensorboard - - @tensorboard.setter - def tensorboard(self, tensorboard): - if check_value('tensorboard', tensorboard, bool): - self._tensorboard = tensorboard - class WeightConf: def __init__(self, datatype=None, scheme=None, granularity=None, algorithm=None): self._datatype = datatype @@ -641,134 +187,6 @@ class PyTorch(MXNet): def __init__(self, precisions=None): super().__init__(precisions) -pruners = [Pruner()] - -class PruningConfig: - def __init__(self, pruners=pruners, initial_sparsity=0.0, target_sparsity=0.97, - max_sparsity_ratio_per_layer=0.98, prune_type="basic_magnitude", - start_epoch=0, end_epoch=4, start_step=0, end_step=0, update_frequency=1.0, - update_frequency_on_step=1, not_to_prune_names=[], prune_domain="global", - names=[], exclude_names=[], prune_layer_type=[], sparsity_decay_type="exp", - pattern="tile_pattern_1x1"): - self._weight_compression = DotDict({ - 'initial_sparsity': initial_sparsity, - 'target_sparsity': target_sparsity, - 'max_sparsity_ratio_per_layer': max_sparsity_ratio_per_layer, - 'prune_type': prune_type, - 'start_epoch': start_epoch, - 'end_epoch': end_epoch, - 'start_step': start_step, - 'end_step': end_step, - 'update_frequency': update_frequency, - 'update_frequency_on_step': update_frequency_on_step, - 'not_to_prune_names': not_to_prune_names, - 'prune_domain': prune_domain, - 'names': names, - 'exclude_names': exclude_names, - 'prune_layer_type': prune_layer_type, - 'sparsity_decay_type': sparsity_decay_type, - 'pattern': pattern, - 'pruners': pruners - }) - - @property - def weight_compression(self): - return self._weight_compression - - @weight_compression.setter - def weight_compression(self, weight_compression): - self._weight_compression = weight_compression - - -class KnowledgeDistillationLossConfig: - def __init__(self, temperature=1.0, loss_types=['CE', 'CE'], loss_weights=[0.5, 0.5]): - self.config = DotDict({ - 'KnowledgeDistillationLoss': { - 'temperature': temperature, - 'loss_types': loss_types, - 'loss_weights': loss_weights - } - }) - - -class IntermediateLayersKnowledgeDistillationLossConfig: - def __init__(self, layer_mappings=[], loss_types=[], loss_weights=[], add_origin_loss=False): - self.config = DotDict({ - 
'IntermediateLayersKnowledgeDistillationLoss': { - 'layer_mappings': layer_mappings, - 'loss_types': loss_types, - 'loss_weights': loss_weights, - 'add_origin_loss': add_origin_loss - } - }) - - -class SelfKnowledgeDistillationLossConfig: - def __init__(self, - layer_mappings=[], - temperature=1.0, - loss_types=[], - loss_weights=[], - add_origin_loss=False): - self.config = DotDict({ - 'SelfKnowledgeDistillationLoss': { - 'layer_mappings': layer_mappings, - 'temperature': temperature, - 'loss_types': loss_types, - 'loss_weights': loss_weights, - 'add_origin_loss': add_origin_loss, - } - }) - - -criterion = KnowledgeDistillationLossConfig() - -class DistillationConfig: - """Config of distillation. - - Args: - - teacher_model (Callable): Teacher model for distillation. Defaults to None. - features (optional): Teacher features for distillation, features and teacher_model are alternative. - Defaults to None. - criterion (Callable, optional): Distillation loss configure. - optimizer (dictionary, optional): Optimizer configure. - """ - - def __init__(self, - teacher_model, - criterion=criterion, - optimizer={'SGD': { - 'learning_rate': 0.0001 - }}): - self._criterion = criterion.config - self._optimizer = optimizer - self._teacher_model = teacher_model - - @property - def criterion(self): - return self._criterion - - @criterion.setter - def criterion(self, criterion): - self._criterion = criterion - - @property - def optimizer(self): - return self._optimizer - - @optimizer.setter - def optimizer(self, optimizer): - self._optimizer = optimizer - - @property - def teacher_model(self): - return self._teacher_model - - @teacher_model.setter - def teacher_model(self, teacher_model): - self._teacher_model = teacher_model - class DyNASConfig: def __init__(self, supernet=None, metrics=None, population=50, num_evals=100000, @@ -816,41 +234,8 @@ def search(self, search): self._search = search -class MixedPrecisionConfig(PostTrainingConfig): - def __init__(self, - inputs=[], - outputs=[], - backend='NA', - device='cpu', - op_type_list=None, - op_name_list=None, - strategy='basic', - objective='performance', - timeout=0, - max_trials=100, - performance_only=False, - reduce_range=None, - accuracy_criterion=accuracy_criterion, - precisions=["bf16"]): - super().__init__(inputs, outputs, backend, device, op_type_list=op_type_list, - op_name_list=op_name_list, strategy=strategy, objective=objective, - timeout=timeout, max_trials=max_trials, performance_only=performance_only, - reduce_range=reduce_range, accuracy_criterion=accuracy_criterion, - use_bf16=True) - self._precisions = precisions if isinstance(precisions, List) else [precisions] - - @property - def precisions(self): - return self._precisions - - @precisions.setter - def precisions(self, precisions): - self._precisions = precisions - - quantization = QuantizationConfig() benchmark = BenchmarkConfig() -options = Options() pruning = PruningConfig() distillation = DistillationConfig(teacher_model=None) nas = NASConfig() diff --git a/neural_compressor/config.py b/neural_compressor/config.py new file mode 100644 index 00000000000..b3a9fd4352e --- /dev/null +++ b/neural_compressor/config.py @@ -0,0 +1,912 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import logging +from typing import List +from schema import Schema, And, Optional +from .conf.dotdict import DotDict +from .conf.config import Pruner + +logger = logging.getLogger("neural_compressor") +default_workspace = './nc_workspace/{}/'.format( + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + +QUANTMAPPING = { + "auto": "post_training_auto_quant", + "dynamic": "post_training_dynamic_quant", + "static": "post_training_static_quant", + "qat": "quant_aware_training", +} + + +ops_schema = Schema({ + Optional('weight', default=None): { + Optional('granularity'): And( + list, + lambda s: all(i in ['per_channel', 'per_tensor'] for i in s)), + Optional('scheme'): And( + list, + lambda s: all(i in ['asym', 'sym', 'asym_float'] for i in s)), + Optional('dtype'): And( + list, + lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16'] for i in s)), + Optional('algorithm'): And( + list, + lambda s: all(i in ['minmax'] for i in s))}, + Optional('activation', default=None): { + Optional('granularity'): And( + list, + lambda s: all(i in ['per_channel', 'per_tensor'] for i in s)), + Optional('scheme'): And( + list, + lambda s: all(i in ['asym', 'sym'] for i in s)), + Optional('dtype'): And( + list, + lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16', 'None'] for i in s)), + Optional('algorithm'): And( + list, + lambda s: all(i in ['minmax', 'kl', 'placeholder'] for i in s))}}) + + +def check_value(name, src, supported_type, supported_value=[]): + if isinstance(src, list) and any([not isinstance(i, supported_type) for i in src]): + logger.warning("Type of {} items should be {} but not {}, " \ + "use its default value.".format(name, str(supported_type), [type(i) for i in src])) + return False + elif not isinstance(src, list) and not isinstance(src, supported_type): + logger.warning("Type of {} should be {} but not {}, " \ + "use its default value.".format(name, str(supported_type), type(src))) + return False + + if len(supported_value) > 0: + if isinstance(src, str) and src not in supported_value: + logger.warning("{} is not in supported {}: {}. Skip setting it and" \ + " use default value.".format(src, name, str(supported_value))) + return False + elif isinstance(src, list) and all([isinstance(i, str) for i in src]) and \ + any([i not in supported_value for i in src]): + logger.warning("{} is not in supported {}: {}. 
Skip setting it and" \ + " use default value.".format(src, name, str(supported_value))) + return False + + return True + + +class Options: + def __init__(self, random_seed=1978, workspace=default_workspace, + resume_from=None, tensorboard=False): + self._random_seed = random_seed + self._workspace = workspace + self._resume_from = resume_from + self._tensorboard = tensorboard + + @property + def random_seed(self): + return self._random_seed + + @random_seed.setter + def random_seed(self, random_seed): + if check_value('random_seed', random_seed, int): + self._random_seed = random_seed + + @property + def workspace(self): + return self._workspace + + @workspace.setter + def workspace(self, workspace): + if check_value('workspace', workspace, str): + self._workspace = workspace + + @property + def resume_from(self): + return self._resume_from + + @resume_from.setter + def resume_from(self, resume_from): + if check_value('resume_from', resume_from, str): + self._resume_from = resume_from + + @property + def tensorboard(self): + return self._tensorboard + + @tensorboard.setter + def tensorboard(self, tensorboard): + if check_value('tensorboard', tensorboard, bool): + self._tensorboard = tensorboard + + +options = Options() + + +class BenchmarkConfig: + def __init__(self, + inputs=[], + outputs=[], + warmup=5, + iteration=-1, + cores_per_instance=None, + num_of_instance=None, + inter_num_of_threads=None, + intra_num_of_threads=None): + self._inputs = inputs + self._outputs = outputs + self._warmup = warmup + self._iteration = iteration + self._cores_per_instance = cores_per_instance + self._num_of_instance = num_of_instance + self._inter_num_of_threads = inter_num_of_threads + self._intra_num_of_threads = intra_num_of_threads + + @property + def outputs(self): + return self._outputs + + @outputs.setter + def outputs(self, outputs): + if check_value('outputs', outputs, str): + self._outputs = outputs + + @property + def inputs(self): + return self._inputs + + @inputs.setter + def inputs(self, inputs): + if check_value('inputs', inputs, str): + self._inputs = inputs + + @property + def warmup(self): + return self._warmup + + @warmup.setter + def warmup(self, warmup): + if check_value('warmup', warmup, int): + self._warmup = warmup + + @property + def iteration(self): + return self._iteration + + @iteration.setter + def iteration(self, iteration): + if check_value('iteration', iteration, int): + self._iteration = iteration + + @property + def cores_per_instance(self): + return self._cores_per_instance + + @cores_per_instance.setter + def cores_per_instance(self, cores_per_instance): + if check_value('cores_per_instance', cores_per_instance, int): + self._cores_per_instance = cores_per_instance + + @property + def num_of_instance(self): + return self._num_of_instance + + @num_of_instance.setter + def num_of_instance(self, num_of_instance): + if check_value('num_of_instance', num_of_instance, int): + self._num_of_instance = num_of_instance + + @property + def inter_num_of_threads(self): + return self._inter_num_of_threads + + @inter_num_of_threads.setter + def inter_num_of_threads(self, inter_num_of_threads): + if check_value('inter_num_of_threads', inter_num_of_threads, int): + self._inter_num_of_threads = inter_num_of_threads + + @property + def intra_num_of_threads(self): + return self._intra_num_of_threads + + @intra_num_of_threads.setter + def intra_num_of_threads(self, intra_num_of_threads): + if check_value('intra_num_of_threads', intra_num_of_threads, int): + self._intra_num_of_threads = 
intra_num_of_threads + + +class AccuracyLoss: + def __init__(self, loss=0.01): + self._loss = loss + + @property + def relative(self): + return self._loss + + @relative.setter + def relative(self, relative): + if check_value('relative tolerable loss', relative, float): + self._loss = relative + + @property + def absolute(self): + return self._loss + + @absolute.setter + def absolute(self, absolute): + if check_value('absolute tolerable loss', absolute, float): + self._loss = absolute + + +tolerable_loss = AccuracyLoss() + + +class AccuracyCriterion: + def __init__(self, higher_is_better=True, criterion='relative', tolerable_loss=tolerable_loss): + self._higher_is_better = higher_is_better + self._criterion = criterion + self._tolerable_loss = tolerable_loss + + @property + def higher_is_better(self): + return self._higher_is_better + + @higher_is_better.setter + def higher_is_better(self, higher_is_better): + if check_value('higher_is_better', higher_is_better, bool): + self._higher_is_better = higher_is_better + + @property + def relative(self): + if self._criterion != 'relative': + return None + return self._tolerable_loss.relative + + @relative.setter + def relative(self, relative): + self._criterion = 'relative' + self._tolerable_loss.relative = relative + + @property + def absolute(self): + if self._criterion != 'absolute': + return None + return self._tolerable_loss.absolute + + @absolute.setter + def absolute(self, absolute): + self._criterion = 'absolute' + self._tolerable_loss.absolute = absolute + + def __str__(self): + return self._criterion + + +accuracy_criterion = AccuracyCriterion() + + +class _BaseQuantizationConfig: + def __init__(self, + inputs=[], + outputs=[], + backend="NA", + device="cpu", + calibration_sampling_size=[100], + op_type_list=None, + op_name_list=None, + strategy="basic", + strategy_kwargs=None, + objective="performance", + timeout=0, + max_trials=100, + performance_only=False, + reduce_range=None, + extra_precisions=["bf16"], + optimization_level=1, + accuracy_criterion=accuracy_criterion): + self._inputs = inputs + self._outputs = outputs + self._backend = backend + self._device = device + self._op_type_list = op_type_list + self._op_name_list = op_name_list + self._strategy = strategy + self._strategy_kwargs = strategy_kwargs + self._objective = objective + self._timeout = timeout + self._max_trials = max_trials + self._performance_only = performance_only + self._reduce_range = reduce_range + self._extra_precisions = extra_precisions \ + if isinstance(extra_precisions, List) else [extra_precisions] + self._optimization_level = optimization_level + self.use_bf16 = "bf16" in self._extra_precisions + self._accuracy_criterion = accuracy_criterion + self._calibration_sampling_size = calibration_sampling_size + + @property + def accuracy_criterion(self): + return self._accuracy_criterion + + @property + def extra_precisions(self): + return self._extra_precisions + + @extra_precisions.setter + def extra_precisions(self, extra_precisions): + if check_value('extra_precisions', extra_precisions, List): + self._extra_precisions = extra_precisions + self._use_bf16 = "bf16" in extra_precisions + + @property + def optimization_level(self): + return self._optimization_level + + @optimization_level.setter + def optimization_level(self, optimization_level): + self._optimization_level = optimization_level + + @property + def reduce_range(self): + return self._reduce_range + + @reduce_range.setter + def reduce_range(self, reduce_range): + if check_value('reduce_range', 
reduce_range, bool): + self._reduce_range = reduce_range + + @property + def performance_only(self): + return self._performance_only + + @performance_only.setter + def performance_only(self, performance_only): + if check_value('performance_only', performance_only, bool): + self._performance_only = performance_only + + @property + def max_trials(self): + return self._max_trials + + @max_trials.setter + def max_trials(self, max_trials): + if check_value('max_trials', max_trials, int): + self._max_trials = max_trials + + @property + def timeout(self): + return self._timeout + + @timeout.setter + def timeout(self, timeout): + if check_value('timeout', timeout, int): + self._timeout = timeout + + @property + def objective(self): + return self._objective + + @objective.setter + def objective(self, objective): + if check_value('objective', objective, str, + ['performance', 'accuracy', 'modelsize', 'footprint']): + self._objective = objective + + @property + def strategy(self): + return self._strategy + + @strategy.setter + def strategy(self, strategy): + if check_value('strategy', strategy, str, + ['basic', 'mse', 'bayesian', 'random', 'exhaustive', 'sigopt', 'tpe']): + self._strategy = strategy + + @property + def strategy_kwargs(self): + return self._strategy_kwargs + + @strategy_kwargs.setter + def strategy_kwargs(self, strategy_kwargs): + self._strategy_kwargs = strategy_kwargs + + @property + def op_name_list(self): + return self._op_name_list + + @op_name_list.setter + def op_name_list(self, op_name_list): + if not isinstance(op_name_list, dict): + logger.warning("Type of op_name_list should be dict but not {}, " \ + "use its default value.".format(type(op_name_list))) + else: + for k, v in op_name_list.items(): + ops_schema.validate(v) + self._op_name_list = op_name_list + + @property + def op_type_list(self): + return self._op_type_list + + @op_type_list.setter + def op_type_list(self, op_type_list): + if not isinstance(op_type_list, dict): + logger.warning("Type of op_type_list should be dict but not {}, " \ + "use its default value.".format(type(op_type_list))) + else: + for k, v in op_type_list.items(): + ops_schema.validate(v) + self._op_type_list = op_type_list + + @property + def calibration_sampling_size(self): + return self._calibration_sampling_size + + @calibration_sampling_size.setter + def calibration_sampling_size(self, sampling_size): + if check_value('calibration_sampling_size', sampling_size, int): + self._calibration_sampling_size = sampling_size + + @property + def device(self): + return self._device + + @device.setter + def device(self, device): + if check_value('device', device, str, ['cpu', 'gpu']): + self._device = device + + @property + def backend(self): + return self._backend + + @backend.setter + def backend(self, backend): + if check_value('backend', backend, str, [ + 'tensorflow', 'tensorflow_itex', 'pytorch', 'pytorch_ipex', 'pytorch_fx', + 'onnxrt_qlinearops', 'onnxrt_integerops', 'onnxrt_qdq', 'onnxrt_qoperator', 'mxnet' + ]): + self._backend = backend + + @property + def outputs(self): + return self._outputs + + @outputs.setter + def outputs(self, outputs): + if check_value('outputs', outputs, str): + self._outputs = outputs + + @property + def inputs(self): + return self._inputs + + @inputs.setter + def inputs(self, inputs): + if check_value('inputs', inputs, str): + self._inputs = inputs + + +class TuningCriterion: + def __init__(self, strategy="basic", strategy_kwargs=None, timeout=0, max_trials=100, objective="performance"): + self._strategy = strategy 
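As an aside on the validated setters above (and the AccuracyCriterion defined earlier in this new config.py): they log a warning and keep the previous value instead of raising on bad input. A minimal usage sketch, assuming the neural_compressor.config module added by this diff is importable; the values are illustrative:

```python
from neural_compressor.config import AccuracyCriterion, BenchmarkConfig

acc_cr = AccuracyCriterion(higher_is_better=True)
acc_cr.relative = 0.01      # tolerate up to a 1% relative accuracy drop
print(str(acc_cr))          # -> "relative"
acc_cr.absolute = 0.3       # switch criteria; .relative now reports None
print(acc_cr.relative)      # -> None

bench = BenchmarkConfig(warmup=5, iteration=100)
bench.warmup = "ten"        # wrong type: check_value() logs a warning
print(bench.warmup)         # -> 5, the previous value is kept
```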
+ self._timeout = timeout + self._max_trials = max_trials + self._objective = objective + self._strategy_kwargs = strategy_kwargs + + @property + def max_trials(self): + return self._max_trials + + @max_trials.setter + def max_trials(self, max_trials): + if check_value('max_trials', max_trials, int): + self._max_trials = max_trials + + @property + def timeout(self): + return self._timeout + + @timeout.setter + def timeout(self, timeout): + if check_value('timeout', timeout, int): + self._timeout = timeout + + @property + def objective(self): + return self._objective + + @objective.setter + def objective(self, objective): + if check_value('objective', objective, str, + ['performance', 'accuracy', 'modelsize', 'footprint']): + self._objective = objective + + @property + def strategy(self): + return self._strategy + + @strategy.setter + def strategy(self, strategy): + if check_value('strategy', strategy, str, + ['basic', 'mse', 'bayesian', 'random', 'exhaustive', 'sigopt', 'tpe']): + self._strategy = strategy + + @property + def strategy_kwargs(self): + return self._strategy_kwargs + + @strategy_kwargs.setter + def strategy_kwargs(self, strategy_kwargs): + self._strategy_kwargs = strategy_kwargs + +tuning_criterion = TuningCriterion() + + +class PostTrainingQuantConfig(_BaseQuantizationConfig): + def __init__(self, + device="cpu", + backend="NA", + inputs=[], + outputs=[], + approach="auto", + calibration_sampling_size=[100], + op_type_list=None, + op_name_list=None, + reduce_range=None, + extra_precisions = ["bf16"], + optimization_level=1, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion, + ): + super().__init__(inputs=inputs, + outputs=outputs, + device=device, + backend=backend, + calibration_sampling_size=calibration_sampling_size, + op_type_list=op_type_list, + op_name_list=op_name_list, + strategy=tuning_criterion.strategy, + strategy_kwargs=tuning_criterion.strategy_kwargs, + objective=tuning_criterion.objective, + timeout=tuning_criterion.timeout, + max_trials=tuning_criterion.max_trials, + reduce_range=reduce_range, + extra_precisions=extra_precisions, + optimization_level=optimization_level, + accuracy_criterion=accuracy_criterion) + self.approach = approach + + @property + def approach(self): + return self._approach + + @approach.setter + def approach(self, approach): + if check_value("approach", approach, str, ["static", "dynamic", "auto"]): + self._approach = QUANTMAPPING[approach] + + +class QuantizationAwareTrainingConfig(_BaseQuantizationConfig): + def __init__(self, + device="cpu", + backend="NA", + inputs=[], + outputs=[], + op_type_list=None, + op_name_list=None, + reduce_range=None, + extra_precisions=["bf16"], + optimization_level=1): + super().__init__(inputs=inputs, outputs=outputs, device=device, backend=backend, + op_type_list=op_type_list, op_name_list=op_name_list, + reduce_range=reduce_range, extra_precisions=extra_precisions, + optimization_level=optimization_level) + self._approach = 'quant_aware_training' + + @property + def approach(self): + return self._approach + + +pruners = [Pruner()] + + +class PruningConfig: + def __init__(self, pruners=pruners, initial_sparsity=0.0, target_sparsity=0.97, + max_sparsity_ratio_per_layer=0.98, prune_type="basic_magnitude", + start_epoch=0, end_epoch=4, start_step=0, end_step=0, update_frequency=1.0, + update_frequency_on_step=1, not_to_prune_names=[], prune_domain="global", + names=[], exclude_names=[], prune_layer_type=[], sparsity_decay_type="exp", + pattern="tile_pattern_1x1"): + 
self._weight_compression = DotDict({ + 'initial_sparsity': initial_sparsity, + 'target_sparsity': target_sparsity, + 'max_sparsity_ratio_per_layer': max_sparsity_ratio_per_layer, + 'prune_type': prune_type, + 'start_epoch': start_epoch, + 'end_epoch': end_epoch, + 'start_step': start_step, + 'end_step': end_step, + 'update_frequency': update_frequency, + 'update_frequency_on_step': update_frequency_on_step, + 'not_to_prune_names': not_to_prune_names, + 'prune_domain': prune_domain, + 'names': names, + 'exclude_names': exclude_names, + 'prune_layer_type': prune_layer_type, + 'sparsity_decay_type': sparsity_decay_type, + 'pattern': pattern, + 'pruners': pruners + }) + + @property + def weight_compression(self): + return self._weight_compression + + @weight_compression.setter + def weight_compression(self, weight_compression): + self._weight_compression = weight_compression + + +class KnowledgeDistillationLossConfig: + def __init__(self, temperature=1.0, loss_types=['CE', 'CE'], loss_weights=[0.5, 0.5]): + self.config = DotDict({ + 'KnowledgeDistillationLoss': { + 'temperature': temperature, + 'loss_types': loss_types, + 'loss_weights': loss_weights + } + }) + + +class IntermediateLayersKnowledgeDistillationLossConfig: + def __init__(self, layer_mappings=[], loss_types=[], loss_weights=[], add_origin_loss=False): + self.config = DotDict({ + 'IntermediateLayersKnowledgeDistillationLoss': { + 'layer_mappings': layer_mappings, + 'loss_types': loss_types, + 'loss_weights': loss_weights, + 'add_origin_loss': add_origin_loss + } + }) + + +class SelfKnowledgeDistillationLossConfig: + def __init__(self, + layer_mappings=[], + temperature=1.0, + loss_types=[], + loss_weights=[], + add_origin_loss=False): + self.config = DotDict({ + 'SelfKnowledgeDistillationLoss': { + 'layer_mappings': layer_mappings, + 'temperature': temperature, + 'loss_types': loss_types, + 'loss_weights': loss_weights, + 'add_origin_loss': add_origin_loss, + } + }) + + +criterion = KnowledgeDistillationLossConfig() + + +class DistillationConfig: + """Config of distillation. + + Args: + + teacher_model (Callable): Teacher model for distillation. Defaults to None. + features (optional): Teacher features for distillation, features and teacher_model are alternative. + Defaults to None. + criterion (Callable, optional): Distillation loss configure. + optimizer (dictionary, optional): Optimizer configure. 
+ """ + + def __init__(self, + teacher_model, + criterion=criterion, + optimizer={'SGD': { + 'learning_rate': 0.0001 + }}): + self._criterion = criterion.config + self._optimizer = optimizer + self._teacher_model = teacher_model + + @property + def criterion(self): + return self._criterion + + @criterion.setter + def criterion(self, criterion): + self._criterion = criterion + + @property + def optimizer(self): + return self._optimizer + + @optimizer.setter + def optimizer(self, optimizer): + self._optimizer = optimizer + + @property + def teacher_model(self): + return self._teacher_model + + @teacher_model.setter + def teacher_model(self, teacher_model): + self._teacher_model = teacher_model + + +class MixedPrecisionConfig(PostTrainingQuantConfig): + def __init__(self, + device="cpu", + backend="NA", + inputs=[], + outputs=[], + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion, + extra_precisions=["bf16"]): + super().__init__(inputs=inputs, + outputs=outputs, + device=device, + backend=backend, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion, + extra_precisions=extra_precisions, + ) + + +class ExportConfig: + def __init__( + self, + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=None, + input_names=None, + output_names=None, + dynamic_axes=None, + ): + self._dtype = dtype + self._opset_version = opset_version + self._quant_format = quant_format + self._example_inputs = example_inputs + self._input_names = input_names + self._output_names = output_names + self._dynamic_axes = dynamic_axes + + @property + def dtype(self): + return self._dtype + + @dtype.setter + def dtype(self, dtype): + self._dtype = dtype + + @property + def opset_version(self): + return self._opset_version + + @opset_version.setter + def opset_version(self, opset_version): + self._opset_version = opset_version + + @property + def quant_format(self): + return self._quant_format + + @quant_format.setter + def quant_format(self, quant_format): + self._quant_format = quant_format + + @property + def example_inputs(self): + return self._example_inputs + + @example_inputs.setter + def example_inputs(self, example_inputs): + self._example_inputs = example_inputs + + @property + def input_names(self): + return self._input_names + + @input_names.setter + def input_names(self, input_names): + self._input_names = input_names + + @property + def output_names(self): + return self._output_names + + @output_names.setter + def output_names(self, output_names): + self._output_names = output_names + + @property + def dynamic_axes(self): + return self._dynamic_axes + + @dynamic_axes.setter + def dynamic_axes(self, dynamic_axes): + self._dynamic_axes = dynamic_axes + + +class Torch2ONNXConfig(ExportConfig): + def __init__( + self, + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=None, + input_names=None, + output_names=None, + dynamic_axes=None, + **kwargs, + ): + super().__init__( + dtype=dtype, + opset_version=opset_version, + quant_format=quant_format, + example_inputs=example_inputs, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + ) + self.kwargs = kwargs + + +class TF2ONNXConfig(ExportConfig): + def __init__( + self, + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=None, + input_names=None, + output_names=None, + dynamic_axes=None, + **kwargs, + ): + super().__init__( + dtype=dtype, + opset_version=opset_version, + quant_format=quant_format, + 
example_inputs=example_inputs,
+ input_names=input_names,
+ output_names=output_names,
+ dynamic_axes=dynamic_axes,
+ )
+ self.kwargs = kwargs
+
+
+def set_random_seed(seed: int):
+ options.random_seed = seed
+
+
+def set_workspace(workspace: str):
+ options.workspace = workspace
+
+
+def set_resume_from(resume_from: str):
+ options.resume_from = resume_from
+
+
+def set_tensorboard(tensorboard: bool):
+ options.tensorboard = tensorboard
diff --git a/neural_compressor/contrib/strategy/sigopt.py b/neural_compressor/contrib/strategy/sigopt.py
index 54593fb2e32..19b3ae1ed3e 100644
--- a/neural_compressor/contrib/strategy/sigopt.py
+++ b/neural_compressor/contrib/strategy/sigopt.py
@@ -17,12 +17,14 @@ import copy
 from neural_compressor.utils import logger
+from neural_compressor.utils.utility import LazyImport
 from neural_compressor.strategy.strategy import strategy_registry, TuneStrategy
-from sigopt import Connection
 from collections import OrderedDict
 from neural_compressor.strategy.st_utils.tuning_sampler import OpWiseTuningSampler
 from neural_compressor.strategy.st_utils.tuning_structs import OpTuningConfig
+sigopt = LazyImport('sigopt')
+
+
 @strategy_registry
 class SigOptTuneStrategy(TuneStrategy):
 """The tuning strategy using SigOpt HPO search in tuning space.
@@ -80,7 +82,15 @@ def __init__(self, model, conf, q_dataloader, q_func=None,
 eval_func,
 dicts,
 q_hooks)
-
+ # Initialize the SigOpt tuning strategy if the user specified to use it.
+ strategy_name = conf.usr_cfg.tuning.strategy.name
+ if strategy_name.lower() == "sigopt":
+ try:
+ import sigopt
+ except ImportError:
+ raise ImportError(f"Please install sigopt for using {strategy_name} strategy.")
+ else:
+ pass
 # SigOpt init
 client_token = conf.usr_cfg.tuning.strategy.sigopt_api_token
 self.project_id = conf.usr_cfg.tuning.strategy.sigopt_project_id
@@ -107,7 +117,7 @@ def __init__(self, model, conf, q_dataloader, q_func=None,
 else:
 logger.info("Experiment name is {}.".format(self.experiment_name))
- self.conn = Connection(client_token)
+ self.conn = sigopt.Connection(client_token)
 self.experiment = None
 def params_to_tune_configs(self, params):
diff --git a/neural_compressor/contrib/strategy/tpe.py b/neural_compressor/contrib/strategy/tpe.py
index 9baf2911904..39362f1749b 100644
--- a/neural_compressor/contrib/strategy/tpe.py
+++ b/neural_compressor/contrib/strategy/tpe.py
@@ -20,14 +20,14 @@ from pathlib import Path
 from functools import partial
 import numpy as np
-import hyperopt as hpo
-from hyperopt import fmin, hp, STATUS_OK, Trials
 from neural_compressor.utils import logger
+from neural_compressor.utils.utility import LazyImport
 from neural_compressor.strategy.strategy import strategy_registry, TuneStrategy
 from collections import OrderedDict
 from neural_compressor.strategy.st_utils.tuning_sampler import OpWiseTuningSampler
 from neural_compressor.strategy.st_utils.tuning_structs import OpTuningConfig
+hyperopt = LazyImport('hyperopt')
 try:
 import pandas as pd
@@ -85,10 +85,19 @@ def __init__(self, model, conf, q_dataloader, q_func=None,
 eval_dataloader=None, eval_func=None, dicts=None, q_hooks=None):
 assert conf.usr_cfg.quantization.approach == 'post_training_static_quant', \
 "TPE strategy is only for post training static quantization!"
+ # Initialize the tpe tuning strategy if the user specified to use it.
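The sigopt and tpe strategies in the hunks above now resolve their optional dependencies through LazyImport, so the packages are only required when the corresponding strategy is actually selected. A minimal sketch of that deferred-import pattern; this is an illustration, not the actual neural_compressor.utils.utility implementation:

```python
import importlib


class LazyImport:
    """Defer importing a module until one of its attributes is first accessed."""

    def __init__(self, module_name):
        self.module_name = module_name
        self._module = None

    def __getattr__(self, name):
        # The real import happens here, so merely loading the strategy file
        # does not require sigopt/hyperopt to be installed.
        if self._module is None:
            self._module = importlib.import_module(self.module_name)
        return getattr(self._module, name)


hyperopt = LazyImport("hyperopt")   # nothing is imported yet
# trials = hyperopt.Trials()        # hyperopt is imported on first use
```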
+ strategy_name = conf.usr_cfg.tuning.strategy.name + if strategy_name.lower() == "tpe": + try: + import hyperopt + except ImportError: + raise ImportError(f"Please install hyperopt for using {strategy_name} strategy.") + else: + pass self.hpopt_search_space = None self.warm_start = False self.cfg_evaluated = False - self.hpopt_trials = Trials() + self.hpopt_trials = hyperopt.Trials() self.max_trials = conf.usr_cfg.tuning.exit_policy.get('max_trials', 200) self.loss_function_config = { 'acc_th': conf.usr_cfg.tuning.accuracy_criterion.relative if \ @@ -140,7 +149,7 @@ def __getstate__(self): def _configure_hpopt_search_space_and_params(self, search_space): self.hpopt_search_space = {} for param, configs in search_space.items(): - self.hpopt_search_space[(param)] = hp.choice((param[0]), configs) + self.hpopt_search_space[(param)] = hyperopt.hp.choice((param[0]), configs) # Find minimum number of choices for params with more than one choice multichoice_params = [len(configs) for param, configs in search_space.items() if len(configs) > 1] @@ -149,7 +158,7 @@ def _configure_hpopt_search_space_and_params(self, search_space): min_param_size = min(multichoice_params) if len(multichoice_params) > 0 else 1 self.tpe_params['n_EI_candidates'] = min_param_size self.tpe_params['prior_weight'] = 1 / min_param_size - self._algo = partial(hpo.tpe.suggest, + self._algo = partial(hyperopt.tpe.suggest, n_startup_jobs=self.tpe_params['n_initial_point'], gamma=self.tpe_params['gamma'], n_EI_candidates=self.tpe_params['n_EI_candidates'], @@ -225,12 +234,12 @@ def initial_op_quant_mode(items_lst, target_quant_mode, op_item_dtype_dict): self._configure_hpopt_search_space_and_params(first_run_cfg) # Run first iteration with best result from history trials_count = len(self.hpopt_trials.trials) + 1 - fmin(partial(self.object_evaluation, model=self.model), - space=self.hpopt_search_space, - algo=self._algo, - max_evals=trials_count, - trials=self.hpopt_trials, - show_progressbar=False) + hyperopt.fmin(partial(self.object_evaluation, model=self.model), + space=self.hpopt_search_space, + algo=self._algo, + max_evals=trials_count, + trials=self.hpopt_trials, + show_progressbar=False) if pd is not None: self._save_trials(trials_file) self._update_best_result(best_result_file) @@ -266,12 +275,12 @@ def initial_op_quant_mode(items_lst, target_quant_mode, op_item_dtype_dict): self.cfg_evaluated = False logger.debug("Trial iteration start: {} / {}.".format( trials_count, self.max_trials)) - fmin(partial(self.object_evaluation, model=self.model), - space=self.hpopt_search_space, - algo=self._algo, - max_evals=trials_count, - trials=self.hpopt_trials, - show_progressbar=False) + hyperopt.fmin(partial(self.object_evaluation, model=self.model), + space=self.hpopt_search_space, + algo=self._algo, + max_evals=trials_count, + trials=self.hpopt_trials, + show_progressbar=False) trials_count += 1 if pd is not None: self._save_trials(trials_file) @@ -349,7 +358,7 @@ def _compute_metrics(self, tune_cfg, acc, lat): 'acc_loss': acc_diff, 'lat_diff': lat_diff, 'quantization_ratio': quantization_ratio, - 'status': STATUS_OK} + 'status': hyperopt.STATUS_OK} def _calculate_acc_lat_diff(self, acc, lat): int8_acc = acc diff --git a/neural_compressor/experimental/benchmark.py b/neural_compressor/experimental/benchmark.py index 28c790ce7ca..00329dabd43 100644 --- a/neural_compressor/experimental/benchmark.py +++ b/neural_compressor/experimental/benchmark.py @@ -179,16 +179,10 @@ def __call__(self, mode='performance'): """ cfg = self.conf.usr_cfg 
assert cfg.evaluation is not None, 'benchmark evaluation filed should not be None...' - if self._b_func is None: - assert cfg.evaluation is not None, \ - 'You must pass b_func or benchmark evaluation filed should be set in config yaml file...' - # use first eval config in yaml if mode from __call__not same with yaml config - if not mode in cfg.evaluation: - mode = list(cfg.evaluation.keys())[0] assert sys.platform in ['linux', 'win32'], 'only support platform windows and linux...' set_all_env_var(deep_get(cfg, 'evaluation.{}.configs'.format(mode))) - # disable multi-instance for accuracy mode - if mode == "accuracy": + # disable multi-instance for accuracy mode or running bechmark on GPU device + if mode == "accuracy" or cfg.device == 'gpu': set_env_var('NC_ENV_CONF', True, overwrite_existing=True) logger.info("Start to run Benchmark.") @@ -344,7 +338,6 @@ def run_instance(self, mode): b_dataloader_cfg = deep_get(cfg, 'evaluation.{}.dataloader'.format(mode)) self._b_dataloader = create_dataloader(self.framework, b_dataloader_cfg) - is_measure = True if self._b_func is None: self._b_func = create_eval_func(self.framework, \ self._b_dataloader, \ @@ -354,14 +347,13 @@ def run_instance(self, mode): iteration=iteration) else: self._custom_b_func = True - is_measure = False objectives = [i.lower() for i in cfg.tuning.multi_objectives.objective] if \ deep_get(cfg, 'tuning.multi_objectives') else [cfg.tuning.objective] assert len(objectives) == 1, 'benchmark supports one objective at a time' self.objectives = MultiObjective(objectives, cfg.tuning.accuracy_criterion, - is_measure=is_measure) + is_measure=True) if self._custom_b_func: val = self.objectives.evaluate(self._b_func, self._model.model) @@ -370,7 +362,8 @@ def run_instance(self, mode): # measurer contain info not only performance(eg, memory, model_size) # also measurer have result list among steps acc, _ = val - warmup = 0 if deep_get(cfg, 'evaluation.{}.warmup'.format(mode)) is None \ + batch_size = self._b_dataloader.batch_size + warmup = 0 if deep_get(cfg, 'evaluation.{}.warmup'.format(mode)) is None \ else deep_get(cfg, 'evaluation.{}.warmup'.format(mode)) if len(self.objectives.objectives[0].result_list()) < warmup: @@ -380,20 +373,19 @@ def run_instance(self, mode): warmup = 0 result_list = self.objectives.objectives[0].result_list()[warmup:] + latency = np.array(result_list).mean() / batch_size + self._results[mode] = acc, batch_size, result_list logger.info("\n{} mode benchmark result:".format(mode)) for i, res in enumerate(result_list): logger.debug("Iteration {} result {}:".format(i, res)) if mode == 'accuracy': - self._results[mode] = acc, result_list + logger.info("Batch size = {}".format(batch_size)) if isinstance(acc, list): logger.info("Accuracy is" + "".join([" {:.4f}".format(i) for i in acc])) else: logger.info("Accuracy is {:.4f}".format(acc)) elif mode == 'performance': - batch_size = self._b_dataloader.batch_size - latency = np.array(result_list).mean() / batch_size - self._results[mode] = acc, batch_size, result_list logger.info("Batch size = {}".format(batch_size)) logger.info("Latency: {:.3f} ms".format(latency * 1000)) logger.info("Throughput: {:.3f} images/sec".format(1. / latency)) @@ -475,10 +467,9 @@ def model(self, user_model): auto inferenced, but sometimes auto inferenced inputs/outputs will not meet your requests, so it is better to set them manually in config yaml file. 
- Another corner case is slim model of tensorflow, - be careful of the name of model configured in yaml file, - make sure the name is in supported slim model list. - + Another corner case is the slim model of tensorflow, + be careful of the name of the model configured in the yaml file, + make sure the name is in the supported slim model list. """ if not isinstance(user_model, BaseModel): logger.warning("Force convert framework model to neural_compressor model.") @@ -525,7 +516,7 @@ def metric(self, user_metric): if deep_get(self.conf.usr_cfg, "evaluation.accuracy.metric"): logger.warning("Override the value of `metric` field defined in yaml file" \ " as user defines the value of `metric` attribute by code.") - + if isinstance(user_metric, NCMetric): metric_cfg = {user_metric.name : {**user_metric.kwargs}} deep_set(self.conf.usr_cfg, "evaluation.accuracy.metric", metric_cfg) diff --git a/neural_compressor/experimental/common/__init__.py b/neural_compressor/experimental/common/__init__.py index 6313abcf296..a5f07849745 100644 --- a/neural_compressor/experimental/common/__init__.py +++ b/neural_compressor/experimental/common/__init__.py @@ -1,3 +1,4 @@ +"""Intel® Neural Compressor: An open-source Python library supporting common model.""" #!/usr/bin/env python # -*- coding: utf-8 -*- # diff --git a/neural_compressor/experimental/common/criterion.py b/neural_compressor/experimental/common/criterion.py index 4382e827225..11308854d10 100644 --- a/neural_compressor/experimental/common/criterion.py +++ b/neural_compressor/experimental/common/criterion.py @@ -1252,14 +1252,17 @@ def __call__(self, **kwargs): class SelfKnowledgeDistillationLoss(KnowledgeDistillationFramework): """SelfKnowledge Distillation Loss.""" - def __init__(self, layer_mappings=[], loss_types=None, loss_weights=None, temperature=1.0,add_origin_loss=False, student_model=None, teacher_model=None): + def __init__(self, layer_mappings=[], loss_types=None, loss_weights=None, temperature=1.0,add_origin_loss=False, + student_model=None, teacher_model=None): """Initialize SelfKnowledge Distillation Loss class. Args: layer_mappings (list): layers of distillation.Format like - [[[student1_layer_name1, teacher_layer_name1],[student2_layer_name1, teacher_layer_name1]],[[student1_layer_name2, teacher_layer_name2],[student2_layer_name2, teacher_layer_name2]]] + [[[student1_layer_name1, teacher_layer_name1],[student2_layer_name1, teacher_layer_name1]], + [[student1_layer_name2, teacher_layer_name2],[student2_layer_name2, teacher_layer_name2]]] loss_types (list, optional): loss types. Defaults to ['CE'] * len(layer_mappings). - loss_weights (list, optional): loss weights. Defaults to [1.0 / len(layer_mappings)] * len(layer_mappings).temperature (float, optional): use to calculate the soft label CE. + loss_weights (list, optional): loss weights. Defaults to [1.0 / len(layer_mappings)] * + len(layer_mappings).temperature (float, optional): use to calculate the soft label CE. temperature (optional): temperature. Defaults to 1.0. add_origin_loss (bool, optional): whether to add origin loss for hard label loss. student_model (optional): student model. Defaults to None. 
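The layer_mappings format documented in the hunk above is easier to read with a concrete example. A small hedged sketch with hypothetical layer names; each outer entry is one distillation target, and each inner pair maps a student layer to the teacher layer it should mimic:

```python
# Hypothetical layer names; the nesting follows the docstring above.
layer_mappings = [
    [["student_block1.output", "teacher_block1.output"],
     ["student_block1.aux", "teacher_block1.output"]],
    [["student_block2.output", "teacher_block2.output"],
     ["student_block2.aux", "teacher_block2.output"]],
]
# Defaults described in the docstring:
loss_types = ["CE"] * len(layer_mappings)
loss_weights = [1.0 / len(layer_mappings)] * len(layer_mappings)
```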
@@ -1342,14 +1345,17 @@ class PyTorchSelfKnowledgeDistillationLoss( SelfKnowledgeDistillationLoss ): """PyTorch SelfKnowledge Distillation Loss.""" - def __init__(self, layer_mappings=[], loss_types=None, loss_weights=None, temperature=1.0,add_origin_loss=False, student_model=None, teacher_model=None): + def __init__(self, layer_mappings=[], loss_types=None, loss_weights=None, temperature=1.0,add_origin_loss=False, + student_model=None, teacher_model=None): """Initialize PyTorch SelfKnowledge Distillation Loss class. Args: layer_mappings (list): layers of distillation.Format like - [[[student1_layer_name1, teacher_layer_name1],[student2_layer_name1, teacher_layer_name1]],[[student1_layer_name2, teacher_layer_name2],[student2_layer_name2, teacher_layer_name2]]] + [[[student1_layer_name1, teacher_layer_name1],[student2_layer_name1, teacher_layer_name1]], + [[student1_layer_name2, teacher_layer_name2],[student2_layer_name2, teacher_layer_name2]]] loss_types (list, optional): loss types. Defaults to ['CE'] * len(layer_mappings). - loss_weights (list, optional): loss weights. Defaults to [1.0 / len(layer_mappings)] * len(layer_mappings).temperature (float, optional): use to calculate the soft label CE. + loss_weights (list, optional): loss weights. Defaults to [1.0 / len(layer_mappings)] * + len(layer_mappings).temperature (float, optional): use to calculate the soft label CE. temperature (optional): temperature. Defaults to 1.0. add_origin_loss (bool, optional): whether to add origin loss for hard label loss. student_model (optional): student model. Defaults to None. @@ -1512,4 +1518,4 @@ def __call__(self, **kwargs): class: PyTorchSelfKnowledgeDistillationLoss param dict (dict): param dict """ - return PyTorchSelfKnowledgeDistillationLoss, self._param_check() + return PyTorchSelfKnowledgeDistillationLoss, self._param_check() \ No newline at end of file diff --git a/neural_compressor/experimental/component.py b/neural_compressor/experimental/component.py index 7a3a225b54e..25ab4d4ba93 100644 --- a/neural_compressor/experimental/component.py +++ b/neural_compressor/experimental/component.py @@ -105,14 +105,6 @@ def _init_with_conf(self): logger.error("{}.".format(e)) raise RuntimeError("{} is not correctly installed. " \ "Please check your environment".format(lib)) - if self.framework == 'tensorflow' or self.framework == 'inteltensorflow': - try: - import tensorflow as tf - except Exception as e: - logger.error("{}.".format(e)) - raise RuntimeError( - "The TensorFlow framework is not correctly installed. Please check your environment" - ) def prepare(self): """Register Quantization Aware Training hooks.""" @@ -133,7 +125,6 @@ def prepare(self): self.register_hook('on_train_begin', self.adaptor._pre_hook_for_qat) self.register_hook('on_train_end', self.adaptor._post_hook_for_qat) - def prepare_qat(self): """Register Quantization Aware Training hooks.""" if self.adaptor is None: diff --git a/neural_compressor/experimental/data/datasets/bert_dataset.py b/neural_compressor/experimental/data/datasets/bert_dataset.py index 636b3bef28f..c22abaa996e 100644 --- a/neural_compressor/experimental/data/datasets/bert_dataset.py +++ b/neural_compressor/experimental/data/datasets/bert_dataset.py @@ -33,7 +33,7 @@ @dataset_registry(dataset_type="bert", framework="pytorch", dataset_format='') class PytorchBertDataset(Dataset): """PyTorch dataset used for model Bert. - + This Dataset is to construct from the Bert TensorDataset and not a full implementation from yaml config. 
The original repo link is: https://github.com/huggingface/transformers. When you want use this Dataset, you should add it before you initialize your DataLoader. diff --git a/neural_compressor/experimental/distillation.py b/neural_compressor/experimental/distillation.py index 7ad630506ee..c87ef341f22 100644 --- a/neural_compressor/experimental/distillation.py +++ b/neural_compressor/experimental/distillation.py @@ -92,6 +92,7 @@ def _on_train_begin(self, dataloader=None): self.best_model = copy.deepcopy(self._model) else: self.best_model = self._model + def _on_step_begin(self, batch_id): """Operations called on the beginning of batches.""" if self.criterion is not None and hasattr(self.criterion, 'clear_features'): @@ -144,7 +145,10 @@ def _on_epoch_end(self): if (isinstance(score, list) and all([s > b_s for s, b_s in zip(score, self.best_score)])) or score > self.best_score: self.best_score = score - self.best_model = copy.deepcopy(self._model._model) + if self.framework == "pytorch": + self.best_model = copy.deepcopy(self._model) + else: + self.best_model = self._model def init_train_cfg(self): """Initialize the training configuration.""" @@ -288,11 +292,7 @@ def execute(self): logger.info("Model distillation is done.") if self._eval_func is not None: logger.info("Start to evaluate the distilled model.") - if self.best_model: - if self.framework == "pytorch": - self._model._model = self.best_model - else: - self._model = self.best_model + self._model = self.best_model if self.best_model else self._model score = self._eval_func( self._model if getattr(self._eval_func, 'builtin', None) else self._model.model ) diff --git a/neural_compressor/experimental/model_conversion.py b/neural_compressor/experimental/model_conversion.py index f5d11f0f671..489128d93e3 100644 --- a/neural_compressor/experimental/model_conversion.py +++ b/neural_compressor/experimental/model_conversion.py @@ -157,7 +157,7 @@ def dataset(self, dataset_type, *args, **kwargs): """Return dataset. Args: - dataset_typ: dataset type + dataset_type: dataset type Returns: class: dataset class diff --git a/neural_compressor/experimental/pruning.py b/neural_compressor/experimental/pruning.py index f005e3ee5db..7c318e38bf9 100644 --- a/neural_compressor/experimental/pruning.py +++ b/neural_compressor/experimental/pruning.py @@ -114,6 +114,7 @@ def _on_after_optimizer_step(self): pruner.on_after_optimizer_step() def prepare(self): + """Functions prepare for generate_hooks, generate_pruners.""" self.generate_hooks() self.generate_pruners() diff --git a/neural_compressor/experimental/quantization.py b/neural_compressor/experimental/quantization.py index 5f0eda5ecf9..cab874bcca7 100644 --- a/neural_compressor/experimental/quantization.py +++ b/neural_compressor/experimental/quantization.py @@ -133,6 +133,9 @@ def pre_process(self): self._create_eval_dataloader(cfg) self._create_calib_dataloader(cfg) strategy = cfg.tuning.strategy.name.lower() + if cfg.quantization.optimization_level == 0: + strategy = "conservative" + logger.info(f"On the premise that the accuracy meets the conditions, improve the performance.") assert strategy in STRATEGIES, "Tuning strategy {} is NOT supported".format(strategy) _resume = None @@ -390,12 +393,11 @@ def q_func(self): return None @q_func.setter - @deprecated(version='2.0', reason="please use `train_func` instead") def q_func(self, user_q_func): - """Training function for Quantization-Aware Training. + """Calibrate quantization parameters for Post-training static quantization. 
It is optional and only takes effect when user choose - "quant_aware_training" approach in yaml. + "post_training_static_quant" approach in yaml. Args: user_q_func: This function takes "model" as input parameter diff --git a/neural_compressor/mix_precision.py b/neural_compressor/mix_precision.py index ab0a774aad3..f89686887b7 100644 --- a/neural_compressor/mix_precision.py +++ b/neural_compressor/mix_precision.py @@ -17,14 +17,14 @@ from .experimental.mixed_precision import MixedPrecision -from neural_compressor.conf.pythonic_config import Config, MixedPrecisionConfig, Options +from neural_compressor.conf.pythonic_config import Config +from neural_compressor.config import MixedPrecisionConfig def fit(model, config=None, eval_func=None, eval_dataloader=None, eval_metric=None, **kwargs): assert isinstance(config, MixedPrecisionConfig), "Please provide MixedPrecisionConfig!" - options = Options() if "options" not in kwargs else kwargs["options"] - conf = Config(quantization=config, options=options) + conf = Config(quantization=config) converter = MixedPrecision(conf) - converter.precisions = config.precisions + converter.precisions = config.extra_precisions converter.model = model if eval_func is not None: converter.eval_func = eval_func diff --git a/neural_compressor/model/base_model.py b/neural_compressor/model/base_model.py index 029723ad821..c42604f96fa 100644 --- a/neural_compressor/model/base_model.py +++ b/neural_compressor/model/base_model.py @@ -42,12 +42,7 @@ def save(self, root, *args, **kwargs): def export( self, save_path: str, - input, - target_model_type: str = 'ONNX', - quant_format: str = 'QDQ', - opset_version: int = 14, - *args, - **kwargs + conf, ): ''' abstract method of model convertion to ONNX''' raise NotImplementedError diff --git a/neural_compressor/model/model.py b/neural_compressor/model/model.py index 13629a19038..59a87d51a29 100644 --- a/neural_compressor/model/model.py +++ b/neural_compressor/model/model.py @@ -949,11 +949,6 @@ def save(self, root=None): f.write(self.graph_def.SerializeToString()) logger.info("Save quantized model to {}.".format(pb_file)) - @abstractmethod - def convert(self, src_type="QDQ", dst_type="TFDO", *args, **kwargs): - ''' abstract method of model saving, Tensorflow model only''' - raise NotImplementedError - class TensorflowSavedModelModel(TensorflowBaseModel): def get_all_weight_names(self): diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index fa09e64e45d..42b5cee2d29 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -20,11 +20,9 @@ import inspect import sys from collections import OrderedDict, UserDict -from abc import abstractmethod from ..adaptor.torch_utils.util import input2tuple from neural_compressor.utils.utility import LazyImport, compute_sparsity from neural_compressor.utils import logger -from neural_compressor.conf.dotdict import deep_get, deep_set from neural_compressor.conf import config as cfg from neural_compressor.model.base_model import BaseModel @@ -47,8 +45,41 @@ def __init__(self, model, **kwargs): self.q_config = None self._workspace_path = '' self.is_quantized = False + try: + self.fp32_model = copy.deepcopy(model) + except Exception as e: # pragma: no cover + logger.warning("Fail to deep copy the model due to {}, inplace is used now.".format( + repr(e))) + self.fp32_model = model self.kwargs = kwargs if kwargs else None + def __repr__(self): + # rewirte this func to avoid printing fp32_model + from 
torch.nn.modules.module import _addindent + # We treat the extra repr like the sub-module, one item per line + extra_lines = [] + extra_repr = self.extra_repr() + # empty string will be split into list [''] + if extra_repr: + extra_lines = extra_repr.split('\n') + child_lines = [] + for key, module in self._modules.items(): + if key == 'fp32_model': + continue + mod_str = repr(module) + mod_str = _addindent(mod_str, 2) + child_lines.append('(' + key + '): ' + mod_str) + lines = extra_lines + child_lines + main_str = self._get_name() + '(' + if lines: + # simple one-liner info, which most builtin Modules will use + if len(extra_lines) == 1 and not child_lines: + main_str += extra_lines[0] + else: + main_str += '\n ' + '\n '.join(lines) + '\n' + main_str += ')' + return main_str + def forward(self, *args, **kwargs): return self._model(*args, **kwargs) @@ -356,13 +387,18 @@ def export_to_fp32_onnx( opset_version=14, dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}}, + input_names=None, + output_names=None, do_constant_folding=True, verbose=True, fp32_model=None, ): - example_input_names = ['input'] - if isinstance(example_inputs, dict) or isinstance(example_inputs, UserDict): - example_input_names = list(example_inputs.keys()) + if input_names: + example_input_names = input_names + else: + example_input_names = ['input'] + if isinstance(example_inputs, dict) or isinstance(example_inputs, UserDict): + example_input_names = list(example_inputs.keys()) model = self.model if fp32_model: model = fp32_model @@ -372,6 +408,7 @@ def export_to_fp32_onnx( save_path, opset_version=opset_version, input_names=example_input_names, + output_names=output_names, dynamic_axes=dynamic_axes, do_constant_folding=do_constant_folding, ) @@ -387,6 +424,8 @@ def export_to_bf16_onnx(self, opset_version=14, dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}}, + input_names=None, + output_names=None, do_constant_folding=True, verbose=True, ): @@ -396,6 +435,8 @@ def export_to_bf16_onnx(self, example_inputs = example_inputs, opset_version=opset_version, dynamic_axes=dynamic_axes, + input_names=input_names, + output_names=output_names, do_constant_folding=do_constant_folding, verbose=False, ) @@ -438,6 +479,8 @@ def export_to_int8_onnx( opset_version=14, dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}}, + input_names=None, + output_names=None, do_constant_folding=True, quant_format='QDQ', dtype='S8S8', @@ -466,22 +509,13 @@ def export_to_int8_onnx( "No quantization configuration found, " + \ "please use the model generated by INC quantizer" if 'dynamic' in self.q_config['approach']: - op_types_to_quantize=['MatMul', 'Gather', "LSTM", 'Conv'] - pytorch_op_types_to_quantize=['Linear', 'Embedding', "LSTM", - 'Conv1d', 'Conv2d'] - addition_op_to_quantize = list(ortq.registry.IntegerOpsRegistry.keys()) + op_types_to_quantize=['MatMul', 'Gather', "LSTM"] + pytorch_op_types_to_quantize=['Linear', 'Embedding', "LSTM"] + addition_op_to_quantize = [] else: op_types_to_quantize=['MatMul', 'Gather', 'Conv'] pytorch_op_types_to_quantize=['Linear', 'Embedding', 'Conv1d', 'Conv2d'] - if quant_format == 'QDQ': - addition_op_to_quantize = list(ortq.registry.QDQRegistry.keys()) - addition_op_to_quantize.remove('Relu') # ValueError: x not in list - else: - addition_op_to_quantize = list(ortq.registry.QLinearOpsRegistry.keys()) - - if 'U8S8' in dtype: - op_types_to_quantize.remove('Gather') - pytorch_op_types_to_quantize.remove('Embedding') + addition_op_to_quantize = [] if 
quant_format == 'QDQ' and opset_version < 13: # pragma: no cover opset_version = 13 @@ -496,6 +530,8 @@ def export_to_int8_onnx( example_inputs = example_inputs, opset_version=opset_version, dynamic_axes=dynamic_axes, + input_names=input_names, + output_names=output_names, do_constant_folding=do_constant_folding, verbose=False, fp32_model=fp32_model @@ -623,17 +659,35 @@ def export_to_int8_onnx( def export( self, save_path: str, - input, - target_model_type: str = 'ONNX', - quant_mode: str = 'QDQ', - opset_version: int = 14, - *args, - **kwargs + conf, ): - if self.q_config is not None: - assert False, "Unsupport convertion from PyTorch to ONNX" - else: - self.export_to_fp32_onnx(save_path, input, opset_version=opset_version) + if conf.dtype == 'int8': + calib_dataloader = conf.kwargs.pop("calib_dataloader", None) + self.export_to_int8_onnx( + save_path=save_path, + example_inputs=conf.example_inputs, + opset_version=conf.opset_version, + dynamic_axes=conf.dynamic_axes, + input_names=conf.input_names, + output_names=conf.output_names, + quant_format=conf.quant_format, + dtype='U8S8', + fp32_model=self.fp32_model, + calib_dataloader=calib_dataloader, + ) + elif conf.dtype == 'fp32': + self.export_to_fp32_onnx( + save_path=save_path, + example_inputs=conf.example_inputs, + opset_version=conf.opset_version, + dynamic_axes=conf.dynamic_axes, + input_names=conf.input_names, + output_names=conf.output_names, + verbose=True, + fp32_model=self.fp32_model, + ) + else: # pragma: no cover + assert False, "Not allowed dtype: {}, pleas use 'fp32' or 'int8'.".format(conf.dtype) class PyTorchFXModel(PyTorchModel): diff --git a/neural_compressor/objective.py b/neural_compressor/objective.py index 81c96117ef9..f373db46c1b 100644 --- a/neural_compressor/objective.py +++ b/neural_compressor/objective.py @@ -18,6 +18,7 @@ from abc import abstractmethod import time import numpy as np +from copy import deepcopy import tracemalloc from .utils.utility import get_size @@ -178,7 +179,7 @@ def __init__(self, objectives, accuracy_criterion, metric_criterion=[True], \ self.objectives = [OBJECTIVES[i]() for i in objectives] self.representation = [str(i).capitalize() for i in self.objectives] - self.baseline = None + self._baseline = None self.val = None if obj_criterion: if len(self.objectives) != len(obj_criterion) and len(obj_criterion) == 1: @@ -192,7 +193,24 @@ def __init__(self, objectives, accuracy_criterion, metric_criterion=[True], \ self.metric_criterion = metric_criterion self.obj_weight = obj_weight self.is_measure = is_measure - + self._accuracy_target = None + + @property + def baseline(self): + return self._baseline + + @baseline.setter + def baseline(self, val): + self._baseline = val + + @property + def accuracy_target(self): + return self._accuracy_target + + @accuracy_target.setter + def accuracy_target(self, val): + self._accuracy_target = val + def compare(self, last, baseline): """The interface of comparing if metric reaches the goal with acceptable accuracy loss. 
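A hedged usage sketch of the export flow introduced above: a Torch2ONNXConfig is passed to PyTorchModel.export(), which dispatches on conf.dtype. The quantized q_model below is a placeholder assumed to come from an earlier INC quantization run:

```python
import torch
from neural_compressor.config import Torch2ONNXConfig

int8_onnx_conf = Torch2ONNXConfig(
    dtype="int8",
    opset_version=14,
    quant_format="QDQ",
    example_inputs=torch.randn(1, 3, 224, 224),
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
)
# For dtype="int8" the extra kwargs may carry a calib_dataloader, which
# export() pops before calling export_to_int8_onnx().
# q_model.export("int8-model.onnx", int8_onnx_conf)
```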
@@ -248,6 +266,49 @@ def compare(self, last, baseline): zip(acc, acc_target, self.metric_criterion)]) else: return False + + def _get_accuracy_target(self): + assert self._baseline is not None, "Baseline is None" + base_acc, _ = self._baseline + if not isinstance(base_acc, list): + base_acc = [base_acc] + if self.metric_weight is not None and len(base_acc) > 1: + base_acc = [np.mean(np.array(base_acc) * self.metric_weight)] + + if self.relative: + if len(base_acc) == 1: + acc_target = [base_acc[0] * (1 - float(self.acc_goal)) if self.higher_is_better \ + else base_acc[0] * (1 + float(self.acc_goal))] + else: + # use metric_criterion to replace acc_criterion + acc_target = [b_acc * (1 - float(self.acc_goal)) if higher_is_better \ + else b_acc * (1 + float(self.acc_goal)) \ + for b_acc, higher_is_better in zip(base_acc, self.metric_criterion)] + else: + if len(base_acc) == 1: + acc_target = [base_acc[0] - float(self.acc_goal) if self.higher_is_better \ + else base_acc[0] + float(self.acc_goal)] + else: + # use metric_criterion to replace acc_criterion + acc_target = [b_acc - float(self.acc_goal) if higher_is_better \ + else b_acc + float(self.acc_goal) \ + for b_acc, higher_is_better in zip(base_acc, self.metric_criterion)] + return acc_target + + def accuracy_meets(self): + last_acc, _ = deepcopy(self.val) + got_better_result = False + if not isinstance(last_acc, list): + last_acc = [last_acc] + + if self.metric_weight is not None and len(last_acc) > 1: + last_acc = [np.mean(np.array(last_acc) * self.metric_weight)] + if not self._accuracy_target: + self.accuracy_target = self._get_accuracy_target() + all_higher = all([_last > _target for _last, _target in zip(last_acc, self.accuracy_target) ]) + all_lower = all([_last < _target for _last, _target in zip(last_acc, self.accuracy_target) ]) + got_better_result = (all_higher and self.higher_is_better) or (all_lower and not self.higher_is_better) + return got_better_result def evaluate(self, eval_func, model): """The interface of calculating the objective. diff --git a/neural_compressor/quantization.py b/neural_compressor/quantization.py index 025e4c23fa5..272b86fdc0f 100644 --- a/neural_compressor/quantization.py +++ b/neural_compressor/quantization.py @@ -20,7 +20,8 @@ from .data import DATALOADERS, DATASETS from .experimental import Quantization as ExpQuantization from deprecated import deprecated -from neural_compressor.conf.pythonic_config import Config, PostTrainingConfig +from neural_compressor.conf.pythonic_config import Config +from neural_compressor.config import PostTrainingQuantConfig class Quantization(object): """Quantization class automatically searches for optimal quantization recipes for low @@ -155,7 +156,7 @@ def eval_func(model): self.exp_quantizer.q_func = q_func if eval_func is not None: - self.exp_quantizer.eval_func = eval_func + self.exp_quantizer.eval_func = eval_func elif eval_dataloader is not None: self.exp_quantizer.eval_dataloader = eval_dataloader @@ -197,10 +198,14 @@ def postprocess(self, name, postprocess_cls, **kwargs): self.exp_quantizer.postprocess = nc_postprocess -def fit( - model, conf, calib_dataloader=None, calib_func=None, eval_dataloader=None, - eval_func=None, eval_metric=None, options=None, **kwargs -): +def fit(model, + conf, + calib_dataloader=None, + calib_func=None, + eval_dataloader=None, + eval_func=None, + eval_metric=None, + **kwargs): """Quantize the model with a given configure. 
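A minimal sketch of the updated fit() call shape follows; the toy model is a placeholder, the dummy dataset/dataloader helpers are the same ones used by the unit tests removed below, and PostTrainingQuantConfig is assumed to work with its defaults.

# Illustrative only: exercise the new fit(model, conf, ...) signature with a toy model.
import torch
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig
from neural_compressor.data import DATASETS, DATALOADERS

toy_model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU(), torch.nn.Flatten())
dataset = DATASETS("pytorch")["dummy"]((16, 3, 224, 224))
calib_dataloader = DATALOADERS["pytorch"](dataset)

conf = PostTrainingQuantConfig()  # defaults; assumed to select post-training static quantization
# With neither eval_func nor eval_dataloader supplied, fit() now sets
# conf.performance_only = True and returns a model quantized with the default configuration.
q_model = quantization.fit(toy_model, conf, calib_dataloader=calib_dataloader)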
Args: @@ -256,22 +261,21 @@ def eval_func(model): output = model(input) accuracy = metric(output, label) return accuracy - options (Options, optional): The configure for random_seed, workspace, - resume path and tensorboard flag. """ - if isinstance(conf, PostTrainingConfig): - if options is None: - conf = Config(quantization=conf) - else: - conf = Config(quantization=conf, options=options) + if isinstance(conf, PostTrainingQuantConfig): + if eval_func is None and eval_dataloader is None: + conf.performance_only = True + conf = Config(quantization=conf) quantizer = ExpQuantization(conf) quantizer.model = model if eval_func is not None: quantizer.eval_func = eval_func if calib_dataloader is not None: quantizer.calib_dataloader = calib_dataloader + if calib_func is not None: + quantizer.calib_func = calib_func if eval_dataloader is not None: quantizer.eval_dataloader = eval_dataloader if eval_metric is not None: diff --git a/neural_compressor/strategy/conservative.py b/neural_compressor/strategy/conservative.py new file mode 100644 index 00000000000..d4806e59ad5 --- /dev/null +++ b/neural_compressor/strategy/conservative.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +import numpy as np + +from collections import deque +from collections import OrderedDict as COrderedDict +from copy import deepcopy +from typing import Dict, List, Tuple, OrderedDict + +from .strategy import strategy_registry, TuneStrategy +from .st_utils.tuning_space import TuningItem +from ..utils import logger +from ..utils.utility import Statistics + +@strategy_registry +class ConservativeTuneStrategy(TuneStrategy): + def __init__(self, model, conf, q_dataloader, q_func=None, + eval_dataloader=None, eval_func=None, dicts=None, q_hooks=None): + super( + ConservativeTuneStrategy, + self).__init__( + model, + conf, + q_dataloader, + q_func, + eval_dataloader, + eval_func, + dicts, + q_hooks) + self.acc_meet_flag = False + + def next_tune_cfg(self): + """ + Conservative tuning: accuracy first, performance second + + 1. Query all quantifiable ops and save as a list: quantifiable_ops = [(op_name, op_type), ...] + 2. Classify the op by its op type + 3. Add op to quant_queue according to the op type priority + 4. Go through the quant_queue and replace it with the fp32 config in tune_cfg if + accuracy meets the requirements else continue + + For bf16 and fp16, do the same thing as int8 + Note: + 1) other tunable items will using the first option as the default value. + + Yields: + tune_config (dict): It's a dict containing the tuning configuration to run. 
+ """ + + tuning_space = self.tuning_space + calib_sampling_size_lst = tuning_space.root_item.get_option_by_name('calib_sampling_size').options + calib_sampling_size = calib_sampling_size_lst[0] + tune_cfg = self._initialize_tune_cfg() + tune_cfg['calib_sampling_size'] = calib_sampling_size + op_type_priority = self._get_op_type_priority() + quant_items_pool = self._quant_items_pool(op_type_priority) + logger.info(f"*** Try to convert op into lower precision to improve performance.") + for dtype, op_items in quant_items_pool.items(): + logger.info(f"*** Start to convert op into {dtype}.") + for op_type, items_lst in op_items.items(): + logger.info(f"*** Try to convert all {op_type} ops into {dtype}.") + tmp_tune_cfg = deepcopy(tune_cfg) + for item, quant_mode in items_lst: + op_info = item.name + op_config = tuning_space.set_deafult_config(op_info, quant_mode) + tmp_tune_cfg[op_info] = op_config + yield tmp_tune_cfg + if self.acc_meet_flag: + logger.info(f"*** Convert all {op_type} ops to {dtype} and accuracy still meet the requirements") + tune_cfg = deepcopy(tmp_tune_cfg) + else: + tmp_tune_cfg = deepcopy(tune_cfg) + logger.info(f"*** Convert all {op_type} ops to {dtype} but accuracy not meet the requirements") + logger.info(f"*** Try to convert {op_type} op into {dtype} one by one.") + for item, quant_mode in items_lst: + op_info = item.name + op_config = tuning_space.set_deafult_config(op_info, quant_mode) + tmp_tune_cfg[op_info] = op_config + yield tmp_tune_cfg + if self.acc_meet_flag: + tune_cfg[op_info] = op_config + logger.info((f"*** Convert one {op_type} op({op_info}) " + f"into {dtype} and accuracy still meet the requirements")) + else: + tmp_tune_cfg[op_info] = tune_cfg[op_info] + logger.info(f"*** Skip convert {op_info}.") + logger.info(f"*** Ending tuning process due to no quantifiable op left.") + + def traverse(self): + if not (self.cfg.evaluation and self.cfg.evaluation.accuracy and \ + (self.cfg.evaluation.accuracy.metric or self.cfg.evaluation.accuracy.multi_metrics)) \ + and self.eval_func is None: + logger.info("Neither evaluation function nor metric is defined." 
\ + " Generate a quantized model with default quantization configuration.") + self.cfg.tuning.exit_policy.performance_only = True + logger.info("Force setting 'tuning.exit_policy.performance_only = True'.") + logger.info("Generate a fake evaluation function.") + self.eval_func = self._fake_eval_func + + # Get fp32 model baseline + if self.baseline is None: + logger.info("Get FP32 model baseline.") + self._fp32_model = self.model + self.baseline = self._evaluate(self.model) + self.objectives.baseline = self.baseline + # self.best_tune_result = self.baseline + # Initialize the best qmodel as fp32 model + # self.best_qmodel = self._fp32_model + # Record the FP32 baseline + self._add_tuning_history() + self.show_baseline_info() + + # Start tuning + trials_count = 0 + for op_tuning_cfg in self.next_tune_cfg(): + tune_cfg = self._tune_cfg_converter(op_tuning_cfg) + trials_count += 1 + tuning_history = self._find_tuning_history(tune_cfg) + if tuning_history and trials_count < self.cfg.tuning.exit_policy.max_trials: + self.last_tune_result = tuning_history['last_tune_result'] + self.best_tune_result = tuning_history['best_tune_result'] + logger.warn("Find evaluated tuning config, skip.") + continue + logger.debug("Dump current tuning configuration:") + logger.debug(tune_cfg) + self.tuning_times += 1 + self.q_model = self.adaptor.quantize( + copy.deepcopy(tune_cfg), self.model, self.calib_dataloader, self.q_func) + self.algo.calib_iter = tune_cfg['calib_iteration'] + self.algo.q_model = self.q_model + # TODO align the api to let strategy has access to pre_optimized model + assert self.adaptor.pre_optimized_model + self.algo.origin_model = self.adaptor.pre_optimized_model + if self.cfg.quantization.recipes.fast_bias_correction: + self.algo.algorithms[0].quantization_cfg = tune_cfg + self.last_qmodel = self.algo() + assert self.last_qmodel + self.last_tune_result = self._evaluate(self.last_qmodel) + self.acc_meet_flag = self.objectives.accuracy_meets() + if self.acc_meet_flag: + # For the first tuning + if not self.best_tune_result: + self.best_tune_result = self.last_tune_result + self.best_qmodel = self.last_qmodel + self.best_tune_result = self.last_tune_result + else: + # Update current tuning config and model with best performance + get_better_performance = self.compare_performace(self.last_tune_result, self.best_tune_result) + if get_better_performance: + logger.info(f"*** Update the model with better performance.") + self.best_qmodel = self.last_qmodel + self.best_tune_result = self.last_tune_result + else: + logger.info(f"*** The qmodel was not updated due to not achieving better performance.") + # Dump the current state to log + self.dump_tuning_state(trials_count, self.last_tune_result, self.best_tune_result, self.baseline) + # Judge stop or continue tuning + need_stop = self.stop(trials_count) + # Record the tuning history + saved_tune_cfg = copy.deepcopy(tune_cfg) + saved_last_tune_result = copy.deepcopy(self.last_tune_result) + self._add_tuning_history(saved_tune_cfg, + saved_last_tune_result, + q_config=self.q_model.q_config) + self.tune_result_record.append(copy.deepcopy(self.last_tune_result)) + self.tune_cfg = tune_cfg + self._dump_tuning_process_statistics() + if need_stop: + if self.cfg.tuning.diagnosis and self.cfg.tuning.diagnosis.diagnosis_after_tuning: + logger.debug(f'*** Start to do diagnosis (inspect tensor).') + self._diagnosis() + if self.use_multi_objective and len(self.tune_result_record) > 1 and \ + self.best_tune_result is not None: + best_trail, best_result = 
self.objectives.best_result(self.tune_result_record, + copy.deepcopy(self.baseline)) + if best_result != self.best_tune_result: + from neural_compressor.utils.utility import recover + self.best_qmodel = recover(self.model.model, + os.path.join(self.cfg.tuning.workspace.path, 'history.snapshot'), + best_trail) + self.best_tune_result = best_result + self._dump_tuning_process_statistics() + break + + def stop(self, trials_count): + need_stop = False + if trials_count >= self.cfg.tuning.exit_policy.max_trials: + need_stop = True + return need_stop + + def compare_performace(self, last_tune_result, best_tune_result): # pragma: no cover + _, last_perf = last_tune_result + _, best_perf = best_tune_result + return last_perf[0] < best_perf[0] + + def dump_tuning_state(self, trials_count, last_tune_result, best_tune_result, baseline): + if last_tune_result: + last_tune = last_tune_result[0] if \ + isinstance(last_tune_result[0], list) else [last_tune_result[0]] + for name, data in zip(self.metric_name, last_tune): + if len(self.tune_data[name]) == 1: + self.tune_data[name].append(data) + else: + self.tune_data[name][1] = data + + if self.metric_weight and len(last_tune) > 1: + weighted_acc = np.mean(np.array(last_tune) * self.metric_weight) + if len(self.tune_data['Weighted accuracy']) == 1: + self.tune_data['Weighted accuracy'].append(weighted_acc) + else: + self.tune_data['Weighted accuracy'][1] = weighted_acc + last_tune = [weighted_acc] + + last_tune_msg = '[Accuracy (int8|fp32):' + \ + ''.join([' {:.4f}|{:.4f}'.format(last, base) for last, base in \ + zip(last_tune, self.tune_data['baseline'])]) + \ + ''.join([', {} (int8|fp32): {:.4f}|{:.4f}'.format( \ + x, y, z) for x, y, z in zip( \ + self.objectives.representation, last_tune_result[1], baseline[1]) \ + if x != 'Accuracy']) + ']' + else: # pragma: no cover + last_tune_msg = 'n/a' + for name in self.tune_data.keys() - {'baseline'}: + if len(self.tune_data[name]) == 1: + self.tune_data[name].append('n/a') + else: + self.tune_data[name][1] = 'n/a' + + if best_tune_result: + best_tune = best_tune_result[0] if isinstance(best_tune_result[0], list) \ + else [best_tune_result[0]] + + for name, data in zip(self.metric_name, best_tune): + if len(self.tune_data[name]) == 2: + self.tune_data[name].append(data) + else: + self.tune_data[name][2] = data + + if self.metric_weight and len(best_tune) > 1: + weighted_acc = np.mean(np.array(best_tune) * self.metric_weight) + + if len(self.tune_data['Weighted accuracy']) == 2: + self.tune_data['Weighted accuracy'].append(weighted_acc) + else: # pragma: no cover + self.tune_data['Weighted accuracy'][2] = weighted_acc + + best_tune = [weighted_acc] + + best_tune_msg = '[Accuracy:' + ''.join([' {:.4f}'.format(best) \ + for best in best_tune]) + ''.join([', {}: {:.4f}'.format(x,y) \ + for x,y in zip(self.objectives.representation, \ + best_tune_result[1]) if x != 'Accuracy']) + ']' + + else: + best_tune_msg = 'n/a' + for name in self.tune_data.keys() - {'baseline'}: + if len(self.tune_data[name]) == 2: + self.tune_data[name].append('n/a') + else: + self.tune_data[name][2] = 'n/a' + + logger.info("Tune {} result is: {}, Best tune result is: {}".format(trials_count, + last_tune_msg, + best_tune_msg)) + output_data = [[info_type, + '{:.4f} '.format(self.tune_data[info_type][0]) if \ + not isinstance(self.tune_data[info_type][0], str) else self.tune_data[info_type][0], + '{:.4f} '.format(self.tune_data[info_type][1]) if \ + not isinstance(self.tune_data[info_type][1], str) else self.tune_data[info_type][1], + '{:.4f} 
'.format(self.tune_data[info_type][2]) if \ + not isinstance(self.tune_data[info_type][2], str) else self.tune_data[info_type][2]] \ + for info_type in self.tune_data.keys() if info_type != 'baseline'] + + output_data.extend([[obj, + '{:.4f} '.format(baseline[1][i]) if baseline else 'n/a', + '{:.4f} '.format(last_tune_result[1][i]) if last_tune_result else 'n/a', + '{:.4f} '.format(best_tune_result[1][i]) if best_tune_result else 'n/a'] \ + for i, obj in enumerate(self.objectives.representation)]) + + Statistics(output_data, + header='Tune Result Statistics', + field_names=['Info Type', 'Baseline', 'Tune {} result'.format(trials_count), \ + 'Best tune result']).print_stat() + + def _get_op_type_priority(self): + optypewise_cap = self.capability['optypewise'] + op_type_priority = list(optypewise_cap.keys()) + return op_type_priority + + def _sorted_item_by_op_type(self, + items_lst: List[Tuple[TuningItem, str]], + op_type_priority: List[str]) -> OrderedDict[str, List]: + """ Socring the tuning items according to its op type. + + Args: + items_lst: The tuning item list. # [(op_item, quant_mode), ... ] + op_type_priority: The op type list with the order. # [optype_1, optype_2] + + Returns: + The tuning items list that sorted according to its op type. + OrderDict: + # op_type: [(TuningItem, quant_mode), ...] + conv2d: [(TuningItem, static), (TuningItem, static)] + linear: [(TuningItem, static), (TuningItem, static)] + """ + op_type_lst_from_items_lst = list(set([item[0].name[1] for item in items_lst])) + # For items whose op type does not exist in the priority list, assign it with lowest priority. + sorted_op_type_lst = [op_type for op_type in op_type_priority if op_type in op_type_lst_from_items_lst] + sorted_op_type_lst += list(set(op_type_lst_from_items_lst) - set(op_type_priority)) + sorted_items = COrderedDict() + for op_type in sorted_op_type_lst: + sorted_items[op_type] = [] + for op_item, quant_mode in items_lst: + op_type = op_item.name[1] + sorted_items[op_type].append((op_item, quant_mode)) + return sorted_items + + def _initialize_tune_cfg(self): + """Initialize the tuning config with fp32 AMAP. + + Returns: + The intialized tuning config. + """ + tuning_space = self.tuning_space + quant_mode_wise_items = tuning_space.quant_mode_wise_items + # Initialize the tuning config + initial_tuning_cfg = {} + all_ops = set() + fp32_ops = [] + for quant_mode, items_lst in quant_mode_wise_items.items(): + items_name_lst = [item.name for item in items_lst] + all_ops = all_ops.union(set(items_name_lst)) + if quant_mode == "fp32": + fp32_ops += [item.name for item in items_lst] + non_fp32_ops_dtype = {} + fp32_ops_set = set(fp32_ops) + for quant_mode, items_lst in quant_mode_wise_items.items(): + items_name_set = set([item.name for item in items_lst]) + tmp_non_fp32_ops = items_name_set.difference(fp32_ops_set) + if tmp_non_fp32_ops: + for op_info in tmp_non_fp32_ops: + non_fp32_ops_dtype[op_info] = quant_mode + for op_info in fp32_ops: + initial_tuning_cfg[op_info] = tuning_space.set_deafult_config(op_info, "fp32") + for op_info, quant_mode in non_fp32_ops_dtype.items(): + initial_tuning_cfg[op_info] = tuning_space.set_deafult_config(op_info, quant_mode) + return initial_tuning_cfg + + def _quant_items_pool(self, op_type_priority: List[str]) -> OrderedDict[ + str, OrderedDict[str, List[Tuple[TuningItem, str]]]]: + """Create the op queue to be quantized. 
+ + -------------------------------------------------------------------------- + | Level 1 | bf16 | fp16 | static/dynamic | + | Level 2 | conv2d, linear, ...| conv2d, linear, ...| conv2d, linear, ...| + + Args: + op_type_priority: The optype list with priority. + + Returns: + The op item pool to convert into lower precision. + quant_items_pool(OrderDict): + bf16: + OrderDict: + conv2d: [(TuningItem, bf16), (TuningItem, bf16)] + linear: [(TuningItem, bf16), (TuningItem, bf16)] + int8: + OrderDict: + # (TuningItem, quant_mode) + conv2d: [(TuningItem, static), (TuningItem, static)] + linear: [(TuningItem, static), (TuningItem, static)] + """ + quant_mode_wise_items = self.tuning_space.quant_mode_wise_items + # Add all quantized pair into queue + quant_items_pool = COrderedDict() + # collect and sorted all ops that support bf16 and fp16 + for quant_mode in ['bf16', 'fp16']: + if quant_mode in quant_mode_wise_items: + op_item_pairs = [(op_item, quant_mode) for op_item in quant_mode_wise_items[quant_mode]] + op_item_pairs = self._sorted_item_by_op_type(op_item_pairs, op_type_priority) + quant_items_pool[quant_mode] = op_item_pairs + op_item_pairs = [] + quant_ops_name_set = set() + # collect and sorted all ops that support int8 + for quant_mode, items_lst in quant_mode_wise_items.items(): + if "static" in quant_mode or 'dynamic' in quant_mode: + _quant_mode = "static" if "static" in quant_mode else "dynamic" + op_item_pairs += [(item, _quant_mode) for item in items_lst if item.name not in quant_ops_name_set] + quant_ops_name_set = quant_ops_name_set.union([item.name for item in items_lst]) + op_item_pairs = self._sorted_item_by_op_type(op_item_pairs, op_type_priority) + quant_items_pool['int8'] = op_item_pairs + return quant_items_pool + + + + + + + + + + + + + + + + diff --git a/neural_compressor/strategy/st_utils/tuning_space.py b/neural_compressor/strategy/st_utils/tuning_space.py index b2cfddbdd38..0c4a71559ec 100644 --- a/neural_compressor/strategy/st_utils/tuning_space.py +++ b/neural_compressor/strategy/st_utils/tuning_space.py @@ -422,16 +422,15 @@ def set_deafult_config(self, op_name_type, quant_mode): # set the first option as the default if the not support the required quant mode quant_mode_item = op_item.options[0] for quant_item in op_item.options: - if quant_mode == quant_item.name or quant_mode in quant_item.name: + if quant_mode == quant_item.name or (isinstance(quant_mode, str) and quant_mode in quant_item.name): quant_mode_item = quant_item break # set the first option as the default for each tuning item config = {item.name: item.options[0] for item in quant_mode_item.options} op_tuning_config = OpTuningConfig(op_name_type[0], op_name_type[1], - quant_mode_item.name, + quant_mode, self, config) return op_tuning_config - diff --git a/neural_compressor/strategy/strategy.py b/neural_compressor/strategy/strategy.py index 63710b43264..7be1897a948 100644 --- a/neural_compressor/strategy/strategy.py +++ b/neural_compressor/strategy/strategy.py @@ -219,7 +219,8 @@ def traverse(self): if self.baseline is None: logger.info("Get FP32 model baseline.") self._fp32_model = self.model - self.baseline = self._evaluate(self.model) + self.baseline = self._evaluate(self.model) + self.objectives.baseline = self.baseline # record the FP32 baseline self._add_tuning_history() self.show_baseline_info() diff --git a/neural_compressor/training.py b/neural_compressor/training.py index 4cb93e39409..8f0dcecb57e 100644 --- a/neural_compressor/training.py +++ b/neural_compressor/training.py @@ -16,8 +16,8 @@ 
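The accuracy-first loop in ConservativeTuneStrategy.next_tune_cfg() above is easier to see stripped of the yield/flag plumbing; the toy sketch below captures the same fallback order with made-up op names and a stubbed accuracy check, and is not part of the patch itself.

# Toy illustration of the conservative ("accuracy first") fallback order used by
# next_tune_cfg(): try a whole op type at once, keep it if accuracy still meets the
# target, otherwise retry that op type one op at a time.
from copy import deepcopy

def conservative_search(ops_by_type, accuracy_ok):
    """ops_by_type maps op type -> op names; accuracy_ok(cfg) stands in for a real evaluation."""
    tune_cfg = {}  # everything not listed stays at fp32
    for op_type, ops in ops_by_type.items():
        trial = deepcopy(tune_cfg)
        trial.update({op: 'int8' for op in ops})  # first try converting the whole op type
        if accuracy_ok(trial):
            tune_cfg = trial
            continue
        for op in ops:  # otherwise fall back to converting ops one by one
            trial = deepcopy(tune_cfg)
            trial[op] = 'int8'
            if accuracy_ok(trial):
                tune_cfg = trial
    return tune_cfg

cfg = conservative_search(
    {'Conv2d': ['conv1', 'conv2'], 'Linear': ['fc1', 'fc2']},
    accuracy_ok=lambda cfg: cfg.get('fc2') != 'int8',  # pretend quantizing fc2 breaks accuracy
)
print(cfg)  # conv1, conv2 and fc1 end up int8; fc2 stays fp32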
# limitations under the License. import copy -from .conf.pythonic_config import Config, DistillationConfig, Options, \ - PruningConfig, QuantizationAwareTrainingConfig +from .conf.pythonic_config import Config +from .config import DistillationConfig, PruningConfig, QuantizationAwareTrainingConfig from .experimental.distillation import Distillation from .experimental.pruning import Pruning from .experimental.quantization import Quantization @@ -54,8 +54,7 @@ class CompressionManager: compression_manager.save("path_to_save") """ def __init__(self, component): - self.callbacks = \ - component.components[0] if isinstance(component, Scheduler) else component + self.callbacks = self.CallBacks(component) self.model = component.model try: # TODO: export to ONNX model need original fp32 model now, will remove it @@ -65,6 +64,46 @@ def __init__(self, component): logger.warning("Fail to deep copy the model due to {}.".format(repr(e))) self.fp32_model = None + class CallBacks: + def __init__(self, component): + self.callbacks = \ + component.components[0] if isinstance(component, Scheduler) else component + + def on_train_begin(self, dataloader=None): + """Called before the beginning of epochs.""" + self.callbacks.on_train_begin(dataloader) + + def on_train_end(self): + """Called after the end of epochs.""" + self.callbacks.on_train_end() + + def on_epoch_begin(self, epoch): + """Called at the beginning of each epoch.""" + self.callbacks.on_epoch_begin(epoch) + + def on_step_begin(self, batch_id): + """Called at the beginning of each batch.""" + self.callbacks.on_step_begin(batch_id) + + def on_after_compute_loss(self, input, student_output, student_loss, teacher_output=None): + """Called at the end of loss computation.""" + return self.callbacks.on_after_compute_loss( + input, student_output, student_loss, teacher_output=teacher_output + ) + + def on_before_optimizer_step(self): + """Called at the end of the backward pass.""" + self.callbacks.on_before_optimizer_step() + + + def on_step_end(self): + """Called at the end of each batch.""" + return self.callbacks.on_step_end() + + def on_epoch_end(self): + """Called at the end of each epoch.""" + return self.callbacks.on_epoch_end() + def save(self, root=None): """Save compressed model.
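With the hooks now grouped under CompressionManager.CallBacks, a training loop drives them through compression_manager.callbacks; a rough sketch follows, in which the toy model, data, and the bare QuantizationAwareTrainingConfig() defaults are placeholders rather than part of this patch.

# Minimal QAT-style training loop against the new CompressionManager.CallBacks wrapper.
import copy
import torch
from neural_compressor.config import QuantizationAwareTrainingConfig
from neural_compressor.training import prepare_compression

model = torch.nn.Sequential(torch.nn.Linear(8, 4), torch.nn.ReLU(), torch.nn.Linear(4, 2))
compression_manager = prepare_compression(copy.deepcopy(model), QuantizationAwareTrainingConfig())

compression_manager.callbacks.on_train_begin()
optimizer = torch.optim.SGD(compression_manager.model.parameters(), lr=1e-4)
for epoch in range(1):
    compression_manager.callbacks.on_epoch_begin(epoch)
    for step in range(2):
        compression_manager.callbacks.on_step_begin(step)
        output = compression_manager.model(torch.randn(4, 8))
        loss = output.mean()
        optimizer.zero_grad()
        loss.backward()
        # The distillation-oriented hooks (on_after_compute_loss,
        # on_before_optimizer_step) follow the same callbacks.* pattern.
        optimizer.step()
        compression_manager.callbacks.on_step_end()
    compression_manager.callbacks.on_epoch_end()
compression_manager.callbacks.on_train_end()
compression_manager.save("./saved_qat_model")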
@@ -101,7 +140,7 @@ def export( assert False, "Unsupport export for {} model".format(type(self.model)) -def prepare_compression(model: Callable, confs: Union[Callable, List], options=None, **kwargs): +def prepare_compression(model: Callable, confs: Union[Callable, List], **kwargs): """_summary_ Args: @@ -135,20 +174,18 @@ def prepare_compression(model: Callable, confs: Union[Callable, List], options=N compression_manager.on_train_end() """ - if options is None: - options = Options() if isinstance(confs, List): from .experimental.scheduler import Scheduler comps = [] for conf in confs: if isinstance(conf, QuantizationAwareTrainingConfig): - conf_ = Config(quantization=conf, options=options) + conf_ = Config(quantization=conf) com = Quantization(conf_) elif isinstance(conf, PruningConfig): - conf_ = Config(pruning=conf, options=options) + conf_ = Config(pruning=conf) com = Pruning(conf_) elif isinstance(conf, DistillationConfig): - conf_ = Config(distillation=conf, options=options) + conf_ = Config(distillation=conf) com = Distillation(conf_) assert conf.teacher_model is not None, \ "Please set teacher_model in DistillationConfig" @@ -165,13 +202,13 @@ def prepare_compression(model: Callable, confs: Union[Callable, List], options=N component = scheduler else: if isinstance(confs, QuantizationAwareTrainingConfig): - conf = Config(quantization=confs, options=options) + conf = Config(quantization=confs) component = Quantization(conf) elif type(confs) == PruningConfig: - conf = Config(pruning=confs, options=options) + conf = Config(pruning=confs) component = Pruning(conf) elif type(confs) == DistillationConfig: - conf = Config(distillation=confs, options=options) + conf = Config(distillation=confs) component = Distillation(conf) assert confs.teacher_model is not None, \ "Please set teacher_model in DistillationConfig" diff --git a/requirements.txt b/requirements.txt index 6da20f57fee..ebae42235a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -Cython numpy pandas pyyaml @@ -7,7 +6,6 @@ scikit-image matplotlib schema py-cpuinfo -hyperopt contextlib2 requests Flask @@ -20,7 +18,6 @@ Pillow pycocotools-windows; sys_platform != 'linux' pycocotools; sys_platform == 'linux' opencv-python -sigopt prettytable cryptography sqlalchemy==1.4.27 diff --git a/setup.py b/setup.py index 7dd91a69efb..9ae6370a040 100644 --- a/setup.py +++ b/setup.py @@ -36,11 +36,11 @@ # define install requirements install_requires_list = [ - 'numpy', 'pyyaml', 'scikit-learn', 'schema', 'py-cpuinfo', 'hyperopt', 'pandas', 'pycocotools', - 'opencv-python', 'requests', 'psutil', 'Pillow', 'sigopt', 'prettytable', 'cryptography', 'Cython', - 'deprecated'] + 'numpy', 'pyyaml', 'scikit-learn', 'schema', 'py-cpuinfo', 'pandas', 'pycocotools', + 'opencv-python', 'requests', 'psutil', 'Pillow', 'prettytable', 'deprecated'] ux_install_requires_list = [ - 'Flask-Cors', 'Flask-SocketIO', 'Flask', 'gevent-websocket', 'gevent','sqlalchemy==1.4.27', 'alembic==1.7.7'] + 'Flask-Cors', 'Flask-SocketIO', 'Flask', 'gevent-websocket', 'gevent','sqlalchemy==1.4.27', + 'alembic==1.7.7', 'cryptography'] # define scripts scripts_list = [] diff --git a/sphinx-requirements.txt b/sphinx-requirements.txt deleted file mode 100755 index 71cfc10b849..00000000000 --- a/sphinx-requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -sphinx -sphinx-rtd-theme -recommonmark -sphinx-markdown-tables -sphinx-md \ No newline at end of file diff --git a/test/adaptor/pytorch_adaptor/test_adaptor_pytorch.py 
b/test/adaptor/pytorch_adaptor/test_adaptor_pytorch.py deleted file mode 100644 index aeeafd0b660..00000000000 --- a/test/adaptor/pytorch_adaptor/test_adaptor_pytorch.py +++ /dev/null @@ -1,1406 +0,0 @@ -import copy -import neural_compressor.adaptor.pytorch as nc_torch -import numpy as np -import os -import shutil -import torch -import torch.nn as nn -import torch.nn.quantized as nnq -import unittest -import os -from neural_compressor import Options, PostTrainingConfig, QuantizationAwareTrainingConfig -from neural_compressor.conf.config import QuantConf -from neural_compressor.data import DATASETS, DATALOADERS -from neural_compressor.adaptor import FRAMEWORKS -from neural_compressor.model import MODELS -from neural_compressor.experimental import Quantization, common -from neural_compressor.experimental.data.datasets.dataset import DATASETS -from neural_compressor import quantization -from neural_compressor.training import prepare_compression -from neural_compressor.utils.pytorch import load -from neural_compressor.utils.utility import recover -from neural_compressor.utils.utility import LazyImport -from torch.quantization import QuantStub, DeQuantStub -from packaging.version import Version - - -# improve lazy import UT coverage -resnet18 = LazyImport("torchvision.models.resnet18") -q_resnet18 = LazyImport("torchvision.models.quantization.resnet18") - -PT_VERSION = nc_torch.get_torch_version().release -if PT_VERSION >= Version("1.8.0").release: - FX_MODE = True -else: - FX_MODE = False - - -fake_dyn_yaml = """ - model: - name: imagenet - framework: pytorch - - quantization: - approach: post_training_dynamic_quant - op_wise: { - "decoder": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - } - } - evaluation: - accuracy: - metric: - topk: 1 - performance: - warmup: 5 - iteration: 10 - - tuning: - accuracy_criterion: - relative: 0.01 - exit_policy: - timeout: 0 - random_seed: 9527 - workspace: - path: saved - """ - - -fake_ptq_yaml = """ - model: - name: imagenet - framework: pytorch - - quantization: - op_wise: { - - "layer1.0.conv1": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "layer1.0.conv2": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "layer2.0.conv1": { - "activation": {"dtype": ["uint8"], "algorithm": ["minmax"], "granularity": ["per_tensor"], "scheme":["sym"]}, - "weight": {"dtype": ["int8"], "algorithm": ["minmax"], "granularity": ["per_channel"], "scheme":["sym"]} - }, - "layer3.0.conv1": { - "activation": {"dtype": ["uint8"], "algorithm": ["kl"], "granularity": ["per_tensor"], "scheme":["sym"]}, - "weight": {"dtype": ["int8"], "algorithm": ["minmax"], "granularity": ["per_channel"], "scheme":["sym"]} - }, - "layer1.0.add_relu": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - } - evaluation: - accuracy: - metric: - topk: 1 - performance: - warmup: 1 - iteration: 10 - - tuning: - accuracy_criterion: - relative: 0.01 - exit_policy: - timeout: 0 - random_seed: 9527 - workspace: - path: saved - """ - -fake_auto_yaml = """ - model: - name: imagenet - framework: pytorch_fx - - quantization: - approach: post_training_auto_quant - evaluation: - accuracy: - metric: - topk: 1 - performance: - warmup: 1 - iteration: 10 - - tuning: - accuracy_criterion: - relative: 0.01 - exit_policy: - timeout: 1000 - max_trials: 3 - random_seed: 9527 - workspace: - path: saved - """ - - -fake_ptq_yaml_for_fx = """ - model: - name: imagenet - framework: pytorch_fx - - quantization: - 
approach: post_training_auto_quant - op_wise: { - "layer1.0.conv1": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "layer1.0.conv2": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "layer2.0.conv1": { - "activation": {"dtype": ["uint8"], "algorithm": ["minmax"], "granularity": ["per_tensor"], "scheme":["sym"]}, - "weight": {"dtype": ["int8"], "algorithm": ["minmax"], "granularity": ["per_channel"], "scheme":["sym"]} - }, - "layer3.0.conv1": { - "activation": {"dtype": ["uint8"], "algorithm": ["kl"], "granularity": ["per_tensor"], "scheme":["sym"]}, - "weight": {"dtype": ["int8"], "algorithm": ["minmax"], "granularity": ["per_channel"], "scheme":["sym"]} - }, - "layer1.0.add_relu": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "conv.module": { - "weight": {"dtype": ["fp32"]}, - "activation": {"dtype": ["fp32"]} - }, - "default_qconfig": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - } - } - evaluation: - accuracy: - metric: - topk: 1 - performance: - warmup: 5 - iteration: 10 - - tuning: - accuracy_criterion: - relative: 0.01 - exit_policy: - timeout: 0 - random_seed: 9527 - workspace: - path: saved - """ - - -fake_qat_yaml = """ - model: - name: imagenet - framework: pytorch - - quantization: - approach: quant_aware_training - train: - end_epoch: 1 - iteration: 1 - optimizer: - SGD: - learning_rate: 0.0001 - criterion: - CrossEntropyLoss: - reduction: mean - op_wise: { - "layer1.0.conv1": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "layer1.0.conv2": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "layer2.0.conv1": { - "activation": {"dtype": ["uint8"], "algorithm": ["minmax"], "granularity": ["per_tensor"], "scheme":["sym"]}, - "weight": {"dtype": ["int8"], "algorithm": ["minmax"], "granularity": ["per_channel"], "scheme":["sym"]} - }, - "layer3.0.conv1": { - "activation": {"dtype": ["uint8"], "algorithm": ["kl"], "granularity": ["per_tensor"], "scheme":["sym"]}, - "weight": {"dtype": ["int8"], "algorithm": ["minmax"], "granularity": ["per_channel"], "scheme":["sym"]} - }, - "layer1.0.add_relu": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - } - } - evaluation: - accuracy: - metric: - topk: 1 - - tuning: - accuracy_criterion: - relative: 0.01 - exit_policy: - timeout: 0 - random_seed: 9527 - workspace: - path: saved - """ - -dyn_op_name_list = {"decoder": {"activation": {"dtype": ["fp32"]}, "weight": {"dtype": ["fp32"]}}} - -ptq_op_name_list = { - "layer1.0.conv1": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, - "layer1.0.conv2": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, - "layer2.0.conv1": { - "activation": { - "dtype": ["uint8"], - "algorithm": ["minmax"], - "granularity": ["per_tensor"], - "scheme": ["sym"] - }, - "weight": { - "dtype": ["int8"], - "algorithm": ["minmax"], - "granularity": ["per_channel"], - "scheme": ["sym"] - } - }, - "layer3.0.conv1": { - "activation": { - "dtype": ["uint8"], - "algorithm": ["kl"], - "granularity": ["per_tensor"], - "scheme": ["sym"] - }, - "weight": { - "dtype": ["int8"], - "algorithm": ["minmax"], - "granularity": ["per_channel"], - "scheme": ["sym"] - } - }, - "layer1.0.add_relu": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, -} - -ptq_fx_op_name_list = { - "layer1.0.conv1": { - "activation": { - "dtype": ["fp32"] 
- }, - "weight": { - "dtype": ["fp32"] - } - }, - "layer1.0.conv2": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, - "layer2.0.conv1": { - "activation": { - "dtype": ["uint8"], - "algorithm": ["minmax"], - "granularity": ["per_tensor"], - "scheme": ["sym"] - }, - "weight": { - "dtype": ["int8"], - "algorithm": ["minmax"], - "granularity": ["per_channel"], - "scheme": ["sym"] - } - }, - "layer3.0.conv1": { - "activation": { - "dtype": ["uint8"], - "algorithm": ["kl"], - "granularity": ["per_tensor"], - "scheme": ["sym"] - }, - "weight": { - "dtype": ["int8"], - "algorithm": ["minmax"], - "granularity": ["per_channel"], - "scheme": ["sym"] - } - }, - "layer1.0.add_relu": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, - "conv.module": { - "weight": { - "dtype": ["fp32"] - }, - "activation": { - "dtype": ["fp32"] - } - }, - "default_qconfig": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - } -} - -qat_op_name_list = { - "layer1.0.conv1": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, - "layer1.0.conv2": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, - "layer2.0.conv1": { - "activation": { - "dtype": ["uint8"], - "algorithm": ["minmax"], - "granularity": ["per_tensor"], - "scheme": ["sym"] - }, - "weight": { - "dtype": ["int8"], - "algorithm": ["minmax"], - "granularity": ["per_channel"], - "scheme": ["sym"] - } - }, - "layer3.0.conv1": { - "activation": { - "dtype": ["uint8"], - "algorithm": ["kl"], - "granularity": ["per_tensor"], - "scheme": ["sym"] - }, - "weight": { - "dtype": ["int8"], - "algorithm": ["minmax"], - "granularity": ["per_channel"], - "scheme": ["sym"] - } - }, - "layer1.0.add_relu": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - } -} - - -def build_pytorch_yaml(): - with open("ptq_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_ptq_yaml) - - with open("dynamic_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_dyn_yaml) - - with open("qat_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_qat_yaml) - - with open("auto_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_auto_yaml) - -def build_pytorch_fx_yaml(): - if PT_VERSION >= Version("1.9.0").release: - fake_fx_ptq_yaml = fake_ptq_yaml_for_fx - else: - fake_fx_ptq_yaml = fake_ptq_yaml.replace("pytorch", "pytorch_fx") - with open("fx_ptq_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_fx_ptq_yaml) - - fake_fx_dyn_yaml = fake_dyn_yaml.replace("pytorch", "pytorch_fx") - with open("fx_dynamic_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_fx_dyn_yaml) - - fake_fx_qat_yaml = fake_qat_yaml.replace("pytorch", "pytorch_fx") - with open("fx_qat_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_fx_qat_yaml) - -def build_dump_tensors_yaml(): - fake_yaml = """ - model: - name: imagenet - framework: pytorch - - evaluation: - accuracy: - metric: - topk: 1 - - tuning: - accuracy_criterion: - relative: 0.01 - exit_policy: - timeout: 0 - random_seed: 9527 - workspace: - path: saved - tensorboard: true - """ - with open("dump_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_yaml) - - -class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.quant = QuantStub() - self.conv = nn.Conv2d(3, 1, 1) - self.linear = nn.Linear(224 * 224, 5) - self.dequant = DeQuantStub() - - def forward(self, x): - x = self.quant(x) - x = self.conv(x) - x = 
x.view(1, -1) - x = self.linear(x) - x = self.dequant(x) - return x - - -class FP32Model(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - times = x.size(1) - if times == 1: - return x + x - return x - - -class DynamicModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(1, 1, 1) - def forward(self, x): - if x is not None: - x = self.conv(x) - return x - - -class SubModel(torch.nn.Module): - def __init__(self, bypass=True): - super().__init__() - self.quant = QuantStub() - self.conv = nn.Conv2d(1, 1, 1) - self.conv1 = nn.Conv2d(1, 1, 1) - self.bn = nn.BatchNorm2d(1) - self.relu = nn.ReLU() - self.fp32 = FP32Model() - self.norm = nn.LayerNorm([1, 224, 224]) - self.dequant = DeQuantStub() - self.bypass = bypass - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.quant(x) - x = self.relu(x) - x = self.conv1(x) - x = self.dequant(x) - if not self.bypass: - x = self.fp32(x) - x = self.norm(x) - return x - - -class PartialQuantModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.quant = QuantStub() - self.conv = nn.Conv2d(3, 1, 1) - self.bn = nn.BatchNorm2d(1) - self.conv1 = nn.Conv2d(1, 1, 1) - self.bn1 = nn.BatchNorm2d(1) - self.conv2 = nn.Conv2d(1, 1, 1) - self.linear = nn.Linear(224 * 224, 1) - self.dequant = DeQuantStub() - self.sub = SubModel(bypass=False) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.conv1(x) - x = self.bn1(x) - x = self.sub(x) - x = self.quant(x) - x = self.conv2(x) - x = x.view(1, -1) - x = self.linear(x) - x = self.dequant(x) - return x - -class DynamicControlModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(3, 1, 1) - self.bn = nn.BatchNorm2d(1) - self.linear = nn.Linear(224 * 224, 1) - self.sub = SubModel() - self.fp32 = FP32Model() - self.dyn = DynamicModel() - - def forward(self, x): - x = self.conv(x) - x = self.dyn(x) - x = self.bn(x) - x = self.sub(x) - x = self.fp32(x) - x = x.view(1, -1) - x = self.linear(x) - return x - - -class LSTMModel(nn.Module): - """Container module with an encoder, a recurrent module, and a decoder.""" - - def __init__(self, ntoken=10, ninp=512, nhid=256, nlayers=5, dropout=0.5): - super(LSTMModel, self).__init__() - self.drop = nn.Dropout(dropout) - self.encoder = nn.Embedding(ntoken, ninp) - self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) - self.decoder = nn.Linear(nhid, ntoken) - self.init_weights() - self.nhid = nhid - self.nlayers = nlayers - - def init_weights(self): - initrange = 0.1 - self.encoder.weight.data.uniform_(-initrange, initrange) - self.decoder.bias.data.zero_() - self.decoder.weight.data.uniform_(-initrange, initrange) - - def forward(self, input): - input = torch.ones((3, 10), dtype=torch.int32) - h0 = torch.randn(2, 10, 256) - c0 = torch.randn(2, 10, 256) - hidden = (h0, c0) - emb = self.encoder(input) - output, hidden = self.rnn(emb, hidden) - output = self.drop(output) - decoded = self.decoder(output) - return decoded, hidden - - -def eval_func(model): - # switch to evaluate mode - model.eval() - with torch.no_grad(): - input = torch.randn(1, 3, 224, 224) - # compute output - output = model(input) - return 0.0 - - -def train_func(compression_manager, model, dataloader=None): - compression_manager.callbacks.on_train_begin(dataloader=dataloader) - optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) - # switch to evaluate mode - model.train() - input = torch.randn(1, 3, 224, 224) - # compute output - output = 
model(input) - loss = output[0].mean() if isinstance(output, tuple) else output.mean() - optimizer.zero_grad() - loss.backward() - optimizer.step() - compression_manager.callbacks.on_train_end() - return model - - -def q_func(model): - optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) - # switch to evaluate mode - model.train() - input = torch.randn(1, 3, 224, 224) - # compute output - output = model(input) - loss = output.mean() - optimizer.zero_grad() - loss.backward() - optimizer.step() - return model - - -class TestPytorchAdaptor(unittest.TestCase): - # some UT would be affected when IPEX installed. - try: - import intel_extension_for_pytorch as ipex - IPEX = True - except: - IPEX = False - framework_specific_info = {"device": "cpu", - "approach": "post_training_static_quant", - "random_seed": 1234, - "q_dataloader": None, - "workspace_path": "./"} - framework = "pytorch" - adaptor = FRAMEWORKS[framework](framework_specific_info) - model = q_resnet18() - nc_model = MODELS["pytorch"](model) - - @classmethod - def setUpClass(self): - build_pytorch_yaml() - build_dump_tensors_yaml() - - @classmethod - def tearDownClass(self): - os.remove("ptq_yaml.yaml") - os.remove("dynamic_yaml.yaml") - os.remove("qat_yaml.yaml") - os.remove("dump_yaml.yaml") - os.remove("auto_yaml.yaml") - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_get_all_weight_name(self): - assert len(list(self.nc_model.get_all_weight_names())) == 62 - - def test_get_weight(self): - for name, param in self.model.named_parameters(): - if name == "layer4.1.conv2.weight": - param.data.fill_(0.0) - if name == "fc.bias": - param.data.fill_(0.1) - assert int(torch.sum(self.nc_model.get_weight("layer4.1.conv2.weight"))) == 0 - assert torch.allclose( - torch.sum( - self.nc_model.get_weight("fc.bias")), - torch.tensor(100.)) - - def test_get_input(self): - model = MODELS["pytorch"](q_resnet18()) - model.model.eval().fuse_model() - model.register_forward_pre_hook() - rand_input = torch.rand(100, 3, 224, 224).float() - model.model(rand_input) - assert torch.equal(model.get_inputs("x"), rand_input) - model.remove_hooks() - - def test_update_weights(self): - self.nc_model.update_weights("fc.bias", torch.zeros([1000])) - assert int(torch.sum(self.nc_model.get_weight("fc.bias"))) == 0 - - def test_get_gradient(self): - with self.assertRaises(AssertionError): - self.nc_model.get_gradient("fc.bias") - - for name, tensor in self.nc_model._model.named_parameters(): - if name == "fc.bias": - tensor.grad = torch.zeros_like(tensor) - break - assert torch.equal(torch.Tensor(self.nc_model.get_gradient("fc.bias")), torch.zeros_like(tensor)) - - rand_input = torch.rand(100, 3, 224, 224).float() - rand_input.grad = torch.ones_like(rand_input) - assert torch.equal(torch.Tensor(self.nc_model.get_gradient(rand_input)), - torch.ones_like(rand_input)) - - def test_report_sparsity(self): - df, total_sparsity = self.nc_model.report_sparsity() - self.assertTrue(total_sparsity > 0) - self.assertTrue(len(df) == 22) - - def test_quantization_saved(self): - for fake_yaml in ["dynamic_yaml.yaml", "qat_yaml.yaml", "ptq_yaml.yaml"]: - model = M() - quantizer = Quantization(fake_yaml) - quantizer.conf.usr_cfg.tuning.exit_policy["performance_only"] = True - dataset = quantizer.dataset("dummy", (100, 3, 224, 224), label=True) - quantizer.model = model - quantizer.calib_dataloader = common.DataLoader(dataset) - quantizer.eval_dataloader = common.DataLoader(dataset) - q_model = quantizer.fit() - 
eval_func(q_model) - q_model.save("./saved") - # Load configure and weights by neural_compressor.utils - saved_model = load("./saved", model) - eval_func(saved_model) - # recover int8 model from history - history_file = "./saved/history.snapshot" - model_recover = recover(model, history_file, 0) - eval_func(model_recover) - self.assertEqual(type(saved_model.conv), \ - type(model_recover.conv)) - shutil.rmtree("./saved", ignore_errors=True) - from neural_compressor.experimental import Benchmark - evaluator = Benchmark("ptq_yaml.yaml") - # Load configure and weights by neural_compressor.model - evaluator.model = model - evaluator.b_dataloader = common.DataLoader(dataset) - evaluator.fit("accuracy") - - for fake_yaml in ["qat_yaml.yaml", "ptq_yaml.yaml"]: - model = copy.deepcopy(self.model) - if fake_yaml == "ptq_yaml.yaml": - model.eval().fuse_model() - conf = QuantConf(fake_yaml) - quantizer = Quantization(conf) - dataset = quantizer.dataset("dummy", (100, 3, 224, 224)) - quantizer.model = model - if fake_yaml == "qat_yaml.yaml": - quantizer.q_func = q_func - else: - quantizer.calib_dataloader = common.DataLoader(dataset) - quantizer.eval_func = eval_func - q_model = quantizer.fit() - q_model.save("./saved") - # Load configure and weights by neural_compressor.utils - saved_model = load("./saved", model) - eval_func(saved_model) - shutil.rmtree("./saved", ignore_errors=True) - - def test_quantization_new_saved(self): - for fake_yaml in ["dynamic_yaml.yaml", "qat_yaml.yaml", "ptq_yaml.yaml"]: - model = M() - quantizer = Quantization(fake_yaml) - quantizer.conf.usr_cfg.tuning.exit_policy["performance_only"] = True - dataset = quantizer.dataset("dummy", (100, 3, 224, 224), label=True) - quantizer.model = model - quantizer.calib_dataloader = common.DataLoader(dataset) - quantizer.eval_dataloader = common.DataLoader(dataset) - q_model = quantizer.fit() - eval_func(q_model) - torch.save(q_model.quantized_state_dict(), "./saved/model.pt") - # Load configure and weights by neural_compressor.utils - from neural_compressor.experimental.common import Model - common_model = Model(model) - common_model.load_quantized_state_dict(torch.load("./saved/model.pt")) - eval_func(common_model) - self.assertEqual(type(q_model._model.linear), \ - type(common_model._model.linear)) - shutil.rmtree("./saved", ignore_errors=True) - - def test_quantization_new_API(self): - for fake_yaml in ["dynamic", "qat", "static"]: - model = M() - if fake_yaml == "qat": - quant_conf = QuantizationAwareTrainingConfig(op_name_list=qat_op_name_list) - compression_manager = prepare_compression(copy.deepcopy(model), quant_conf) - q_model = train_func(compression_manager, compression_manager.model) - else: - dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) - dataloader = DATALOADERS["pytorch"](dataset) - if fake_yaml == "dynamic": - quant_conf = PostTrainingConfig(approach="post_training_dynamic_quant", - op_name_list=dyn_op_name_list, - performance_only=True) - elif fake_yaml == "static": - quant_conf = PostTrainingConfig(approach="post_training_static_quant", - op_name_list=ptq_op_name_list, - performance_only=True) - q_model = quantization.fit( - model, - quant_conf, - calib_dataloader=dataloader if fake_yaml == "static" else None, - eval_func=eval_func) - q_model.save("./saved") - # Load configure and weights by neural_compressor.utils - saved_model = load("./saved", model) - shutil.rmtree("./saved", ignore_errors=True) - - @unittest.skipIf(IPEX, "this function is affected by IPEX, Fixing now.") - def 
test_non_quant_module(self): - for fake_yaml in ["qat_yaml.yaml", "ptq_yaml.yaml"]: - model = PartialQuantModel() - conf = QuantConf(fake_yaml) - quantizer = Quantization(conf) - dataset = quantizer.dataset("dummy", (1, 3, 224, 224)) - non_quant_dict = {"non_quant_module_name": ["conv", "conv1", "sub.conv"], \ - "non_quant_module_class": ["BatchNorm2d", "FP32Model"]} - quantizer.model = common.Model(model, **non_quant_dict) - if fake_yaml == "qat_yaml.yaml": - quantizer.q_func = q_func - else: - quantizer.calib_func = eval_func - quantizer.eval_func = eval_func - q_model = quantizer.fit() - q_model.save("./saved") - saved_model = load("./saved", model, **non_quant_dict) - eval_func(saved_model) - shutil.rmtree("./saved", ignore_errors=True) - - def test_auto_quant(self): - def eval_func(model): - return 1 - - model_origin = LSTMModel( - ntoken = 10, - ninp = 512, - nhid = 256, - nlayers = 2, - ) - # run fx_quant in neural_compressor and save the quantized GraphModule - quant_conf = PostTrainingConfig(approach="post_training_auto_quant") - dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) - dataloader = common.DataLoader(dataset) - model = common.Model(model_origin) - q_model = quantization.fit(model, - quant_conf, - calib_dataloader=dataloader, - eval_func=eval_func) - self.assertNotEqual(q_model, None) - - def test_workspace_path(self): - model = M() - quant_conf = PostTrainingConfig(approach="post_training_static_quant", - op_name_list=ptq_op_name_list, - performance_only=True) - dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) - dataloader = common.DataLoader(dataset) - q_model = quantization.fit(model, - quant_conf, - calib_dataloader=dataloader, - eval_func=eval_func) - eval_func(q_model) - os.makedirs("./saved", exist_ok=True) - torch.save(q_model.quantized_state_dict(), "./saved/best_model.pt") - # Load configure and weights by workspace_path - from neural_compressor.experimental.common import Model - common_model = Model(model) - common_model.workspace_path = "./saved" - eval_func(common_model) - self.assertEqual(type(q_model._model.linear), - type(common_model._model.linear)) - shutil.rmtree("./saved", ignore_errors=True) - - def test_get_graph_info(self): - from neural_compressor.model.torch_model import PyTorchModel - model = PyTorchModel(self.model) - op_map = model.graph_info - self.assertTrue(op_map["conv1"] == "Conv2d") - - def test_tensorboard(self): - model = copy.deepcopy(self.nc_model) - model.model.eval().fuse_model() - quant_conf = PostTrainingConfig(approach="post_training_static_quant", - backend="pytorch", - performance_only=True) - options = Options(tensorboard=True) - dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) - dataloader = common.DataLoader(dataset) - quantization.fit( - model.model, quant_conf, calib_dataloader=dataloader, - eval_func=eval_func, options=options - ) - self.assertTrue(True if os.path.exists("runs/eval/baseline_acc0.0") else False) - quantization.fit(model.model, - quant_conf, - calib_dataloader=dataloader, - eval_dataloader=dataloader, - eval_func=None) - self.assertTrue(True if os.path.exists("runs/eval/baseline_acc0.0") else False) - - def test_tensor_dump_and_set(self): - model = copy.deepcopy(self.nc_model) - model.model.eval().fuse_model() - quantizer = Quantization("ptq_yaml.yaml") - dataset = quantizer.dataset("dummy", (100, 3, 224, 224), label=True) - dataloader = common.DataLoader(dataset) - dataloader = common._generate_common_dataloader(dataloader, "pytorch") - quantizer.eval_dataloader = dataloader - 
quantizer.calib_dataloader = dataloader - quantizer.model = model.model - q_model = quantizer.fit() - quantizer.strategy.adaptor.inspect_tensor( - model, dataloader, op_list=["conv1.0", "layer1.0.conv1.0"], - iteration_list=[1, 2], inspect_type="all", save_to_disk=True) - load_array = lambda *a, **k: np.load(*a, allow_pickle=True, **k) - a = load_array("saved/dump_tensor/activation_iter1.npz") - w = load_array("saved/dump_tensor/weight.npz") - if PT_VERSION >= Version("1.8.0").release: - self.assertTrue(w["conv1.0"].item()["conv1.0.weight"].shape[0] == - a["conv1.0"].item()["conv1.0.output0"].shape[1]) - else: - self.assertTrue(w["conv1.0"].item()["conv1.0.weight"].shape[0] == - a["conv1.0"].item()["conv1.1.output0"].shape[1]) - data = np.random.random(w["conv1.0"].item()["conv1.0.weight"].shape).astype(np.float32) - quantizer.strategy.adaptor.set_tensor(q_model, {"conv1.0.weight": data}) - changed_tensor = q_model.get_weight("conv1.weight") - scales = changed_tensor.q_per_channel_scales() - changed_tensor_fp32 = torch.dequantize(changed_tensor) - self.assertTrue(np.allclose(data, changed_tensor_fp32.numpy(), atol=2 / np.min(scales.numpy()))) - quantizer.strategy.adaptor.inspect_tensor( - q_model, dataloader, op_list=["conv1.0", "layer1.0.conv1.0"], - iteration_list=[1, 2], inspect_type="all", save_to_disk=False) - - def test_get_graph_info(self): - from neural_compressor.adaptor.pytorch import get_ops_recursively - model = copy.deepcopy(self.model) - op_map = {} - get_ops_recursively(model, "", op_map) - self.assertTrue(op_map["conv1"] == "Conv2d") - - def test_forward_wrapper(self): - vision_model = resnet18() - class dummymodel(torch.nn.Module): - def __init__(self, model): - super(dummymodel, self).__init__() - self._model = model - def forward(self,input=None): - return self._model(input) - - data = [[{"input": torch.rand(3,224,224)}, torch.ones(1,1)], ] - # dataloader.batch_size=100 - dataloader = common.DataLoader(data, batch_size=1) - quant_conf = QuantConf("dynamic_yaml.yaml") - model = dummymodel(vision_model) - q_model = quantization.fit(model, - quant_conf, - calib_dataloader=dataloader, - eval_func=eval_func) - - def test_floatfunctions_fallback(self): - class ModelWithFunctionals(torch.nn.Module): - def __init__(self): - super(ModelWithFunctionals, self).__init__() - self.mycat = nnq.FloatFunctional() - self.myadd = nnq.FloatFunctional() - self.myadd_relu = nnq.FloatFunctional() - # Tracing doesnt work yet for c10 ops with scalar inputs - # https://github.com/pytorch/pytorch/issues/27097 - self.my_scalar_add = nnq.FloatFunctional() - self.mymul = nnq.FloatFunctional() - self.my_scalar_mul = nnq.FloatFunctional() - self.quant = QuantStub() - self.dequant = DeQuantStub() - - def forward(self, x): - x = self.quant(x) - y = self.mycat.cat([x, x, x]) - z = self.myadd.add(y, y) - w = self.myadd_relu.add_relu(z, z) - # Tracing doesnt work yet for c10 ops with scalar inputs - # https://github.com/pytorch/pytorch/issues/27097 - w = self.my_scalar_add.add_scalar(w, -0.5) - w = self.mymul.mul(w, w) - w = self.my_scalar_mul.mul_scalar(w, 0.5) - w = self.dequant(w) - return w - - model = ModelWithFunctionals() - model = MODELS["pytorch"](model) - x = torch.rand(10, 1, dtype=torch.float) - y = model.model(x) - fallback_ops = [] - q_capability = self.adaptor.query_fw_capability(model) - for k, v in q_capability["opwise"].items(): - if k[0] != "quant" and k[0] != "dequant": - fallback_ops.append(k[0]) - model.model.qconfig = torch.quantization.default_qconfig - model.model.quant.qconfig = 
torch.quantization.default_qconfig - if PT_VERSION >= Version("1.8.0").release: - model.model.dequant.qconfig = torch.quantization.default_qconfig - nc_torch._fallback_quantizable_ops_recursively( - model.model, "", fallback_ops, op_qcfgs={}) - torch.quantization.add_observer_(model.model) - model.model(x) - torch.quantization.convert(model.model, self.adaptor.q_mapping, inplace=True) - qy = model.model(x) - tol = {"atol": 1e-01, "rtol": 1e-03} - self.assertTrue(np.allclose(y, qy, **tol)) - - -@unittest.skipIf(not FX_MODE, "Unsupport Fx Mode with PyTorch Version Below 1.8") -class TestPytorchFXAdaptor(unittest.TestCase): - framework_specific_info = {"device": "cpu", - "approach": "post_training_static_quant", - "random_seed": 1234, - "q_dataloader": None, - "workspace_path": "./"} - framework = "pytorch_fx" - adaptor = FRAMEWORKS[framework](framework_specific_info) - @classmethod - def setUpClass(self): - build_pytorch_fx_yaml() - - @classmethod - def tearDownClass(self): - os.remove("fx_ptq_yaml.yaml") - os.remove("fx_dynamic_yaml.yaml") - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_fx_quant(self): - for fake_yaml in ["qat", "static"]: - model_origin = resnet18() - model = common.Model(model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - } - ) - dataset = DATASETS("pytorch")["dummy"]((10, 3, 224, 224), label=True) - dataloader = DATALOADERS["pytorch"](dataset) - if fake_yaml == "qat": - conf = QuantizationAwareTrainingConfig( - op_name_list=qat_op_name_list, backend="pytorch_fx" - ) - compression_manager = prepare_compression(copy.deepcopy(model), conf) - q_model = train_func(compression_manager, compression_manager.model, dataloader) - else: - conf = PostTrainingConfig( - op_name_list=ptq_fx_op_name_list, backend="pytorch_fx", performance_only=True - ) - options = Options(workspace="./saved") - q_model = quantization.fit(model, - conf, - calib_dataloader=dataloader, - eval_func=eval_func, - calib_func=eval_func, - options=options) - q_model.save("./saved") - # Load configure and weights with neural_compressor.utils - model_fx = load("./saved", model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []}, \ - "dataloader": torch.utils.data.DataLoader(dataset) - }) - self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) - - if fake_yaml != "qat": - # recover int8 model with only tune_cfg - history_file = "./saved/history.snapshot" - model_fx_recover = recover(model_origin, history_file, 0, - **{"prepare_custom_config_dict": - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": - {"preserved_attributes": []}, - "dataloader": dataloader - }) - self.assertEqual(model_fx.code, model_fx_recover.code) - shutil.rmtree("./saved", ignore_errors=True) - for fake_yaml in ["fx_qat_yaml.yaml", "fx_ptq_yaml.yaml"]: - model_origin = M() - # run fx_quant in neural_compressor and save the quantized GraphModule - dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224), label=True) - dataloader = DATALOADERS["pytorch"](dataset) - model = common.Model(model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - }) - if fake_yaml == "fx_qat_yaml.yaml": - conf = QuantizationAwareTrainingConfig( - 
op_name_list=qat_op_name_list, backend="pytorch_fx" - ) - compression_manager = prepare_compression(copy.deepcopy(model), conf) - q_model = train_func(compression_manager, compression_manager.model, dataloader) - compression_manager.save("./saved") - else: - conf = PostTrainingConfig( - op_name_list=ptq_fx_op_name_list, backend="pytorch_fx", performance_only=True - ) - q_model = quantization.fit(model, - conf, - calib_dataloader=dataloader, - eval_dataloader=dataloader) - q_model.save("./saved") - # Load configure and weights with neural_compressor.utils - model_fx = load("./saved", model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []}, \ - "dataloader": torch.utils.data.DataLoader(dataset) - }) - self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) - shutil.rmtree("./saved", ignore_errors=True) - - @unittest.skipIf(PT_VERSION < Version("1.9.0").release, - "Please use PyTroch 1.9 or higher version for dynamic quantization with pytorch_fx backend") - def test_fx_dynamic_quant(self): - origin_model = LSTMModel( - ntoken = 10, - ninp = 512, - nhid = 256, - nlayers = 5, - ) - # run fx_quant in neural_compressor and save the quantized GraphModule - origin_model.eval() - quant_conf = QuantConf("fx_dynamic_yaml.yaml") - model = common.Model(origin_model, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - }) - q_model = quantization.fit(model, - quant_conf - ) - q_model.save("./saved") - - # Load configure and weights by neural_compressor.utils - model_fx = load("./saved", origin_model, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - }) - self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) - - # Test the functionality of older model saving type - state_dict = torch.load("./saved/best_model.pt") - tune_cfg = state_dict.pop("best_configure") - import yaml - with open("./saved/best_configure.yaml", "w") as f: - yaml.dump(tune_cfg, f, default_flow_style=False) - torch.save(state_dict, "./saved/best_model_weights.pt") - os.remove("./saved/best_model.pt") - model_fx = load("./saved", origin_model, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - }) - self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) - - # recover int8 model with only tune_cfg - history_file = "./saved/history.snapshot" - model_fx_recover = recover(origin_model, history_file, 0, - **{"prepare_custom_config_dict": - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": - {"preserved_attributes": []} - }) - self.assertEqual(model_fx.code, model_fx_recover.code) - shutil.rmtree("./saved", ignore_errors=True) - - def test_default_dynamic_quant(self): - def eval_func(model): - return 1 - - # Model Definition - for fake_yaml in ["fx_qat_yaml.yaml", "fx_ptq_yaml.yaml"]: - model_origin = LSTMModel( - ntoken = 10, - ninp = 512, - nhid = 256, - nlayers = 2, - ) - dataset = DATASETS("pytorch")["dummy"]((3, 10)) - dataloader = DATALOADERS["pytorch"](dataset) - # run fx_quant in neural_compressor and save the quantized GraphModule - if fake_yaml == "fx_qat_yaml.yaml": - conf = QuantizationAwareTrainingConfig( - op_name_list=qat_op_name_list, backend="pytorch_fx" - ) - compression_manager = 
prepare_compression(copy.deepcopy(model_origin), conf) - q_model = train_func(compression_manager, compression_manager.model, dataloader=dataloader) - self.assertTrue("quantize" in str(type(q_model.model.encoder))) - self.assertTrue("quantize" in str(type(q_model.model.rnn))) - else: - conf = PostTrainingConfig(backend="pytorch_fx", performance_only=True) - q_model = quantization.fit(model_origin, - conf, - calib_dataloader=dataloader, - eval_func=eval_func) - self.assertTrue("quantize" in str(type(q_model.model.encoder))) - self.assertTrue("quantize" in str(type(q_model.model.rnn))) - - def test_fx_sub_module_quant(self): - for fake_yaml in ["fx_qat_yaml.yaml", "fx_ptq_yaml.yaml", "fx_dynamic_yaml.yaml"]: - model_origin = DynamicControlModel() - model = common.Model(model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - }) - dataset = DATASETS("pytorch")["dummy"]((1, 3, 224, 224)) - dataloader = DATALOADERS["pytorch"](dataset) - # run fx_quant in neural_compressor and save the quantized GraphModule - if fake_yaml == "fx_qat_yaml.yaml": - conf = QuantizationAwareTrainingConfig( - op_name_list=qat_op_name_list, backend="pytorch_fx" - ) - compression_manager = prepare_compression(copy.deepcopy(model), conf) - q_model = train_func(compression_manager, compression_manager.model, dataloader) - else: - options = Options(workspace="./saved") - conf = PostTrainingConfig(backend="pytorch_fx", performance_only=True) - q_model = quantization.fit(model, - conf, - calib_dataloader=dataloader, - eval_func=eval_func, - options=options) - q_model.save("./saved") - # Load configure and weights with neural_compressor.utils - model_fx = load("./saved/best_model.pt", model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []}, \ - "dataloader": torch.utils.data.DataLoader(dataset) - }) - self.assertTrue(isinstance(model_fx.sub, torch.fx.graph_module.GraphModule)) - - if fake_yaml != "fx_qat_yaml.yaml": - # recover int8 model with only tune_cfg - history_file = "./saved/history.snapshot" - model_fx_recover = recover(model_origin, history_file, 0, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []}, \ - "dataloader": torch.utils.data.DataLoader(dataset) - }) - self.assertEqual(model_fx.sub.code, model_fx_recover.sub.code) - shutil.rmtree("./saved", ignore_errors=True) - - def test_deepcopy_failure(self): - def eval_func(model): - return 1 - - # To build an object t2, which will fail on deepcopy. 
- class T1(): - def __init__(self, t1) -> None: - self.t1 = t1 - self.j = 1 - - # required for usage with set in T1 - def __hash__(self): - return hash(self.j) - - t1 = set() - t2 = T1([t1]) - t1.add(t2) - - for fake_yaml in ['fx_ptq_yaml.yaml']: - model_origin = M() - model_origin.tmp = t2 - # run fx_quant in neural_compressor and save the quantized GraphModule - quantizer = Quantization(fake_yaml) - dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) - quantizer.eval_func = eval_func - quantizer.calib_dataloader = common.DataLoader(dataset) - quantizer.model = common.Model(model_origin) - q_model = quantizer.fit() - self.assertTrue(isinstance(q_model.model, torch.fx.graph_module.GraphModule)) - - @unittest.skipIf(PT_VERSION < Version("1.11.0").release, - "Please use PyTroch 1.11 or higher version for mixed precision with pytorch_fx or pytorch backend") - def test_bf16_capability(self): - model_origin = DynamicControlModel() - os.environ["FORCE_BF16"] = "1" - q_capability = self.adaptor._get_quantizable_ops(model_origin) - del os.environ["FORCE_BF16"] - - self.assertEqual( - [elem["weight"]["dtype"] for elem in q_capability["optypewise"]["Conv2d"]], - [["int8"], "fp32"]) - self.assertEqual( - [elem["activation"]["dtype"] for elem in q_capability["optypewise"]["Conv2d"]], - [["uint8"], "fp32"]) - self.assertEqual( - [elem["weight"]["dtype"] for elem in q_capability["opwise"][("conv", "Conv2d")]], - [["int8"], "fp32"]) - self.assertEqual( - [elem["activation"]["dtype"] for elem in q_capability["opwise"][("conv", "Conv2d")]], - [["uint8"], "fp32"]) - self.assertEqual( - [elem["weight"]["dtype"] for elem in q_capability["opwise"][("linear", "Linear")]], - [["int8"], "fp32", "bf16"]) - self.assertEqual( - [elem["activation"]["dtype"] for elem in q_capability["opwise"][("linear", "Linear")]], - [["uint8"], "fp32", "bf16"]) - - @unittest.skipIf(PT_VERSION < Version("1.11.0").release, - "Please use PyTroch 1.11 or higher version for mixed precision with pytorch_fx or pytorch backend") - def test_mix_precision(self): - model_origin = DynamicControlModel() - # run fx_quant in neural_compressor and save the quantized GraphModule - dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) - dataloader = DATALOADERS["pytorch"](dataset) - model = common.Model(model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - }) - options = Options(workspace="./saved") - conf = PostTrainingConfig(op_name_list=ptq_fx_op_name_list, backend="pytorch_fx", performance_only=True) - q_model = quantization.fit(model_origin, - conf, - calib_dataloader=dataloader, - eval_func=eval_func, - calib_func = eval_func, - options=options) - tune_cfg = q_model.q_config - tune_cfg["op"][("conv.module", "Conv2d")].clear() - tune_cfg["op"][("conv.module", "Conv2d")] = \ - {"weight": {"dtype": "bf16"}, "activation": {"dtype": "bf16"}} - tune_cfg["bf16_ops_list"].append(("conv.module", "Conv2d")) - from neural_compressor.adaptor.torch_utils.bf16_convert import Convert - q_model._model = Convert(q_model._model, tune_cfg) - - self.assertEqual(q_model._model.conv.module.module.weight.dtype, torch.bfloat16) - self.assertEqual(q_model._model.conv.module.module.bias.dtype, torch.bfloat16) - - def test_symbolic_trace(self): - from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace - model_origin = DynamicControlModel() - traced_model = symbolic_trace(model_origin, is_qat=False) - if PT_VERSION >= 
Version("1.11.0").release: - self.assertTrue(isinstance(traced_model.sub, torch.nn.Module)) - self.assertTrue(isinstance(traced_model.conv, torch.fx.graph_module.GraphModule)) - else: - self.assertTrue(isinstance(traced_model.sub, torch.fx.graph_module.GraphModule)) - traced_model_qat = symbolic_trace(model_origin, is_qat=True) - self.assertTrue(isinstance(traced_model_qat.sub, torch.fx.graph_module.GraphModule)) - -if __name__ == "__main__": - unittest.main() diff --git a/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_1.x.py b/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_1.x.py new file mode 100644 index 00000000000..effd890bdd7 --- /dev/null +++ b/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_1.x.py @@ -0,0 +1,1118 @@ +import copy +import neural_compressor.adaptor.pytorch as nc_torch +import numpy as np +import os +import shutil +import torch +import torch.nn as nn +import torch.nn.quantized as nnq +import unittest +from neural_compressor.adaptor import FRAMEWORKS +from neural_compressor.model import MODELS +from neural_compressor.experimental import Quantization, common +from neural_compressor.conf.config import QuantConf +from neural_compressor.utils.pytorch import load +from neural_compressor.utils.utility import recover +from neural_compressor.utils.utility import LazyImport +from torch.quantization import QuantStub, DeQuantStub +from packaging.version import Version +try: + import intel_extension_for_pytorch as ipex + IPEX = True +except: + IPEX = False + +# improve lazy import UT coverage +resnet18 = LazyImport("torchvision.models.resnet18") +q_resnet18 = LazyImport("torchvision.models.quantization.resnet18") + +PT_VERSION = nc_torch.get_torch_version().release +if PT_VERSION >= Version("1.8.0").release: + FX_MODE = True +else: + FX_MODE = False + + +fake_dyn_yaml = ''' + model: + name: imagenet + framework: pytorch + + quantization: + approach: post_training_dynamic_quant + op_wise: { + 'decoder': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + } + } + evaluation: + accuracy: + metric: + topk: 1 + performance: + warmup: 5 + iteration: 10 + + tuning: + accuracy_criterion: + relative: 0.01 + exit_policy: + timeout: 0 + random_seed: 9527 + workspace: + path: saved + ''' + + +fake_ptq_yaml = ''' + model: + name: imagenet + framework: pytorch + + quantization: + op_wise: { + + 'layer1.0.conv1': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'layer1.0.conv2': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'layer2.0.conv1': { + 'activation': {'dtype': ['uint8'], 'algorithm': ['minmax'], 'granularity': ['per_tensor'], 'scheme':['sym']}, + 'weight': {'dtype': ['int8'], 'algorithm': ['minmax'], 'granularity': ['per_channel'], 'scheme':['sym']} + }, + 'layer3.0.conv1': { + 'activation': {'dtype': ['uint8'], 'algorithm': ['kl'], 'granularity': ['per_tensor'], 'scheme':['sym']}, + 'weight': {'dtype': ['int8'], 'algorithm': ['minmax'], 'granularity': ['per_channel'], 'scheme':['sym']} + }, + 'layer1.0.add_relu': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + } + evaluation: + accuracy: + metric: + topk: 1 + performance: + warmup: 1 + iteration: 10 + + tuning: + accuracy_criterion: + relative: 0.01 + exit_policy: + timeout: 0 + random_seed: 9527 + workspace: + path: saved + ''' + +fake_auto_yaml = ''' + model: + name: imagenet + framework: pytorch_fx + + quantization: + approach: post_training_auto_quant + evaluation: + accuracy: + metric: + topk: 1 + performance: + 
warmup: 1 + iteration: 10 + + tuning: + accuracy_criterion: + relative: 0.01 + exit_policy: + timeout: 1000 + max_trials: 3 + random_seed: 9527 + workspace: + path: saved + ''' + + +fake_ptq_yaml_for_fx = ''' + model: + name: imagenet + framework: pytorch_fx + + quantization: + approach: post_training_auto_quant + op_wise: { + 'layer1.0.conv1': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'layer1.0.conv2': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'layer2.0.conv1': { + 'activation': {'dtype': ['uint8'], 'algorithm': ['minmax'], 'granularity': ['per_tensor'], 'scheme':['sym']}, + 'weight': {'dtype': ['int8'], 'algorithm': ['minmax'], 'granularity': ['per_channel'], 'scheme':['sym']} + }, + 'layer3.0.conv1': { + 'activation': {'dtype': ['uint8'], 'algorithm': ['kl'], 'granularity': ['per_tensor'], 'scheme':['sym']}, + 'weight': {'dtype': ['int8'], 'algorithm': ['minmax'], 'granularity': ['per_channel'], 'scheme':['sym']} + }, + 'layer1.0.add_relu': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'conv.module': { + 'weight': {'dtype': ['fp32']}, + 'activation': {'dtype': ['fp32']} + }, + 'default_qconfig': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + } + } + evaluation: + accuracy: + metric: + topk: 1 + performance: + warmup: 5 + iteration: 10 + + tuning: + accuracy_criterion: + relative: 0.01 + exit_policy: + timeout: 0 + random_seed: 9527 + workspace: + path: saved + ''' + + +fake_qat_yaml = ''' + model: + name: imagenet + framework: pytorch + + quantization: + approach: quant_aware_training + train: + end_epoch: 1 + iteration: 1 + optimizer: + SGD: + learning_rate: 0.0001 + criterion: + CrossEntropyLoss: + reduction: mean + op_wise: { + 'layer1.0.conv1': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'layer1.0.conv2': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'layer2.0.conv1': { + 'activation': {'dtype': ['uint8'], 'algorithm': ['minmax'], 'granularity': ['per_tensor'], 'scheme':['sym']}, + 'weight': {'dtype': ['int8'], 'algorithm': ['minmax'], 'granularity': ['per_channel'], 'scheme':['sym']} + }, + 'layer3.0.conv1': { + 'activation': {'dtype': ['uint8'], 'algorithm': ['kl'], 'granularity': ['per_tensor'], 'scheme':['sym']}, + 'weight': {'dtype': ['int8'], 'algorithm': ['minmax'], 'granularity': ['per_channel'], 'scheme':['sym']} + }, + 'layer1.0.add_relu': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + } + } + evaluation: + accuracy: + metric: + topk: 1 + + tuning: + accuracy_criterion: + relative: 0.01 + exit_policy: + timeout: 0 + random_seed: 9527 + workspace: + path: saved + ''' + + +def build_pytorch_yaml(): + with open('ptq_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_ptq_yaml) + + with open('dynamic_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_dyn_yaml) + + with open('qat_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_qat_yaml) + + with open('auto_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_auto_yaml) + +def build_pytorch_fx_yaml(): + if PT_VERSION >= Version("1.9.0").release: + fake_fx_ptq_yaml = fake_ptq_yaml_for_fx + else: + fake_fx_ptq_yaml = fake_ptq_yaml.replace('pytorch', 'pytorch_fx') + with open('fx_ptq_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_fx_ptq_yaml) + + fake_fx_dyn_yaml = fake_dyn_yaml.replace('pytorch', 'pytorch_fx') + with open('fx_dynamic_yaml.yaml', 'w', encoding="utf-8") as f: + 
f.write(fake_fx_dyn_yaml) + + fake_fx_qat_yaml = fake_qat_yaml.replace('pytorch', 'pytorch_fx') + with open('fx_qat_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_fx_qat_yaml) + +def build_dump_tensors_yaml(): + fake_yaml = ''' + model: + name: imagenet + framework: pytorch + + evaluation: + accuracy: + metric: + topk: 1 + + tuning: + accuracy_criterion: + relative: 0.01 + exit_policy: + timeout: 0 + random_seed: 9527 + workspace: + path: saved + tensorboard: true + ''' + with open('dump_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_yaml) + + +class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv = nn.Conv2d(3, 1, 1) + self.linear = nn.Linear(224 * 224, 5) + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = x.view(1, -1) + x = self.linear(x) + x = self.dequant(x) + return x + + +class FP32Model(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + times = x.size(1) + if times == 1: + return x + x + return x + + +class DynamicModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(1, 1, 1) + def forward(self, x): + if x is not None: + x = self.conv(x) + return x + + +class SubModel(torch.nn.Module): + def __init__(self, bypass=True): + super().__init__() + self.quant = QuantStub() + self.conv = nn.Conv2d(1, 1, 1) + self.conv1 = nn.Conv2d(1, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.relu = nn.ReLU() + self.fp32 = FP32Model() + self.norm = nn.LayerNorm([1, 224, 224]) + self.dequant = DeQuantStub() + self.bypass = bypass + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.quant(x) + x = self.relu(x) + x = self.conv1(x) + x = self.dequant(x) + if not self.bypass: + x = self.fp32(x) + x = self.norm(x) + return x + + +class PartialQuantModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv = nn.Conv2d(3, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.conv1 = nn.Conv2d(1, 1, 1) + self.bn1 = nn.BatchNorm2d(1) + self.conv2 = nn.Conv2d(1, 1, 1) + self.linear = nn.Linear(224 * 224, 1) + self.dequant = DeQuantStub() + self.sub = SubModel(bypass=False) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.conv1(x) + x = self.bn1(x) + x = self.sub(x) + x = self.quant(x) + x = self.conv2(x) + x = x.view(1, -1) + x = self.linear(x) + x = self.dequant(x) + return x + +class DynamicControlModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(3, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.linear = nn.Linear(224 * 224, 1) + self.sub = SubModel() + self.fp32 = FP32Model() + self.dyn = DynamicModel() + + def forward(self, x): + x = self.conv(x) + x = self.dyn(x) + x = self.bn(x) + x = self.sub(x) + x = self.fp32(x) + x = x.view(1, -1) + x = self.linear(x) + return x + + +class LSTMModel(nn.Module): + '''Container module with an encoder, a recurrent module, and a decoder.''' + + def __init__(self, ntoken=10, ninp=512, nhid=256, nlayers=5, dropout=0.5): + super(LSTMModel, self).__init__() + self.drop = nn.Dropout(dropout) + self.encoder = nn.Embedding(ntoken, ninp) + self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) + self.decoder = nn.Linear(nhid, ntoken) + self.init_weights() + self.nhid = nhid + self.nlayers = nlayers + + def init_weights(self): + initrange = 0.1 + self.encoder.weight.data.uniform_(-initrange, initrange) + self.decoder.bias.data.zero_() + 
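+        # NOTE: this embedding + LSTM + linear toy model is reused by the dynamic
+        # and auto quantization tests below, which expect the encoder and rnn
+        # submodules to be replaced by quantized counterparts after fit().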
self.decoder.weight.data.uniform_(-initrange, initrange) + + def forward(self, input): + input = torch.ones((3, 10), dtype=torch.int32) + h0 = torch.randn(2, 10, 256) + c0 = torch.randn(2, 10, 256) + hidden = (h0, c0) + emb = self.encoder(input) + output, hidden = self.rnn(emb, hidden) + output = self.drop(output) + decoded = self.decoder(output) + return decoded, hidden + + +def eval_func(model): + # switch to evaluate mode + model.eval() + with torch.no_grad(): + input = torch.randn(1, 3, 224, 224) + # compute output + output = model(input) + return 0.0 + + +def q_func(model): + optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) + # switch to evaluate mode + model.train() + input = torch.randn(1, 3, 224, 224) + # compute output + output = model(input) + loss = output.mean() + optimizer.zero_grad() + loss.backward() + optimizer.step() + return model + + +class TestPytorchAdaptor(unittest.TestCase): + framework_specific_info = {"device": "cpu", + "approach": "post_training_static_quant", + "random_seed": 1234, + "q_dataloader": None, + "workspace_path": "./"} + framework = "pytorch" + adaptor = FRAMEWORKS[framework](framework_specific_info) + model = q_resnet18() + nc_model = MODELS['pytorch'](model) + + @classmethod + def setUpClass(self): + build_pytorch_yaml() + build_dump_tensors_yaml() + + @classmethod + def tearDownClass(self): + os.remove('ptq_yaml.yaml') + os.remove('dynamic_yaml.yaml') + os.remove('qat_yaml.yaml') + os.remove('dump_yaml.yaml') + os.remove('auto_yaml.yaml') + shutil.rmtree('./saved', ignore_errors=True) + shutil.rmtree('runs', ignore_errors=True) + + def test_get_all_weight_name(self): + assert len(list(self.nc_model.get_all_weight_names())) == 62 + + def test_get_weight(self): + for name, param in self.model.named_parameters(): + if name == "layer4.1.conv2.weight": + param.data.fill_(0.0) + if name == "fc.bias": + param.data.fill_(0.1) + assert int(torch.sum(self.nc_model.get_weight("layer4.1.conv2.weight"))) == 0 + assert torch.allclose( + torch.sum( + self.nc_model.get_weight("fc.bias")), + torch.tensor(100.)) + + def test_get_input(self): + model = MODELS['pytorch'](q_resnet18()) + model.model.eval().fuse_model() + model.register_forward_pre_hook() + rand_input = torch.rand(100, 3, 224, 224).float() + model.model(rand_input) + assert torch.equal(model.get_inputs('x'), rand_input) + model.remove_hooks() + + def test_update_weights(self): + self.nc_model.update_weights('fc.bias', torch.zeros([1000])) + assert int(torch.sum(self.nc_model.get_weight("fc.bias"))) == 0 + + def test_get_gradient(self): + with self.assertRaises(AssertionError): + self.nc_model.get_gradient('fc.bias') + + for name, tensor in self.nc_model._model.named_parameters(): + if name == 'fc.bias': + tensor.grad = torch.zeros_like(tensor) + break + assert torch.equal(torch.Tensor(self.nc_model.get_gradient('fc.bias')), torch.zeros_like(tensor)) + + rand_input = torch.rand(100, 3, 224, 224).float() + rand_input.grad = torch.ones_like(rand_input) + assert torch.equal(torch.Tensor(self.nc_model.get_gradient(rand_input)), + torch.ones_like(rand_input)) + + def test_report_sparsity(self): + df, total_sparsity = self.nc_model.report_sparsity() + self.assertTrue(total_sparsity > 0) + self.assertTrue(len(df) == 22) + + def test_quantization_saved(self): + for fake_yaml in ['dynamic_yaml.yaml', 'qat_yaml.yaml', 'ptq_yaml.yaml']: + model = M() + quantizer = Quantization(fake_yaml) + quantizer.conf.usr_cfg.tuning.exit_policy['performance_only'] = True + dataset = quantizer.dataset('dummy', (100, 3, 
224, 224), label=True) + quantizer.model = model + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + q_model = quantizer.fit() + eval_func(q_model) + q_model.save('./saved') + # Load configure and weights by neural_compressor.utils + saved_model = load("./saved", model) + eval_func(saved_model) + # recover int8 model from history + history_file = './saved/history.snapshot' + model_recover = recover(model, history_file, 0) + eval_func(model_recover) + self.assertEqual(type(saved_model.conv), \ + type(model_recover.conv)) + shutil.rmtree('./saved', ignore_errors=True) + from neural_compressor.experimental import Benchmark + evaluator = Benchmark('ptq_yaml.yaml') + # Load configure and weights by neural_compressor.model + evaluator.model = model + evaluator.b_dataloader = common.DataLoader(dataset) + evaluator.fit('accuracy') + + for fake_yaml in ['qat_yaml.yaml', 'ptq_yaml.yaml']: + model = copy.deepcopy(self.model) + if fake_yaml == 'ptq_yaml.yaml': + model.eval().fuse_model() + conf = QuantConf(fake_yaml) + quantizer = Quantization(conf) + dataset = quantizer.dataset('dummy', (100, 3, 224, 224)) + quantizer.model = model + if fake_yaml == 'qat_yaml.yaml': + quantizer.q_func = q_func + else: + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_func = eval_func + q_model = quantizer.fit() + q_model.save('./saved') + # Load configure and weights by neural_compressor.utils + saved_model = load("./saved", model) + eval_func(saved_model) + shutil.rmtree('./saved', ignore_errors=True) + + def test_quantization_new_saved(self): + for fake_yaml in ['dynamic_yaml.yaml', 'qat_yaml.yaml', 'ptq_yaml.yaml']: + model = M() + quantizer = Quantization(fake_yaml) + quantizer.conf.usr_cfg.tuning.exit_policy['performance_only'] = True + dataset = quantizer.dataset('dummy', (100, 3, 224, 224), label=True) + quantizer.model = model + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + q_model = quantizer.fit() + eval_func(q_model) + torch.save(q_model.quantized_state_dict(), './saved/model.pt') + # Load configure and weights by neural_compressor.utils + from neural_compressor.experimental.common import Model + common_model = Model(model) + common_model.load_quantized_state_dict(torch.load('./saved/model.pt')) + eval_func(common_model) + self.assertEqual(type(q_model._model.linear), \ + type(common_model._model.linear)) + shutil.rmtree('./saved', ignore_errors=True) + + @unittest.skipIf(IPEX, "this function is affected by IPEX, Fixing now.") + def test_non_quant_module(self): + for fake_yaml in ['qat_yaml.yaml', 'ptq_yaml.yaml']: + model = PartialQuantModel() + conf = QuantConf(fake_yaml) + quantizer = Quantization(conf) + dataset = quantizer.dataset('dummy', (1, 3, 224, 224)) + non_quant_dict = {'non_quant_module_name': ['conv', 'conv1', 'sub.conv'], \ + 'non_quant_module_class': ['BatchNorm2d', 'FP32Model']} + quantizer.model = common.Model(model, **non_quant_dict) + if fake_yaml == 'qat_yaml.yaml': + quantizer.q_func = q_func + else: + quantizer.calib_func = eval_func + quantizer.eval_func = eval_func + q_model = quantizer.fit() + q_model.save('./saved') + saved_model = load("./saved", model, **non_quant_dict) + eval_func(saved_model) + shutil.rmtree('./saved', ignore_errors=True) + + def test_auto_quant(self): + def eval_func(model): + return 1 + + model_origin = LSTMModel( + ntoken = 10, + ninp = 512, + nhid = 256, + nlayers = 2, + ) + # run fx_quant in 
neural_compressor and save the quantized GraphModule + quantizer = Quantization('auto_yaml.yaml') + dataset = quantizer.dataset('dummy', (3, 10), label=True) + quantizer.eval_func = eval_func + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.model = common.Model(model_origin) + q_model = quantizer.fit() + self.assertNotEqual(q_model, None) + + def test_workspace_path(self): + model = M() + quantizer = Quantization('ptq_yaml.yaml') + quantizer.conf.usr_cfg.tuning.exit_policy['performance_only'] = True + dataset = quantizer.dataset('dummy', (100, 3, 224, 224), label=True) + quantizer.model = model + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + q_model = quantizer.fit() + eval_func(q_model) + torch.save(q_model.quantized_state_dict(), './saved/best_model.pt') + # Load configure and weights by workspace_path + from neural_compressor.experimental.common import Model + common_model = Model(model) + common_model.workspace_path = './saved' + eval_func(common_model) + self.assertEqual(type(q_model._model.linear), \ + type(common_model._model.linear)) + shutil.rmtree('./saved', ignore_errors=True) + + def test_get_graph_info(self): + from neural_compressor.model.torch_model import PyTorchModel + model = PyTorchModel(self.model) + op_map = model.graph_info + self.assertTrue(op_map['conv1'] == 'Conv2d') + + def test_tensorboard(self): + model = copy.deepcopy(self.nc_model) + model.model.eval().fuse_model() + quantizer = Quantization('dump_yaml.yaml') + dataset = quantizer.dataset('dummy', (100, 3, 224, 224), label=True) + quantizer.model = model.model + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_func = eval_func + quantizer.fit() + self.assertTrue(True if os.path.exists('runs/eval/baseline_acc0.0') else False) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.eval_func = None + quantizer.fit() + self.assertTrue(True if os.path.exists('runs/eval/baseline_acc0.0') else False) + + def test_tensor_dump_and_set(self): + model = copy.deepcopy(self.nc_model) + model.model.eval().fuse_model() + quantizer = Quantization('ptq_yaml.yaml') + dataset = quantizer.dataset('dummy', (100, 3, 224, 224), label=True) + dataloader = common.DataLoader(dataset) + dataloader = common._generate_common_dataloader(dataloader, 'pytorch') + quantizer.eval_dataloader = dataloader + quantizer.calib_dataloader = dataloader + quantizer.model = model.model + q_model = quantizer.fit() + quantizer.strategy.adaptor.inspect_tensor( + model, dataloader, op_list=['conv1.0', 'layer1.0.conv1.0'], + iteration_list=[1, 2], inspect_type='all', save_to_disk=True) + load_array = lambda *a, **k: np.load(*a, allow_pickle=True, **k) + a = load_array('saved/dump_tensor/activation_iter1.npz') + w = load_array('saved/dump_tensor/weight.npz') + if PT_VERSION >= Version("1.8.0").release: + self.assertTrue(w['conv1.0'].item()['conv1.0.weight'].shape[0] == + a['conv1.0'].item()['conv1.0.output0'].shape[1]) + else: + self.assertTrue(w['conv1.0'].item()['conv1.0.weight'].shape[0] == + a['conv1.0'].item()['conv1.1.output0'].shape[1]) + data = np.random.random(w['conv1.0'].item()['conv1.0.weight'].shape).astype(np.float32) + quantizer.strategy.adaptor.set_tensor(q_model, {'conv1.0.weight': data}) + changed_tensor = q_model.get_weight('conv1.weight') + scales = changed_tensor.q_per_channel_scales() + changed_tensor_fp32 = torch.dequantize(changed_tensor) + self.assertTrue(np.allclose(data, changed_tensor_fp32.numpy(), atol=2 
/ np.min(scales.numpy()))) + quantizer.strategy.adaptor.inspect_tensor( + q_model, dataloader, op_list=['conv1.0', 'layer1.0.conv1.0'], + iteration_list=[1, 2], inspect_type='all', save_to_disk=False) + + def test_get_graph_info(self): + from neural_compressor.adaptor.pytorch import get_ops_recursively + model = copy.deepcopy(self.model) + op_map = {} + get_ops_recursively(model, '', op_map) + self.assertTrue(op_map['conv1'] == 'Conv2d') + + def test_forward_wrapper(self): + vision_model = resnet18() + class dummymodel(torch.nn.Module): + def __init__(self, model): + super(dummymodel, self).__init__() + self._model = model + def forward(self,input=None): + return self._model(input) + + data = [[{'input': torch.rand(3,224,224)}, torch.ones(1,1)], ] + # dataloader.batch_size=100 + dataloader = common.DataLoader(data, batch_size=1) + quantizer = Quantization('dynamic_yaml.yaml') + model = dummymodel(vision_model) + quantizer.model = model + quantizer.calib_dataloader = dataloader + quantizer.eval_dataloader = dataloader + quantizer.fit() + + def test_floatfunctions_fallback(self): + class ModelWithFunctionals(torch.nn.Module): + def __init__(self): + super(ModelWithFunctionals, self).__init__() + self.mycat = nnq.FloatFunctional() + self.myadd = nnq.FloatFunctional() + self.myadd_relu = nnq.FloatFunctional() + # Tracing doesnt work yet for c10 ops with scalar inputs + # https://github.com/pytorch/pytorch/issues/27097 + self.my_scalar_add = nnq.FloatFunctional() + self.mymul = nnq.FloatFunctional() + self.my_scalar_mul = nnq.FloatFunctional() + self.quant = QuantStub() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + y = self.mycat.cat([x, x, x]) + z = self.myadd.add(y, y) + w = self.myadd_relu.add_relu(z, z) + # Tracing doesnt work yet for c10 ops with scalar inputs + # https://github.com/pytorch/pytorch/issues/27097 + w = self.my_scalar_add.add_scalar(w, -0.5) + w = self.mymul.mul(w, w) + w = self.my_scalar_mul.mul_scalar(w, 0.5) + w = self.dequant(w) + return w + + model = ModelWithFunctionals() + model = MODELS['pytorch'](model) + x = torch.rand(10, 1, dtype=torch.float) + y = model.model(x) + fallback_ops = [] + q_capability = self.adaptor.query_fw_capability(model) + for k, v in q_capability["opwise"].items(): + if k[0] != "quant" and k[0] != "dequant": + fallback_ops.append(k[0]) + model.model.qconfig = torch.quantization.default_qconfig + model.model.quant.qconfig = torch.quantization.default_qconfig + if PT_VERSION >= Version("1.8.0").release: + model.model.dequant.qconfig = torch.quantization.default_qconfig + nc_torch._fallback_quantizable_ops_recursively( + model.model, '', fallback_ops, op_qcfgs={}) + torch.quantization.add_observer_(model.model) + model.model(x) + torch.quantization.convert(model.model, self.adaptor.q_mapping, inplace=True) + qy = model.model(x) + tol = {'atol': 1e-01, 'rtol': 1e-03} + self.assertTrue(np.allclose(y, qy, **tol)) + +@unittest.skipIf(not FX_MODE, "Unsupport Fx Mode with PyTorch Version Below 1.8") +class TestPytorchFXAdaptor(unittest.TestCase): + framework_specific_info = {"device": "cpu", + "approach": "post_training_static_quant", + "random_seed": 1234, + "q_dataloader": None, + "workspace_path": "./"} + framework = "pytorch_fx" + adaptor = FRAMEWORKS[framework](framework_specific_info) + @classmethod + def setUpClass(self): + build_pytorch_fx_yaml() + + @classmethod + def tearDownClass(self): + os.remove('fx_ptq_yaml.yaml') + os.remove('fx_dynamic_yaml.yaml') + shutil.rmtree('./saved', ignore_errors=True) + 
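+        # runs/ holds TensorBoard event files that some tests may create; remove it as well.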
shutil.rmtree('runs', ignore_errors=True) + + def test_fx_quant(self): + for fake_yaml in ['fx_qat_yaml.yaml', 'fx_ptq_yaml.yaml']: + model_origin = resnet18() + # run fx_quant in neural_compressor and save the quantized GraphModule + quantizer = Quantization(fake_yaml) + dataset = quantizer.dataset('dummy', (10, 3, 224, 224), label=True) + quantizer.eval_func = eval_func + if fake_yaml == 'fx_qat_yaml.yaml': + quantizer.q_func = q_func + else: + quantizer.calib_func = eval_func + dataloader = common.DataLoader(dataset) + quantizer.calib_dataloader = dataloader + quantizer.model = common.Model(model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + q_model = quantizer.fit() + q_model.save('./saved') + # Load configure and weights with neural_compressor.utils + model_fx = load('./saved', model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []}, \ + 'dataloader': quantizer.calib_dataloader + }) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + + # recover int8 model with only tune_cfg + history_file = './saved/history.snapshot' + model_fx_recover = recover(model_origin, history_file, 0, + **{'prepare_custom_config_dict': + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': + {'preserved_attributes': []}, + 'dataloader': quantizer.calib_dataloader + }) + self.assertEqual(model_fx.code, model_fx_recover.code) + shutil.rmtree('./saved', ignore_errors=True) + + for fake_yaml in ['fx_qat_yaml.yaml', 'fx_ptq_yaml.yaml']: + model_origin = M() + # run fx_quant in neural_compressor and save the quantized GraphModule + quantizer = Quantization(fake_yaml) + quantizer.conf.usr_cfg.tuning.exit_policy['performance_only'] = True + dataset = quantizer.dataset('dummy', (10, 3, 224, 224), label=True) + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.model = common.Model(model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + q_model = quantizer.fit() + q_model.save('./saved') + # Load configure and weights with neural_compressor.utils + model_fx = load('./saved', model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []}, \ + 'dataloader': quantizer.calib_dataloader + }) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + shutil.rmtree('./saved', ignore_errors=True) + + @unittest.skipIf(PT_VERSION < Version("1.9.0").release, + "Please use PyTroch 1.9 or higher version for dynamic quantization with pytorch_fx backend") + def test_fx_dynamic_quant(self): + model = LSTMModel( + ntoken = 10, + ninp = 512, + nhid = 256, + nlayers = 5, + ) + # run fx_quant in neural_compressor and save the quantized GraphModule + model.eval() + quantizer = Quantization('fx_dynamic_yaml.yaml') + quantizer.model = common.Model(model, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + q_model = quantizer.fit() + q_model.save('./saved') + + # Load configure and weights by neural_compressor.utils + model_fx = load("./saved", model, + **{'prepare_custom_config_dict': \ + 
{'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + + # Test the functionality of older model saving type + state_dict = torch.load("./saved/best_model.pt") + tune_cfg = state_dict.pop('best_configure') + import yaml + with open("./saved/best_configure.yaml", 'w') as f: + yaml.dump(tune_cfg, f, default_flow_style=False) + torch.save(state_dict, "./saved/best_model_weights.pt") + os.remove('./saved/best_model.pt') + model_fx = load("./saved", model, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + + # recover int8 model with only tune_cfg + history_file = './saved/history.snapshot' + model_fx_recover = recover(model, history_file, 0, + **{'prepare_custom_config_dict': + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': + {'preserved_attributes': []} + }) + self.assertEqual(model_fx.code, model_fx_recover.code) + shutil.rmtree('./saved', ignore_errors=True) + + def test_default_dynamic_quant(self): + def eval_func(model): + return 1 + + def q_func(model): + return model + + # Model Definition + for fake_yaml in ['fx_qat_yaml.yaml', 'fx_ptq_yaml.yaml']: + model_origin = LSTMModel( + ntoken = 10, + ninp = 512, + nhid = 256, + nlayers = 2, + ) + # run fx_quant in neural_compressor and save the quantized GraphModule + quantizer = Quantization(fake_yaml) + dataset = quantizer.dataset('dummy', (3, 10), label=True) + quantizer.eval_func = eval_func + if fake_yaml == 'fx_qat_yaml.yaml': + quantizer.q_func = q_func + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.model = common.Model(model_origin) + q_model = quantizer.fit() + self.assertTrue('quantize' in str(type(q_model.model.encoder))) + self.assertTrue('quantize' in str(type(q_model.model.rnn))) + + def test_fx_sub_module_quant(self): + for fake_yaml in ['fx_qat_yaml.yaml', 'fx_ptq_yaml.yaml', 'fx_dynamic_yaml.yaml']: + model_origin = DynamicControlModel() + # run fx_quant in neural_compressor and save the quantized GraphModule + quantizer = Quantization(fake_yaml) + dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + quantizer.eval_func = eval_func + if fake_yaml == 'fx_qat_yaml.yaml': + quantizer.q_func = q_func + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.model = common.Model(model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + q_model = quantizer.fit() + q_model.save('./saved') + # Load configure and weights with neural_compressor.utils + model_fx = load('./saved/best_model.pt', model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []}, \ + 'dataloader': quantizer.calib_dataloader + }) + self.assertTrue(isinstance(model_fx.sub, torch.fx.graph_module.GraphModule)) + + # recover int8 model with only tune_cfg + history_file = './saved/history.snapshot' + model_fx_recover = recover(model_origin, history_file, 0, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []}, \ + 'dataloader': quantizer.calib_dataloader + }) + self.assertEqual(model_fx.sub.code, 
model_fx_recover.sub.code) + shutil.rmtree('./saved', ignore_errors=True) + + def test_deepcopy_failure(self): + def eval_func(model): + return 1 + + # To build an object t2, which will fail on deepcopy. + class T1(): + def __init__(self, t1) -> None: + self.t1 = t1 + self.j = 1 + + # required for usage with set in T1 + def __hash__(self): + return hash(self.j) + + t1 = set() + t2 = T1([t1]) + t1.add(t2) + + for fake_yaml in ['fx_ptq_yaml.yaml']: + model_origin = M() + model_origin.tmp = t2 + # run fx_quant in neural_compressor and save the quantized GraphModule + quantizer = Quantization(fake_yaml) + dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + quantizer.eval_func = eval_func + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.model = common.Model(model_origin) + q_model = quantizer.fit() + self.assertTrue(isinstance(q_model.model, torch.fx.graph_module.GraphModule)) + + @unittest.skipIf(PT_VERSION < Version("1.11.0").release, + "Please use PyTroch 1.11 or higher version for mixed precision with pytorch_fx or pytorch backend") + def test_bf16_capability(self): + model_origin = DynamicControlModel() + os.environ['FORCE_BF16'] = '1' + q_capability = self.adaptor._get_quantizable_ops(model_origin) + del os.environ['FORCE_BF16'] + + self.assertEqual( + [elem['weight']['dtype'] for elem in q_capability['optypewise']['Conv2d']], + [['int8'], 'fp32']) + self.assertEqual( + [elem['activation']['dtype'] for elem in q_capability['optypewise']['Conv2d']], + [['uint8'], 'fp32']) + self.assertEqual( + [elem['weight']['dtype'] for elem in q_capability['opwise'][('conv', 'Conv2d')]], + [['int8'], 'fp32']) + self.assertEqual( + [elem['activation']['dtype'] for elem in q_capability['opwise'][('conv', 'Conv2d')]], + [['uint8'], 'fp32']) + self.assertEqual( + [elem['weight']['dtype'] for elem in q_capability['opwise'][('linear', 'Linear')]], + [['int8'], 'fp32', 'bf16']) + self.assertEqual( + [elem['activation']['dtype'] for elem in q_capability['opwise'][('linear', 'Linear')]], + [['uint8'], 'fp32', 'bf16']) + + @unittest.skipIf(PT_VERSION < Version("1.11.0").release, + "Please use PyTroch 1.11 or higher version for mixed precision with pytorch_fx or pytorch backend") + def test_mix_precision(self): + fake_yaml = 'fx_ptq_yaml.yaml' + model_origin = DynamicControlModel() + # run fx_quant in neural_compressor and save the quantized GraphModule + quantizer = Quantization(fake_yaml) + dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + quantizer.eval_func = eval_func + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.model = common.Model(model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + q_model = quantizer.fit() + tune_cfg = q_model.q_config + tune_cfg['op'][('conv.module', 'Conv2d')].clear() + tune_cfg['op'][('conv.module', 'Conv2d')] = \ + {'weight': {'dtype': 'bf16'}, 'activation': {'dtype': 'bf16'}} + tune_cfg["bf16_ops_list"].append(('conv.module', 'Conv2d')) + from neural_compressor.adaptor.torch_utils.bf16_convert import Convert + q_model._model = Convert(q_model._model, tune_cfg) + + self.assertEqual(q_model._model.conv.module.module.weight.dtype, torch.bfloat16) + self.assertEqual(q_model._model.conv.module.module.bias.dtype, torch.bfloat16) + + def test_symbolic_trace(self): + from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace + model_origin = DynamicControlModel() + traced_model = 
symbolic_trace(model_origin, is_qat=False) + if PT_VERSION >= Version("1.11.0").release: + self.assertTrue(isinstance(traced_model.sub, torch.nn.Module)) + self.assertTrue(isinstance(traced_model.conv, torch.fx.graph_module.GraphModule)) + else: + self.assertTrue(isinstance(traced_model.sub, torch.fx.graph_module.GraphModule)) + traced_model_qat = symbolic_trace(model_origin, is_qat=True) + self.assertTrue(isinstance(traced_model_qat.sub, torch.fx.graph_module.GraphModule)) + +if __name__ == "__main__": + unittest.main() diff --git a/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_2.x.py b/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_2.x.py new file mode 100644 index 00000000000..3bea3e28673 --- /dev/null +++ b/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_2.x.py @@ -0,0 +1,682 @@ +import copy +import neural_compressor.adaptor.pytorch as nc_torch +import numpy as np +import os +import shutil +import torch +import torch.nn as nn +import torch.nn.quantized as nnq +import unittest +import os +from neural_compressor import PostTrainingQuantConfig, QuantizationAwareTrainingConfig +from neural_compressor.config import set_tensorboard, set_workspace +from neural_compressor.data import DATASETS, DATALOADERS +from neural_compressor.adaptor import FRAMEWORKS +from neural_compressor.model import MODELS +from neural_compressor.experimental import Quantization, common +from neural_compressor.experimental.data.datasets.dataset import DATASETS +from neural_compressor import quantization +from neural_compressor.training import prepare_compression +from neural_compressor.utils.pytorch import load +from neural_compressor.utils.utility import recover +from neural_compressor.utils.utility import LazyImport +from torch.quantization import QuantStub, DeQuantStub +from packaging.version import Version + + +# improve lazy import UT coverage +resnet18 = LazyImport("torchvision.models.resnet18") +q_resnet18 = LazyImport("torchvision.models.quantization.resnet18") + +PT_VERSION = nc_torch.get_torch_version().release +if PT_VERSION >= Version("1.8.0").release: + FX_MODE = True +else: + FX_MODE = False + + +dyn_op_name_list = {"decoder": {"activation": {"dtype": ["fp32"]}, "weight": {"dtype": ["fp32"]}}} + +ptq_op_name_list = { + "layer1.0.conv1": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "layer1.0.conv2": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "layer2.0.conv1": { + "activation": { + "dtype": ["uint8"], + "algorithm": ["minmax"], + "granularity": ["per_tensor"], + "scheme": ["sym"] + }, + "weight": { + "dtype": ["int8"], + "algorithm": ["minmax"], + "granularity": ["per_channel"], + "scheme": ["sym"] + } + }, + "layer3.0.conv1": { + "activation": { + "dtype": ["uint8"], + "algorithm": ["kl"], + "granularity": ["per_tensor"], + "scheme": ["sym"] + }, + "weight": { + "dtype": ["int8"], + "algorithm": ["minmax"], + "granularity": ["per_channel"], + "scheme": ["sym"] + } + }, + "layer1.0.add_relu": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, +} + +ptq_fx_op_name_list = { + "layer1.0.conv1": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "layer1.0.conv2": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "layer2.0.conv1": { + "activation": { + "dtype": ["uint8"], + "algorithm": ["minmax"], + "granularity": ["per_tensor"], + "scheme": ["sym"] + }, + "weight": { + "dtype": ["int8"], + 
"algorithm": ["minmax"], + "granularity": ["per_channel"], + "scheme": ["sym"] + } + }, + "layer3.0.conv1": { + "activation": { + "dtype": ["uint8"], + "algorithm": ["kl"], + "granularity": ["per_tensor"], + "scheme": ["sym"] + }, + "weight": { + "dtype": ["int8"], + "algorithm": ["minmax"], + "granularity": ["per_channel"], + "scheme": ["sym"] + } + }, + "layer1.0.add_relu": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "conv.module": { + "weight": { + "dtype": ["fp32"] + }, + "activation": { + "dtype": ["fp32"] + } + }, + "default_qconfig": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + } +} + +qat_op_name_list = { + "layer1.0.conv1": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "layer1.0.conv2": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "layer2.0.conv1": { + "activation": { + "dtype": ["uint8"], + "algorithm": ["minmax"], + "granularity": ["per_tensor"], + "scheme": ["sym"] + }, + "weight": { + "dtype": ["int8"], + "algorithm": ["minmax"], + "granularity": ["per_channel"], + "scheme": ["sym"] + } + }, + "layer3.0.conv1": { + "activation": { + "dtype": ["uint8"], + "algorithm": ["kl"], + "granularity": ["per_tensor"], + "scheme": ["sym"] + }, + "weight": { + "dtype": ["int8"], + "algorithm": ["minmax"], + "granularity": ["per_channel"], + "scheme": ["sym"] + } + }, + "layer1.0.add_relu": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + } +} + + + + +class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv = nn.Conv2d(3, 1, 1) + self.linear = nn.Linear(224 * 224, 5) + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = x.view(1, -1) + x = self.linear(x) + x = self.dequant(x) + return x + + +class FP32Model(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + times = x.size(1) + if times == 1: + return x + x + return x + + +class DynamicModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(1, 1, 1) + def forward(self, x): + if x is not None: + x = self.conv(x) + return x + + +class SubModel(torch.nn.Module): + def __init__(self, bypass=True): + super().__init__() + self.quant = QuantStub() + self.conv = nn.Conv2d(1, 1, 1) + self.conv1 = nn.Conv2d(1, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.relu = nn.ReLU() + self.fp32 = FP32Model() + self.norm = nn.LayerNorm([1, 224, 224]) + self.dequant = DeQuantStub() + self.bypass = bypass + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.quant(x) + x = self.relu(x) + x = self.conv1(x) + x = self.dequant(x) + if not self.bypass: + x = self.fp32(x) + x = self.norm(x) + return x + + +class PartialQuantModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv = nn.Conv2d(3, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.conv1 = nn.Conv2d(1, 1, 1) + self.bn1 = nn.BatchNorm2d(1) + self.conv2 = nn.Conv2d(1, 1, 1) + self.linear = nn.Linear(224 * 224, 1) + self.dequant = DeQuantStub() + self.sub = SubModel(bypass=False) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.conv1(x) + x = self.bn1(x) + x = self.sub(x) + x = self.quant(x) + x = self.conv2(x) + x = x.view(1, -1) + x = self.linear(x) + x = self.dequant(x) + return x + +class DynamicControlModel(torch.nn.Module): + def __init__(self): + 
super().__init__() + self.conv = nn.Conv2d(3, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.linear = nn.Linear(224 * 224, 1) + self.sub = SubModel() + self.fp32 = FP32Model() + self.dyn = DynamicModel() + + def forward(self, x): + x = self.conv(x) + x = self.dyn(x) + x = self.bn(x) + x = self.sub(x) + x = self.fp32(x) + x = x.view(1, -1) + x = self.linear(x) + return x + + +class LSTMModel(nn.Module): + """Container module with an encoder, a recurrent module, and a decoder.""" + + def __init__(self, ntoken=10, ninp=512, nhid=256, nlayers=5, dropout=0.5): + super(LSTMModel, self).__init__() + self.drop = nn.Dropout(dropout) + self.encoder = nn.Embedding(ntoken, ninp) + self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) + self.decoder = nn.Linear(nhid, ntoken) + self.init_weights() + self.nhid = nhid + self.nlayers = nlayers + + def init_weights(self): + initrange = 0.1 + self.encoder.weight.data.uniform_(-initrange, initrange) + self.decoder.bias.data.zero_() + self.decoder.weight.data.uniform_(-initrange, initrange) + + def forward(self, input): + input = torch.ones((3, 10), dtype=torch.int32) + h0 = torch.randn(2, 10, 256) + c0 = torch.randn(2, 10, 256) + hidden = (h0, c0) + emb = self.encoder(input) + output, hidden = self.rnn(emb, hidden) + output = self.drop(output) + decoded = self.decoder(output) + return decoded, hidden + + +def eval_func(model): + # switch to evaluate mode + model.eval() + with torch.no_grad(): + input = torch.randn(1, 3, 224, 224) + # compute output + output = model(input) + return 0.0 + + +def train_func(compression_manager, model, dataloader=None): + compression_manager.callbacks.on_train_begin(dataloader=dataloader) + optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) + # switch to evaluate mode + model.train() + input = torch.randn(1, 3, 224, 224) + # compute output + output = model(input) + loss = output[0].mean() if isinstance(output, tuple) else output.mean() + optimizer.zero_grad() + loss.backward() + optimizer.step() + compression_manager.callbacks.on_train_end() + return model + + +def q_func(model): + optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) + # switch to evaluate mode + model.train() + input = torch.randn(1, 3, 224, 224) + # compute output + output = model(input) + loss = output.mean() + optimizer.zero_grad() + loss.backward() + optimizer.step() + return model + + +class TestPytorchAdaptor(unittest.TestCase): + model = q_resnet18() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + def test_quantization_new_API(self): + for fake_yaml in ["dynamic", "qat", "static"]: + model = M() + if fake_yaml == "qat": + quant_conf = QuantizationAwareTrainingConfig(op_name_list=qat_op_name_list) + compression_manager = prepare_compression(copy.deepcopy(model), quant_conf) + q_model = train_func(compression_manager, compression_manager.model) + else: + dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) + dataloader = DATALOADERS["pytorch"](dataset) + if fake_yaml == "dynamic": + quant_conf = PostTrainingQuantConfig(approach="dynamic", + op_name_list=dyn_op_name_list) + elif fake_yaml == "static": + quant_conf = PostTrainingQuantConfig(approach="static", + op_name_list=ptq_op_name_list) + q_model = quantization.fit( + model, + quant_conf, + calib_dataloader=dataloader if fake_yaml == "static" else None) + q_model.save("./saved") + # Load configure and weights by neural_compressor.utils + saved_model = load("./saved", model) + 
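+            # load() rebuilds the quantized module from the configure and weights
+            # that q_model.save() wrote under ./saved, using the original fp32
+            # model as the skeleton.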
shutil.rmtree("./saved", ignore_errors=True) + + def test_auto_quant(self): + def eval_func(model): + return 1 + + model_origin = LSTMModel( + ntoken = 10, + ninp = 512, + nhid = 256, + nlayers = 2, + ) + # run fx_quant in neural_compressor and save the quantized GraphModule + quant_conf = PostTrainingQuantConfig(approach="auto") + set_workspace("./saved") + dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) + dataloader = common.DataLoader(dataset) + q_model = quantization.fit(model_origin, + quant_conf, + calib_dataloader=dataloader, + eval_func=eval_func) + q_model.save("./saved") + model = common.Model(model_origin) + model.workspace_path = "./saved" + self.assertNotEqual(q_model, None) + self.assertEqual(type(q_model._model.decoder), + type(model._model.decoder)) + shutil.rmtree("./saved", ignore_errors=True) + + def test_tensorboard(self): + model = copy.deepcopy(self.model) + model.eval().fuse_model() + quant_conf = PostTrainingQuantConfig(approach="static", + backend="pytorch") + set_tensorboard(True) + dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) + dataloader = common.DataLoader(dataset) + quantization.fit( + model, quant_conf, calib_dataloader=dataloader, eval_func=eval_func + ) + self.assertTrue(True if os.path.exists("runs/eval/baseline_acc0.0") else False) + quantization.fit(model, + quant_conf, + calib_dataloader=dataloader, + eval_dataloader=dataloader) + self.assertTrue(True if os.path.exists("runs/eval/baseline_acc0.0") else False) + set_tensorboard(False) + + +@unittest.skipIf(not FX_MODE, "Unsupport Fx Mode with PyTorch Version Below 1.8") +class TestPytorchFXAdaptor(unittest.TestCase): + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + def test_fx_quant(self): + for fake_yaml in ["qat", "static"]: + model_origin = resnet18() + dataset = DATASETS("pytorch")["dummy"]((10, 3, 224, 224), label=True) + dataloader = DATALOADERS["pytorch"](dataset) + if fake_yaml == "qat": + conf = QuantizationAwareTrainingConfig( + op_name_list=qat_op_name_list, backend="pytorch_fx" + ) + compression_manager = prepare_compression(copy.deepcopy(model_origin), conf) + q_model = train_func(compression_manager, compression_manager.model, dataloader) + else: + conf = PostTrainingQuantConfig( + op_name_list=ptq_fx_op_name_list, backend="pytorch_fx" + ) + set_workspace("./saved") + q_model = quantization.fit(model_origin, + conf, + calib_dataloader=dataloader, + calib_func=eval_func) + q_model.save("./saved") + # Load configure and weights with neural_compressor.utils + model_fx = load("./saved", model_origin, + **{"dataloader": torch.utils.data.DataLoader(dataset)}) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + + if fake_yaml != "qat": + # recover int8 model with only tune_cfg + history_file = "./saved/history.snapshot" + model_fx_recover = recover(model_origin, history_file, 0, + **{"dataloader": dataloader}) + self.assertEqual(model_fx.code, model_fx_recover.code) + shutil.rmtree("./saved", ignore_errors=True) + for fake_yaml in ["qat", "static"]: + model_origin = M() + # run fx_quant in neural_compressor and save the quantized GraphModule + dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224), label=True) + dataloader = DATALOADERS["pytorch"](dataset) + if fake_yaml == "qat": + conf = QuantizationAwareTrainingConfig( + op_name_list=qat_op_name_list, backend="pytorch_fx" + ) + compression_manager = prepare_compression(copy.deepcopy(model_origin), conf) + 
+                q_model = train_func(compression_manager, compression_manager.model, dataloader)
+                compression_manager.save("./saved")
+            else:
+                conf = PostTrainingQuantConfig(
+                    op_name_list=ptq_fx_op_name_list, backend="pytorch_fx"
+                )
+                q_model = quantization.fit(model_origin,
+                                           conf,
+                                           calib_dataloader=dataloader)
+                q_model.save("./saved")
+            # Load configuration and weights with neural_compressor.utils
+            model_fx = load("./saved", model_origin,
+                            **{"dataloader": torch.utils.data.DataLoader(dataset)})
+            self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule))
+            shutil.rmtree("./saved", ignore_errors=True)
+
+    @unittest.skipIf(PT_VERSION < Version("1.9.0").release,
+                     "Please use PyTorch 1.9 or higher version for dynamic quantization with pytorch_fx backend")
+    def test_fx_dynamic_quant(self):
+        origin_model = LSTMModel(
+            ntoken = 10,
+            ninp = 512,
+            nhid = 256,
+            nlayers = 5,
+        )
+        # run fx_quant in neural_compressor and save the quantized GraphModule
+        origin_model.eval()
+        conf = PostTrainingQuantConfig(approach="dynamic",
+            op_name_list=ptq_fx_op_name_list, backend="pytorch_fx"
+        )
+        set_workspace("./saved")
+        q_model = quantization.fit(origin_model, conf)
+        q_model.save("./saved")
+
+        # Load configuration and weights by neural_compressor.utils
+        model_fx = load("./saved", origin_model)
+        self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule))
+
+        # Test the functionality of older model saving type
+        state_dict = torch.load("./saved/best_model.pt")
+        tune_cfg = state_dict.pop("best_configure")
+        import yaml
+        with open("./saved/best_configure.yaml", "w") as f:
+            yaml.dump(tune_cfg, f, default_flow_style=False)
+        torch.save(state_dict, "./saved/best_model_weights.pt")
+        os.remove("./saved/best_model.pt")
+        model_fx = load("./saved", origin_model)
+        self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule))
+
+        # recover int8 model with only tune_cfg
+        history_file = "./saved/history.snapshot"
+        model_fx_recover = recover(origin_model, history_file, 0)
+        self.assertEqual(model_fx.code, model_fx_recover.code)
+        shutil.rmtree("./saved", ignore_errors=True)
+
+    def test_default_dynamic_quant(self):
+        def eval_func(model):
+            return 1
+
+        # Model Definition
+        for fake_yaml in ["qat", "auto"]:
+            model_origin = LSTMModel(
+                ntoken = 10,
+                ninp = 512,
+                nhid = 256,
+                nlayers = 2,
+            )
+            dataset = DATASETS("pytorch")["dummy"]((3, 10))
+            dataloader = DATALOADERS["pytorch"](dataset)
+            # run fx_quant in neural_compressor and save the quantized GraphModule
+            if fake_yaml == "qat":
+                conf = QuantizationAwareTrainingConfig(
+                    op_name_list=qat_op_name_list, backend="pytorch_fx"
+                )
+                compression_manager = prepare_compression(copy.deepcopy(model_origin), conf)
+                q_model = train_func(compression_manager, compression_manager.model, dataloader=dataloader)
+                self.assertTrue("quantize" in str(type(q_model.model.encoder)))
+                self.assertTrue("quantize" in str(type(q_model.model.rnn)))
+            else:
+                conf = PostTrainingQuantConfig(backend="pytorch_fx")
+                q_model = quantization.fit(model_origin,
+                                           conf,
+                                           calib_dataloader=dataloader)
+                self.assertTrue("quantize" in str(type(q_model.model.encoder)))
+                self.assertTrue("quantize" in str(type(q_model.model.rnn)))
+
+    def test_fx_sub_module_quant(self):
+        for fake_yaml in ["qat", "static"]:
+            model_origin = DynamicControlModel()
+            dataset = DATASETS("pytorch")["dummy"]((1, 3, 224, 224))
+            dataloader = DATALOADERS["pytorch"](dataset)
+            # run fx_quant in neural_compressor and save the quantized GraphModule
+            if fake_yaml == "qat":
+                conf = QuantizationAwareTrainingConfig(
+                    op_name_list=qat_op_name_list, backend="pytorch_fx"
+                )
+                compression_manager = prepare_compression(copy.deepcopy(model_origin), conf)
+                q_model = train_func(compression_manager, compression_manager.model, dataloader)
+            else:
+                set_workspace("./saved")
+                conf = PostTrainingQuantConfig(backend="pytorch_fx")
+                q_model = quantization.fit(model_origin,
+                                           conf,
+                                           calib_dataloader=dataloader)
+                q_model.save("./saved")
+            # Load configuration and weights with neural_compressor.utils
+            model_fx = load("./saved/best_model.pt", model_origin,
+                            **{"dataloader": torch.utils.data.DataLoader(dataset)
+                               })
+            self.assertTrue(isinstance(model_fx.sub, torch.fx.graph_module.GraphModule))
+
+            if fake_yaml != "qat":
+                # recover int8 model with only tune_cfg
+                history_file = "./saved/history.snapshot"
+                model_fx_recover = recover(model_origin, history_file, 0,
+                                           **{"dataloader": torch.utils.data.DataLoader(dataset)
+                                              })
+                self.assertEqual(model_fx.sub.code, model_fx_recover.sub.code)
+            shutil.rmtree("./saved", ignore_errors=True)
+
+    @unittest.skipIf(PT_VERSION < Version("1.11.0").release,
+                     "Please use PyTorch 1.11 or higher version for mixed precision with pytorch_fx or pytorch backend")
+    def test_mix_precision(self):
+        model_origin = DynamicControlModel()
+        # run fx_quant in neural_compressor and save the quantized GraphModule
+        dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224))
+        dataloader = DATALOADERS["pytorch"](dataset)
+        set_workspace("./saved")
+        conf = PostTrainingQuantConfig(op_name_list=ptq_fx_op_name_list, backend="pytorch_fx")
+        q_model = quantization.fit(model_origin,
+                                   conf,
+                                   calib_dataloader=dataloader,
+                                   calib_func=eval_func)
+        tune_cfg = q_model.q_config
+        tune_cfg["op"][("conv.module", "Conv2d")].clear()
+        tune_cfg["op"][("conv.module", "Conv2d")] = \
+            {"weight": {"dtype": "bf16"}, "activation": {"dtype": "bf16"}}
+        tune_cfg["bf16_ops_list"].append(("conv.module", "Conv2d"))
+        from neural_compressor.adaptor.torch_utils.bf16_convert import Convert
+        q_model._model = Convert(q_model._model, tune_cfg)
+
+        self.assertEqual(q_model._model.conv.module.module.weight.dtype, torch.bfloat16)
+        self.assertEqual(q_model._model.conv.module.module.bias.dtype, torch.bfloat16)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/adaptor/pytorch_adaptor/test_torch2onnx.py b/test/adaptor/pytorch_adaptor/test_torch2onnx.py
index 977e621be84..8977b1a1dd4 100644
--- a/test/adaptor/pytorch_adaptor/test_torch2onnx.py
+++ b/test/adaptor/pytorch_adaptor/test_torch2onnx.py
@@ -8,9 +8,8 @@ import unittest
 import neural_compressor.adaptor.pytorch as nc_torch
 from neural_compressor import quantization
-from neural_compressor.conf.pythonic_config import PostTrainingConfig, QuantizationAwareTrainingConfig
+from neural_compressor.config import PostTrainingQuantConfig
 from neural_compressor.experimental.data.datasets.dataset import DATASETS
-from neural_compressor.training import prepare_compression
 from packaging.version import Version
 from torch.quantization import QuantStub, DeQuantStub
@@ -209,11 +208,9 @@ def test_fx_quant(self):
         for fake_yaml in ['dynamic', 'static']:
             model = DynamicControlModel()
             # run fx_quant in neural_compressor and save the quantized GraphModule
-            conf = PostTrainingConfig(
-                approach="post_training_dynamic_quant" \
-                    if fake_yaml == "dynamic" else "post_training_static_quant",
-                backend="pytorch_fx",
-                performance_only=True
+            conf = PostTrainingQuantConfig(
+                approach=fake_yaml,
+                backend="pytorch_fx"
             )
             dataset = 
DATASETS("pytorch")['dummy']((100, 3, 224, 224)) dataloader = torch.utils.data.DataLoader(dataset) diff --git a/test/benchmark/test_benchmark.py b/test/benchmark/test_benchmark.py index 7815bb6cbfe..37aef1ca500 100644 --- a/test/benchmark/test_benchmark.py +++ b/test/benchmark/test_benchmark.py @@ -4,11 +4,13 @@ import os import yaml import numpy as np -import tensorflow as tf import tempfile import re +import platform from neural_compressor.adaptor.tf_utils.util import write_graph +import tensorflow as tf + def build_fake_yaml(): fake_yaml = ''' model: @@ -43,12 +45,14 @@ def build_benchmark(): arg_parser = ArgumentParser(description='Parse args') arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input odel') args = arg_parser.parse_args() -import neural_compressor from neural_compressor.data import DATASETS -from neural_compressor.experimental import common dataset = DATASETS('tensorflow')['dummy']((100, 32, 32, 1), label=True) -b_dataloader = common.DataLoader(dataset, batch_size=10) -neural_compressor.benchmark(args.input_model, 'fake_yaml.yaml', b_dataloader=b_dataloader) +from neural_compressor.experimental import Benchmark, common +from neural_compressor.conf.config import BenchmarkConf +benchmarker = Benchmark('fake_yaml.yaml') +benchmarker.b_dataloader = common.DataLoader(dataset, batch_size=10) +benchmarker.model = args.input_model +benchmarker.fit() ''' seq1 = ''' @@ -56,14 +60,15 @@ def build_benchmark(): arg_parser = ArgumentParser(description='Parse args') arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input odel') args = arg_parser.parse_args() -import neural_compressor from neural_compressor.data import DATASETS dataset = DATASETS('tensorflow')['dummy']((100, 32, 32, 1), label=True) -from neural_compressor.experimental import common +from neural_compressor.experimental import Benchmark, common from neural_compressor.conf.config import BenchmarkConf conf = BenchmarkConf('fake_yaml.yaml') -b_dataloader = common.DataLoader(dataset, batch_size=10) -neural_compressor.benchmark(args.input_model, conf, b_dataloader=b_dataloader) +benchmarker = Benchmark(conf) +benchmarker.b_dataloader = common.DataLoader(dataset, batch_size=10) +benchmarker.model = args.input_model +benchmarker.fit() ''' # test normal case @@ -88,13 +93,15 @@ def build_benchmark2(): "arg_parser = ArgumentParser(description='Parse args')\n", "arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input model')\n", "args = arg_parser.parse_args()\n", - "import neural_compressor\n" + "from neural_compressor.data import DATASETS\n", "dataset = DATASETS('tensorflow')['dummy']((5, 32, 32, 1), label=True)\n", - "from neural_compressor.experimental import common\n", - "b_dataloader = common.DataLoader(dataset)\n", - "neural_compressor.benchmark(args.input_model, b_dataloader=b_dataloader)\n" + "from neural_compressor.experimental import Benchmark, common\n", + "benchmarker = Benchmark()\n", + "benchmarker.model = args.input_model\n", + "benchmarker.b_dataloader = common.DataLoader(dataset)\n", + "benchmarker.fit()\n" ] seq1 = ''' @@ -102,11 +109,13 @@ def build_benchmark2(): arg_parser = ArgumentParser(description='Parse args') arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input odel') args = arg_parser.parse_args() -import neural_compressor + from neural_compressor import conf -from neural_compressor.experimental import common +from 
neural_compressor.experimental import Benchmark, common conf.evaluation.performance.dataloader.dataset = {'dummy': {'shape': [100,32,32,1], 'label':True}} -neural_compressor.benchmark(args.input_model, conf) +benchmarker = Benchmark(conf) +benchmarker.model = args.input_model +benchmarker.fit() ''' seq2 = ''' @@ -188,6 +197,7 @@ def setUpClass(self): build_benchmark() build_benchmark2() self.cpu_counts = psutil.cpu_count(logical=False) + self.platform = platform.system().lower() @classmethod def tearDownClass(self): @@ -195,11 +205,11 @@ def tearDownClass(self): os.remove('fake_yaml.yaml') if os.path.exists('fake.py'): os.remove('fake.py') - if os.path.exists('fake.py'): + if os.path.exists('fake2.py'): os.remove('fake2.py') - if os.path.exists('fake.py'): + if os.path.exists('fake3.py'): os.remove('fake3.py') - if os.path.exists('fake.py'): + if os.path.exists('fake4.py'): os.remove('fake4.py') if os.path.exists('fake_data_5.py'): os.remove('fake_data_5.py') @@ -248,8 +258,8 @@ def test_benchmark_without_yaml(self): os.system("python fake2.py --input_model={} 2>&1 | tee benchmark.log".format(self.graph_path)) with open('benchmark.log', "r") as f: for line in f: - accuracy = re.search(r"Accuracy is\s+(\d+(\.\d+)?)", line) - self.assertIsNotNone(accuracy) + throughput = re.search(r"Throughput sum: (\d+(\.\d+)?)", line) + self.assertIsNotNone(throughput) os.system("rm *.log") def test_benchmark_with_conf(self): @@ -259,7 +269,7 @@ def test_benchmark_with_conf(self): throughput = re.search(r"Throughput:\s+(\d+(\.\d+)?) images/sec", line) self.assertIsNotNone(throughput) os.system("rm *.log") - + def test_benchmark_with_custom_metric(self): os.system("python fake4.py --input_model={} 2>&1 | tee benchmark.log".format(self.graph_path)) with open('benchmark.log', "r") as f: @@ -267,6 +277,6 @@ def test_benchmark_with_custom_metric(self): accuracy = re.search(r"Accuracy is\s+(\d+(\.\d+)?)", line) self.assertIsNotNone(accuracy) os.system("rm *.log") - + if __name__ == "__main__": unittest.main() diff --git a/test/benchmark/test_benchmark_2.x.py b/test/benchmark/test_benchmark_2.x.py new file mode 100644 index 00000000000..fe5b0d0d710 --- /dev/null +++ b/test/benchmark/test_benchmark_2.x.py @@ -0,0 +1,176 @@ +"""Tests for neural_compressor benchmark""" +import psutil +import unittest +import os +import yaml +import numpy as np +import tensorflow as tf +import tempfile +import re +from neural_compressor.adaptor.tf_utils.util import write_graph + + +def build_benchmark(): + seq = ''' +from argparse import ArgumentParser +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input odel') +args = arg_parser.parse_args() +from neural_compressor.benchmark import fit +from neural_compressor.config import BenchmarkConfig +from neural_compressor.data import DATASETS +from neural_compressor.experimental import common +dataset = DATASETS('tensorflow')['dummy']((100, 32, 32, 1), label=True) +b_dataloader = common.DataLoader(dataset, batch_size=10) +conf = BenchmarkConfig(warmup=5, iteration=10, cores_per_instance=4, num_of_instance=2) +fit(args.input_model, conf, b_dataloader=b_dataloader) + ''' + + seq1 = ''' +from argparse import ArgumentParser +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input odel') +args = arg_parser.parse_args() +from neural_compressor.benchmark import fit +from neural_compressor.config import 
BenchmarkConfig +from neural_compressor.data import DATASETS +dataset = DATASETS('tensorflow')['dummy']((100, 32, 32, 1), label=True) +from neural_compressor.experimental import common +conf = BenchmarkConfig(warmup=5, iteration=10, cores_per_instance=4, num_of_instance=2) +b_dataloader = common.DataLoader(dataset, batch_size=10) +fit(args.input_model, conf, b_dataloader=b_dataloader) + ''' + + # test normal case + with open('fake.py', "w", encoding="utf-8") as f: + f.writelines(seq) + # test batchsize > len(dataset), use first batch + fake_data_5 = seq.replace('100, 32, 32, 1', '5, 32, 32, 1') + with open('fake_data_5.py', "w", encoding="utf-8") as f: + f.writelines(fake_data_5) + # test batchsize < len(dataset) < 2*batchsize, discard first batch + fake_data_15 = seq1.replace('100, 32, 32, 1', '15, 32, 32, 1') + with open('fake_data_15.py', "w", encoding="utf-8") as f: + f.writelines(fake_data_15) + # test 2*batchsize < len(dataset) < warmup*batchsize, discard last batch + fake_data_25 = seq1.replace('100, 32, 32, 1', '25, 32, 32, 1') + with open('fake_data_25.py', "w", encoding="utf-8") as f: + f.writelines(fake_data_25) + +def build_benchmark2(): + seq = [ + "from argparse import ArgumentParser\n", + "arg_parser = ArgumentParser(description='Parse args')\n", + "arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input model')\n", + "args = arg_parser.parse_args()\n", + "from neural_compressor.benchmark import fit\n" + "from neural_compressor.data import DATASETS\n", + "dataset = DATASETS('tensorflow')['dummy']((5, 32, 32, 1), label=True)\n", + + "from neural_compressor.experimental import common\n", + "b_dataloader = common.DataLoader(dataset)\n", + "fit(args.input_model, b_dataloader=b_dataloader)\n" + ] + + with open('fake2.py', "w", encoding="utf-8") as f: + f.writelines(seq) + + +def build_fake_model(): + graph_path = tempfile.mkstemp(suffix='.pb')[1] + try: + graph = tf.Graph() + graph_def = tf.GraphDef() + with tf.Session(graph=graph) as sess: + x = tf.placeholder(tf.float64, shape=(None, 32, 32, 1), name='x') + y_1 = tf.constant(np.random.random((3, 3, 1, 1)), name='y_1') + y_2 = tf.constant(np.random.random((3, 3, 1, 1)), name='y_2') + conv1 = tf.nn.conv2d(input=x, filter=y_1, strides=[1, 1, 1, 1], \ + padding='VALID', name='conv1') + op = tf.nn.conv2d(input=conv1, filter=y_2, strides=[1, 1, 1, 1], \ + padding='VALID', name='op_to_store') + + sess.run(tf.global_variables_initializer()) + constant_graph = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['op_to_store']) + + graph_def.ParseFromString(constant_graph.SerializeToString()) + write_graph(graph_def, graph_path) + except: + graph = tf.Graph() + graph_def = tf.compat.v1.GraphDef() + with tf.compat.v1.Session(graph=graph) as sess: + x = tf.compat.v1.placeholder(tf.float64, shape=(None, 32, 32, 1), name='x') + y_1 = tf.constant(np.random.random((3, 3, 1, 1)), name='y_1') + y_2 = tf.constant(np.random.random((3, 3, 1, 1)), name='y_2') + conv1 = tf.nn.conv2d(input=x, filters=y_1, strides=[1, 1, 1, 1], \ + padding='VALID', name='conv1') + op = tf.nn.conv2d(input=conv1, filters=y_2, strides=[1, 1, 1, 1], \ + padding='VALID', name='op_to_store') + + sess.run(tf.compat.v1.global_variables_initializer()) + constant_graph = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['op_to_store']) + + graph_def.ParseFromString(constant_graph.SerializeToString()) + write_graph(graph_def, graph_path) + return graph_path + +class 
TestObjective(unittest.TestCase): + @classmethod + def setUpClass(self): + self.graph_path = build_fake_model() + build_benchmark() + build_benchmark2() + self.cpu_counts = psutil.cpu_count(logical=False) + + @classmethod + def tearDownClass(self): + if os.path.exists('fake.py'): + os.remove('fake.py') + if os.path.exists('fake2.py'): + os.remove('fake2.py') + if os.path.exists('fake_data_5.py'): + os.remove('fake_data_5.py') + if os.path.exists('fake_data_15.py'): + os.remove('fake_data_15.py') + if os.path.exists('fake_data_25.py'): + os.remove('fake_data_25.py') + + def test_benchmark(self): + os.system("python fake.py --input_model={}".format(self.graph_path)) + for i in range(2): + with open(f'2_4_{i}.log', "r") as f: + for line in f: + throughput = re.search(r"Throughput:\s+(\d+(\.\d+)?) images/sec", line) + self.assertIsNotNone(throughput) + os.system("rm *.log") + + def test_benchmark_data_5(self): + os.system("python fake_data_5.py --input_model={}".format(self.graph_path)) + for i in range(2): + with open(f'2_4_{i}.log', "r") as f: + for line in f: + throughput = re.search(r"Throughput:\s+(\d+(\.\d+)?) images/sec", line) + self.assertIsNotNone(throughput) + os.system("rm *.log") + + def test_benchmark_data_15(self): + os.system("python fake_data_15.py --input_model={}".format(self.graph_path)) + for i in range(2): + with open(f'2_4_{i}.log', "r") as f: + for line in f: + throughput = re.search(r"Throughput:\s+(\d+(\.\d+)?) images/sec", line) + self.assertIsNotNone(throughput) + os.system("rm *.log") + + def test_benchmark_data_25(self): + os.system("python fake_data_25.py --input_model={}".format(self.graph_path)) + for i in range(2): + with open(f'2_4_{i}.log', "r") as f: + for line in f: + throughput = re.search(r"Throughput:\s+(\d+(\.\d+)?) 
images/sec", line) + self.assertIsNotNone(throughput) + os.system("rm *.log") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/distillation/test_distillation.py b/test/distillation/test_distillation.py index 4d63baf5c00..a5a993f2fdf 100644 --- a/test/distillation/test_distillation.py +++ b/test/distillation/test_distillation.py @@ -7,7 +7,7 @@ import torch.nn as nn import tensorflow as tf from neural_compressor.data import DATASETS -from neural_compressor.conf.pythonic_config import DistillationConfig, KnowledgeDistillationLossConfig +from neural_compressor.config import DistillationConfig, KnowledgeDistillationLossConfig from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader diff --git a/test/distillation/test_self_distillation.py b/test/distillation/test_self_distillation.py index 5bd29d37432..e05a40ae56e 100644 --- a/test/distillation/test_self_distillation.py +++ b/test/distillation/test_self_distillation.py @@ -5,7 +5,6 @@ import torch import torch.nn as nn import torchvision -from neural_compressor.conf.config import DistillationConf from neural_compressor.data import DATASETS from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import \ PyTorchDataLoader @@ -82,8 +81,8 @@ def tearDownClass(cls): def test_self_distillation(self): import copy from neural_compressor.training import prepare_compression - from neural_compressor.conf.pythonic_config import DistillationConfig, \ - SelfKnowledgeDistillationLossConfig + from neural_compressor.config import DistillationConfig, \ + SelfKnowledgeDistillationLossConfig datasets = DATASETS("pytorch") dummy_dataset = datasets["dummy"]( diff --git a/test/export/test_torch2onnx.py b/test/export/test_torch2onnx.py new file mode 100644 index 00000000000..01410ff0952 --- /dev/null +++ b/test/export/test_torch2onnx.py @@ -0,0 +1,227 @@ +import os +import copy +import shutil +import torch +import unittest +import numpy as np +from neural_compressor import quantization +from neural_compressor.experimental.common import Model +from neural_compressor.config import Torch2ONNXConfig +from neural_compressor.experimental.data.datasets.dataset import DATASETS +from neural_compressor import PostTrainingQuantConfig, QuantizationAwareTrainingConfig +from neural_compressor.training import prepare_compression +from neural_compressor.data import DATASETS, DATALOADERS +from transformers import AutoModelForSequenceClassification, AutoTokenizer +import torch.utils.data as data + + +def train_func_cv(compression_manager, model): + compression_manager.callbacks.on_train_begin() + optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) + model.train() + input = torch.randn(1, 3, 224, 224) + output = model(input) + loss = output[0].mean() if isinstance(output, tuple) else output.mean() + optimizer.zero_grad() + loss.backward() + optimizer.step() + compression_manager.callbacks.on_train_end() + return model + +def train_func_nlp(compression_manager, model, input): + compression_manager.callbacks.on_train_begin() + optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) + model.train() + output = model(**input) + loss = output.logits[0][0] + optimizer.zero_grad() + loss.backward() + optimizer.step() + compression_manager.callbacks.on_train_end() + return model + +def check_CV_onnx(model_path, dataloader): + import onnxruntime as ort + ort_session = ort.InferenceSession(model_path) + it = iter(dataloader) + input = next(it) + input_dict = {'input': input[0].detach().cpu().numpy()} + 
ort_session.run(None, input_dict) + return True + +def check_NLP_onnx(model_path, input): + import onnxruntime as ort + ort_session = ort.InferenceSession(model_path, None) + input_dict = {} + for k, v in input.items(): + input_dict[k] = np.array(v) + ort_session.run(None, input_dict) + return True + + +class DummyNLPDataloader(object): + def __init__(self, model_name): + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.sequence_a = "intel-extension-for-transformers is based in SH" + self.sequence_b = "Where is intel-extension-for-transformers based? NYC or SH" + self.encoded_dict = self.tokenizer(self.sequence_a, self.sequence_b, return_tensors='pt') + self.encoded_dict['labels'] = 1 + self.batch_size = 1 + + def __iter__(self): + yield self.encoded_dict + + def __next__(self): + return self.encoded_dict + +class TestPytorch2ONNX(unittest.TestCase): + @classmethod + def setUpClass(self): + from torchvision.models.quantization import resnet18 + self.cv_model = resnet18() + self.cv_dataset = DATASETS("pytorch")["dummy"]((10, 3, 224, 224)) + self.cv_dataloader = DATALOADERS["pytorch"](self.cv_dataset) + self.nlp_model = AutoModelForSequenceClassification.from_pretrained( + "distilbert-base-uncased-finetuned-sst-2-english" + ) + self.nlp_dataloader = DummyNLPDataloader( + "distilbert-base-uncased-finetuned-sst-2-english" + ) + input = next(self.nlp_dataloader) + input.pop('labels') + self.nlp_input = input + + @classmethod + def tearDownClass(self): + shutil.rmtree('runs', ignore_errors=True) + # os.remove('fp32-cv-model.onnx') + # os.remove('int8-cv-model.onnx') + # os.remove('fp32-nlp-model.onnx') + # os.remove('int8-nlp-model.onnx') + shutil.rmtree("./saved", ignore_errors=True) + + def test_fp32_CV_models(self): + model = self.cv_model + inc_model = Model(model) + fp32_onnx_config = Torch2ONNXConfig( + dtype="fp32", + example_inputs=torch.randn(1, 3, 224, 224), + input_names=['input'], + output_names=['output'], + dynamic_axes={"input": {0: "batch_size"}, + "output": {0: "batch_size"}}, + ) + inc_model.export('fp32-cv-model.onnx', fp32_onnx_config) + check_CV_onnx('fp32-cv-model.onnx', self.cv_dataloader) + + def test_int8_CV_models(self): + for fake_yaml in ["dynamic", "qat", "static"]: + model = self.cv_model + if fake_yaml == "qat": + quant_conf = QuantizationAwareTrainingConfig(backend='pytorch_fx') + compression_manager = prepare_compression(copy.deepcopy(model), quant_conf) + q_model = train_func_cv(compression_manager, compression_manager.model) + else: + if fake_yaml == "dynamic": + quant_conf = PostTrainingQuantConfig(approach="dynamic") + elif fake_yaml == "static": + quant_conf = PostTrainingQuantConfig(approach="static", backend='pytorch_fx') + q_model = quantization.fit( + model, + quant_conf, + calib_dataloader=self.cv_dataloader if fake_yaml == "static" else None) + + if fake_yaml != "dynamic": + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=torch.randn(1, 3, 224, 224), + input_names=['input'], + output_names=['output'], + dynamic_axes={"input": {0: "batch_size"}, + "output": {0: "batch_size"}}, + calib_dataloader=self.cv_dataloader, + ) + else: + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=torch.randn(1, 3, 224, 224), + input_names=['input'], + output_names=['output'], + dynamic_axes={"input": {0: "batch_size"}, + "output": {0: "batch_size"}}, + ) + q_model.export('int8-cv-model.onnx', int8_onnx_config) + 
check_CV_onnx('int8-cv-model.onnx', self.cv_dataloader) + + def test_fp32_NLP_models(self): + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + dynamic_axes = {k: symbolic_names for k in self.nlp_input.keys()} + + model = self.nlp_model + inc_model = Model(model) + fp32_onnx_config = Torch2ONNXConfig( + dtype="fp32", + example_inputs=tuple(self.nlp_input.values()), + input_names=list(self.nlp_input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + inc_model.export('fp32-nlp-model.onnx', fp32_onnx_config) + check_NLP_onnx('fp32-nlp-model.onnx', self.nlp_input) + + def test_int8_NLP_models(self): + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + dynamic_axes = {k: symbolic_names for k in self.nlp_input.keys()} + + for fake_yaml in ["dynamic", "static", "qat"]: + model = self.nlp_model + if fake_yaml == "qat": + quant_conf = QuantizationAwareTrainingConfig(backend='pytorch_fx') + compression_manager = prepare_compression(copy.deepcopy(model), quant_conf) + q_model = train_func_nlp( + compression_manager, + compression_manager.model, + self.nlp_input + ) + else: + if fake_yaml == "dynamic": + quant_conf = PostTrainingQuantConfig(approach="dynamic") + elif fake_yaml == "static": + quant_conf = PostTrainingQuantConfig(approach="static", backend='pytorch_fx') + q_model = quantization.fit( + model, + quant_conf, + calib_dataloader=self.nlp_dataloader if fake_yaml == "static" else None) + + if fake_yaml != "dynamic": + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=tuple(self.nlp_input.values()), + input_names=list(self.nlp_input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + calib_dataloader=self.nlp_dataloader, + ) + else: + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=tuple(self.nlp_input.values()), + input_names=list(self.nlp_input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + q_model.export('int8-nlp-model.onnx', int8_onnx_config) + check_NLP_onnx('int8-nlp-model.onnx', self.nlp_input) + +if __name__ == "__main__": + unittest.main() + + diff --git a/test/itex/test_tensorflow_itex_basic.py b/test/itex/test_tensorflow_itex_basic.py index 6fc3e9a518a..9d3cb1e58ef 100644 --- a/test/itex/test_tensorflow_itex_basic.py +++ b/test/itex/test_tensorflow_itex_basic.py @@ -5,13 +5,14 @@ import os import shutil import yaml +import platform import numpy as np from neural_compressor.adaptor.tf_utils.quantize_graph.quantize_graph_for_intel_cpu import QuantizeGraphForIntel from neural_compressor.adaptor.tf_utils.graph_rewriter.generic.strip_unused_nodes import StripUnusedNodesOptimizer from neural_compressor.adaptor.tf_utils.graph_rewriter.generic.fold_batch_norm import FoldBatchNormNodesOptimizer from neural_compressor.adaptor.tensorflow import TensorflowQuery from neural_compressor.adaptor.tf_utils.util import disable_random -from neural_compressor.experimental import Quantization, common +from neural_compressor.experimental import Quantization, Benchmark, common from neural_compressor.utils.utility import CpuInfo from neural_compressor.adaptor.tf_utils.util import version1_lt_version2, version1_gte_version2 @@ -217,5 +218,53 @@ def test_depthwiseconv2d_case(self): reshape_counter += 1 self.assertEqual(reshape_counter, 2) + @disable_random() + @unittest.skipIf(version1_lt_version2(tf.version.VERSION, '2.8.0') or \ + platform.system().lower() == "windows", "Only supports tf greater 2.7.0 and Linux") + 
def test_itex_benchmark_gpu(self): + x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") + top_relu = tf.nn.relu(x) + paddings = tf.constant([[0, 0], [1, 1], [1, 1], [0, 0]]) + x_pad = tf.pad(top_relu, paddings, "CONSTANT") + conv_weights = tf.compat.v1.get_variable("weight", [3, 3, 16, 16], + initializer=tf.compat.v1.random_normal_initializer()) + conv = tf.nn.conv2d(x_pad, conv_weights, strides=[1, 2, 2, 1], padding="VALID") + normed = tf.compat.v1.layers.batch_normalization(conv) + conv_weights2 = tf.compat.v1.get_variable("weight2", [3, 3, 16, 16], + initializer=tf.compat.v1.random_normal_initializer()) + conv2 = tf.nn.conv2d(top_relu, conv_weights2, strides=[1, 2, 2, 1], padding="SAME") + normed2 = tf.compat.v1.layers.batch_normalization(conv2) + add = tf.raw_ops.Add(x=normed, y=normed2, name='addv2') + relu = tf.nn.relu(add) + relu6 = tf.nn.relu6(relu, name='op_to_store') + out_name = relu6.name.split(':')[0] + with tf.compat.v1.Session() as sess: + sess.run(tf.compat.v1.global_variables_initializer()) + output_graph_def = graph_util.convert_variables_to_constants( + sess=sess, + input_graph_def=sess.graph_def, + output_node_names=[out_name]) + + quantizer = Quantization('fake_yaml_2.yaml') + dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.model = output_graph_def + output_graph = quantizer.fit() + + evaluator = Benchmark('fake_yaml_2.yaml') + evaluator.b_dataloader = common.DataLoader(dataset) + evaluator.model = output_graph + evaluator('performance') + + found_multi_instance_log = False + for file_name in os.listdir(os.getcwd()): + if file_name.endswith(".log"): + found_multi_instance_log = True + break + + self.assertEqual(found_multi_instance_log, False) + + if __name__ == '__main__': unittest.main() diff --git a/test/mixed_precision/test_mixed_precision.py b/test/mixed_precision/test_mixed_precision.py index a5d5e09bfc9..a05a3e25e5c 100644 --- a/test/mixed_precision/test_mixed_precision.py +++ b/test/mixed_precision/test_mixed_precision.py @@ -10,7 +10,7 @@ from neural_compressor import mix_precision from neural_compressor.utils.utility import LazyImport, CpuInfo from neural_compressor.adaptor.torch_utils.bf16_convert import BF16ModuleWrapper -from neural_compressor.conf.pythonic_config import MixedPrecisionConfig, Options +from neural_compressor.config import MixedPrecisionConfig, set_workspace, TuningCriterion from onnx import helper, TensorProto from packaging.version import Version from tensorflow.core.framework import attr_value_pb2 @@ -262,26 +262,26 @@ def setUpClass(self): def test_on_non_enabled_host(self): # test onnx - conf = MixedPrecisionConfig(precisions=["fp16"], backend="onnxrt_qlinearops") + conf = MixedPrecisionConfig(extra_precisions=["fp16"], backend="onnxrt_qlinearops") with self.assertRaises(SystemExit) as cm: output_model = mix_precision.fit(self.onnx_model, conf) self.assertEqual(cm.exception.code, 0) @unittest.skipIf(CpuInfo().bf16, 'skip since hardware support bf16') def test_on_non_enabled_host_tf(self): - conf = MixedPrecisionConfig(precisions=["bf16"], backend="tensorflow") + conf = MixedPrecisionConfig(extra_precisions=["bf16"], backend="tensorflow") with self.assertRaises(SystemExit) as cm: output_model = mix_precision.fit(self.tf_model, conf) self.assertEqual(cm.exception.code, 0) def test_on_non_enabled_dtype(self): # test onnx - conf = 
MixedPrecisionConfig(precisions=["bf16"], backend="onnxrt_qlinearops") + conf = MixedPrecisionConfig(extra_precisions=["bf16"], backend="onnxrt_qlinearops") with self.assertRaises(SystemExit) as cm: output_model = mix_precision.fit(self.onnx_model, conf) self.assertEqual(cm.exception.code, 0) - conf = MixedPrecisionConfig(precisions=["fp16"], backend="tensorflow") + conf = MixedPrecisionConfig(extra_precisions=["fp16"], backend="tensorflow") with self.assertRaises(SystemExit) as cm: output_model = mix_precision.fit(self.tf_model, conf) self.assertEqual(cm.exception.code, 0) @@ -310,16 +310,16 @@ def test_mixed_precision_with_evaluation(self): from neural_compressor.experimental import common from neural_compressor.experimental.metric.metric import ONNXRT_QL_METRICS # test onnx - conf = MixedPrecisionConfig(precisions=["fp16"], + conf = MixedPrecisionConfig(extra_precisions=["fp16"], backend="onnxrt_qlinearops") - options = Options(workspace="./saved") - output_model = mix_precision.fit(self.onnx_model, conf, options=options) + set_workspace("./saved") + output_model = mix_precision.fit(self.onnx_model, conf) self.assertFalse(any([i.op_type == 'Cast' for i in output_model.nodes()])) - conf = MixedPrecisionConfig(precisions=["fp16"], + tuning_criterion = TuningCriterion(max_trials=3, timeout=50) + conf = MixedPrecisionConfig(extra_precisions=["fp16"], backend="onnxrt_qlinearops", - max_trials=3, - timeout=50) + tuning_criterion=tuning_criterion) output_model = mix_precision.fit(self.onnx_model, conf, @@ -347,7 +347,7 @@ def eval2(model): from neural_compressor.experimental import MixedPrecision, common from neural_compressor import conf my_metric = Metric() - conf = MixedPrecisionConfig(precisions=["fp16"], + conf = MixedPrecisionConfig(extra_precisions=["fp16"], backend="onnxrt_qlinearops") output_model = mix_precision.fit(self.onnx_model, @@ -355,7 +355,7 @@ def eval2(model): eval_dataloader=common.DataLoader(self.matmul_dataset), eval_metric=my_metric) self.assertFalse(any([i.op_type == 'Cast' for i in output_model.nodes()])) - conf = MixedPrecisionConfig(precisions=["fp16"], + conf = MixedPrecisionConfig(extra_precisions=["fp16"], backend="onnxrt_qlinearops") output_model = mix_precision.fit(self.onnx_model, @@ -367,7 +367,7 @@ def eval2(model): conf = MixedPrecisionConfig( inputs="input", outputs="final", - precisions=["bf16", "fp32"], + extra_precisions=["bf16", "fp32"], ) output_model = mix_precision.fit( @@ -376,15 +376,15 @@ def eval2(model): eval_func=eval, ) self.assertTrue(any([i.op == 'Cast' for i in output_model.graph_def.node])) - self.assertEqual(conf.precisions, ['bf16', 'fp32']) + self.assertEqual(conf.extra_precisions, ['bf16', 'fp32']) self.assertEqual(conf.inputs, 'input') self.assertEqual(conf.outputs, 'final') + tuning_criterion = TuningCriterion(max_trials=4, timeout=500) conf = MixedPrecisionConfig( - max_trials=4, - timeout=500, - precisions=["bf16"], backend="tensorflow", + tuning_criterion=tuning_criterion, + extra_precisions=["bf16"], ) output_model = mix_precision.fit( common.Model(self.tf_model), @@ -393,12 +393,12 @@ def eval2(model): ) self.assertTrue(any([i.op == 'Cast' for i in output_model.graph_def.node])) + tuning_criterion = TuningCriterion(max_trials=1, timeout=100) conf = MixedPrecisionConfig( inputs="input", outputs="final, test", - max_trials=1, - timeout=100, - precisions=["bf16", "fp32"], + tuning_criterion=tuning_criterion, + extra_precisions=["bf16", "fp32"], ) output_model = mix_precision.fit( self.tf_model, @@ -414,7 +414,7 @@ def eval(model): 
return 0.5 conf = MixedPrecisionConfig( - precisions=["bf16"], + extra_precisions=["bf16"], backend="pytorch" ) output_model = mix_precision.fit( diff --git a/test/pruning/test_pruning.py b/test/pruning/test_pruning.py index b5b437639c0..3e1290e6bb7 100644 --- a/test/pruning/test_pruning.py +++ b/test/pruning/test_pruning.py @@ -6,7 +6,7 @@ import torchvision import torch.nn as nn -from neural_compressor.conf.pythonic_config import Pruner, PruningConfig +from neural_compressor.config import Pruner, PruningConfig from neural_compressor.data import DATASETS from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader from neural_compressor.training import prepare_compression diff --git a/test/requirements.txt b/test/requirements.txt index c570fff1dec..30712c4bafb 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -14,6 +14,7 @@ transformers<=4.12.3; python_version < '3.10' transformers==4.16.0; python_version == '3.10' tensorflow_model_optimization sigopt +hyperopt horovod tensorflow-addons onnxruntime-extensions; python_version < '3.10' diff --git a/test/strategy/test_basic.py b/test/strategy/test_basic.py index 845e9b0ccae..0a2812b5f79 100644 --- a/test/strategy/test_basic.py +++ b/test/strategy/test_basic.py @@ -155,7 +155,7 @@ def build_fake_model(): tf.import_graph_def(graph_def, name='') return graph -class TestQuantization(unittest.TestCase): +class TestBasicTuningStrategy(unittest.TestCase): @classmethod def setUpClass(self): @@ -217,6 +217,25 @@ def test_run_basic_max_trials_multimetric_weight(self): quantizer.model = self.constant_graph quantizer.fit() + + def test_run_basic_one_trial_new_api(self): + from neural_compressor.quantization import fit + from neural_compressor.config import AccuracyCriterion, AccuracyLoss, PostTrainingQuantConfig, TuningCriterion + from neural_compressor.data import DATASETS, DATALOADERS + + # dataset and dataloader + dataset = DATASETS("tensorflow")["dummy"](((100, 3, 3, 1))) + dataloader = DATALOADERS["tensorflow"](dataset) + + # tuning and accuracy criterion + tolerable_loss = AccuracyLoss(0.01) + accuracy_criterion = AccuracyCriterion(criterion='relative', tolerable_loss=tolerable_loss) + tuning_criterion = TuningCriterion(strategy='basic') + conf = PostTrainingQuantConfig(approach="static", backend="tensorflow", + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion) + q_model = fit(model=self.constant_graph, conf=conf, calib_dataloader= dataloader, eval_dataloader=dataloader) + self.assertIsNotNone(q_model) if __name__ == "__main__": unittest.main() diff --git a/test/strategy/test_optimization_level_2.x.py b/test/strategy/test_optimization_level_2.x.py new file mode 100644 index 00000000000..b599c07bf2a --- /dev/null +++ b/test/strategy/test_optimization_level_2.x.py @@ -0,0 +1,151 @@ +"""Tests for optimization level & conservative strategy""" + +import shutil +import unittest +import time + +import numpy as np + +from neural_compressor.utils import logger + +def build_fake_model(): + import tensorflow as tf + try: + graph = tf.Graph() + graph_def = tf.compat.v1.GraphDef() + with tf.compat.v1.Session() as sess: + x = tf.compat.v1.placeholder(tf.float32, shape=(1,3,3,1), name='x') + y = tf.constant(np.random.random((2,2,1,1)).astype(np.float32), name='y') + z = tf.constant(np.random.random((1,1,1,1)).astype(np.float32), name='z') + op = tf.nn.conv2d(input=x, filters=y, strides=[1,1,1,1], padding='VALID', name='op_to_store') + op2 = tf.nn.conv2d(input=op, filters=z, 
strides=[1,1,1,1], padding='VALID', ) + last_identity = tf.identity(op2, name='op2_to_store') + sess.run(tf.compat.v1.global_variables_initializer()) + constant_graph = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['op2_to_store']) + + graph_def.ParseFromString(constant_graph.SerializeToString()) + with graph.as_default(): + tf.import_graph_def(graph_def, name='') + except: + graph = tf.Graph() + graph_def = tf.compat.v1.GraphDef() + with tf.compat.v1.Session() as sess: + x = tf.compat.v1.placeholder(tf.float32, shape=(1,3,3,1), name='x') + y = tf.constant(np.random.random((2,2,1,1)).astype(np.float32), name='y') + z = tf.constant(np.random.random((1,1,1,1)).astype(np.float32), name='z') + op = tf.nn.conv2d(input=x, filters=y, strides=[1,1,1,1], padding='VALID', name='op_to_store') + op2 = tf.nn.conv2d(input=op, filters=z, strides=[1,1,1,1], padding='VALID') + last_identity = tf.identity(op2, name='op2_to_store') + + sess.run(tf.compat.v1.global_variables_initializer()) + constant_graph = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['op2_to_store']) + + graph_def.ParseFromString(constant_graph.SerializeToString()) + with graph.as_default(): + tf.import_graph_def(graph_def, name='') + return graph + + +class TestOptimizationLevel(unittest.TestCase): + + @classmethod + def setUpClass(self): + self.constant_graph = build_fake_model() + + @classmethod + def tearDownClass(self): + shutil.rmtree('saved', ignore_errors=True) + shutil.rmtree('nc_workspace', ignore_errors=True) + + def test_tf_opt_level_0(self): + logger.info("*** Test: optimization level 0 with tensorflow model.") + from neural_compressor.quantization import fit + from neural_compressor.config import PostTrainingQuantConfig + from neural_compressor.data import DATASETS, DATALOADERS + + # fake evaluation function + def _fake_eval(model): + return 1 + + # dataset and dataloader + dataset = DATASETS("tensorflow")["dummy"](((100, 3, 3, 1))) + dataloader = DATALOADERS["tensorflow"](dataset) + + # tuning and accuracy criterion + optimization_level = 0 + conf = PostTrainingQuantConfig(approach="static", backend="tensorflow", optimization_level=0) + + # fit + q_model = fit(model=self.constant_graph, + conf=conf, + calib_dataloader= dataloader, + eval_dataloader=dataloader, + eval_func=_fake_eval) + self.assertIsNotNone(q_model) + + def test_tf_opt_level_1(self): + logger.info("*** Test: optimization level 1 with tensorflow model.") + from neural_compressor.quantization import fit + from neural_compressor.config import PostTrainingQuantConfig + from neural_compressor.data import DATASETS, DATALOADERS + + # fake evaluation function + self._fake_acc = 10 + def _fake_eval(model): + self._fake_acc -= 1 + return self._fake_acc + + # dataset and dataloader + dataset = DATASETS("tensorflow")["dummy"](((100, 3, 3, 1))) + dataloader = DATALOADERS["tensorflow"](dataset) + + # tuning and accuracy criterion + optimization_level = 1 + conf = PostTrainingQuantConfig(approach="static", backend="tensorflow", optimization_level=optimization_level) + + # fit + q_model = fit(model=self.constant_graph, + conf=conf, + calib_dataloader= dataloader, + eval_dataloader=dataloader, + eval_func=_fake_eval) + self.assertIsNone(q_model) + + def test_pt_opt_level_0(self): + logger.info("*** Test: optimization level 0 with pytorch model.") + from neural_compressor.quantization import fit + from neural_compressor.config import PostTrainingQuantConfig + from neural_compressor.data import DATASETS, 
DATALOADERS + import torchvision + + # model + resnet18 = torchvision.models.resnet18() + + # fake evaluation function + acc_lst = [2.0, 1.0, 2.1, 2.2, 2.3] + perf_lst = [2.0, 1.5, 1.0, 0.5, 0.1] + self.test_pt_opt_level_0_index = -1 + def _fake_eval(model): + self.test_pt_opt_level_0_index += 1 + perf = perf_lst[self.test_pt_opt_level_0_index] + time.sleep(perf) + return acc_lst[self.test_pt_opt_level_0_index] + + # dataset and dataloader + dataset = DATASETS("pytorch")["dummy"](((100, 3, 3, 1))) + dataloader = DATALOADERS["pytorch"](dataset) + + # tuning and accuracy criterion + optimization_level = 0 + conf = PostTrainingQuantConfig(approach="static", backend="pytorch", optimization_level=optimization_level) + + # fit + q_model = fit(model=resnet18, + conf=conf, + calib_dataloader= dataloader, + eval_dataloader=dataloader, + eval_func=_fake_eval) + self.assertIsNotNone(q_model) + +if __name__ == "__main__": + unittest.main() diff --git a/test/strategy/test_sigopt.py b/test/strategy/test_sigopt.py index ce7a7669862..5d443e3dba2 100644 --- a/test/strategy/test_sigopt.py +++ b/test/strategy/test_sigopt.py @@ -104,7 +104,7 @@ def build_fake_model(): return graph @unittest.skipIf(CONDITION , "missing the env variables 'SIGOPT_API_TOKEN' or 'SIGOPT_PROJECT_ID'") -class TestQuantization(unittest.TestCase): +class TestSigoptTuningStrategy(unittest.TestCase): @classmethod def setUpClass(self): @@ -140,6 +140,29 @@ def test_run_basic_max_trials(self): quantizer.eval_dataloader = common.DataLoader(dataset) quantizer.model = self.constant_graph quantizer.fit() + + def test_run_sigopt_one_trial_new_api(self): + from neural_compressor.quantization import fit + from neural_compressor.config import AccuracyCriterion, AccuracyLoss, PostTrainingQuantConfig, TuningCriterion + from neural_compressor.data import DATASETS, DATALOADERS + + # dataset and dataloader + dataset = DATASETS("tensorflow")["dummy"](((100, 3, 3, 1))) + dataloader = DATALOADERS["tensorflow"](dataset) + + # tuning and accuracy criterion + tolerable_loss = AccuracyLoss(0.01) + accuracy_criterion = AccuracyCriterion(criterion='relative', tolerable_loss=tolerable_loss) + strategy_kwargs = {'sigopt_api_token': 'sigopt_api_token_test', + 'sigopt_project_id': 'sigopt_project_id_test', + 'sigopt_experiment_name': 'nc-tune'} + tuning_criterion = TuningCriterion(strategy='sigopt', strategy_kwargs=strategy_kwargs, max_trials=3) + conf = PostTrainingQuantConfig(approach="static", backend="tensorflow", + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion) + q_model = fit(model=self.constant_graph, conf=conf, calib_dataloader= dataloader, eval_dataloader=dataloader) + self.assertIsNotNone(q_model) + if __name__ == "__main__": unittest.main() diff --git a/test/tfnewapi/test_tensorflow_graph_qdq_bn_fusion.py b/test/tfnewapi/test_tensorflow_graph_qdq_bn_fusion.py index acfbd049072..d99a48c1803 100644 --- a/test/tfnewapi/test_tensorflow_graph_qdq_bn_fusion.py +++ b/test/tfnewapi/test_tensorflow_graph_qdq_bn_fusion.py @@ -12,6 +12,8 @@ from tensorflow.python.framework import dtypes from neural_compressor.adaptor.tf_utils.util import disable_random from neural_compressor.utils.utility import CpuInfo +from neural_compressor.experimental import Quantization, common +from neural_compressor.utils import logger def build_fake_yaml_1(): fake_yaml_1 = ''' @@ -91,7 +93,7 @@ def tearDownClass(self): @disable_random() def test_bn_relu_depthwiseconv_biasadd_relu6_fusion(self): - 
logging.getLogger().info("test_depthwiseconv_biasadd_relu_fusion") + logger.info("test_bn_relu_depthwiseconv_biasadd_relu6_fusion") x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") conv_weights = tf.compat.v1.get_variable("weight", [3, 3, 16, 16], initializer=tf.compat.v1.random_normal_initializer()) @@ -107,7 +109,7 @@ def test_bn_relu_depthwiseconv_biasadd_relu6_fusion(self): sess=sess, input_graph_def=sess.graph_def, output_node_names=[out_name]) - from neural_compressor.experimental import Quantization, common + quantizer = Quantization('fake_yaml_1.yaml') dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) quantizer.eval_dataloader = common.DataLoader(dataset) @@ -137,7 +139,7 @@ def test_bn_relu_depthwiseconv_biasadd_relu6_fusion(self): @disable_random() def test_training_bn_relu_depthwiseconv_biasadd_relu6_fusion(self): - logging.getLogger().info("test_depthwiseconv_biasadd_relu_fusion") + logger.info("test_training_bn_relu_depthwiseconv_biasadd_relu6_fusion") x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") conv_weights = tf.compat.v1.get_variable("weight", [3, 3, 16, 16], initializer=tf.compat.v1.random_normal_initializer()) @@ -153,7 +155,7 @@ def test_training_bn_relu_depthwiseconv_biasadd_relu6_fusion(self): sess=sess, input_graph_def=sess.graph_def, output_node_names=[out_name]) - from neural_compressor.experimental import Quantization, common + quantizer = Quantization('fake_yaml_1.yaml') dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) quantizer.eval_dataloader = common.DataLoader(dataset) @@ -177,9 +179,68 @@ def test_training_bn_relu_depthwiseconv_biasadd_relu6_fusion(self): if bf16_enabled: self.assertEqual(bf16_bn_num, 1) + @disable_random() + def test_bn_leakyrelu_conv_biasadd_relu(self): + logger.info("test_bn_leakyrelu_conv_biasadd_relu") + x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") + conv_weights = tf.compat.v1.get_variable("weight", [3, 3, 16, 16], + initializer=tf.compat.v1.random_normal_initializer()) + normed_0 = tf.compat.v1.layers.batch_normalization(x) + leaky_relu = tf.nn.leaky_relu(normed_0, alpha=0.3, name='op_to_store_0') + conv = tf.nn.conv2d(leaky_relu, conv_weights, strides=[1, 2, 2, 1], padding="VALID") + normed_1 = tf.compat.v1.layers.batch_normalization(conv) + relu = tf.nn.relu(normed_1, name='op_to_store_1') + out_name = relu.name.split(':')[0] + with tf.compat.v1.Session() as sess: + sess.run(tf.compat.v1.global_variables_initializer()) + output_graph_def = graph_util.convert_variables_to_constants( + sess=sess, + input_graph_def=sess.graph_def, + output_node_names=[out_name]) + + quantizer = Quantization('fake_yaml_1.yaml') + dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.calib_dataloader = common.DataLoader(dataset) + + quantizer.model = output_graph_def + output_graph = quantizer.fit() + conv_input_type = True + found_fusion = True + qbn_num = 0 + dq_num = 0 + qbn_output_max_name = 'batch_normalization/FusedBatchNormV3_eightbit_quantized_bn/frozen_bn_output_max' + for i in output_graph.graph_def.node: + if i.op == '_FusedQuantizedConv2D' \ + and i.attr['Thost_inputs'].list.type != [11, 11, 1, 1, 1, 1, 1, 1, 1]: + conv_input_type = False + break + if i.op in ['Relu', 'LeakyRelu', 'FusedBatchNormV3']: + found_fusion = False + break + if i.op == '_QuantizedFusedBatchNorm': + is_offset_const = i.attr["is_offset_const"].b + 
is_mean_const = i.attr["is_mean_const"].b + qbn_alpha = i.attr["alpha"].f + frozen_qbn_output_max = i.input[8] + qbn_num += 1 + if i.name == qbn_output_max_name: + frozen_qbn_output_max_value = i.attr["value"].tensor.float_val[0] + if i.op == 'Dequantize': + dq_num += 1 + self.assertEqual(conv_input_type, True) + self.assertEqual(found_fusion, True) + self.assertEqual(qbn_num, 1) + self.assertEqual(dq_num, 1) + self.assertEqual(is_offset_const, True) + self.assertEqual(is_mean_const, True) + self.assertEqual(round(qbn_alpha, 7), 0.3) + self.assertEqual(frozen_qbn_output_max, qbn_output_max_name) + self.assertGreater(frozen_qbn_output_max_value, 126) + @disable_random() def test_bn_relu_conv_biasadd_relu(self): - logging.getLogger().info("test_conv_biasadd_relu_fusion") + logger.info("test_bn_relu_conv_biasadd_relu") x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") conv_weights = tf.compat.v1.get_variable("weight", [3, 3, 16, 16], initializer=tf.compat.v1.random_normal_initializer()) @@ -195,7 +256,7 @@ def test_bn_relu_conv_biasadd_relu(self): sess=sess, input_graph_def=sess.graph_def, output_node_names=[out_name]) - from neural_compressor.experimental import Quantization, common + quantizer = Quantization('fake_yaml_1.yaml') dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) quantizer.eval_dataloader = common.DataLoader(dataset) @@ -236,7 +297,7 @@ def test_bn_relu_conv_biasadd_relu(self): @disable_random() def test_bn_performance_only_false(self): - logging.getLogger().info("test_conv_biasadd_relu_fusion") + logger.info("test_bn_performance_only_false") x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") conv_weights = tf.compat.v1.get_variable("weight", [3, 3, 16, 16], initializer=tf.compat.v1.random_normal_initializer()) @@ -252,7 +313,7 @@ def test_bn_performance_only_false(self): sess=sess, input_graph_def=sess.graph_def, output_node_names=[out_name]) - from neural_compressor.experimental import Quantization, common + quantizer = Quantization('fake_yaml_2.yaml') dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) quantizer.eval_dataloader = common.DataLoader(dataset) @@ -281,7 +342,7 @@ def test_bn_performance_only_false(self): @disable_random() def test_bnex_performance_only_false(self): - logging.getLogger().info("test_conv_biasadd_relu_fusion") + logger.info("test_bnex_performance_only_false") x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") conv_weights_0 = tf.compat.v1.get_variable("weight_0", [3, 3, 16, 16], initializer=tf.compat.v1.random_normal_initializer()) @@ -312,7 +373,7 @@ def test_bnex_performance_only_false(self): if node.name == "batch_normalization_1/FusedBatchNormV3": node.op = "_FusedBatchNormEx" node.attr["activation_mode"].CopyFrom(attr_value_pb2.AttrValue(s=b"Relu")) - from neural_compressor.experimental import Quantization, common + quantizer = Quantization('fake_yaml_2.yaml') dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) quantizer.eval_dataloader = common.DataLoader(dataset)