diff --git a/.azure-pipelines/code-scan.yml b/.azure-pipelines/code-scan.yml index 3c8a8c8daae..27d742b4d5b 100644 --- a/.azure-pipelines/code-scan.yml +++ b/.azure-pipelines/code-scan.yml @@ -9,6 +9,7 @@ pr: paths: include: - neural_compressor + - setup.py pool: ICX-16C @@ -16,6 +17,18 @@ variables: CODE_SCAN_LOG_PATH: ".azure-pipelines/scripts/codeScan/scanLog" stages: + - stage: DocStyleCodeScan + displayName: DocStyle Code Scan + dependsOn: [] + jobs: + - job: DocStyle + displayName: DocStyle + steps: + - template: template/code-scan-template.yml + parameters: + codeScanFileName: "pydocstyle" + uploadPath: "pydocstyle.log" + - stage: BanditCodeScan displayName: Bandit Code Scan dependsOn: [] @@ -26,7 +39,7 @@ stages: - template: template/code-scan-template.yml parameters: codeScanFileName: "bandit" - uploadPath: "lpot-bandit.log" + uploadPath: "bandit.log" - stage: PylintCodeScan displayName: Pylint Code Scan @@ -38,7 +51,7 @@ stages: - template: template/code-scan-template.yml parameters: codeScanFileName: "pylint" - uploadPath: "lpot-pylint.json" + uploadPath: "pylint.json" - stage: CopyRight displayName: CopyRight Code Scan diff --git a/.azure-pipelines/docker/DockerfileCodeScan.devel b/.azure-pipelines/docker/DockerfileCodeScan.devel index 93321aa0f14..8c33984f23d 100644 --- a/.azure-pipelines/docker/DockerfileCodeScan.devel +++ b/.azure-pipelines/docker/DockerfileCodeScan.devel @@ -38,6 +38,7 @@ RUN python -m pip install --no-cache-dir pylint==2.12.1\ tf_slim\ transformers\ horovod\ - flask==2.1.3 + flask==2.1.3 \ + pydocstyle WORKDIR / diff --git a/.azure-pipelines/model-test.yml b/.azure-pipelines/model-test.yml index 4f4ce12c680..8ff2c10cc50 100644 --- a/.azure-pipelines/model-test.yml +++ b/.azure-pipelines/model-test.yml @@ -9,6 +9,7 @@ pr: paths: include: - neural_compressor + - setup.py exclude: - neural_compressor/ux diff --git a/.azure-pipelines/scripts/codeScan/bandit/bandit.sh b/.azure-pipelines/scripts/codeScan/bandit/bandit.sh index a23f2f3000d..b8238ef5f92 100644 --- a/.azure-pipelines/scripts/codeScan/bandit/bandit.sh +++ b/.azure-pipelines/scripts/codeScan/bandit/bandit.sh @@ -1,17 +1,21 @@ #!/bin/bash source /neural-compressor/.azure-pipelines/scripts/change_color.sh -mkdir -p /neural-compressor/.azure-pipelines/scripts/codeScan/scanLog -bandit_log_dir="/neural-compressor/.azure-pipelines/scripts/codeScan/scanLog" +RESET="echo -en \\E[0m \\n" # close color -python -m bandit -r -lll -iii /neural-compressor/neural_compressor > $bandit_log_dir/lpot-bandit.log +log_dir="/neural-compressor/.azure-pipelines/scripts/codeScan/scanLog" +mkdir -p $log_dir + +python -m bandit -r -lll -iii /neural-compressor/neural_compressor > $log_dir/bandit.log exit_code=$? 
-# code-scan close -RESET="echo -en \\E[0m \\n" + +$BOLD_YELLOW && echo " ----------------- Current bandit cmd start --------------------------" && $RESET +echo "python -m bandit -r -lll -iii /neural-compressor/neural_compressor > $log_dir/bandit.log" +$BOLD_YELLOW && echo " ----------------- Current bandit cmd end --------------------------" && $RESET $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" -cat $bandit_log_dir/lpot-bandit.log +cat $log_dir/bandit.log $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET diff --git a/.azure-pipelines/scripts/codeScan/pydocstyle/pydocstyle.sh b/.azure-pipelines/scripts/codeScan/pydocstyle/pydocstyle.sh new file mode 100644 index 00000000000..8b8a09939e8 --- /dev/null +++ b/.azure-pipelines/scripts/codeScan/pydocstyle/pydocstyle.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +source /neural-compressor/.azure-pipelines/scripts/change_color.sh +RESET="echo -en \\E[0m \\n" # close color + +log_dir="/neural-compressor/.azure-pipelines/scripts/codeScan/scanLog" +mkdir -p $log_dir + +pydocstyle --convention=google /neural-compressor/neural_compressor/experimental > $log_dir/pydocstyle.log +exit_code=$? + + +$BOLD_YELLOW && echo " ----------------- Current pydocstyle cmd start --------------------------" && $RESET +echo "python pydocstyle --convention=google /neural-compressor/neural_compressor/experimental > $log_dir/pydocstyle.log" +$BOLD_YELLOW && echo " ----------------- Current pydocstyle cmd end --------------------------" && $RESET + +$BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" +cat $log_dir/pydocstyle.log +$BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET + + +if [ ${exit_code} -ne 0 ] ; then + $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view DocStyle error details." && $RESET; exit 1 +fi +$BOLD_PURPLE && echo "Congratulations, DocStyle check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET; exit 0 \ No newline at end of file diff --git a/.azure-pipelines/scripts/codeScan/pylint/pylint.sh b/.azure-pipelines/scripts/codeScan/pylint/pylint.sh index de55395e5ce..b15da8c91b3 100644 --- a/.azure-pipelines/scripts/codeScan/pylint/pylint.sh +++ b/.azure-pipelines/scripts/codeScan/pylint/pylint.sh @@ -1,24 +1,26 @@ #!/bin/bash source /neural-compressor/.azure-pipelines/scripts/change_color.sh -mkdir -p /neural-compressor/.azure-pipelines/scripts/codeScan/scanLog -pylint_log_dir="/neural-compressor/.azure-pipelines/scripts/codeScan/scanLog" +RESET="echo -en \\E[0m \\n" # close color + +log_dir="/neural-compressor/.azure-pipelines/scripts/codeScan/scanLog" +mkdir -p $log_dir pip install -r /neural-compressor/requirements.txt pip install torch==1.12.0 -python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,intel_extension_for_pytorch /neural-compressor/neural_compressor > $pylint_log_dir/lpot-pylint.json -exit_code=$? 
+python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto \ +--ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,intel_extension_for_pytorch /neural-compressor/neural_compressor \ +> $log_dir/pylint.json -# code-scan close -RESET="echo -en \\E[0m \\n" +exit_code=$? $BOLD_YELLOW && echo " ----------------- Current pylint cmd start --------------------------" && $RESET -echo "python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,intel_extension_for_pytorch /neural-compressor/neural_compressor > $pylint_log_dir/lpot-pylint.json" +echo "python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,intel_extension_for_pytorch /neural-compressor/neural_compressor > $log_dir/pylint.json" $BOLD_YELLOW && echo " ----------------- Current pylint cmd end --------------------------" && $RESET $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" && $RESET -cat $pylint_log_dir/lpot-pylint.json +cat $log_dir/pylint.json $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET diff --git a/.azure-pipelines/scripts/codeScan/pyspelling/lpot_dict.txt b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt similarity index 99% rename from .azure-pipelines/scripts/codeScan/pyspelling/lpot_dict.txt rename to .azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt index ec5c0321b29..4601c3ab69e 100644 --- a/.azure-pipelines/scripts/codeScan/pyspelling/lpot_dict.txt +++ b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt @@ -696,6 +696,7 @@ Goyal gpg GPG gpt +GPTJ gpu gpus GPUs @@ -2378,3 +2379,9 @@ constfold grappler amsgrad qoperator +apis +CPz +Nsh +UmK +fe +vmware diff --git a/.azure-pipelines/scripts/codeScan/pyspelling/pyspelling_conf.yaml b/.azure-pipelines/scripts/codeScan/pyspelling/pyspelling_conf.yaml index 07fe82c07f3..3cf19530020 100644 --- a/.azure-pipelines/scripts/codeScan/pyspelling/pyspelling_conf.yaml +++ b/.azure-pipelines/scripts/codeScan/pyspelling/pyspelling_conf.yaml @@ -4,10 +4,10 @@ matrix: d: en_US.ISO8859-15 dictionary: wordlists: - - ${DICT_DIR}/lpot_dict.txt - output: ${DICT_DIR}/lpot_dict.dic + - ${DICT_DIR}/inc_dict.txt + output: ${DICT_DIR}/inc_dict.dic sources: - - ${REPO_DIR}/docs/* + - ${REPO_DIR}/docs/source/*.md - ${REPO_DIR}/*.md - ${REPO_DIR}/examples/**/*.md|!${REPO_DIR}/examples/pytorch/**/huggingface_models/**/*.md - ${REPO_DIR}/neural_compressor/**/*.md diff --git a/.azure-pipelines/scripts/models/generate_report.sh b/.azure-pipelines/scripts/models/generate_report.sh index 568799ebbc1..9271008d2e8 100644 --- a/.azure-pipelines/scripts/models/generate_report.sh +++ b/.azure-pipelines/scripts/models/generate_report.sh @@ -237,6 +237,7 @@ function generate_html_core { status_png = "background-color:#90EE90"; } else { status_png = "background-color:#FFD2D2"; + job_status = "fail" } printf("
-### Quantization with [GUI](./docs/bench.md)
+### Quantization with [GUI](./docs/source/bench.md)
```shell
# An ONNX Example
pip install onnx==1.12.0 onnxruntime==1.12.1 onnxruntime-extensions
@@ -80,8 +80,8 @@ wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/
# Start GUI
inc_bench
```
-
-
+
+
## System Requirements
@@ -98,7 +98,7 @@ inc_bench
#### Intel® Neural Compressor quantized ONNX models support multiple hardware vendors through ONNX Runtime:
-* Intel CPU, AMD/ARM CPU, and NVidia GPU. Please refer to the validated model [list](./docs/validated_model_list.md#Validated-ONNX-QDQ-INT8-models-on-multiple-hardware-through-ONNX-Runtime).
+* Intel CPU, AMD/ARM CPU, and NVidia GPU. Please refer to the validated model [list](./docs/source/validated_model_list.md#Validated-ONNX-QDQ-INT8-models-on-multiple-hardware-through-ONNX-Runtime).
### Validated Software Environment
@@ -146,11 +146,11 @@ inc_bench
> Set the environment variable ``TF_ENABLE_ONEDNN_OPTS=1`` to enable oneDNN optimizations if you are using TensorFlow v2.6 to v2.8. oneDNN is the default for TensorFlow v2.9.
### Validated Models
-Intel® Neural Compressor validated 420+ [examples](./examples) for quantization with a performance speedup geomean of 2.2x and up to 4.2x on VNNI while minimizing accuracy loss. Over 30 pruning and knowledge distillation samples are also available. More details for validated models are available [here](docs/validated_model_list.md).
+Intel® Neural Compressor validated 420+ [examples](./examples) for quantization with a performance speedup geomean of 2.2x and up to 4.2x on VNNI while minimizing accuracy loss. Over 30 pruning and knowledge distillation samples are also available. More details for validated models are available [here](./docs/source/validated_model_list.md).
@@ -164,10 +164,10 @@ Intel® Neural Compressor validated 420+ [examples](./examples) for quantization
-
-
-## Workflow
-
-
-
-
diff --git a/docs/doclist.rst b/docs/doclist.rst
deleted file mode 100644
index d5be5857470..00000000000
--- a/docs/doclist.rst
+++ /dev/null
@@ -1,68 +0,0 @@
-Developer Documentation
-#######################
-
-Read the following material as you learn how to use Neural Compressor.
-
-Get Started
-===========
-
-* `Transform
-| Platform | OS | Python | Framework | Version |
-|---|---|---|---|---|
-| Cascade Lake, Cooper Lake, Skylake, Ice Lake | CentOS 8.3, Ubuntu 18.04 | 3.6, 3.7, 3.8, 3.9 | TensorFlow | 2.5.0, 2.4.0, 2.3.0, 2.2.0, 2.1.0, 1.15.0 UP1, 1.15.0 UP2, 1.15.0 UP3, 1.15.2 |
-| | | | PyTorch | 1.5.0+cpu, 1.6.0+cpu, 1.8.0+cpu |
-| | | | IPEX | |
-| | | | MXNet | 1.7.0, 1.6.0 |
-| | | | ONNX Runtime | 1.6.0, 1.7.0, 1.8.0 |
-| Framework | Version | Model | Dataset | INT8 Tuning Accuracy | FP32 Accuracy Baseline | Acc Ratio [(INT8-FP32)/FP32] | Realtime Latency Ratio [FP32/INT8] |
-|---|---|---|---|---|---|---|---|
-| tensorflow | 2.4.0 | resnet50v1.5 | ImageNet | 76.70% | 76.50% | 0.26% | 3.23x |
-| tensorflow | 2.4.0 | Resnet101 | ImageNet | 77.20% | 76.40% | 1.05% | 2.42x |
-| tensorflow | 2.4.0 | inception_v1 | ImageNet | 70.10% | 69.70% | 0.57% | 1.88x |
-| tensorflow | 2.4.0 | inception_v2 | ImageNet | 74.10% | 74.00% | 0.14% | 1.96x |
-| tensorflow | 2.4.0 | inception_v3 | ImageNet | 77.20% | 76.70% | 0.65% | 2.36x |
-| tensorflow | 2.4.0 | inception_v4 | ImageNet | 80.00% | 80.30% | -0.37% | 2.59x |
-| tensorflow | 2.4.0 | inception_resnet_v2 | ImageNet | 80.10% | 80.40% | -0.37% | 1.97x |
-| tensorflow | 2.4.0 | Mobilenetv1 | ImageNet | 71.10% | 71.00% | 0.14% | 2.88x |
-| tensorflow | 2.4.0 | ssd_resnet50_v1 | Coco | 37.90% | 38.00% | -0.26% | 2.97x |
-| tensorflow | 2.4.0 | mask_rcnn_inception_v2 | Coco | 28.90% | 29.10% | -0.69% | 2.66x |
-| tensorflow | 2.4.0 | vgg16 | ImageNet | 72.50% | 70.90% | 2.26% | 3.75x |
-| tensorflow | 2.4.0 | vgg19 | ImageNet | 72.40% | 71.00% | 1.97% | 3.79x |
-| Framework | Version | Model | Dataset | INT8 Tuning Accuracy | FP32 Accuracy Baseline | Acc Ratio [(INT8-FP32)/FP32] | Realtime Latency Ratio [FP32/INT8] |
-|---|---|---|---|---|---|---|---|
-| pytorch | 1.5.0+cpu | resnet50 | ImageNet | 75.96% | 76.13% | -0.23% | 2.63x |
-| pytorch | 1.5.0+cpu | resnext101_32x8d | ImageNet | 79.12% | 79.31% | -0.24% | 2.61x |
-| pytorch | 1.6.0a0+24aac32 | bert_base_mrpc | MRPC | 88.90% | 88.73% | 0.19% | 1.98x |
-| pytorch | 1.6.0a0+24aac32 | bert_base_cola | COLA | 59.06% | 58.84% | 0.37% | 2.19x |
-| pytorch | 1.6.0a0+24aac32 | bert_base_sts-b | STS-B | 88.40% | 89.27% | -0.97% | 2.28x |
-| pytorch | 1.6.0a0+24aac32 | bert_base_sst-2 | SST-2 | 91.51% | 91.86% | -0.37% | 2.30x |
-| pytorch | 1.6.0a0+24aac32 | bert_base_rte | RTE | 69.31% | 69.68% | -0.52% | 2.15x |
-| pytorch | 1.6.0a0+24aac32 | bert_large_mrpc | MRPC | 87.45% | 88.33% | -0.99% | 2.73x |
-| pytorch | 1.6.0a0+24aac32 | bert_large_squad | SQUAD | 92.85% | 93.05% | -0.21% | 2.01x |
-| pytorch | 1.6.0a0+24aac32 | bert_large_qnli | QNLI | 91.20% | 91.82% | -0.68% | 2.69x |
-| Model | INT8 Accuracy | FP32 Accuracy | Acc Ratio [(INT8-FP32)/FP32] | INT8 Performance | FP32 Performance | Performance Ratio [INT8/FP32] |
-|---|---|---|---|---|---|---|
-| bert_large_squad_static | 90.78% | 90.87% | -0.11% | 49.08 | 13.48 | 3.64x |
-| bert_base_mrpc_static | 82.35% | 83.09% | -0.89% | 497.28 | 151.16 | 3.29x |
-| bert_base_nli_mean_tokens_stsb_static | 89.23% | 89.55% | -0.36% | 546.97 | 151.77 | 3.60x |
-| bert_base_sparse_mrpc_static | 70.59% | 70.59% | 0.00% | 551.90 | 153.80 | 3.59x |
-| bert_mini_mrpc_static | 78.19% | 78.68% | -0.62% | 6962.58 | 3252.14 | 2.14x |
-| bert_mini_sst2_static | 87.16% | 86.93% | 0.26% | 6850.38 | 3218.98 | 2.13x |
-| distilbert_base_uncased_sst2_static | 90.14% | 90.25% | -0.12% | 1086.13 | 306.45 | 3.54x |
-| distilbert_base_uncased_mrpc_static | 83.82% | 84.07% | -0.30% | 1091.99 | 303.92 | 3.59x |
-| distilbert_base_uncased_emotion_static | 93.90% | 94.20% | -0.32% | 1081.35 | 306.33 | 3.53x |
-| minilm_l6_h384_uncased_sst2_static | 89.33% | 90.14% | -0.90% | 2594.77 | 1083.84 | 2.39x |
-| roberta_base_mrpc_static | 88.24% | 88.97% | -0.82% | 508.14 | 153.37 | 3.31x |
-| distilroberta_base_wnli_static | 56.34% | 56.34% | 0.00% | 1097.22 | 315.94 | 3.47x |
-| paraphrase_xlm_r_multilingual_v1_stsb_static | 86.66% | 87.23% | -0.65% | 552.44 | 153.74 | 3.59x |
-| finbert_financial_phrasebank_static | 82.57% | 82.80% | -0.28% | 999.94 | 292.55 | 3.42x |
+
## Usage
diff --git a/docs/source/SECURITY.md b/docs/source/SECURITY.md
new file mode 100644
index 00000000000..71a71eff1b6
--- /dev/null
+++ b/docs/source/SECURITY.md
@@ -0,0 +1,13 @@
+Security Policy
+===============
+
+## Report a Vulnerability
+
+Please report security issues or vulnerabilities to the [Intel® Security Center].
+
+For more information on how Intel® works to resolve security issues, see
+[Vulnerability Handling Guidelines].
+
+[Intel® Security Center]:https://www.intel.com/security
+
+[Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html
diff --git a/docs/source/Welcome.md b/docs/source/Welcome.md
new file mode 100644
index 00000000000..35d9e3841a9
--- /dev/null
+++ b/docs/source/Welcome.md
@@ -0,0 +1,249 @@
+
+
+
+### Quantization with [GUI](./bench.html)
+```shell
+# An ONNX Example
+pip install onnx==1.12.0 onnxruntime==1.12.1 onnxruntime-extensions
+# Prepare fp32 model
+wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v1-12.onnx
+# Start GUI
+inc_bench
+```
+
+
+
+
+## System Requirements
+
+### Validated Hardware Environment
+#### Intel® Neural Compressor supports CPUs based on [Intel 64 architecture or compatible processors](https://en.wikipedia.org/wiki/X86-64):
+
+* Intel Xeon Scalable processor (formerly Skylake, Cascade Lake, Cooper Lake, and Icelake)
+* Future Intel Xeon Scalable processor (code name Sapphire Rapids)
+
+#### Intel® Neural Compressor supports GPUs built on Intel's Xe architecture:
+
+* [Intel® Data Center GPU Flex Series](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/data-center-gpu/flex-series/overview.html)
+
+#### Intel® Neural Compressor quantized ONNX models support multiple hardware vendors through ONNX Runtime:
+
+* Intel CPU, AMD/ARM CPU, and NVidia GPU. Please refer to the validated model [list](./validated_model_list.html#Validated-ONNX-QDQ-INT8-models-on-multiple-hardware-through-ONNX-Runtime).
+
+### Validated Software Environment
+
+* OS version: CentOS 8.4, Ubuntu 20.04
+* Python version: 3.7, 3.8, 3.9, 3.10
+
+| Framework | TensorFlow | Intel TensorFlow | PyTorch | Intel® Extension for PyTorch* | ONNX Runtime | MXNet |
+|---|---|---|---|---|---|---|
+| Version | 2.10.0, 2.9.1, 2.8.2 | 2.10.0, 2.9.1, 2.8.0 | 1.12.1+cpu, 1.11.0+cpu, 1.10.0+cpu | 1.12.0, 1.11.0, 1.10.0 | 1.12.1, 1.11.0, 1.10.0 | 1.8.0, 1.7.0, 1.6.0 |
+
+
+| Section | Contents |
+|---|---|
+| Overview | Architecture, Examples, GUI, APIs, Intel oneAPI AI Analytics Toolkit, AI and Analytics Samples |
+| Basic API | Transform, Dataset, Metric, Objective |
+| Deep Dive | Quantization, Pruning(Sparsity), Knowledge Distillation, Mixed Precision, Orchestration, Benchmarking, Distributed Training, Model Conversion, TensorBoard, Distillation for Quantization, Neural Coder |
+| Advanced Topics | Adaptor, Strategy |
+
+
+## Workflow
+
+
+
+
diff --git a/docs/distillation.md b/docs/source/distillation.md
similarity index 95%
rename from docs/distillation.md
rename to docs/source/distillation.md
index b5f363a5a67..49cec901185 100644
--- a/docs/distillation.md
+++ b/docs/source/distillation.md
@@ -1,138 +1,138 @@
-Distillation
-============
-
-1. [Introduction](#introduction)
-
- 1.1. [Knowledge Distillation](#knowledge-distillation)
-
- 1.2. [Intermediate Layer Knowledge Distillation](#intermediate-layer-knowledge-distillation)
-
- 1.3. [Self Distillation](#self-distillation)
-
-2. [Distillation Support Matrix](#distillation-support-matrix)
-3. [Get Started with Distillation API ](#get-started-with-distillation-api)
-4. [Examples](#examples)
-
-## Introduction
-
-Distillation is one of popular approaches of network compression, which transfers knowledge from a large model to a smaller one without loss of validity. As smaller models are less expensive to evaluate, they can be deployed on less powerful hardware (such as a mobile device). Graph shown below is the workflow of the distillation, the teacher model will take the same input that feed into the student model to produce the output that contains knowledge of the teacher model to instruct the student model.
-
-
-Intel® Neural Compressor supports Knowledge Distillation and Intermediate Layer Knowledge Distillation algorithms.
-
-### Knowledge Distillation
-Knowledge distillation is proposed in [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531). It leverages the logits (the input of softmax in the classification tasks) of teacher and student model to minimize the the difference between their predicted class distributions, this can be done by minimizing the below loss function.
-
-$$L_{KD} = D(z_t, z_s)$$
-
-Where $D$ is a distance measurement, e.g. Euclidean distance and Kullback–Leibler divergence, $z_t$ and $z_s$ are the logits of teacher and student model, or predicted distributions from softmax of the logits in case the distance is measured in terms of distribution.
-
-### Intermediate Layer Knowledge Distillation
-
-There are more information contained in the teacher model beside its logits, for example, the output features of the teacher model's intermediate layers often been used to guide the student model, as in [Patient Knowledge Distillation for BERT Model Compression](https://arxiv.org/pdf/1908.09355) and [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984). The general loss function for this approach can be summarized as follow.
-
-$$L_{KD} = \sum\limits_i D(T_t^{n_i}(F_t^{n_i}), T_s^{m_i}(F_s^{m_i}))$$
-
-Where $D$ is a distance measurement as before, $F_t^{n_i}$ the output feature of the $n_i$'s layer of the teacher model, $F_s^{m_i}$ the output feature of the $m_i$'s layer of the student model. Since the dimensions of $F_t^{n_i}$ and $F_s^{m_i}$ are usually different, the transformations $T_t^{n_i}$ and $T_s^{m_i}$ are needed to match dimensions of the two features. Specifically, the transformation can take the forms like identity, linear transformation, 1X1 convolution etc.
-
-### Self Distillation
-
-Self-distillation ia a one-stage training method where the teacher model and student models can be trained together. It attaches several attention modules and shallow classifiers at different depths of neural networks and distills knowledge from the deepest classifier to the shallower classifiers. Different from the conventional knowledge distillation methods where the knowledge of the teacher model is transferred to another student model, self-distillation can be considered as knowledge transfer in the same model, from the deeper layers to the shallower layers.
-The additional classifiers in self-distillation allow the neural network to work in a dynamic manner, which leads to a much higher acceleration.
-
-
-Architecture from paper [Self-Distillation: Towards Efficient and Compact Neural Networks](https://ieeexplore.ieee.org/document/9381661)
-
-## Distillation Support Matrix
-
-|Distillation Algorithm |PyTorch |TensorFlow |
-|------------------------------------------------|:--------:|:---------:|
-|Knowledge Distillation |✔ |✔ |
-|Intermediate Layer Knowledge Distillation |✔ |Will be supported|
-|Self Distillation |✔ |✖ |
-
-## Get Started with Distillation API
-
-Simplest launcher code if training behavior is defined in user-defined yaml.
-
-```python
-from neural_compressor.experimental import Distillation, common
-distiller = Distillation('/path/to/user/yaml')
-distiller.student_model = student_model
-distiller.teacher_model = teacher_model
-model = distiller.fit()
-```
-Distillation class also support DistillationConf class as it's argument.
-
-```python
-from neural_compressor.experimental import Distillation, common
-from neural_compressor.conf.config import DistillationConf
-conf = DistillationConf('/path/to/user/yaml')
-distiller = Distillation(conf)
-distiller.student_model = student_model
-distiller.teacher_model = teacher_model
-model = distiller.fit()
-```
-
-User can pass the customized training/evaluation functions to `Distillation` for flexible scenarios. In this case, distillation process can be done by pre-defined hooks in Neural Compressor. User needs to put those hooks inside the training function.
-
-Neural Compressor defines several hooks for user pass
-
-```
-on_train_begin() : Hook executed before training begins
-on_after_compute_loss(input, student_output, student_loss) : Hook executed after each batch inference of student model
-on_epoch_end() : Hook executed at each epoch end
-```
-
-Following section shows how to use hooks in user pass-in training function which is part of example from BlendCNN distillation:
-
-```python
-def train_func(model):
- distiller.on_train_begin()
- for nepoch in range(epochs):
- model.train()
- cnt = 0
- loss_sum = 0.
- iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)')
- for batch in iter_bar:
- teacher_logits, input_ids, segment_ids, input_mask, target = batch
- cnt += 1
- output = model(input_ids, segment_ids, input_mask)
- loss = criterion(output, target)
- loss = distiller.on_after_compute_loss(
- {'input_ids':input_ids, 'segment_ids':segment_ids, 'input_mask':input_mask},
- output,
- loss,
- teacher_logits)
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
- if cnt >= iters:
- break
- print('Average Loss: {}'.format(loss_sum / cnt))
- distiller.on_epoch_end()
-...
-```
-
-In this case, the launcher code is like the following:
-
-```python
-from neural_compressor.experimental import Distillation, common
-from neural_compressor.experimental.common.criterion import PyTorchKnowledgeDistillationLoss
-distiller = Distillation(args.config)
-distiller.student_model = model
-distiller.teacher_model = teacher
-distiller.criterion = PyTorchKnowledgeDistillationLoss()
-distiller.train_func = train_func
-model = distiller.fit()
-```
-
-## Examples
-
-[Distillation Examples](../examples/README.md#distillation)
-
+
+Intel® Neural Compressor supports Knowledge Distillation and Intermediate Layer Knowledge Distillation algorithms.
+
+### Knowledge Distillation
+Knowledge distillation is proposed in [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531). It leverages the logits (the input of softmax in classification tasks) of the teacher and student models to minimize the difference between their predicted class distributions, which can be done by minimizing the loss function below.
+
+$$L_{KD} = D(z_t, z_s)$$
+
+Where $D$ is a distance measurement, e.g. Euclidean distance and Kullback–Leibler divergence, $z_t$ and $z_s$ are the logits of teacher and student model, or predicted distributions from softmax of the logits in case the distance is measured in terms of distribution.
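+
+As a purely illustrative sketch (not the library's own implementation), the loss above with $D$ chosen as the KL divergence between softened distributions could be written as follows; the temperature value is an assumption made for the example:
+
+```python
+import torch.nn.functional as F
+
+def kd_loss(student_logits, teacher_logits, temperature=2.0):
+    """KL divergence between softened teacher and student class distributions."""
+    log_p_student = F.log_softmax(student_logits / temperature, dim=-1)
+    p_teacher = F.softmax(teacher_logits / temperature, dim=-1)
+    # batchmean gives the mathematically correct KL; T^2 rescales the gradient magnitude
+    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * temperature ** 2
+```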
+
+### Intermediate Layer Knowledge Distillation
+
+There is more information contained in the teacher model besides its logits; for example, the output features of the teacher model's intermediate layers are often used to guide the student model, as in [Patient Knowledge Distillation for BERT Model Compression](https://arxiv.org/pdf/1908.09355) and [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984). The general loss function for this approach can be summarized as follows.
+
+$$L_{KD} = \sum\limits_i D(T_t^{n_i}(F_t^{n_i}), T_s^{m_i}(F_s^{m_i}))$$
+
+Where $D$ is a distance measurement as before, $F_t^{n_i}$ is the output feature of the $n_i$-th layer of the teacher model, and $F_s^{m_i}$ is the output feature of the $m_i$-th layer of the student model. Since the dimensions of $F_t^{n_i}$ and $F_s^{m_i}$ are usually different, the transformations $T_t^{n_i}$ and $T_s^{m_i}$ are needed to match the dimensions of the two features. Specifically, the transformation can take forms such as identity, linear transformation, 1x1 convolution, etc.
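+
+As an illustrative sketch only (the layer choice, dimensions, and MSE as $D$ are assumptions for the example), one term of this loss with a linear transformation matching the student feature to the teacher feature could look like:
+
+```python
+import torch.nn as nn
+import torch.nn.functional as F
+
+teacher_dim, student_dim = 768, 384              # illustrative hidden sizes
+transform = nn.Linear(student_dim, teacher_dim)  # T_s: projects the student feature to the teacher's dimension
+
+def layer_kd_loss(teacher_feature, student_feature):
+    """MSE between the teacher feature and the transformed student feature."""
+    return F.mse_loss(transform(student_feature), teacher_feature)
+```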
+
+### Self Distillation
+
+Self-distillation is a one-stage training method where the teacher model and student models can be trained together. It attaches several attention modules and shallow classifiers at different depths of the neural network and distills knowledge from the deepest classifier to the shallower classifiers. Different from the conventional knowledge distillation methods, where the knowledge of the teacher model is transferred to another student model, self-distillation can be considered knowledge transfer within the same model, from the deeper layers to the shallower layers.
+The additional classifiers in self-distillation allow the neural network to work in a dynamic manner, which leads to a much higher acceleration.
+
+
+Architecture from paper [Self-Distillation: Towards Efficient and Compact Neural Networks](https://ieeexplore.ieee.org/document/9381661)
+
+## Distillation Support Matrix
+
+|Distillation Algorithm |PyTorch |TensorFlow |
+|------------------------------------------------|:--------:|:---------:|
+|Knowledge Distillation |✔ |✔ |
+|Intermediate Layer Knowledge Distillation |✔ |Will be supported|
+|Self Distillation |✔ |✖ |
+
+## Get Started with Distillation API
+
+The simplest launcher code, when the training behavior is defined in a user-defined yaml, is shown below.
+
+```python
+from neural_compressor.experimental import Distillation, common
+distiller = Distillation('/path/to/user/yaml')
+distiller.student_model = student_model
+distiller.teacher_model = teacher_model
+model = distiller.fit()
+```
+The Distillation class also supports a DistillationConf object as its argument.
+
+```python
+from neural_compressor.experimental import Distillation, common
+from neural_compressor.conf.config import DistillationConf
+conf = DistillationConf('/path/to/user/yaml')
+distiller = Distillation(conf)
+distiller.student_model = student_model
+distiller.teacher_model = teacher_model
+model = distiller.fit()
+```
+
+Users can pass customized training/evaluation functions to `Distillation` for flexible scenarios. In this case, the distillation process is driven by pre-defined hooks in Neural Compressor, and users need to place those hooks inside the training function.
+
+Neural Compressor defines the following hooks for users to call:
+
+```
+on_train_begin() : Hook executed before training begins
+on_after_compute_loss(input, student_output, student_loss) : Hook executed after each batch inference of student model
+on_epoch_end() : Hook executed at each epoch end
+```
+
+The following section shows how to use these hooks in a user-provided training function, taken from part of the BlendCNN distillation example:
+
+```python
+def train_func(model):
+ distiller.on_train_begin()
+ for nepoch in range(epochs):
+ model.train()
+ cnt = 0
+ loss_sum = 0.
+ iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)')
+ for batch in iter_bar:
+ teacher_logits, input_ids, segment_ids, input_mask, target = batch
+ cnt += 1
+ output = model(input_ids, segment_ids, input_mask)
+ loss = criterion(output, target)
+ loss = distiller.on_after_compute_loss(
+ {'input_ids':input_ids, 'segment_ids':segment_ids, 'input_mask':input_mask},
+ output,
+ loss,
+ teacher_logits)
+ optimizer.zero_grad()
+ loss.backward()
+            optimizer.step()
+            loss_sum += loss.item()  # accumulate for the average loss printed below
+ if cnt >= iters:
+ break
+ print('Average Loss: {}'.format(loss_sum / cnt))
+ distiller.on_epoch_end()
+...
+```
+
+In this case, the launcher code looks like the following:
+
+```python
+from neural_compressor.experimental import Distillation, common
+from neural_compressor.experimental.common.criterion import PyTorchKnowledgeDistillationLoss
+distiller = Distillation(args.config)
+distiller.student_model = model
+distiller.teacher_model = teacher
+distiller.criterion = PyTorchKnowledgeDistillationLoss()
+distiller.train_func = train_func
+model = distiller.fit()
+```
+
+## Examples
+
+[Distillation Examples](../examples/README.md#distillation)
+
+
+
Intel® Neural Compressor has unified interfaces which dispatch tasks to different frameworks via an adaptor layer. The adaptor layer is the bridge between the tuning strategy and the vanilla framework quantization APIs. Users can select tuning strategies, and the strategy module contains model configs and tuning configs. Model configs define the quantization approach; for post-training static quantization, users need to set additional parameters such as calibration settings. There are several tuning strategies for users to choose from, while the basic strategy is set as the default.
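
A minimal sketch of that flow using the experimental API shown elsewhere in these docs (the yaml path, model, and calibration dataset are placeholders):

```python
from neural_compressor.experimental import Quantization, common

quantizer = Quantization('/path/to/user/yaml')  # yaml holds the model configs and tuning configs
quantizer.model = model                         # framework model to quantize
# post-training static quantization additionally needs calibration data
quantizer.calib_dataloader = common.DataLoader(calib_dataset)
q_model = quantizer.fit()
```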
diff --git a/docs/installation_guide.md b/docs/source/installation_guide.md
similarity index 100%
rename from docs/installation_guide.md
rename to docs/source/installation_guide.md
diff --git a/docs/legal_information.md b/docs/source/legal_information.md
similarity index 94%
rename from docs/legal_information.md
rename to docs/source/legal_information.md
index 5c595853b8a..511a04b7a58 100644
--- a/docs/legal_information.md
+++ b/docs/source/legal_information.md
@@ -16,7 +16,7 @@ See the accompanying [license](https://github.com/intel/neural-compressor/tree/m
## Citation
-If you use Intel® Neural Compressor in your research or you wish to refer to the tuning results published in the [Validated Models](getting_started.md#validated-models), use the following BibTeX entry.
+If you use Intel® Neural Compressor in your research or you wish to refer to the tuning results published in the [Validated Models](validated_model_list.md), use the following BibTeX entry.
```
@misc{Intel® Neural Compressor,
diff --git a/docs/metric.md b/docs/source/metric.md
similarity index 100%
rename from docs/metric.md
rename to docs/source/metric.md
diff --git a/docs/mixed_precision.md b/docs/source/mixed_precision.md
similarity index 95%
rename from docs/mixed_precision.md
rename to docs/source/mixed_precision.md
index 04b155bb8f1..4a0ff3830fe 100644
--- a/docs/mixed_precision.md
+++ b/docs/source/mixed_precision.md
@@ -12,9 +12,9 @@ The recent growth of Deep Learning has driven the development of more complex mo
The recently launched 3rd Gen Intel® Xeon® Scalable processor (codenamed Cooper Lake), featuring Intel® Deep Learning Boost, is the first general-purpose x86 CPU to support the bfloat16 format. Specifically, three new bfloat16 instructions are added as a part of the AVX512_BF16 extension within Intel Deep Learning Boost: VCVTNE2PS2BF16, VCVTNEPS2BF16, and VDPBF16PS. The first two instructions allow converting to and from bfloat16 data type, while the last one performs a dot product of bfloat16 pairs. Further details can be found in the [hardware numerics document](https://software.intel.com/content/www/us/en/develop/download/bfloat16-hardware-numerics-definition.html) published by Intel.
-
+
+
+
-| Orchestration | Combinations | Supported |
-|---|---|---|
-| One-shot | Pruning + Quantization Aware Training | ✔ |
-| One-shot | Distillation + Quantization Aware Training | ✔ |
-| One-shot | Distillation + Pruning | ✔ |
-| One-shot | Distillation + Pruning + Quantization Aware Training | ✔ |
-| Multi-shot | Pruning then Quantization | ✔ |
-| Multi-shot | Distillation then Quantization | ✔ |
-| Multi-shot | Distillation then Pruning | ✔ |
-| Multi-shot | Distillation then Pruning then Quantization | ✔ |
+| Orchestration | Combinations | Supported |
+|---|---|---|
+| One-shot | Pruning + Quantization Aware Training | ✔ |
+| One-shot | Distillation + Quantization Aware Training | ✔ |
+| One-shot | Distillation + Pruning | ✔ |
+| One-shot | Distillation + Pruning + Quantization Aware Training | ✔ |
+| Multi-shot | Pruning then Quantization | ✔ |
+| Multi-shot | Distillation then Quantization | ✔ |
+| Multi-shot | Distillation then Pruning | ✔ |
+| Multi-shot | Distillation then Pruning then Quantization | ✔ |
-
-
-
-### Pruning Patterns
-
-Pruning patterns defines the rules of pruned weights' arrangements in space.
-
-
-
-
-
-
-- Unstructured Pruning
-
-Unstructured pruning means finding and removing the less salient connection in the model where the nonzero patterns are irregular and could be anywhere in the matrix.
-
-- 2in4 Pruning
-
-NVIDIA proposed [2:4 sparsity](https://developer.nvidia.com/blog/accelerating-inference-with-sparsity-using-ampere-and-tensorrt/) (or known as "2in4 sparsity") in Ampere architecture, for every 4 continuous elements in a matrix, two of them are zero and others are non-zero.
-
-- Structured Pruning
-
-Structured pruning means finding parameters in groups, deleting entire blocks, filters, or channels according to some pruning criterions. In general, structured pruning leads to lower accuracy due to restrictive structure than unstructured pruning; However, it can accelerate the model execution significantly because it can fit hardware design better.
-
-Different from 2:4 sparsity above, we propose the block-wise structured sparsity patterns that we are able to demonstrate the performance benefits on existing Intel hardwares even without the support of hardware sparsity. A block-wise sparsity pattern with block size ```S``` means the contiguous ```S``` elements in this block are all zero values.
-
-For a typical GEMM, the weight dimension is ```IC``` x ```OC```, where ```IC``` is the number of input channels and ```OC``` is the number of output channels. Note that sometimes ```IC``` is also called dimension ```K```, and ```OC``` is called dimension ```N```. The sparsity dimension is on ```OC``` (or ```N```).
-
-For a typical Convolution, the weight dimension is ```OC x IC x KH x KW```, where ```OC``` is the number of output channels, ```IC``` is the number of input channels, and ```KH``` and ```KW``` is the kernel height and weight. The sparsity dimension is also on ```OC```.
-
-Here is a figure showing a matrix with ```IC``` = 32 and ```OC``` = 16 dimension, and a block-wise sparsity pattern with block size 4 on ```OC``` dimension.
-
-
-
-
-
-### Pruning Criteria
-
-Pruning criteria defines the rules of which weights are least important to be pruned, in order to maintain the model's original accuracy. Most popular criteria examine weights' absolute value and their corresponding gradients.
-
-- Magnitude
-
- The algorithm prunes the weight by the lowest absolute value at each layer with given sparsity target.
-
-- Gradient sensitivity
-
- The algorithm prunes the head, intermediate layers, and hidden states in NLP model according to importance score calculated by following the paper [FastFormers](https://arxiv.org/abs/2010.13382).
-
-- Group Lasso
-
- The algorithm uses Group lasso regularization to prune entire rows, columns or blocks of parameters that result in a smaller dense network.
-
-- Pattern Lock
-
- The algorithm locks the sparsity pattern in fine tune phase by freezing those zero values of weight tensor during weight update of training.
-
-- SNIP
-
- The algorithm prunes the dense model at its initialization, by analyzing the weights' effect to the loss function when they are masked. Please refer to the original [paper](https://arxiv.org/abs/1810.02340) for details
-
-- SNIP with momentum
-
- The algorithm improves original SNIP algorithms and introduces weights' score maps which updates in a momentum way.\
- In the following formula, $n$ is the pruning step and $W$ and $G$ are model's weights and gradients respectively.
- $$Score_{n} = 1.0 \times Score_{n-1} + 0.9 \times |W_{n} \times G_{n}|$$
-
-### Pruning Schedule
-
-Pruning schedule defines the way the model reach the target sparsity (the ratio of pruned weights).
-
-- One-shot Pruning
-
- One-shot pruning means the model is pruned to its target sparsity with one single step. This pruning method often works at model's initialization step. It can easily cause accuracy drop, but save much training time.
-
-- Iterative Pruning
-
- Iterative pruning means the model is gradually pruned to its target sparsity during a training process. The pruning process contains several pruning steps, and each step raises model's sparsity to a higher value. In the final pruning step, the model reaches target sparsity and the pruning process ends.
-
-## Pruning Support Matrix
-
-| Pruning Type | Pruning Granularity | Pruning Algorithm | Framework |
-|---|---|---|---|
-| Unstructured Pruning | Element-wise | Magnitude | PyTorch, TensorFlow |
-| Unstructured Pruning | Element-wise | Pattern Lock | PyTorch |
-| Unstructured Pruning | Element-wise | SNIP with momentum | PyTorch |
-| Structured Pruning | Filter/Channel-wise | Gradient Sensitivity | PyTorch |
-| Structured Pruning | Filter/Channel-wise | SNIP with momentum | PyTorch |
-| Structured Pruning | Block-wise | Group Lasso | PyTorch |
-| Structured Pruning | Block-wise | SNIP with momentum | PyTorch |
-| Structured Pruning | Element-wise | Pattern Lock | PyTorch |
-| Structured Pruning | Element-wise | SNIP with momentum | PyTorch |
+
+
+
+### Pruning Patterns
+
+Pruning patterns define the rules of how pruned weights are arranged in space.
+
+
+
+
+
+
+- Unstructured Pruning
+
+Unstructured pruning means finding and removing the less salient connections in the model, where the nonzero patterns are irregular and could be anywhere in the matrix.
+
+- 2in4 Pruning
+
+NVIDIA proposed [2:4 sparsity](https://developer.nvidia.com/blog/accelerating-inference-with-sparsity-using-ampere-and-tensorrt/) (also known as "2in4 sparsity") in the Ampere architecture: for every 4 contiguous elements in a matrix, two of them are zero and the others are non-zero.
+
+- Structured Pruning
+
+Structured pruning means finding parameters in groups and deleting entire blocks, filters, or channels according to some pruning criterion. In general, structured pruning leads to lower accuracy than unstructured pruning due to its restrictive structure; however, it can accelerate model execution significantly because it fits hardware designs better.
+
+Different from the 2:4 sparsity above, we propose block-wise structured sparsity patterns whose performance benefits we are able to demonstrate on existing Intel hardware, even without hardware sparsity support. A block-wise sparsity pattern with block size ```S``` means the ```S``` contiguous elements in each block are all zero values.
+
+For a typical GEMM, the weight dimension is ```IC``` x ```OC```, where ```IC``` is the number of input channels and ```OC``` is the number of output channels. Note that sometimes ```IC``` is also called dimension ```K```, and ```OC``` is called dimension ```N```. The sparsity dimension is on ```OC``` (or ```N```).
+
+For a typical Convolution, the weight dimension is ```OC x IC x KH x KW```, where ```OC``` is the number of output channels, ```IC``` is the number of input channels, and ```KH``` and ```KW``` are the kernel height and width. The sparsity dimension is also on ```OC```.
+
+Here is a figure showing a matrix with ```IC``` = 32 and ```OC``` = 16 dimension, and a block-wise sparsity pattern with block size 4 on ```OC``` dimension.
+
+
+
+
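+A toy sketch of such a mask, assuming a simple magnitude criterion and roughly 50% block sparsity (illustrative only, not the pruner's actual code):
+
+```python
+import torch
+
+IC, OC, block = 32, 16, 4
+weight = torch.randn(IC, OC)
+# one keep/prune decision per group of `block` contiguous elements along OC
+block_scores = weight.abs().reshape(IC, OC // block, block).sum(dim=-1)
+mask = (block_scores >= block_scores.median()).repeat_interleave(block, dim=-1).float()
+pruned_weight = weight * mask  # zeros appear in contiguous blocks of size 4 along OC
+```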
+
+### Pruning Criteria
+
+Pruning criteria define which weights are least important and can be pruned while maintaining the model's original accuracy. The most popular criteria examine weights' absolute values and their corresponding gradients.
+
+- Magnitude
+
+ The algorithm prunes the weights with the lowest absolute values at each layer, given a sparsity target.
+
+- Gradient sensitivity
+
+ The algorithm prunes the heads, intermediate layers, and hidden states in NLP models according to an importance score calculated following the paper [FastFormers](https://arxiv.org/abs/2010.13382).
+
+- Group Lasso
+
+ The algorithm uses Group lasso regularization to prune entire rows, columns or blocks of parameters that result in a smaller dense network.
+
+- Pattern Lock
+
+ The algorithm locks the sparsity pattern during the fine-tuning phase by freezing the zero values of the weight tensor during the weight updates of training.
+
+- SNIP
+
+ The algorithm prunes the dense model at its initialization by analyzing the weights' effect on the loss function when they are masked. Please refer to the original [paper](https://arxiv.org/abs/1810.02340) for details.
+
+- SNIP with momentum
+
+ The algorithm improves the original SNIP algorithm and introduces weights' score maps which are updated in a momentum way.\
+ In the following formula, $n$ is the pruning step and $W$ and $G$ are model's weights and gradients respectively.
+ $$Score_{n} = 1.0 \times Score_{n-1} + 0.9 \times |W_{n} \times G_{n}|$$
+
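+A minimal sketch of this score update for a single weight tensor (variable and function names are illustrative, not the library's internals):
+
+```python
+def update_snip_momentum_scores(scores, weights, grads):
+    """Momentum-style update of the SNIP importance scores (all arguments are torch tensors of the same shape)."""
+    return 1.0 * scores + 0.9 * (weights * grads).abs()
+```
+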
+### Pruning Schedule
+
+The pruning schedule defines the way the model reaches the target sparsity (the ratio of pruned weights).
+
+- One-shot Pruning
+
+ One-shot pruning means the model is pruned to its target sparsity in one single step. This pruning method often works at the model's initialization step. It can easily cause an accuracy drop, but it saves much training time.
+
+- Iterative Pruning
+
+ Iterative pruning means the model is gradually pruned to its target sparsity during a training process. The pruning process contains several pruning steps, and each step raises the model's sparsity to a higher value. In the final pruning step, the model reaches the target sparsity and the pruning process ends.
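+
+One common way such a schedule is written is a cubic ramp from an initial sparsity to the target; the exact formula below is an assumption for illustration, not necessarily the one the library uses:
+
+```python
+def scheduled_sparsity(step, total_steps, target=0.9, initial=0.0):
+    """Sparsity grows gradually from `initial` to `target` over the pruning steps."""
+    progress = min(step / total_steps, 1.0)
+    return target + (initial - target) * (1.0 - progress) ** 3
+```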
+
+## Pruning Support Matrix
+
+| Pruning Type | Pruning Granularity | Pruning Algorithm | Framework |
+|---|---|---|---|
+| Unstructured Pruning | Element-wise | Magnitude | PyTorch, TensorFlow |
+| Unstructured Pruning | Element-wise | Pattern Lock | PyTorch |
+| Unstructured Pruning | Element-wise | SNIP with momentum | PyTorch |
+| Structured Pruning | Filter/Channel-wise | Gradient Sensitivity | PyTorch |
+| Structured Pruning | Filter/Channel-wise | SNIP with momentum | PyTorch |
+| Structured Pruning | Block-wise | Group Lasso | PyTorch |
+| Structured Pruning | Block-wise | SNIP with momentum | PyTorch |
+| Structured Pruning | Element-wise | Pattern Lock | PyTorch |
+| Structured Pruning | Element-wise | SNIP with momentum | PyTorch |
+
## Supported Feature Matrix
diff --git a/docs/quantization_mixed_precision.md b/docs/source/quantization_mixed_precision.md
similarity index 87%
rename from docs/quantization_mixed_precision.md
rename to docs/source/quantization_mixed_precision.md
index 728c854da5c..9352a81f8cf 100644
--- a/docs/quantization_mixed_precision.md
+++ b/docs/source/quantization_mixed_precision.md
@@ -1,59 +1,59 @@
-### Turn ON Auto Mixed Precision during Quantization
-
-BF16 conversion during quantization is default OFF. To force enable it, users need to turn on use_bf16 by pythonic config:
-
-```python
-from neural_compressor import config
-from neural_compressor.experimental import Quantization
-
-config.quantization.use_bf16 = True
-quantizer = Quantization(config)
-```
-
-### Tensorflow
-
-Intel has worked with the TensorFlow development team to enhance TensorFlow to include bfloat16 data support for CPUs. For more information about BF16 in TensorFlow, please read [Accelerating AI performance on 3rd Gen Intel® Xeon® Scalable processors with TensorFlow and Bfloat16](https://blog.tensorflow.org/2020/06/accelerating-ai-performance-on-3rd-gen-processors-with-tensorflow-bfloat16.html).
-
-- BF16 conversion during quantization in TensorFlow
-
-
-
-
-
+
+
+
## Example
diff --git a/examples/pytorch/image_recognition/CNN-2/distillation/eager/README.md b/examples/pytorch/image_recognition/CNN-2/distillation/eager/README.md
index 484ddc0d93a..3180b03d1ac 100644
--- a/examples/pytorch/image_recognition/CNN-2/distillation/eager/README.md
+++ b/examples/pytorch/image_recognition/CNN-2/distillation/eager/README.md
@@ -9,3 +9,14 @@ python train_without_distillation.py --model_type CNN-10 --epochs 200 --lr 0.1 -
# for distillation of the student model CNN-2 with the teacher model CNN-10
python main.py --epochs 200 --lr 0.02 --name CNN-2-distillation --student_type CNN-2 --teacher_type CNN-10 --teacher_model runs/CNN-10/model_best.pth.tar --tensorboard
```
+
+We also support Distributed Data Parallel training in single-node and multi-node settings for distillation. To use Distributed Data Parallel to speed up training, the bash command needs a small adjustment.
+
-
+
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/README.md new file mode 100644 index 00000000000..b3599c59a88 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/README.md @@ -0,0 +1,38 @@ +Step-by-Step +============ + +This document is used to list steps of reproducing PyTorch BERT tuning zoo result. + +# Prerequisite + +## 1. Installation + +The dependent packages are all in requirements, please install as following. + +``` +pip install -r requirements.txt +``` + +## 2. Run + +If the automatic download from modelhub fails, you can download [EleutherAI/gpt-j-6B](https://huggingface.co/EleutherAI/gpt-j-6B?text=My+name+is+Clara+and+I+am) offline. + +```shell + +python run_clm.py \ + --model_name_or_path EleutherAI/gpt-j-6B \ + --dataset_name wikitext\ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --tune \ + --output_dir /path/to/checkpoint/dir +``` + + +## 3. Command + +``` +bash run_tuning.sh --topology=gpt_j_wikitext +bash run_benchmark.sh --topology=gpt_j_wikitext --mode=performance --int8=true +``` diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/conf.yaml b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/conf.yaml new file mode 100644 index 00000000000..0f75f809781 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/conf.yaml @@ -0,0 +1,31 @@ +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +version: 1.0 + +model: # mandatory. used to specify model specific information. + name: bert + framework: pytorch_fx # mandatory. possible values are tensorflow, mxnet, pytorch, pytorch_ipex, onnxrt_integerops and onnxrt_qlinearops. + +quantization: # optional. tuning constraints on model-wise for advance user to reduce tuning space. + approach: post_training_static_quant + +tuning: + accuracy_criterion: + relative: 0.5 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%. + higher_is_better: False + exit_policy: + max_trials: 600 + random_seed: 9527 # optional. random seed for deterministic tuning. 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/requirements.txt new file mode 100644 index 00000000000..763bed755a8 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/requirements.txt @@ -0,0 +1,5 @@ +sentencepiece != 0.1.92 +protobuf +evaluate +datasets +transformers >= 4.22.0 diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_benchmark.sh new file mode 100644 index 00000000000..a36507f4fca --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_benchmark.sh @@ -0,0 +1,91 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + iters=100 + batch_size=16 + tuned_checkpoint=saved_results + max_eval_samples=`expr ${iters} \* ${batch_size}` + echo ${max_eval_samples} + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo ${var} |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + --config=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + extra_cmd='' + + if [[ ${mode} == "accuracy" ]]; then + mode_cmd=" --accuracy_only " + elif [[ ${mode} == "benchmark" ]]; then + mode_cmd=" --benchmark " + extra_cmd=$extra_cmd" --max_eval_samples ${max_eval_samples}" + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + if [ "${topology}" = "gpt_j_wikitext" ]; then + TASK_NAME='wikitext' + model_name_or_path=$input_model + extra_cmd='--dataset_config_name=wikitext-2-raw-v1' + fi + + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + echo $extra_cmd + + python -u run_clm.py \ + --model_name_or_path ${model_name_or_path} \ + --dataset_name ${TASK_NAME} \ + --do_eval \ + --per_device_eval_batch_size ${batch_size} \ + --output_dir ${tuned_checkpoint} \ + ${mode_cmd} \ + ${extra_cmd} + +} + +main "$@" diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_clm.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_clm.py new file mode 100644 index 00000000000..17a32f1b57a --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_clm.py @@ -0,0 +1,650 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=text-generation +""" +# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from itertools import chain +from typing import Optional + +import datasets +from datasets import load_dataset + +import evaluate +import transformers +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + Trainer, + TrainingArguments, + default_data_collator, + is_torch_tpu_available, + set_seed, +) +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.22.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": ( + "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." + ) + }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + ) + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": ( + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " + "with private models)." 
+ ) + }, + ) + tune: bool = field( + default=False, metadata={"help": "tune quantized model with Neural Compressor"} + ) + int8: bool = field( + default=False, metadata={"help": "use int8 model to get accuracy or benchmark"} + ) + benchmark: bool = field( + default=False, metadata={"help": "get benchmark instead of accuracy"} + ) + accuracy_only: bool = field( + default=False, metadata={"help": "get accuracy"} + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ) + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + ) + }, + ) + + block_size: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." + ) + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. 
+ # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_clm", model_args, data_args) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
+ raw_datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) + raw_datasets["train"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + data_files = {} + dataset_args = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = data_args.keep_linebreaks + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + **dataset_args, + ) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + **dataset_args, + ) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if model_args.config_overrides is not None: + logger.info(f"Overriding config: {model_args.config_overrides}") + config.update_from_string(model_args.config_overrides) + logger.info(f"New config: {config}") + + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + + if model_args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + model = AutoModelForCausalLM.from_config(config) + n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) + + # Preprocessing the datasets. + # First we tokenize all the texts. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + else: + column_names = raw_datasets["validation"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits" + " before being passed to the model." 
+ ) + return output + + with training_args.main_process_first(desc="dataset map tokenization"): + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with training_args.main_process_first(desc="grouping texts together"): + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + + if training_args.do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = lm_datasets["train"] + if data_args.max_train_samples is not None: + max_train_samples = min(len(train_dataset), data_args.max_train_samples) + train_dataset = train_dataset.select(range(max_train_samples)) + + if training_args.do_eval: + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = lm_datasets["validation"] + if data_args.max_eval_samples is not None: + max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + + def preprocess_logits_for_metrics(logits, labels): + if isinstance(logits, tuple): + # Depending on the model and config, logits may contain extra tensors, + # like past_key_values, but logits always come first + logits = logits[0] + return logits.argmax(dim=-1) + + metric = evaluate.load("accuracy") + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # preds have the same shape as the labels, after the argmax(-1) has been calculated + # by preprocess_logits_for_metrics but we need to shift the labels + labels = labels[:, 1:].reshape(-1) + preds = preds[:, :-1].reshape(-1) + return metric.compute(predictions=preds, references=labels) + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. 
+ data_collator=default_data_collator, + compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, + preprocess_logits_for_metrics=preprocess_logits_for_metrics + if training_args.do_eval and not is_torch_tpu_available() + else None, + ) + + # Tune + if model_args.tune: + def eval_func_for_nc(model_tuned): + trainer.model = model_tuned + eval_output = trainer.evaluate(eval_dataset=eval_dataset) + perplexity = math.exp(eval_output["eval_loss"]) + results = {"perplexity":perplexity,"eval_loss":eval_output["eval_loss"],\ + "eval_samples_per_second":eval_output['eval_samples_per_second']} + clm_task_metrics_keys = ["perplexity","eval_loss"] + for key in clm_task_metrics_keys: + if key in results.keys(): + logger.info("Finally Eval {}:{}".format(key, results[key])) + if key=="eval_loss": + eval_loss = results[key] + break + print("Accuracy: %.5f" % eval_loss) + print('Throughput: %.3f samples/sec' % (results["eval_samples_per_second"])) + print('Latency: %.3f ms' % (1 * 1000 / results["eval_samples_per_second"])) + print('Batch size = %d' % training_args.per_device_eval_batch_size) + + return eval_loss + + from neural_compressor.experimental import Quantization, common + quantizer = Quantization("./conf.yaml") + quantizer.model = common.Model(model) + quantizer.calib_dataloader = trainer.get_eval_dataloader() + quantizer.eval_func = eval_func_for_nc + q_model = quantizer.fit() + q_model.save(training_args.output_dir) + exit(0) + + # Benchmark or accuracy + if model_args.benchmark or model_args.accuracy_only: + if model_args.int8: + from neural_compressor.utils.pytorch import load + new_model = load( + os.path.abspath(os.path.expanduser(training_args.output_dir)), model) + else: + new_model = model + trainer.model = new_model + eval_output = trainer.evaluate(eval_dataset=eval_dataset) + perplexity = math.exp(eval_output["eval_loss"]) + results = {"perplexity":perplexity,"eval_loss":eval_output["eval_loss"],\ + "eval_samples_per_second":eval_output['eval_samples_per_second']} + clm_task_metrics_keys = ["eval_loss"] + for key in clm_task_metrics_keys: + if key in results.keys(): + acc = results[key] + break + print("Accuracy: %.5f" % acc) + print('Throughput: %.3f samples/sec' % (results["eval_samples_per_second"])) + print('Latency: %.3f ms' % (1 * 1000 / results["eval_samples_per_second"])) + print('Batch size = %d' % training_args.per_device_eval_batch_size) + exit(0) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = 
float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + trainer.push_to_hub(**kwargs) + else: + trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_tuning.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_tuning.sh new file mode 100644 index 00000000000..04b16872a59 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx/run_tuning.sh @@ -0,0 +1,63 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + tuned_checkpoint=saved_results + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + extra_cmd='' + batch_size=8 + model_type='bert' + approach='post_training_static_quant' + + if [ "${topology}" = "gpt_j_wikitext" ]; then + TASK_NAME='wikitext' + model_name_or_path=$input_model + extra_cmd='--dataset_config_name=wikitext-2-raw-v1' + fi + + + python -u run_clm.py \ + --model_name_or_path ${model_name_or_path} \ + --dataset_name ${TASK_NAME} \ + --do_eval \ + --per_device_eval_batch_size ${batch_size} \ + --output_dir ${tuned_checkpoint} \ + --tune \ + ${extra_cmd} + +} + +main "$@" diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/README.md b/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/README.md index 610b624d747..4d340c5e466 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/README.md +++ b/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/README.md @@ -144,6 +144,40 @@ python3 ./run_glue_no_trainer.py \ --lr_scheduler_type "constant"\ --do_prune ``` +Also, per-channel pruning is also supported. 
+``` +python3 ./run_glue_no_trainer.py \ + --model_name_or_path "./mrpcbaseline/bert-mini/" \ + --pruning_config "./bert_mini_mrpc_1xchannel.yaml" \ + --task_name "mrpc" \ + --max_length "128" \ + --per_device_train_batch_size "16" \ + --learning_rate "1e-3" \ + --num_train_epochs "15" \ + --weight_decay "1e-3" \ + --cooldown_epochs "5" \ + --sparsity_warm_epochs "1"\ + --lr_scheduler_type "constant"\ + --distill_loss_weight "5"\ + --do_prune +``` +``` +python3 ./run_glue_no_trainer.py \ + --model_name_or_path "./sst2_baseline/bert-mini/" \ + --pruning_config "./bert_mini_sst2_1xchannel.yaml" \ + --task_name "sst2" \ + --max_length "128" \ + --per_device_train_batch_size "16" \ + --learning_rate "5e-5" \ + --distill_loss_weight "2.0" \ + --num_train_epochs "15" \ + --weight_decay "5e-5" \ + --cooldown_epochs "5" \ + --sparsity_warm_epochs "0"\ + --lr_scheduler_type "constant"\ + --do_prune +``` + We can also train a dense model on glue datasets (by setting --do_prune to False): ``` python run_glue_no_trainer.py --model_name_or_path "./bert-mini" --task_name "sst2" --max_length "128" --per_device_train_batch_size "32" --learning_rate "5e-5" --num_train_epochs "10" --output_dir "result/" 2>&1 | tee sst2_orig.log @@ -158,12 +192,14 @@ python3 run_glue_no_trainer.py --model_name_or_path "./bert-mini" --task_name | :----: | :----: | :----: | :----: |:----:|:----:| :----: | :----: | :----: | | Bert-Mini | MRPC | 4x1 |Snip-momentum| 0.8804 | Dense & Finetuned | 0.8619/0.8752 | 0.8610/0.8722 | -0.34% | | Bert-Mini | MRPC | 2:4 |Snip-momentum| 0.4795 | Dense & Finetuned | 0.8619/0.8752| 0.8562/0.8695 | -0.65% | +| Bert-Mini | MRPC | per channel |Snip-momentum| 0.66 | Dense & Finetuned | 0.8619/0.8752| 0.8629/0.8680 | -0.83% | #### SST-2 | Model | Dataset | Sparsity pattern | Pruning methods |Element-wise/matmul, Gemm, conv ratio | Init model | Dense Accuracy (mean/max) | Sparse Accuracy (mean/max)| Relative drop| | :----: | :----: | :----: | :----: |:----:|:----:| :----: | :----: | :----: | | Bert-Mini | SST-2 | 4x1 |Snip-momentum| 0.8815 | Dense & Finetuned | 0.8660/0.8761 | 0.8651/0.8692 | -0.79% | | Bert-Mini | SST-2 | 2:4 |Snip-momentum| 0.4795 | Dense & Finetuned | 0.8660/0.8761 | 0.8609/0.8693| -0.78% | +| Bert-Mini | SST-2 | per channel |Snip-momentum| 0.53 | Dense & Finetuned | 0.8660/0.8761 | 0.8651/0.8692| -0.79% | ## References * [SNIP: Single-shot Network Pruning based on Connection Sensitivity](https://arxiv.org/abs/1810.02340) diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/bert_mini_mrpc_1xchannel.yaml b/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/bert_mini_mrpc_1xchannel.yaml new file mode 100644 index 00000000000..33b29c17c6c --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/bert_mini_mrpc_1xchannel.yaml @@ -0,0 +1,23 @@ +version: 1.0 + +model: + name: "bert-mini" + framework: "pytorch" + +pruning: + approach: + weight_compression_pytorch: + start_step: 0 + end_step: 0 + excluded_names: ["classifier", "pooler", ".*embeddings*"] + prune_layer_type: ["Linear"] + target_sparsity: 0.9 + max_sparsity_ratio_per_layer: 0.98 + + pruners: + - !Pruner + pattern: "1xchannel" + update_frequency_on_step: 50 + prune_domain: "global" + prune_type: "snip_momentum" + sparsity_decay_type: "exp" diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/bert_mini_sst2_1xchannel.yaml 
b/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/bert_mini_sst2_1xchannel.yaml new file mode 100644 index 00000000000..ebb118ecc87 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/text-classification/pruning/pytorch_pruner/eager/bert_mini_sst2_1xchannel.yaml @@ -0,0 +1,25 @@ +version: 1.0 + +model: + name: "bert-mini" + framework: "pytorch" + +pruning: + approach: + weight_compression_pytorch: + start_step: 0 + end_step: 0 + excluded_names: ["classifier", "pooler", ".*embeddings*", "LayerNorm"] + prune_layer_type: ["Linear"] + target_sparsity: 0.9 + update_frequency_on_step: 500 + max_sparsity_ratio_per_layer: 0.98 + prune_domain: "global" + sparsity_decay_type: "exp" + pruners: + - !Pruner + pattern: "ic_pattern_1xchannel" + update_frequency_on_step: 500 + prune_domain: "global" + prune_type: "snip_momentum" + sparsity_decay_type: "exp" diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py index 2da5db448cb..13812b30b4e 100755 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_dynamic/eager/run_glue_tune.py @@ -432,11 +432,11 @@ def eval_func_for_nc(model_tuned): acc = result[key] break return acc - from neural_compressor.experimental import Quantization, common - quantizer = Quantization("./conf.yaml") - quantizer.model = common.Model(model) - quantizer.eval_func = eval_func_for_nc - q_model = quantizer.fit() + from neural_compressor.quantization import fit + from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion + tuning_criterion = TuningCriterion(max_trials=600) + conf = PostTrainingQuantConfig(approach="dynamic", backend="pytorch", tuning_criterion=tuning_criterion) + q_model = fit(model, conf=conf, eval_func=eval_func_for_nc) from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir) exit(0) diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py index 8ea43ea4a41..717ae91d886 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx/run_glue.py @@ -498,13 +498,11 @@ def eval_func(model): # optimize and quantize with Neural Compressor if model_args.tune: - from neural_compressor.experimental import Quantization, common - calib_dataloader = eval_dataloader - quantizer = Quantization('conf.yaml') - quantizer.eval_func = eval_func - quantizer.calib_dataloader = calib_dataloader - quantizer.model = common.Model(model) - model = quantizer.fit() + from neural_compressor.quantization import fit + from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion + tuning_criterion = TuningCriterion(max_trials=600) + conf = PostTrainingQuantConfig(approach="static", backend="pytorch_fx", tuning_criterion=tuning_criterion) + model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func) from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream 
save_for_huggingface_upstream(model, tokenizer, training_args.output_dir) return diff --git a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py index 79c785850c0..f5bc771e712 100644 --- a/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py +++ b/examples/pytorch/nlp/huggingface_models/text-classification/quantization/qat/fx/run_glue_tune.py @@ -502,12 +502,6 @@ def compute_metrics(p: EvalPrediction): eval_dataloader = trainer.get_eval_dataloader() batch_size = eval_dataloader.batch_size - def train_func(model): - trainer.model_wrapped = model - trainer.model = model - trainer.train() - return trainer.model - def eval_func(model): trainer.model = model result = trainer.evaluate(eval_dataset=eval_dataset) @@ -526,12 +520,17 @@ def benchmark(model): # optimize and quantize with Neural Compressor if model_args.tune: - from neural_compressor.experimental import Quantization, common - quantizer = Quantization('conf_qat.yaml') - quantizer.eval_func = eval_func - quantizer.q_func = train_func - quantizer.model = common.Model(model) - model = quantizer.fit() + from neural_compressor.training import prepare_compression + from neural_compressor.config import QuantizationAwareTrainingConfig + conf = QuantizationAwareTrainingConfig(backend="pytorch_fx") + compression_manager = prepare_compression(model, conf) + compression_manager.callbacks.on_train_begin() + model = compression_manager.model + trainer.model_wrapped = model + trainer.model = model + trainer.train() + compression_manager.callbacks.on_train_end() + from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream save_for_huggingface_upstream(model, tokenizer, training_args.output_dir) return diff --git a/examples/pytorch/object_detection/ssd_resnet34/quantization/ptq/ipex/ssd_r34.py b/examples/pytorch/object_detection/ssd_resnet34/quantization/ptq/ipex/ssd_r34.py index 5edbe2580ad..4e2db16cb99 100644 --- a/examples/pytorch/object_detection/ssd_resnet34/quantization/ptq/ipex/ssd_r34.py +++ b/examples/pytorch/object_detection/ssd_resnet34/quantization/ptq/ipex/ssd_r34.py @@ -24,6 +24,7 @@ from base_model import ResNet34 from typing import List +import intel_extension_for_pytorch Vector = List[torch.Tensor] diff --git a/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet121.yaml b/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet121.yaml index 79da662a36f..b9da893f6da 100644 --- a/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet121.yaml +++ b/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet121.yaml @@ -38,6 +38,14 @@ quantization: # optional. tuning constrai algorithm: minmax weight: granularity: per_channel + op_wise: { + 'densenet121/MaxPool2D/MaxPool': { + 'activation': {'dtype': ['fp32']} + }, + 'densenet121/transition_block[1-3]/AvgPool2D/AvgPool': { + 'activation': {'dtype': ['fp32']}, + } + } evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. accuracy: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. 
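For reference, the text-classification changes above replace the experimental `Quantization`/`common.Model` flow with `neural_compressor.quantization.fit`. A minimal sketch of that pattern follows; `my_model`, `my_tokenizer`, `my_eval_dataloader`, and `evaluate_accuracy` are placeholders, not names from this repository.

```python
# Sketch of the fit-based PTQ flow adopted by the updated GLUE examples (placeholders only).
from neural_compressor.quantization import fit
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream

def eval_func(model):
    # Return a single scalar metric; tuning uses it to accept or reject each trial.
    return evaluate_accuracy(model, my_eval_dataloader)

tuning_criterion = TuningCriterion(max_trials=600)
conf = PostTrainingQuantConfig(approach="static", backend="pytorch_fx",
                               tuning_criterion=tuning_criterion)
q_model = fit(my_model, conf=conf, calib_dataloader=my_eval_dataloader, eval_func=eval_func)

# Mirrors the examples above: export the quantized model in Hugging Face layout.
save_for_huggingface_upstream(q_model, my_tokenizer, "./saved_results")
```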
diff --git a/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet161.yaml b/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet161.yaml index b5629ad649c..5312ed341fa 100644 --- a/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet161.yaml +++ b/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet161.yaml @@ -38,6 +38,14 @@ quantization: # optional. tuning constrai algorithm: minmax weight: granularity: per_channel + op_wise: { + 'densenet161/MaxPool2D/MaxPool': { + 'activation': {'dtype': ['fp32']} + }, + 'densenet161/transition_block[1-3]/AvgPool2D/AvgPool': { + 'activation': {'dtype': ['fp32']}, + } + } evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. accuracy: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. diff --git a/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet169.yaml b/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet169.yaml index 6892b69dc73..b63414d8acf 100644 --- a/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet169.yaml +++ b/examples/tensorflow/image_recognition/tensorflow_models/quantization/ptq/densenet169.yaml @@ -38,6 +38,14 @@ quantization: # optional. tuning constrai algorithm: minmax weight: granularity: per_channel + op_wise: { + 'densenet169/MaxPool2D/MaxPool': { + 'activation': {'dtype': ['fp32']} + }, + 'densenet169/transition_block[1-3]/AvgPool2D/AvgPool': { + 'activation': {'dtype': ['fp32']}, + } + } evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. accuracy: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. 
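The DenseNet configs above add identical `op_wise` sections that pin the max-pool and transition-block average-pool activations to fp32. The short illustration below shows which node names the bracketed key covers, assuming the keys are matched as regular expressions against graph node names (the `[1-3]` range implies this); the node names listed are illustrative, not read from the actual graphs.

```python
import re

# Illustrative node names only; real names come from the frozen DenseNet graph.
node_names = [
    "densenet121/MaxPool2D/MaxPool",
    "densenet121/transition_block1/AvgPool2D/AvgPool",
    "densenet121/transition_block2/AvgPool2D/AvgPool",
    "densenet121/transition_block3/AvgPool2D/AvgPool",
    "densenet121/dense_block4/AvgPool2D/AvgPool",  # outside the [1-3] range
]

# op_wise keys from the YAML above, interpreted as regular expressions.
op_wise_patterns = [
    r"densenet121/MaxPool2D/MaxPool",
    r"densenet121/transition_block[1-3]/AvgPool2D/AvgPool",
]

for name in node_names:
    kept_fp32 = any(re.fullmatch(p, name) for p in op_wise_patterns)
    print(f"{name}: {'fp32 (excluded from int8)' if kept_fp32 else 'quantizable'}")
```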
diff --git a/examples/tensorflow/oob_models/quantization/ptq/model_detail.py b/examples/tensorflow/oob_models/quantization/ptq/model_detail.py index 07ce37cf892..8c5f2d1770d 100644 --- a/examples/tensorflow/oob_models/quantization/ptq/model_detail.py +++ b/examples/tensorflow/oob_models/quantization/ptq/model_detail.py @@ -385,5 +385,11 @@ 'low': -1.0, 'high': 1.0 }, + # centernet_hg104 + { + 'model_name': 'centernet_hg104', + 'input': {'input_tensor': generate_data([224, 224, 3]),}, + 'output': ['Identity'], + }, ] diff --git a/examples/tensorflow/oob_models/quantization/ptq/run_benchmark.sh b/examples/tensorflow/oob_models/quantization/ptq/run_benchmark.sh index 87d16a45c1e..efd68dde04d 100755 --- a/examples/tensorflow/oob_models/quantization/ptq/run_benchmark.sh +++ b/examples/tensorflow/oob_models/quantization/ptq/run_benchmark.sh @@ -101,6 +101,9 @@ function set_args { NeuMF PRNet DIEN_Deep-Interest-Evolution-Network + EfficientDet-D2-768x768 + EfficientDet-D4-1024x1024 + centernet_hg104 -------- ) diff --git a/examples/tensorflow/oob_models/quantization/ptq/run_tuning.sh b/examples/tensorflow/oob_models/quantization/ptq/run_tuning.sh index 2971bedf7c3..a183dbb52e6 100755 --- a/examples/tensorflow/oob_models/quantization/ptq/run_tuning.sh +++ b/examples/tensorflow/oob_models/quantization/ptq/run_tuning.sh @@ -83,6 +83,7 @@ function set_args { DIEN_Deep-Interest-Evolution-Network EfficientDet-D2-768x768 EfficientDet-D4-1024x1024 + centernet_hg104 -------- ) diff --git a/neural_coder/__main__.py b/neural_coder/__main__.py index f9011e91f8b..8d9da0472c8 100644 --- a/neural_coder/__main__.py +++ b/neural_coder/__main__.py @@ -28,8 +28,8 @@ def parse_args(): parser.add_argument("--opt", type=str, default="", help="optimization feature to enable") - parser.add_argument("--strategy", type=str, default="static", - help="quantization strategy") + parser.add_argument("--approach", type=str, default="static", + help="quantization approach (strategy)") parser.add_argument('--config', type=str, default="", help='quantization configuration file path') @@ -53,11 +53,11 @@ def parse_args(): # optimize on copied script with Neural Coder from neural_coder import enable if args.opt == "": - if args.strategy == "static": + if args.approach == "static": features=["pytorch_inc_static_quant_fx"] - if args.strategy == "static_ipex": + if args.approach == "static_ipex": features=["pytorch_inc_static_quant_ipex"] - if args.strategy == "dynamic": + if args.approach == "dynamic": features=["pytorch_inc_dynamic_quant"] else: features=[args.opt] diff --git a/neural_coder/backends/onnx_inc_dynamic_quant.yaml b/neural_coder/backends/onnx_inc_dynamic_quant.yaml new file mode 100644 index 00000000000..3c50de8da8e --- /dev/null +++ b/neural_coder/backends/onnx_inc_dynamic_quant.yaml @@ -0,0 +1,30 @@ +# Copyright (c) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +transformation: + location: + - insert_below_model_definition_line + content: + - |- + [+] from neural_compressor.experimental import Quantization, common + [+] from neural_compressor import options, conf + [+] conf.model.framework = 'onnxrt_integerops' + [+] conf.quantization.approach = 'post_training_dynamic_quant' + [+] quantizer = Quantization(conf) + [+] quantizer.model = common.Model(MODEL_NAME) + [+] quantizer.eval_func = EVAL_FUNCTION_NAME + [+] MODEL_NAME = quantizer() + order: + - below: + above: diff --git a/neural_coder/backends/onnx_inc_static_quant_qdq.yaml b/neural_coder/backends/onnx_inc_static_quant_qdq.yaml new file mode 100644 index 00000000000..730c3220f45 --- /dev/null +++ b/neural_coder/backends/onnx_inc_static_quant_qdq.yaml @@ -0,0 +1,31 @@ +# Copyright (c) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +transformation: + location: + - insert_below_model_definition_line + content: + - |- + [+] from neural_compressor.experimental import Quantization, common + [+] from neural_compressor import options, conf + [+] conf.model.framework = 'onnxrt_qdqops' + [+] conf.quantization.approach = 'post_training_static_quant' + [+] quantizer = Quantization(conf) + [+] quantizer.model = common.Model(MODEL_NAME) + [+] quantizer.calib_dataloader = DATALOADER_NAME + [+] quantizer.eval_func = EVAL_FUNCTION_NAME + [+] MODEL_NAME = quantizer() + order: + - below: + above: diff --git a/neural_coder/docs/PythonLauncher.md b/neural_coder/docs/PythonLauncher.md index 38e3bd5fae7..d73257a3a97 100644 --- a/neural_coder/docs/PythonLauncher.md +++ b/neural_coder/docs/PythonLauncher.md @@ -27,7 +27,7 @@ Note: Any modification on the optimized code ```run_glue_optimized.py``` will be Users can specify which Deep Learning optimization they want to conduct using ```--opt``` argument. The list of supported Deep Learning optimization features can be found [here](SupportMatrix.md). -Note that if specifically optimizing with INT8 quantization by Intel® Neural Compressor, ```--strategy``` argument can be specified with either ```static```, ```static_ipex``` or ```dynamic```. For example, to run INT8 dynamic quantization by Intel® Neural Compressor instead of the default static quantization: +Note that if specifically optimizing with INT8 quantization by Intel® Neural Compressor, to choose a quantization approach (strategy), ```--approach``` argument can be specified with either ```static```, ```static_ipex``` or ```dynamic```. 
For example, to run INT8 dynamic quantization by Intel® Neural Compressor instead of the default static quantization: ```bash -python -m neural_coder --strategy dynamic run_glue.py --model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result +python -m neural_coder --approach dynamic run_glue.py --model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result ``` diff --git a/neural_compressor/__init__.py b/neural_compressor/__init__.py index 6bdf202786c..bc46fdbd916 100644 --- a/neural_compressor/__init__.py +++ b/neural_compressor/__init__.py @@ -24,7 +24,8 @@ from .utils.utility import set_backend from .utils import options from .conf.config import conf -from .conf.pythonic_config import config, DistillationConfig, Options, PostTrainingConfig, \ - PruningConfig, QuantizationAwareTrainingConfig +from .conf.pythonic_config import config +from .config import DistillationConfig, PostTrainingQuantConfig, \ + PruningConfig, QuantizationAwareTrainingConfig -set_backend('NA') +set_backend('NA') \ No newline at end of file diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 833d011f858..2a5f62af196 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -1029,7 +1029,7 @@ def _get_quantizable_ops(self, model): # get bf16 capability - if (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1') and \ + if self.use_bf16 and (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1') and \ (self.version.release >= Version("1.11.0").release): self.bf16_ops = self.query_handler.get_op_types_by_precision("bf16") bf16_ops = [] @@ -1308,19 +1308,34 @@ def _pre_hook_for_qat(self, dataloader=None): qscheme=torch.per_tensor_affine, reduce_range=REDUCE_RANGE), weight=torch.quantization.default_weight_fake_quant) + self.non_quant_dict = self.get_non_quant_modules(self.model.kwargs) + quantizable_ops = [] + self._get_quantizable_ops_recursively(self.model._model, '', quantizable_ops) + self.bf16_ops = self.query_handler.get_op_types_by_precision("bf16") + bf16_ops = [] + if self.version.release >= Version("1.11.0").release and self.use_bf16 and \ + (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1'): # pragma: no cover + self._get_bf16_ops_recursively(self.model._model, '', bf16_ops) + bf16_ops_list = [(op) for op in bf16_ops if op not in quantizable_ops] self.model.model.training = True torch.quantization.prepare_qat(self.model._model, inplace=True) - def _post_hook_for_qat(self): - torch.quantization.convert(self.model._model, inplace=True) # This is a flag for reloading self.model.q_config = { 'is_oneshot': True, 'framework': 'pytorch', 'reduce_range': REDUCE_RANGE, - 'approach': 'quant_aware_training' + 'approach': 'quant_aware_training', + 'bf16_ops_list': bf16_ops_list, } + def _post_hook_for_qat(self): + torch.quantization.convert(self.model._model, inplace=True) + if len(self.model.q_config['bf16_ops_list']) > 0 and \ + self.version.release >= Version("1.11.0").release and self.use_bf16 and \ + (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1'): # pragma: no cover + self.model._model = torch_utils.bf16_convert.Convert(self.model._model, self.model.q_config) + def _pre_hook_for_hvd(self, dataloader=None): # TODO: lazy init here hvd.init() @@ -2220,7 +2235,8 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None): self.model_calibration(q_model, dataloader, iterations, None, tune_cfg.get('calib_sampling_size', 1)) q_model.save_qconf_summary(qconf_summary=self.ipex_config_path) - if 
self.use_bf16: + if self.use_bf16 and (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1') and \ + (self.version.release >= Version("1.11.0").release): with torch.no_grad(): with torch.cpu.amp.autocast(): q_model = ipex.quantization.convert(q_model) @@ -2231,6 +2247,7 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None): q_model = torch.jit.trace(q_model, example_inputs, strict=False) q_model = torch.jit.freeze(q_model.eval()) else: + q_model = ipex.quantization.convert(q_model) with torch.no_grad(): try: q_model = torch.jit.trace(q_model, example_inputs) @@ -2486,7 +2503,7 @@ def _get_quantizable_ops_recursively(self, model, prefix, quantizable_ops): if isinstance(self.q_dataloader, BaseDataLoader): self.q_dataloader.batch(batch_size) logger.info('Recovery `calibration.dataloader.batchsize` {} according \ - to config.yaml'.format(batch_size)) + to config.yaml' .format(batch_size)) del init_model with open(self.ipex_config_path, 'r') as f: self.cfgs = json.load(f) @@ -2661,12 +2678,11 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None): self.tune_cfg = tune_cfg self.tune_cfg["approach"] = self.approach self.tune_cfg["framework"] = "pytorch_fx" - # pragma: no cover - if self.approach != 'post_training_dynamic_quant' and self.version.release >= Version("1.13.0").release: - assert dataloader is not None, "Please pass a dataloader to quantizer!" - example_inputs = get_example_inputs(model._model, dataloader) - else: - example_inputs = None + + # PyTorch 1.13 and above version, need example_inputs for fx trace, but it not realy used, + # so set it to None. + example_inputs = None + if self.default_qconfig is not None: default_qconfig = copy.deepcopy(self.default_qconfig) default_qconfig['activation']['dtype'] = \ @@ -2773,7 +2789,7 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None): q_model._model, prefix='') if len(self.tune_cfg['bf16_ops_list']) > 0 and \ - self.version.release >= Version("1.11.0").release and \ + self.version.release >= Version("1.11.0").release and self.use_bf16 and \ (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1'): # pragma: no cover q_model._model = torch_utils.bf16_convert.Convert(q_model._model, self.tune_cfg) @@ -2843,6 +2859,12 @@ def _pre_hook_for_qat(self, dataloader=None): quantizable_ops = [] tmp_model = self.fuse_fx_model(self.model, is_qat=True) self._get_quantizable_ops_recursively(tmp_model, '', quantizable_ops) + self.bf16_ops = self.query_handler.get_op_types_by_precision("bf16") + bf16_ops = [] + if self.version.release >= Version("1.11.0").release and self.use_bf16 and \ + (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1'): # pragma: no cover + self._get_bf16_ops_recursively(tmp_model, '', bf16_ops) + bf16_ops_list = [(op) for op in bf16_ops if op not in quantizable_ops] quantized_ops = OrderedDict() for op in quantizable_ops: if op[1] in [ @@ -2851,6 +2873,10 @@ def _pre_hook_for_qat(self, dataloader=None): quantized_ops[op[0]] = torch.quantization.default_dynamic_qconfig else: quantized_ops[op[0]] = q_cfgs + # build for fetching scale and zeropoint + op_config_dict = {} + for op in quantizable_ops: + op_config_dict[op] = {'weight': {'dtype': 'int8'}, 'activation': {'dtype': 'uint8'}} if self.version.release < Version("1.11.0").release: quantized_ops["default_qconfig"] = None else: @@ -2861,11 +2887,10 @@ def _pre_hook_for_qat(self, dataloader=None): from torch.quantization.quantize_fx import prepare_qat_fx fx_op_cfgs = _cfgs_to_fx_cfgs(quantized_ops, 'quant_aware_training') self.model._model.train() - if 
self.version.release >= Version("1.13.0").release: # pragma: no cover - assert dataloader is not None, "Please pass dataloader to qat hook!" - example_inputs = get_example_inputs(self.model._model, dataloader) - else: - example_inputs = None + + # PyTorch 1.13 and above version, need example_inputs for fx trace, but it not realy used, + # so set it to None. + example_inputs = None if self.sub_module_list is None: if self.version.release >= Version("1.13.0").release: # pragma: no cover @@ -2893,10 +2918,13 @@ def _pre_hook_for_qat(self, dataloader=None): example_inputs=example_inputs) # This is a flag for reloading self.model.q_config = { + 'calib_sampling_size': 100, # tmp arg for export API 'is_oneshot': True, 'framework': 'pytorch_fx', 'reduce_range': REDUCE_RANGE, 'quantizable_ops': quantizable_ops, + 'bf16_ops_list': bf16_ops_list, + 'op': op_config_dict, 'sub_module_list': self.sub_module_list, 'approach': 'quant_aware_training' } @@ -2919,6 +2947,15 @@ def _post_hook_for_qat(self): PyTorch_FXAdaptor.convert_sub_graph(self.sub_module_list, \ self.model._model, prefix='') + if self.approach != 'post_training_dynamic_quant': + self._get_scale_zeropoint(self.model._model, self.model.q_config) + if len(self.model.q_config['bf16_ops_list']) > 0 and \ + self.version.release >= Version("1.11.0").release and self.use_bf16 and \ + (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1'): # pragma: no cover + self.model._model = torch_utils.bf16_convert.Convert(self.model._model, self.model.q_config) + self._dump_model_op_stats(self.model._model, self.model.q_config, self.approach) + torch_utils.util.get_embedding_contiguous(self.model._model) + def train(self, model, dataloader, optimizer_tuple, criterion_tuple, hooks, **kwargs): """Execute the train process on the specified model. 
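The adaptor changes above extend the eager and FX QAT hooks to collect a `bf16_ops_list` during preparation and to run the BF16 convert pass after `torch.quantization.convert`, gated on `use_bf16`, BF16-capable hardware (or `FORCE_BF16=1`), and PyTorch >= 1.11. A minimal sketch of the user-facing flow that exercises these hooks, mirroring the qat/fx GLUE example earlier in this patch, follows; `model` and `train_one_epoch` are placeholders.

```python
# Sketch of the QAT flow that drives the patched hooks above (placeholders only).
from neural_compressor.training import prepare_compression
from neural_compressor.config import QuantizationAwareTrainingConfig

conf = QuantizationAwareTrainingConfig(backend="pytorch_fx")
compression_manager = prepare_compression(model, conf)
compression_manager.callbacks.on_train_begin()   # QAT preparation step
model = compression_manager.model

train_one_epoch(model)   # user-defined fine-tuning loop (Trainer.train() in the example)

compression_manager.callbacks.on_train_end()     # int8 convert; BF16 ops convert too when the gate is met
```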
@@ -3092,7 +3129,7 @@ def _dump_model_op_stats(self, model, tune_cfg, approach): res = dict() self._get_sub_module_op_stats(model, tune_cfg, approach, res) - if (self.version.release >= Version("1.11.0").release) and \ + if self.use_bf16 and (self.version.release >= Version("1.11.0").release) and \ (CpuInfo().bf16 or os.getenv('FORCE_BF16') == '1'): # pragma: no cover bf16_ops_list = tune_cfg['bf16_ops_list'] if len(bf16_ops_list) > 0: diff --git a/neural_compressor/adaptor/tensorflow.py b/neural_compressor/adaptor/tensorflow.py index ccc57341fc6..f9f229e96b2 100644 --- a/neural_compressor/adaptor/tensorflow.py +++ b/neural_compressor/adaptor/tensorflow.py @@ -648,6 +648,8 @@ def _dump_model_op_stats(self, model_graphdef): origin_op_type = 'DepthwiseConv2dNative' if origin_op_type == 'BatchMatMul': origin_op_type = 'BatchMatMulV2' + if origin_op_type == 'FusedBatchMatMulV2': + origin_op_type = '_MklFusedBatchMatMulV2' if origin_op_type == 'Deconv2D': origin_op_type = 'Conv2DBackpropInput' if origin_op_type == 'Deconv3D': diff --git a/neural_compressor/adaptor/tensorflow.yaml b/neural_compressor/adaptor/tensorflow.yaml index 5502158a443..62524f544db 100644 --- a/neural_compressor/adaptor/tensorflow.yaml +++ b/neural_compressor/adaptor/tensorflow.yaml @@ -30,7 +30,7 @@ 'MaxPool', 'MaxPool3D', 'AvgPool', 'Conv2DBackpropInput', 'Conv3DBackpropInputV2'] bf16: ["_MklLayerNorm", "Conv2D", "Conv2DBackpropFilter", "Conv2DBackpropInput", "Conv3D", "Conv3DBackpropFilterV2", "Conv3DBackpropInputV2", "DepthwiseConv2dNative", "DepthwiseConv2dNativeBackpropFilter", "DepthwiseConv2dNativeBackpropInput", "GRUBlockCell", - "AUGRUBlockCell", "MklGRU", "MklAUGRU", "MatMul", "BatchMatMul", "BatchMatMulV2", "Einsum", # allow_list + "AUGRUBlockCell", "MklGRU", "MklAUGRU", "MatMul", "BatchMatMul", "BatchMatMulV2", "_MklFusedBatchMatMulV2", "Einsum", # allow_list "Add", "AddN", "AddV2", "AvgPool", "AvgPool3D", "AvgPool3DGrad", "AvgPoolGrad", "BiasAdd", "BiasAddGrad", "BiasAddV1", "Erf", "FusedBatchNormV2", "FusedBatchNormGradV2", "FusedBatchNormV3", "FusedBatchNormGradV3", "LeakyRelu", "LeakyReluGrad", "Mean", "Mul", "Sub", "Elu", "EluGrad", "FloorDiv", "_FusedBatchNormEx", "Log", "Log1p", "LogSoftmax", "Prod", "RealDiv", @@ -299,6 +299,7 @@ 'Dequantize + DepthwiseConv2dNative + Add + Relu6 + QuantizeV2', 'Dequantize + DepthwiseConv2dNative + BiasAdd + QuantizeV2', 'Dequantize + FusedBatchNormV3 + Relu + QuantizeV2', + 'Dequantize + FusedBatchNormV3 + LeakyRelu + QuantizeV2', 'Dequantize + _MklFusedInstanceNorm + Relu + QuantizeV2', 'Dequantize + _MklFusedInstanceNorm + LeakyRelu + QuantizeV2', 'Dequantize + Conv2DBackpropInput + BiasAdd + QuantizeV2', diff --git a/neural_compressor/adaptor/tf_utils/graph_converter.py b/neural_compressor/adaptor/tf_utils/graph_converter.py index 6e09ae02751..ca6573baf9f 100644 --- a/neural_compressor/adaptor/tf_utils/graph_converter.py +++ b/neural_compressor/adaptor/tf_utils/graph_converter.py @@ -160,6 +160,10 @@ def _inference(self, model): Args: model(TensorflowBaseModel): input TensorflowBaseModel """ + # ITEX optimization has broken INC calibration process. + # INC needs turn off ITEX optimization pass in calibration stage. + # TODO ITEX will provide API to replace setting environment variable. 
+ os.environ["ITEX_REMAPPER"] = "0" sess = model.sess iter_op = model.iter_op input_tensor = model.input_tensor @@ -220,24 +224,25 @@ def check_shape(tensor, data): return True disorder_tensors = [] - disorder_inputs = [] + disorder_inputs = [] for idx, sort_tensor in enumerate(input_tensor): sort_input = inputs[idx] if check_shape(sort_tensor, sort_input): - feed_dict.update({sort_tensor: sort_input}) + feed_dict.update({sort_tensor: sort_input}) else: disorder_tensors.append(sort_tensor) disorder_inputs.append(sort_input) for i, dis_tensor in enumerate(disorder_tensors): - for j, dis_input in enumerate(disorder_inputs): - if check_shape(dis_tensor, dis_input): - feed_dict.update({dis_tensor: dis_input}) - break + for j, dis_input in enumerate(disorder_inputs): + if check_shape(dis_tensor, dis_input): + feed_dict.update({dis_tensor: dis_input}) + break _ = sess.run(output_tensor, feed_dict) if iter_op==[] \ else iterator_sess_run(sess, iter_op, \ feed_dict, output_tensor, self.calib_iteration) if idx + 1 == self.calib_iteration: break + os.environ["ITEX_REMAPPER"] = "1" def _check_tf_version(self): is_supported_version = False @@ -517,6 +522,7 @@ def bf16_convert(self): FP32 + INT8 mixed precision graph. """ try: + logger.info("Start BF16 conversion.") self._tmp_model.graph_def = BF16Convert( self._tmp_model.graph_def, self.fp32_ops, diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/bf16/bf16_convert.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/bf16/bf16_convert.py index 0a79543b409..707bd69c47d 100644 --- a/neural_compressor/adaptor/tf_utils/graph_rewriter/bf16/bf16_convert.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/bf16/bf16_convert.py @@ -243,11 +243,6 @@ def _model_bf16_convert(self): for bf16_node_name in set(self.bf16_ops): if bf16_node_name not in self.cur_graph.node_name_details: self.bf16_ops.remove(bf16_node_name) - continue - else: - if "fused_ops" in self.cur_graph.node_name_details[bf16_node_name].node.attr: - self.bf16_ops.remove(bf16_node_name) - continue for bf16_node_name in sorted(list(set(self.bf16_ops))): self._bf16_convert(bf16_node_name) return self.cur_graph.dump_graph() diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_conv.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_conv.py index 042c89769d9..e5f1da798ca 100644 --- a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_conv.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_conv.py @@ -45,6 +45,7 @@ def do_transformation(self): target_nodes = cur_graph.query_fusion_pattern_nodes( [["Pad"], ["Conv2D", "Conv3D", "DepthwiseConv2dNative"], ('BiasAdd', 'Add', 'AddV2')]) + padding_tensor_dict = {} for node_combination in target_nodes: conv_name = node_combination[1] @@ -70,21 +71,26 @@ def do_transformation(self): continue padding_tensor = None - pad_node = graph_info[node_combination[0]].node - if graph_info[pad_node.input[1]].node.op != 'Const': - input_node = graph_info[pad_node.input[1]].node - if input_node.op == 'DataFormatVecPermute': - parent_input_node = graph_info[input_node.input[0]].node - if parent_input_node.op == 'Const': - padding_tensor = tensor_util.MakeNdarray( \ - parent_input_node.attr["value"].tensor).flatten() + pad_node = None + if node_combination[0] not in padding_tensor_dict: + pad_node = graph_info[node_combination[0]].node + if graph_info[pad_node.input[1]].node.op != 'Const': + input_node = graph_info[pad_node.input[1]].node + if 
input_node.op == 'DataFormatVecPermute': + parent_input_node = graph_info[input_node.input[0]].node + if parent_input_node.op == 'Const': + padding_tensor = tensor_util.MakeNdarray( \ + parent_input_node.attr["value"].tensor).flatten() + else: + continue else: continue else: - continue + padding_tensor = tensor_util.MakeNdarray( + graph_info[pad_node.input[1]].node.attr["value"].tensor).flatten() + padding_tensor_dict[node_combination[0]] = padding_tensor else: - padding_tensor = tensor_util.MakeNdarray( - graph_info[pad_node.input[1]].node.attr["value"].tensor).flatten() + padding_tensor = padding_tensor_dict[node_combination[0]] if self.itex_qdq_mode: enabled_pad_conv2d = bool(tf.version.VERSION == '1.15.0-up3' or \ @@ -95,12 +101,13 @@ def do_transformation(self): if any(padding_tensor) and not enabled_pad_conv2d: # pragma: no cover continue - if graph_info[pad_node.input[1]].node.op != 'Const': - cur_graph.node_name_details[pad_node.name].node.input.remove(pad_node.input[1]) - cur_graph.remove_node_with_single_input_output(pad_node.name) - else: - cur_graph.remove_node_with_single_input_output(pad_node.name) - cur_graph.remove_node(pad_node.input[1]) + if pad_node: + if graph_info[pad_node.input[1]].node.op != 'Const': + cur_graph.node_name_details[pad_node.name].node.input.remove(pad_node.input[1]) + cur_graph.remove_node_with_single_input_output(pad_node.name) + else: + cur_graph.remove_node_with_single_input_output(pad_node.name) + cur_graph.remove_node(pad_node.input[1]) conv_node = graph_info[node_combination[1]].node if self.itex_qdq_mode: if any(padding_tensor) and enabled_pad_conv2d: # pragma: no cover diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_fp32_conv.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_fp32_conv.py index 8b63b17ff31..2866a40ec04 100644 --- a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_fp32_conv.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_pad_with_fp32_conv.py @@ -46,6 +46,7 @@ def do_transformation(self): target_nodes = cur_graph.query_fusion_pattern_nodes( [["Pad"], ["Conv2D", "DepthwiseConv2dNative"], ('BiasAdd', 'Add', 'AddV2')]) + padding_tensor_dict = {} for node_combination in target_nodes: conv_name = node_combination[1] @@ -71,21 +72,26 @@ def do_transformation(self): continue padding_tensor = None - pad_node = graph_info[node_combination[0]].node - if graph_info[pad_node.input[1]].node.op != 'Const': - input_node = graph_info[pad_node.input[1]].node - if input_node.op == 'DataFormatVecPermute': - parent_input_node = graph_info[input_node.input[0]].node - if parent_input_node.op == 'Const': - padding_tensor = tensor_util.MakeNdarray( \ - parent_input_node.attr["value"].tensor).flatten() + pad_node = None + if node_combination[0] not in padding_tensor_dict: + pad_node = graph_info[node_combination[0]].node + if graph_info[pad_node.input[1]].node.op != 'Const': + input_node = graph_info[pad_node.input[1]].node + if input_node.op == 'DataFormatVecPermute': + parent_input_node = graph_info[input_node.input[0]].node + if parent_input_node.op == 'Const': + padding_tensor = tensor_util.MakeNdarray( \ + parent_input_node.attr["value"].tensor).flatten() + else: + continue else: continue else: - continue + padding_tensor = tensor_util.MakeNdarray( + graph_info[pad_node.input[1]].node.attr["value"].tensor).flatten() + padding_tensor_dict[node_combination[0]] = padding_tensor else: - padding_tensor = tensor_util.MakeNdarray( - 
graph_info[pad_node.input[1]].node.attr["value"].tensor).flatten() + padding_tensor = padding_tensor_dict[node_combination[0]] if self.itex_qdq_mode: enabled_pad_conv2d = bool(tf.version.VERSION == '1.15.0-up3' or \ @@ -95,12 +101,14 @@ def do_transformation(self): if any(padding_tensor) and not enabled_pad_conv2d: # pragma: no cover continue - if graph_info[pad_node.input[1]].node.op != 'Const': - cur_graph.node_name_details[pad_node.name].node.input.remove(pad_node.input[1]) - cur_graph.remove_node_with_single_input_output(pad_node.name) - else: - cur_graph.remove_node_with_single_input_output(pad_node.name) - cur_graph.remove_node(pad_node.input[1]) + + if pad_node: + if graph_info[pad_node.input[1]].node.op != 'Const': + cur_graph.node_name_details[pad_node.name].node.input.remove(pad_node.input[1]) + cur_graph.remove_node_with_single_input_output(pad_node.name) + else: + cur_graph.remove_node_with_single_input_output(pad_node.name) + cur_graph.remove_node(pad_node.input[1]) conv_node = graph_info[node_combination[1]].node # Helper.set_attr_int_list(conv_node, "padding_list", padding_tensor) # only when padding attr is explicit, the explicit_paddings is not empty diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/pre_optimize.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/pre_optimize.py index 4bb1d1a2b04..d7c2e33ca83 100644 --- a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/pre_optimize.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/pre_optimize.py @@ -146,16 +146,16 @@ def get_optimized_model(self, itex_mode=False): self._tmp_graph_def = ConvertPlaceholderToConst(self._tmp_graph_def).do_transformation() - self._tmp_graph_def = RemoveTrainingNodesOptimizer( - self._tmp_graph_def, protected_nodes=input_output_names).do_transformation() - self._tmp_graph_def = SwitchOptimizer(self._tmp_graph_def).do_transformation() + self._tmp_graph_def = GrapplerOptimizer( + self._tmp_graph_def, input_output_names, self.optimization).do_transformation() + self._tmp_graph_def = StripUnusedNodesOptimizer(self._tmp_graph_def, input_node_names, output_node_names).do_transformation() - self._tmp_graph_def = GrapplerOptimizer( - self._tmp_graph_def, input_output_names, self.optimization).do_transformation() + self._tmp_graph_def = RemoveTrainingNodesOptimizer( + self._tmp_graph_def, protected_nodes=input_output_names).do_transformation() self._tmp_graph_def = SplitSharedInputOptimizer(self._tmp_graph_def).do_transformation() @@ -204,7 +204,7 @@ def get_optimized_model(self, itex_mode=False): self._tmp_graph_def = FetchWeightFromReshapeOptimizer( self._tmp_graph_def).do_transformation() - if not self.new_api: + if not self.new_api and not itex_mode: #TODO we need to remove below optimizer once the TF enabled the single # matmul op quantization self._tmp_graph_def = InjectDummyBiasAddOptimizer( @@ -221,7 +221,7 @@ def get_optimized_model(self, itex_mode=False): self._tmp_graph_def = StripEquivalentNodesOptimizer( self._tmp_graph_def, output_node_names).do_transformation() - if self.new_api: + if self.new_api or itex_mode: self._tmp_graph_def = DilatedContraction( self._tmp_graph_def).do_transformation() self._tmp_graph_def.library.CopyFrom(self.model.graph_def.library) diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/int8/fuse_matmul_requantize.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/int8/fuse_matmul_requantize.py index 2060fecbc4e..9647b657d4c 100644 --- 
a/neural_compressor/adaptor/tf_utils/graph_rewriter/int8/fuse_matmul_requantize.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/int8/fuse_matmul_requantize.py @@ -588,10 +588,16 @@ def do_transformation(self): min_filter_node = None # The Min and Max of non-const weight node are from QuantizeV2's output, not valid nodes. # Add check here for excluding this case. - if ":2" not in new_node.input[6]: - max_filter_node = self.graph_info[new_node.input[6]].node - if ":1" not in new_node.input[5]: - min_filter_node = self.graph_info[new_node.input[5]].node + if len(attr_fused_ops) == 0: # single matmul case + if ":2" not in new_node.input[5]: + max_filter_node = self.graph_info[new_node.input[5]].node + if ":1" not in new_node.input[4]: + min_filter_node = self.graph_info[new_node.input[4]].node + else: + if ":2" not in new_node.input[6]: + max_filter_node = self.graph_info[new_node.input[6]].node + if ":1" not in new_node.input[5]: + min_filter_node = self.graph_info[new_node.input[5]].node last_node = self.graph_info[new_node.input[0]].node is_min_first = bool(quantized_node.attr['input_quant_mode'].s == b'MIN_FIRST') weight_node = self.graph_info[new_node.input[1]].node diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/qdq/insert_qdq_pattern.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/qdq/insert_qdq_pattern.py index 863876590a5..091d02add98 100644 --- a/neural_compressor/adaptor/tf_utils/graph_rewriter/qdq/insert_qdq_pattern.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/qdq/insert_qdq_pattern.py @@ -81,6 +81,8 @@ def do_transformation(self): self.g.graph = copy.deepcopy(self.model) self.graph_info = self.g.parse_graph() + self.g.get_frame_info() + # insert QDQ pattern for op's input for op_name in quantizable_op_names: if self._ignore_insert_qdq_pattern(op_name): @@ -115,20 +117,16 @@ def do_transformation(self): computational_node = self.graph_info[computational_node_name].node weight_name = computational_node.input[1] - weight_node = self.graph_info[weight_name].node if re.search(r"\w+:\d+", weight_name): weight_node = self.graph_info[weight_name.rsplit(':', 1)[0]].node else: weight_node = self.graph_info[weight_name].node - enter_node = None if weight_node.op == 'Enter': if self.itex_mode: parent_node = self.graph_info[Helper.node_name_from_input(weight_node.input[0])].node if not parent_node.op == 'Const': continue - else: - enter_node = weight_node - weight_node = parent_node + weight_node = parent_node else: continue @@ -139,10 +137,10 @@ def do_transformation(self): else: per_channel = False weight_bit = 7 - + self._insert_qdq_pattern_for_weight_node(computational_node, weight_node, - enter_node, + weight_name, min_max_values, per_channel, weight_bit, @@ -184,7 +182,7 @@ def _check_op_list(self, node_type): "MaxPool", "MaxPool3D", "FusedBatchNormV3", "Requantize", "RequantizePerChannel", "AvgPool", "Pad", "CropAndResize", "Dequantize", "Mean", "MatMul", "BatchMatMul", "BatchMatMulV2", "FakeQuantWithMinMaxVars", "_MklFusedInstanceNorm", - "Conv2DBackpropInput", "Conv3DBackpropInputV2") + "Conv2DBackpropInput", "Conv3DBackpropInputV2", "Sigmoid", "BiasAdd") return any([node_type.find(i) != -1 for i in op_list]) def _find_relu_node(self, node): @@ -200,7 +198,7 @@ def _find_relu_node(self, node): or len(self.node_name_mapping \ [Helper.node_name_from_input(node.input[0])].output) > 1): return True - elif 'T' in node.attr and node.attr['T'].type in (dtypes.quint8, dtypes.uint8): + elif 'T' in node.attr and dtypes.DType(node.attr['T'].type) in 
(dtypes.quint8, dtypes.uint8): return True elif (node.op.find("QuantizedConv") != -1 or node.op.find("QuantizedDepthwiseConv") != -1 or @@ -414,7 +412,7 @@ def _insert_qdq_pattern_for_each_input(self, op_name, namespace_prefix, def _insert_qdq_pattern_for_weight_node(self, computational_node, weight_node, - enter_node, + weight_name, min_max_values, per_channel, weight_bit=7.0, @@ -504,41 +502,27 @@ def _insert_qdq_pattern_for_weight_node(self, max_node = Helper.create_constant_node(max_name, max_value, dtypes.float32, device="cpu") if "BatchMatMul" in host_op_type and "BatchMatMul" not in weight_node.op: - min_node.input.append("^" + weight_node.name) - max_node.input.append("^" + weight_node.name) + min_node.input.append("^" + weight_name) + max_node.input.append("^" + weight_name) - quant_const_enter_node = None min_enter_node = None max_enter_node = None - if enter_node: - quant_const_enter_node = Helper.create_node('Enter', \ - qint8_const_name + '_enter', [weight_node.name]) - Helper.set_attr_string(quant_const_enter_node, - 'frame_name', enter_node.attr['frame_name'].s) - Helper.set_attr_dtype(quant_const_enter_node, 'T', dtypes.float32) - Helper.set_attr_bool(quant_const_enter_node, 'is_constant', True) - Helper.set_attr_int(quant_const_enter_node, \ - 'parallel_iterations', enter_node.attr['parallel_iterations'].i) + if insert_reshape: + reshape_dims_4to3_name = qint8_const_name + "_reshape_dims_4to3_" + reshape_dims_4to3_node = Helper.create_constant_node( + reshape_dims_4to3_name, shape_convert, dtypes.int32) + reshape_4to3_name = qint8_const_name + "_reshape_4to3_" + reshape_4to3_node = Helper.create_node("Reshape", reshape_4to3_name, + [weight_node.name, reshape_dims_4to3_name]) + reshape_4to3_node.attr["T"].CopyFrom( + attr_value_pb2.AttrValue(type=dtypes.float32.as_datatype_enum)) quant_node = Helper.create_node( "QuantizeV2", qint8_const_name + '_quant', - [quant_const_enter_node.name, min_name, max_name]) + [reshape_4to3_name, min_name, max_name]) else: - if insert_reshape: - reshape_dims_4to3_name = qint8_const_name + "_reshape_dims_4to3_" - reshape_dims_4to3_node = Helper.create_constant_node( - reshape_dims_4to3_name, shape_convert, dtypes.int32) - reshape_4to3_name = qint8_const_name + "_reshape_4to3_" - reshape_4to3_node = Helper.create_node("Reshape", reshape_4to3_name, - [weight_node.name, reshape_dims_4to3_name]) - reshape_4to3_node.attr["T"].CopyFrom( - attr_value_pb2.AttrValue(type=dtypes.float32.as_datatype_enum)) - quant_node = Helper.create_node( - "QuantizeV2", qint8_const_name + '_quant', - [reshape_4to3_name, min_name, max_name]) - else: - quant_node = Helper.create_node( - "QuantizeV2", qint8_const_name + '_quant', - [weight_node.name, min_name, max_name]) + quant_node = Helper.create_node( + "QuantizeV2", qint8_const_name + '_quant', + [weight_node.name, min_name, max_name]) dequant_node = Helper.create_node( "Dequantize", base_name + '_dequant', @@ -549,10 +533,10 @@ def _insert_qdq_pattern_for_weight_node(self, Helper.set_attr_dtype(dequant_node, "T", dtypes.qint8) Helper.set_attr_string(dequant_node, "mode", b"SCALED") if per_channel: - if host_op_type == 'Conv2D' or host_op_type == 'Conv2DBackpropInput': + if host_op_type in ('Conv2D', 'Conv2DBackpropInput'): Helper.set_attr_int(quant_node, 'axis', 3) Helper.set_attr_int(dequant_node, 'axis', 3) - elif host_op_type == 'Conv3D' or host_op_type == 'Conv3DBackpropInputV2': + elif host_op_type in ('Conv3D', 'Conv3DBackpropInputV2'): Helper.set_attr_int(quant_node, 'axis', 4) 
Helper.set_attr_int(dequant_node, 'axis', 4) elif host_op_type == 'MatMul': @@ -584,25 +568,24 @@ def _insert_qdq_pattern_for_weight_node(self, self.g_weight.add_node(reshape_3to4_node, dequant_node.name, [computational_node.name]) computational_node.input[1] = reshape_3to4_node.name else: - if enter_node: + if weight_node.name in self.g.parent_frame_details and self.g.parent_frame_details[weight_node.name]: min_enter_node = Helper.create_node('Enter', min_name + '_enter', [min_name]) - Helper.set_attr_string(min_enter_node, - 'frame_name', enter_node.attr['frame_name'].s) + Helper.set_attr_string(min_enter_node, 'frame_name', + self.g.parent_frame_details[weight_node.name].attr['frame_name'].s) Helper.set_attr_dtype(min_enter_node, 'T', dtypes.float32) Helper.set_attr_bool(min_enter_node, 'is_constant', True) Helper.set_attr_int(min_enter_node, 'parallel_iterations', \ - enter_node.attr['parallel_iterations'].i) + self.g.parent_frame_details[weight_node.name].attr['parallel_iterations'].i) max_enter_node = Helper.create_node('Enter', max_name + '_enter', [max_name]) - Helper.set_attr_string(max_enter_node, - 'frame_name', enter_node.attr['frame_name'].s) + Helper.set_attr_string(max_enter_node, 'frame_name', + self.g.parent_frame_details[weight_node.name].attr['frame_name'].s) Helper.set_attr_dtype(max_enter_node, 'T', dtypes.float32) Helper.set_attr_bool(max_enter_node, 'is_constant', True) Helper.set_attr_int(max_enter_node, 'parallel_iterations',\ - enter_node.attr['parallel_iterations'].i) + self.g.parent_frame_details[weight_node.name].attr['parallel_iterations'].i) - self.g_weight.add_node(quant_const_enter_node, weight_node.name, [quant_node.name]) - self.g_weight.add_node(quant_node, quant_const_enter_node.name, []) + self.g_weight.add_node(quant_node, weight_name, []) self.g_weight.add_node(min_node, None, [min_enter_node.name]) self.g_weight.add_node(max_node, None, [max_enter_node.name]) self.g_weight.add_node(min_enter_node, min_node.name, [quant_node.name]) @@ -610,7 +593,7 @@ def _insert_qdq_pattern_for_weight_node(self, self.g_weight.add_node(dequant_node, quant_node.name, [computational_node.name]) computational_node.input[1] = dequant_node.name else: - self.g_weight.add_node(quant_node, weight_node.name, []) + self.g_weight.add_node(quant_node, weight_name, []) self.g_weight.add_node(min_node, None, [quant_node.name]) self.g_weight.add_node(max_node, None, [quant_node.name]) self.g_weight.add_node(dequant_node, quant_node.name, [computational_node.name]) diff --git a/neural_compressor/adaptor/tf_utils/graph_util.py b/neural_compressor/adaptor/tf_utils/graph_util.py index 77903d4b62c..d810f1d87a1 100644 --- a/neural_compressor/adaptor/tf_utils/graph_util.py +++ b/neural_compressor/adaptor/tf_utils/graph_util.py @@ -918,11 +918,13 @@ def gen_per_iter(data): if i.startswith(first_line): iterations += 1 - step = len(valid_data) / iterations + step = int(len(valid_data) / iterations) final_res = [] for i in range(iterations): final_res.extend(gen_per_iter(valid_data[int(i*step): int(step*( i+ 1))])) + if i + 1 == iterations and int(step*( i+ 1)) < len(valid_data): + final_res.extend(gen_per_iter(valid_data[int(step*( i+ 1)): len(valid_data)])) return final_res diff --git a/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_bn.py b/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_bn.py index 9dbe1c82f0a..f36b02a3e94 100644 --- a/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_bn.py +++ 
b/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_bn.py @@ -31,8 +31,9 @@ def __init__(self, **kwargs): reverse=True) if self.new_api: self.fusion_mapping = { + 'FusedBatchNormV3': self.apply_newly_bn_relu_fusion, 'FusedBatchNormV3Relu': self.apply_newly_bn_relu_fusion, - 'FusedBatchNormV3': self.apply_newly_bn_relu_fusion + 'FusedBatchNormV3LeakyRelu': self.apply_newly_bn_leakyrelu_fusion } else: self.fusion_mapping = {} @@ -75,8 +76,7 @@ def apply_newly_bn_relu_fusion(self, match_node_name): [output_min_node_name] + [output_max_node_name] + control_inputs output_min_node = helper.create_constant_node(output_min_node_name, -1., dtypes.float32) output_max_node = helper.create_constant_node(output_max_node_name, 1., dtypes.float32) - quantized_bn_node = helper.create_node(node_op, quantized_node_name, - quantized_node_input_names) + quantized_bn_node = helper.create_node(node_op, quantized_node_name, quantized_node_input_names) if relu_node_name is not None: helper.set_attr_string(quantized_bn_node, "activation_mode", b'Relu') if self.node_name_mapping[offset_name].node.op == "Const": @@ -141,6 +141,108 @@ def apply_newly_bn_relu_fusion(self, match_node_name): new_node.CopyFrom(node) self.add_output_graph_node(new_node) + def apply_newly_bn_leakyrelu_fusion(self, match_node_name): + matched_node = self.node_name_mapping[match_node_name[0]] + skip_node_name = match_node_name[1:] + control_inputs, normal_inputs = self._get_node_input( + matched_node.node.name) + scale_name = normal_inputs[1] + offset_name = normal_inputs[2] + mean_name = normal_inputs[3] + variance_name = normal_inputs[4] + + all_input_names = self._add_eightbit_prologue_nodes(matched_node.node.name) + all_input_names = [ + all_input_names[0], + scale_name, + offset_name, + mean_name, + variance_name, + all_input_names[1], + all_input_names[2] + ] + + for _, node in enumerate(self.input_graph.node): + if node.name in skip_node_name: + self.logger.debug("skip node {}".format(node.name)) + elif node.name == match_node_name[0]: + self.logger.debug("Matched node {} with input {}.".format(node.name, node.input)) + leakyrelu_node_name = match_node_name[1] + node_op = '_QuantizedFusedBatchNorm' + quantized_node_name = node.name + "_eightbit_quantized_bn" + output_min_node_name = quantized_node_name + "_input7_output_min" + output_max_node_name = quantized_node_name + "_input8_output_max" + quantized_node_input_names = all_input_names + \ + [output_min_node_name] + [output_max_node_name] + control_inputs + output_min_node = helper.create_constant_node(output_min_node_name, -1., dtypes.float32) + output_max_node = helper.create_constant_node(output_max_node_name, 1., dtypes.float32) + quantized_bn_node = helper.create_node(node_op, quantized_node_name, quantized_node_input_names) + + helper.set_attr_string(quantized_bn_node, "activation_mode", b'LeakyRelu') + helper.copy_attr(quantized_bn_node, "alpha", \ + self.node_name_mapping[leakyrelu_node_name].node.attr["alpha"]) + if self.node_name_mapping[offset_name].node.op == "Const": + helper.set_attr_bool(quantized_bn_node, "is_offset_const", True) + else: + helper.set_attr_bool(quantized_bn_node, "is_offset_const", False) + if self.node_name_mapping[mean_name].node.op == "Const": + helper.set_attr_bool(quantized_bn_node, "is_mean_const", True) + else: + helper.set_attr_bool(quantized_bn_node, "is_mean_const", False) + helper.set_attr_dtype(quantized_bn_node, "T", dtypes.qint8) + helper.set_attr_dtype(quantized_bn_node, "U", dtypes.float32) + 
helper.set_attr_dtype(quantized_bn_node, "Tout", dtypes.qint8) + + """ + # 0. x + # 1. scale + # 2. offset + # 3. mean + # 4. variance + # 5. x_min + # 6. x_max + # 7. {output_min} + # 8. {output_max} + """ + helper.set_attr_type_list(quantized_bn_node, 'input_types', [ + dtypes.qint8.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + ]) + + + """ + # 0. output + # 1. output_min + # 2. output_max + """ + helper.set_attr_type_list(quantized_bn_node, 'out_types', [ + dtypes.qint8.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + ]) + self.add_output_graph_node(output_min_node) + self.add_output_graph_node(output_max_node) + self.add_output_graph_node(quantized_bn_node) + self._intel_cpu_add_dequantize_result_node( + quantized_output_name = quantized_node_name, + original_node_name = match_node_name[-1], + dtype = dtypes.qint8, + min_tensor_index = 1, + performance_only=self.performance_only + ) + + else: + new_node = node_def_pb2.NodeDef() + new_node.CopyFrom(node) + self.add_output_graph_node(new_node) + def get_longest_fuse(self): self._get_op_list() real_patterns = [pattern[1 :-1] for pattern in self.sorted_patterns] diff --git a/neural_compressor/adaptor/tf_utils/quantize_graph/quantize_graph_bn.py b/neural_compressor/adaptor/tf_utils/quantize_graph/quantize_graph_bn.py index 5bf86c74e72..9a425505dc1 100644 --- a/neural_compressor/adaptor/tf_utils/quantize_graph/quantize_graph_bn.py +++ b/neural_compressor/adaptor/tf_utils/quantize_graph/quantize_graph_bn.py @@ -31,8 +31,9 @@ def __init__(self, **kwargs): reverse=True) if self.new_api: self.fusion_mapping = { + 'FusedBatchNormV3': self.apply_newly_bn_relu_fusion, 'FusedBatchNormV3Relu': self.apply_newly_bn_relu_fusion, - 'FusedBatchNormV3': self.apply_newly_bn_relu_fusion + 'FusedBatchNormV3LeakyRelu': self.apply_newly_bn_leakyrelu_fusion } else: self.fusion_mapping = {} @@ -75,8 +76,7 @@ def apply_newly_bn_relu_fusion(self, match_node_name): [output_min_node_name] + [output_max_node_name] + control_inputs output_min_node = helper.create_constant_node(output_min_node_name, -1., dtypes.float32) output_max_node = helper.create_constant_node(output_max_node_name, 1., dtypes.float32) - quantized_bn_node = helper.create_node(node_op, quantized_node_name, - quantized_node_input_names) + quantized_bn_node = helper.create_node(node_op, quantized_node_name, quantized_node_input_names) if relu_node_name is not None: helper.set_attr_string(quantized_bn_node, "activation_mode", b'Relu') if self.node_name_mapping[offset_name].node.op == "Const": @@ -140,6 +140,108 @@ def apply_newly_bn_relu_fusion(self, match_node_name): new_node.CopyFrom(node) self.add_output_graph_node(new_node) + def apply_newly_bn_leakyrelu_fusion(self, match_node_name): + matched_node = self.node_name_mapping[match_node_name[0]] + skip_node_name = match_node_name[1:] + control_inputs, normal_inputs = self._get_node_input( + matched_node.node.name) + scale_name = normal_inputs[1] + offset_name = normal_inputs[2] + mean_name = normal_inputs[3] + variance_name = normal_inputs[4] + + all_input_names = self._add_eightbit_prologue_nodes(matched_node.node.name) + all_input_names = [ + all_input_names[0], + scale_name, + offset_name, + mean_name, + variance_name, + all_input_names[1], + 
all_input_names[2] + ] + + for _, node in enumerate(self.input_graph.node): + if node.name in skip_node_name: + self.logger.debug("skip node {}".format(node.name)) + elif node.name == match_node_name[0]: + self.logger.debug("Matched node {} with input {}.".format(node.name, node.input)) + leakyrelu_node_name = match_node_name[1] + node_op = '_QuantizedFusedBatchNorm' + quantized_node_name = node.name + "_eightbit_quantized_bn" + output_min_node_name = quantized_node_name + "_input7_output_min" + output_max_node_name = quantized_node_name + "_input8_output_max" + quantized_node_input_names = all_input_names + \ + [output_min_node_name] + [output_max_node_name] + control_inputs + output_min_node = helper.create_constant_node(output_min_node_name, -1., dtypes.float32) + output_max_node = helper.create_constant_node(output_max_node_name, 1., dtypes.float32) + quantized_bn_node = helper.create_node(node_op, quantized_node_name, quantized_node_input_names) + + helper.set_attr_string(quantized_bn_node, "activation_mode", b'LeakyRelu') + helper.copy_attr(quantized_bn_node, "alpha", \ + self.node_name_mapping[leakyrelu_node_name].node.attr["alpha"]) + if self.node_name_mapping[offset_name].node.op == "Const": + helper.set_attr_bool(quantized_bn_node, "is_offset_const", True) + else: + helper.set_attr_bool(quantized_bn_node, "is_offset_const", False) + if self.node_name_mapping[mean_name].node.op == "Const": + helper.set_attr_bool(quantized_bn_node, "is_mean_const", True) + else: + helper.set_attr_bool(quantized_bn_node, "is_mean_const", False) + helper.set_attr_dtype(quantized_bn_node, "T", dtypes.qint8) + helper.set_attr_dtype(quantized_bn_node, "U", dtypes.float32) + helper.set_attr_dtype(quantized_bn_node, "Tout", dtypes.qint8) + + """ + # 0. x + # 1. scale + # 2. offset + # 3. mean + # 4. variance + # 5. x_min + # 6. x_max + # 7. {output_min} + # 8. {output_max} + """ + helper.set_attr_type_list(quantized_bn_node, 'input_types', [ + dtypes.qint8.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + ]) + + + """ + # 0. output + # 1. output_min + # 2. 
output_max + """ + helper.set_attr_type_list(quantized_bn_node, 'out_types', [ + dtypes.qint8.as_datatype_enum, + dtypes.float32.as_datatype_enum, + dtypes.float32.as_datatype_enum, + ]) + self.add_output_graph_node(output_min_node) + self.add_output_graph_node(output_max_node) + self.add_output_graph_node(quantized_bn_node) + self._intel_cpu_add_dequantize_result_node( + quantized_output_name = quantized_node_name, + original_node_name = match_node_name[-1], + dtype = dtypes.qint8, + min_tensor_index = 1, + performance_only=self.performance_only + ) + + else: + new_node = node_def_pb2.NodeDef() + new_node.CopyFrom(node) + self.add_output_graph_node(new_node) + def get_longest_fuse(self): self._get_op_list() matched_rule, matched_node_name = self._is_match(self.sorted_patterns) diff --git a/neural_compressor/adaptor/torch_utils/onnx.py b/neural_compressor/adaptor/torch_utils/onnx.py index aadcb80f810..c667281cb66 100644 --- a/neural_compressor/adaptor/torch_utils/onnx.py +++ b/neural_compressor/adaptor/torch_utils/onnx.py @@ -30,17 +30,30 @@ def __init__(self, dataloader, sample_size=100): self.datasize = self.batch_num * self.batch_size self.data = [] - for i, (input, label) in enumerate(self.dataloader): - if i * self.batch_size >= self.datasize: - break - if isinstance(input, dict) or isinstance(input, UserDict): - batch = {k: v.detach().cpu().numpy() for k, v in input.items()} - elif isinstance(input, list) or isinstance(input, tuple): - batch = {'input': [v.detach().cpu().numpy() for v in input]} - else: - batch = {'input': input.detach().cpu().numpy()} - self.data.append(batch) - self.data = iter(self.data) + try: + for i, (input, label) in enumerate(self.dataloader): + if i * self.batch_size >= self.datasize: + break + if isinstance(input, dict) or isinstance(input, UserDict): + batch = {k: v.detach().cpu().numpy() for k, v in input.items()} + elif isinstance(input, list) or isinstance(input, tuple): + batch = {'input': [v.detach().cpu().numpy() for v in input]} + else: + batch = {'input': input.detach().cpu().numpy()} + self.data.append(batch) + self.data = iter(self.data) + except: + for i, input in enumerate(self.dataloader): + if i * self.batch_size >= self.datasize: + break + if isinstance(input, dict) or isinstance(input, UserDict): + batch = {k: v.detach().cpu().numpy() for k, v in input.items()} + elif isinstance(input, list) or isinstance(input, tuple): + batch = {'input': [v.detach().cpu().numpy() for v in input]} + else: + batch = {'input': input.detach().cpu().numpy()} + self.data.append(batch) + self.data = iter(self.data) def get_next(self): return next(self.data, None) diff --git a/neural_compressor/benchmark.py b/neural_compressor/benchmark.py index 30e3bf8aa28..87d425a846b 100644 --- a/neural_compressor/benchmark.py +++ b/neural_compressor/benchmark.py @@ -18,6 +18,8 @@ from .utils import logger from .data import DATALOADERS from .experimental import Benchmark as ExpBenchmark +from .conf.pythonic_config import Config +from .config import BenchmarkConfig class Benchmark(object): """Benchmark class can be used to evaluate the model performance, with the objective @@ -67,9 +69,11 @@ def postprocess(self, name, postprocess_cls, **kwargs): self.exp_benchmarker.postprocess = nc_postprocess -def benchmark( +def fit( model, config=None, b_dataloader=None, b_func=None ): + if isinstance(config, BenchmarkConfig): + config = Config(benchmark=config) benchmarker = ExpBenchmark(config) benchmarker.model = model if b_func is not None: @@ -78,3 +82,6 @@ def benchmark( 
benchmarker.b_dataloader = b_dataloader benchmarker() return benchmarker.results + + +benchmark = fit diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index aef8f695291..86f1cac018b 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -746,6 +746,7 @@ def percent_to_float(data): 'pre_post_process_quantization': True}, 'model_wise': {'weight': {'bit': [7.0]}, 'activation': {}}, + 'optimization_level': 1, }): { Optional('approach', default='post_training_static_quant'): And( str, @@ -839,8 +840,10 @@ def percent_to_float(data): Optional('op_wise', default=None): { str: ops_schema }, + Optional('optimization_level', default=1): And(int, lambda level: level in [0, 1]), }, - Optional('use_bf16', default=False): bool, + Optional('use_bf16', default=True): bool, + Optional('optimization_level', default=1): And(int, lambda level: level in [0, 1]), Optional('graph_optimization'): graph_optimization_schema, Optional('mixed_precision'): mixed_precision_schema, @@ -1111,6 +1114,7 @@ def percent_to_float(data): 'activation': {}}, }): dict, Optional('use_bf16', default=False): bool, + Optional('optimization_level', default=1): int, Optional('tuning', default={ 'strategy': {'name': 'basic'}, 'accuracy_criterion': {'relative': 0.01, 'higher_is_better': True}, @@ -1346,8 +1350,17 @@ def map_pyconfig_to_cfg(self, pythonic_config): 'tuning.exit_policy.max_trials': pythonic_config.quantization.max_trials, 'tuning.exit_policy.performance_only': pythonic_config.quantization.performance_only, 'use_bf16': pythonic_config.quantization.use_bf16, + 'quantization.optimization_level': pythonic_config.quantization.optimization_level, 'reduce_range': pythonic_config.quantization.reduce_range }) + if pythonic_config.quantization.strategy_kwargs: + st_kwargs = pythonic_config.quantization.strategy_kwargs + for st_key in ['sigopt_api_token', 'sigopt_project_id', 'sigopt_experiment_name', \ + 'accuracy_weight', 'latency_weight']: + if st_key in st_kwargs: + st_val = st_kwargs[st_key] + mapping.update({'tuning.strategy.' + st_key: st_val}) + if pythonic_config.distillation is not None: mapping.update({ 'distillation.train.criterion': pythonic_config.distillation.criterion, @@ -1371,6 +1384,10 @@ def map_pyconfig_to_cfg(self, pythonic_config): 'tuning.tensorboard': pythonic_config.options.tensorboard, }) if pythonic_config.benchmark is not None: + if pythonic_config.benchmark.inputs != []: + mapping.update({'model.inputs': pythonic_config.benchmark.inputs}) + if pythonic_config.benchmark.outputs != []: + mapping.update({'model.outputs': pythonic_config.benchmark.outputs}) mapping.update({ 'evaluation.performance.warmup': pythonic_config.benchmark.warmup, 'evaluation.performance.iteration': pythonic_config.benchmark.iteration, diff --git a/neural_compressor/conf/pythonic_config.py b/neural_compressor/conf/pythonic_config.py index 89d0b773d40..c9975a9ebc6 100644 --- a/neural_compressor/conf/pythonic_config.py +++ b/neural_compressor/conf/pythonic_config.py @@ -16,377 +16,12 @@ # limitations under the License. 
import logging -import datetime -from typing import List -from schema import Schema, And, Use, Optional, Or from .dotdict import DotDict -from .config import Pruner +from ..config import _BaseQuantizationConfig, accuracy_criterion, BenchmarkConfig, \ + check_value, DistillationConfig, options, PruningConfig logger = logging.getLogger("neural_compressor") -default_workspace = './nc_workspace/{}/'.format( - datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) - -ops_schema = Schema({ - Optional('weight', default=None): { - Optional('granularity'): And( - list, - lambda s: all(i in ['per_channel', 'per_tensor'] for i in s)), - Optional('scheme'): And( - list, - lambda s: all(i in ['asym', 'sym', 'asym_float'] for i in s)), - Optional('dtype'): And( - list, - lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16'] for i in s)), - Optional('algorithm'): And( - list, - lambda s: all(i in ['minmax'] for i in s))}, - Optional('activation', default=None): { - Optional('granularity'): And( - list, - lambda s: all(i in ['per_channel', 'per_tensor'] for i in s)), - Optional('scheme'): And( - list, - lambda s: all(i in ['asym', 'sym'] for i in s)), - Optional('dtype'): And( - list, - lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16', 'None'] for i in s)), - Optional('algorithm'): And( - list, - lambda s: all(i in ['minmax', 'kl', 'placeholder'] for i in s))}}) - - -def check_value(name, src, supported_type, supported_value=[]): - if isinstance(src, list) and any([not isinstance(i, supported_type) for i in src]): - logger.warning("Type of {} items should be {} but not {}, " \ - "use its default value.".format(name, str(supported_type), [type(i) for i in src])) - return False - elif not isinstance(src, list) and not isinstance(src, supported_type): - logger.warning("Type of {} should be {} but not {}, " \ - "use its default value.".format(name, str(supported_type), type(src))) - return False - - if len(supported_value) > 0: - if isinstance(src, str) and src not in supported_value: - logger.warning("{} is not in supported {}: {}. Skip setting it and" \ - " use default value.".format(src, name, str(supported_value))) - return False - elif isinstance(src, list) and all([isinstance(i, str) for i in src]) and \ - any([i not in supported_value for i in src]): - logger.warning("{} is not in supported {}: {}. 
Skip setting it and" \ - " use default value.".format(src, name, str(supported_value))) - return False - - return True - -class BenchmarkConfig: - def __init__(self, warmup=5, iteration=-1, cores_per_instance=None, num_of_instance=None, - inter_num_of_threads=None, intra_num_of_threads=None): - self._warmup = warmup - self._iteration = iteration - self._cores_per_instance = cores_per_instance - self._num_of_instance = num_of_instance - self._inter_num_of_threads = inter_num_of_threads - self._intra_num_of_threads = intra_num_of_threads - - @property - def warmup(self): - return self._warmup - - @warmup.setter - def warmup(self, warmup): - if check_value('warmup', warmup, int): - self._warmup = warmup - - @property - def iteration(self): - return self._iteration - - @iteration.setter - def iteration(self, iteration): - if check_value('iteration', iteration, int): - self._iteration = iteration - - @property - def cores_per_instance(self): - return self._cores_per_instance - - @cores_per_instance.setter - def cores_per_instance(self, cores_per_instance): - if check_value('cores_per_instance', cores_per_instance, int): - self._cores_per_instance = cores_per_instance - - @property - def num_of_instance(self): - return self._num_of_instance - - @num_of_instance.setter - def num_of_instance(self, num_of_instance): - if check_value('num_of_instance', num_of_instance, int): - self._num_of_instance = num_of_instance - - @property - def inter_num_of_threads(self): - return self._inter_num_of_threads - - @inter_num_of_threads.setter - def inter_num_of_threads(self, inter_num_of_threads): - if check_value('inter_num_of_threads', inter_num_of_threads, int): - self._inter_num_of_threads = inter_num_of_threads - - @property - def intra_num_of_threads(self): - return self._intra_num_of_threads - - @intra_num_of_threads.setter - def intra_num_of_threads(self, intra_num_of_threads): - if check_value('intra_num_of_threads', intra_num_of_threads, int): - self._intra_num_of_threads = intra_num_of_threads - -class AccuracyLoss: - def __init__(self, loss=0.01): - self._loss = loss - - @property - def relative(self): - return self._loss - - @relative.setter - def relative(self, relative): - if check_value('relative tolerable loss', relative, float): - self._loss = relative - - @property - def absolute(self): - return self._loss - - @absolute.setter - def absolute(self, absolute): - if check_value('absolute tolerable loss', absolute, float): - self._loss = absolute - -tolerable_loss = AccuracyLoss() - -class AccuracyCriterion: - def __init__(self, higher_is_better=True, criterion='relative', tolerable_loss=tolerable_loss): - self._higher_is_better = higher_is_better - self._criterion = criterion - self._tolerable_loss = tolerable_loss - - @property - def higher_is_better(self): - return self._higher_is_better - - @higher_is_better.setter - def higher_is_better(self, higher_is_better): - if check_value('higher_is_better', higher_is_better, bool): - self._higher_is_better = higher_is_better - - @property - def relative(self): - if self._criterion != 'relative': - return None - return self._tolerable_loss.relative - - @relative.setter - def relative(self, relative): - self._criterion = 'relative' - self._tolerable_loss.relative = relative - - @property - def absolute(self): - if self._criterion != 'absolute': - return None - return self._tolerable_loss.absolute - - @absolute.setter - def absolute(self, absolute): - self._criterion = 'absolute' - self._tolerable_loss.absolute = absolute - - def __str__(self): - return 
self._criterion - -accuracy_criterion = AccuracyCriterion() - -class _BaseQuantizationConfig: - def __init__(self, - inputs=[], - outputs=[], - backend='NA', - device='cpu', - calibration_sampling_size=[100], - op_type_list=None, - op_name_list=None, - strategy='basic', - objective='performance', - timeout=0, - max_trials=100, - performance_only=False, - reduce_range=None, - use_bf16=False, - accuracy_criterion=accuracy_criterion): - self._inputs = inputs - self._outputs = outputs - self._backend = backend - self._device = device - self._op_type_list = op_type_list - self._op_name_list = op_name_list - self._strategy = strategy - self._objective = objective - self._timeout = timeout - self._max_trials = max_trials - self._performance_only = performance_only - self._reduce_range = reduce_range - self._use_bf16 = use_bf16 - self._accuracy_criterion = accuracy_criterion - self._calibration_sampling_size = calibration_sampling_size - - @property - def accuracy_criterion(self): - return self._accuracy_criterion - - @property - def use_bf16(self): - return self._use_bf16 - - @use_bf16.setter - def use_bf16(self, use_bf16): - if check_value('use_bf16', use_bf16, bool): - self._use_bf16 = use_bf16 - - @property - def reduce_range(self): - return self._reduce_range - - @reduce_range.setter - def reduce_range(self, reduce_range): - if check_value('reduce_range', reduce_range, bool): - self._reduce_range = reduce_range - - @property - def performance_only(self): - return self._performance_only - - @performance_only.setter - def performance_only(self, performance_only): - if check_value('performance_only', performance_only, bool): - self._performance_only = performance_only - - @property - def max_trials(self): - return self._max_trials - - @max_trials.setter - def max_trials(self, max_trials): - if check_value('max_trials', max_trials, int): - self._max_trials = max_trials - - @property - def timeout(self): - return self._timeout - - @timeout.setter - def timeout(self, timeout): - if check_value('timeout', timeout, int): - self._timeout = timeout - - @property - def objective(self): - return self._objective - - @objective.setter - def objective(self, objective): - if check_value('objective', objective, str, - ['performance', 'accuracy', 'modelsize', 'footprint']): - self._objective = objective - - @property - def strategy(self): - return self._strategy - - @strategy.setter - def strategy(self, strategy): - if check_value('strategy', strategy, str, - ['basic', 'mse', 'bayesian', 'random', 'exhaustive']): - self._strategy = strategy - - @property - def op_name_list(self): - return self._op_name_list - - @op_name_list.setter - def op_name_list(self, op_name_list): - if not isinstance(op_name_list, dict): - logger.warning("Type of op_name_list should be dict but not {}, " \ - "use its default value.".format(type(op_name_list))) - else: - for k, v in op_name_list.items(): - ops_schema.validate(v) - self._op_name_list = op_name_list - - @property - def op_type_list(self): - return self._op_type_list - - @op_type_list.setter - def op_type_list(self, op_type_list): - if not isinstance(op_type_list, dict): - logger.warning("Type of op_type_list should be dict but not {}, " \ - "use its default value.".format(type(op_type_list))) - else: - for k, v in op_type_list.items(): - ops_schema.validate(v) - self._op_type_list = op_type_list - - @property - def calibration_sampling_size(self): - return self._calibration_sampling_size - - @calibration_sampling_size.setter - def calibration_sampling_size(self, 
sampling_size): - if check_value('calibration_sampling_size', sampling_size, int): - self._calibration_sampling_size = sampling_size - - @property - def device(self): - return self._device - - @device.setter - def device(self, device): - if check_value('device', device, str, ['cpu', 'gpu']): - self._device = device - - @property - def backend(self): - return self._backend - - @backend.setter - def backend(self, backend): - if check_value('backend', backend, str, [ - 'tensorflow', 'tensorflow_itex', 'pytorch', 'pytorch_ipex', 'pytorch_fx', - 'onnxrt_qlinearops', 'onnxrt_integerops', 'onnxrt_qdq', 'onnxrt_qoperator', 'mxnet' - ]): - self._backend = backend - - @property - def outputs(self): - return self._outputs - - @outputs.setter - def outputs(self, outputs): - if check_value('outputs', outputs, str): - self._outputs = outputs - - @property - def inputs(self): - return self._inputs - - @inputs.setter - def inputs(self, inputs): - if check_value('inputs', inputs, str): - self._inputs = inputs - class QuantizationConfig(_BaseQuantizationConfig): def __init__(self, @@ -399,16 +34,33 @@ def __init__(self, op_type_list=None, op_name_list=None, strategy='basic', + strategy_kwargs=None, objective='performance', timeout=0, max_trials=100, performance_only=False, reduce_range=None, - use_bf16=False, + use_bf16=True, + optimization_level=1, accuracy_criterion=accuracy_criterion): - super().__init__(inputs, outputs, backend, device, calibration_sampling_size, op_type_list, - op_name_list, strategy, objective, timeout, max_trials, performance_only, - reduce_range, use_bf16, accuracy_criterion) + extra_precisions = ["bf16"] if use_bf16 else [] + super().__init__(inputs=inputs, + outputs=outputs, + backend=backend, + device=device, + calibration_sampling_size=calibration_sampling_size, + op_type_list=op_type_list, + op_name_list=op_name_list, + strategy=strategy, + strategy_kwargs=strategy_kwargs, + objective=objective, + timeout=timeout, + max_trials=max_trials, + performance_only=performance_only, + reduce_range=reduce_range, + extra_precisions=extra_precisions, + optimization_level=optimization_level, + accuracy_criterion=accuracy_criterion) self._approach = approach @property @@ -424,112 +76,6 @@ def approach(self, approach): self._approach = approach -class PostTrainingConfig(_BaseQuantizationConfig): - def __init__(self, - inputs=[], - outputs=[], - backend='NA', - device='cpu', - approach='post_training_auto_quant', - calibration_sampling_size=[100], - op_type_list=None, - op_name_list=None, - strategy='basic', - objective='performance', - timeout=0, - max_trials=100, - performance_only=False, - reduce_range=None, - use_bf16=False, - accuracy_criterion=accuracy_criterion): - super().__init__(inputs, outputs, backend, device, calibration_sampling_size, op_type_list, - op_name_list, strategy, objective, timeout, max_trials, performance_only, - reduce_range, use_bf16, accuracy_criterion) - self._approach = approach - - @property - def approach(self): - return self._approach - - @approach.setter - def approach(self, approach): - if check_value("approach", approach, str, [ - "post_training_static_quant", "post_training_dynamic_quant", - "post_training_auto_quant" - ]): - self._approach = approach - - -class QuantizationAwareTrainingConfig(_BaseQuantizationConfig): - def __init__(self, - inputs=[], - outputs=[], - backend='NA', - device='cpu', - op_type_list=None, - op_name_list=None, - reduce_range=None, - use_bf16=False): - super().__init__(inputs=inputs, outputs=outputs, backend=backend, 
device=device, - op_type_list=op_type_list, op_name_list=op_name_list, - reduce_range=reduce_range, use_bf16=use_bf16) - self._approach = 'quant_aware_training' - - @property - def approach(self): - return self._approach - - @approach.setter - def approach(self, approach): - if check_value('approach', approach, str, - ['quant_aware_training']): - self._approach = approach - - -class Options: - def __init__(self, random_seed=1978, workspace=default_workspace, - resume_from=None, tensorboard=False): - self._random_seed = random_seed - self._workspace = workspace - self._resume_from = resume_from - self._tensorboard = tensorboard - - @property - def random_seed(self): - return self._random_seed - - @random_seed.setter - def random_seed(self, random_seed): - if check_value('random_seed', random_seed, int): - self._random_seed = random_seed - - @property - def workspace(self): - return self._workspace - - @workspace.setter - def workspace(self, workspace): - if check_value('workspace', workspace, str): - self._workspace = workspace - - @property - def resume_from(self): - return self._resume_from - - @resume_from.setter - def resume_from(self, resume_from): - if check_value('resume_from', resume_from, str): - self._resume_from = resume_from - - @property - def tensorboard(self): - return self._tensorboard - - @tensorboard.setter - def tensorboard(self, tensorboard): - if check_value('tensorboard', tensorboard, bool): - self._tensorboard = tensorboard - class WeightConf: def __init__(self, datatype=None, scheme=None, granularity=None, algorithm=None): self._datatype = datatype @@ -641,134 +187,6 @@ class PyTorch(MXNet): def __init__(self, precisions=None): super().__init__(precisions) -pruners = [Pruner()] - -class PruningConfig: - def __init__(self, pruners=pruners, initial_sparsity=0.0, target_sparsity=0.97, - max_sparsity_ratio_per_layer=0.98, prune_type="basic_magnitude", - start_epoch=0, end_epoch=4, start_step=0, end_step=0, update_frequency=1.0, - update_frequency_on_step=1, not_to_prune_names=[], prune_domain="global", - names=[], exclude_names=[], prune_layer_type=[], sparsity_decay_type="exp", - pattern="tile_pattern_1x1"): - self._weight_compression = DotDict({ - 'initial_sparsity': initial_sparsity, - 'target_sparsity': target_sparsity, - 'max_sparsity_ratio_per_layer': max_sparsity_ratio_per_layer, - 'prune_type': prune_type, - 'start_epoch': start_epoch, - 'end_epoch': end_epoch, - 'start_step': start_step, - 'end_step': end_step, - 'update_frequency': update_frequency, - 'update_frequency_on_step': update_frequency_on_step, - 'not_to_prune_names': not_to_prune_names, - 'prune_domain': prune_domain, - 'names': names, - 'exclude_names': exclude_names, - 'prune_layer_type': prune_layer_type, - 'sparsity_decay_type': sparsity_decay_type, - 'pattern': pattern, - 'pruners': pruners - }) - - @property - def weight_compression(self): - return self._weight_compression - - @weight_compression.setter - def weight_compression(self, weight_compression): - self._weight_compression = weight_compression - - -class KnowledgeDistillationLossConfig: - def __init__(self, temperature=1.0, loss_types=['CE', 'CE'], loss_weights=[0.5, 0.5]): - self.config = DotDict({ - 'KnowledgeDistillationLoss': { - 'temperature': temperature, - 'loss_types': loss_types, - 'loss_weights': loss_weights - } - }) - - -class IntermediateLayersKnowledgeDistillationLossConfig: - def __init__(self, layer_mappings=[], loss_types=[], loss_weights=[], add_origin_loss=False): - self.config = DotDict({ - 
'IntermediateLayersKnowledgeDistillationLoss': { - 'layer_mappings': layer_mappings, - 'loss_types': loss_types, - 'loss_weights': loss_weights, - 'add_origin_loss': add_origin_loss - } - }) - - -class SelfKnowledgeDistillationLossConfig: - def __init__(self, - layer_mappings=[], - temperature=1.0, - loss_types=[], - loss_weights=[], - add_origin_loss=False): - self.config = DotDict({ - 'SelfKnowledgeDistillationLoss': { - 'layer_mappings': layer_mappings, - 'temperature': temperature, - 'loss_types': loss_types, - 'loss_weights': loss_weights, - 'add_origin_loss': add_origin_loss, - } - }) - - -criterion = KnowledgeDistillationLossConfig() - -class DistillationConfig: - """Config of distillation. - - Args: - - teacher_model (Callable): Teacher model for distillation. Defaults to None. - features (optional): Teacher features for distillation, features and teacher_model are alternative. - Defaults to None. - criterion (Callable, optional): Distillation loss configure. - optimizer (dictionary, optional): Optimizer configure. - """ - - def __init__(self, - teacher_model, - criterion=criterion, - optimizer={'SGD': { - 'learning_rate': 0.0001 - }}): - self._criterion = criterion.config - self._optimizer = optimizer - self._teacher_model = teacher_model - - @property - def criterion(self): - return self._criterion - - @criterion.setter - def criterion(self, criterion): - self._criterion = criterion - - @property - def optimizer(self): - return self._optimizer - - @optimizer.setter - def optimizer(self, optimizer): - self._optimizer = optimizer - - @property - def teacher_model(self): - return self._teacher_model - - @teacher_model.setter - def teacher_model(self, teacher_model): - self._teacher_model = teacher_model - class DyNASConfig: def __init__(self, supernet=None, metrics=None, population=50, num_evals=100000, @@ -816,41 +234,8 @@ def search(self, search): self._search = search -class MixedPrecisionConfig(PostTrainingConfig): - def __init__(self, - inputs=[], - outputs=[], - backend='NA', - device='cpu', - op_type_list=None, - op_name_list=None, - strategy='basic', - objective='performance', - timeout=0, - max_trials=100, - performance_only=False, - reduce_range=None, - accuracy_criterion=accuracy_criterion, - precisions=["bf16"]): - super().__init__(inputs, outputs, backend, device, op_type_list=op_type_list, - op_name_list=op_name_list, strategy=strategy, objective=objective, - timeout=timeout, max_trials=max_trials, performance_only=performance_only, - reduce_range=reduce_range, accuracy_criterion=accuracy_criterion, - use_bf16=True) - self._precisions = precisions if isinstance(precisions, List) else [precisions] - - @property - def precisions(self): - return self._precisions - - @precisions.setter - def precisions(self, precisions): - self._precisions = precisions - - quantization = QuantizationConfig() benchmark = BenchmarkConfig() -options = Options() pruning = PruningConfig() distillation = DistillationConfig(teacher_model=None) nas = NASConfig() diff --git a/neural_compressor/config.py b/neural_compressor/config.py new file mode 100644 index 00000000000..b3a9fd4352e --- /dev/null +++ b/neural_compressor/config.py @@ -0,0 +1,912 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import logging +from typing import List +from schema import Schema, And, Optional +from .conf.dotdict import DotDict +from .conf.config import Pruner + +logger = logging.getLogger("neural_compressor") +default_workspace = './nc_workspace/{}/'.format( + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + +QUANTMAPPING = { + "auto": "post_training_auto_quant", + "dynamic": "post_training_dynamic_quant", + "static": "post_training_static_quant", + "qat": "quant_aware_training", +} + + +ops_schema = Schema({ + Optional('weight', default=None): { + Optional('granularity'): And( + list, + lambda s: all(i in ['per_channel', 'per_tensor'] for i in s)), + Optional('scheme'): And( + list, + lambda s: all(i in ['asym', 'sym', 'asym_float'] for i in s)), + Optional('dtype'): And( + list, + lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16'] for i in s)), + Optional('algorithm'): And( + list, + lambda s: all(i in ['minmax'] for i in s))}, + Optional('activation', default=None): { + Optional('granularity'): And( + list, + lambda s: all(i in ['per_channel', 'per_tensor'] for i in s)), + Optional('scheme'): And( + list, + lambda s: all(i in ['asym', 'sym'] for i in s)), + Optional('dtype'): And( + list, + lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16', 'None'] for i in s)), + Optional('algorithm'): And( + list, + lambda s: all(i in ['minmax', 'kl', 'placeholder'] for i in s))}}) + + +def check_value(name, src, supported_type, supported_value=[]): + if isinstance(src, list) and any([not isinstance(i, supported_type) for i in src]): + logger.warning("Type of {} items should be {} but not {}, " \ + "use its default value.".format(name, str(supported_type), [type(i) for i in src])) + return False + elif not isinstance(src, list) and not isinstance(src, supported_type): + logger.warning("Type of {} should be {} but not {}, " \ + "use its default value.".format(name, str(supported_type), type(src))) + return False + + if len(supported_value) > 0: + if isinstance(src, str) and src not in supported_value: + logger.warning("{} is not in supported {}: {}. Skip setting it and" \ + " use default value.".format(src, name, str(supported_value))) + return False + elif isinstance(src, list) and all([isinstance(i, str) for i in src]) and \ + any([i not in supported_value for i in src]): + logger.warning("{} is not in supported {}: {}. 
Skip setting it and" \ + " use default value.".format(src, name, str(supported_value))) + return False + + return True + + +class Options: + def __init__(self, random_seed=1978, workspace=default_workspace, + resume_from=None, tensorboard=False): + self._random_seed = random_seed + self._workspace = workspace + self._resume_from = resume_from + self._tensorboard = tensorboard + + @property + def random_seed(self): + return self._random_seed + + @random_seed.setter + def random_seed(self, random_seed): + if check_value('random_seed', random_seed, int): + self._random_seed = random_seed + + @property + def workspace(self): + return self._workspace + + @workspace.setter + def workspace(self, workspace): + if check_value('workspace', workspace, str): + self._workspace = workspace + + @property + def resume_from(self): + return self._resume_from + + @resume_from.setter + def resume_from(self, resume_from): + if check_value('resume_from', resume_from, str): + self._resume_from = resume_from + + @property + def tensorboard(self): + return self._tensorboard + + @tensorboard.setter + def tensorboard(self, tensorboard): + if check_value('tensorboard', tensorboard, bool): + self._tensorboard = tensorboard + + +options = Options() + + +class BenchmarkConfig: + def __init__(self, + inputs=[], + outputs=[], + warmup=5, + iteration=-1, + cores_per_instance=None, + num_of_instance=None, + inter_num_of_threads=None, + intra_num_of_threads=None): + self._inputs = inputs + self._outputs = outputs + self._warmup = warmup + self._iteration = iteration + self._cores_per_instance = cores_per_instance + self._num_of_instance = num_of_instance + self._inter_num_of_threads = inter_num_of_threads + self._intra_num_of_threads = intra_num_of_threads + + @property + def outputs(self): + return self._outputs + + @outputs.setter + def outputs(self, outputs): + if check_value('outputs', outputs, str): + self._outputs = outputs + + @property + def inputs(self): + return self._inputs + + @inputs.setter + def inputs(self, inputs): + if check_value('inputs', inputs, str): + self._inputs = inputs + + @property + def warmup(self): + return self._warmup + + @warmup.setter + def warmup(self, warmup): + if check_value('warmup', warmup, int): + self._warmup = warmup + + @property + def iteration(self): + return self._iteration + + @iteration.setter + def iteration(self, iteration): + if check_value('iteration', iteration, int): + self._iteration = iteration + + @property + def cores_per_instance(self): + return self._cores_per_instance + + @cores_per_instance.setter + def cores_per_instance(self, cores_per_instance): + if check_value('cores_per_instance', cores_per_instance, int): + self._cores_per_instance = cores_per_instance + + @property + def num_of_instance(self): + return self._num_of_instance + + @num_of_instance.setter + def num_of_instance(self, num_of_instance): + if check_value('num_of_instance', num_of_instance, int): + self._num_of_instance = num_of_instance + + @property + def inter_num_of_threads(self): + return self._inter_num_of_threads + + @inter_num_of_threads.setter + def inter_num_of_threads(self, inter_num_of_threads): + if check_value('inter_num_of_threads', inter_num_of_threads, int): + self._inter_num_of_threads = inter_num_of_threads + + @property + def intra_num_of_threads(self): + return self._intra_num_of_threads + + @intra_num_of_threads.setter + def intra_num_of_threads(self, intra_num_of_threads): + if check_value('intra_num_of_threads', intra_num_of_threads, int): + self._intra_num_of_threads = 
intra_num_of_threads + + +class AccuracyLoss: + def __init__(self, loss=0.01): + self._loss = loss + + @property + def relative(self): + return self._loss + + @relative.setter + def relative(self, relative): + if check_value('relative tolerable loss', relative, float): + self._loss = relative + + @property + def absolute(self): + return self._loss + + @absolute.setter + def absolute(self, absolute): + if check_value('absolute tolerable loss', absolute, float): + self._loss = absolute + + +tolerable_loss = AccuracyLoss() + + +class AccuracyCriterion: + def __init__(self, higher_is_better=True, criterion='relative', tolerable_loss=tolerable_loss): + self._higher_is_better = higher_is_better + self._criterion = criterion + self._tolerable_loss = tolerable_loss + + @property + def higher_is_better(self): + return self._higher_is_better + + @higher_is_better.setter + def higher_is_better(self, higher_is_better): + if check_value('higher_is_better', higher_is_better, bool): + self._higher_is_better = higher_is_better + + @property + def relative(self): + if self._criterion != 'relative': + return None + return self._tolerable_loss.relative + + @relative.setter + def relative(self, relative): + self._criterion = 'relative' + self._tolerable_loss.relative = relative + + @property + def absolute(self): + if self._criterion != 'absolute': + return None + return self._tolerable_loss.absolute + + @absolute.setter + def absolute(self, absolute): + self._criterion = 'absolute' + self._tolerable_loss.absolute = absolute + + def __str__(self): + return self._criterion + + +accuracy_criterion = AccuracyCriterion() + + +class _BaseQuantizationConfig: + def __init__(self, + inputs=[], + outputs=[], + backend="NA", + device="cpu", + calibration_sampling_size=[100], + op_type_list=None, + op_name_list=None, + strategy="basic", + strategy_kwargs=None, + objective="performance", + timeout=0, + max_trials=100, + performance_only=False, + reduce_range=None, + extra_precisions=["bf16"], + optimization_level=1, + accuracy_criterion=accuracy_criterion): + self._inputs = inputs + self._outputs = outputs + self._backend = backend + self._device = device + self._op_type_list = op_type_list + self._op_name_list = op_name_list + self._strategy = strategy + self._strategy_kwargs = strategy_kwargs + self._objective = objective + self._timeout = timeout + self._max_trials = max_trials + self._performance_only = performance_only + self._reduce_range = reduce_range + self._extra_precisions = extra_precisions \ + if isinstance(extra_precisions, List) else [extra_precisions] + self._optimization_level = optimization_level + self.use_bf16 = "bf16" in self._extra_precisions + self._accuracy_criterion = accuracy_criterion + self._calibration_sampling_size = calibration_sampling_size + + @property + def accuracy_criterion(self): + return self._accuracy_criterion + + @property + def extra_precisions(self): + return self._extra_precisions + + @extra_precisions.setter + def extra_precisions(self, extra_precisions): + if check_value('extra_precisions', extra_precisions, List): + self._extra_precisions = extra_precisions + self._use_bf16 = "bf16" in extra_precisions + + @property + def optimization_level(self): + return self._optimization_level + + @optimization_level.setter + def optimization_level(self, optimization_level): + self._optimization_level = optimization_level + + @property + def reduce_range(self): + return self._reduce_range + + @reduce_range.setter + def reduce_range(self, reduce_range): + if check_value('reduce_range', 
reduce_range, bool): + self._reduce_range = reduce_range + + @property + def performance_only(self): + return self._performance_only + + @performance_only.setter + def performance_only(self, performance_only): + if check_value('performance_only', performance_only, bool): + self._performance_only = performance_only + + @property + def max_trials(self): + return self._max_trials + + @max_trials.setter + def max_trials(self, max_trials): + if check_value('max_trials', max_trials, int): + self._max_trials = max_trials + + @property + def timeout(self): + return self._timeout + + @timeout.setter + def timeout(self, timeout): + if check_value('timeout', timeout, int): + self._timeout = timeout + + @property + def objective(self): + return self._objective + + @objective.setter + def objective(self, objective): + if check_value('objective', objective, str, + ['performance', 'accuracy', 'modelsize', 'footprint']): + self._objective = objective + + @property + def strategy(self): + return self._strategy + + @strategy.setter + def strategy(self, strategy): + if check_value('strategy', strategy, str, + ['basic', 'mse', 'bayesian', 'random', 'exhaustive', 'sigopt', 'tpe']): + self._strategy = strategy + + @property + def strategy_kwargs(self): + return self._strategy_kwargs + + @strategy_kwargs.setter + def strategy_kwargs(self, strategy_kwargs): + self._strategy_kwargs = strategy_kwargs + + @property + def op_name_list(self): + return self._op_name_list + + @op_name_list.setter + def op_name_list(self, op_name_list): + if not isinstance(op_name_list, dict): + logger.warning("Type of op_name_list should be dict but not {}, " \ + "use its default value.".format(type(op_name_list))) + else: + for k, v in op_name_list.items(): + ops_schema.validate(v) + self._op_name_list = op_name_list + + @property + def op_type_list(self): + return self._op_type_list + + @op_type_list.setter + def op_type_list(self, op_type_list): + if not isinstance(op_type_list, dict): + logger.warning("Type of op_type_list should be dict but not {}, " \ + "use its default value.".format(type(op_type_list))) + else: + for k, v in op_type_list.items(): + ops_schema.validate(v) + self._op_type_list = op_type_list + + @property + def calibration_sampling_size(self): + return self._calibration_sampling_size + + @calibration_sampling_size.setter + def calibration_sampling_size(self, sampling_size): + if check_value('calibration_sampling_size', sampling_size, int): + self._calibration_sampling_size = sampling_size + + @property + def device(self): + return self._device + + @device.setter + def device(self, device): + if check_value('device', device, str, ['cpu', 'gpu']): + self._device = device + + @property + def backend(self): + return self._backend + + @backend.setter + def backend(self, backend): + if check_value('backend', backend, str, [ + 'tensorflow', 'tensorflow_itex', 'pytorch', 'pytorch_ipex', 'pytorch_fx', + 'onnxrt_qlinearops', 'onnxrt_integerops', 'onnxrt_qdq', 'onnxrt_qoperator', 'mxnet' + ]): + self._backend = backend + + @property + def outputs(self): + return self._outputs + + @outputs.setter + def outputs(self, outputs): + if check_value('outputs', outputs, str): + self._outputs = outputs + + @property + def inputs(self): + return self._inputs + + @inputs.setter + def inputs(self, inputs): + if check_value('inputs', inputs, str): + self._inputs = inputs + + +class TuningCriterion: + def __init__(self, strategy="basic", strategy_kwargs=None, timeout=0, max_trials=100, objective="performance"): + self._strategy = strategy 
+ self._timeout = timeout + self._max_trials = max_trials + self._objective = objective + self._strategy_kwargs = strategy_kwargs + + @property + def max_trials(self): + return self._max_trials + + @max_trials.setter + def max_trials(self, max_trials): + if check_value('max_trials', max_trials, int): + self._max_trials = max_trials + + @property + def timeout(self): + return self._timeout + + @timeout.setter + def timeout(self, timeout): + if check_value('timeout', timeout, int): + self._timeout = timeout + + @property + def objective(self): + return self._objective + + @objective.setter + def objective(self, objective): + if check_value('objective', objective, str, + ['performance', 'accuracy', 'modelsize', 'footprint']): + self._objective = objective + + @property + def strategy(self): + return self._strategy + + @strategy.setter + def strategy(self, strategy): + if check_value('strategy', strategy, str, + ['basic', 'mse', 'bayesian', 'random', 'exhaustive', 'sigopt', 'tpe']): + self._strategy = strategy + + @property + def strategy_kwargs(self): + return self._strategy_kwargs + + @strategy_kwargs.setter + def strategy_kwargs(self, strategy_kwargs): + self._strategy_kwargs = strategy_kwargs + +tuning_criterion = TuningCriterion() + + +class PostTrainingQuantConfig(_BaseQuantizationConfig): + def __init__(self, + device="cpu", + backend="NA", + inputs=[], + outputs=[], + approach="auto", + calibration_sampling_size=[100], + op_type_list=None, + op_name_list=None, + reduce_range=None, + extra_precisions = ["bf16"], + optimization_level=1, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion, + ): + super().__init__(inputs=inputs, + outputs=outputs, + device=device, + backend=backend, + calibration_sampling_size=calibration_sampling_size, + op_type_list=op_type_list, + op_name_list=op_name_list, + strategy=tuning_criterion.strategy, + strategy_kwargs=tuning_criterion.strategy_kwargs, + objective=tuning_criterion.objective, + timeout=tuning_criterion.timeout, + max_trials=tuning_criterion.max_trials, + reduce_range=reduce_range, + extra_precisions=extra_precisions, + optimization_level=optimization_level, + accuracy_criterion=accuracy_criterion) + self.approach = approach + + @property + def approach(self): + return self._approach + + @approach.setter + def approach(self, approach): + if check_value("approach", approach, str, ["static", "dynamic", "auto"]): + self._approach = QUANTMAPPING[approach] + + +class QuantizationAwareTrainingConfig(_BaseQuantizationConfig): + def __init__(self, + device="cpu", + backend="NA", + inputs=[], + outputs=[], + op_type_list=None, + op_name_list=None, + reduce_range=None, + extra_precisions=["bf16"], + optimization_level=1): + super().__init__(inputs=inputs, outputs=outputs, device=device, backend=backend, + op_type_list=op_type_list, op_name_list=op_name_list, + reduce_range=reduce_range, extra_precisions=extra_precisions, + optimization_level=optimization_level) + self._approach = 'quant_aware_training' + + @property + def approach(self): + return self._approach + + +pruners = [Pruner()] + + +class PruningConfig: + def __init__(self, pruners=pruners, initial_sparsity=0.0, target_sparsity=0.97, + max_sparsity_ratio_per_layer=0.98, prune_type="basic_magnitude", + start_epoch=0, end_epoch=4, start_step=0, end_step=0, update_frequency=1.0, + update_frequency_on_step=1, not_to_prune_names=[], prune_domain="global", + names=[], exclude_names=[], prune_layer_type=[], sparsity_decay_type="exp", + pattern="tile_pattern_1x1"): + 
self._weight_compression = DotDict({ + 'initial_sparsity': initial_sparsity, + 'target_sparsity': target_sparsity, + 'max_sparsity_ratio_per_layer': max_sparsity_ratio_per_layer, + 'prune_type': prune_type, + 'start_epoch': start_epoch, + 'end_epoch': end_epoch, + 'start_step': start_step, + 'end_step': end_step, + 'update_frequency': update_frequency, + 'update_frequency_on_step': update_frequency_on_step, + 'not_to_prune_names': not_to_prune_names, + 'prune_domain': prune_domain, + 'names': names, + 'exclude_names': exclude_names, + 'prune_layer_type': prune_layer_type, + 'sparsity_decay_type': sparsity_decay_type, + 'pattern': pattern, + 'pruners': pruners + }) + + @property + def weight_compression(self): + return self._weight_compression + + @weight_compression.setter + def weight_compression(self, weight_compression): + self._weight_compression = weight_compression + + +class KnowledgeDistillationLossConfig: + def __init__(self, temperature=1.0, loss_types=['CE', 'CE'], loss_weights=[0.5, 0.5]): + self.config = DotDict({ + 'KnowledgeDistillationLoss': { + 'temperature': temperature, + 'loss_types': loss_types, + 'loss_weights': loss_weights + } + }) + + +class IntermediateLayersKnowledgeDistillationLossConfig: + def __init__(self, layer_mappings=[], loss_types=[], loss_weights=[], add_origin_loss=False): + self.config = DotDict({ + 'IntermediateLayersKnowledgeDistillationLoss': { + 'layer_mappings': layer_mappings, + 'loss_types': loss_types, + 'loss_weights': loss_weights, + 'add_origin_loss': add_origin_loss + } + }) + + +class SelfKnowledgeDistillationLossConfig: + def __init__(self, + layer_mappings=[], + temperature=1.0, + loss_types=[], + loss_weights=[], + add_origin_loss=False): + self.config = DotDict({ + 'SelfKnowledgeDistillationLoss': { + 'layer_mappings': layer_mappings, + 'temperature': temperature, + 'loss_types': loss_types, + 'loss_weights': loss_weights, + 'add_origin_loss': add_origin_loss, + } + }) + + +criterion = KnowledgeDistillationLossConfig() + + +class DistillationConfig: + """Config of distillation. + + Args: + + teacher_model (Callable): Teacher model for distillation. Defaults to None. + features (optional): Teacher features for distillation, features and teacher_model are alternative. + Defaults to None. + criterion (Callable, optional): Distillation loss configure. + optimizer (dictionary, optional): Optimizer configure. 
+ """ + + def __init__(self, + teacher_model, + criterion=criterion, + optimizer={'SGD': { + 'learning_rate': 0.0001 + }}): + self._criterion = criterion.config + self._optimizer = optimizer + self._teacher_model = teacher_model + + @property + def criterion(self): + return self._criterion + + @criterion.setter + def criterion(self, criterion): + self._criterion = criterion + + @property + def optimizer(self): + return self._optimizer + + @optimizer.setter + def optimizer(self, optimizer): + self._optimizer = optimizer + + @property + def teacher_model(self): + return self._teacher_model + + @teacher_model.setter + def teacher_model(self, teacher_model): + self._teacher_model = teacher_model + + +class MixedPrecisionConfig(PostTrainingQuantConfig): + def __init__(self, + device="cpu", + backend="NA", + inputs=[], + outputs=[], + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion, + extra_precisions=["bf16"]): + super().__init__(inputs=inputs, + outputs=outputs, + device=device, + backend=backend, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion, + extra_precisions=extra_precisions, + ) + + +class ExportConfig: + def __init__( + self, + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=None, + input_names=None, + output_names=None, + dynamic_axes=None, + ): + self._dtype = dtype + self._opset_version = opset_version + self._quant_format = quant_format + self._example_inputs = example_inputs + self._input_names = input_names + self._output_names = output_names + self._dynamic_axes = dynamic_axes + + @property + def dtype(self): + return self._dtype + + @dtype.setter + def dtype(self, dtype): + self._dtype = dtype + + @property + def opset_version(self): + return self._opset_version + + @opset_version.setter + def opset_version(self, opset_version): + self._opset_version = opset_version + + @property + def quant_format(self): + return self._quant_format + + @quant_format.setter + def quant_format(self, quant_format): + self._quant_format = quant_format + + @property + def example_inputs(self): + return self._example_inputs + + @example_inputs.setter + def example_inputs(self, example_inputs): + self._example_inputs = example_inputs + + @property + def input_names(self): + return self._input_names + + @input_names.setter + def input_names(self, input_names): + self._input_names = input_names + + @property + def output_names(self): + return self._output_names + + @output_names.setter + def output_names(self, output_names): + self._output_names = output_names + + @property + def dynamic_axes(self): + return self._dynamic_axes + + @dynamic_axes.setter + def dynamic_axes(self, dynamic_axes): + self._dynamic_axes = dynamic_axes + + +class Torch2ONNXConfig(ExportConfig): + def __init__( + self, + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=None, + input_names=None, + output_names=None, + dynamic_axes=None, + **kwargs, + ): + super().__init__( + dtype=dtype, + opset_version=opset_version, + quant_format=quant_format, + example_inputs=example_inputs, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + ) + self.kwargs = kwargs + + +class TF2ONNXConfig(ExportConfig): + def __init__( + self, + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=None, + input_names=None, + output_names=None, + dynamic_axes=None, + **kwargs, + ): + super().__init__( + dtype=dtype, + opset_version=opset_version, + quant_format=quant_format, + 
example_inputs=example_inputs, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + ) + self.kwargs = kwargs + + +def set_random_seed(seed: int): + options.random_seed = seed + + +def set_workspace(workspace: str): + options.workspace = workspace + + +def set_resume_from(resume_from: str): + options.resume_from = resume_from + + +def set_tensorboard(tensorboard: bool): + options.tensorboard = tensorboard diff --git a/neural_compressor/contrib/strategy/sigopt.py b/neural_compressor/contrib/strategy/sigopt.py index 54593fb2e32..19b3ae1ed3e 100644 --- a/neural_compressor/contrib/strategy/sigopt.py +++ b/neural_compressor/contrib/strategy/sigopt.py @@ -17,12 +17,14 @@ import copy from neural_compressor.utils import logger +from neural_compressor.utils.utility import LazyImport from neural_compressor.strategy.strategy import strategy_registry, TuneStrategy -from sigopt import Connection from collections import OrderedDict from neural_compressor.strategy.st_utils.tuning_sampler import OpWiseTuningSampler from neural_compressor.strategy.st_utils.tuning_structs import OpTuningConfig +sigopt = LazyImport('sigopt') + @strategy_registry class SigOptTuneStrategy(TuneStrategy): """The tuning strategy using SigOpt HPO search in tuning space. @@ -80,7 +82,15 @@ def __init__(self, model, conf, q_dataloader, q_func=None, eval_func, dicts, q_hooks) - + # Initialize the SigOpt tuning strategy if the user specified to use it. + strategy_name = conf.usr_cfg.tuning.strategy.name + if strategy_name.lower() == "sigopt": + try: + import sigopt + except ImportError: + raise ImportError(f"Please install sigopt for using {strategy_name} strategy.") + else: + pass # SigOpt init client_token = conf.usr_cfg.tuning.strategy.sigopt_api_token self.project_id = conf.usr_cfg.tuning.strategy.sigopt_project_id @@ -107,7 +117,7 @@ def __init__(self, model, conf, q_dataloader, q_func=None, else: logger.info("Experiment name is {}.".format(self.experiment_name)) - self.conn = Connection(client_token) + self.conn = sigopt.Connection(client_token) self.experiment = None def params_to_tune_configs(self, params): diff --git a/neural_compressor/contrib/strategy/tpe.py b/neural_compressor/contrib/strategy/tpe.py index 9baf2911904..39362f1749b 100644 --- a/neural_compressor/contrib/strategy/tpe.py +++ b/neural_compressor/contrib/strategy/tpe.py @@ -20,14 +20,14 @@ from pathlib import Path from functools import partial import numpy as np -import hyperopt as hpo -from hyperopt import fmin, hp, STATUS_OK, Trials from neural_compressor.utils import logger +from neural_compressor.utils.utility import LazyImport from neural_compressor.strategy.strategy import strategy_registry, TuneStrategy from collections import OrderedDict from neural_compressor.strategy.st_utils.tuning_sampler import OpWiseTuningSampler from neural_compressor.strategy.st_utils.tuning_structs import OpTuningConfig +hyperopt = LazyImport('hyperopt') try: import pandas as pd @@ -85,10 +85,19 @@ def __init__(self, model, conf, q_dataloader, q_func=None, eval_dataloader=None, eval_func=None, dicts=None, q_hooks=None): assert conf.usr_cfg.quantization.approach == 'post_training_static_quant', \ "TPE strategy is only for post training static quantization!" + # Initialize the tpe tuning strategy if the user specified to use it.
+ strategy_name = conf.usr_cfg.tuning.strategy.name + if strategy_name.lower() == "tpe": + try: + import hyperopt + except ImportError: + raise ImportError(f"Please install hyperopt for using {strategy_name} strategy.") + else: + pass self.hpopt_search_space = None self.warm_start = False self.cfg_evaluated = False - self.hpopt_trials = Trials() + self.hpopt_trials = hyperopt.Trials() self.max_trials = conf.usr_cfg.tuning.exit_policy.get('max_trials', 200) self.loss_function_config = { 'acc_th': conf.usr_cfg.tuning.accuracy_criterion.relative if \ @@ -140,7 +149,7 @@ def __getstate__(self): def _configure_hpopt_search_space_and_params(self, search_space): self.hpopt_search_space = {} for param, configs in search_space.items(): - self.hpopt_search_space[(param)] = hp.choice((param[0]), configs) + self.hpopt_search_space[(param)] = hyperopt.hp.choice((param[0]), configs) # Find minimum number of choices for params with more than one choice multichoice_params = [len(configs) for param, configs in search_space.items() if len(configs) > 1] @@ -149,7 +158,7 @@ def _configure_hpopt_search_space_and_params(self, search_space): min_param_size = min(multichoice_params) if len(multichoice_params) > 0 else 1 self.tpe_params['n_EI_candidates'] = min_param_size self.tpe_params['prior_weight'] = 1 / min_param_size - self._algo = partial(hpo.tpe.suggest, + self._algo = partial(hyperopt.tpe.suggest, n_startup_jobs=self.tpe_params['n_initial_point'], gamma=self.tpe_params['gamma'], n_EI_candidates=self.tpe_params['n_EI_candidates'], @@ -225,12 +234,12 @@ def initial_op_quant_mode(items_lst, target_quant_mode, op_item_dtype_dict): self._configure_hpopt_search_space_and_params(first_run_cfg) # Run first iteration with best result from history trials_count = len(self.hpopt_trials.trials) + 1 - fmin(partial(self.object_evaluation, model=self.model), - space=self.hpopt_search_space, - algo=self._algo, - max_evals=trials_count, - trials=self.hpopt_trials, - show_progressbar=False) + hyperopt.fmin(partial(self.object_evaluation, model=self.model), + space=self.hpopt_search_space, + algo=self._algo, + max_evals=trials_count, + trials=self.hpopt_trials, + show_progressbar=False) if pd is not None: self._save_trials(trials_file) self._update_best_result(best_result_file) @@ -266,12 +275,12 @@ def initial_op_quant_mode(items_lst, target_quant_mode, op_item_dtype_dict): self.cfg_evaluated = False logger.debug("Trial iteration start: {} / {}.".format( trials_count, self.max_trials)) - fmin(partial(self.object_evaluation, model=self.model), - space=self.hpopt_search_space, - algo=self._algo, - max_evals=trials_count, - trials=self.hpopt_trials, - show_progressbar=False) + hyperopt.fmin(partial(self.object_evaluation, model=self.model), + space=self.hpopt_search_space, + algo=self._algo, + max_evals=trials_count, + trials=self.hpopt_trials, + show_progressbar=False) trials_count += 1 if pd is not None: self._save_trials(trials_file) @@ -349,7 +358,7 @@ def _compute_metrics(self, tune_cfg, acc, lat): 'acc_loss': acc_diff, 'lat_diff': lat_diff, 'quantization_ratio': quantization_ratio, - 'status': STATUS_OK} + 'status': hyperopt.STATUS_OK} def _calculate_acc_lat_diff(self, acc, lat): int8_acc = acc diff --git a/neural_compressor/experimental/benchmark.py b/neural_compressor/experimental/benchmark.py index 28c790ce7ca..00329dabd43 100644 --- a/neural_compressor/experimental/benchmark.py +++ b/neural_compressor/experimental/benchmark.py @@ -179,16 +179,10 @@ def __call__(self, mode='performance'): """ cfg = self.conf.usr_cfg 
assert cfg.evaluation is not None, 'benchmark evaluation filed should not be None...' - if self._b_func is None: - assert cfg.evaluation is not None, \ - 'You must pass b_func or benchmark evaluation filed should be set in config yaml file...' - # use first eval config in yaml if mode from __call__not same with yaml config - if not mode in cfg.evaluation: - mode = list(cfg.evaluation.keys())[0] assert sys.platform in ['linux', 'win32'], 'only support platform windows and linux...' set_all_env_var(deep_get(cfg, 'evaluation.{}.configs'.format(mode))) - # disable multi-instance for accuracy mode - if mode == "accuracy": + # disable multi-instance for accuracy mode or running bechmark on GPU device + if mode == "accuracy" or cfg.device == 'gpu': set_env_var('NC_ENV_CONF', True, overwrite_existing=True) logger.info("Start to run Benchmark.") @@ -344,7 +338,6 @@ def run_instance(self, mode): b_dataloader_cfg = deep_get(cfg, 'evaluation.{}.dataloader'.format(mode)) self._b_dataloader = create_dataloader(self.framework, b_dataloader_cfg) - is_measure = True if self._b_func is None: self._b_func = create_eval_func(self.framework, \ self._b_dataloader, \ @@ -354,14 +347,13 @@ def run_instance(self, mode): iteration=iteration) else: self._custom_b_func = True - is_measure = False objectives = [i.lower() for i in cfg.tuning.multi_objectives.objective] if \ deep_get(cfg, 'tuning.multi_objectives') else [cfg.tuning.objective] assert len(objectives) == 1, 'benchmark supports one objective at a time' self.objectives = MultiObjective(objectives, cfg.tuning.accuracy_criterion, - is_measure=is_measure) + is_measure=True) if self._custom_b_func: val = self.objectives.evaluate(self._b_func, self._model.model) @@ -370,7 +362,8 @@ def run_instance(self, mode): # measurer contain info not only performance(eg, memory, model_size) # also measurer have result list among steps acc, _ = val - warmup = 0 if deep_get(cfg, 'evaluation.{}.warmup'.format(mode)) is None \ + batch_size = self._b_dataloader.batch_size + warmup = 0 if deep_get(cfg, 'evaluation.{}.warmup'.format(mode)) is None \ else deep_get(cfg, 'evaluation.{}.warmup'.format(mode)) if len(self.objectives.objectives[0].result_list()) < warmup: @@ -380,20 +373,19 @@ def run_instance(self, mode): warmup = 0 result_list = self.objectives.objectives[0].result_list()[warmup:] + latency = np.array(result_list).mean() / batch_size + self._results[mode] = acc, batch_size, result_list logger.info("\n{} mode benchmark result:".format(mode)) for i, res in enumerate(result_list): logger.debug("Iteration {} result {}:".format(i, res)) if mode == 'accuracy': - self._results[mode] = acc, result_list + logger.info("Batch size = {}".format(batch_size)) if isinstance(acc, list): logger.info("Accuracy is" + "".join([" {:.4f}".format(i) for i in acc])) else: logger.info("Accuracy is {:.4f}".format(acc)) elif mode == 'performance': - batch_size = self._b_dataloader.batch_size - latency = np.array(result_list).mean() / batch_size - self._results[mode] = acc, batch_size, result_list logger.info("Batch size = {}".format(batch_size)) logger.info("Latency: {:.3f} ms".format(latency * 1000)) logger.info("Throughput: {:.3f} images/sec".format(1. / latency)) @@ -475,10 +467,9 @@ def model(self, user_model): auto inferenced, but sometimes auto inferenced inputs/outputs will not meet your requests, so it is better to set them manually in config yaml file. 
- Another corner case is slim model of tensorflow, - be careful of the name of model configured in yaml file, - make sure the name is in supported slim model list. - + Another corner case is the slim model of tensorflow, + be careful of the name of the model configured in the yaml file, + make sure the name is in the supported slim model list. """ if not isinstance(user_model, BaseModel): logger.warning("Force convert framework model to neural_compressor model.") @@ -525,7 +516,7 @@ def metric(self, user_metric): if deep_get(self.conf.usr_cfg, "evaluation.accuracy.metric"): logger.warning("Override the value of `metric` field defined in yaml file" \ " as user defines the value of `metric` attribute by code.") - + if isinstance(user_metric, NCMetric): metric_cfg = {user_metric.name : {**user_metric.kwargs}} deep_set(self.conf.usr_cfg, "evaluation.accuracy.metric", metric_cfg) diff --git a/neural_compressor/experimental/common/__init__.py b/neural_compressor/experimental/common/__init__.py index 6313abcf296..a5f07849745 100644 --- a/neural_compressor/experimental/common/__init__.py +++ b/neural_compressor/experimental/common/__init__.py @@ -1,3 +1,4 @@ +"""Intel® Neural Compressor: An open-source Python library supporting common model.""" #!/usr/bin/env python # -*- coding: utf-8 -*- # diff --git a/neural_compressor/experimental/common/criterion.py b/neural_compressor/experimental/common/criterion.py index 4382e827225..11308854d10 100644 --- a/neural_compressor/experimental/common/criterion.py +++ b/neural_compressor/experimental/common/criterion.py @@ -1252,14 +1252,17 @@ def __call__(self, **kwargs): class SelfKnowledgeDistillationLoss(KnowledgeDistillationFramework): """SelfKnowledge Distillation Loss.""" - def __init__(self, layer_mappings=[], loss_types=None, loss_weights=None, temperature=1.0,add_origin_loss=False, student_model=None, teacher_model=None): + def __init__(self, layer_mappings=[], loss_types=None, loss_weights=None, temperature=1.0,add_origin_loss=False, + student_model=None, teacher_model=None): """Initialize SelfKnowledge Distillation Loss class. Args: layer_mappings (list): layers of distillation.Format like - [[[student1_layer_name1, teacher_layer_name1],[student2_layer_name1, teacher_layer_name1]],[[student1_layer_name2, teacher_layer_name2],[student2_layer_name2, teacher_layer_name2]]] + [[[student1_layer_name1, teacher_layer_name1],[student2_layer_name1, teacher_layer_name1]], + [[student1_layer_name2, teacher_layer_name2],[student2_layer_name2, teacher_layer_name2]]] loss_types (list, optional): loss types. Defaults to ['CE'] * len(layer_mappings). - loss_weights (list, optional): loss weights. Defaults to [1.0 / len(layer_mappings)] * len(layer_mappings).temperature (float, optional): use to calculate the soft label CE. + loss_weights (list, optional): loss weights. Defaults to [1.0 / len(layer_mappings)] * + len(layer_mappings).temperature (float, optional): use to calculate the soft label CE. temperature (optional): temperature. Defaults to 1.0. add_origin_loss (bool, optional): whether to add origin loss for hard label loss. student_model (optional): student model. Defaults to None. 
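To make the layer_mappings format documented above more concrete, the following sketch shows how the SelfKnowledgeDistillationLossConfig class added to config.py earlier in this patch could be filled in; the layer names and loss types are hypothetical placeholders, not values taken from this change.

```python
from neural_compressor.config import SelfKnowledgeDistillationLossConfig

# Hypothetical module names: each inner pair is a (student_layer, teacher_layer)
# distillation point, and each outer group is one self-distillation branch.
loss_conf = SelfKnowledgeDistillationLossConfig(
    layer_mappings=[
        [["block1.output", "deepest_block.output"]],
        [["block2.output", "deepest_block.output"]],
    ],
    temperature=1.0,
    loss_types=["L2", "L2"],      # assumed loss types, one entry per mapping group
    loss_weights=[0.5, 0.5],
    add_origin_loss=True,
)

# The resulting DotDict would typically be handed to DistillationConfig as its
# criterion, e.g. DistillationConfig(teacher_model=model, criterion=loss_conf).
print(loss_conf.config["SelfKnowledgeDistillationLoss"]["loss_weights"])
```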
@@ -1342,14 +1345,17 @@ class PyTorchSelfKnowledgeDistillationLoss( SelfKnowledgeDistillationLoss ): """PyTorch SelfKnowledge Distillation Loss.""" - def __init__(self, layer_mappings=[], loss_types=None, loss_weights=None, temperature=1.0,add_origin_loss=False, student_model=None, teacher_model=None): + def __init__(self, layer_mappings=[], loss_types=None, loss_weights=None, temperature=1.0,add_origin_loss=False, + student_model=None, teacher_model=None): """Initialize PyTorch SelfKnowledge Distillation Loss class. Args: layer_mappings (list): layers of distillation.Format like - [[[student1_layer_name1, teacher_layer_name1],[student2_layer_name1, teacher_layer_name1]],[[student1_layer_name2, teacher_layer_name2],[student2_layer_name2, teacher_layer_name2]]] + [[[student1_layer_name1, teacher_layer_name1],[student2_layer_name1, teacher_layer_name1]], + [[student1_layer_name2, teacher_layer_name2],[student2_layer_name2, teacher_layer_name2]]] loss_types (list, optional): loss types. Defaults to ['CE'] * len(layer_mappings). - loss_weights (list, optional): loss weights. Defaults to [1.0 / len(layer_mappings)] * len(layer_mappings).temperature (float, optional): use to calculate the soft label CE. + loss_weights (list, optional): loss weights. Defaults to [1.0 / len(layer_mappings)] * + len(layer_mappings).temperature (float, optional): use to calculate the soft label CE. temperature (optional): temperature. Defaults to 1.0. add_origin_loss (bool, optional): whether to add origin loss for hard label loss. student_model (optional): student model. Defaults to None. @@ -1512,4 +1518,4 @@ def __call__(self, **kwargs): class: PyTorchSelfKnowledgeDistillationLoss param dict (dict): param dict """ - return PyTorchSelfKnowledgeDistillationLoss, self._param_check() + return PyTorchSelfKnowledgeDistillationLoss, self._param_check() \ No newline at end of file diff --git a/neural_compressor/experimental/component.py b/neural_compressor/experimental/component.py index 7a3a225b54e..25ab4d4ba93 100644 --- a/neural_compressor/experimental/component.py +++ b/neural_compressor/experimental/component.py @@ -105,14 +105,6 @@ def _init_with_conf(self): logger.error("{}.".format(e)) raise RuntimeError("{} is not correctly installed. " \ "Please check your environment".format(lib)) - if self.framework == 'tensorflow' or self.framework == 'inteltensorflow': - try: - import tensorflow as tf - except Exception as e: - logger.error("{}.".format(e)) - raise RuntimeError( - "The TensorFlow framework is not correctly installed. Please check your environment" - ) def prepare(self): """Register Quantization Aware Training hooks.""" @@ -133,7 +125,6 @@ def prepare(self): self.register_hook('on_train_begin', self.adaptor._pre_hook_for_qat) self.register_hook('on_train_end', self.adaptor._post_hook_for_qat) - def prepare_qat(self): """Register Quantization Aware Training hooks.""" if self.adaptor is None: diff --git a/neural_compressor/experimental/data/datasets/bert_dataset.py b/neural_compressor/experimental/data/datasets/bert_dataset.py index 636b3bef28f..c22abaa996e 100644 --- a/neural_compressor/experimental/data/datasets/bert_dataset.py +++ b/neural_compressor/experimental/data/datasets/bert_dataset.py @@ -33,7 +33,7 @@ @dataset_registry(dataset_type="bert", framework="pytorch", dataset_format='') class PytorchBertDataset(Dataset): """PyTorch dataset used for model Bert. - + This Dataset is to construct from the Bert TensorDataset and not a full implementation from yaml config. 
The original repo link is: https://github.com/huggingface/transformers. When you want use this Dataset, you should add it before you initialize your DataLoader. diff --git a/neural_compressor/experimental/distillation.py b/neural_compressor/experimental/distillation.py index 7ad630506ee..c87ef341f22 100644 --- a/neural_compressor/experimental/distillation.py +++ b/neural_compressor/experimental/distillation.py @@ -92,6 +92,7 @@ def _on_train_begin(self, dataloader=None): self.best_model = copy.deepcopy(self._model) else: self.best_model = self._model + def _on_step_begin(self, batch_id): """Operations called on the beginning of batches.""" if self.criterion is not None and hasattr(self.criterion, 'clear_features'): @@ -144,7 +145,10 @@ def _on_epoch_end(self): if (isinstance(score, list) and all([s > b_s for s, b_s in zip(score, self.best_score)])) or score > self.best_score: self.best_score = score - self.best_model = copy.deepcopy(self._model._model) + if self.framework == "pytorch": + self.best_model = copy.deepcopy(self._model) + else: + self.best_model = self._model def init_train_cfg(self): """Initialize the training configuration.""" @@ -288,11 +292,7 @@ def execute(self): logger.info("Model distillation is done.") if self._eval_func is not None: logger.info("Start to evaluate the distilled model.") - if self.best_model: - if self.framework == "pytorch": - self._model._model = self.best_model - else: - self._model = self.best_model + self._model = self.best_model if self.best_model else self._model score = self._eval_func( self._model if getattr(self._eval_func, 'builtin', None) else self._model.model ) diff --git a/neural_compressor/experimental/model_conversion.py b/neural_compressor/experimental/model_conversion.py index f5d11f0f671..489128d93e3 100644 --- a/neural_compressor/experimental/model_conversion.py +++ b/neural_compressor/experimental/model_conversion.py @@ -157,7 +157,7 @@ def dataset(self, dataset_type, *args, **kwargs): """Return dataset. Args: - dataset_typ: dataset type + dataset_type: dataset type Returns: class: dataset class diff --git a/neural_compressor/experimental/pruning.py b/neural_compressor/experimental/pruning.py index f005e3ee5db..7c318e38bf9 100644 --- a/neural_compressor/experimental/pruning.py +++ b/neural_compressor/experimental/pruning.py @@ -114,6 +114,7 @@ def _on_after_optimizer_step(self): pruner.on_after_optimizer_step() def prepare(self): + """Prepare the pruning process by generating hooks and pruners.""" self.generate_hooks() self.generate_pruners() diff --git a/neural_compressor/experimental/quantization.py b/neural_compressor/experimental/quantization.py index 5f0eda5ecf9..cab874bcca7 100644 --- a/neural_compressor/experimental/quantization.py +++ b/neural_compressor/experimental/quantization.py @@ -133,6 +133,9 @@ def pre_process(self): self._create_eval_dataloader(cfg) self._create_calib_dataloader(cfg) strategy = cfg.tuning.strategy.name.lower() + if cfg.quantization.optimization_level == 0: + strategy = "conservative" + logger.info("Optimization level is 0: use the conservative strategy to improve performance while keeping accuracy within the criterion.") assert strategy in STRATEGIES, "Tuning strategy {} is NOT supported".format(strategy) _resume = None @@ -390,12 +393,11 @@ def q_func(self): return None @q_func.setter - @deprecated(version='2.0', reason="please use `train_func` instead") def q_func(self, user_q_func): - """Training function for Quantization-Aware Training. + """Calibrate quantization parameters for post-training static quantization.
It is optional and only takes effect when user choose - "quant_aware_training" approach in yaml. + "post_training_static_quant" approach in yaml. Args: user_q_func: This function takes "model" as input parameter diff --git a/neural_compressor/mix_precision.py b/neural_compressor/mix_precision.py index ab0a774aad3..f89686887b7 100644 --- a/neural_compressor/mix_precision.py +++ b/neural_compressor/mix_precision.py @@ -17,14 +17,14 @@ from .experimental.mixed_precision import MixedPrecision -from neural_compressor.conf.pythonic_config import Config, MixedPrecisionConfig, Options +from neural_compressor.conf.pythonic_config import Config +from neural_compressor.config import MixedPrecisionConfig def fit(model, config=None, eval_func=None, eval_dataloader=None, eval_metric=None, **kwargs): assert isinstance(config, MixedPrecisionConfig), "Please provide MixedPrecisionConfig!" - options = Options() if "options" not in kwargs else kwargs["options"] - conf = Config(quantization=config, options=options) + conf = Config(quantization=config) converter = MixedPrecision(conf) - converter.precisions = config.precisions + converter.precisions = config.extra_precisions converter.model = model if eval_func is not None: converter.eval_func = eval_func diff --git a/neural_compressor/model/base_model.py b/neural_compressor/model/base_model.py index 029723ad821..c42604f96fa 100644 --- a/neural_compressor/model/base_model.py +++ b/neural_compressor/model/base_model.py @@ -42,12 +42,7 @@ def save(self, root, *args, **kwargs): def export( self, save_path: str, - input, - target_model_type: str = 'ONNX', - quant_format: str = 'QDQ', - opset_version: int = 14, - *args, - **kwargs + conf, ): ''' abstract method of model convertion to ONNX''' raise NotImplementedError diff --git a/neural_compressor/model/model.py b/neural_compressor/model/model.py index 13629a19038..59a87d51a29 100644 --- a/neural_compressor/model/model.py +++ b/neural_compressor/model/model.py @@ -949,11 +949,6 @@ def save(self, root=None): f.write(self.graph_def.SerializeToString()) logger.info("Save quantized model to {}.".format(pb_file)) - @abstractmethod - def convert(self, src_type="QDQ", dst_type="TFDO", *args, **kwargs): - ''' abstract method of model saving, Tensorflow model only''' - raise NotImplementedError - class TensorflowSavedModelModel(TensorflowBaseModel): def get_all_weight_names(self): diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index fa09e64e45d..42b5cee2d29 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -20,11 +20,9 @@ import inspect import sys from collections import OrderedDict, UserDict -from abc import abstractmethod from ..adaptor.torch_utils.util import input2tuple from neural_compressor.utils.utility import LazyImport, compute_sparsity from neural_compressor.utils import logger -from neural_compressor.conf.dotdict import deep_get, deep_set from neural_compressor.conf import config as cfg from neural_compressor.model.base_model import BaseModel @@ -47,8 +45,41 @@ def __init__(self, model, **kwargs): self.q_config = None self._workspace_path = '' self.is_quantized = False + try: + self.fp32_model = copy.deepcopy(model) + except Exception as e: # pragma: no cover + logger.warning("Fail to deep copy the model due to {}, inplace is used now.".format( + repr(e))) + self.fp32_model = model self.kwargs = kwargs if kwargs else None + def __repr__(self): + # rewirte this func to avoid printing fp32_model + from 
torch.nn.modules.module import _addindent + # We treat the extra repr like the sub-module, one item per line + extra_lines = [] + extra_repr = self.extra_repr() + # empty string will be split into list [''] + if extra_repr: + extra_lines = extra_repr.split('\n') + child_lines = [] + for key, module in self._modules.items(): + if key == 'fp32_model': + continue + mod_str = repr(module) + mod_str = _addindent(mod_str, 2) + child_lines.append('(' + key + '): ' + mod_str) + lines = extra_lines + child_lines + main_str = self._get_name() + '(' + if lines: + # simple one-liner info, which most builtin Modules will use + if len(extra_lines) == 1 and not child_lines: + main_str += extra_lines[0] + else: + main_str += '\n ' + '\n '.join(lines) + '\n' + main_str += ')' + return main_str + def forward(self, *args, **kwargs): return self._model(*args, **kwargs) @@ -356,13 +387,18 @@ def export_to_fp32_onnx( opset_version=14, dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}}, + input_names=None, + output_names=None, do_constant_folding=True, verbose=True, fp32_model=None, ): - example_input_names = ['input'] - if isinstance(example_inputs, dict) or isinstance(example_inputs, UserDict): - example_input_names = list(example_inputs.keys()) + if input_names: + example_input_names = input_names + else: + example_input_names = ['input'] + if isinstance(example_inputs, dict) or isinstance(example_inputs, UserDict): + example_input_names = list(example_inputs.keys()) model = self.model if fp32_model: model = fp32_model @@ -372,6 +408,7 @@ def export_to_fp32_onnx( save_path, opset_version=opset_version, input_names=example_input_names, + output_names=output_names, dynamic_axes=dynamic_axes, do_constant_folding=do_constant_folding, ) @@ -387,6 +424,8 @@ def export_to_bf16_onnx(self, opset_version=14, dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}}, + input_names=None, + output_names=None, do_constant_folding=True, verbose=True, ): @@ -396,6 +435,8 @@ def export_to_bf16_onnx(self, example_inputs = example_inputs, opset_version=opset_version, dynamic_axes=dynamic_axes, + input_names=input_names, + output_names=output_names, do_constant_folding=do_constant_folding, verbose=False, ) @@ -438,6 +479,8 @@ def export_to_int8_onnx( opset_version=14, dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}}, + input_names=None, + output_names=None, do_constant_folding=True, quant_format='QDQ', dtype='S8S8', @@ -466,22 +509,13 @@ def export_to_int8_onnx( "No quantization configuration found, " + \ "please use the model generated by INC quantizer" if 'dynamic' in self.q_config['approach']: - op_types_to_quantize=['MatMul', 'Gather', "LSTM", 'Conv'] - pytorch_op_types_to_quantize=['Linear', 'Embedding', "LSTM", - 'Conv1d', 'Conv2d'] - addition_op_to_quantize = list(ortq.registry.IntegerOpsRegistry.keys()) + op_types_to_quantize=['MatMul', 'Gather', "LSTM"] + pytorch_op_types_to_quantize=['Linear', 'Embedding', "LSTM"] + addition_op_to_quantize = [] else: op_types_to_quantize=['MatMul', 'Gather', 'Conv'] pytorch_op_types_to_quantize=['Linear', 'Embedding', 'Conv1d', 'Conv2d'] - if quant_format == 'QDQ': - addition_op_to_quantize = list(ortq.registry.QDQRegistry.keys()) - addition_op_to_quantize.remove('Relu') # ValueError: x not in list - else: - addition_op_to_quantize = list(ortq.registry.QLinearOpsRegistry.keys()) - - if 'U8S8' in dtype: - op_types_to_quantize.remove('Gather') - pytorch_op_types_to_quantize.remove('Embedding') + addition_op_to_quantize = [] if 
quant_format == 'QDQ' and opset_version < 13: # pragma: no cover opset_version = 13 @@ -496,6 +530,8 @@ def export_to_int8_onnx( example_inputs = example_inputs, opset_version=opset_version, dynamic_axes=dynamic_axes, + input_names=input_names, + output_names=output_names, do_constant_folding=do_constant_folding, verbose=False, fp32_model=fp32_model @@ -623,17 +659,35 @@ def export_to_int8_onnx( def export( self, save_path: str, - input, - target_model_type: str = 'ONNX', - quant_mode: str = 'QDQ', - opset_version: int = 14, - *args, - **kwargs + conf, ): - if self.q_config is not None: - assert False, "Unsupport convertion from PyTorch to ONNX" - else: - self.export_to_fp32_onnx(save_path, input, opset_version=opset_version) + if conf.dtype == 'int8': + calib_dataloader = conf.kwargs.pop("calib_dataloader", None) + self.export_to_int8_onnx( + save_path=save_path, + example_inputs=conf.example_inputs, + opset_version=conf.opset_version, + dynamic_axes=conf.dynamic_axes, + input_names=conf.input_names, + output_names=conf.output_names, + quant_format=conf.quant_format, + dtype='U8S8', + fp32_model=self.fp32_model, + calib_dataloader=calib_dataloader, + ) + elif conf.dtype == 'fp32': + self.export_to_fp32_onnx( + save_path=save_path, + example_inputs=conf.example_inputs, + opset_version=conf.opset_version, + dynamic_axes=conf.dynamic_axes, + input_names=conf.input_names, + output_names=conf.output_names, + verbose=True, + fp32_model=self.fp32_model, + ) + else: # pragma: no cover + assert False, "Unsupported dtype: {}, please use 'fp32' or 'int8'.".format(conf.dtype) class PyTorchFXModel(PyTorchModel): diff --git a/neural_compressor/objective.py b/neural_compressor/objective.py index 81c96117ef9..f373db46c1b 100644 --- a/neural_compressor/objective.py +++ b/neural_compressor/objective.py @@ -18,6 +18,7 @@ from abc import abstractmethod import time import numpy as np +from copy import deepcopy import tracemalloc from .utils.utility import get_size @@ -178,7 +179,7 @@ def __init__(self, objectives, accuracy_criterion, metric_criterion=[True], \ self.objectives = [OBJECTIVES[i]() for i in objectives] self.representation = [str(i).capitalize() for i in self.objectives] - self.baseline = None + self._baseline = None self.val = None if obj_criterion: if len(self.objectives) != len(obj_criterion) and len(obj_criterion) == 1: @@ -192,7 +193,24 @@ def __init__(self, objectives, accuracy_criterion, metric_criterion=[True], \ self.metric_criterion = metric_criterion self.obj_weight = obj_weight self.is_measure = is_measure - + self._accuracy_target = None + + @property + def baseline(self): + return self._baseline + + @baseline.setter + def baseline(self, val): + self._baseline = val + + @property + def accuracy_target(self): + return self._accuracy_target + + @accuracy_target.setter + def accuracy_target(self, val): + self._accuracy_target = val + def compare(self, last, baseline): """The interface of comparing if metric reaches the goal with acceptable accuracy loss.
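As a reading aid for the accuracy gate that the next hunks add to MultiObjective, here is a small self-contained sketch of the relative/absolute target arithmetic implemented by _get_accuracy_target below; the baseline and tolerable-loss numbers are illustrative only.

```python
# Illustrative values; in the patch these come from the FP32 baseline evaluation
# and the accuracy_criterion settings.
baseline_acc = 0.80        # hypothetical FP32 baseline accuracy
acc_goal = 0.01            # tolerable accuracy loss
higher_is_better = True

# Relative criterion: the tuned model may lose at most 1% of the baseline accuracy.
relative_target = (baseline_acc * (1 - acc_goal) if higher_is_better
                   else baseline_acc * (1 + acc_goal))

# Absolute criterion: the tuned model may lose at most 0.01 in absolute terms.
absolute_target = (baseline_acc - acc_goal if higher_is_better
                   else baseline_acc + acc_goal)

print(round(relative_target, 4))   # 0.792 -> accuracy_meets() requires at least this
print(round(absolute_target, 4))   # 0.79
```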
@@ -248,6 +266,49 @@ def compare(self, last, baseline): zip(acc, acc_target, self.metric_criterion)]) else: return False + + def _get_accuracy_target(self): + assert self._baseline is not None, "Baseline is None" + base_acc, _ = self._baseline + if not isinstance(base_acc, list): + base_acc = [base_acc] + if self.metric_weight is not None and len(base_acc) > 1: + base_acc = [np.mean(np.array(base_acc) * self.metric_weight)] + + if self.relative: + if len(base_acc) == 1: + acc_target = [base_acc[0] * (1 - float(self.acc_goal)) if self.higher_is_better \ + else base_acc[0] * (1 + float(self.acc_goal))] + else: + # use metric_criterion to replace acc_criterion + acc_target = [b_acc * (1 - float(self.acc_goal)) if higher_is_better \ + else b_acc * (1 + float(self.acc_goal)) \ + for b_acc, higher_is_better in zip(base_acc, self.metric_criterion)] + else: + if len(base_acc) == 1: + acc_target = [base_acc[0] - float(self.acc_goal) if self.higher_is_better \ + else base_acc[0] + float(self.acc_goal)] + else: + # use metric_criterion to replace acc_criterion + acc_target = [b_acc - float(self.acc_goal) if higher_is_better \ + else b_acc + float(self.acc_goal) \ + for b_acc, higher_is_better in zip(base_acc, self.metric_criterion)] + return acc_target + + def accuracy_meets(self): + last_acc, _ = deepcopy(self.val) + got_better_result = False + if not isinstance(last_acc, list): + last_acc = [last_acc] + + if self.metric_weight is not None and len(last_acc) > 1: + last_acc = [np.mean(np.array(last_acc) * self.metric_weight)] + if not self._accuracy_target: + self.accuracy_target = self._get_accuracy_target() + all_higher = all([_last > _target for _last, _target in zip(last_acc, self.accuracy_target) ]) + all_lower = all([_last < _target for _last, _target in zip(last_acc, self.accuracy_target) ]) + got_better_result = (all_higher and self.higher_is_better) or (all_lower and not self.higher_is_better) + return got_better_result def evaluate(self, eval_func, model): """The interface of calculating the objective. diff --git a/neural_compressor/quantization.py b/neural_compressor/quantization.py index 025e4c23fa5..272b86fdc0f 100644 --- a/neural_compressor/quantization.py +++ b/neural_compressor/quantization.py @@ -20,7 +20,8 @@ from .data import DATALOADERS, DATASETS from .experimental import Quantization as ExpQuantization from deprecated import deprecated -from neural_compressor.conf.pythonic_config import Config, PostTrainingConfig +from neural_compressor.conf.pythonic_config import Config +from neural_compressor.config import PostTrainingQuantConfig class Quantization(object): """Quantization class automatically searches for optimal quantization recipes for low @@ -155,7 +156,7 @@ def eval_func(model): self.exp_quantizer.q_func = q_func if eval_func is not None: - self.exp_quantizer.eval_func = eval_func + self.exp_quantizer.eval_func = eval_func elif eval_dataloader is not None: self.exp_quantizer.eval_dataloader = eval_dataloader @@ -197,10 +198,14 @@ def postprocess(self, name, postprocess_cls, **kwargs): self.exp_quantizer.postprocess = nc_postprocess -def fit( - model, conf, calib_dataloader=None, calib_func=None, eval_dataloader=None, - eval_func=None, eval_metric=None, options=None, **kwargs -): +def fit(model, + conf, + calib_dataloader=None, + calib_func=None, + eval_dataloader=None, + eval_func=None, + eval_metric=None, + **kwargs): """Quantize the model with a given configure. 
Args: @@ -256,22 +261,21 @@ def eval_func(model): output = model(input) accuracy = metric(output, label) return accuracy - options (Options, optional): The configure for random_seed, workspace, - resume path and tensorboard flag. """ - if isinstance(conf, PostTrainingConfig): - if options is None: - conf = Config(quantization=conf) - else: - conf = Config(quantization=conf, options=options) + if isinstance(conf, PostTrainingQuantConfig): + if eval_func is None and eval_dataloader is None: + conf.performance_only = True + conf = Config(quantization=conf) quantizer = ExpQuantization(conf) quantizer.model = model if eval_func is not None: quantizer.eval_func = eval_func if calib_dataloader is not None: quantizer.calib_dataloader = calib_dataloader + if calib_func is not None: + quantizer.calib_func = calib_func if eval_dataloader is not None: quantizer.eval_dataloader = eval_dataloader if eval_metric is not None: diff --git a/neural_compressor/strategy/conservative.py b/neural_compressor/strategy/conservative.py new file mode 100644 index 00000000000..d4806e59ad5 --- /dev/null +++ b/neural_compressor/strategy/conservative.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +import numpy as np + +from collections import deque +from collections import OrderedDict as COrderedDict +from copy import deepcopy +from typing import Dict, List, Tuple, OrderedDict + +from .strategy import strategy_registry, TuneStrategy +from .st_utils.tuning_space import TuningItem +from ..utils import logger +from ..utils.utility import Statistics + +@strategy_registry +class ConservativeTuneStrategy(TuneStrategy): + def __init__(self, model, conf, q_dataloader, q_func=None, + eval_dataloader=None, eval_func=None, dicts=None, q_hooks=None): + super( + ConservativeTuneStrategy, + self).__init__( + model, + conf, + q_dataloader, + q_func, + eval_dataloader, + eval_func, + dicts, + q_hooks) + self.acc_meet_flag = False + + def next_tune_cfg(self): + """ + Conservative tuning: accuracy first, performance second + + 1. Query all quantifiable ops and save as a list: quantifiable_ops = [(op_name, op_type), ...] + 2. Classify the op by its op type + 3. Add op to quant_queue according to the op type priority + 4. Go through the quant_queue and replace it with the fp32 config in tune_cfg if + accuracy meets the requirements else continue + + For bf16 and fp16, do the same thing as int8 + Note: + 1) other tunable items will using the first option as the default value. + + Yields: + tune_config (dict): It's a dict containing the tuning configuration to run. 
+ """ + + tuning_space = self.tuning_space + calib_sampling_size_lst = tuning_space.root_item.get_option_by_name('calib_sampling_size').options + calib_sampling_size = calib_sampling_size_lst[0] + tune_cfg = self._initialize_tune_cfg() + tune_cfg['calib_sampling_size'] = calib_sampling_size + op_type_priority = self._get_op_type_priority() + quant_items_pool = self._quant_items_pool(op_type_priority) + logger.info(f"*** Try to convert op into lower precision to improve performance.") + for dtype, op_items in quant_items_pool.items(): + logger.info(f"*** Start to convert op into {dtype}.") + for op_type, items_lst in op_items.items(): + logger.info(f"*** Try to convert all {op_type} ops into {dtype}.") + tmp_tune_cfg = deepcopy(tune_cfg) + for item, quant_mode in items_lst: + op_info = item.name + op_config = tuning_space.set_deafult_config(op_info, quant_mode) + tmp_tune_cfg[op_info] = op_config + yield tmp_tune_cfg + if self.acc_meet_flag: + logger.info(f"*** Convert all {op_type} ops to {dtype} and accuracy still meet the requirements") + tune_cfg = deepcopy(tmp_tune_cfg) + else: + tmp_tune_cfg = deepcopy(tune_cfg) + logger.info(f"*** Convert all {op_type} ops to {dtype} but accuracy not meet the requirements") + logger.info(f"*** Try to convert {op_type} op into {dtype} one by one.") + for item, quant_mode in items_lst: + op_info = item.name + op_config = tuning_space.set_deafult_config(op_info, quant_mode) + tmp_tune_cfg[op_info] = op_config + yield tmp_tune_cfg + if self.acc_meet_flag: + tune_cfg[op_info] = op_config + logger.info((f"*** Convert one {op_type} op({op_info}) " + f"into {dtype} and accuracy still meet the requirements")) + else: + tmp_tune_cfg[op_info] = tune_cfg[op_info] + logger.info(f"*** Skip convert {op_info}.") + logger.info(f"*** Ending tuning process due to no quantifiable op left.") + + def traverse(self): + if not (self.cfg.evaluation and self.cfg.evaluation.accuracy and \ + (self.cfg.evaluation.accuracy.metric or self.cfg.evaluation.accuracy.multi_metrics)) \ + and self.eval_func is None: + logger.info("Neither evaluation function nor metric is defined." 
\ + " Generate a quantized model with default quantization configuration.") + self.cfg.tuning.exit_policy.performance_only = True + logger.info("Force setting 'tuning.exit_policy.performance_only = True'.") + logger.info("Generate a fake evaluation function.") + self.eval_func = self._fake_eval_func + + # Get fp32 model baseline + if self.baseline is None: + logger.info("Get FP32 model baseline.") + self._fp32_model = self.model + self.baseline = self._evaluate(self.model) + self.objectives.baseline = self.baseline + # self.best_tune_result = self.baseline + # Initialize the best qmodel as fp32 model + # self.best_qmodel = self._fp32_model + # Record the FP32 baseline + self._add_tuning_history() + self.show_baseline_info() + + # Start tuning + trials_count = 0 + for op_tuning_cfg in self.next_tune_cfg(): + tune_cfg = self._tune_cfg_converter(op_tuning_cfg) + trials_count += 1 + tuning_history = self._find_tuning_history(tune_cfg) + if tuning_history and trials_count < self.cfg.tuning.exit_policy.max_trials: + self.last_tune_result = tuning_history['last_tune_result'] + self.best_tune_result = tuning_history['best_tune_result'] + logger.warn("Find evaluated tuning config, skip.") + continue + logger.debug("Dump current tuning configuration:") + logger.debug(tune_cfg) + self.tuning_times += 1 + self.q_model = self.adaptor.quantize( + copy.deepcopy(tune_cfg), self.model, self.calib_dataloader, self.q_func) + self.algo.calib_iter = tune_cfg['calib_iteration'] + self.algo.q_model = self.q_model + # TODO align the api to let strategy has access to pre_optimized model + assert self.adaptor.pre_optimized_model + self.algo.origin_model = self.adaptor.pre_optimized_model + if self.cfg.quantization.recipes.fast_bias_correction: + self.algo.algorithms[0].quantization_cfg = tune_cfg + self.last_qmodel = self.algo() + assert self.last_qmodel + self.last_tune_result = self._evaluate(self.last_qmodel) + self.acc_meet_flag = self.objectives.accuracy_meets() + if self.acc_meet_flag: + # For the first tuning + if not self.best_tune_result: + self.best_tune_result = self.last_tune_result + self.best_qmodel = self.last_qmodel + self.best_tune_result = self.last_tune_result + else: + # Update current tuning config and model with best performance + get_better_performance = self.compare_performace(self.last_tune_result, self.best_tune_result) + if get_better_performance: + logger.info(f"*** Update the model with better performance.") + self.best_qmodel = self.last_qmodel + self.best_tune_result = self.last_tune_result + else: + logger.info(f"*** The qmodel was not updated due to not achieving better performance.") + # Dump the current state to log + self.dump_tuning_state(trials_count, self.last_tune_result, self.best_tune_result, self.baseline) + # Judge stop or continue tuning + need_stop = self.stop(trials_count) + # Record the tuning history + saved_tune_cfg = copy.deepcopy(tune_cfg) + saved_last_tune_result = copy.deepcopy(self.last_tune_result) + self._add_tuning_history(saved_tune_cfg, + saved_last_tune_result, + q_config=self.q_model.q_config) + self.tune_result_record.append(copy.deepcopy(self.last_tune_result)) + self.tune_cfg = tune_cfg + self._dump_tuning_process_statistics() + if need_stop: + if self.cfg.tuning.diagnosis and self.cfg.tuning.diagnosis.diagnosis_after_tuning: + logger.debug(f'*** Start to do diagnosis (inspect tensor).') + self._diagnosis() + if self.use_multi_objective and len(self.tune_result_record) > 1 and \ + self.best_tune_result is not None: + best_trail, best_result = 
self.objectives.best_result(self.tune_result_record, + copy.deepcopy(self.baseline)) + if best_result != self.best_tune_result: + from neural_compressor.utils.utility import recover + self.best_qmodel = recover(self.model.model, + os.path.join(self.cfg.tuning.workspace.path, 'history.snapshot'), + best_trail) + self.best_tune_result = best_result + self._dump_tuning_process_statistics() + break + + def stop(self, trials_count): + need_stop = False + if trials_count >= self.cfg.tuning.exit_policy.max_trials: + need_stop = True + return need_stop + + def compare_performace(self, last_tune_result, best_tune_result): # pragma: no cover + _, last_perf = last_tune_result + _, best_perf = best_tune_result + return last_perf[0] < best_perf[0] + + def dump_tuning_state(self, trials_count, last_tune_result, best_tune_result, baseline): + if last_tune_result: + last_tune = last_tune_result[0] if \ + isinstance(last_tune_result[0], list) else [last_tune_result[0]] + for name, data in zip(self.metric_name, last_tune): + if len(self.tune_data[name]) == 1: + self.tune_data[name].append(data) + else: + self.tune_data[name][1] = data + + if self.metric_weight and len(last_tune) > 1: + weighted_acc = np.mean(np.array(last_tune) * self.metric_weight) + if len(self.tune_data['Weighted accuracy']) == 1: + self.tune_data['Weighted accuracy'].append(weighted_acc) + else: + self.tune_data['Weighted accuracy'][1] = weighted_acc + last_tune = [weighted_acc] + + last_tune_msg = '[Accuracy (int8|fp32):' + \ + ''.join([' {:.4f}|{:.4f}'.format(last, base) for last, base in \ + zip(last_tune, self.tune_data['baseline'])]) + \ + ''.join([', {} (int8|fp32): {:.4f}|{:.4f}'.format( \ + x, y, z) for x, y, z in zip( \ + self.objectives.representation, last_tune_result[1], baseline[1]) \ + if x != 'Accuracy']) + ']' + else: # pragma: no cover + last_tune_msg = 'n/a' + for name in self.tune_data.keys() - {'baseline'}: + if len(self.tune_data[name]) == 1: + self.tune_data[name].append('n/a') + else: + self.tune_data[name][1] = 'n/a' + + if best_tune_result: + best_tune = best_tune_result[0] if isinstance(best_tune_result[0], list) \ + else [best_tune_result[0]] + + for name, data in zip(self.metric_name, best_tune): + if len(self.tune_data[name]) == 2: + self.tune_data[name].append(data) + else: + self.tune_data[name][2] = data + + if self.metric_weight and len(best_tune) > 1: + weighted_acc = np.mean(np.array(best_tune) * self.metric_weight) + + if len(self.tune_data['Weighted accuracy']) == 2: + self.tune_data['Weighted accuracy'].append(weighted_acc) + else: # pragma: no cover + self.tune_data['Weighted accuracy'][2] = weighted_acc + + best_tune = [weighted_acc] + + best_tune_msg = '[Accuracy:' + ''.join([' {:.4f}'.format(best) \ + for best in best_tune]) + ''.join([', {}: {:.4f}'.format(x,y) \ + for x,y in zip(self.objectives.representation, \ + best_tune_result[1]) if x != 'Accuracy']) + ']' + + else: + best_tune_msg = 'n/a' + for name in self.tune_data.keys() - {'baseline'}: + if len(self.tune_data[name]) == 2: + self.tune_data[name].append('n/a') + else: + self.tune_data[name][2] = 'n/a' + + logger.info("Tune {} result is: {}, Best tune result is: {}".format(trials_count, + last_tune_msg, + best_tune_msg)) + output_data = [[info_type, + '{:.4f} '.format(self.tune_data[info_type][0]) if \ + not isinstance(self.tune_data[info_type][0], str) else self.tune_data[info_type][0], + '{:.4f} '.format(self.tune_data[info_type][1]) if \ + not isinstance(self.tune_data[info_type][1], str) else self.tune_data[info_type][1], + '{:.4f} 
'.format(self.tune_data[info_type][2]) if \
+                        not isinstance(self.tune_data[info_type][2], str) else self.tune_data[info_type][2]] \
+                       for info_type in self.tune_data.keys() if info_type != 'baseline']
+
+        output_data.extend([[obj,
+                             '{:.4f} '.format(baseline[1][i]) if baseline else 'n/a',
+                             '{:.4f} '.format(last_tune_result[1][i]) if last_tune_result else 'n/a',
+                             '{:.4f} '.format(best_tune_result[1][i]) if best_tune_result else 'n/a'] \
+                            for i, obj in enumerate(self.objectives.representation)])
+
+        Statistics(output_data,
+                   header='Tune Result Statistics',
+                   field_names=['Info Type', 'Baseline', 'Tune {} result'.format(trials_count), \
+                                'Best tune result']).print_stat()
+
+    def _get_op_type_priority(self):
+        optypewise_cap = self.capability['optypewise']
+        op_type_priority = list(optypewise_cap.keys())
+        return op_type_priority
+
+    def _sorted_item_by_op_type(self,
+                                items_lst: List[Tuple[TuningItem, str]],
+                                op_type_priority: List[str]) -> OrderedDict[str, List]:
+        """Sort the tuning items according to their op type.
+
+        Args:
+            items_lst: The tuning item list. # [(op_item, quant_mode), ... ]
+            op_type_priority: The op type list with the order. # [optype_1, optype_2]
+
+        Returns:
+            The tuning items sorted according to their op type.
+            OrderedDict:
+                # op_type: [(TuningItem, quant_mode), ...]
+                conv2d: [(TuningItem, static), (TuningItem, static)]
+                linear: [(TuningItem, static), (TuningItem, static)]
+        """
+        op_type_lst_from_items_lst = list(set([item[0].name[1] for item in items_lst]))
+        # For items whose op type does not exist in the priority list, assign the lowest priority.
+        sorted_op_type_lst = [op_type for op_type in op_type_priority if op_type in op_type_lst_from_items_lst]
+        sorted_op_type_lst += list(set(op_type_lst_from_items_lst) - set(op_type_priority))
+        sorted_items = COrderedDict()
+        for op_type in sorted_op_type_lst:
+            sorted_items[op_type] = []
+        for op_item, quant_mode in items_lst:
+            op_type = op_item.name[1]
+            sorted_items[op_type].append((op_item, quant_mode))
+        return sorted_items
+
+    def _initialize_tune_cfg(self):
+        """Initialize the tuning config, keeping as many ops as possible at fp32.
+
+        Returns:
+            The initialized tuning config.
+        """
+        tuning_space = self.tuning_space
+        quant_mode_wise_items = tuning_space.quant_mode_wise_items
+        # Initialize the tuning config
+        initial_tuning_cfg = {}
+        all_ops = set()
+        fp32_ops = []
+        for quant_mode, items_lst in quant_mode_wise_items.items():
+            items_name_lst = [item.name for item in items_lst]
+            all_ops = all_ops.union(set(items_name_lst))
+            if quant_mode == "fp32":
+                fp32_ops += [item.name for item in items_lst]
+        non_fp32_ops_dtype = {}
+        fp32_ops_set = set(fp32_ops)
+        for quant_mode, items_lst in quant_mode_wise_items.items():
+            items_name_set = set([item.name for item in items_lst])
+            tmp_non_fp32_ops = items_name_set.difference(fp32_ops_set)
+            if tmp_non_fp32_ops:
+                for op_info in tmp_non_fp32_ops:
+                    non_fp32_ops_dtype[op_info] = quant_mode
+        for op_info in fp32_ops:
+            initial_tuning_cfg[op_info] = tuning_space.set_deafult_config(op_info, "fp32")
+        for op_info, quant_mode in non_fp32_ops_dtype.items():
+            initial_tuning_cfg[op_info] = tuning_space.set_deafult_config(op_info, quant_mode)
+        return initial_tuning_cfg
+
+    def _quant_items_pool(self, op_type_priority: List[str]) -> OrderedDict[
+        str, OrderedDict[str, List[Tuple[TuningItem, str]]]]:
+        """Create the op queue to be quantized.
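+
+        Ops that can be converted to a lower precision are grouped first by the target
+        dtype (Level 1) and then by op type (Level 2), as illustrated in the table below.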
+ + -------------------------------------------------------------------------- + | Level 1 | bf16 | fp16 | static/dynamic | + | Level 2 | conv2d, linear, ...| conv2d, linear, ...| conv2d, linear, ...| + + Args: + op_type_priority: The optype list with priority. + + Returns: + The op item pool to convert into lower precision. + quant_items_pool(OrderDict): + bf16: + OrderDict: + conv2d: [(TuningItem, bf16), (TuningItem, bf16)] + linear: [(TuningItem, bf16), (TuningItem, bf16)] + int8: + OrderDict: + # (TuningItem, quant_mode) + conv2d: [(TuningItem, static), (TuningItem, static)] + linear: [(TuningItem, static), (TuningItem, static)] + """ + quant_mode_wise_items = self.tuning_space.quant_mode_wise_items + # Add all quantized pair into queue + quant_items_pool = COrderedDict() + # collect and sorted all ops that support bf16 and fp16 + for quant_mode in ['bf16', 'fp16']: + if quant_mode in quant_mode_wise_items: + op_item_pairs = [(op_item, quant_mode) for op_item in quant_mode_wise_items[quant_mode]] + op_item_pairs = self._sorted_item_by_op_type(op_item_pairs, op_type_priority) + quant_items_pool[quant_mode] = op_item_pairs + op_item_pairs = [] + quant_ops_name_set = set() + # collect and sorted all ops that support int8 + for quant_mode, items_lst in quant_mode_wise_items.items(): + if "static" in quant_mode or 'dynamic' in quant_mode: + _quant_mode = "static" if "static" in quant_mode else "dynamic" + op_item_pairs += [(item, _quant_mode) for item in items_lst if item.name not in quant_ops_name_set] + quant_ops_name_set = quant_ops_name_set.union([item.name for item in items_lst]) + op_item_pairs = self._sorted_item_by_op_type(op_item_pairs, op_type_priority) + quant_items_pool['int8'] = op_item_pairs + return quant_items_pool + + + + + + + + + + + + + + + + diff --git a/neural_compressor/strategy/st_utils/tuning_space.py b/neural_compressor/strategy/st_utils/tuning_space.py index b2cfddbdd38..0c4a71559ec 100644 --- a/neural_compressor/strategy/st_utils/tuning_space.py +++ b/neural_compressor/strategy/st_utils/tuning_space.py @@ -422,16 +422,15 @@ def set_deafult_config(self, op_name_type, quant_mode): # set the first option as the default if the not support the required quant mode quant_mode_item = op_item.options[0] for quant_item in op_item.options: - if quant_mode == quant_item.name or quant_mode in quant_item.name: + if quant_mode == quant_item.name or (isinstance(quant_mode, str) and quant_mode in quant_item.name): quant_mode_item = quant_item break # set the first option as the default for each tuning item config = {item.name: item.options[0] for item in quant_mode_item.options} op_tuning_config = OpTuningConfig(op_name_type[0], op_name_type[1], - quant_mode_item.name, + quant_mode, self, config) return op_tuning_config - diff --git a/neural_compressor/strategy/strategy.py b/neural_compressor/strategy/strategy.py index 63710b43264..7be1897a948 100644 --- a/neural_compressor/strategy/strategy.py +++ b/neural_compressor/strategy/strategy.py @@ -219,7 +219,8 @@ def traverse(self): if self.baseline is None: logger.info("Get FP32 model baseline.") self._fp32_model = self.model - self.baseline = self._evaluate(self.model) + self.baseline = self._evaluate(self.model) + self.objectives.baseline = self.baseline # record the FP32 baseline self._add_tuning_history() self.show_baseline_info() diff --git a/neural_compressor/training.py b/neural_compressor/training.py index 4cb93e39409..8f0dcecb57e 100644 --- a/neural_compressor/training.py +++ b/neural_compressor/training.py @@ -16,8 +16,8 @@ 
 # limitations under the License.
 
 import copy
-from .conf.pythonic_config import Config, DistillationConfig, Options, \
-    PruningConfig, QuantizationAwareTrainingConfig
+from .conf.pythonic_config import Config
+from .config import DistillationConfig, PruningConfig, QuantizationAwareTrainingConfig
 from .experimental.distillation import Distillation
 from .experimental.pruning import Pruning
 from .experimental.quantization import Quantization
@@ -54,8 +54,7 @@ class CompressionManager:
             compression_manager.save("path_to_save")
     """
     def __init__(self, component):
-        self.callbacks = \
-            component.components[0] if isinstance(component, Scheduler) else component
+        self.callbacks = self.CallBacks(component)
         self.model = component.model
         try:  # TODO: export to ONNX model need original fp32 model now, will remove it
@@ -65,6 +64,46 @@ def __init__(self, component):
             logger.warning("Fail to deep copy the model due to {}.".format(repr(e)))
             self.fp32_model = None
 
+    class CallBacks:
+        def __init__(self, component):
+            self.callbacks = \
+                component.components[0] if isinstance(component, Scheduler) else component
+
+        def on_train_begin(self, dataloader=None):
+            """Called before the beginning of epochs."""
+            self.callbacks.on_train_begin(dataloader)
+
+        def on_train_end(self):
+            """Called after the end of epochs."""
+            self.callbacks.on_train_end()
+
+        def on_epoch_begin(self, epoch):
+            """Called at the beginning of an epoch."""
+            self.callbacks.on_epoch_begin(epoch)
+
+        def on_step_begin(self, batch_id):
+            """Called at the beginning of a batch."""
+            self.callbacks.on_step_begin(batch_id)
+
+        def on_after_compute_loss(self, input, student_output, student_loss, teacher_output=None):
+            """Called at the end of loss computation."""
+            return self.callbacks.on_after_compute_loss(
+                input, student_output, student_loss, teacher_output=teacher_output
+            )
+
+        def on_before_optimizer_step(self):
+            """Called at the end of the backward pass, before the optimizer step."""
+            self.callbacks.on_before_optimizer_step()
+
+
+        def on_step_end(self):
+            """Called at the end of a batch."""
+            return self.callbacks.on_step_end()
+
+        def on_epoch_end(self):
+            """Called at the end of an epoch."""
+            return self.callbacks.on_epoch_end()
+
     def save(self, root=None):
         """Save compressed model.
@@ -101,7 +140,7 @@ def export( assert False, "Unsupport export for {} model".format(type(self.model)) -def prepare_compression(model: Callable, confs: Union[Callable, List], options=None, **kwargs): +def prepare_compression(model: Callable, confs: Union[Callable, List], **kwargs): """_summary_ Args: @@ -135,20 +174,18 @@ def prepare_compression(model: Callable, confs: Union[Callable, List], options=N compression_manager.on_train_end() """ - if options is None: - options = Options() if isinstance(confs, List): from .experimental.scheduler import Scheduler comps = [] for conf in confs: if isinstance(conf, QuantizationAwareTrainingConfig): - conf_ = Config(quantization=conf, options=options) + conf_ = Config(quantization=conf) com = Quantization(conf_) elif isinstance(conf, PruningConfig): - conf_ = Config(pruning=conf, options=options) + conf_ = Config(pruning=conf) com = Pruning(conf_) elif isinstance(conf, DistillationConfig): - conf_ = Config(distillation=conf, options=options) + conf_ = Config(distillation=conf) com = Distillation(conf_) assert conf.teacher_model is not None, \ "Please set teacher_model in DistillationConfig" @@ -165,13 +202,13 @@ def prepare_compression(model: Callable, confs: Union[Callable, List], options=N component = scheduler else: if isinstance(confs, QuantizationAwareTrainingConfig): - conf = Config(quantization=confs, options=options) + conf = Config(quantization=confs) component = Quantization(conf) elif type(confs) == PruningConfig: - conf = Config(pruning=confs, options=options) + conf = Config(pruning=confs) component = Pruning(conf) elif type(confs) == DistillationConfig: - conf = Config(distillation=confs, options=options) + conf = Config(distillation=confs) component = Distillation(conf) assert confs.teacher_model is not None, \ "Please set teacher_model in DistillationConfig" diff --git a/requirements.txt b/requirements.txt index 6da20f57fee..ebae42235a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -Cython numpy pandas pyyaml @@ -7,7 +6,6 @@ scikit-image matplotlib schema py-cpuinfo -hyperopt contextlib2 requests Flask @@ -20,7 +18,6 @@ Pillow pycocotools-windows; sys_platform != 'linux' pycocotools; sys_platform == 'linux' opencv-python -sigopt prettytable cryptography sqlalchemy==1.4.27 diff --git a/setup.py b/setup.py index 7dd91a69efb..9ae6370a040 100644 --- a/setup.py +++ b/setup.py @@ -36,11 +36,11 @@ # define install requirements install_requires_list = [ - 'numpy', 'pyyaml', 'scikit-learn', 'schema', 'py-cpuinfo', 'hyperopt', 'pandas', 'pycocotools', - 'opencv-python', 'requests', 'psutil', 'Pillow', 'sigopt', 'prettytable', 'cryptography', 'Cython', - 'deprecated'] + 'numpy', 'pyyaml', 'scikit-learn', 'schema', 'py-cpuinfo', 'pandas', 'pycocotools', + 'opencv-python', 'requests', 'psutil', 'Pillow', 'prettytable', 'deprecated'] ux_install_requires_list = [ - 'Flask-Cors', 'Flask-SocketIO', 'Flask', 'gevent-websocket', 'gevent','sqlalchemy==1.4.27', 'alembic==1.7.7'] + 'Flask-Cors', 'Flask-SocketIO', 'Flask', 'gevent-websocket', 'gevent','sqlalchemy==1.4.27', + 'alembic==1.7.7', 'cryptography'] # define scripts scripts_list = [] diff --git a/sphinx-requirements.txt b/sphinx-requirements.txt deleted file mode 100755 index 71cfc10b849..00000000000 --- a/sphinx-requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -sphinx -sphinx-rtd-theme -recommonmark -sphinx-markdown-tables -sphinx-md \ No newline at end of file diff --git a/test/adaptor/pytorch_adaptor/test_adaptor_pytorch.py 
b/test/adaptor/pytorch_adaptor/test_adaptor_pytorch.py deleted file mode 100644 index aeeafd0b660..00000000000 --- a/test/adaptor/pytorch_adaptor/test_adaptor_pytorch.py +++ /dev/null @@ -1,1406 +0,0 @@ -import copy -import neural_compressor.adaptor.pytorch as nc_torch -import numpy as np -import os -import shutil -import torch -import torch.nn as nn -import torch.nn.quantized as nnq -import unittest -import os -from neural_compressor import Options, PostTrainingConfig, QuantizationAwareTrainingConfig -from neural_compressor.conf.config import QuantConf -from neural_compressor.data import DATASETS, DATALOADERS -from neural_compressor.adaptor import FRAMEWORKS -from neural_compressor.model import MODELS -from neural_compressor.experimental import Quantization, common -from neural_compressor.experimental.data.datasets.dataset import DATASETS -from neural_compressor import quantization -from neural_compressor.training import prepare_compression -from neural_compressor.utils.pytorch import load -from neural_compressor.utils.utility import recover -from neural_compressor.utils.utility import LazyImport -from torch.quantization import QuantStub, DeQuantStub -from packaging.version import Version - - -# improve lazy import UT coverage -resnet18 = LazyImport("torchvision.models.resnet18") -q_resnet18 = LazyImport("torchvision.models.quantization.resnet18") - -PT_VERSION = nc_torch.get_torch_version().release -if PT_VERSION >= Version("1.8.0").release: - FX_MODE = True -else: - FX_MODE = False - - -fake_dyn_yaml = """ - model: - name: imagenet - framework: pytorch - - quantization: - approach: post_training_dynamic_quant - op_wise: { - "decoder": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - } - } - evaluation: - accuracy: - metric: - topk: 1 - performance: - warmup: 5 - iteration: 10 - - tuning: - accuracy_criterion: - relative: 0.01 - exit_policy: - timeout: 0 - random_seed: 9527 - workspace: - path: saved - """ - - -fake_ptq_yaml = """ - model: - name: imagenet - framework: pytorch - - quantization: - op_wise: { - - "layer1.0.conv1": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "layer1.0.conv2": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "layer2.0.conv1": { - "activation": {"dtype": ["uint8"], "algorithm": ["minmax"], "granularity": ["per_tensor"], "scheme":["sym"]}, - "weight": {"dtype": ["int8"], "algorithm": ["minmax"], "granularity": ["per_channel"], "scheme":["sym"]} - }, - "layer3.0.conv1": { - "activation": {"dtype": ["uint8"], "algorithm": ["kl"], "granularity": ["per_tensor"], "scheme":["sym"]}, - "weight": {"dtype": ["int8"], "algorithm": ["minmax"], "granularity": ["per_channel"], "scheme":["sym"]} - }, - "layer1.0.add_relu": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - } - evaluation: - accuracy: - metric: - topk: 1 - performance: - warmup: 1 - iteration: 10 - - tuning: - accuracy_criterion: - relative: 0.01 - exit_policy: - timeout: 0 - random_seed: 9527 - workspace: - path: saved - """ - -fake_auto_yaml = """ - model: - name: imagenet - framework: pytorch_fx - - quantization: - approach: post_training_auto_quant - evaluation: - accuracy: - metric: - topk: 1 - performance: - warmup: 1 - iteration: 10 - - tuning: - accuracy_criterion: - relative: 0.01 - exit_policy: - timeout: 1000 - max_trials: 3 - random_seed: 9527 - workspace: - path: saved - """ - - -fake_ptq_yaml_for_fx = """ - model: - name: imagenet - framework: pytorch_fx - - quantization: - 
approach: post_training_auto_quant - op_wise: { - "layer1.0.conv1": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "layer1.0.conv2": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "layer2.0.conv1": { - "activation": {"dtype": ["uint8"], "algorithm": ["minmax"], "granularity": ["per_tensor"], "scheme":["sym"]}, - "weight": {"dtype": ["int8"], "algorithm": ["minmax"], "granularity": ["per_channel"], "scheme":["sym"]} - }, - "layer3.0.conv1": { - "activation": {"dtype": ["uint8"], "algorithm": ["kl"], "granularity": ["per_tensor"], "scheme":["sym"]}, - "weight": {"dtype": ["int8"], "algorithm": ["minmax"], "granularity": ["per_channel"], "scheme":["sym"]} - }, - "layer1.0.add_relu": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "conv.module": { - "weight": {"dtype": ["fp32"]}, - "activation": {"dtype": ["fp32"]} - }, - "default_qconfig": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - } - } - evaluation: - accuracy: - metric: - topk: 1 - performance: - warmup: 5 - iteration: 10 - - tuning: - accuracy_criterion: - relative: 0.01 - exit_policy: - timeout: 0 - random_seed: 9527 - workspace: - path: saved - """ - - -fake_qat_yaml = """ - model: - name: imagenet - framework: pytorch - - quantization: - approach: quant_aware_training - train: - end_epoch: 1 - iteration: 1 - optimizer: - SGD: - learning_rate: 0.0001 - criterion: - CrossEntropyLoss: - reduction: mean - op_wise: { - "layer1.0.conv1": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "layer1.0.conv2": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - }, - "layer2.0.conv1": { - "activation": {"dtype": ["uint8"], "algorithm": ["minmax"], "granularity": ["per_tensor"], "scheme":["sym"]}, - "weight": {"dtype": ["int8"], "algorithm": ["minmax"], "granularity": ["per_channel"], "scheme":["sym"]} - }, - "layer3.0.conv1": { - "activation": {"dtype": ["uint8"], "algorithm": ["kl"], "granularity": ["per_tensor"], "scheme":["sym"]}, - "weight": {"dtype": ["int8"], "algorithm": ["minmax"], "granularity": ["per_channel"], "scheme":["sym"]} - }, - "layer1.0.add_relu": { - "activation": {"dtype": ["fp32"]}, - "weight": {"dtype": ["fp32"]} - } - } - evaluation: - accuracy: - metric: - topk: 1 - - tuning: - accuracy_criterion: - relative: 0.01 - exit_policy: - timeout: 0 - random_seed: 9527 - workspace: - path: saved - """ - -dyn_op_name_list = {"decoder": {"activation": {"dtype": ["fp32"]}, "weight": {"dtype": ["fp32"]}}} - -ptq_op_name_list = { - "layer1.0.conv1": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, - "layer1.0.conv2": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, - "layer2.0.conv1": { - "activation": { - "dtype": ["uint8"], - "algorithm": ["minmax"], - "granularity": ["per_tensor"], - "scheme": ["sym"] - }, - "weight": { - "dtype": ["int8"], - "algorithm": ["minmax"], - "granularity": ["per_channel"], - "scheme": ["sym"] - } - }, - "layer3.0.conv1": { - "activation": { - "dtype": ["uint8"], - "algorithm": ["kl"], - "granularity": ["per_tensor"], - "scheme": ["sym"] - }, - "weight": { - "dtype": ["int8"], - "algorithm": ["minmax"], - "granularity": ["per_channel"], - "scheme": ["sym"] - } - }, - "layer1.0.add_relu": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, -} - -ptq_fx_op_name_list = { - "layer1.0.conv1": { - "activation": { - "dtype": ["fp32"] 
- }, - "weight": { - "dtype": ["fp32"] - } - }, - "layer1.0.conv2": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, - "layer2.0.conv1": { - "activation": { - "dtype": ["uint8"], - "algorithm": ["minmax"], - "granularity": ["per_tensor"], - "scheme": ["sym"] - }, - "weight": { - "dtype": ["int8"], - "algorithm": ["minmax"], - "granularity": ["per_channel"], - "scheme": ["sym"] - } - }, - "layer3.0.conv1": { - "activation": { - "dtype": ["uint8"], - "algorithm": ["kl"], - "granularity": ["per_tensor"], - "scheme": ["sym"] - }, - "weight": { - "dtype": ["int8"], - "algorithm": ["minmax"], - "granularity": ["per_channel"], - "scheme": ["sym"] - } - }, - "layer1.0.add_relu": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, - "conv.module": { - "weight": { - "dtype": ["fp32"] - }, - "activation": { - "dtype": ["fp32"] - } - }, - "default_qconfig": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - } -} - -qat_op_name_list = { - "layer1.0.conv1": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, - "layer1.0.conv2": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - }, - "layer2.0.conv1": { - "activation": { - "dtype": ["uint8"], - "algorithm": ["minmax"], - "granularity": ["per_tensor"], - "scheme": ["sym"] - }, - "weight": { - "dtype": ["int8"], - "algorithm": ["minmax"], - "granularity": ["per_channel"], - "scheme": ["sym"] - } - }, - "layer3.0.conv1": { - "activation": { - "dtype": ["uint8"], - "algorithm": ["kl"], - "granularity": ["per_tensor"], - "scheme": ["sym"] - }, - "weight": { - "dtype": ["int8"], - "algorithm": ["minmax"], - "granularity": ["per_channel"], - "scheme": ["sym"] - } - }, - "layer1.0.add_relu": { - "activation": { - "dtype": ["fp32"] - }, - "weight": { - "dtype": ["fp32"] - } - } -} - - -def build_pytorch_yaml(): - with open("ptq_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_ptq_yaml) - - with open("dynamic_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_dyn_yaml) - - with open("qat_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_qat_yaml) - - with open("auto_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_auto_yaml) - -def build_pytorch_fx_yaml(): - if PT_VERSION >= Version("1.9.0").release: - fake_fx_ptq_yaml = fake_ptq_yaml_for_fx - else: - fake_fx_ptq_yaml = fake_ptq_yaml.replace("pytorch", "pytorch_fx") - with open("fx_ptq_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_fx_ptq_yaml) - - fake_fx_dyn_yaml = fake_dyn_yaml.replace("pytorch", "pytorch_fx") - with open("fx_dynamic_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_fx_dyn_yaml) - - fake_fx_qat_yaml = fake_qat_yaml.replace("pytorch", "pytorch_fx") - with open("fx_qat_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_fx_qat_yaml) - -def build_dump_tensors_yaml(): - fake_yaml = """ - model: - name: imagenet - framework: pytorch - - evaluation: - accuracy: - metric: - topk: 1 - - tuning: - accuracy_criterion: - relative: 0.01 - exit_policy: - timeout: 0 - random_seed: 9527 - workspace: - path: saved - tensorboard: true - """ - with open("dump_yaml.yaml", "w", encoding="utf-8") as f: - f.write(fake_yaml) - - -class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.quant = QuantStub() - self.conv = nn.Conv2d(3, 1, 1) - self.linear = nn.Linear(224 * 224, 5) - self.dequant = DeQuantStub() - - def forward(self, x): - x = self.quant(x) - x = self.conv(x) - x = 
x.view(1, -1) - x = self.linear(x) - x = self.dequant(x) - return x - - -class FP32Model(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - times = x.size(1) - if times == 1: - return x + x - return x - - -class DynamicModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(1, 1, 1) - def forward(self, x): - if x is not None: - x = self.conv(x) - return x - - -class SubModel(torch.nn.Module): - def __init__(self, bypass=True): - super().__init__() - self.quant = QuantStub() - self.conv = nn.Conv2d(1, 1, 1) - self.conv1 = nn.Conv2d(1, 1, 1) - self.bn = nn.BatchNorm2d(1) - self.relu = nn.ReLU() - self.fp32 = FP32Model() - self.norm = nn.LayerNorm([1, 224, 224]) - self.dequant = DeQuantStub() - self.bypass = bypass - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.quant(x) - x = self.relu(x) - x = self.conv1(x) - x = self.dequant(x) - if not self.bypass: - x = self.fp32(x) - x = self.norm(x) - return x - - -class PartialQuantModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.quant = QuantStub() - self.conv = nn.Conv2d(3, 1, 1) - self.bn = nn.BatchNorm2d(1) - self.conv1 = nn.Conv2d(1, 1, 1) - self.bn1 = nn.BatchNorm2d(1) - self.conv2 = nn.Conv2d(1, 1, 1) - self.linear = nn.Linear(224 * 224, 1) - self.dequant = DeQuantStub() - self.sub = SubModel(bypass=False) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.conv1(x) - x = self.bn1(x) - x = self.sub(x) - x = self.quant(x) - x = self.conv2(x) - x = x.view(1, -1) - x = self.linear(x) - x = self.dequant(x) - return x - -class DynamicControlModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(3, 1, 1) - self.bn = nn.BatchNorm2d(1) - self.linear = nn.Linear(224 * 224, 1) - self.sub = SubModel() - self.fp32 = FP32Model() - self.dyn = DynamicModel() - - def forward(self, x): - x = self.conv(x) - x = self.dyn(x) - x = self.bn(x) - x = self.sub(x) - x = self.fp32(x) - x = x.view(1, -1) - x = self.linear(x) - return x - - -class LSTMModel(nn.Module): - """Container module with an encoder, a recurrent module, and a decoder.""" - - def __init__(self, ntoken=10, ninp=512, nhid=256, nlayers=5, dropout=0.5): - super(LSTMModel, self).__init__() - self.drop = nn.Dropout(dropout) - self.encoder = nn.Embedding(ntoken, ninp) - self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) - self.decoder = nn.Linear(nhid, ntoken) - self.init_weights() - self.nhid = nhid - self.nlayers = nlayers - - def init_weights(self): - initrange = 0.1 - self.encoder.weight.data.uniform_(-initrange, initrange) - self.decoder.bias.data.zero_() - self.decoder.weight.data.uniform_(-initrange, initrange) - - def forward(self, input): - input = torch.ones((3, 10), dtype=torch.int32) - h0 = torch.randn(2, 10, 256) - c0 = torch.randn(2, 10, 256) - hidden = (h0, c0) - emb = self.encoder(input) - output, hidden = self.rnn(emb, hidden) - output = self.drop(output) - decoded = self.decoder(output) - return decoded, hidden - - -def eval_func(model): - # switch to evaluate mode - model.eval() - with torch.no_grad(): - input = torch.randn(1, 3, 224, 224) - # compute output - output = model(input) - return 0.0 - - -def train_func(compression_manager, model, dataloader=None): - compression_manager.callbacks.on_train_begin(dataloader=dataloader) - optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) - # switch to evaluate mode - model.train() - input = torch.randn(1, 3, 224, 224) - # compute output - output = 
model(input) - loss = output[0].mean() if isinstance(output, tuple) else output.mean() - optimizer.zero_grad() - loss.backward() - optimizer.step() - compression_manager.callbacks.on_train_end() - return model - - -def q_func(model): - optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) - # switch to evaluate mode - model.train() - input = torch.randn(1, 3, 224, 224) - # compute output - output = model(input) - loss = output.mean() - optimizer.zero_grad() - loss.backward() - optimizer.step() - return model - - -class TestPytorchAdaptor(unittest.TestCase): - # some UT would be affected when IPEX installed. - try: - import intel_extension_for_pytorch as ipex - IPEX = True - except: - IPEX = False - framework_specific_info = {"device": "cpu", - "approach": "post_training_static_quant", - "random_seed": 1234, - "q_dataloader": None, - "workspace_path": "./"} - framework = "pytorch" - adaptor = FRAMEWORKS[framework](framework_specific_info) - model = q_resnet18() - nc_model = MODELS["pytorch"](model) - - @classmethod - def setUpClass(self): - build_pytorch_yaml() - build_dump_tensors_yaml() - - @classmethod - def tearDownClass(self): - os.remove("ptq_yaml.yaml") - os.remove("dynamic_yaml.yaml") - os.remove("qat_yaml.yaml") - os.remove("dump_yaml.yaml") - os.remove("auto_yaml.yaml") - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_get_all_weight_name(self): - assert len(list(self.nc_model.get_all_weight_names())) == 62 - - def test_get_weight(self): - for name, param in self.model.named_parameters(): - if name == "layer4.1.conv2.weight": - param.data.fill_(0.0) - if name == "fc.bias": - param.data.fill_(0.1) - assert int(torch.sum(self.nc_model.get_weight("layer4.1.conv2.weight"))) == 0 - assert torch.allclose( - torch.sum( - self.nc_model.get_weight("fc.bias")), - torch.tensor(100.)) - - def test_get_input(self): - model = MODELS["pytorch"](q_resnet18()) - model.model.eval().fuse_model() - model.register_forward_pre_hook() - rand_input = torch.rand(100, 3, 224, 224).float() - model.model(rand_input) - assert torch.equal(model.get_inputs("x"), rand_input) - model.remove_hooks() - - def test_update_weights(self): - self.nc_model.update_weights("fc.bias", torch.zeros([1000])) - assert int(torch.sum(self.nc_model.get_weight("fc.bias"))) == 0 - - def test_get_gradient(self): - with self.assertRaises(AssertionError): - self.nc_model.get_gradient("fc.bias") - - for name, tensor in self.nc_model._model.named_parameters(): - if name == "fc.bias": - tensor.grad = torch.zeros_like(tensor) - break - assert torch.equal(torch.Tensor(self.nc_model.get_gradient("fc.bias")), torch.zeros_like(tensor)) - - rand_input = torch.rand(100, 3, 224, 224).float() - rand_input.grad = torch.ones_like(rand_input) - assert torch.equal(torch.Tensor(self.nc_model.get_gradient(rand_input)), - torch.ones_like(rand_input)) - - def test_report_sparsity(self): - df, total_sparsity = self.nc_model.report_sparsity() - self.assertTrue(total_sparsity > 0) - self.assertTrue(len(df) == 22) - - def test_quantization_saved(self): - for fake_yaml in ["dynamic_yaml.yaml", "qat_yaml.yaml", "ptq_yaml.yaml"]: - model = M() - quantizer = Quantization(fake_yaml) - quantizer.conf.usr_cfg.tuning.exit_policy["performance_only"] = True - dataset = quantizer.dataset("dummy", (100, 3, 224, 224), label=True) - quantizer.model = model - quantizer.calib_dataloader = common.DataLoader(dataset) - quantizer.eval_dataloader = common.DataLoader(dataset) - q_model = quantizer.fit() - 
eval_func(q_model) - q_model.save("./saved") - # Load configure and weights by neural_compressor.utils - saved_model = load("./saved", model) - eval_func(saved_model) - # recover int8 model from history - history_file = "./saved/history.snapshot" - model_recover = recover(model, history_file, 0) - eval_func(model_recover) - self.assertEqual(type(saved_model.conv), \ - type(model_recover.conv)) - shutil.rmtree("./saved", ignore_errors=True) - from neural_compressor.experimental import Benchmark - evaluator = Benchmark("ptq_yaml.yaml") - # Load configure and weights by neural_compressor.model - evaluator.model = model - evaluator.b_dataloader = common.DataLoader(dataset) - evaluator.fit("accuracy") - - for fake_yaml in ["qat_yaml.yaml", "ptq_yaml.yaml"]: - model = copy.deepcopy(self.model) - if fake_yaml == "ptq_yaml.yaml": - model.eval().fuse_model() - conf = QuantConf(fake_yaml) - quantizer = Quantization(conf) - dataset = quantizer.dataset("dummy", (100, 3, 224, 224)) - quantizer.model = model - if fake_yaml == "qat_yaml.yaml": - quantizer.q_func = q_func - else: - quantizer.calib_dataloader = common.DataLoader(dataset) - quantizer.eval_func = eval_func - q_model = quantizer.fit() - q_model.save("./saved") - # Load configure and weights by neural_compressor.utils - saved_model = load("./saved", model) - eval_func(saved_model) - shutil.rmtree("./saved", ignore_errors=True) - - def test_quantization_new_saved(self): - for fake_yaml in ["dynamic_yaml.yaml", "qat_yaml.yaml", "ptq_yaml.yaml"]: - model = M() - quantizer = Quantization(fake_yaml) - quantizer.conf.usr_cfg.tuning.exit_policy["performance_only"] = True - dataset = quantizer.dataset("dummy", (100, 3, 224, 224), label=True) - quantizer.model = model - quantizer.calib_dataloader = common.DataLoader(dataset) - quantizer.eval_dataloader = common.DataLoader(dataset) - q_model = quantizer.fit() - eval_func(q_model) - torch.save(q_model.quantized_state_dict(), "./saved/model.pt") - # Load configure and weights by neural_compressor.utils - from neural_compressor.experimental.common import Model - common_model = Model(model) - common_model.load_quantized_state_dict(torch.load("./saved/model.pt")) - eval_func(common_model) - self.assertEqual(type(q_model._model.linear), \ - type(common_model._model.linear)) - shutil.rmtree("./saved", ignore_errors=True) - - def test_quantization_new_API(self): - for fake_yaml in ["dynamic", "qat", "static"]: - model = M() - if fake_yaml == "qat": - quant_conf = QuantizationAwareTrainingConfig(op_name_list=qat_op_name_list) - compression_manager = prepare_compression(copy.deepcopy(model), quant_conf) - q_model = train_func(compression_manager, compression_manager.model) - else: - dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) - dataloader = DATALOADERS["pytorch"](dataset) - if fake_yaml == "dynamic": - quant_conf = PostTrainingConfig(approach="post_training_dynamic_quant", - op_name_list=dyn_op_name_list, - performance_only=True) - elif fake_yaml == "static": - quant_conf = PostTrainingConfig(approach="post_training_static_quant", - op_name_list=ptq_op_name_list, - performance_only=True) - q_model = quantization.fit( - model, - quant_conf, - calib_dataloader=dataloader if fake_yaml == "static" else None, - eval_func=eval_func) - q_model.save("./saved") - # Load configure and weights by neural_compressor.utils - saved_model = load("./saved", model) - shutil.rmtree("./saved", ignore_errors=True) - - @unittest.skipIf(IPEX, "this function is affected by IPEX, Fixing now.") - def 
test_non_quant_module(self): - for fake_yaml in ["qat_yaml.yaml", "ptq_yaml.yaml"]: - model = PartialQuantModel() - conf = QuantConf(fake_yaml) - quantizer = Quantization(conf) - dataset = quantizer.dataset("dummy", (1, 3, 224, 224)) - non_quant_dict = {"non_quant_module_name": ["conv", "conv1", "sub.conv"], \ - "non_quant_module_class": ["BatchNorm2d", "FP32Model"]} - quantizer.model = common.Model(model, **non_quant_dict) - if fake_yaml == "qat_yaml.yaml": - quantizer.q_func = q_func - else: - quantizer.calib_func = eval_func - quantizer.eval_func = eval_func - q_model = quantizer.fit() - q_model.save("./saved") - saved_model = load("./saved", model, **non_quant_dict) - eval_func(saved_model) - shutil.rmtree("./saved", ignore_errors=True) - - def test_auto_quant(self): - def eval_func(model): - return 1 - - model_origin = LSTMModel( - ntoken = 10, - ninp = 512, - nhid = 256, - nlayers = 2, - ) - # run fx_quant in neural_compressor and save the quantized GraphModule - quant_conf = PostTrainingConfig(approach="post_training_auto_quant") - dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) - dataloader = common.DataLoader(dataset) - model = common.Model(model_origin) - q_model = quantization.fit(model, - quant_conf, - calib_dataloader=dataloader, - eval_func=eval_func) - self.assertNotEqual(q_model, None) - - def test_workspace_path(self): - model = M() - quant_conf = PostTrainingConfig(approach="post_training_static_quant", - op_name_list=ptq_op_name_list, - performance_only=True) - dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) - dataloader = common.DataLoader(dataset) - q_model = quantization.fit(model, - quant_conf, - calib_dataloader=dataloader, - eval_func=eval_func) - eval_func(q_model) - os.makedirs("./saved", exist_ok=True) - torch.save(q_model.quantized_state_dict(), "./saved/best_model.pt") - # Load configure and weights by workspace_path - from neural_compressor.experimental.common import Model - common_model = Model(model) - common_model.workspace_path = "./saved" - eval_func(common_model) - self.assertEqual(type(q_model._model.linear), - type(common_model._model.linear)) - shutil.rmtree("./saved", ignore_errors=True) - - def test_get_graph_info(self): - from neural_compressor.model.torch_model import PyTorchModel - model = PyTorchModel(self.model) - op_map = model.graph_info - self.assertTrue(op_map["conv1"] == "Conv2d") - - def test_tensorboard(self): - model = copy.deepcopy(self.nc_model) - model.model.eval().fuse_model() - quant_conf = PostTrainingConfig(approach="post_training_static_quant", - backend="pytorch", - performance_only=True) - options = Options(tensorboard=True) - dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) - dataloader = common.DataLoader(dataset) - quantization.fit( - model.model, quant_conf, calib_dataloader=dataloader, - eval_func=eval_func, options=options - ) - self.assertTrue(True if os.path.exists("runs/eval/baseline_acc0.0") else False) - quantization.fit(model.model, - quant_conf, - calib_dataloader=dataloader, - eval_dataloader=dataloader, - eval_func=None) - self.assertTrue(True if os.path.exists("runs/eval/baseline_acc0.0") else False) - - def test_tensor_dump_and_set(self): - model = copy.deepcopy(self.nc_model) - model.model.eval().fuse_model() - quantizer = Quantization("ptq_yaml.yaml") - dataset = quantizer.dataset("dummy", (100, 3, 224, 224), label=True) - dataloader = common.DataLoader(dataset) - dataloader = common._generate_common_dataloader(dataloader, "pytorch") - quantizer.eval_dataloader = dataloader - 
quantizer.calib_dataloader = dataloader - quantizer.model = model.model - q_model = quantizer.fit() - quantizer.strategy.adaptor.inspect_tensor( - model, dataloader, op_list=["conv1.0", "layer1.0.conv1.0"], - iteration_list=[1, 2], inspect_type="all", save_to_disk=True) - load_array = lambda *a, **k: np.load(*a, allow_pickle=True, **k) - a = load_array("saved/dump_tensor/activation_iter1.npz") - w = load_array("saved/dump_tensor/weight.npz") - if PT_VERSION >= Version("1.8.0").release: - self.assertTrue(w["conv1.0"].item()["conv1.0.weight"].shape[0] == - a["conv1.0"].item()["conv1.0.output0"].shape[1]) - else: - self.assertTrue(w["conv1.0"].item()["conv1.0.weight"].shape[0] == - a["conv1.0"].item()["conv1.1.output0"].shape[1]) - data = np.random.random(w["conv1.0"].item()["conv1.0.weight"].shape).astype(np.float32) - quantizer.strategy.adaptor.set_tensor(q_model, {"conv1.0.weight": data}) - changed_tensor = q_model.get_weight("conv1.weight") - scales = changed_tensor.q_per_channel_scales() - changed_tensor_fp32 = torch.dequantize(changed_tensor) - self.assertTrue(np.allclose(data, changed_tensor_fp32.numpy(), atol=2 / np.min(scales.numpy()))) - quantizer.strategy.adaptor.inspect_tensor( - q_model, dataloader, op_list=["conv1.0", "layer1.0.conv1.0"], - iteration_list=[1, 2], inspect_type="all", save_to_disk=False) - - def test_get_graph_info(self): - from neural_compressor.adaptor.pytorch import get_ops_recursively - model = copy.deepcopy(self.model) - op_map = {} - get_ops_recursively(model, "", op_map) - self.assertTrue(op_map["conv1"] == "Conv2d") - - def test_forward_wrapper(self): - vision_model = resnet18() - class dummymodel(torch.nn.Module): - def __init__(self, model): - super(dummymodel, self).__init__() - self._model = model - def forward(self,input=None): - return self._model(input) - - data = [[{"input": torch.rand(3,224,224)}, torch.ones(1,1)], ] - # dataloader.batch_size=100 - dataloader = common.DataLoader(data, batch_size=1) - quant_conf = QuantConf("dynamic_yaml.yaml") - model = dummymodel(vision_model) - q_model = quantization.fit(model, - quant_conf, - calib_dataloader=dataloader, - eval_func=eval_func) - - def test_floatfunctions_fallback(self): - class ModelWithFunctionals(torch.nn.Module): - def __init__(self): - super(ModelWithFunctionals, self).__init__() - self.mycat = nnq.FloatFunctional() - self.myadd = nnq.FloatFunctional() - self.myadd_relu = nnq.FloatFunctional() - # Tracing doesnt work yet for c10 ops with scalar inputs - # https://github.com/pytorch/pytorch/issues/27097 - self.my_scalar_add = nnq.FloatFunctional() - self.mymul = nnq.FloatFunctional() - self.my_scalar_mul = nnq.FloatFunctional() - self.quant = QuantStub() - self.dequant = DeQuantStub() - - def forward(self, x): - x = self.quant(x) - y = self.mycat.cat([x, x, x]) - z = self.myadd.add(y, y) - w = self.myadd_relu.add_relu(z, z) - # Tracing doesnt work yet for c10 ops with scalar inputs - # https://github.com/pytorch/pytorch/issues/27097 - w = self.my_scalar_add.add_scalar(w, -0.5) - w = self.mymul.mul(w, w) - w = self.my_scalar_mul.mul_scalar(w, 0.5) - w = self.dequant(w) - return w - - model = ModelWithFunctionals() - model = MODELS["pytorch"](model) - x = torch.rand(10, 1, dtype=torch.float) - y = model.model(x) - fallback_ops = [] - q_capability = self.adaptor.query_fw_capability(model) - for k, v in q_capability["opwise"].items(): - if k[0] != "quant" and k[0] != "dequant": - fallback_ops.append(k[0]) - model.model.qconfig = torch.quantization.default_qconfig - model.model.quant.qconfig = 
torch.quantization.default_qconfig - if PT_VERSION >= Version("1.8.0").release: - model.model.dequant.qconfig = torch.quantization.default_qconfig - nc_torch._fallback_quantizable_ops_recursively( - model.model, "", fallback_ops, op_qcfgs={}) - torch.quantization.add_observer_(model.model) - model.model(x) - torch.quantization.convert(model.model, self.adaptor.q_mapping, inplace=True) - qy = model.model(x) - tol = {"atol": 1e-01, "rtol": 1e-03} - self.assertTrue(np.allclose(y, qy, **tol)) - - -@unittest.skipIf(not FX_MODE, "Unsupport Fx Mode with PyTorch Version Below 1.8") -class TestPytorchFXAdaptor(unittest.TestCase): - framework_specific_info = {"device": "cpu", - "approach": "post_training_static_quant", - "random_seed": 1234, - "q_dataloader": None, - "workspace_path": "./"} - framework = "pytorch_fx" - adaptor = FRAMEWORKS[framework](framework_specific_info) - @classmethod - def setUpClass(self): - build_pytorch_fx_yaml() - - @classmethod - def tearDownClass(self): - os.remove("fx_ptq_yaml.yaml") - os.remove("fx_dynamic_yaml.yaml") - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_fx_quant(self): - for fake_yaml in ["qat", "static"]: - model_origin = resnet18() - model = common.Model(model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - } - ) - dataset = DATASETS("pytorch")["dummy"]((10, 3, 224, 224), label=True) - dataloader = DATALOADERS["pytorch"](dataset) - if fake_yaml == "qat": - conf = QuantizationAwareTrainingConfig( - op_name_list=qat_op_name_list, backend="pytorch_fx" - ) - compression_manager = prepare_compression(copy.deepcopy(model), conf) - q_model = train_func(compression_manager, compression_manager.model, dataloader) - else: - conf = PostTrainingConfig( - op_name_list=ptq_fx_op_name_list, backend="pytorch_fx", performance_only=True - ) - options = Options(workspace="./saved") - q_model = quantization.fit(model, - conf, - calib_dataloader=dataloader, - eval_func=eval_func, - calib_func=eval_func, - options=options) - q_model.save("./saved") - # Load configure and weights with neural_compressor.utils - model_fx = load("./saved", model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []}, \ - "dataloader": torch.utils.data.DataLoader(dataset) - }) - self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) - - if fake_yaml != "qat": - # recover int8 model with only tune_cfg - history_file = "./saved/history.snapshot" - model_fx_recover = recover(model_origin, history_file, 0, - **{"prepare_custom_config_dict": - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": - {"preserved_attributes": []}, - "dataloader": dataloader - }) - self.assertEqual(model_fx.code, model_fx_recover.code) - shutil.rmtree("./saved", ignore_errors=True) - for fake_yaml in ["fx_qat_yaml.yaml", "fx_ptq_yaml.yaml"]: - model_origin = M() - # run fx_quant in neural_compressor and save the quantized GraphModule - dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224), label=True) - dataloader = DATALOADERS["pytorch"](dataset) - model = common.Model(model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - }) - if fake_yaml == "fx_qat_yaml.yaml": - conf = QuantizationAwareTrainingConfig( - 
op_name_list=qat_op_name_list, backend="pytorch_fx" - ) - compression_manager = prepare_compression(copy.deepcopy(model), conf) - q_model = train_func(compression_manager, compression_manager.model, dataloader) - compression_manager.save("./saved") - else: - conf = PostTrainingConfig( - op_name_list=ptq_fx_op_name_list, backend="pytorch_fx", performance_only=True - ) - q_model = quantization.fit(model, - conf, - calib_dataloader=dataloader, - eval_dataloader=dataloader) - q_model.save("./saved") - # Load configure and weights with neural_compressor.utils - model_fx = load("./saved", model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []}, \ - "dataloader": torch.utils.data.DataLoader(dataset) - }) - self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) - shutil.rmtree("./saved", ignore_errors=True) - - @unittest.skipIf(PT_VERSION < Version("1.9.0").release, - "Please use PyTroch 1.9 or higher version for dynamic quantization with pytorch_fx backend") - def test_fx_dynamic_quant(self): - origin_model = LSTMModel( - ntoken = 10, - ninp = 512, - nhid = 256, - nlayers = 5, - ) - # run fx_quant in neural_compressor and save the quantized GraphModule - origin_model.eval() - quant_conf = QuantConf("fx_dynamic_yaml.yaml") - model = common.Model(origin_model, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - }) - q_model = quantization.fit(model, - quant_conf - ) - q_model.save("./saved") - - # Load configure and weights by neural_compressor.utils - model_fx = load("./saved", origin_model, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - }) - self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) - - # Test the functionality of older model saving type - state_dict = torch.load("./saved/best_model.pt") - tune_cfg = state_dict.pop("best_configure") - import yaml - with open("./saved/best_configure.yaml", "w") as f: - yaml.dump(tune_cfg, f, default_flow_style=False) - torch.save(state_dict, "./saved/best_model_weights.pt") - os.remove("./saved/best_model.pt") - model_fx = load("./saved", origin_model, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - }) - self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) - - # recover int8 model with only tune_cfg - history_file = "./saved/history.snapshot" - model_fx_recover = recover(origin_model, history_file, 0, - **{"prepare_custom_config_dict": - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": - {"preserved_attributes": []} - }) - self.assertEqual(model_fx.code, model_fx_recover.code) - shutil.rmtree("./saved", ignore_errors=True) - - def test_default_dynamic_quant(self): - def eval_func(model): - return 1 - - # Model Definition - for fake_yaml in ["fx_qat_yaml.yaml", "fx_ptq_yaml.yaml"]: - model_origin = LSTMModel( - ntoken = 10, - ninp = 512, - nhid = 256, - nlayers = 2, - ) - dataset = DATASETS("pytorch")["dummy"]((3, 10)) - dataloader = DATALOADERS["pytorch"](dataset) - # run fx_quant in neural_compressor and save the quantized GraphModule - if fake_yaml == "fx_qat_yaml.yaml": - conf = QuantizationAwareTrainingConfig( - op_name_list=qat_op_name_list, backend="pytorch_fx" - ) - compression_manager = 
prepare_compression(copy.deepcopy(model_origin), conf) - q_model = train_func(compression_manager, compression_manager.model, dataloader=dataloader) - self.assertTrue("quantize" in str(type(q_model.model.encoder))) - self.assertTrue("quantize" in str(type(q_model.model.rnn))) - else: - conf = PostTrainingConfig(backend="pytorch_fx", performance_only=True) - q_model = quantization.fit(model_origin, - conf, - calib_dataloader=dataloader, - eval_func=eval_func) - self.assertTrue("quantize" in str(type(q_model.model.encoder))) - self.assertTrue("quantize" in str(type(q_model.model.rnn))) - - def test_fx_sub_module_quant(self): - for fake_yaml in ["fx_qat_yaml.yaml", "fx_ptq_yaml.yaml", "fx_dynamic_yaml.yaml"]: - model_origin = DynamicControlModel() - model = common.Model(model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - }) - dataset = DATASETS("pytorch")["dummy"]((1, 3, 224, 224)) - dataloader = DATALOADERS["pytorch"](dataset) - # run fx_quant in neural_compressor and save the quantized GraphModule - if fake_yaml == "fx_qat_yaml.yaml": - conf = QuantizationAwareTrainingConfig( - op_name_list=qat_op_name_list, backend="pytorch_fx" - ) - compression_manager = prepare_compression(copy.deepcopy(model), conf) - q_model = train_func(compression_manager, compression_manager.model, dataloader) - else: - options = Options(workspace="./saved") - conf = PostTrainingConfig(backend="pytorch_fx", performance_only=True) - q_model = quantization.fit(model, - conf, - calib_dataloader=dataloader, - eval_func=eval_func, - options=options) - q_model.save("./saved") - # Load configure and weights with neural_compressor.utils - model_fx = load("./saved/best_model.pt", model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []}, \ - "dataloader": torch.utils.data.DataLoader(dataset) - }) - self.assertTrue(isinstance(model_fx.sub, torch.fx.graph_module.GraphModule)) - - if fake_yaml != "fx_qat_yaml.yaml": - # recover int8 model with only tune_cfg - history_file = "./saved/history.snapshot" - model_fx_recover = recover(model_origin, history_file, 0, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []}, \ - "dataloader": torch.utils.data.DataLoader(dataset) - }) - self.assertEqual(model_fx.sub.code, model_fx_recover.sub.code) - shutil.rmtree("./saved", ignore_errors=True) - - def test_deepcopy_failure(self): - def eval_func(model): - return 1 - - # To build an object t2, which will fail on deepcopy. 
- class T1(): - def __init__(self, t1) -> None: - self.t1 = t1 - self.j = 1 - - # required for usage with set in T1 - def __hash__(self): - return hash(self.j) - - t1 = set() - t2 = T1([t1]) - t1.add(t2) - - for fake_yaml in ['fx_ptq_yaml.yaml']: - model_origin = M() - model_origin.tmp = t2 - # run fx_quant in neural_compressor and save the quantized GraphModule - quantizer = Quantization(fake_yaml) - dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) - quantizer.eval_func = eval_func - quantizer.calib_dataloader = common.DataLoader(dataset) - quantizer.model = common.Model(model_origin) - q_model = quantizer.fit() - self.assertTrue(isinstance(q_model.model, torch.fx.graph_module.GraphModule)) - - @unittest.skipIf(PT_VERSION < Version("1.11.0").release, - "Please use PyTroch 1.11 or higher version for mixed precision with pytorch_fx or pytorch backend") - def test_bf16_capability(self): - model_origin = DynamicControlModel() - os.environ["FORCE_BF16"] = "1" - q_capability = self.adaptor._get_quantizable_ops(model_origin) - del os.environ["FORCE_BF16"] - - self.assertEqual( - [elem["weight"]["dtype"] for elem in q_capability["optypewise"]["Conv2d"]], - [["int8"], "fp32"]) - self.assertEqual( - [elem["activation"]["dtype"] for elem in q_capability["optypewise"]["Conv2d"]], - [["uint8"], "fp32"]) - self.assertEqual( - [elem["weight"]["dtype"] for elem in q_capability["opwise"][("conv", "Conv2d")]], - [["int8"], "fp32"]) - self.assertEqual( - [elem["activation"]["dtype"] for elem in q_capability["opwise"][("conv", "Conv2d")]], - [["uint8"], "fp32"]) - self.assertEqual( - [elem["weight"]["dtype"] for elem in q_capability["opwise"][("linear", "Linear")]], - [["int8"], "fp32", "bf16"]) - self.assertEqual( - [elem["activation"]["dtype"] for elem in q_capability["opwise"][("linear", "Linear")]], - [["uint8"], "fp32", "bf16"]) - - @unittest.skipIf(PT_VERSION < Version("1.11.0").release, - "Please use PyTroch 1.11 or higher version for mixed precision with pytorch_fx or pytorch backend") - def test_mix_precision(self): - model_origin = DynamicControlModel() - # run fx_quant in neural_compressor and save the quantized GraphModule - dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) - dataloader = DATALOADERS["pytorch"](dataset) - model = common.Model(model_origin, - **{"prepare_custom_config_dict": \ - {"non_traceable_module_name": ["a"]}, - "convert_custom_config_dict": \ - {"preserved_attributes": []} - }) - options = Options(workspace="./saved") - conf = PostTrainingConfig(op_name_list=ptq_fx_op_name_list, backend="pytorch_fx", performance_only=True) - q_model = quantization.fit(model_origin, - conf, - calib_dataloader=dataloader, - eval_func=eval_func, - calib_func = eval_func, - options=options) - tune_cfg = q_model.q_config - tune_cfg["op"][("conv.module", "Conv2d")].clear() - tune_cfg["op"][("conv.module", "Conv2d")] = \ - {"weight": {"dtype": "bf16"}, "activation": {"dtype": "bf16"}} - tune_cfg["bf16_ops_list"].append(("conv.module", "Conv2d")) - from neural_compressor.adaptor.torch_utils.bf16_convert import Convert - q_model._model = Convert(q_model._model, tune_cfg) - - self.assertEqual(q_model._model.conv.module.module.weight.dtype, torch.bfloat16) - self.assertEqual(q_model._model.conv.module.module.bias.dtype, torch.bfloat16) - - def test_symbolic_trace(self): - from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace - model_origin = DynamicControlModel() - traced_model = symbolic_trace(model_origin, is_qat=False) - if PT_VERSION >= 
Version("1.11.0").release: - self.assertTrue(isinstance(traced_model.sub, torch.nn.Module)) - self.assertTrue(isinstance(traced_model.conv, torch.fx.graph_module.GraphModule)) - else: - self.assertTrue(isinstance(traced_model.sub, torch.fx.graph_module.GraphModule)) - traced_model_qat = symbolic_trace(model_origin, is_qat=True) - self.assertTrue(isinstance(traced_model_qat.sub, torch.fx.graph_module.GraphModule)) - -if __name__ == "__main__": - unittest.main() diff --git a/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_1.x.py b/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_1.x.py new file mode 100644 index 00000000000..effd890bdd7 --- /dev/null +++ b/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_1.x.py @@ -0,0 +1,1118 @@ +import copy +import neural_compressor.adaptor.pytorch as nc_torch +import numpy as np +import os +import shutil +import torch +import torch.nn as nn +import torch.nn.quantized as nnq +import unittest +from neural_compressor.adaptor import FRAMEWORKS +from neural_compressor.model import MODELS +from neural_compressor.experimental import Quantization, common +from neural_compressor.conf.config import QuantConf +from neural_compressor.utils.pytorch import load +from neural_compressor.utils.utility import recover +from neural_compressor.utils.utility import LazyImport +from torch.quantization import QuantStub, DeQuantStub +from packaging.version import Version +try: + import intel_extension_for_pytorch as ipex + IPEX = True +except: + IPEX = False + +# improve lazy import UT coverage +resnet18 = LazyImport("torchvision.models.resnet18") +q_resnet18 = LazyImport("torchvision.models.quantization.resnet18") + +PT_VERSION = nc_torch.get_torch_version().release +if PT_VERSION >= Version("1.8.0").release: + FX_MODE = True +else: + FX_MODE = False + + +fake_dyn_yaml = ''' + model: + name: imagenet + framework: pytorch + + quantization: + approach: post_training_dynamic_quant + op_wise: { + 'decoder': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + } + } + evaluation: + accuracy: + metric: + topk: 1 + performance: + warmup: 5 + iteration: 10 + + tuning: + accuracy_criterion: + relative: 0.01 + exit_policy: + timeout: 0 + random_seed: 9527 + workspace: + path: saved + ''' + + +fake_ptq_yaml = ''' + model: + name: imagenet + framework: pytorch + + quantization: + op_wise: { + + 'layer1.0.conv1': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'layer1.0.conv2': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'layer2.0.conv1': { + 'activation': {'dtype': ['uint8'], 'algorithm': ['minmax'], 'granularity': ['per_tensor'], 'scheme':['sym']}, + 'weight': {'dtype': ['int8'], 'algorithm': ['minmax'], 'granularity': ['per_channel'], 'scheme':['sym']} + }, + 'layer3.0.conv1': { + 'activation': {'dtype': ['uint8'], 'algorithm': ['kl'], 'granularity': ['per_tensor'], 'scheme':['sym']}, + 'weight': {'dtype': ['int8'], 'algorithm': ['minmax'], 'granularity': ['per_channel'], 'scheme':['sym']} + }, + 'layer1.0.add_relu': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + } + evaluation: + accuracy: + metric: + topk: 1 + performance: + warmup: 1 + iteration: 10 + + tuning: + accuracy_criterion: + relative: 0.01 + exit_policy: + timeout: 0 + random_seed: 9527 + workspace: + path: saved + ''' + +fake_auto_yaml = ''' + model: + name: imagenet + framework: pytorch_fx + + quantization: + approach: post_training_auto_quant + evaluation: + accuracy: + metric: + topk: 1 + performance: + 
warmup: 1 + iteration: 10 + + tuning: + accuracy_criterion: + relative: 0.01 + exit_policy: + timeout: 1000 + max_trials: 3 + random_seed: 9527 + workspace: + path: saved + ''' + + +fake_ptq_yaml_for_fx = ''' + model: + name: imagenet + framework: pytorch_fx + + quantization: + approach: post_training_auto_quant + op_wise: { + 'layer1.0.conv1': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'layer1.0.conv2': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'layer2.0.conv1': { + 'activation': {'dtype': ['uint8'], 'algorithm': ['minmax'], 'granularity': ['per_tensor'], 'scheme':['sym']}, + 'weight': {'dtype': ['int8'], 'algorithm': ['minmax'], 'granularity': ['per_channel'], 'scheme':['sym']} + }, + 'layer3.0.conv1': { + 'activation': {'dtype': ['uint8'], 'algorithm': ['kl'], 'granularity': ['per_tensor'], 'scheme':['sym']}, + 'weight': {'dtype': ['int8'], 'algorithm': ['minmax'], 'granularity': ['per_channel'], 'scheme':['sym']} + }, + 'layer1.0.add_relu': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'conv.module': { + 'weight': {'dtype': ['fp32']}, + 'activation': {'dtype': ['fp32']} + }, + 'default_qconfig': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + } + } + evaluation: + accuracy: + metric: + topk: 1 + performance: + warmup: 5 + iteration: 10 + + tuning: + accuracy_criterion: + relative: 0.01 + exit_policy: + timeout: 0 + random_seed: 9527 + workspace: + path: saved + ''' + + +fake_qat_yaml = ''' + model: + name: imagenet + framework: pytorch + + quantization: + approach: quant_aware_training + train: + end_epoch: 1 + iteration: 1 + optimizer: + SGD: + learning_rate: 0.0001 + criterion: + CrossEntropyLoss: + reduction: mean + op_wise: { + 'layer1.0.conv1': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'layer1.0.conv2': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'layer2.0.conv1': { + 'activation': {'dtype': ['uint8'], 'algorithm': ['minmax'], 'granularity': ['per_tensor'], 'scheme':['sym']}, + 'weight': {'dtype': ['int8'], 'algorithm': ['minmax'], 'granularity': ['per_channel'], 'scheme':['sym']} + }, + 'layer3.0.conv1': { + 'activation': {'dtype': ['uint8'], 'algorithm': ['kl'], 'granularity': ['per_tensor'], 'scheme':['sym']}, + 'weight': {'dtype': ['int8'], 'algorithm': ['minmax'], 'granularity': ['per_channel'], 'scheme':['sym']} + }, + 'layer1.0.add_relu': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + } + } + evaluation: + accuracy: + metric: + topk: 1 + + tuning: + accuracy_criterion: + relative: 0.01 + exit_policy: + timeout: 0 + random_seed: 9527 + workspace: + path: saved + ''' + + +def build_pytorch_yaml(): + with open('ptq_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_ptq_yaml) + + with open('dynamic_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_dyn_yaml) + + with open('qat_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_qat_yaml) + + with open('auto_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_auto_yaml) + +def build_pytorch_fx_yaml(): + if PT_VERSION >= Version("1.9.0").release: + fake_fx_ptq_yaml = fake_ptq_yaml_for_fx + else: + fake_fx_ptq_yaml = fake_ptq_yaml.replace('pytorch', 'pytorch_fx') + with open('fx_ptq_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_fx_ptq_yaml) + + fake_fx_dyn_yaml = fake_dyn_yaml.replace('pytorch', 'pytorch_fx') + with open('fx_dynamic_yaml.yaml', 'w', encoding="utf-8") as f: + 
f.write(fake_fx_dyn_yaml) + + fake_fx_qat_yaml = fake_qat_yaml.replace('pytorch', 'pytorch_fx') + with open('fx_qat_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_fx_qat_yaml) + +def build_dump_tensors_yaml(): + fake_yaml = ''' + model: + name: imagenet + framework: pytorch + + evaluation: + accuracy: + metric: + topk: 1 + + tuning: + accuracy_criterion: + relative: 0.01 + exit_policy: + timeout: 0 + random_seed: 9527 + workspace: + path: saved + tensorboard: true + ''' + with open('dump_yaml.yaml', 'w', encoding="utf-8") as f: + f.write(fake_yaml) + + +class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv = nn.Conv2d(3, 1, 1) + self.linear = nn.Linear(224 * 224, 5) + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = x.view(1, -1) + x = self.linear(x) + x = self.dequant(x) + return x + + +class FP32Model(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + times = x.size(1) + if times == 1: + return x + x + return x + + +class DynamicModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(1, 1, 1) + def forward(self, x): + if x is not None: + x = self.conv(x) + return x + + +class SubModel(torch.nn.Module): + def __init__(self, bypass=True): + super().__init__() + self.quant = QuantStub() + self.conv = nn.Conv2d(1, 1, 1) + self.conv1 = nn.Conv2d(1, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.relu = nn.ReLU() + self.fp32 = FP32Model() + self.norm = nn.LayerNorm([1, 224, 224]) + self.dequant = DeQuantStub() + self.bypass = bypass + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.quant(x) + x = self.relu(x) + x = self.conv1(x) + x = self.dequant(x) + if not self.bypass: + x = self.fp32(x) + x = self.norm(x) + return x + + +class PartialQuantModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv = nn.Conv2d(3, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.conv1 = nn.Conv2d(1, 1, 1) + self.bn1 = nn.BatchNorm2d(1) + self.conv2 = nn.Conv2d(1, 1, 1) + self.linear = nn.Linear(224 * 224, 1) + self.dequant = DeQuantStub() + self.sub = SubModel(bypass=False) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.conv1(x) + x = self.bn1(x) + x = self.sub(x) + x = self.quant(x) + x = self.conv2(x) + x = x.view(1, -1) + x = self.linear(x) + x = self.dequant(x) + return x + +class DynamicControlModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(3, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.linear = nn.Linear(224 * 224, 1) + self.sub = SubModel() + self.fp32 = FP32Model() + self.dyn = DynamicModel() + + def forward(self, x): + x = self.conv(x) + x = self.dyn(x) + x = self.bn(x) + x = self.sub(x) + x = self.fp32(x) + x = x.view(1, -1) + x = self.linear(x) + return x + + +class LSTMModel(nn.Module): + '''Container module with an encoder, a recurrent module, and a decoder.''' + + def __init__(self, ntoken=10, ninp=512, nhid=256, nlayers=5, dropout=0.5): + super(LSTMModel, self).__init__() + self.drop = nn.Dropout(dropout) + self.encoder = nn.Embedding(ntoken, ninp) + self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) + self.decoder = nn.Linear(nhid, ntoken) + self.init_weights() + self.nhid = nhid + self.nlayers = nlayers + + def init_weights(self): + initrange = 0.1 + self.encoder.weight.data.uniform_(-initrange, initrange) + self.decoder.bias.data.zero_() + 
self.decoder.weight.data.uniform_(-initrange, initrange) + + def forward(self, input): + input = torch.ones((3, 10), dtype=torch.int32) + h0 = torch.randn(2, 10, 256) + c0 = torch.randn(2, 10, 256) + hidden = (h0, c0) + emb = self.encoder(input) + output, hidden = self.rnn(emb, hidden) + output = self.drop(output) + decoded = self.decoder(output) + return decoded, hidden + + +def eval_func(model): + # switch to evaluate mode + model.eval() + with torch.no_grad(): + input = torch.randn(1, 3, 224, 224) + # compute output + output = model(input) + return 0.0 + + +def q_func(model): + optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) + # switch to evaluate mode + model.train() + input = torch.randn(1, 3, 224, 224) + # compute output + output = model(input) + loss = output.mean() + optimizer.zero_grad() + loss.backward() + optimizer.step() + return model + + +class TestPytorchAdaptor(unittest.TestCase): + framework_specific_info = {"device": "cpu", + "approach": "post_training_static_quant", + "random_seed": 1234, + "q_dataloader": None, + "workspace_path": "./"} + framework = "pytorch" + adaptor = FRAMEWORKS[framework](framework_specific_info) + model = q_resnet18() + nc_model = MODELS['pytorch'](model) + + @classmethod + def setUpClass(self): + build_pytorch_yaml() + build_dump_tensors_yaml() + + @classmethod + def tearDownClass(self): + os.remove('ptq_yaml.yaml') + os.remove('dynamic_yaml.yaml') + os.remove('qat_yaml.yaml') + os.remove('dump_yaml.yaml') + os.remove('auto_yaml.yaml') + shutil.rmtree('./saved', ignore_errors=True) + shutil.rmtree('runs', ignore_errors=True) + + def test_get_all_weight_name(self): + assert len(list(self.nc_model.get_all_weight_names())) == 62 + + def test_get_weight(self): + for name, param in self.model.named_parameters(): + if name == "layer4.1.conv2.weight": + param.data.fill_(0.0) + if name == "fc.bias": + param.data.fill_(0.1) + assert int(torch.sum(self.nc_model.get_weight("layer4.1.conv2.weight"))) == 0 + assert torch.allclose( + torch.sum( + self.nc_model.get_weight("fc.bias")), + torch.tensor(100.)) + + def test_get_input(self): + model = MODELS['pytorch'](q_resnet18()) + model.model.eval().fuse_model() + model.register_forward_pre_hook() + rand_input = torch.rand(100, 3, 224, 224).float() + model.model(rand_input) + assert torch.equal(model.get_inputs('x'), rand_input) + model.remove_hooks() + + def test_update_weights(self): + self.nc_model.update_weights('fc.bias', torch.zeros([1000])) + assert int(torch.sum(self.nc_model.get_weight("fc.bias"))) == 0 + + def test_get_gradient(self): + with self.assertRaises(AssertionError): + self.nc_model.get_gradient('fc.bias') + + for name, tensor in self.nc_model._model.named_parameters(): + if name == 'fc.bias': + tensor.grad = torch.zeros_like(tensor) + break + assert torch.equal(torch.Tensor(self.nc_model.get_gradient('fc.bias')), torch.zeros_like(tensor)) + + rand_input = torch.rand(100, 3, 224, 224).float() + rand_input.grad = torch.ones_like(rand_input) + assert torch.equal(torch.Tensor(self.nc_model.get_gradient(rand_input)), + torch.ones_like(rand_input)) + + def test_report_sparsity(self): + df, total_sparsity = self.nc_model.report_sparsity() + self.assertTrue(total_sparsity > 0) + self.assertTrue(len(df) == 22) + + def test_quantization_saved(self): + for fake_yaml in ['dynamic_yaml.yaml', 'qat_yaml.yaml', 'ptq_yaml.yaml']: + model = M() + quantizer = Quantization(fake_yaml) + quantizer.conf.usr_cfg.tuning.exit_policy['performance_only'] = True + dataset = quantizer.dataset('dummy', (100, 3, 
224, 224), label=True) + quantizer.model = model + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + q_model = quantizer.fit() + eval_func(q_model) + q_model.save('./saved') + # Load configure and weights by neural_compressor.utils + saved_model = load("./saved", model) + eval_func(saved_model) + # recover int8 model from history + history_file = './saved/history.snapshot' + model_recover = recover(model, history_file, 0) + eval_func(model_recover) + self.assertEqual(type(saved_model.conv), \ + type(model_recover.conv)) + shutil.rmtree('./saved', ignore_errors=True) + from neural_compressor.experimental import Benchmark + evaluator = Benchmark('ptq_yaml.yaml') + # Load configure and weights by neural_compressor.model + evaluator.model = model + evaluator.b_dataloader = common.DataLoader(dataset) + evaluator.fit('accuracy') + + for fake_yaml in ['qat_yaml.yaml', 'ptq_yaml.yaml']: + model = copy.deepcopy(self.model) + if fake_yaml == 'ptq_yaml.yaml': + model.eval().fuse_model() + conf = QuantConf(fake_yaml) + quantizer = Quantization(conf) + dataset = quantizer.dataset('dummy', (100, 3, 224, 224)) + quantizer.model = model + if fake_yaml == 'qat_yaml.yaml': + quantizer.q_func = q_func + else: + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_func = eval_func + q_model = quantizer.fit() + q_model.save('./saved') + # Load configure and weights by neural_compressor.utils + saved_model = load("./saved", model) + eval_func(saved_model) + shutil.rmtree('./saved', ignore_errors=True) + + def test_quantization_new_saved(self): + for fake_yaml in ['dynamic_yaml.yaml', 'qat_yaml.yaml', 'ptq_yaml.yaml']: + model = M() + quantizer = Quantization(fake_yaml) + quantizer.conf.usr_cfg.tuning.exit_policy['performance_only'] = True + dataset = quantizer.dataset('dummy', (100, 3, 224, 224), label=True) + quantizer.model = model + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + q_model = quantizer.fit() + eval_func(q_model) + torch.save(q_model.quantized_state_dict(), './saved/model.pt') + # Load configure and weights by neural_compressor.utils + from neural_compressor.experimental.common import Model + common_model = Model(model) + common_model.load_quantized_state_dict(torch.load('./saved/model.pt')) + eval_func(common_model) + self.assertEqual(type(q_model._model.linear), \ + type(common_model._model.linear)) + shutil.rmtree('./saved', ignore_errors=True) + + @unittest.skipIf(IPEX, "this function is affected by IPEX, Fixing now.") + def test_non_quant_module(self): + for fake_yaml in ['qat_yaml.yaml', 'ptq_yaml.yaml']: + model = PartialQuantModel() + conf = QuantConf(fake_yaml) + quantizer = Quantization(conf) + dataset = quantizer.dataset('dummy', (1, 3, 224, 224)) + non_quant_dict = {'non_quant_module_name': ['conv', 'conv1', 'sub.conv'], \ + 'non_quant_module_class': ['BatchNorm2d', 'FP32Model']} + quantizer.model = common.Model(model, **non_quant_dict) + if fake_yaml == 'qat_yaml.yaml': + quantizer.q_func = q_func + else: + quantizer.calib_func = eval_func + quantizer.eval_func = eval_func + q_model = quantizer.fit() + q_model.save('./saved') + saved_model = load("./saved", model, **non_quant_dict) + eval_func(saved_model) + shutil.rmtree('./saved', ignore_errors=True) + + def test_auto_quant(self): + def eval_func(model): + return 1 + + model_origin = LSTMModel( + ntoken = 10, + ninp = 512, + nhid = 256, + nlayers = 2, + ) + # run fx_quant in 
neural_compressor and save the quantized GraphModule + quantizer = Quantization('auto_yaml.yaml') + dataset = quantizer.dataset('dummy', (3, 10), label=True) + quantizer.eval_func = eval_func + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.model = common.Model(model_origin) + q_model = quantizer.fit() + self.assertNotEqual(q_model, None) + + def test_workspace_path(self): + model = M() + quantizer = Quantization('ptq_yaml.yaml') + quantizer.conf.usr_cfg.tuning.exit_policy['performance_only'] = True + dataset = quantizer.dataset('dummy', (100, 3, 224, 224), label=True) + quantizer.model = model + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + q_model = quantizer.fit() + eval_func(q_model) + torch.save(q_model.quantized_state_dict(), './saved/best_model.pt') + # Load configure and weights by workspace_path + from neural_compressor.experimental.common import Model + common_model = Model(model) + common_model.workspace_path = './saved' + eval_func(common_model) + self.assertEqual(type(q_model._model.linear), \ + type(common_model._model.linear)) + shutil.rmtree('./saved', ignore_errors=True) + + def test_get_graph_info(self): + from neural_compressor.model.torch_model import PyTorchModel + model = PyTorchModel(self.model) + op_map = model.graph_info + self.assertTrue(op_map['conv1'] == 'Conv2d') + + def test_tensorboard(self): + model = copy.deepcopy(self.nc_model) + model.model.eval().fuse_model() + quantizer = Quantization('dump_yaml.yaml') + dataset = quantizer.dataset('dummy', (100, 3, 224, 224), label=True) + quantizer.model = model.model + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_func = eval_func + quantizer.fit() + self.assertTrue(True if os.path.exists('runs/eval/baseline_acc0.0') else False) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.eval_func = None + quantizer.fit() + self.assertTrue(True if os.path.exists('runs/eval/baseline_acc0.0') else False) + + def test_tensor_dump_and_set(self): + model = copy.deepcopy(self.nc_model) + model.model.eval().fuse_model() + quantizer = Quantization('ptq_yaml.yaml') + dataset = quantizer.dataset('dummy', (100, 3, 224, 224), label=True) + dataloader = common.DataLoader(dataset) + dataloader = common._generate_common_dataloader(dataloader, 'pytorch') + quantizer.eval_dataloader = dataloader + quantizer.calib_dataloader = dataloader + quantizer.model = model.model + q_model = quantizer.fit() + quantizer.strategy.adaptor.inspect_tensor( + model, dataloader, op_list=['conv1.0', 'layer1.0.conv1.0'], + iteration_list=[1, 2], inspect_type='all', save_to_disk=True) + load_array = lambda *a, **k: np.load(*a, allow_pickle=True, **k) + a = load_array('saved/dump_tensor/activation_iter1.npz') + w = load_array('saved/dump_tensor/weight.npz') + if PT_VERSION >= Version("1.8.0").release: + self.assertTrue(w['conv1.0'].item()['conv1.0.weight'].shape[0] == + a['conv1.0'].item()['conv1.0.output0'].shape[1]) + else: + self.assertTrue(w['conv1.0'].item()['conv1.0.weight'].shape[0] == + a['conv1.0'].item()['conv1.1.output0'].shape[1]) + data = np.random.random(w['conv1.0'].item()['conv1.0.weight'].shape).astype(np.float32) + quantizer.strategy.adaptor.set_tensor(q_model, {'conv1.0.weight': data}) + changed_tensor = q_model.get_weight('conv1.weight') + scales = changed_tensor.q_per_channel_scales() + changed_tensor_fp32 = torch.dequantize(changed_tensor) + self.assertTrue(np.allclose(data, changed_tensor_fp32.numpy(), atol=2 
/ np.min(scales.numpy()))) + quantizer.strategy.adaptor.inspect_tensor( + q_model, dataloader, op_list=['conv1.0', 'layer1.0.conv1.0'], + iteration_list=[1, 2], inspect_type='all', save_to_disk=False) + + def test_get_graph_info(self): + from neural_compressor.adaptor.pytorch import get_ops_recursively + model = copy.deepcopy(self.model) + op_map = {} + get_ops_recursively(model, '', op_map) + self.assertTrue(op_map['conv1'] == 'Conv2d') + + def test_forward_wrapper(self): + vision_model = resnet18() + class dummymodel(torch.nn.Module): + def __init__(self, model): + super(dummymodel, self).__init__() + self._model = model + def forward(self,input=None): + return self._model(input) + + data = [[{'input': torch.rand(3,224,224)}, torch.ones(1,1)], ] + # dataloader.batch_size=100 + dataloader = common.DataLoader(data, batch_size=1) + quantizer = Quantization('dynamic_yaml.yaml') + model = dummymodel(vision_model) + quantizer.model = model + quantizer.calib_dataloader = dataloader + quantizer.eval_dataloader = dataloader + quantizer.fit() + + def test_floatfunctions_fallback(self): + class ModelWithFunctionals(torch.nn.Module): + def __init__(self): + super(ModelWithFunctionals, self).__init__() + self.mycat = nnq.FloatFunctional() + self.myadd = nnq.FloatFunctional() + self.myadd_relu = nnq.FloatFunctional() + # Tracing doesnt work yet for c10 ops with scalar inputs + # https://github.com/pytorch/pytorch/issues/27097 + self.my_scalar_add = nnq.FloatFunctional() + self.mymul = nnq.FloatFunctional() + self.my_scalar_mul = nnq.FloatFunctional() + self.quant = QuantStub() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + y = self.mycat.cat([x, x, x]) + z = self.myadd.add(y, y) + w = self.myadd_relu.add_relu(z, z) + # Tracing doesnt work yet for c10 ops with scalar inputs + # https://github.com/pytorch/pytorch/issues/27097 + w = self.my_scalar_add.add_scalar(w, -0.5) + w = self.mymul.mul(w, w) + w = self.my_scalar_mul.mul_scalar(w, 0.5) + w = self.dequant(w) + return w + + model = ModelWithFunctionals() + model = MODELS['pytorch'](model) + x = torch.rand(10, 1, dtype=torch.float) + y = model.model(x) + fallback_ops = [] + q_capability = self.adaptor.query_fw_capability(model) + for k, v in q_capability["opwise"].items(): + if k[0] != "quant" and k[0] != "dequant": + fallback_ops.append(k[0]) + model.model.qconfig = torch.quantization.default_qconfig + model.model.quant.qconfig = torch.quantization.default_qconfig + if PT_VERSION >= Version("1.8.0").release: + model.model.dequant.qconfig = torch.quantization.default_qconfig + nc_torch._fallback_quantizable_ops_recursively( + model.model, '', fallback_ops, op_qcfgs={}) + torch.quantization.add_observer_(model.model) + model.model(x) + torch.quantization.convert(model.model, self.adaptor.q_mapping, inplace=True) + qy = model.model(x) + tol = {'atol': 1e-01, 'rtol': 1e-03} + self.assertTrue(np.allclose(y, qy, **tol)) + +@unittest.skipIf(not FX_MODE, "Unsupport Fx Mode with PyTorch Version Below 1.8") +class TestPytorchFXAdaptor(unittest.TestCase): + framework_specific_info = {"device": "cpu", + "approach": "post_training_static_quant", + "random_seed": 1234, + "q_dataloader": None, + "workspace_path": "./"} + framework = "pytorch_fx" + adaptor = FRAMEWORKS[framework](framework_specific_info) + @classmethod + def setUpClass(self): + build_pytorch_fx_yaml() + + @classmethod + def tearDownClass(self): + os.remove('fx_ptq_yaml.yaml') + os.remove('fx_dynamic_yaml.yaml') + shutil.rmtree('./saved', ignore_errors=True) + 
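+        # Note (assumption, descriptive comment only): "./saved" is the tuning
+        # workspace path configured in the fake YAMLs above, and "runs" holds
+        # TensorBoard event files; both may be produced as side effects of the
+        # tests in this module, so they are removed defensively here.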
shutil.rmtree('runs', ignore_errors=True) + + def test_fx_quant(self): + for fake_yaml in ['fx_qat_yaml.yaml', 'fx_ptq_yaml.yaml']: + model_origin = resnet18() + # run fx_quant in neural_compressor and save the quantized GraphModule + quantizer = Quantization(fake_yaml) + dataset = quantizer.dataset('dummy', (10, 3, 224, 224), label=True) + quantizer.eval_func = eval_func + if fake_yaml == 'fx_qat_yaml.yaml': + quantizer.q_func = q_func + else: + quantizer.calib_func = eval_func + dataloader = common.DataLoader(dataset) + quantizer.calib_dataloader = dataloader + quantizer.model = common.Model(model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + q_model = quantizer.fit() + q_model.save('./saved') + # Load configure and weights with neural_compressor.utils + model_fx = load('./saved', model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []}, \ + 'dataloader': quantizer.calib_dataloader + }) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + + # recover int8 model with only tune_cfg + history_file = './saved/history.snapshot' + model_fx_recover = recover(model_origin, history_file, 0, + **{'prepare_custom_config_dict': + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': + {'preserved_attributes': []}, + 'dataloader': quantizer.calib_dataloader + }) + self.assertEqual(model_fx.code, model_fx_recover.code) + shutil.rmtree('./saved', ignore_errors=True) + + for fake_yaml in ['fx_qat_yaml.yaml', 'fx_ptq_yaml.yaml']: + model_origin = M() + # run fx_quant in neural_compressor and save the quantized GraphModule + quantizer = Quantization(fake_yaml) + quantizer.conf.usr_cfg.tuning.exit_policy['performance_only'] = True + dataset = quantizer.dataset('dummy', (10, 3, 224, 224), label=True) + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.model = common.Model(model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + q_model = quantizer.fit() + q_model.save('./saved') + # Load configure and weights with neural_compressor.utils + model_fx = load('./saved', model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []}, \ + 'dataloader': quantizer.calib_dataloader + }) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + shutil.rmtree('./saved', ignore_errors=True) + + @unittest.skipIf(PT_VERSION < Version("1.9.0").release, + "Please use PyTroch 1.9 or higher version for dynamic quantization with pytorch_fx backend") + def test_fx_dynamic_quant(self): + model = LSTMModel( + ntoken = 10, + ninp = 512, + nhid = 256, + nlayers = 5, + ) + # run fx_quant in neural_compressor and save the quantized GraphModule + model.eval() + quantizer = Quantization('fx_dynamic_yaml.yaml') + quantizer.model = common.Model(model, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + q_model = quantizer.fit() + q_model.save('./saved') + + # Load configure and weights by neural_compressor.utils + model_fx = load("./saved", model, + **{'prepare_custom_config_dict': \ + 
{'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + + # Test the functionality of older model saving type + state_dict = torch.load("./saved/best_model.pt") + tune_cfg = state_dict.pop('best_configure') + import yaml + with open("./saved/best_configure.yaml", 'w') as f: + yaml.dump(tune_cfg, f, default_flow_style=False) + torch.save(state_dict, "./saved/best_model_weights.pt") + os.remove('./saved/best_model.pt') + model_fx = load("./saved", model, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + + # recover int8 model with only tune_cfg + history_file = './saved/history.snapshot' + model_fx_recover = recover(model, history_file, 0, + **{'prepare_custom_config_dict': + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': + {'preserved_attributes': []} + }) + self.assertEqual(model_fx.code, model_fx_recover.code) + shutil.rmtree('./saved', ignore_errors=True) + + def test_default_dynamic_quant(self): + def eval_func(model): + return 1 + + def q_func(model): + return model + + # Model Definition + for fake_yaml in ['fx_qat_yaml.yaml', 'fx_ptq_yaml.yaml']: + model_origin = LSTMModel( + ntoken = 10, + ninp = 512, + nhid = 256, + nlayers = 2, + ) + # run fx_quant in neural_compressor and save the quantized GraphModule + quantizer = Quantization(fake_yaml) + dataset = quantizer.dataset('dummy', (3, 10), label=True) + quantizer.eval_func = eval_func + if fake_yaml == 'fx_qat_yaml.yaml': + quantizer.q_func = q_func + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.model = common.Model(model_origin) + q_model = quantizer.fit() + self.assertTrue('quantize' in str(type(q_model.model.encoder))) + self.assertTrue('quantize' in str(type(q_model.model.rnn))) + + def test_fx_sub_module_quant(self): + for fake_yaml in ['fx_qat_yaml.yaml', 'fx_ptq_yaml.yaml', 'fx_dynamic_yaml.yaml']: + model_origin = DynamicControlModel() + # run fx_quant in neural_compressor and save the quantized GraphModule + quantizer = Quantization(fake_yaml) + dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + quantizer.eval_func = eval_func + if fake_yaml == 'fx_qat_yaml.yaml': + quantizer.q_func = q_func + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.model = common.Model(model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + q_model = quantizer.fit() + q_model.save('./saved') + # Load configure and weights with neural_compressor.utils + model_fx = load('./saved/best_model.pt', model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []}, \ + 'dataloader': quantizer.calib_dataloader + }) + self.assertTrue(isinstance(model_fx.sub, torch.fx.graph_module.GraphModule)) + + # recover int8 model with only tune_cfg + history_file = './saved/history.snapshot' + model_fx_recover = recover(model_origin, history_file, 0, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []}, \ + 'dataloader': quantizer.calib_dataloader + }) + self.assertEqual(model_fx.sub.code, 
model_fx_recover.sub.code) + shutil.rmtree('./saved', ignore_errors=True) + + def test_deepcopy_failure(self): + def eval_func(model): + return 1 + + # To build an object t2, which will fail on deepcopy. + class T1(): + def __init__(self, t1) -> None: + self.t1 = t1 + self.j = 1 + + # required for usage with set in T1 + def __hash__(self): + return hash(self.j) + + t1 = set() + t2 = T1([t1]) + t1.add(t2) + + for fake_yaml in ['fx_ptq_yaml.yaml']: + model_origin = M() + model_origin.tmp = t2 + # run fx_quant in neural_compressor and save the quantized GraphModule + quantizer = Quantization(fake_yaml) + dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + quantizer.eval_func = eval_func + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.model = common.Model(model_origin) + q_model = quantizer.fit() + self.assertTrue(isinstance(q_model.model, torch.fx.graph_module.GraphModule)) + + @unittest.skipIf(PT_VERSION < Version("1.11.0").release, + "Please use PyTroch 1.11 or higher version for mixed precision with pytorch_fx or pytorch backend") + def test_bf16_capability(self): + model_origin = DynamicControlModel() + os.environ['FORCE_BF16'] = '1' + q_capability = self.adaptor._get_quantizable_ops(model_origin) + del os.environ['FORCE_BF16'] + + self.assertEqual( + [elem['weight']['dtype'] for elem in q_capability['optypewise']['Conv2d']], + [['int8'], 'fp32']) + self.assertEqual( + [elem['activation']['dtype'] for elem in q_capability['optypewise']['Conv2d']], + [['uint8'], 'fp32']) + self.assertEqual( + [elem['weight']['dtype'] for elem in q_capability['opwise'][('conv', 'Conv2d')]], + [['int8'], 'fp32']) + self.assertEqual( + [elem['activation']['dtype'] for elem in q_capability['opwise'][('conv', 'Conv2d')]], + [['uint8'], 'fp32']) + self.assertEqual( + [elem['weight']['dtype'] for elem in q_capability['opwise'][('linear', 'Linear')]], + [['int8'], 'fp32', 'bf16']) + self.assertEqual( + [elem['activation']['dtype'] for elem in q_capability['opwise'][('linear', 'Linear')]], + [['uint8'], 'fp32', 'bf16']) + + @unittest.skipIf(PT_VERSION < Version("1.11.0").release, + "Please use PyTroch 1.11 or higher version for mixed precision with pytorch_fx or pytorch backend") + def test_mix_precision(self): + fake_yaml = 'fx_ptq_yaml.yaml' + model_origin = DynamicControlModel() + # run fx_quant in neural_compressor and save the quantized GraphModule + quantizer = Quantization(fake_yaml) + dataset = quantizer.dataset('dummy', (1, 3, 224, 224), label=True) + quantizer.eval_func = eval_func + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.model = common.Model(model_origin, + **{'prepare_custom_config_dict': \ + {'non_traceable_module_name': ['a']}, + 'convert_custom_config_dict': \ + {'preserved_attributes': []} + }) + q_model = quantizer.fit() + tune_cfg = q_model.q_config + tune_cfg['op'][('conv.module', 'Conv2d')].clear() + tune_cfg['op'][('conv.module', 'Conv2d')] = \ + {'weight': {'dtype': 'bf16'}, 'activation': {'dtype': 'bf16'}} + tune_cfg["bf16_ops_list"].append(('conv.module', 'Conv2d')) + from neural_compressor.adaptor.torch_utils.bf16_convert import Convert + q_model._model = Convert(q_model._model, tune_cfg) + + self.assertEqual(q_model._model.conv.module.module.weight.dtype, torch.bfloat16) + self.assertEqual(q_model._model.conv.module.module.bias.dtype, torch.bfloat16) + + def test_symbolic_trace(self): + from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace + model_origin = DynamicControlModel() + traced_model = 
symbolic_trace(model_origin, is_qat=False) + if PT_VERSION >= Version("1.11.0").release: + self.assertTrue(isinstance(traced_model.sub, torch.nn.Module)) + self.assertTrue(isinstance(traced_model.conv, torch.fx.graph_module.GraphModule)) + else: + self.assertTrue(isinstance(traced_model.sub, torch.fx.graph_module.GraphModule)) + traced_model_qat = symbolic_trace(model_origin, is_qat=True) + self.assertTrue(isinstance(traced_model_qat.sub, torch.fx.graph_module.GraphModule)) + +if __name__ == "__main__": + unittest.main() diff --git a/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_2.x.py b/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_2.x.py new file mode 100644 index 00000000000..3bea3e28673 --- /dev/null +++ b/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_2.x.py @@ -0,0 +1,682 @@ +import copy +import neural_compressor.adaptor.pytorch as nc_torch +import numpy as np +import os +import shutil +import torch +import torch.nn as nn +import torch.nn.quantized as nnq +import unittest +import os +from neural_compressor import PostTrainingQuantConfig, QuantizationAwareTrainingConfig +from neural_compressor.config import set_tensorboard, set_workspace +from neural_compressor.data import DATASETS, DATALOADERS +from neural_compressor.adaptor import FRAMEWORKS +from neural_compressor.model import MODELS +from neural_compressor.experimental import Quantization, common +from neural_compressor.experimental.data.datasets.dataset import DATASETS +from neural_compressor import quantization +from neural_compressor.training import prepare_compression +from neural_compressor.utils.pytorch import load +from neural_compressor.utils.utility import recover +from neural_compressor.utils.utility import LazyImport +from torch.quantization import QuantStub, DeQuantStub +from packaging.version import Version + + +# improve lazy import UT coverage +resnet18 = LazyImport("torchvision.models.resnet18") +q_resnet18 = LazyImport("torchvision.models.quantization.resnet18") + +PT_VERSION = nc_torch.get_torch_version().release +if PT_VERSION >= Version("1.8.0").release: + FX_MODE = True +else: + FX_MODE = False + + +dyn_op_name_list = {"decoder": {"activation": {"dtype": ["fp32"]}, "weight": {"dtype": ["fp32"]}}} + +ptq_op_name_list = { + "layer1.0.conv1": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "layer1.0.conv2": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "layer2.0.conv1": { + "activation": { + "dtype": ["uint8"], + "algorithm": ["minmax"], + "granularity": ["per_tensor"], + "scheme": ["sym"] + }, + "weight": { + "dtype": ["int8"], + "algorithm": ["minmax"], + "granularity": ["per_channel"], + "scheme": ["sym"] + } + }, + "layer3.0.conv1": { + "activation": { + "dtype": ["uint8"], + "algorithm": ["kl"], + "granularity": ["per_tensor"], + "scheme": ["sym"] + }, + "weight": { + "dtype": ["int8"], + "algorithm": ["minmax"], + "granularity": ["per_channel"], + "scheme": ["sym"] + } + }, + "layer1.0.add_relu": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, +} + +ptq_fx_op_name_list = { + "layer1.0.conv1": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "layer1.0.conv2": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "layer2.0.conv1": { + "activation": { + "dtype": ["uint8"], + "algorithm": ["minmax"], + "granularity": ["per_tensor"], + "scheme": ["sym"] + }, + "weight": { + "dtype": ["int8"], + 
"algorithm": ["minmax"], + "granularity": ["per_channel"], + "scheme": ["sym"] + } + }, + "layer3.0.conv1": { + "activation": { + "dtype": ["uint8"], + "algorithm": ["kl"], + "granularity": ["per_tensor"], + "scheme": ["sym"] + }, + "weight": { + "dtype": ["int8"], + "algorithm": ["minmax"], + "granularity": ["per_channel"], + "scheme": ["sym"] + } + }, + "layer1.0.add_relu": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "conv.module": { + "weight": { + "dtype": ["fp32"] + }, + "activation": { + "dtype": ["fp32"] + } + }, + "default_qconfig": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + } +} + +qat_op_name_list = { + "layer1.0.conv1": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "layer1.0.conv2": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + }, + "layer2.0.conv1": { + "activation": { + "dtype": ["uint8"], + "algorithm": ["minmax"], + "granularity": ["per_tensor"], + "scheme": ["sym"] + }, + "weight": { + "dtype": ["int8"], + "algorithm": ["minmax"], + "granularity": ["per_channel"], + "scheme": ["sym"] + } + }, + "layer3.0.conv1": { + "activation": { + "dtype": ["uint8"], + "algorithm": ["kl"], + "granularity": ["per_tensor"], + "scheme": ["sym"] + }, + "weight": { + "dtype": ["int8"], + "algorithm": ["minmax"], + "granularity": ["per_channel"], + "scheme": ["sym"] + } + }, + "layer1.0.add_relu": { + "activation": { + "dtype": ["fp32"] + }, + "weight": { + "dtype": ["fp32"] + } + } +} + + + + +class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv = nn.Conv2d(3, 1, 1) + self.linear = nn.Linear(224 * 224, 5) + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = x.view(1, -1) + x = self.linear(x) + x = self.dequant(x) + return x + + +class FP32Model(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + times = x.size(1) + if times == 1: + return x + x + return x + + +class DynamicModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(1, 1, 1) + def forward(self, x): + if x is not None: + x = self.conv(x) + return x + + +class SubModel(torch.nn.Module): + def __init__(self, bypass=True): + super().__init__() + self.quant = QuantStub() + self.conv = nn.Conv2d(1, 1, 1) + self.conv1 = nn.Conv2d(1, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.relu = nn.ReLU() + self.fp32 = FP32Model() + self.norm = nn.LayerNorm([1, 224, 224]) + self.dequant = DeQuantStub() + self.bypass = bypass + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.quant(x) + x = self.relu(x) + x = self.conv1(x) + x = self.dequant(x) + if not self.bypass: + x = self.fp32(x) + x = self.norm(x) + return x + + +class PartialQuantModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv = nn.Conv2d(3, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.conv1 = nn.Conv2d(1, 1, 1) + self.bn1 = nn.BatchNorm2d(1) + self.conv2 = nn.Conv2d(1, 1, 1) + self.linear = nn.Linear(224 * 224, 1) + self.dequant = DeQuantStub() + self.sub = SubModel(bypass=False) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.conv1(x) + x = self.bn1(x) + x = self.sub(x) + x = self.quant(x) + x = self.conv2(x) + x = x.view(1, -1) + x = self.linear(x) + x = self.dequant(x) + return x + +class DynamicControlModel(torch.nn.Module): + def __init__(self): + 
super().__init__() + self.conv = nn.Conv2d(3, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.linear = nn.Linear(224 * 224, 1) + self.sub = SubModel() + self.fp32 = FP32Model() + self.dyn = DynamicModel() + + def forward(self, x): + x = self.conv(x) + x = self.dyn(x) + x = self.bn(x) + x = self.sub(x) + x = self.fp32(x) + x = x.view(1, -1) + x = self.linear(x) + return x + + +class LSTMModel(nn.Module): + """Container module with an encoder, a recurrent module, and a decoder.""" + + def __init__(self, ntoken=10, ninp=512, nhid=256, nlayers=5, dropout=0.5): + super(LSTMModel, self).__init__() + self.drop = nn.Dropout(dropout) + self.encoder = nn.Embedding(ntoken, ninp) + self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) + self.decoder = nn.Linear(nhid, ntoken) + self.init_weights() + self.nhid = nhid + self.nlayers = nlayers + + def init_weights(self): + initrange = 0.1 + self.encoder.weight.data.uniform_(-initrange, initrange) + self.decoder.bias.data.zero_() + self.decoder.weight.data.uniform_(-initrange, initrange) + + def forward(self, input): + input = torch.ones((3, 10), dtype=torch.int32) + h0 = torch.randn(2, 10, 256) + c0 = torch.randn(2, 10, 256) + hidden = (h0, c0) + emb = self.encoder(input) + output, hidden = self.rnn(emb, hidden) + output = self.drop(output) + decoded = self.decoder(output) + return decoded, hidden + + +def eval_func(model): + # switch to evaluate mode + model.eval() + with torch.no_grad(): + input = torch.randn(1, 3, 224, 224) + # compute output + output = model(input) + return 0.0 + + +def train_func(compression_manager, model, dataloader=None): + compression_manager.callbacks.on_train_begin(dataloader=dataloader) + optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) + # switch to evaluate mode + model.train() + input = torch.randn(1, 3, 224, 224) + # compute output + output = model(input) + loss = output[0].mean() if isinstance(output, tuple) else output.mean() + optimizer.zero_grad() + loss.backward() + optimizer.step() + compression_manager.callbacks.on_train_end() + return model + + +def q_func(model): + optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) + # switch to evaluate mode + model.train() + input = torch.randn(1, 3, 224, 224) + # compute output + output = model(input) + loss = output.mean() + optimizer.zero_grad() + loss.backward() + optimizer.step() + return model + + +class TestPytorchAdaptor(unittest.TestCase): + model = q_resnet18() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + def test_quantization_new_API(self): + for fake_yaml in ["dynamic", "qat", "static"]: + model = M() + if fake_yaml == "qat": + quant_conf = QuantizationAwareTrainingConfig(op_name_list=qat_op_name_list) + compression_manager = prepare_compression(copy.deepcopy(model), quant_conf) + q_model = train_func(compression_manager, compression_manager.model) + else: + dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) + dataloader = DATALOADERS["pytorch"](dataset) + if fake_yaml == "dynamic": + quant_conf = PostTrainingQuantConfig(approach="dynamic", + op_name_list=dyn_op_name_list) + elif fake_yaml == "static": + quant_conf = PostTrainingQuantConfig(approach="static", + op_name_list=ptq_op_name_list) + q_model = quantization.fit( + model, + quant_conf, + calib_dataloader=dataloader if fake_yaml == "static" else None) + q_model.save("./saved") + # Load configure and weights by neural_compressor.utils + saved_model = load("./saved", model) + 
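+            # load() rebuilds the quantized module from the "./saved" workspace
+            # (tuning configure plus weights) together with the original fp32 model,
+            # mirroring the "Load configure and weights" comments used elsewhere in
+            # this file. A quick sanity check of the reloaded model could reuse the
+            # dummy evaluator defined above, e.g. (hypothetical, mirroring the 1.x test):
+            #   eval_func(saved_model)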
shutil.rmtree("./saved", ignore_errors=True) + + def test_auto_quant(self): + def eval_func(model): + return 1 + + model_origin = LSTMModel( + ntoken = 10, + ninp = 512, + nhid = 256, + nlayers = 2, + ) + # run fx_quant in neural_compressor and save the quantized GraphModule + quant_conf = PostTrainingQuantConfig(approach="auto") + set_workspace("./saved") + dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) + dataloader = common.DataLoader(dataset) + q_model = quantization.fit(model_origin, + quant_conf, + calib_dataloader=dataloader, + eval_func=eval_func) + q_model.save("./saved") + model = common.Model(model_origin) + model.workspace_path = "./saved" + self.assertNotEqual(q_model, None) + self.assertEqual(type(q_model._model.decoder), + type(model._model.decoder)) + shutil.rmtree("./saved", ignore_errors=True) + + def test_tensorboard(self): + model = copy.deepcopy(self.model) + model.eval().fuse_model() + quant_conf = PostTrainingQuantConfig(approach="static", + backend="pytorch") + set_tensorboard(True) + dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) + dataloader = common.DataLoader(dataset) + quantization.fit( + model, quant_conf, calib_dataloader=dataloader, eval_func=eval_func + ) + self.assertTrue(True if os.path.exists("runs/eval/baseline_acc0.0") else False) + quantization.fit(model, + quant_conf, + calib_dataloader=dataloader, + eval_dataloader=dataloader) + self.assertTrue(True if os.path.exists("runs/eval/baseline_acc0.0") else False) + set_tensorboard(False) + + +@unittest.skipIf(not FX_MODE, "Unsupport Fx Mode with PyTorch Version Below 1.8") +class TestPytorchFXAdaptor(unittest.TestCase): + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + def test_fx_quant(self): + for fake_yaml in ["qat", "static"]: + model_origin = resnet18() + dataset = DATASETS("pytorch")["dummy"]((10, 3, 224, 224), label=True) + dataloader = DATALOADERS["pytorch"](dataset) + if fake_yaml == "qat": + conf = QuantizationAwareTrainingConfig( + op_name_list=qat_op_name_list, backend="pytorch_fx" + ) + compression_manager = prepare_compression(copy.deepcopy(model_origin), conf) + q_model = train_func(compression_manager, compression_manager.model, dataloader) + else: + conf = PostTrainingQuantConfig( + op_name_list=ptq_fx_op_name_list, backend="pytorch_fx" + ) + set_workspace("./saved") + q_model = quantization.fit(model_origin, + conf, + calib_dataloader=dataloader, + calib_func=eval_func) + q_model.save("./saved") + # Load configure and weights with neural_compressor.utils + model_fx = load("./saved", model_origin, + **{"dataloader": torch.utils.data.DataLoader(dataset)}) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + + if fake_yaml != "qat": + # recover int8 model with only tune_cfg + history_file = "./saved/history.snapshot" + model_fx_recover = recover(model_origin, history_file, 0, + **{"dataloader": dataloader}) + self.assertEqual(model_fx.code, model_fx_recover.code) + shutil.rmtree("./saved", ignore_errors=True) + for fake_yaml in ["qat", "static"]: + model_origin = M() + # run fx_quant in neural_compressor and save the quantized GraphModule + dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224), label=True) + dataloader = DATALOADERS["pytorch"](dataset) + if fake_yaml == "qat": + conf = QuantizationAwareTrainingConfig( + op_name_list=qat_op_name_list, backend="pytorch_fx" + ) + compression_manager = prepare_compression(copy.deepcopy(model_origin), conf) + 
q_model = train_func(compression_manager, compression_manager.model, dataloader) + compression_manager.save("./saved") + else: + conf = PostTrainingQuantConfig( + op_name_list=ptq_fx_op_name_list, backend="pytorch_fx" + ) + q_model = quantization.fit(model_origin, + conf, + calib_dataloader=dataloader) + q_model.save("./saved") + # Load configure and weights with neural_compressor.utils + model_fx = load("./saved", model_origin, + **{"dataloader": torch.utils.data.DataLoader(dataset)}) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + shutil.rmtree("./saved", ignore_errors=True) + + @unittest.skipIf(PT_VERSION < Version("1.9.0").release, + "Please use PyTroch 1.9 or higher version for dynamic quantization with pytorch_fx backend") + def test_fx_dynamic_quant(self): + origin_model = LSTMModel( + ntoken = 10, + ninp = 512, + nhid = 256, + nlayers = 5, + ) + # run fx_quant in neural_compressor and save the quantized GraphModule + origin_model.eval() + conf = PostTrainingQuantConfig(approach="dynamic", + op_name_list=ptq_fx_op_name_list, backend="pytorch_fx" + ) + set_workspace("./saved") + q_model = quantization.fit(origin_model, conf) + q_model.save("./saved") + + # Load configure and weights by neural_compressor.utils + model_fx = load("./saved", origin_model) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + + # Test the functionality of older model saving type + state_dict = torch.load("./saved/best_model.pt") + tune_cfg = state_dict.pop("best_configure") + import yaml + with open("./saved/best_configure.yaml", "w") as f: + yaml.dump(tune_cfg, f, default_flow_style=False) + torch.save(state_dict, "./saved/best_model_weights.pt") + os.remove("./saved/best_model.pt") + model_fx = load("./saved", origin_model) + self.assertTrue(isinstance(model_fx, torch.fx.graph_module.GraphModule)) + + # recover int8 model with only tune_cfg + history_file = "./saved/history.snapshot" + model_fx_recover = recover(origin_model, history_file, 0) + self.assertEqual(model_fx.code, model_fx_recover.code) + shutil.rmtree("./saved", ignore_errors=True) + + def test_default_dynamic_quant(self): + def eval_func(model): + return 1 + + # Model Definition + for fake_yaml in ["qat", "auto"]: + model_origin = LSTMModel( + ntoken = 10, + ninp = 512, + nhid = 256, + nlayers = 2, + ) + dataset = DATASETS("pytorch")["dummy"]((3, 10)) + dataloader = DATALOADERS["pytorch"](dataset) + # run fx_quant in neural_compressor and save the quantized GraphModule + if fake_yaml == "qat": + conf = QuantizationAwareTrainingConfig( + op_name_list=qat_op_name_list, backend="pytorch_fx" + ) + compression_manager = prepare_compression(copy.deepcopy(model_origin), conf) + q_model = train_func(compression_manager, compression_manager.model, dataloader=dataloader) + self.assertTrue("quantize" in str(type(q_model.model.encoder))) + self.assertTrue("quantize" in str(type(q_model.model.rnn))) + else: + conf = PostTrainingQuantConfig(backend="pytorch_fx") + q_model = quantization.fit(model_origin, + conf, + calib_dataloader=dataloader) + self.assertTrue("quantize" in str(type(q_model.model.encoder))) + self.assertTrue("quantize" in str(type(q_model.model.rnn))) + + def test_fx_sub_module_quant(self): + for fake_yaml in ["qat", "static"]: + model_origin = DynamicControlModel() + dataset = DATASETS("pytorch")["dummy"]((1, 3, 224, 224)) + dataloader = DATALOADERS["pytorch"](dataset) + # run fx_quant in neural_compressor and save the quantized GraphModule + if fake_yaml == "qat": + conf = 
QuantizationAwareTrainingConfig( + op_name_list=qat_op_name_list, backend="pytorch_fx" + ) + compression_manager = prepare_compression(copy.deepcopy(model_origin), conf) + q_model = train_func(compression_manager, compression_manager.model, dataloader) + else: + set_workspace("./saved") + conf = PostTrainingQuantConfig(backend="pytorch_fx") + q_model = quantization.fit(model_origin, + conf, + calib_dataloader=dataloader) + q_model.save("./saved") + # Load configure and weights with neural_compressor.utils + model_fx = load("./saved/best_model.pt", model_origin, + **{"dataloader": torch.utils.data.DataLoader(dataset) + }) + self.assertTrue(isinstance(model_fx.sub, torch.fx.graph_module.GraphModule)) + + if fake_yaml != "qat": + # recover int8 model with only tune_cfg + history_file = "./saved/history.snapshot" + model_fx_recover = recover(model_origin, history_file, 0, + **{"dataloader": torch.utils.data.DataLoader(dataset) + }) + self.assertEqual(model_fx.sub.code, model_fx_recover.sub.code) + shutil.rmtree("./saved", ignore_errors=True) + + @unittest.skipIf(PT_VERSION < Version("1.11.0").release, + "Please use PyTroch 1.11 or higher version for mixed precision with pytorch_fx or pytorch backend") + def test_mix_precision(self): + model_origin = DynamicControlModel() + # run fx_quant in neural_compressor and save the quantized GraphModule + dataset = DATASETS("pytorch")["dummy"]((100, 3, 224, 224)) + dataloader = DATALOADERS["pytorch"](dataset) + set_workspace=("./saved") + conf = PostTrainingQuantConfig(op_name_list=ptq_fx_op_name_list, backend="pytorch_fx") + q_model = quantization.fit(model_origin, + conf, + calib_dataloader=dataloader, + calib_func = eval_func) + tune_cfg = q_model.q_config + tune_cfg["op"][("conv.module", "Conv2d")].clear() + tune_cfg["op"][("conv.module", "Conv2d")] = \ + {"weight": {"dtype": "bf16"}, "activation": {"dtype": "bf16"}} + tune_cfg["bf16_ops_list"].append(("conv.module", "Conv2d")) + from neural_compressor.adaptor.torch_utils.bf16_convert import Convert + q_model._model = Convert(q_model._model, tune_cfg) + + self.assertEqual(q_model._model.conv.module.module.weight.dtype, torch.bfloat16) + self.assertEqual(q_model._model.conv.module.module.bias.dtype, torch.bfloat16) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/adaptor/pytorch_adaptor/test_torch2onnx.py b/test/adaptor/pytorch_adaptor/test_torch2onnx.py index 977e621be84..8977b1a1dd4 100644 --- a/test/adaptor/pytorch_adaptor/test_torch2onnx.py +++ b/test/adaptor/pytorch_adaptor/test_torch2onnx.py @@ -8,9 +8,8 @@ import unittest import neural_compressor.adaptor.pytorch as nc_torch from neural_compressor import quantization -from neural_compressor.conf.pythonic_config import PostTrainingConfig, QuantizationAwareTrainingConfig +from neural_compressor.config import PostTrainingQuantConfig from neural_compressor.experimental.data.datasets.dataset import DATASETS -from neural_compressor.training import prepare_compression from packaging.version import Version from torch.quantization import QuantStub, DeQuantStub @@ -209,11 +208,9 @@ def test_fx_quant(self): for fake_yaml in ['dynamic', 'static']: model = DynamicControlModel() # run fx_quant in neural_compressor and save the quantized GraphModule - conf = PostTrainingConfig( - approach="post_training_dynamic_quant" \ - if fake_yaml == "dynamic" else "post_training_static_quant", - backend="pytorch_fx", - performance_only=True + conf = PostTrainingQuantConfig( + approach=fake_yaml, + backend="pytorch_fx" ) dataset = 
DATASETS("pytorch")['dummy']((100, 3, 224, 224)) dataloader = torch.utils.data.DataLoader(dataset) diff --git a/test/benchmark/test_benchmark.py b/test/benchmark/test_benchmark.py index 7815bb6cbfe..37aef1ca500 100644 --- a/test/benchmark/test_benchmark.py +++ b/test/benchmark/test_benchmark.py @@ -4,11 +4,13 @@ import os import yaml import numpy as np -import tensorflow as tf import tempfile import re +import platform from neural_compressor.adaptor.tf_utils.util import write_graph +import tensorflow as tf + def build_fake_yaml(): fake_yaml = ''' model: @@ -43,12 +45,14 @@ def build_benchmark(): arg_parser = ArgumentParser(description='Parse args') arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input odel') args = arg_parser.parse_args() -import neural_compressor from neural_compressor.data import DATASETS -from neural_compressor.experimental import common dataset = DATASETS('tensorflow')['dummy']((100, 32, 32, 1), label=True) -b_dataloader = common.DataLoader(dataset, batch_size=10) -neural_compressor.benchmark(args.input_model, 'fake_yaml.yaml', b_dataloader=b_dataloader) +from neural_compressor.experimental import Benchmark, common +from neural_compressor.conf.config import BenchmarkConf +benchmarker = Benchmark('fake_yaml.yaml') +benchmarker.b_dataloader = common.DataLoader(dataset, batch_size=10) +benchmarker.model = args.input_model +benchmarker.fit() ''' seq1 = ''' @@ -56,14 +60,15 @@ def build_benchmark(): arg_parser = ArgumentParser(description='Parse args') arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input odel') args = arg_parser.parse_args() -import neural_compressor from neural_compressor.data import DATASETS dataset = DATASETS('tensorflow')['dummy']((100, 32, 32, 1), label=True) -from neural_compressor.experimental import common +from neural_compressor.experimental import Benchmark, common from neural_compressor.conf.config import BenchmarkConf conf = BenchmarkConf('fake_yaml.yaml') -b_dataloader = common.DataLoader(dataset, batch_size=10) -neural_compressor.benchmark(args.input_model, conf, b_dataloader=b_dataloader) +benchmarker = Benchmark(conf) +benchmarker.b_dataloader = common.DataLoader(dataset, batch_size=10) +benchmarker.model = args.input_model +benchmarker.fit() ''' # test normal case @@ -88,13 +93,15 @@ def build_benchmark2(): "arg_parser = ArgumentParser(description='Parse args')\n", "arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input model')\n", "args = arg_parser.parse_args()\n", - "import neural_compressor\n" + "from neural_compressor.data import DATASETS\n", "dataset = DATASETS('tensorflow')['dummy']((5, 32, 32, 1), label=True)\n", - "from neural_compressor.experimental import common\n", - "b_dataloader = common.DataLoader(dataset)\n", - "neural_compressor.benchmark(args.input_model, b_dataloader=b_dataloader)\n" + "from neural_compressor.experimental import Benchmark, common\n", + "benchmarker = Benchmark()\n", + "benchmarker.model = args.input_model\n", + "benchmarker.b_dataloader = common.DataLoader(dataset)\n", + "benchmarker.fit()\n" ] seq1 = ''' @@ -102,11 +109,13 @@ def build_benchmark2(): arg_parser = ArgumentParser(description='Parse args') arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input odel') args = arg_parser.parse_args() -import neural_compressor + from neural_compressor import conf -from neural_compressor.experimental import common +from 
neural_compressor.experimental import Benchmark, common conf.evaluation.performance.dataloader.dataset = {'dummy': {'shape': [100,32,32,1], 'label':True}} -neural_compressor.benchmark(args.input_model, conf) +benchmarker = Benchmark(conf) +benchmarker.model = args.input_model +benchmarker.fit() ''' seq2 = ''' @@ -188,6 +197,7 @@ def setUpClass(self): build_benchmark() build_benchmark2() self.cpu_counts = psutil.cpu_count(logical=False) + self.platform = platform.system().lower() @classmethod def tearDownClass(self): @@ -195,11 +205,11 @@ def tearDownClass(self): os.remove('fake_yaml.yaml') if os.path.exists('fake.py'): os.remove('fake.py') - if os.path.exists('fake.py'): + if os.path.exists('fake2.py'): os.remove('fake2.py') - if os.path.exists('fake.py'): + if os.path.exists('fake3.py'): os.remove('fake3.py') - if os.path.exists('fake.py'): + if os.path.exists('fake4.py'): os.remove('fake4.py') if os.path.exists('fake_data_5.py'): os.remove('fake_data_5.py') @@ -248,8 +258,8 @@ def test_benchmark_without_yaml(self): os.system("python fake2.py --input_model={} 2>&1 | tee benchmark.log".format(self.graph_path)) with open('benchmark.log', "r") as f: for line in f: - accuracy = re.search(r"Accuracy is\s+(\d+(\.\d+)?)", line) - self.assertIsNotNone(accuracy) + throughput = re.search(r"Throughput sum: (\d+(\.\d+)?)", line) + self.assertIsNotNone(throughput) os.system("rm *.log") def test_benchmark_with_conf(self): @@ -259,7 +269,7 @@ def test_benchmark_with_conf(self): throughput = re.search(r"Throughput:\s+(\d+(\.\d+)?) images/sec", line) self.assertIsNotNone(throughput) os.system("rm *.log") - + def test_benchmark_with_custom_metric(self): os.system("python fake4.py --input_model={} 2>&1 | tee benchmark.log".format(self.graph_path)) with open('benchmark.log', "r") as f: @@ -267,6 +277,6 @@ def test_benchmark_with_custom_metric(self): accuracy = re.search(r"Accuracy is\s+(\d+(\.\d+)?)", line) self.assertIsNotNone(accuracy) os.system("rm *.log") - + if __name__ == "__main__": unittest.main() diff --git a/test/benchmark/test_benchmark_2.x.py b/test/benchmark/test_benchmark_2.x.py new file mode 100644 index 00000000000..fe5b0d0d710 --- /dev/null +++ b/test/benchmark/test_benchmark_2.x.py @@ -0,0 +1,176 @@ +"""Tests for neural_compressor benchmark""" +import psutil +import unittest +import os +import yaml +import numpy as np +import tensorflow as tf +import tempfile +import re +from neural_compressor.adaptor.tf_utils.util import write_graph + + +def build_benchmark(): + seq = ''' +from argparse import ArgumentParser +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input model') +args = arg_parser.parse_args() +from neural_compressor.benchmark import fit +from neural_compressor.config import BenchmarkConfig +from neural_compressor.data import DATASETS +from neural_compressor.experimental import common +dataset = DATASETS('tensorflow')['dummy']((100, 32, 32, 1), label=True) +b_dataloader = common.DataLoader(dataset, batch_size=10) +conf = BenchmarkConfig(warmup=5, iteration=10, cores_per_instance=4, num_of_instance=2) +fit(args.input_model, conf, b_dataloader=b_dataloader) + ''' + + seq1 = ''' +from argparse import ArgumentParser +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input model') +args = arg_parser.parse_args() +from neural_compressor.benchmark import fit +from neural_compressor.config import 
BenchmarkConfig +from neural_compressor.data import DATASETS +dataset = DATASETS('tensorflow')['dummy']((100, 32, 32, 1), label=True) +from neural_compressor.experimental import common +conf = BenchmarkConfig(warmup=5, iteration=10, cores_per_instance=4, num_of_instance=2) +b_dataloader = common.DataLoader(dataset, batch_size=10) +fit(args.input_model, conf, b_dataloader=b_dataloader) + ''' + + # test normal case + with open('fake.py', "w", encoding="utf-8") as f: + f.writelines(seq) + # test batchsize > len(dataset), use first batch + fake_data_5 = seq.replace('100, 32, 32, 1', '5, 32, 32, 1') + with open('fake_data_5.py', "w", encoding="utf-8") as f: + f.writelines(fake_data_5) + # test batchsize < len(dataset) < 2*batchsize, discard first batch + fake_data_15 = seq1.replace('100, 32, 32, 1', '15, 32, 32, 1') + with open('fake_data_15.py', "w", encoding="utf-8") as f: + f.writelines(fake_data_15) + # test 2*batchsize < len(dataset) < warmup*batchsize, discard last batch + fake_data_25 = seq1.replace('100, 32, 32, 1', '25, 32, 32, 1') + with open('fake_data_25.py', "w", encoding="utf-8") as f: + f.writelines(fake_data_25) + +def build_benchmark2(): + seq = [ + "from argparse import ArgumentParser\n", + "arg_parser = ArgumentParser(description='Parse args')\n", + "arg_parser.add_argument('--input_model', dest='input_model', default='input_model', help='input model')\n", + "args = arg_parser.parse_args()\n", + "from neural_compressor.benchmark import fit\n" + "from neural_compressor.data import DATASETS\n", + "dataset = DATASETS('tensorflow')['dummy']((5, 32, 32, 1), label=True)\n", + + "from neural_compressor.experimental import common\n", + "b_dataloader = common.DataLoader(dataset)\n", + "fit(args.input_model, b_dataloader=b_dataloader)\n" + ] + + with open('fake2.py', "w", encoding="utf-8") as f: + f.writelines(seq) + + +def build_fake_model(): + graph_path = tempfile.mkstemp(suffix='.pb')[1] + try: + graph = tf.Graph() + graph_def = tf.GraphDef() + with tf.Session(graph=graph) as sess: + x = tf.placeholder(tf.float64, shape=(None, 32, 32, 1), name='x') + y_1 = tf.constant(np.random.random((3, 3, 1, 1)), name='y_1') + y_2 = tf.constant(np.random.random((3, 3, 1, 1)), name='y_2') + conv1 = tf.nn.conv2d(input=x, filter=y_1, strides=[1, 1, 1, 1], \ + padding='VALID', name='conv1') + op = tf.nn.conv2d(input=conv1, filter=y_2, strides=[1, 1, 1, 1], \ + padding='VALID', name='op_to_store') + + sess.run(tf.global_variables_initializer()) + constant_graph = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['op_to_store']) + + graph_def.ParseFromString(constant_graph.SerializeToString()) + write_graph(graph_def, graph_path) + except: + graph = tf.Graph() + graph_def = tf.compat.v1.GraphDef() + with tf.compat.v1.Session(graph=graph) as sess: + x = tf.compat.v1.placeholder(tf.float64, shape=(None, 32, 32, 1), name='x') + y_1 = tf.constant(np.random.random((3, 3, 1, 1)), name='y_1') + y_2 = tf.constant(np.random.random((3, 3, 1, 1)), name='y_2') + conv1 = tf.nn.conv2d(input=x, filters=y_1, strides=[1, 1, 1, 1], \ + padding='VALID', name='conv1') + op = tf.nn.conv2d(input=conv1, filters=y_2, strides=[1, 1, 1, 1], \ + padding='VALID', name='op_to_store') + + sess.run(tf.compat.v1.global_variables_initializer()) + constant_graph = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['op_to_store']) + + graph_def.ParseFromString(constant_graph.SerializeToString()) + write_graph(graph_def, graph_path) + return graph_path + +class 
TestObjective(unittest.TestCase): + @classmethod + def setUpClass(self): + self.graph_path = build_fake_model() + build_benchmark() + build_benchmark2() + self.cpu_counts = psutil.cpu_count(logical=False) + + @classmethod + def tearDownClass(self): + if os.path.exists('fake.py'): + os.remove('fake.py') + if os.path.exists('fake2.py'): + os.remove('fake2.py') + if os.path.exists('fake_data_5.py'): + os.remove('fake_data_5.py') + if os.path.exists('fake_data_15.py'): + os.remove('fake_data_15.py') + if os.path.exists('fake_data_25.py'): + os.remove('fake_data_25.py') + + def test_benchmark(self): + os.system("python fake.py --input_model={}".format(self.graph_path)) + for i in range(2): + with open(f'2_4_{i}.log', "r") as f: + for line in f: + throughput = re.search(r"Throughput:\s+(\d+(\.\d+)?) images/sec", line) + self.assertIsNotNone(throughput) + os.system("rm *.log") + + def test_benchmark_data_5(self): + os.system("python fake_data_5.py --input_model={}".format(self.graph_path)) + for i in range(2): + with open(f'2_4_{i}.log', "r") as f: + for line in f: + throughput = re.search(r"Throughput:\s+(\d+(\.\d+)?) images/sec", line) + self.assertIsNotNone(throughput) + os.system("rm *.log") + + def test_benchmark_data_15(self): + os.system("python fake_data_15.py --input_model={}".format(self.graph_path)) + for i in range(2): + with open(f'2_4_{i}.log', "r") as f: + for line in f: + throughput = re.search(r"Throughput:\s+(\d+(\.\d+)?) images/sec", line) + self.assertIsNotNone(throughput) + os.system("rm *.log") + + def test_benchmark_data_25(self): + os.system("python fake_data_25.py --input_model={}".format(self.graph_path)) + for i in range(2): + with open(f'2_4_{i}.log', "r") as f: + for line in f: + throughput = re.search(r"Throughput:\s+(\d+(\.\d+)?) 
images/sec", line) + self.assertIsNotNone(throughput) + os.system("rm *.log") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/distillation/test_distillation.py b/test/distillation/test_distillation.py index 4d63baf5c00..a5a993f2fdf 100644 --- a/test/distillation/test_distillation.py +++ b/test/distillation/test_distillation.py @@ -7,7 +7,7 @@ import torch.nn as nn import tensorflow as tf from neural_compressor.data import DATASETS -from neural_compressor.conf.pythonic_config import DistillationConfig, KnowledgeDistillationLossConfig +from neural_compressor.config import DistillationConfig, KnowledgeDistillationLossConfig from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader diff --git a/test/distillation/test_self_distillation.py b/test/distillation/test_self_distillation.py index 5bd29d37432..e05a40ae56e 100644 --- a/test/distillation/test_self_distillation.py +++ b/test/distillation/test_self_distillation.py @@ -5,7 +5,6 @@ import torch import torch.nn as nn import torchvision -from neural_compressor.conf.config import DistillationConf from neural_compressor.data import DATASETS from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import \ PyTorchDataLoader @@ -82,8 +81,8 @@ def tearDownClass(cls): def test_self_distillation(self): import copy from neural_compressor.training import prepare_compression - from neural_compressor.conf.pythonic_config import DistillationConfig, \ - SelfKnowledgeDistillationLossConfig + from neural_compressor.config import DistillationConfig, \ + SelfKnowledgeDistillationLossConfig datasets = DATASETS("pytorch") dummy_dataset = datasets["dummy"]( diff --git a/test/export/test_torch2onnx.py b/test/export/test_torch2onnx.py new file mode 100644 index 00000000000..01410ff0952 --- /dev/null +++ b/test/export/test_torch2onnx.py @@ -0,0 +1,227 @@ +import os +import copy +import shutil +import torch +import unittest +import numpy as np +from neural_compressor import quantization +from neural_compressor.experimental.common import Model +from neural_compressor.config import Torch2ONNXConfig +from neural_compressor.experimental.data.datasets.dataset import DATASETS +from neural_compressor import PostTrainingQuantConfig, QuantizationAwareTrainingConfig +from neural_compressor.training import prepare_compression +from neural_compressor.data import DATASETS, DATALOADERS +from transformers import AutoModelForSequenceClassification, AutoTokenizer +import torch.utils.data as data + + +def train_func_cv(compression_manager, model): + compression_manager.callbacks.on_train_begin() + optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) + model.train() + input = torch.randn(1, 3, 224, 224) + output = model(input) + loss = output[0].mean() if isinstance(output, tuple) else output.mean() + optimizer.zero_grad() + loss.backward() + optimizer.step() + compression_manager.callbacks.on_train_end() + return model + +def train_func_nlp(compression_manager, model, input): + compression_manager.callbacks.on_train_begin() + optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) + model.train() + output = model(**input) + loss = output.logits[0][0] + optimizer.zero_grad() + loss.backward() + optimizer.step() + compression_manager.callbacks.on_train_end() + return model + +def check_CV_onnx(model_path, dataloader): + import onnxruntime as ort + ort_session = ort.InferenceSession(model_path) + it = iter(dataloader) + input = next(it) + input_dict = {'input': input[0].detach().cpu().numpy()} + 
ort_session.run(None, input_dict) + return True + +def check_NLP_onnx(model_path, input): + import onnxruntime as ort + ort_session = ort.InferenceSession(model_path, None) + input_dict = {} + for k, v in input.items(): + input_dict[k] = np.array(v) + ort_session.run(None, input_dict) + return True + + +class DummyNLPDataloader(object): + def __init__(self, model_name): + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.sequence_a = "intel-extension-for-transformers is based in SH" + self.sequence_b = "Where is intel-extension-for-transformers based? NYC or SH" + self.encoded_dict = self.tokenizer(self.sequence_a, self.sequence_b, return_tensors='pt') + self.encoded_dict['labels'] = 1 + self.batch_size = 1 + + def __iter__(self): + yield self.encoded_dict + + def __next__(self): + return self.encoded_dict + +class TestPytorch2ONNX(unittest.TestCase): + @classmethod + def setUpClass(self): + from torchvision.models.quantization import resnet18 + self.cv_model = resnet18() + self.cv_dataset = DATASETS("pytorch")["dummy"]((10, 3, 224, 224)) + self.cv_dataloader = DATALOADERS["pytorch"](self.cv_dataset) + self.nlp_model = AutoModelForSequenceClassification.from_pretrained( + "distilbert-base-uncased-finetuned-sst-2-english" + ) + self.nlp_dataloader = DummyNLPDataloader( + "distilbert-base-uncased-finetuned-sst-2-english" + ) + input = next(self.nlp_dataloader) + input.pop('labels') + self.nlp_input = input + + @classmethod + def tearDownClass(self): + shutil.rmtree('runs', ignore_errors=True) + # os.remove('fp32-cv-model.onnx') + # os.remove('int8-cv-model.onnx') + # os.remove('fp32-nlp-model.onnx') + # os.remove('int8-nlp-model.onnx') + shutil.rmtree("./saved", ignore_errors=True) + + def test_fp32_CV_models(self): + model = self.cv_model + inc_model = Model(model) + fp32_onnx_config = Torch2ONNXConfig( + dtype="fp32", + example_inputs=torch.randn(1, 3, 224, 224), + input_names=['input'], + output_names=['output'], + dynamic_axes={"input": {0: "batch_size"}, + "output": {0: "batch_size"}}, + ) + inc_model.export('fp32-cv-model.onnx', fp32_onnx_config) + check_CV_onnx('fp32-cv-model.onnx', self.cv_dataloader) + + def test_int8_CV_models(self): + for fake_yaml in ["dynamic", "qat", "static"]: + model = self.cv_model + if fake_yaml == "qat": + quant_conf = QuantizationAwareTrainingConfig(backend='pytorch_fx') + compression_manager = prepare_compression(copy.deepcopy(model), quant_conf) + q_model = train_func_cv(compression_manager, compression_manager.model) + else: + if fake_yaml == "dynamic": + quant_conf = PostTrainingQuantConfig(approach="dynamic") + elif fake_yaml == "static": + quant_conf = PostTrainingQuantConfig(approach="static", backend='pytorch_fx') + q_model = quantization.fit( + model, + quant_conf, + calib_dataloader=self.cv_dataloader if fake_yaml == "static" else None) + + if fake_yaml != "dynamic": + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=torch.randn(1, 3, 224, 224), + input_names=['input'], + output_names=['output'], + dynamic_axes={"input": {0: "batch_size"}, + "output": {0: "batch_size"}}, + calib_dataloader=self.cv_dataloader, + ) + else: + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=torch.randn(1, 3, 224, 224), + input_names=['input'], + output_names=['output'], + dynamic_axes={"input": {0: "batch_size"}, + "output": {0: "batch_size"}}, + ) + q_model.export('int8-cv-model.onnx', int8_onnx_config) + 
check_CV_onnx('int8-cv-model.onnx', self.cv_dataloader) + + def test_fp32_NLP_models(self): + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + dynamic_axes = {k: symbolic_names for k in self.nlp_input.keys()} + + model = self.nlp_model + inc_model = Model(model) + fp32_onnx_config = Torch2ONNXConfig( + dtype="fp32", + example_inputs=tuple(self.nlp_input.values()), + input_names=list(self.nlp_input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + inc_model.export('fp32-nlp-model.onnx', fp32_onnx_config) + check_NLP_onnx('fp32-nlp-model.onnx', self.nlp_input) + + def test_int8_NLP_models(self): + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + dynamic_axes = {k: symbolic_names for k in self.nlp_input.keys()} + + for fake_yaml in ["dynamic", "static", "qat"]: + model = self.nlp_model + if fake_yaml == "qat": + quant_conf = QuantizationAwareTrainingConfig(backend='pytorch_fx') + compression_manager = prepare_compression(copy.deepcopy(model), quant_conf) + q_model = train_func_nlp( + compression_manager, + compression_manager.model, + self.nlp_input + ) + else: + if fake_yaml == "dynamic": + quant_conf = PostTrainingQuantConfig(approach="dynamic") + elif fake_yaml == "static": + quant_conf = PostTrainingQuantConfig(approach="static", backend='pytorch_fx') + q_model = quantization.fit( + model, + quant_conf, + calib_dataloader=self.nlp_dataloader if fake_yaml == "static" else None) + + if fake_yaml != "dynamic": + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=tuple(self.nlp_input.values()), + input_names=list(self.nlp_input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + calib_dataloader=self.nlp_dataloader, + ) + else: + int8_onnx_config = Torch2ONNXConfig( + dtype="int8", + opset_version=14, + quant_format="QDQ", + example_inputs=tuple(self.nlp_input.values()), + input_names=list(self.nlp_input.keys()), + output_names=['labels'], + dynamic_axes=dynamic_axes, + ) + q_model.export('int8-nlp-model.onnx', int8_onnx_config) + check_NLP_onnx('int8-nlp-model.onnx', self.nlp_input) + +if __name__ == "__main__": + unittest.main() + + diff --git a/test/itex/test_tensorflow_itex_basic.py b/test/itex/test_tensorflow_itex_basic.py index 6fc3e9a518a..9d3cb1e58ef 100644 --- a/test/itex/test_tensorflow_itex_basic.py +++ b/test/itex/test_tensorflow_itex_basic.py @@ -5,13 +5,14 @@ import os import shutil import yaml +import platform import numpy as np from neural_compressor.adaptor.tf_utils.quantize_graph.quantize_graph_for_intel_cpu import QuantizeGraphForIntel from neural_compressor.adaptor.tf_utils.graph_rewriter.generic.strip_unused_nodes import StripUnusedNodesOptimizer from neural_compressor.adaptor.tf_utils.graph_rewriter.generic.fold_batch_norm import FoldBatchNormNodesOptimizer from neural_compressor.adaptor.tensorflow import TensorflowQuery from neural_compressor.adaptor.tf_utils.util import disable_random -from neural_compressor.experimental import Quantization, common +from neural_compressor.experimental import Quantization, Benchmark, common from neural_compressor.utils.utility import CpuInfo from neural_compressor.adaptor.tf_utils.util import version1_lt_version2, version1_gte_version2 @@ -217,5 +218,53 @@ def test_depthwiseconv2d_case(self): reshape_counter += 1 self.assertEqual(reshape_counter, 2) + @disable_random() + @unittest.skipIf(version1_lt_version2(tf.version.VERSION, '2.8.0') or \ + platform.system().lower() == "windows", "Only supports tf greater 2.7.0 and Linux") + 
def test_itex_benchmark_gpu(self): + x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") + top_relu = tf.nn.relu(x) + paddings = tf.constant([[0, 0], [1, 1], [1, 1], [0, 0]]) + x_pad = tf.pad(top_relu, paddings, "CONSTANT") + conv_weights = tf.compat.v1.get_variable("weight", [3, 3, 16, 16], + initializer=tf.compat.v1.random_normal_initializer()) + conv = tf.nn.conv2d(x_pad, conv_weights, strides=[1, 2, 2, 1], padding="VALID") + normed = tf.compat.v1.layers.batch_normalization(conv) + conv_weights2 = tf.compat.v1.get_variable("weight2", [3, 3, 16, 16], + initializer=tf.compat.v1.random_normal_initializer()) + conv2 = tf.nn.conv2d(top_relu, conv_weights2, strides=[1, 2, 2, 1], padding="SAME") + normed2 = tf.compat.v1.layers.batch_normalization(conv2) + add = tf.raw_ops.Add(x=normed, y=normed2, name='addv2') + relu = tf.nn.relu(add) + relu6 = tf.nn.relu6(relu, name='op_to_store') + out_name = relu6.name.split(':')[0] + with tf.compat.v1.Session() as sess: + sess.run(tf.compat.v1.global_variables_initializer()) + output_graph_def = graph_util.convert_variables_to_constants( + sess=sess, + input_graph_def=sess.graph_def, + output_node_names=[out_name]) + + quantizer = Quantization('fake_yaml_2.yaml') + dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.calib_dataloader = common.DataLoader(dataset) + quantizer.model = output_graph_def + output_graph = quantizer.fit() + + evaluator = Benchmark('fake_yaml_2.yaml') + evaluator.b_dataloader = common.DataLoader(dataset) + evaluator.model = output_graph + evaluator('performance') + + found_multi_instance_log = False + for file_name in os.listdir(os.getcwd()): + if file_name.endswith(".log"): + found_multi_instance_log = True + break + + self.assertEqual(found_multi_instance_log, False) + + if __name__ == '__main__': unittest.main() diff --git a/test/mixed_precision/test_mixed_precision.py b/test/mixed_precision/test_mixed_precision.py index a5d5e09bfc9..a05a3e25e5c 100644 --- a/test/mixed_precision/test_mixed_precision.py +++ b/test/mixed_precision/test_mixed_precision.py @@ -10,7 +10,7 @@ from neural_compressor import mix_precision from neural_compressor.utils.utility import LazyImport, CpuInfo from neural_compressor.adaptor.torch_utils.bf16_convert import BF16ModuleWrapper -from neural_compressor.conf.pythonic_config import MixedPrecisionConfig, Options +from neural_compressor.config import MixedPrecisionConfig, set_workspace, TuningCriterion from onnx import helper, TensorProto from packaging.version import Version from tensorflow.core.framework import attr_value_pb2 @@ -262,26 +262,26 @@ def setUpClass(self): def test_on_non_enabled_host(self): # test onnx - conf = MixedPrecisionConfig(precisions=["fp16"], backend="onnxrt_qlinearops") + conf = MixedPrecisionConfig(extra_precisions=["fp16"], backend="onnxrt_qlinearops") with self.assertRaises(SystemExit) as cm: output_model = mix_precision.fit(self.onnx_model, conf) self.assertEqual(cm.exception.code, 0) @unittest.skipIf(CpuInfo().bf16, 'skip since hardware support bf16') def test_on_non_enabled_host_tf(self): - conf = MixedPrecisionConfig(precisions=["bf16"], backend="tensorflow") + conf = MixedPrecisionConfig(extra_precisions=["bf16"], backend="tensorflow") with self.assertRaises(SystemExit) as cm: output_model = mix_precision.fit(self.tf_model, conf) self.assertEqual(cm.exception.code, 0) def test_on_non_enabled_dtype(self): # test onnx - conf = 
MixedPrecisionConfig(precisions=["bf16"], backend="onnxrt_qlinearops") + conf = MixedPrecisionConfig(extra_precisions=["bf16"], backend="onnxrt_qlinearops") with self.assertRaises(SystemExit) as cm: output_model = mix_precision.fit(self.onnx_model, conf) self.assertEqual(cm.exception.code, 0) - conf = MixedPrecisionConfig(precisions=["fp16"], backend="tensorflow") + conf = MixedPrecisionConfig(extra_precisions=["fp16"], backend="tensorflow") with self.assertRaises(SystemExit) as cm: output_model = mix_precision.fit(self.tf_model, conf) self.assertEqual(cm.exception.code, 0) @@ -310,16 +310,16 @@ def test_mixed_precision_with_evaluation(self): from neural_compressor.experimental import common from neural_compressor.experimental.metric.metric import ONNXRT_QL_METRICS # test onnx - conf = MixedPrecisionConfig(precisions=["fp16"], + conf = MixedPrecisionConfig(extra_precisions=["fp16"], backend="onnxrt_qlinearops") - options = Options(workspace="./saved") - output_model = mix_precision.fit(self.onnx_model, conf, options=options) + set_workspace("./saved") + output_model = mix_precision.fit(self.onnx_model, conf) self.assertFalse(any([i.op_type == 'Cast' for i in output_model.nodes()])) - conf = MixedPrecisionConfig(precisions=["fp16"], + tuning_criterion = TuningCriterion(max_trials=3, timeout=50) + conf = MixedPrecisionConfig(extra_precisions=["fp16"], backend="onnxrt_qlinearops", - max_trials=3, - timeout=50) + tuning_criterion=tuning_criterion) output_model = mix_precision.fit(self.onnx_model, conf, @@ -347,7 +347,7 @@ def eval2(model): from neural_compressor.experimental import MixedPrecision, common from neural_compressor import conf my_metric = Metric() - conf = MixedPrecisionConfig(precisions=["fp16"], + conf = MixedPrecisionConfig(extra_precisions=["fp16"], backend="onnxrt_qlinearops") output_model = mix_precision.fit(self.onnx_model, @@ -355,7 +355,7 @@ def eval2(model): eval_dataloader=common.DataLoader(self.matmul_dataset), eval_metric=my_metric) self.assertFalse(any([i.op_type == 'Cast' for i in output_model.nodes()])) - conf = MixedPrecisionConfig(precisions=["fp16"], + conf = MixedPrecisionConfig(extra_precisions=["fp16"], backend="onnxrt_qlinearops") output_model = mix_precision.fit(self.onnx_model, @@ -367,7 +367,7 @@ def eval2(model): conf = MixedPrecisionConfig( inputs="input", outputs="final", - precisions=["bf16", "fp32"], + extra_precisions=["bf16", "fp32"], ) output_model = mix_precision.fit( @@ -376,15 +376,15 @@ def eval2(model): eval_func=eval, ) self.assertTrue(any([i.op == 'Cast' for i in output_model.graph_def.node])) - self.assertEqual(conf.precisions, ['bf16', 'fp32']) + self.assertEqual(conf.extra_precisions, ['bf16', 'fp32']) self.assertEqual(conf.inputs, 'input') self.assertEqual(conf.outputs, 'final') + tuning_criterion = TuningCriterion(max_trials=4, timeout=500) conf = MixedPrecisionConfig( - max_trials=4, - timeout=500, - precisions=["bf16"], backend="tensorflow", + tuning_criterion=tuning_criterion, + extra_precisions=["bf16"], ) output_model = mix_precision.fit( common.Model(self.tf_model), @@ -393,12 +393,12 @@ def eval2(model): ) self.assertTrue(any([i.op == 'Cast' for i in output_model.graph_def.node])) + tuning_criterion = TuningCriterion(max_trials=1, timeout=100) conf = MixedPrecisionConfig( inputs="input", outputs="final, test", - max_trials=1, - timeout=100, - precisions=["bf16", "fp32"], + tuning_criterion=tuning_criterion, + extra_precisions=["bf16", "fp32"], ) output_model = mix_precision.fit( self.tf_model, @@ -414,7 +414,7 @@ def eval(model): 
return 0.5 conf = MixedPrecisionConfig( - precisions=["bf16"], + extra_precisions=["bf16"], backend="pytorch" ) output_model = mix_precision.fit( diff --git a/test/pruning/test_pruning.py b/test/pruning/test_pruning.py index b5b437639c0..3e1290e6bb7 100644 --- a/test/pruning/test_pruning.py +++ b/test/pruning/test_pruning.py @@ -6,7 +6,7 @@ import torchvision import torch.nn as nn -from neural_compressor.conf.pythonic_config import Pruner, PruningConfig +from neural_compressor.config import Pruner, PruningConfig from neural_compressor.data import DATASETS from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader from neural_compressor.training import prepare_compression diff --git a/test/requirements.txt b/test/requirements.txt index c570fff1dec..30712c4bafb 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -14,6 +14,7 @@ transformers<=4.12.3; python_version < '3.10' transformers==4.16.0; python_version == '3.10' tensorflow_model_optimization sigopt +hyperopt horovod tensorflow-addons onnxruntime-extensions; python_version < '3.10' diff --git a/test/strategy/test_basic.py b/test/strategy/test_basic.py index 845e9b0ccae..0a2812b5f79 100644 --- a/test/strategy/test_basic.py +++ b/test/strategy/test_basic.py @@ -155,7 +155,7 @@ def build_fake_model(): tf.import_graph_def(graph_def, name='') return graph -class TestQuantization(unittest.TestCase): +class TestBasicTuningStrategy(unittest.TestCase): @classmethod def setUpClass(self): @@ -217,6 +217,25 @@ def test_run_basic_max_trials_multimetric_weight(self): quantizer.model = self.constant_graph quantizer.fit() + + def test_run_basic_one_trial_new_api(self): + from neural_compressor.quantization import fit + from neural_compressor.config import AccuracyCriterion, AccuracyLoss, PostTrainingQuantConfig, TuningCriterion + from neural_compressor.data import DATASETS, DATALOADERS + + # dataset and dataloader + dataset = DATASETS("tensorflow")["dummy"](((100, 3, 3, 1))) + dataloader = DATALOADERS["tensorflow"](dataset) + + # tuning and accuracy criterion + tolerable_loss = AccuracyLoss(0.01) + accuracy_criterion = AccuracyCriterion(criterion='relative', tolerable_loss=tolerable_loss) + tuning_criterion = TuningCriterion(strategy='basic') + conf = PostTrainingQuantConfig(approach="static", backend="tensorflow", + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion) + q_model = fit(model=self.constant_graph, conf=conf, calib_dataloader= dataloader, eval_dataloader=dataloader) + self.assertIsNotNone(q_model) if __name__ == "__main__": unittest.main() diff --git a/test/strategy/test_optimization_level_2.x.py b/test/strategy/test_optimization_level_2.x.py new file mode 100644 index 00000000000..b599c07bf2a --- /dev/null +++ b/test/strategy/test_optimization_level_2.x.py @@ -0,0 +1,151 @@ +"""Tests for optimization level & conservative strategy""" + +import shutil +import unittest +import time + +import numpy as np + +from neural_compressor.utils import logger + +def build_fake_model(): + import tensorflow as tf + try: + graph = tf.Graph() + graph_def = tf.compat.v1.GraphDef() + with tf.compat.v1.Session() as sess: + x = tf.compat.v1.placeholder(tf.float32, shape=(1,3,3,1), name='x') + y = tf.constant(np.random.random((2,2,1,1)).astype(np.float32), name='y') + z = tf.constant(np.random.random((1,1,1,1)).astype(np.float32), name='z') + op = tf.nn.conv2d(input=x, filters=y, strides=[1,1,1,1], padding='VALID', name='op_to_store') + op2 = tf.nn.conv2d(input=op, filters=z, 
strides=[1,1,1,1], padding='VALID', ) + last_identity = tf.identity(op2, name='op2_to_store') + sess.run(tf.compat.v1.global_variables_initializer()) + constant_graph = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['op2_to_store']) + + graph_def.ParseFromString(constant_graph.SerializeToString()) + with graph.as_default(): + tf.import_graph_def(graph_def, name='') + except: + graph = tf.Graph() + graph_def = tf.compat.v1.GraphDef() + with tf.compat.v1.Session() as sess: + x = tf.compat.v1.placeholder(tf.float32, shape=(1,3,3,1), name='x') + y = tf.constant(np.random.random((2,2,1,1)).astype(np.float32), name='y') + z = tf.constant(np.random.random((1,1,1,1)).astype(np.float32), name='z') + op = tf.nn.conv2d(input=x, filters=y, strides=[1,1,1,1], padding='VALID', name='op_to_store') + op2 = tf.nn.conv2d(input=op, filters=z, strides=[1,1,1,1], padding='VALID') + last_identity = tf.identity(op2, name='op2_to_store') + + sess.run(tf.compat.v1.global_variables_initializer()) + constant_graph = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['op2_to_store']) + + graph_def.ParseFromString(constant_graph.SerializeToString()) + with graph.as_default(): + tf.import_graph_def(graph_def, name='') + return graph + + +class TestOptimizationLevel(unittest.TestCase): + + @classmethod + def setUpClass(self): + self.constant_graph = build_fake_model() + + @classmethod + def tearDownClass(self): + shutil.rmtree('saved', ignore_errors=True) + shutil.rmtree('nc_workspace', ignore_errors=True) + + def test_tf_opt_level_0(self): + logger.info("*** Test: optimization level 0 with tensorflow model.") + from neural_compressor.quantization import fit + from neural_compressor.config import PostTrainingQuantConfig + from neural_compressor.data import DATASETS, DATALOADERS + + # fake evaluation function + def _fake_eval(model): + return 1 + + # dataset and dataloader + dataset = DATASETS("tensorflow")["dummy"](((100, 3, 3, 1))) + dataloader = DATALOADERS["tensorflow"](dataset) + + # tuning and accuracy criterion + optimization_level = 0 + conf = PostTrainingQuantConfig(approach="static", backend="tensorflow", optimization_level=0) + + # fit + q_model = fit(model=self.constant_graph, + conf=conf, + calib_dataloader= dataloader, + eval_dataloader=dataloader, + eval_func=_fake_eval) + self.assertIsNotNone(q_model) + + def test_tf_opt_level_1(self): + logger.info("*** Test: optimization level 1 with tensorflow model.") + from neural_compressor.quantization import fit + from neural_compressor.config import PostTrainingQuantConfig + from neural_compressor.data import DATASETS, DATALOADERS + + # fake evaluation function + self._fake_acc = 10 + def _fake_eval(model): + self._fake_acc -= 1 + return self._fake_acc + + # dataset and dataloader + dataset = DATASETS("tensorflow")["dummy"](((100, 3, 3, 1))) + dataloader = DATALOADERS["tensorflow"](dataset) + + # tuning and accuracy criterion + optimization_level = 1 + conf = PostTrainingQuantConfig(approach="static", backend="tensorflow", optimization_level=optimization_level) + + # fit + q_model = fit(model=self.constant_graph, + conf=conf, + calib_dataloader= dataloader, + eval_dataloader=dataloader, + eval_func=_fake_eval) + self.assertIsNone(q_model) + + def test_pt_opt_level_0(self): + logger.info("*** Test: optimization level 0 with pytorch model.") + from neural_compressor.quantization import fit + from neural_compressor.config import PostTrainingQuantConfig + from neural_compressor.data import DATASETS, 
DATALOADERS + import torchvision + + # model + resnet18 = torchvision.models.resnet18() + + # fake evaluation function + acc_lst = [2.0, 1.0, 2.1, 2.2, 2.3] + perf_lst = [2.0, 1.5, 1.0, 0.5, 0.1] + self.test_pt_opt_level_0_index = -1 + def _fake_eval(model): + self.test_pt_opt_level_0_index += 1 + perf = perf_lst[self.test_pt_opt_level_0_index] + time.sleep(perf) + return acc_lst[self.test_pt_opt_level_0_index] + + # dataset and dataloader + dataset = DATASETS("pytorch")["dummy"](((100, 3, 3, 1))) + dataloader = DATALOADERS["pytorch"](dataset) + + # tuning and accuracy criterion + optimization_level = 0 + conf = PostTrainingQuantConfig(approach="static", backend="pytorch", optimization_level=optimization_level) + + # fit + q_model = fit(model=resnet18, + conf=conf, + calib_dataloader= dataloader, + eval_dataloader=dataloader, + eval_func=_fake_eval) + self.assertIsNotNone(q_model) + +if __name__ == "__main__": + unittest.main() diff --git a/test/strategy/test_sigopt.py b/test/strategy/test_sigopt.py index ce7a7669862..5d443e3dba2 100644 --- a/test/strategy/test_sigopt.py +++ b/test/strategy/test_sigopt.py @@ -104,7 +104,7 @@ def build_fake_model(): return graph @unittest.skipIf(CONDITION , "missing the env variables 'SIGOPT_API_TOKEN' or 'SIGOPT_PROJECT_ID'") -class TestQuantization(unittest.TestCase): +class TestSigoptTuningStrategy(unittest.TestCase): @classmethod def setUpClass(self): @@ -140,6 +140,29 @@ def test_run_basic_max_trials(self): quantizer.eval_dataloader = common.DataLoader(dataset) quantizer.model = self.constant_graph quantizer.fit() + + def test_run_sigopt_one_trial_new_api(self): + from neural_compressor.quantization import fit + from neural_compressor.config import AccuracyCriterion, AccuracyLoss, PostTrainingQuantConfig, TuningCriterion + from neural_compressor.data import DATASETS, DATALOADERS + + # dataset and dataloader + dataset = DATASETS("tensorflow")["dummy"](((100, 3, 3, 1))) + dataloader = DATALOADERS["tensorflow"](dataset) + + # tuning and accuracy criterion + tolerable_loss = AccuracyLoss(0.01) + accuracy_criterion = AccuracyCriterion(criterion='relative', tolerable_loss=tolerable_loss) + strategy_kwargs = {'sigopt_api_token': 'sigopt_api_token_test', + 'sigopt_project_id': 'sigopt_project_id_test', + 'sigopt_experiment_name': 'nc-tune'} + tuning_criterion = TuningCriterion(strategy='sigopt', strategy_kwargs=strategy_kwargs, max_trials=3) + conf = PostTrainingQuantConfig(approach="static", backend="tensorflow", + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion) + q_model = fit(model=self.constant_graph, conf=conf, calib_dataloader= dataloader, eval_dataloader=dataloader) + self.assertIsNotNone(q_model) + if __name__ == "__main__": unittest.main() diff --git a/test/tfnewapi/test_tensorflow_graph_qdq_bn_fusion.py b/test/tfnewapi/test_tensorflow_graph_qdq_bn_fusion.py index acfbd049072..d99a48c1803 100644 --- a/test/tfnewapi/test_tensorflow_graph_qdq_bn_fusion.py +++ b/test/tfnewapi/test_tensorflow_graph_qdq_bn_fusion.py @@ -12,6 +12,8 @@ from tensorflow.python.framework import dtypes from neural_compressor.adaptor.tf_utils.util import disable_random from neural_compressor.utils.utility import CpuInfo +from neural_compressor.experimental import Quantization, common +from neural_compressor.utils import logger def build_fake_yaml_1(): fake_yaml_1 = ''' @@ -91,7 +93,7 @@ def tearDownClass(self): @disable_random() def test_bn_relu_depthwiseconv_biasadd_relu6_fusion(self): - 
logging.getLogger().info("test_depthwiseconv_biasadd_relu_fusion") + logger.info("test_bn_relu_depthwiseconv_biasadd_relu6_fusion") x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") conv_weights = tf.compat.v1.get_variable("weight", [3, 3, 16, 16], initializer=tf.compat.v1.random_normal_initializer()) @@ -107,7 +109,7 @@ def test_bn_relu_depthwiseconv_biasadd_relu6_fusion(self): sess=sess, input_graph_def=sess.graph_def, output_node_names=[out_name]) - from neural_compressor.experimental import Quantization, common + quantizer = Quantization('fake_yaml_1.yaml') dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) quantizer.eval_dataloader = common.DataLoader(dataset) @@ -137,7 +139,7 @@ def test_bn_relu_depthwiseconv_biasadd_relu6_fusion(self): @disable_random() def test_training_bn_relu_depthwiseconv_biasadd_relu6_fusion(self): - logging.getLogger().info("test_depthwiseconv_biasadd_relu_fusion") + logger.info("test_training_bn_relu_depthwiseconv_biasadd_relu6_fusion") x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") conv_weights = tf.compat.v1.get_variable("weight", [3, 3, 16, 16], initializer=tf.compat.v1.random_normal_initializer()) @@ -153,7 +155,7 @@ def test_training_bn_relu_depthwiseconv_biasadd_relu6_fusion(self): sess=sess, input_graph_def=sess.graph_def, output_node_names=[out_name]) - from neural_compressor.experimental import Quantization, common + quantizer = Quantization('fake_yaml_1.yaml') dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) quantizer.eval_dataloader = common.DataLoader(dataset) @@ -177,9 +179,68 @@ def test_training_bn_relu_depthwiseconv_biasadd_relu6_fusion(self): if bf16_enabled: self.assertEqual(bf16_bn_num, 1) + @disable_random() + def test_bn_leakyrelu_conv_biasadd_relu(self): + logger.info("test_bn_leakyrelu_conv_biasadd_relu") + x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") + conv_weights = tf.compat.v1.get_variable("weight", [3, 3, 16, 16], + initializer=tf.compat.v1.random_normal_initializer()) + normed_0 = tf.compat.v1.layers.batch_normalization(x) + leaky_relu = tf.nn.leaky_relu(normed_0, alpha=0.3, name='op_to_store_0') + conv = tf.nn.conv2d(leaky_relu, conv_weights, strides=[1, 2, 2, 1], padding="VALID") + normed_1 = tf.compat.v1.layers.batch_normalization(conv) + relu = tf.nn.relu(normed_1, name='op_to_store_1') + out_name = relu.name.split(':')[0] + with tf.compat.v1.Session() as sess: + sess.run(tf.compat.v1.global_variables_initializer()) + output_graph_def = graph_util.convert_variables_to_constants( + sess=sess, + input_graph_def=sess.graph_def, + output_node_names=[out_name]) + + quantizer = Quantization('fake_yaml_1.yaml') + dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) + quantizer.eval_dataloader = common.DataLoader(dataset) + quantizer.calib_dataloader = common.DataLoader(dataset) + + quantizer.model = output_graph_def + output_graph = quantizer.fit() + conv_input_type = True + found_fusion = True + qbn_num = 0 + dq_num = 0 + qbn_output_max_name = 'batch_normalization/FusedBatchNormV3_eightbit_quantized_bn/frozen_bn_output_max' + for i in output_graph.graph_def.node: + if i.op == '_FusedQuantizedConv2D' \ + and i.attr['Thost_inputs'].list.type != [11, 11, 1, 1, 1, 1, 1, 1, 1]: + conv_input_type = False + break + if i.op in ['Relu', 'LeakyRelu', 'FusedBatchNormV3']: + found_fusion = False + break + if i.op == '_QuantizedFusedBatchNorm': + is_offset_const = i.attr["is_offset_const"].b + 
is_mean_const = i.attr["is_mean_const"].b + qbn_alpha = i.attr["alpha"].f + frozen_qbn_output_max = i.input[8] + qbn_num += 1 + if i.name == qbn_output_max_name: + frozen_qbn_output_max_value = i.attr["value"].tensor.float_val[0] + if i.op == 'Dequantize': + dq_num += 1 + self.assertEqual(conv_input_type, True) + self.assertEqual(found_fusion, True) + self.assertEqual(qbn_num, 1) + self.assertEqual(dq_num, 1) + self.assertEqual(is_offset_const, True) + self.assertEqual(is_mean_const, True) + self.assertEqual(round(qbn_alpha, 7), 0.3) + self.assertEqual(frozen_qbn_output_max, qbn_output_max_name) + self.assertGreater(frozen_qbn_output_max_value, 126) + @disable_random() def test_bn_relu_conv_biasadd_relu(self): - logging.getLogger().info("test_conv_biasadd_relu_fusion") + logger.info("test_bn_relu_conv_biasadd_relu") x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") conv_weights = tf.compat.v1.get_variable("weight", [3, 3, 16, 16], initializer=tf.compat.v1.random_normal_initializer()) @@ -195,7 +256,7 @@ def test_bn_relu_conv_biasadd_relu(self): sess=sess, input_graph_def=sess.graph_def, output_node_names=[out_name]) - from neural_compressor.experimental import Quantization, common + quantizer = Quantization('fake_yaml_1.yaml') dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) quantizer.eval_dataloader = common.DataLoader(dataset) @@ -236,7 +297,7 @@ def test_bn_relu_conv_biasadd_relu(self): @disable_random() def test_bn_performance_only_false(self): - logging.getLogger().info("test_conv_biasadd_relu_fusion") + logger.info("test_bn_performance_only_false") x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") conv_weights = tf.compat.v1.get_variable("weight", [3, 3, 16, 16], initializer=tf.compat.v1.random_normal_initializer()) @@ -252,7 +313,7 @@ def test_bn_performance_only_false(self): sess=sess, input_graph_def=sess.graph_def, output_node_names=[out_name]) - from neural_compressor.experimental import Quantization, common + quantizer = Quantization('fake_yaml_2.yaml') dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) quantizer.eval_dataloader = common.DataLoader(dataset) @@ -281,7 +342,7 @@ def test_bn_performance_only_false(self): @disable_random() def test_bnex_performance_only_false(self): - logging.getLogger().info("test_conv_biasadd_relu_fusion") + logger.info("test_bnex_performance_only_false") x = tf.compat.v1.placeholder(tf.float32, [1, 56, 56, 16], name="input") conv_weights_0 = tf.compat.v1.get_variable("weight_0", [3, 3, 16, 16], initializer=tf.compat.v1.random_normal_initializer()) @@ -312,7 +373,7 @@ def test_bnex_performance_only_false(self): if node.name == "batch_normalization_1/FusedBatchNormV3": node.op = "_FusedBatchNormEx" node.attr["activation_mode"].CopyFrom(attr_value_pb2.AttrValue(s=b"Relu")) - from neural_compressor.experimental import Quantization, common + quantizer = Quantization('fake_yaml_2.yaml') dataset = quantizer.dataset('dummy', shape=(100, 56, 56, 16), label=True) quantizer.eval_dataloader = common.DataLoader(dataset)