Skip to content
This repository has been archived by the owner on Jan 20, 2022. It is now read-only.

[Examples] Add TensorFlow examples - ResNet50 and BERT models #2530

Closed
wants to merge 8 commits into from
60 changes: 60 additions & 0 deletions Examples/tensorflow/BERT/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# BERT sample for Tensorflow

# Where the Graphene tree lives and which key signs the enclave. Both use ?=
# so they can be overridden from the environment or the command line.
GRAPHENEDIR ?= ../../..
SGX_SIGNER_KEY ?= $(GRAPHENEDIR)/Pal/src/host/Linux-SGX/signer/enclave-key.pem

# Shared Graphene build settings (presumably the source of ARCH_LIBDIR used
# further down -- confirm against Scripts/Makefile.configs).
include $(GRAPHENEDIR)/Scripts/Makefile.configs

# Log level rendered into the manifest: "error" unless DEBUG=1 was requested.
GRAPHENE_LOG_LEVEL = error
ifeq ($(DEBUG),1)
GRAPHENE_LOG_LEVEL = debug
endif

# Default goal. The plain manifest is always built; SGX=1 adds the signed
# manifest, the signature and the launch token.
.PHONY: all
ifeq ($(SGX),1)
all: python.manifest python.manifest.sgx python.sig python.token
else
all: python.manifest
endif

# Download locations of the artifacts the BERT sample needs.
BERT_DATASET = https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip
SQUAAD_DATASET = https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
CHECKPOINTS = https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/bert_large_checkpoints.zip
BERT_INT8_MODEL = https://storage.googleapis.com/intel-optimized-tensorflow/models/r2.5-icx-b631821f/asymmetric_per_channel_bert_int8.pb

# Fetch the IntelAI model scripts into models/ and the dataset, checkpoints and
# int8 graph into data/. Every step is guarded by a test, so anything already
# present is not downloaded or unpacked again.
#
# The target names no file, so it is declared phony; a stray file called
# "collateral" must not make the downloads look up to date.
.PHONY: collateral
collateral:
	# Install unzip only when it is missing. apt-get with -y is the
	# non-interactive form suitable for scripts; it still needs root.
	command -v unzip >/dev/null 2>&1 || apt-get install -y unzip
	test -d models || git clone https://github.com/IntelAI/models.git
	mkdir -p data
	test -f data/wwm_uncased_L-24_H-1024_A-16.zip || wget $(BERT_DATASET) -P data/
	test -d data/wwm_uncased_L-24_H-1024_A-16 || unzip data/wwm_uncased_L-24_H-1024_A-16.zip -d data
	test -f data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json || wget $(SQUAAD_DATASET) -P data/wwm_uncased_L-24_H-1024_A-16
	test -f data/bert_large_checkpoints.zip || wget $(CHECKPOINTS) -P data/
	test -d data/bert_large_checkpoints || unzip data/bert_large_checkpoints.zip -d data
	test -f data/asymmetric_per_channel_bert_int8.pb || wget $(BERT_INT8_MODEL) -P data/

# Render python.manifest from its template.
#
# Values handed to the template:
#   log_level      - GRAPHENE_LOG_LEVEL (debug or error)
#   arch_libdir    - ARCH_LIBDIR
#   entrypoint     - resolved absolute path of the python3 found on PATH
#   pythondistpath - PYTHONDISTPATH, which the caller must supply
#
# `collateral` is a prerequisite so the models and data are fetched first.
# Because it never exists as a file, this rule re-runs on every invocation.
python.manifest: python.manifest.template collateral
	# Fail loudly instead of rendering empty mount and trusted-file paths.
	$(if $(strip $(PYTHONDISTPATH)),,$(error PYTHONDISTPATH is not set; run make PYTHONDISTPATH=path_to_python_dist_packages/))
	graphene-manifest \
		-Dlog_level=$(GRAPHENE_LOG_LEVEL) \
		-Darch_libdir=$(ARCH_LIBDIR) \
		-Dentrypoint=$(realpath $(shell sh -c "command -v python3")) \
		-Dpythondistpath=$(PYTHONDISTPATH) \
		$< >$@

# Sign the rendered manifest with SGX_SIGNER_KEY. All options use the long
# double-dash spelling, matching --key/--manifest here and the ResNet50 sample.
python.manifest.sgx: python.manifest
	graphene-sgx-sign \
		--key $(SGX_SIGNER_KEY) \
		--manifest $< \
		--output $@

# No recipe of its own: python.sig is expected to be produced as a by-product
# of the signing step above (NOTE(review): confirm against graphene-sgx-sign).
python.sig: python.manifest.sgx

# Obtain the launch token for the signature.
python.token: python.sig
	graphene-sgx-get-token --output $@ --sig $<

# Housekeeping targets; neither one names a real file.
.PHONY: clean distclean

# Remove the generated manifests and the SGX signing artifacts.
clean:
	$(RM) *.manifest *.manifest.sgx \
		*.token *.sig

# Additionally remove the cloned model scripts and all downloaded data.
distclean: clean
	$(RM) -r models/ data/
79 changes: 79 additions & 0 deletions Examples/tensorflow/BERT/python.manifest.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# This manifest was tested on Ubuntu 18.04 with python3.6.

# Program started inside Graphene. The value comes from the Makefile's
# -Dentrypoint option, i.e. the resolved path of the python3 found on PATH.
libos.entrypoint = "{{ entrypoint }}"
# Graphene's LibOS library; the path is filled in by the manifest tooling.
loader.preload = "file:{{ graphene.libos }}"

# Graphene log level
# Supplied via -Dlog_level: "debug" when built with DEBUG=1, otherwise "error".
loader.log_level = "{{ log_level }}"

# Read application arguments directly from the command line. Don't use this on production!
# The README's run commands pass the inference script and its flags this way.
loader.insecure__use_cmdline_argv = 1

# Propagate environment variables from the host. Don't use this on production!
# The README's run commands set OMP_NUM_THREADS and KMP_AFFINITY on the host.
loader.insecure__use_host_env = 1

# Disable address space layout randomization. Don't use this on production!
loader.insecure__disable_aslr = 1

# Update Library Path - overwrites environment variable
# Entries, in order: the lib directory under the Python stdlib path, /lib
# (mounted from Graphene's runtime directory below), the architecture library
# directory, /usr/lib, and the architecture library directory under /usr.
loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}"

# Additional memory for Graphene's internal use
loader.pal_internal_mem_size = "512M"

# Filesystem mounts. Each entry pairs a path seen by the application with the
# host location given in its uri. Apart from /lib, every mount exposes a host
# directory at the same path inside Graphene.

# Default glibc files, mounted from graphene's Runtime directory
fs.mount.lib.type = "chroot"
fs.mount.lib.path = "/lib"
fs.mount.lib.uri = "file:{{ graphene.runtimedir() }}"

# More libraries required by Tensorflow
# Host architecture library directory (the arch_libdir value from the Makefile).
fs.mount.lib2.type = "chroot"
fs.mount.lib2.path = "{{ arch_libdir }}"
fs.mount.lib2.uri = "file:{{ arch_libdir }}"

# Whole host /usr tree.
fs.mount.usr.type = "chroot"
fs.mount.usr.path = "/usr"
fs.mount.usr.uri = "file:/usr"

# Python standard library directory, as resolved by the manifest tooling.
fs.mount.pyhome.type = "chroot"
fs.mount.pyhome.path = "{{ python.stdlib }}"
fs.mount.pyhome.uri = "file:{{ python.stdlib }}"

# Python distribution library directory, as resolved by the manifest tooling.
fs.mount.pydisthome.type = "chroot"
fs.mount.pydisthome.path = "{{ python.distlib }}"
fs.mount.pydisthome.uri = "file:{{ python.distlib }}"

# Directory given as PYTHONDISTPATH on the make command line; the README
# describes it as the dist-packages directory where TensorFlow was installed.
fs.mount.pydistpath.type = "chroot"
fs.mount.pydistpath.path = "{{ pythondistpath }}"
fs.mount.pydistpath.uri = "file:{{ pythondistpath }}"

# Host /tmp.
fs.mount.tmp.type = "chroot"
fs.mount.tmp.path = "/tmp"
fs.mount.tmp.uri = "file:/tmp"

# Host /etc.
fs.mount.etc.type = "chroot"
fs.mount.etc.path = "/etc"
fs.mount.etc.uri = "file:/etc"

# SGX general options
# enclave_size and thread_num are sized generously for this workload; the
# README's example commands run with 36 OpenMP threads.
# NOTE(review): thread_num is presumably the upper bound on enclave threads --
# confirm against the Graphene manifest documentation.
sgx.enclave_size = "32G"
sgx.thread_num = 256
# Per the README, preheating pre-faults the enclave memory, moving that cost to
# graphene-sgx start-up, before the workload begins executing.
sgx.preheat_enclave = 1
sgx.nonpie_binary = 1

# SGX trusted files
# NOTE(review): the first three entries end in "/" while pyhome, pydisthome and
# pydistpath, which also point at directories, do not -- confirm whether the
# trailing slash matters to graphene-sgx-sign.
sgx.trusted_files.runtime = "file:{{ graphene.runtimedir() }}/"
sgx.trusted_files.arch_libdir = "file:{{ arch_libdir }}/"
sgx.trusted_files.usr_arch_libdir = "file:/usr/{{ arch_libdir }}/"
sgx.trusted_files.python = "file:{{ entrypoint }}"
sgx.trusted_files.pyhome = "file:{{ python.stdlib }}"
sgx.trusted_files.pydisthome = "file:{{ python.distlib }}"
sgx.trusted_files.pydistpath = "file:{{ pythondistpath }}"

# SGX allowed files
# The relative entries are resolved from the sample directory: output/ is the
# --output_dir used in the README commands, models/ and data/ are created by
# the Makefile's collateral target, and root/.keras/keras.json ships with the
# sample.
sgx.allowed_files.tmp = "file:/tmp/"
sgx.allowed_files.etc = "file:/etc/"
sgx.allowed_files.output = "file:output/"
sgx.allowed_files.scripts = "file:models/"
sgx.allowed_files.dataDir = "file:data/"
sgx.allowed_files.keras = "file:root/.keras/keras.json"
6 changes: 6 additions & 0 deletions Examples/tensorflow/BERT/root/.keras/keras.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"floatx": "float32",
"epsilon": 1e-07,
"backend": "tensorflow",
"image_data_format": "channels_last"
}
170 changes: 170 additions & 0 deletions Examples/tensorflow/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
## Inference on TensorFlow BERT and ResNet50 models
This directory contains steps and artifacts to run inference with TensorFlow BERT and ResNet50\
sample workloads on Graphene. Specifically, both these examples use pre-trained models to run inference.\
This was tested on Ubuntu 18.04 using the distribution-packaged Python 3.6.

### Bidirectional Encoder Representations from Transformers (BERT):
BERT is a method of pre-training language representations and then using the trained model for downstream\
NLP tasks like 'question answering'. BERT is an unsupervised, deeply bidirectional system for pre-training NLP.\
In this BERT sample, we use 'BERT-Large, Uncased (Whole Word Masking)' model and perform int8 inference.\
More details about BERT can be found at https://github.com/google-research/bert.

### Residual Network (ResNet):
ResNet50 is a convolutional neural network that is 50 layers deep.\
In this ResNet50(v1.5) sample, we use a pre-trained model and perform int8 inference.\
More details about ResNet50 can be found at https://github.com/IntelAI/models/tree/icx-launch-public/benchmarks/image_recognition/tensorflow/resnet50v1_5.


## Pre-System setting
Linux systems have a CPU frequency scaling governor that scales the CPU frequency\
to achieve the best performance or to save power, depending on the requirement.
To achieve the best performance, please set the CPU frequency scaling governor to performance mode (requires root privileges).

``for ((i=0; i<$(nproc); i++)); do echo 'performance' > /sys/devices/system/cpu/cpu$i/cpufreq/scaling_governor; done``

## Pre-requisites
- Install python3.6.
- Upgrade pip/pip3.
- Install tensorflow using ``pip install intel-tensorflow-avx512==2.4.0`` or by downloading whl\
package from https://pypi.org/project/intel-tensorflow-avx512/2.4.0/#files.

## Build BERT or ResNet50 samples
- To build BERT sample, do ``cd BERT`` or to build ResNet50 sample, do ``cd ResNet50``.
- To clean the sample, do ``make clean``
- To clean and remove downloaded models and datasets, do ``make distclean``
- To build the non-SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/``
- To build the SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/ SGX=1``
>**WARNING:** Building BERT sample downloads about 5GB of data.\
>**NOTE:** Typically, path_to_python_dist_packages is '/usr/local/lib/python3.6/dist-packages',\
but can change based on python's installation directory.

## Run inference on BERT model
- To run int8 inference on graphene-sgx(SGX version)
```
OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx \
./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \
--init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 \
--vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
--bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json \
--predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json \
--precision=int8 \
--output_dir=output/bert-squad-output \
--predict_batch_size=32 \
--experimental_gelu=True \
--optimized_softmax=True \
--input_graph=data/asymmetric_per_channel_bert_int8.pb \
--do_predict=True --mode=benchmark \
--inter_op_parallelism_threads=1 \
--intra_op_parallelism_threads=36
```
- To run int8 inference on graphene-direct(non-SGX version)
```
OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 \
graphene-direct ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \
--init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 \
--vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
--bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json \
--predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json \
--precision=int8 \
--output_dir=output/bert-squad-output \
--predict_batch_size=32 \
--experimental_gelu=True \
--optimized_softmax=True \
--input_graph=data/asymmetric_per_channel_bert_int8.pb \
--do_predict=True \
--mode=benchmark \
--inter_op_parallelism_threads=1 \
--intra_op_parallelism_threads=36
```
- To run int8 inference on native baremetal(outside graphene)
```
OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 \
models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \
--init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 \
--vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
--bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json \
--predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json \
--precision=int8 --output_dir=output/bert-squad-output \
--predict_batch_size=32 \
--experimental_gelu=True \
--optimized_softmax=True \
--input_graph=data/asymmetric_per_channel_bert_int8.pb \
--do_predict=True \
--mode=benchmark \
--inter_op_parallelism_threads=1 \
--intra_op_parallelism_threads=36
```
- Above commands are for a 36 core system. Please set the following options accordingly for optimal performance.
- OMP_NUM_THREADS='Core(s) per socket'
- taskset to 'Core(s) per socket'
- intra_op_parallelism_threads='Core(s) per socket'
- If hyperthreading is enabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0``
- If hyperthreading is disabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact``
>**NOTE:** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'``\
> OMP_NUM_THREADS sets the maximum number of threads to use for OpenMP parallel regions. \
> KMP_AFFINITY binds OpenMP threads to physical processing units.

## Run inference on ResNet50 model
- To run inference on graphene-sgx(SGX version)
```
OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx \
./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \
--input-graph=resnet50v1_5_int8_pretrained_model.pb \
--num-inter-threads=1 \
--num-intra-threads=36 \
--batch-size=32 \
--warmup-steps=50 \
--steps=500
```
- To run inference on graphene-direct(non-SGX version)
```
OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-direct \
./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \
--input-graph=resnet50v1_5_int8_pretrained_model.pb \
--num-inter-threads=1 \
--num-intra-threads=36 \
--batch-size=32 \
--warmup-steps=50 \
--steps=500
```
- To run inference on native baremetal(outside graphene)
```
OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 \
models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \
--input-graph=resnet50v1_5_int8_pretrained_model.pb \
--num-inter-threads=1 \
--num-intra-threads=36 \
--batch-size=32 \
--warmup-steps=50 \
--steps=500
```
- Above commands are for a 36 core system. Please set the following options accordingly for optimal performance.
- OMP_NUM_THREADS='Core(s) per socket'
- taskset to 'Core(s) per socket'
- num-intra-threads='Core(s) per socket'
- If hyperthreading is enabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0``
- If hyperthreading is disabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact``
- The options batch-size, warmup-steps and steps can be varied.
>**NOTE:** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'``\
> OMP_NUM_THREADS sets the maximum number of threads to use for OpenMP parallel regions.\
> KMP_AFFINITY binds OpenMP threads to physical processing units.

## Performance considerations
- Preheat manifest option pre-faults the enclave memory and moves the performance penalty to\
graphene-sgx invocation (before the workload starts execution).\
To use preheat option, add ``sgx.preheat_enclave = 1`` to the manifest template.
- TCMalloc and mimalloc are memory allocator libraries from Google and Microsoft that can help\
improve performance significantly based on the workloads. At any point, only one of these allocators can be used.
- TCMalloc (Please update the binary location and name if different from default)
- Install tcmalloc : ``sudo apt-get install google-perftools``
- Add these in the manifest template
```loader.env.LD_PRELOAD = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"```
```sgx.trusted_files.libtcmalloc = "file:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"```
```sgx.trusted_files.libunwind = "file:/usr/lib/x86_64-linux-gnu/libunwind.so.8"```
- Save the template and rebuild.
- mimalloc (Please update the binary location and name if different from default)
- Install mimalloc using the steps from https://github.com/microsoft/mimalloc
- Add these in the manifest template
```loader.env.LD_PRELOAD = "/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"```
```sgx.trusted_files.libmimalloc = "file:/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"```
- Save the template and rebuild.
49 changes: 49 additions & 0 deletions Examples/tensorflow/ResNet50/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# ResNet50 sample for Tensorflow

# Where the Graphene tree lives and which key signs the enclave. Both use ?=
# so they can be overridden from the environment or the command line.
GRAPHENEDIR ?= ../../..
SGX_SIGNER_KEY ?= $(GRAPHENEDIR)/Pal/src/host/Linux-SGX/signer/enclave-key.pem

# Shared Graphene build settings (presumably the source of ARCH_LIBDIR used
# further down -- confirm against Scripts/Makefile.configs).
include $(GRAPHENEDIR)/Scripts/Makefile.configs

# Log level rendered into the manifest: "error" unless DEBUG=1 was requested.
GRAPHENE_LOG_LEVEL = error
ifeq ($(DEBUG),1)
GRAPHENE_LOG_LEVEL = debug
endif

# Default goal. The plain manifest is always built; SGX=1 adds the signed
# manifest, the signature and the launch token. Neither `all` nor `collateral`
# names a real file.
.PHONY: all collateral
ifeq ($(SGX),1)
all: python.manifest python.manifest.sgx python.sig python.token
else
all: python.manifest
endif

# Fetch the IntelAI model scripts and the pre-trained int8 ResNet50 v1.5 graph.
# Each step is skipped when its result is already present.
collateral:
	[ -d models ] || git clone https://github.com/IntelAI/models.git
	[ -f resnet50v1_5_int8_pretrained_model.pb ] || \
		wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/resnet50v1_5_int8_pretrained_model.pb

# Render python.manifest from its template.
#
# Values handed to the template:
#   log_level      - GRAPHENE_LOG_LEVEL (debug or error)
#   arch_libdir    - ARCH_LIBDIR
#   entrypoint     - resolved absolute path of the python3 found on PATH
#   pythondistpath - PYTHONDISTPATH, which the caller must supply
#
# `collateral` is a prerequisite so the model scripts and graph are fetched
# first. Because it is phony, this rule re-runs on every invocation.
python.manifest: python.manifest.template collateral
	# Fail loudly instead of rendering an empty path into the manifest.
	$(if $(strip $(PYTHONDISTPATH)),,$(error PYTHONDISTPATH is not set; run make PYTHONDISTPATH=path_to_python_dist_packages/))
	graphene-manifest \
		-Dlog_level=$(GRAPHENE_LOG_LEVEL) \
		-Darch_libdir=$(ARCH_LIBDIR) \
		-Dentrypoint=$(realpath $(shell sh -c "command -v python3")) \
		-Dpythondistpath=$(PYTHONDISTPATH) \
		$< >$@

# Sign the rendered manifest with SGX_SIGNER_KEY.
python.manifest.sgx: python.manifest
	graphene-sgx-sign \
		--key $(SGX_SIGNER_KEY) \
		--manifest $< \
		--output $@

# No recipe of its own: python.sig is expected to be produced as a by-product
# of the signing step above (NOTE(review): confirm against graphene-sgx-sign).
python.sig: python.manifest.sgx

# Obtain the launch token for the signature. Long options are used so the
# spelling matches the graphene-sgx-sign invocation above.
python.token: python.sig
	graphene-sgx-get-token --output $@ --sig $<

# Housekeeping targets; neither one names a real file.
.PHONY: clean distclean

# Remove the generated manifests and the SGX signing artifacts.
clean:
	$(RM) *.manifest *.manifest.sgx \
		*.token *.sig

# Additionally remove the cloned model scripts and the downloaded model graph.
distclean: clean
	$(RM) -r models/ resnet50v1_5_int8_pretrained_model.pb
Loading