
[Examples] Add TensorFlow examples - ResNet50 and BERT models #2530

Closed
wants to merge 8 commits into from
56 changes: 56 additions & 0 deletions Examples/tensorflow/BERT/Makefile
@@ -0,0 +1,56 @@
# BERT sample for Tensorflow

GRAPHENEDIR ?= ../../..
SGX_SIGNER_KEY ?= $(GRAPHENEDIR)/Pal/src/host/Linux-SGX/signer/enclave-key.pem

include $(GRAPHENEDIR)/Scripts/Makefile.configs

ifeq ($(DEBUG),1)
GRAPHENE_LOG_LEVEL = debug
else
GRAPHENE_LOG_LEVEL = error
endif

.PHONY: all collateral
all: python.manifest
ifeq ($(SGX),1)
all: python.manifest.sgx python.sig python.token
endif

collateral:
apt install unzip
test -d models || git clone https://github.com/IntelAI/models.git
mkdir -p data
test -f data/wwm_uncased_L-24_H-1024_A-16.zip || wget https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip -P data/
test -d data/wwm_uncased_L-24_H-1024_A-16 || unzip data/wwm_uncased_L-24_H-1024_A-16.zip -d data
test -f data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json || wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -P data/wwm_uncased_L-24_H-1024_A-16
test -f data/bert_large_checkpoints.zip || wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/bert_large_checkpoints.zip -P data/
test -d data/bert_large_checkpoints || unzip data/bert_large_checkpoints.zip -d data
test -f data/asymmetric_per_channel_bert_int8.pb || wget https://storage.googleapis.com/intel-optimized-tensorflow/models/r2.5-icx-b631821f/asymmetric_per_channel_bert_int8.pb -P data/

python.manifest: python.manifest.template collateral
graphene-manifest \
-Dlog_level=$(GRAPHENE_LOG_LEVEL) \
-Darch_libdir=$(ARCH_LIBDIR) \
-Dentrypoint=$(realpath $(shell sh -c "command -v python3")) \
-Dpythondistpath=$(PYTHONDISTPATH) \
$< >$@

python.manifest.sgx: python.manifest
graphene-sgx-sign \
--key $(SGX_SIGNER_KEY) \
--manifest $< --output $@

python.sig: python.manifest.sgx

python.token: python.sig
graphene-sgx-get-token --output $@ --sig $<

.PHONY: clean
clean:
$(RM) *.manifest *.manifest.sgx *.token *.sig

.PHONY: distclean
distclean: clean
$(RM) -r models/ data/

78 changes: 78 additions & 0 deletions Examples/tensorflow/BERT/python.manifest.template
@@ -0,0 +1,78 @@
# This manifest was tested on Ubuntu 18.04 with python3.6.

libos.entrypoint = "{{ entrypoint }}"
loader.preload = "file:{{ graphene.libos }}"

# Graphene log level
loader.log_level = "{{ log_level }}"

# Read application arguments directly from the command line. Don't use this in production!
loader.insecure__use_cmdline_argv = 1

# Propagate environment variables from the host. Don't use this in production!
loader.insecure__use_host_env = 1

# Disable address space layout randomization. Don't use this in production!
loader.insecure__disable_aslr = 1

# Set the library search path (overwrites the host's LD_LIBRARY_PATH)
loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}"

# Default glibc files, mounted from graphene's Runtime directory
fs.mount.lib.type = "chroot"
fs.mount.lib.path = "/lib"
fs.mount.lib.uri = "file:{{ graphene.runtimedir() }}"

# More libraries required by Tensorflow
fs.mount.lib2.type = "chroot"
fs.mount.lib2.path = "{{ arch_libdir }}"
fs.mount.lib2.uri = "file:{{ arch_libdir }}"

fs.mount.usr.type = "chroot"
fs.mount.usr.path = "/usr"
fs.mount.usr.uri = "file:/usr"

fs.mount.pyhome.type = "chroot"
fs.mount.pyhome.path = "{{ python.stdlib }}"
fs.mount.pyhome.uri = "file:{{ python.stdlib }}"

fs.mount.pydisthome.type = "chroot"
fs.mount.pydisthome.path = "{{ python.distlib }}"
fs.mount.pydisthome.uri = "file:{{ python.distlib }}"

fs.mount.pydistpath.type = "chroot"
fs.mount.pydistpath.path = "{{ pythondistpath }}"
fs.mount.pydistpath.uri = "file:{{ pythondistpath }}"

fs.mount.tmp.type = "chroot"
fs.mount.tmp.path = "/tmp"
fs.mount.tmp.uri = "file:/tmp"

fs.mount.etc.type = "chroot"
fs.mount.etc.path = "/etc"
fs.mount.etc.uri = "file:/etc"

# SGX general options
sgx.enclave_size = "32G"
sgx.thread_num = 256
sgx.preheat_enclave = 1
sgx.nonpie_binary = 1

# SGX trusted files
sgx.trusted_files.runtime = "file:{{ graphene.runtimedir() }}/"
sgx.trusted_files.arch_libdir = "file:{{ arch_libdir }}/"
sgx.trusted_files.usr_arch_libdir = "file:/usr/{{ arch_libdir }}/"
sgx.trusted_files.libcpp = "file:/usr/lib/x86_64-linux-gnu/libstdc++.so.6"
sgx.trusted_files.libgcc = "file:/lib/x86_64-linux-gnu/libgcc_s.so.1"

sgx.allowed_files.tmp = "file:/tmp/"
sgx.allowed_files.etc = "file:/etc/"
sgx.allow_file_creation = "1"
sgx.allowed_files.output = "file:output/"
sgx.allowed_files.scripts = "file:models/models/language_modeling/tensorflow/bert_large/inference/"
sgx.allowed_files.dataDir = "file:data/"
sgx.allowed_files.python = "file:{{ entrypoint }}"
sgx.allowed_files.pyhome = "file:{{ python.stdlib }}"
sgx.allowed_files.pydisthome = "file:{{ python.distlib }}"
sgx.allowed_files.pydistpath = "file:{{ pythondistpath }}"
sgx.allowed_files.keras = "file:root/.keras/keras.json"
6 changes: 6 additions & 0 deletions Examples/tensorflow/BERT/root/.keras/keras.json
@@ -0,0 +1,6 @@
{
"floatx": "float32",
"epsilon": 1e-07,
"backend": "tensorflow",
"image_data_format": "channels_last"
}
125 changes: 125 additions & 0 deletions Examples/tensorflow/README.md
@@ -0,0 +1,125 @@
## Run inference on TensorFlow BERT and ResNet50 models
This directory contains the steps and artifacts to run inference with the TensorFlow BERT and ResNet50 sample workloads on Graphene. Both examples use pre-trained models to run inference. We tested them on Ubuntu 18.04 with the distribution package of Python 3.6.

## System settings
Linux systems have a CPU frequency scaling governor that scales the CPU frequency to deliver the best performance or to save power, depending on the configured policy. To achieve the best performance, set the scaling governor to performance mode:

``for ((i=0; i<$(nproc); i++)); do echo 'performance' > /sys/devices/system/cpu/cpu$i/cpufreq/scaling_governor; done``
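The redirection in the command above must run in a root shell; a minimal equivalent for a non-root shell with ``sudo`` available (an assumed variant, not part of the original instructions) is:

```sh
# Write "performance" into every CPU's scaling_governor via sudo tee
for f in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
    echo performance | sudo tee "$f" > /dev/null
done
```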

## Prerequisites
- Install Python 3.6.
- Upgrade pip/pip3.
- Install TensorFlow using ``pip install intel-tensorflow-avx512==2.4.0`` or by downloading the whl package from https://pypi.org/project/intel-tensorflow-avx512/2.4.0/#files.
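A minimal sketch of these steps on Ubuntu 18.04 (package names are assumptions; adjust for your distribution and Python setup):

```sh
sudo apt-get update
sudo apt-get install -y python3.6 python3-pip   # Python 3.6 and pip
python3.6 -m pip install --upgrade pip          # upgrade pip for this interpreter
python3.6 -m pip install intel-tensorflow-avx512==2.4.0
```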

## Build BERT or ResNet50 samples
- To build the BERT sample, do ``cd BERT``; to build the ResNet50 sample, do ``cd ResNet50``.
- To clean the sample, do ``make clean``
- To clean and remove downloaded models and datasets, do ``make distclean``
- To build the non-SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/``
- To build the SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/ SGX=1``
>**NOTE** Typically, path_to_python_dist_packages is '/usr/local/lib/python3.6/dist-packages', but this can change based on Python's installation directory.
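One way to find the dist-packages path on your machine (a sketch; the printed path depends on how Python was installed, and the path in the ``make`` example below is illustrative only):

```sh
# Print the global site/dist-packages directories of python3
python3 -c 'import site; print("\n".join(site.getsitepackages()))'

# Then, for example:
make PYTHONDISTPATH=/usr/local/lib/python3.6/dist-packages/ SGX=1
```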

## Run inference on BERT model
- To run int8 inference on graphene-sgx (SGX version):<br>
``KMP_BLOCKTIME=1 KMP_SETTINGS=1 OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --output_dir=output/bert-squad-output --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36``
- To run int8 inference on graphene-direct (non-SGX version):<br>
``KMP_BLOCKTIME=1 KMP_SETTINGS=1 OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-direct ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --output_dir=output/bert-squad-output --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36``
- To run int8 inference on native bare metal (outside Graphene):<br>
``KMP_BLOCKTIME=1 KMP_SETTINGS=1 OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --output_dir=output/bert-squad-output --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36``
- The above commands are for a 36-core system. Set the following options according to your system for optimal performance (a parameterized sketch follows the note below).
- OMP_NUM_THREADS='Core(s) per socket'
- taskset to 'Core(s) per socket'
- intra_op_parallelism_threads='Core(s) per socket'
>**NOTE** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'``
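As a sketch, the core count can be derived from ``lscpu`` once and reused for all of the thread-related knobs instead of hard-coding 36 (this assumes a single-socket run pinned to cores 0..N-1):

```sh
# Derive 'Core(s) per socket' and export the OpenMP/KMP settings accordingly
CORES=$(lscpu | grep 'Core(s) per socket' | awk '{print $NF}')
export OMP_NUM_THREADS=$CORES KMP_BLOCKTIME=1 KMP_SETTINGS=1
export KMP_AFFINITY=granularity=fine,verbose,compact,1,0

# Prefix the BERT command shown above with:
#   taskset -c 0-$((CORES-1)) graphene-sgx ./python ... --intra_op_parallelism_threads=$CORES
echo "Pinning to cores 0-$((CORES-1)) with $CORES intra-op threads"
```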

## Run inference on ResNet50 model
- To run inference on graphene-sgx (SGX version):<br>
``OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx ./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=512 --warmup-steps=50 --steps=500``
- To run inference on graphene-direct (non-SGX version):<br>
``OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-direct ./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=512 --warmup-steps=50 --steps=500``
- To run inference on native bare metal (outside Graphene):<br>
``OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=128 --warmup-steps=50 --steps=500``
- The above commands are for a 36-core system. Set the following options according to your system for optimal performance.
- OMP_NUM_THREADS='Core(s) per socket'
- taskset to 'Core(s) per socket'
- num-intra-threads='Core(s) per socket'
>**NOTE** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'``

## GSC

### Build the graphenized Docker image and run BERT inference
1. ``cd $(GRAPHENE_DIR)/Tools/gsc``

2. Create a configuration file: ``cp config.yaml.template config.yaml``<br>
   Manually adapt config.yaml to the installed Intel SGX driver and the desired Graphene repository/version.

3. Generate the signing key: ``openssl genrsa -3 -out enclave-key.pem 3072``

4. Build the Docker image:
   - ``cd test``
   - ``docker build --rm -t ubuntu18.04-tensorflow-bert -f ubuntu18.04-tensorflow-bert.dockerfile ../../../Examples``

5. Graphenize the Docker image using gsc build:
   - ``cd ..``
   - ``./gsc build --insecure-args ubuntu18.04-tensorflow-bert test/ubuntu18.04-tensorflow.manifest``

6. Sign the graphenized Docker image using gsc sign-image: ``./gsc sign-image ubuntu18.04-tensorflow-bert enclave-key.pem``

7. To run int8 inference on GSC (if your SGX device node is named differently, see the note after this list):<br>
``docker run --device=/dev/sgx_enclave --cpuset-cpus="0-35" --env OMP_NUM_THREADS=36 --env KMP_BLOCKTIME=1 --env KMP_SETTINGS=1 --env KMP_AFFINITY=granularity=fine,noverbose,compact,1,0 gsc-ubuntu18.04-tensorflow-bert models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36 --output_dir=output/bert-squad-output``

8. To run int8 inference in a native container:<br>
``docker run --cpuset-cpus="0-35" --env OMP_NUM_THREADS=36 --env KMP_BLOCKTIME=1 --env KMP_SETTINGS=1 --env KMP_AFFINITY=granularity=fine,noverbose,compact,1,0 ubuntu18.04-tensorflow-bert models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36 --output_dir=output/bert-squad-output``

9. The above commands are for a 36-core system. Set the following options according to your system for optimal performance.
- OMP_NUM_THREADS='Core(s) per socket'
- --cpuset-cpus to 'Core(s) per socket'
- intra_op_parallelism_threads='Core(s) per socket'
>**NOTE** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'``
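The ``--device`` argument depends on the installed SGX driver: different driver generations expose different device nodes. A quick check of which node exists on the host (an assumed sketch, not an exhaustive list):

```sh
# DCAP/in-kernel drivers: /dev/sgx_enclave (newer) or /dev/sgx/enclave (older);
# legacy out-of-tree driver: /dev/isgx
ls -l /dev/sgx_enclave /dev/sgx/enclave /dev/isgx 2>/dev/null
```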

### Build the graphenized Docker image and run ResNet50 inference
1. ``cd $(GRAPHENE_DIR)/Tools/gsc``

2. Create a configuration file: ``cp config.yaml.template config.yaml``<br>
   Manually adapt config.yaml to the installed Intel SGX driver and the desired Graphene repository/version.

3. Generate the signing key: ``openssl genrsa -3 -out enclave-key.pem 3072``

4. Build the Docker image:
   - ``cd test``
   - ``docker build --rm -t ubuntu18.04-tensorflow-resnet50 -f ubuntu18.04-tensorflow-resnet50.dockerfile ../../../Examples``

5. Graphenize the Docker image using gsc build:
   - ``cd ..``
   - ``./gsc build --insecure-args ubuntu18.04-tensorflow-resnet50 test/ubuntu18.04-tensorflow.manifest``

6. Sign the graphenized Docker image using gsc sign-image: ``./gsc sign-image ubuntu18.04-tensorflow-resnet50 enclave-key.pem``

7. To run inference on GSC:<br>
``docker run --device=/dev/sgx_enclave --cpuset-cpus="0-35" --env OMP_NUM_THREADS=36 --env KMP_AFFINITY=granularity=fine,noverbose,compact,1,0 gsc-ubuntu18.04-tensorflow-resnet50 models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=128 --warmup-steps=50 --steps=500``
> **NOTE**: If an out-of-memory (OOM) error occurs, pass the option ``--env TF_MKL_ALLOC_MAX_BYTES=34359738368`` to the docker run command.
8. To run inference in a native container:<br>
``docker run --cpuset-cpus="0-35" --env OMP_NUM_THREADS=36 --env KMP_AFFINITY=granularity=fine,noverbose,compact,1,0 ubuntu18.04-tensorflow-resnet50 models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=128 --warmup-steps=50 --steps=500``

9. The above commands are for a 36-core system. Set the following options according to your system for optimal performance.
- OMP_NUM_THREADS='Core(s) per socket'
- --cpuset-cpus to 'Core(s) per socket'
- num-intra-threads='Core(s) per socket'
>**NOTE** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'``

## Performance considerations
- The enclave preheat manifest option pre-faults the enclave memory and moves the performance penalty to graphene-sgx startup (before the workload starts executing). To use the preheat option, add ``sgx.preheat_enclave = 1`` to the manifest template.
- TCMalloc and mimalloc are memory allocator libraries from Google and Microsoft, respectively, that can significantly improve performance depending on the workload. Only one of these allocators can be used at a time.
- TCMalloc (please update the binary location and name if different from the default)
    - Install TCMalloc: ``sudo apt-get install google-perftools``
    - Add these lines to the manifest template:<br>
      ``loader.env.LD_PRELOAD = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"``<br>
      ``sgx.trusted_files.libtcmalloc = "file:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"``<br>
      ``sgx.trusted_files.libunwind = "file:/usr/lib/x86_64-linux-gnu/libunwind.so.8"``
    - Save the template and rebuild.
- mimalloc (please update the binary location and name if different from the default)
    - Install mimalloc using the steps from https://github.com/microsoft/mimalloc (a build sketch follows this list).
    - Add these lines to the manifest template:<br>
      ``loader.env.LD_PRELOAD = "/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"``<br>
      ``sgx.trusted_files.libmimalloc = "file:/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"``
    - Save the template and rebuild.
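For reference, a typical mimalloc build and install looks roughly like the following (a sketch based on the upstream README at the time of writing; the installed path, e.g. ``/usr/local/lib/mimalloc-1.7``, depends on the mimalloc version):

```sh
git clone https://github.com/microsoft/mimalloc
cd mimalloc
mkdir -p out/release && cd out/release
cmake ../..        # configure an optimized release build
make               # build libmimalloc.so
sudo make install  # install into a version-specific directory under /usr/local/lib
```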
49 changes: 49 additions & 0 deletions Examples/tensorflow/ResNet50/Makefile
@@ -0,0 +1,49 @@
# ResNet50 sample for Tensorflow

GRAPHENEDIR ?= ../../..
SGX_SIGNER_KEY ?= $(GRAPHENEDIR)/Pal/src/host/Linux-SGX/signer/enclave-key.pem

include $(GRAPHENEDIR)/Scripts/Makefile.configs

ifeq ($(DEBUG),1)
GRAPHENE_LOG_LEVEL = debug
else
GRAPHENE_LOG_LEVEL = error
endif

.PHONY: all collateral
all: python.manifest
ifeq ($(SGX),1)
all: python.manifest.sgx python.sig python.token
endif

collateral:
test -d models || git clone https://github.com/IntelAI/models.git
test -f resnet50v1_5_int8_pretrained_model.pb || wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/resnet50v1_5_int8_pretrained_model.pb

python.manifest: python.manifest.template collateral
graphene-manifest \
-Dlog_level=$(GRAPHENE_LOG_LEVEL) \
-Darch_libdir=$(ARCH_LIBDIR) \
-Dentrypoint=$(realpath $(shell sh -c "command -v python3")) \
-Dpythondistpath=$(PYTHONDISTPATH) \
$< >$@

python.manifest.sgx: python.manifest
graphene-sgx-sign \
--key $(SGX_SIGNER_KEY) \
--manifest python.manifest \
--output $@

python.sig: python.manifest.sgx

python.token: python.sig
graphene-sgx-get-token --output $@ --sig $<

.PHONY: clean
clean:
$(RM) *.manifest *.manifest.sgx *.token *.sig

.PHONY: distclean
distclean: clean
$(RM) -r models/ resnet50v1_5_int8_pretrained_model.pb