gramineproject · Satya1493 · Jul 9, 2021 · Jul 9, 2021 · Jul 12, 2021 · Jul 20, 2021
diff --git a/Examples/tensorflow/BERT/Makefile b/Examples/tensorflow/BERT/Makefile
@@ -0,0 +1,75 @@
+# BERT sample for Tensorflow
+#
+# Generates python.manifest from python.manifest.template and, when SGX=1, also the signed
+# manifest, signature and token. The `collateral` target fetches the model sources,
+# checkpoints and dataset that the workload reads at run time.
+#
+# Inputs: DEBUG=1 selects log level "debug" (otherwise "error"); PYTHONDISTPATH is handed to
+# the template as `pythondistpath`; ARCH_LIBDIR is not set here (presumably it comes from
+# Makefile.configs).
+
+GRAPHENEDIR ?= ../../..
+SGX_SIGNER_KEY ?= $(GRAPHENEDIR)/Pal/src/host/Linux-SGX/signer/enclave-key.pem
+
+include $(GRAPHENEDIR)/Scripts/Makefile.configs
+
+ifeq ($(DEBUG),1)
+GRAPHENE_LOG_LEVEL = debug
+else
+GRAPHENE_LOG_LEVEL = error
+endif
+
+.PHONY: all collateral
+all: python.manifest
+ifeq ($(SGX),1)
+all: python.manifest.sgx python.sig python.token
+endif
+
+BERT_DATASET = https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip
+SQUAD_DATASET = https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
+CHECKPOINTS = https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/bert_large_checkpoints.zip
+BERT_INT8_MODEL = https://storage.googleapis.com/intel-optimized-tensorflow/models/r2.5-icx-b631821f/asymmetric_per_channel_bert_int8.pb
+
+# Every step is guarded by `test` (or `command -v`), so anything that is already present is
+# not installed, cloned, downloaded or extracted again.
+collateral:
+	command -v unzip >/dev/null 2>&1 || apt-get install -y unzip
+	test -d models || git clone https://github.com/IntelAI/models.git
+	mkdir -p data
+	test -f data/wwm_uncased_L-24_H-1024_A-16.zip || wget $(BERT_DATASET) -P data/
+	test -d data/wwm_uncased_L-24_H-1024_A-16 || unzip data/wwm_uncased_L-24_H-1024_A-16.zip -d data
+	test -f data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json || wget $(SQUAD_DATASET) -P data/wwm_uncased_L-24_H-1024_A-16
+	test -f data/bert_large_checkpoints.zip || wget $(CHECKPOINTS) -P data/
+	test -d data/bert_large_checkpoints || unzip data/bert_large_checkpoints.zip -d data
+	test -f data/asymmetric_per_channel_bert_int8.pb || wget $(BERT_INT8_MODEL) -P data/
+
+# `collateral` is an order-only prerequisite: it still runs before the manifest is
+# generated, but as a phony target it must not force python.manifest (and everything
+# derived from it) to be regenerated on every invocation.
+python.manifest: python.manifest.template | collateral
+	graphene-manifest \
+		-Dlog_level=$(GRAPHENE_LOG_LEVEL) \
+		-Darch_libdir=$(ARCH_LIBDIR) \
+		-Dentrypoint=$(realpath $(shell sh -c "command -v python3")) \
+		-Dpythondistpath=$(PYTHONDISTPATH) \
+		$< >$@
+
+python.manifest.sgx: python.manifest
+	graphene-sgx-sign \
+		--key $(SGX_SIGNER_KEY) \
+		--manifest $< \
+		--output $@
+
+# NOTE(review): no recipe; python.sig is presumably written by the signing step above - confirm.
+python.sig: python.manifest.sgx
+
+python.token: python.sig
+	graphene-sgx-get-token -output $@ -sig $<
+
+.PHONY: clean
+clean:
+	$(RM) *.manifest *.manifest.sgx *.token *.sig
+
+.PHONY: distclean
+distclean: clean
+	$(RM) -r models/ data/
diff --git a/Examples/tensorflow/BERT/python.manifest.template b/Examples/tensorflow/BERT/python.manifest.template
@@ -0,0 +1,79 @@
+# This manifest was tested on Ubuntu 18.04 with python3.6.
+
+libos.entrypoint = "{{ entrypoint }}"
+loader.preload = "file:{{ graphene.libos }}"
+
+# Graphene log level (filled in from the `log_level` template variable)
+loader.log_level = "{{ log_level }}"
+
+# Read application arguments directly from the command line. Don't use this in production!
+loader.insecure__use_cmdline_argv = 1
+
+# Propagate environment variables from the host. Don't use this in production!
+loader.insecure__use_host_env = 1
+
+# Disable address space layout randomization. Don't use this in production!
+loader.insecure__disable_aslr = 1
+
+# Set the library search path explicitly - overwrites the LD_LIBRARY_PATH environment variable
+loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}"
+
+# Additional memory for Graphene's internal use
+loader.pal_internal_mem_size = "512M"
+
+# Default glibc files, mounted from Graphene's Runtime directory
+fs.mount.lib.type = "chroot"
+fs.mount.lib.path = "/lib"
+fs.mount.lib.uri = "file:{{ graphene.runtimedir() }}"
+
+# More libraries required by TensorFlow (each mount below uses the same location for `path` and `uri`)
+fs.mount.lib2.type = "chroot"
+fs.mount.lib2.path = "{{ arch_libdir }}"
+fs.mount.lib2.uri = "file:{{ arch_libdir }}"
+
+fs.mount.usr.type = "chroot"
+fs.mount.usr.path = "/usr"
+fs.mount.usr.uri = "file:/usr"
+
+fs.mount.pyhome.type = "chroot"
+fs.mount.pyhome.path = "{{ python.stdlib }}"
+fs.mount.pyhome.uri = "file:{{ python.stdlib }}"
+
+fs.mount.pydisthome.type = "chroot"
+fs.mount.pydisthome.path = "{{ python.distlib }}"
+fs.mount.pydisthome.uri = "file:{{ python.distlib }}"
+
+fs.mount.pydistpath.type = "chroot"
+fs.mount.pydistpath.path = "{{ pythondistpath }}"
+fs.mount.pydistpath.uri = "file:{{ pythondistpath }}"
+
+fs.mount.tmp.type = "chroot"
+fs.mount.tmp.path = "/tmp"
+fs.mount.tmp.uri = "file:/tmp"
+
+fs.mount.etc.type = "chroot"
+fs.mount.etc.path = "/etc"
+fs.mount.etc.uri = "file:/etc"
+
+# SGX general options
+sgx.enclave_size = "32G"
+sgx.thread_num = 256
+sgx.preheat_enclave = 1
+sgx.nonpie_binary = 1
+
+# SGX trusted files
+sgx.trusted_files.runtime = "file:{{ graphene.runtimedir() }}/"
+sgx.trusted_files.arch_libdir = "file:{{ arch_libdir }}/"
+sgx.trusted_files.usr_arch_libdir = "file:/usr/{{ arch_libdir }}/"
+sgx.trusted_files.python = "file:{{ entrypoint }}"
+sgx.trusted_files.pyhome = "file:{{ python.stdlib }}"
+sgx.trusted_files.pydisthome = "file:{{ python.distlib }}"
+sgx.trusted_files.pydistpath = "file:{{ pythondistpath }}"
+
+# SGX allowed files
+sgx.allowed_files.tmp = "file:/tmp/"
+sgx.allowed_files.etc = "file:/etc/"
+sgx.allowed_files.output = "file:output/"
+sgx.allowed_files.scripts = "file:models/"
+sgx.allowed_files.dataDir = "file:data/"
diff --git a/Examples/tensorflow/BERT/root/.keras/keras.json b/Examples/tensorflow/BERT/root/.keras/keras.json
@@ -0,0 +1,6 @@
+{
+    "floatx": "float32",
+    "epsilon": 1e-07,
+    "backend": "tensorflow",
+    "image_data_format": "channels_last"
+}
diff --git a/Examples/tensorflow/README.md b/Examples/tensorflow/README.md
@@ -0,0 +1,170 @@
+## Inference on TensorFlow BERT and ResNet50 models
+This directory contains steps and artifacts to run inference with TensorFlow BERT and ResNet50\
+sample workloads on Graphene. Specifically, both these examples use pre-trained models to run inference.\
+These examples were tested on Ubuntu 18.04 using the packaged version of Python 3.6.
+
+### Bidirectional Encoder Representations from Transformers (BERT):
+BERT is a method of pre-training language representations and then using the trained model for downstream\
+NLP tasks like 'question answering'. BERT is an unsupervised, deeply bidirectional system for pre-training NLP.\
+In this BERT sample, we use 'BERT-Large, Uncased (Whole Word Masking)' model and perform int8 inference.\
+More details about BERT can be found at https://github.com/google-research/bert.
+
+### Residual Network (ResNet):
+ResNet50 is a convolutional neural network that is 50 layers deep.\
+In this ResNet50(v1.5) sample, we use a pre-trained model and perform int8 inference.\
+More details about ResNet50 can be found at https://github.com/IntelAI/models/tree/icx-launch-public/benchmarks/image_recognition/tensorflow/resnet50v1_5.
+
+
+## Pre-System setting
+Linux systems have a CPU frequency scaling governor that scales the CPU frequency\
+to achieve the best performance or to save power, depending on the requirement.
+To achieve the best performance, please set the CPU frequency scaling governor to performance mode (run the following as root):
+
+``for ((i=0; i<$(nproc); i++)); do echo 'performance' > /sys/devices/system/cpu/cpu$i/cpufreq/scaling_governor; done``
+
+## Pre-requisites
+- Install python3.6.
+- Upgrade pip/pip3.
+- Install tensorflow using ``pip install intel-tensorflow-avx512==2.4.0`` or by downloading the whl\
+package from https://pypi.org/project/intel-tensorflow-avx512/2.4.0/#files.
+
+## Build BERT or ResNet50 samples
+- To build BERT sample, do ``cd BERT`` or to build ResNet50 sample, do ``cd ResNet50``.
+- To clean the sample, do ``make clean``
+- To clean and remove downloaded models and datasets, do ``make distclean``
+- To build the non-SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/``
+- To build the SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/ SGX=1``
+>**WARNING:** Building BERT sample downloads about 5GB of data.\
+>**NOTE:** Typically, path_to_python_dist_packages is '/usr/local/lib/python3.6/dist-packages',\
+but can change based on python's installation directory.
+
+## Run inference on BERT model
+- To run int8 inference on graphene-sgx(SGX version)
+```
+OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx \
+./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \
+--init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 \
+--vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
+--bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json \
+--predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json \
+--precision=int8 \
+--output_dir=output/bert-squad-output \
+--predict_batch_size=32 \
+--experimental_gelu=True \
+--optimized_softmax=True \
+--input_graph=data/asymmetric_per_channel_bert_int8.pb \
+--do_predict=True --mode=benchmark \
+--inter_op_parallelism_threads=1 \
+--intra_op_parallelism_threads=36
+```
+- To run int8 inference on graphene-direct(non-SGX version)
+```
+OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 \
+graphene-direct ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \
+--init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 \
+--vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
+--bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json \
+--predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json \
+--precision=int8 \
+--output_dir=output/bert-squad-output \
+--predict_batch_size=32 \
+--experimental_gelu=True \
+--optimized_softmax=True \
+--input_graph=data/asymmetric_per_channel_bert_int8.pb \
+--do_predict=True \
+--mode=benchmark \
+--inter_op_parallelism_threads=1 \
+--intra_op_parallelism_threads=36
+```
+- To run int8 inference on native baremetal(outside graphene)
+```
+OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 \
+models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \
+--init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 \
+--vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
+--bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json \
+--predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json \
+--precision=int8 --output_dir=output/bert-squad-output \
+--predict_batch_size=32 \
+--experimental_gelu=True \
+--optimized_softmax=True \
+--input_graph=data/asymmetric_per_channel_bert_int8.pb \
+--do_predict=True \
+--mode=benchmark \
+--inter_op_parallelism_threads=1 \
+--intra_op_parallelism_threads=36
+```
+- Above commands are for a 36 core system. Please set the following options accordingly for optimal performance.
+    - OMP_NUM_THREADS='Core(s) per socket'
+    - the CPU list passed to ``taskset -c``, so that it covers 'Core(s) per socket' CPUs (``0-35`` above)
+    - intra_op_parallelism_threads='Core(s) per socket'
+    - If hyperthreading is enabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0``
+    - If hyperthreading is disabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact``
+>**NOTE:** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'``\
+> OMP_NUM_THREADS sets the maximum number of threads to use for OpenMP parallel regions. \
+> KMP_AFFINITY binds OpenMP threads to physical processing units.
+
+## Run inference on ResNet50 model
+- To run inference on graphene-sgx(SGX version)
+```
+OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx \
+./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \
+--input-graph=resnet50v1_5_int8_pretrained_model.pb \
+--num-inter-threads=1 \
+--num-intra-threads=36 \
+--batch-size=32 \
+--warmup-steps=50 \
+--steps=500
+```
+- To run inference on graphene-direct(non-SGX version)
+```
+OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-direct \
+./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \
+--input-graph=resnet50v1_5_int8_pretrained_model.pb \
+--num-inter-threads=1 \
+--num-intra-threads=36 \
+--batch-size=32 \
+--warmup-steps=50 \
+--steps=500
+```
+- To run inference on native baremetal(outside graphene)
+```
+OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 \
+models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \
+--input-graph=resnet50v1_5_int8_pretrained_model.pb \
+--num-inter-threads=1 \
+--num-intra-threads=36 \
+--batch-size=32 \
+--warmup-steps=50 \
+--steps=500
+```
+- Above commands are for a 36 core system. Please set the following options accordingly for optimal performance.
+    - OMP_NUM_THREADS='Core(s) per socket'
+    - the CPU list passed to ``taskset -c``, so that it covers 'Core(s) per socket' CPUs (``0-35`` above)
+    - num-intra-threads='Core(s) per socket'
+    - If hyperthreading is enabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0``
+    - If hyperthreading is disabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact``
+    - The options batch-size, warmup-steps and steps can be varied.
+>**NOTE:** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'``\
+> OMP_NUM_THREADS sets the maximum number of threads to use for OpenMP parallel regions.\
+> KMP_AFFINITY binds OpenMP threads to physical processing units.
+
+## Performance considerations
+- The preheat manifest option pre-faults the enclave memory and moves the performance penalty to\
+the graphene-sgx invocation (before the workload starts execution).\
+To use the preheat option, add ``sgx.preheat_enclave = 1`` to the manifest template.
+- TCMalloc and mimalloc are memory allocator libraries from Google and Microsoft, respectively, that can\
+significantly improve performance depending on the workload. Only one of these allocators can be used at a time.
+  - TCMalloc (Please update the binary location and name if different from default)
+    - Install tcmalloc : ``sudo apt-get install google-perftools``
+    - Add these in the manifest template
+        ```loader.env.LD_PRELOAD = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"```
+        ```sgx.trusted_files.libtcmalloc = "file:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"```
+        ```sgx.trusted_files.libunwind = "file:/usr/lib/x86_64-linux-gnu/libunwind.so.8"```
+    - Save the template and rebuild.
+  - mimalloc (Please update the binary location and name if different from default)
+    - Install mimalloc using the steps from https://github.com/microsoft/mimalloc
+    - Add these in the manifest template
+        ```loader.env.LD_PRELOAD = "/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"```
+        ```sgx.trusted_files.libmimalloc = "file:/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"```
+    - Save the template and rebuild.
diff --git a/Examples/tensorflow/ResNet50/Makefile b/Examples/tensorflow/ResNet50/Makefile
@@ -0,0 +1,60 @@
+# ResNet50 sample for Tensorflow
+#
+# Generates python.manifest from python.manifest.template and, when SGX=1, also the signed
+# manifest, signature and token. The `collateral` target fetches the model sources and the
+# pre-trained int8 model.
+
+GRAPHENEDIR ?= ../../..
+SGX_SIGNER_KEY ?= $(GRAPHENEDIR)/Pal/src/host/Linux-SGX/signer/enclave-key.pem
+
+include $(GRAPHENEDIR)/Scripts/Makefile.configs
+
+ifeq ($(DEBUG),1)
+GRAPHENE_LOG_LEVEL = debug
+else
+GRAPHENE_LOG_LEVEL = error
+endif
+
+.PHONY: all collateral
+all: python.manifest
+ifeq ($(SGX),1)
+all: python.manifest.sgx python.sig python.token
+endif
+
+RESNET50_INT8_MODEL = https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/resnet50v1_5_int8_pretrained_model.pb
+
+# Both steps are guarded by `test`, so an existing checkout or model file is left untouched.
+collateral:
+	test -d models || git clone https://github.com/IntelAI/models.git
+	test -f resnet50v1_5_int8_pretrained_model.pb || wget $(RESNET50_INT8_MODEL)
+
+# `collateral` is an order-only prerequisite: it still runs before the manifest is
+# generated, but as a phony target it must not force python.manifest (and everything
+# derived from it) to be regenerated on every invocation.
+python.manifest: python.manifest.template | collateral
+	graphene-manifest \
+		-Dlog_level=$(GRAPHENE_LOG_LEVEL) \
+		-Darch_libdir=$(ARCH_LIBDIR) \
+		-Dentrypoint=$(realpath $(shell sh -c "command -v python3")) \
+		-Dpythondistpath=$(PYTHONDISTPATH) \
+		$< >$@
+
+python.manifest.sgx: python.manifest
+	graphene-sgx-sign \
+		--key $(SGX_SIGNER_KEY) \
+		--manifest $< \
+		--output $@
+
+# NOTE(review): no recipe; python.sig is presumably written by the signing step above - confirm.
+python.sig: python.manifest.sgx
+
+python.token: python.sig
+	graphene-sgx-get-token -output $@ -sig $<
+
+.PHONY: clean
+clean:
+	$(RM) *.manifest *.manifest.sgx *.token *.sig
+
+.PHONY: distclean
+distclean: clean
+	$(RM) -r models/ resnet50v1_5_int8_pretrained_model.pb