Merge remote-tracking branch 'origin/master' into mozga-intel/full_pa…

…th_include
apache · Jan 24, 2022 · 95ae587 · 95ae587
2 parents a9c5b0e + 69e6c04
commit 95ae587
Show file tree

Hide file tree

Showing 73 changed files with 1,449 additions and 1,302 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -102,6 +102,32 @@ message(STATUS "CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}")
 
 message(STATUS "CMAKE_SYSTEM_NAME ${CMAKE_SYSTEM_NAME}")
 
+# Record the Git branch and commit of the source tree as the MXNET_BRANCH and
+# MXNET_COMMIT_HASH compile definitions. Both default to "Unavailable" so the
+# macros are always defined, including when Git is not installed or a
+# rev-parse call fails (for example when the sources are not a Git checkout).
+set(GIT_BRANCH "Unavailable")
+set(GIT_COMMIT "Unavailable")
+find_package(Git QUIET)
+if(GIT_FOUND)
+  execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse --abbrev-ref HEAD
+    WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_BRANCH_OUTPUT
+    RESULT_VARIABLE BRANCH_FAILED
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+  )
+  execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD
+    WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_COMMIT_OUTPUT
+    RESULT_VARIABLE COMMIT_FAILED
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+  )
+  # RESULT_VARIABLE holds the exit status: 0 (false in if()) means success.
+  if(NOT BRANCH_FAILED)
+    set(GIT_BRANCH "${GIT_BRANCH_OUTPUT}")
+  endif()
+  if(NOT COMMIT_FAILED)
+    set(GIT_COMMIT "${GIT_COMMIT_OUTPUT}")
+  endif()
+endif()
+add_compile_definitions(
+  MXNET_BRANCH="${GIT_BRANCH}"
+  MXNET_COMMIT_HASH="${GIT_COMMIT}"
+)
+
 if(USE_TVM_OP)
   add_definitions(-DMXNET_USE_TVM_OP=1)
 endif()

diff --git a/KEYS b/KEYS
diff --git a/README.md b/README.md
@@ -68,6 +68,7 @@ Contents
 
 What's New
 ----------
+* [1.9.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.9.0) - MXNet 1.9.0 Release.
 * [1.8.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.8.0) - MXNet 1.8.0 Release.
 * [1.7.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.7.0) - MXNet 1.7.0 Release.
 * [1.6.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.6.0) - MXNet 1.6.0 Release.

diff --git a/benchmark/python/dnnl/fc_add.py b/benchmark/python/dnnl/fc_add.py
@@ -0,0 +1,164 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import time
+import gc
+import sys
+import mxnet as mx
+from mxnet.gluon import nn
+from mxnet.contrib import quantization
+
+#shape, num_hidden:
+sizes = [
+    ((  1, 224),   512),
+    ((  1, 224),  4096),
+    (( 16, 1024), 1024),
+    (( 32, 4096), 1024),
+    (( 32, 4096), 4096),
+    ((512,  512), 4096)]
+
+rounds = 1000
+warmup = 10
+
+test_header = "--no_test_header" not in sys.argv
+table_header = "--no_table_header" not in sys.argv
+table_left_colums = "--no_size_column" not in sys.argv
+dump_graph = "--dump_graph" in sys.argv
+
+def dump_graph_fn(net, postfix):
+    if dump_graph:
+        net.export("/tmp/fc_add_" + postfix)
+
+def operator_string(elemwise_add):
+    return 'elemwise_add' if elemwise_add else 'npi_add'
+
+def print_header(header):
+    print("\n")
+    print(header if test_header else "", "\n")
+    if table_header:
+        if table_left_colums:
+            print("|    Shape    | Hidden | Mean [ms] |" )
+            print("|------------:|-------:|----------:|" )
+        else:
+            print(" Mean [ms] |" )
+            print("----------:|" )
+
+def print_value(shape, hidden, mean):
+    if table_left_colums:
+        print("| ({:4},{:4}) | {:6} | {:9.3f} |".format(shape[0], shape[1], hidden, mean))
+    else:
+        print(" {:9.3f} |".format(mean))
+
+
+def measure(net, data0, data1, data2, shape, nhid):
+    mx.nd.waitall()
+    gc.collect()
+    gc.disable()
+    for i in range(rounds + warmup):
+        if i == warmup:
+            start_time = time.time()
+        o = net(data0, data1, data2)
+        o.wait_to_read()
+    end_time = time.time()
+    run_time = (end_time - start_time)
+    print_value(shape, nhid, 1000 * run_time / rounds)
+    gc.enable()
+
+
+class FCWithSum(nn.HybridBlock):
+    def __init__(self, num_in, num_hidden, elemwise_add, **kwargs):
+        super(FCWithSum, self).__init__(**kwargs)
+        self.fc0 = nn.Dense(units=num_hidden, in_units=num_in)
+        self.fc1 = nn.Dense(units=num_hidden)
+        self.elemwise_add = elemwise_add
+
+    def forward(self, data0, data1, data2):
+        _fc0 = self.fc0(data0)
+        _fc1 = self.fc1(data1)
+        if  self.elemwise_add:
+            _sum0 = mx.nd.elemwise_add(data2.as_nd_ndarray(), _fc0.as_nd_ndarray()).as_np_ndarray()
+            _sum1 = mx.nd.elemwise_add(_fc1.as_nd_ndarray(), _sum0.as_nd_ndarray()).as_np_ndarray()
+        else:
+            _sum0 = data2 + _fc0
+            _sum1 = _fc1 + _sum0
+        return _sum1
+
+def benchmark_float(elemwise_add):
+    header = operator_string(elemwise_add) + ', float'
+    print_header(header)
+    for shape, nhid in sizes:
+        net = FCWithSum(shape[1], nhid, elemwise_add)
+        net.initialize()
+        net.hybridize(static_alloc=True, static_shape=True)
+        data0 = mx.np.random.uniform(size=shape, low=-1.0, high=1.0)
+        data1 = mx.np.random.uniform(size=shape, low=-1.0, high=1.0)
+        shape2 = (shape[0], nhid)
+        data2 = mx.np.random.uniform(size=shape2, low=-1.0, high=1.0)
+        net.optimize_for(data0, data1, data2, backend='ONEDNN')
+        measure(net, data0, data1, data2, shape, nhid)
+    dump_graph_fn(net, operator_string(elemwise_add) + '_float')
+
+class CalibIter(mx.io.DataIter):
+    def __init__(self, batch, data_shape, batch_size):
+        super(CalibIter, self).__init__(batch_size)
+        self.label_shape = (batch_size,)
+        self.data_shape = data_shape
+        if isinstance(data_shape, tuple):
+            self.provide_data = [('data', data_shape)]
+        else:
+            self.provide_data = data_shape
+        self.provide_label = []
+        self.batch = batch
+    def __iter__(self):
+        yield self.batch
+
+def benchmark_int8(quantize_mode, quantize_granularity, elemwise_add):
+    header = operator_string(elemwise_add) + ', mode = ' + quantize_mode + \
+             ', granularity = ' + quantize_granularity
+    print_header(header)
+    for shape, nhid in sizes:
+        net = FCWithSum(shape[1], nhid, elemwise_add)
+        net.initialize()
+        net.hybridize(static_alloc=True, static_shape=True)
+        data0 = mx.np.random.uniform(size=shape, low=-1.0, high=1.0)
+        data1 = mx.np.random.uniform(size=shape, low=-1.0, high=1.0)
+        shape2 = (shape[0], nhid)
+        data2 = mx.np.random.uniform(size=shape2, low=-1.0, high=1.0)
+        data = mx.gluon.data.ArrayDataset(data0, data1, data2)
+        calib_data = mx.gluon.data.DataLoader(data, batch_size=1)
+        net = quantization.quantize_net(net,
+                                        device=mx.cpu(),
+                                        exclude_layers=None,
+                                        exclude_operators=None,
+                                        calib_mode='naive',
+                                        calib_data=calib_data,
+                                        num_calib_batches=1,
+                                        quantize_mode=quantize_mode,
+                                        quantize_granularity=quantize_granularity
+                                        )
+        net.hybridize(static_alloc=True, static_shape=True)
+        measure(net, data0, data1, data2, shape, nhid)
+    dump_graph_fn(net, operator_string(elemwise_add) + \
+                    '_' + str(quantize_mode) + '_' + str(quantize_granularity))
+
+for elemwise_add in [True, False]:
+    benchmark_float(elemwise_add)
+
+for quantize_mode in ['smart', 'full']:
+    for quantize_granularity in ['tensor-wise', 'channel-wise']:
+        for elemwise_add in [True, False]:
+            benchmark_int8(quantize_mode, quantize_granularity, elemwise_add)
diff --git a/benchmark/python/dnnl/run.sh b/benchmark/python/dnnl/run.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Script for running a python benchmark with the OMP parameters properly set for it
+
+check_parametrs() {
+ 	if [ "$#" -eq 0 ] ; then
+		echo "Please give python script to run as parameter."
+		echo "Optionally you can give number of threads to use and python scripts parameters:"
+		echo "    `basename "$0"`  [num_threads] python_script [python script parameters]"
+		exit
+	fi
+}
+
+check_parametrs $@
+
+NUM_SOCKET=`lscpu | grep 'Socket(s)' | awk '{print $NF}'`
+CORES_PER_SOCKET=`lscpu | grep 'Core(s) per socket' | awk '{print $NF}'`
+NUM_CORES=$((CORES_PER_SOCKET * NUM_SOCKET))
+
+integer_reg='^[0-9]+$'
+if [[ $1 =~ $integer_reg ]] ; then
+	if (($1 > $NUM_CORES)); then
+		echo >&2
+		echo "WARNING: given number of threads = $1" \
+			" is greater than number of physical cores = $NUM_CORES." >&2
+		echo >&2
+	fi
+	NUM_CORES=$1
+	shift
+	check_parametrs $@
+fi
+
+CORES={0}:${NUM_CORES}:1
+
+INSTRUCTION="OMP_NUM_THREADS=${NUM_CORES} OMP_PROC_BIND=TRUE OMP_PLACES=${CORES} python3 -u $@"
+echo $INSTRUCTION >&2
+eval $INSTRUCTION
diff --git a/benchmark/python/dnnl/run_per_thread.sh b/benchmark/python/dnnl/run_per_thread.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Script for running python benchmark against number of used OMP threads
+
+
+# Print the usage text to stderr and terminate with a non-zero status.
+# The python script itself is mandatory: the script calls this function
+# whenever no python script is left on the command line.
+help_and_exit() {
+	{
+		echo "Usage:"
+		echo "    $(basename "$0")  [start_num_threads step_num_threads end_num_threads] python_script [python script parameters]"
+		echo "Number of threads range parameters and python script parameters are optional."
+	} >&2
+	exit 1
+}
+
+if [ "$#" -eq 0 ] ; then
+	help_and_exit
+fi
+
+# LC_ALL=C keeps the lscpu field labels in English so the greps below match
+# regardless of the user's locale.
+NUM_SOCKET=$(LC_ALL=C lscpu | grep 'Socket(s)' | awk '{print $NF}')
+CORES_PER_SOCKET=$(LC_ALL=C lscpu | grep 'Core(s) per socket' | awk '{print $NF}')
+NUM_CORES=$((CORES_PER_SOCKET * NUM_SOCKET))
+
+# Default range: 1, 2, ..., number of physical cores.
+NT_START=1
+NT_STEP=1
+NT_END=$NUM_CORES
+
+integer_reg='^[0-9]+$'
+# The step may carry at most one leading sign.
+signed_integer_reg='^[+-]?[0-9]+$'
+if [[ $1 =~ $integer_reg ]] ; then
+	if [[ $2 =~ $signed_integer_reg ]] && [[ $3 =~ $integer_reg ]]; then
+		NT_START=$1
+		NT_STEP=$2
+		NT_END=$3
+		shift 3
+		if [ "$#" -eq 0 ] ; then
+			help_and_exit
+		fi
+	else
+		echo "Provide 3 numbers for threads range: start, step and the end."
+		help_and_exit
+	fi
+fi
+
+NT_SEQUENCE=$(seq "$NT_START" "$NT_STEP" "$NT_END")
+if [ -z "$NT_SEQUENCE" ]; then
+	echo "Given threads range produce empy sequence." >&2
+	help_and_exit
+else
+	echo "Start python script $1 for following number of threads:"  >&2
+	# Unquoted on purpose: prints the sequence on a single line.
+	echo $NT_SEQUENCE  >&2
+fi
+
+RUN_SCRIPT="$(dirname "$0")/run.sh"
+# Per-run private directory: concurrent runs cannot overwrite each other's
+# partial results, and the files are removed when the script exits.
+TMP_DIR=$(mktemp -d) || exit 1
+trap 'rm -rf "$TMP_DIR"' EXIT
+TMP_FILES=()
+for NT in $NT_SEQUENCE;
+do
+	TMP_FILE="${TMP_DIR}/_result_${NT}.txt"
+	echo  1>"${TMP_FILE}"
+	if [[ $NT -eq $NT_START ]]; then
+		# First column keeps the test headers and the size columns.
+		echo "NUM_THREADS = $NT" 1>>"${TMP_FILE}"
+		"$RUN_SCRIPT" "$NT" "$@" 1>>"${TMP_FILE}"
+	else
+		echo " $NT" 1>>"${TMP_FILE}"
+		"$RUN_SCRIPT" "$NT" "$@" --no_size_column --no_test_header 1>>"${TMP_FILE}"
+	fi
+	TMP_FILES+=("${TMP_FILE}")
+done
+# Join the per-thread-count outputs side by side into one table.
+paste -d "" "${TMP_FILES[@]}"
diff --git a/ci/build_windows.py b/ci/build_windows.py
@@ -118,7 +118,7 @@ class BuildFlavour(Enum):
         '-DUSE_BLAS=open '
         '-DUSE_LAPACK=ON '
         '-DUSE_DIST_KVSTORE=OFF '
-        '-DMXNET_CUDA_ARCH="5.2" '
+        '-DMXNET_CUDA_ARCH="5.2 7.5" '
         '-DCMAKE_BUILD_TYPE=Release')
 
     , 'WIN_GPU_ONEDNN': (
@@ -131,7 +131,7 @@ class BuildFlavour(Enum):
         '-DUSE_BLAS=open '
         '-DUSE_LAPACK=ON '
         '-DUSE_DIST_KVSTORE=OFF '
-        '-DMXNET_CUDA_ARCH="5.2" '
+        '-DMXNET_CUDA_ARCH="5.2 7.5" '
         '-DUSE_ONEDNN=ON '
         '-DCMAKE_BUILD_TYPE=Release')
 

diff --git a/ci/docker/Dockerfile.build.ubuntu b/ci/docker/Dockerfile.build.ubuntu
@@ -91,7 +91,8 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
         libb2-dev \
         libzstd-dev \
         gfortran && \
-    rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/* && \
+    add-apt-repository -r "deb https://apt.repos.intel.com/oneapi all main"
 
 # Build OpenBLAS from source
 RUN export LIBRARY_PATH=$LIBRARY_PATH:/usr/lib/gcc/x86_64-linux-gnu/7/ && \

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
@@ -1329,7 +1329,7 @@ build_docs() {
 
     # copy the full site for this version to versions folder
     mkdir -p html/versions/master
-    for f in 404.html api assets blog community ecosystem features feed.xml get_started index.html; do
+    for f in 404.html api assets blog community ecosystem features trusted_by feed.xml get_started index.html; do
         cp -r html/$f html/versions/master/
     done
 
@@ -1355,7 +1355,6 @@ build_docs_beta() {
 push_docs() {
     folder_name=$1
     set -ex
-    pip3 install --user awscli
     export PATH=~/.local/bin:$PATH
     pushd docs/_build
     tar -xzf full_website.tgz --strip-components 1
@@ -1471,7 +1470,6 @@ cd_pypi_publish() {
 
 cd_s3_publish() {
     set -ex
-    pip3 install --upgrade --user awscli
     filepath=$(readlink -f wheel_build/dist/*.whl)
     filename=$(basename $filepath)
     variant=$(echo $filename | cut -d'-' -f1 | cut -d'_' -f2 -s)