diff --git a/docs/conf.py b/docs/conf.py
index 3b5c38dd23bc..d2a0ef0283f8 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -107,7 +107,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
-exclude_patterns = ['3rdparty', 'api/python/model.md', 'build_version_doc', 'README.md', 'tutorial_utils', 'virtualenv']
+exclude_patterns = ['3rdparty', 'api/python/model.md', 'build_version_doc', 'cpp_docs', 'python_docs', 'README.md', 'static_site', 'tutorial_utils', 'virtualenv']
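+# cpp_docs, python_docs and static_site are presumably built by their own
+# toolchains (e.g. the Doxyfile under docs/cpp_docs), so this Sphinx build
+# skips them.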
# The reST default role (used for this markup: `text`) to use for all documents.
diff --git a/docs/cpp_docs/Doxyfile b/docs/cpp_docs/Doxyfile
new file mode 100644
index 000000000000..463641a6bdab
--- /dev/null
+++ b/docs/cpp_docs/Doxyfile
@@ -0,0 +1,2370 @@
+# Doxyfile 1.8.8
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
+# for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME = "mxnet"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is included in
+# the documentation. The maximum height of the logo should not exceed 55 pixels
+# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo
+# to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = build/html
+
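+# Note: the relative paths in this file (OUTPUT_DIRECTORY above, INPUT and
+# EXCLUDE below) are resolved against the directory doxygen is started from,
+# so doxygen is presumably meant to be invoked from docs/cpp_docs, e.g.:
+#
+#   cd docs/cpp_docs && doxygen Doxyfile
+#
+# which would place the generated HTML under docs/cpp_docs/build/html.
+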
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise cause
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+#ALLOW_UNICODE_NAMES = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity): The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names, as on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that Rational Rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a
+# new page for each member. If set to NO, the documentation of a member will be
+# part of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines.
+
+ALIASES =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
+# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
+# Fortran. In the latter case the parser tries to guess whether the code is fixed
+# or free formatted code, this is the default for Fortran type files), VHDL. For
+# instance to make doxygen treat .inc files as Fortran files (default is PHP),
+# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without an extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+#MARKDOWN_SUPPORT = YES
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word
+# or globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+#AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen replace the get and set methods with a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class, and the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+#EXTRACT_PACKAGE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO these classes will be included in the various overviews. This option has
+# no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+#SHOW_GROUPED_MEMB_INC = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the
+# todo list. This list is created by putting \todo commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the
+# test list. This list is created by putting \test commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if ... \endif and \cond
+# ... \endcond blocks.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES the list
+# will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO doxygen will only warn about wrong or incomplete parameter
+# documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC = YES
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces.
+# Note: If this tag is empty the current directory is searched.
+
+INPUT = ../../include ../../src/common ../../cpp-package/include/mxnet-cpp
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank the
+# following patterns are tested: *.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
+# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
+# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
+# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
+# *.qsf, *.as and *.js.
+
+FILE_PATTERNS = *.h
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE = 3rdparty
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS = */test/* \
+ logging.h
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER ) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+#USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES, then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+#SOURCE_TOOLTIPS = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES, then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# compiled with the --with-libclang option.
+# The default value is: NO.
+
+#CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+#CLANG_OPTIONS =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# For the HTML output to be valid, the header file must include any scripts and
+# style sheets that doxygen needs, which depend on the configuration options
+# used (e.g. the setting GENERATE_TREEVIEW). It is highly recommended to start
+# with a default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra stylesheet files is of importance (e.g. the last
+# stylesheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+#HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the stylesheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries 1 will produce a full collapsed tree by default. 0 is a special value
+# representing an infinite number of entries and will result in a full expanded
+# tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+#HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler ( hhc.exe). If non-empty
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION =
+
+# The GENERATE_CHI flag controls whether a separate .chi index file is generated
+# ( YES) or included in the master .chm file ( NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated (
+# YES) or a normal table of contents ( NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using prerendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want to formulas look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+#MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = http://www.mathjax.org/mathjax
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+#MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option> or both).
+
+Why does this help? [Some research](https://papers.nips.cc/paper/7515-how-does-batch-normalization-help-optimization.pdf) has found that networks with normalization have a loss function that's easier to optimize using stochastic gradient descent. Other reasons are that it prevents saturation of activations and prevents certain features from dominating due to differences in scale.
+
+### Data Normalization
+
+One of the first applications of normalization is on the input data to the network. You can do this with the following steps:
+
+* **Step 1** is to calculate the mean and standard deviation of the entire training dataset. You'll usually want to do this for each channel separately. Sometimes you'll see normalization on images applied per pixel, but per channel is more common.
+* **Step 2** is to use these statistics to normalize each batch for training and for inference too (a rough sketch of both steps follows these bullets).
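+
+Below is a minimal sketch of both steps, using a random NCHW array as a stand-in for a real training set (in practice you would compute the statistics over your actual training images). The per-channel reductions use the same `axis=1, exclude=True` pattern that appears later in this tutorial.
+
+```python
+import mxnet as mx
+
+# Stand-in for a training set of 100 RGB images in NCHW format; replace with real data.
+train_images = mx.nd.random.uniform(shape=(100, 3, 32, 32))
+
+# Step 1: per-channel mean and standard deviation over the whole training set.
+channel_mean = train_images.mean(axis=1, exclude=True)                      # shape (3,)
+channel_std = ((train_images - channel_mean.reshape(1, -1, 1, 1))
+               .square().mean(axis=1, exclude=True).sqrt())                 # shape (3,)
+
+# Step 2: normalize any batch (training or inference) with these fixed statistics.
+batch = mx.nd.random.uniform(shape=(8, 3, 32, 32))
+batch = (batch - channel_mean.reshape(1, -1, 1, 1)) / channel_std.reshape(1, -1, 1, 1)
+print(channel_mean.asnumpy(), channel_std.asnumpy())
+```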
+
+Tip: A `BatchNorm` layer at the start of your network can have a similar effect (see 'Beta and Gamma' section for details on how this can be achieved). You won't need to manually calculate and keep track of the normalization statistics.
+
+Warning: You should calculate the normalization means and standard deviations using the training dataset only. Any leakage of information from your testing dataset will affect the reliability of your testing metrics.
+
+When using pre-trained models from the [Gluon Model Zoo](https://mxnet.incubator.apache.org/api/python/gluon/model_zoo.html) you'll usually see the normalization statistics used for training (i.e. statistics from step 1). You'll want to use these statistics to normalize your own input data for fine-tuning or inference with these models. Using `transforms.Normalize` is one way of applying the normalization, and this should be used in the `Dataset`.
+
+```python
+import mxnet as mx
+from mxnet.gluon.data.vision.transforms import Normalize
+
+image_int = mx.nd.random.randint(low=0, high=256, shape=(1,3,2,2))
+image_float = image_int.astype('float32')/255
+# the following normalization statistics are taken from gluon model zoo
+normalizer = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+image = normalizer(image_float)
+image
+```
+
+### Activation Normalization
+
+We don't have to limit ourselves to normalizing the inputs to the network. A similar idea can be applied inside the network too: we can normalize activations between certain layer operations. With deep neural networks, most of the convergence benefits described above come from this type of normalization.
+
+MXNet Gluon has 3 of the most commonly used normalization blocks: `BatchNorm`, `LayerNorm` and `InstanceNorm`. You can use them in networks just like any other MXNet Gluon block, and they are often placed after `Activation` blocks.
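+
+As a quick illustrative sketch (not from this tutorial's worked example, and with `example_net` as a hypothetical name), one of these blocks might be placed after an activation in a small network like this:
+
+```python
+import mxnet as mx
+from mxnet.gluon import nn
+
+example_net = nn.Sequential()
+example_net.add(nn.Conv2D(channels=16, kernel_size=3),
+                nn.Activation('relu'),
+                nn.BatchNorm(),   # normalize the post-activation values, per channel
+                nn.Dense(10))
+example_net.initialize()
+print(example_net(mx.nd.random.uniform(shape=(8, 3, 32, 32))).shape)
+```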
+
+Watch Out: Check the architecture of models carefully because sometimes the normalization is applied before the `Activation`.
+
+Advanced: all of the following methods begin by normalizing the input distribution (i.e. zero-centered with unit variance), but then shift it by a trainable parameter beta and scale it by a trainable parameter gamma. Overall the effect is changing the input distribution to have a mean of beta and a standard deviation of gamma, while still allowing the network to 'undo' the effect of the normalization if necessary.
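+
+In symbols (a rough summary, where $\mu$ and $\sigma^2$ stand for the statistics computed by the particular normalization method and $\epsilon$ is a small constant added for numerical stability):
+
+$$
+\hat{x} = \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}}, \qquad y = \gamma \hat{x} + \beta
+$$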
+
+## Batch Normalization
+
+Figure 1: `BatchNorm` on NCHW data | Figure 2: `BatchNorm` on NTC data
+- | -
+![](./imgs/NCHW_BN.png) | ![](./imgs/NTC_BN.png)
+(e.g. batch of images) using the default of `axis=1` | (e.g. batch of sequences) overriding the default with `axis=2` (or `axis=-1`)
+
+One of the most popular normalization techniques is Batch Normalization, usually called BatchNorm for short. We normalize the activations **across all samples in a batch** for each of the channels independently. See Figure 1. We calculate two batch (or local) statistics for every channel to perform the normalization: the mean and variance of the activations in that channel for all samples in a batch. And we use these to shift and scale respectively.
+
+Tip: we can use this at the start of a network to perform data normalization, although this is not exactly equivalent to the data normalization example seen above (that had fixed normalization statistics). With `BatchNorm` the normalization statistics depend on the batch, so could change each batch, and there can also be a post-normalization shift and scale.
+
+Warning: the estimates for the batch mean and variance can themselves have high variance when the batch size is small (or when the spatial dimensions of samples are small). This can lead to instability during training, and unreliable estimates for the global statistics.
+
+Warning: it seems that `BatchNorm` is better suited to convolutional networks (CNNs) than recurrent networks (RNNs). We expect the input distribution to the recurrent cell to change over time, so normalization over time doesn't work well. `LayerNorm` is better suited for this case. When you do *need* to use `BatchNorm` on sequential data, make sure the `axis` parameter is set correctly. With data in NTC format you should set `axis=2` (or `axis=-1` equivalently). See Figure 2.
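+
+For instance, a minimal sketch of the NTC case described in the warning above (assuming `mx` is imported as in the rest of this tutorial):
+
+```python
+seq_batch = mx.nd.random.uniform(shape=(4, 10, 8))  # (N, T, C) sequence data
+seq_bn = mx.gluon.nn.BatchNorm(axis=-1)             # normalize per channel, i.e. along the last axis
+seq_bn.initialize()
+print(seq_bn(seq_batch).shape)
+```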
+
+As an example, we'll apply `BatchNorm` to a batch of 2 samples, each with 2 channels, and both height and width of 2 (in NCHW format).
+
+
+```python
+data = mx.nd.arange(start=0, stop=2*2*2*2).reshape(2, 2, 2, 2)
+print(data)
+```
+
+With MXNet Gluon we can apply batch normalization with the `mx.gluon.nn.BatchNorm` block. It can be created and used just like any other MXNet Gluon block (such as `Conv2D`). Its input will typically be unnormalized activations from the previous layer, and the output will be the normalized activations ready for the next layer. Since we're using data in NCHW format we can use the default axis.
+
+
+```python
+net = mx.gluon.nn.BatchNorm()
+```
+
+We still need to initialize the block because it has a number of trainable parameters, as we'll see later on.
+
+
+```python
+net.initialize()
+```
+
+We can now run the network as we would during training (under `autograd.record` context scope).
+
+Remember: `BatchNorm` runs differently during training and inference. When training, the batch statistics are used for normalization. During inference, an exponentially smoothed average of the batch statistics observed during training is used instead.
+
+Warning: `BatchNorm` assumes the channel dimension is the 2nd in order (i.e. `axis=1`). You need to ensure your data has a channel dimension, and change the `axis` parameter of `BatchNorm` if it's not the 2nd dimension. A batch of greyscale images of shape `(100,32,32)` would not work, since the 2nd dimension is height and not channel. You'd need to add a channel dimension using `data.expand_dims(1)` in this case to give shape `(100,1,32,32)`.
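+
+For example, a quick sketch of the reshape mentioned in the warning above:
+
+```python
+grey_batch = mx.nd.random.uniform(shape=(100, 32, 32))  # greyscale images: no channel dimension
+grey_batch = grey_batch.expand_dims(1)                  # now (100, 1, 32, 32), channel at axis=1
+print(grey_batch.shape)
+```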
+
+
+```python
+with mx.autograd.record():
+ output = net(data)
+ loss = output.abs()
+loss.backward()
+print(output)
+```
+
+We can immediately see the activations have been scaled down and centered around zero. Activations are the same for each channel, because each channel was normalized independently. We can do a quick sanity check on these results, by manually calculating the batch mean and variance for each channel.
+
+
+```python
+batch_means = data.mean(axis=1, exclude=True)
+batch_vars = (data - batch_means.reshape(1, -1, 1, 1)).square().mean(axis=1, exclude=True)
+print('batch_means:', batch_means.asnumpy())
+print('batch_vars:', batch_vars.asnumpy())
+```
+
+And use these to scale the first entry in `data`, to confirm the `BatchNorm` calculation of `-1.324` was correct.
+
+
+```python
+print("manually calculated:", ((data[0][0][0][0] - batch_means[0])/batch_vars[0].sqrt()).asnumpy())
+print("automatically calculated:", output[0][0][0][0].asnumpy())
+```
+
+As mentioned before, `BatchNorm` has a number of parameters that update throughout training. 2 of the parameters are not updated in the typical fashion (using gradients), but instead are updated deterministically using exponential smoothing. We need to keep track of the average mean and variance of batches during training, so that we can use these values for normalization during inference.
+
+Why are global statistics needed? Often during inference, we have a batch size of 1 so batch variance would be impossible to calculate. We can just use global statistics instead. And we might get a data distribution shift between training and inference data, which shouldn't just be normalized away.
+
+Advanced: when using a pre-trained model inside another model (e.g. a pre-trained ResNet as an image feature extractor inside an instance segmentation model) you might want to use the global statistics of the pre-trained model *during training*. Setting `use_global_stats=True` is a way of using the global running statistics during training, and preventing the global statistics from updating. It has no effect on inference mode.
+
+After a single step (specifically after the `backward` call) we can see the `running_mean` and `running_var` have been updated.
+
+```python
+print('running_mean:', net.running_mean.data().asnumpy())
+print('running_var:', net.running_var.data().asnumpy())
+```
+
+You should notice, though, that these running statistics do not match the batch statistics we just calculated: instead they are just 10% of the value we'd expect. We see this because of the exponential averaging process, and because the `momentum` parameter of `BatchNorm` is equal to 0.9: i.e. 10% of the new value and 90% of the old value (which was initialized to 0). Over time the running statistics will converge to the statistics of the input distribution, while still being flexible enough to adjust to shifts in the input distribution. Using the same batch another 100 times (which wouldn't happen in practice), we can see the running statistics converge to the batch statistics calculated before.
+
+
+```python
+for i in range(100):
+ with mx.autograd.record():
+ output = net(data)
+ loss = output.abs()
+ loss.backward()
+print('running_means:', net.running_mean.data().asnumpy())
+print('running_vars:', net.running_var.data().asnumpy())
+```
+
+#### Beta and Gamma
+
+As mentioned previously, there are two additional parameters in `BatchNorm` which are trainable in the typical fashion (with gradients). `beta` is used to shift and `gamma` is used to scale the normalized distribution, which allows the network to 'undo' the effects of normalization if required.
+
+Advanced: when using `BatchNorm` for input normalization, you can prevent `beta` shifting and `gamma` scaling by setting the learning rate multiplier (i.e. `lr_mult`) of these parameters to 0. Zero centering and scaling to unit variance will still occur; only the post-normalization shifting and scaling will be prevented. See [this discussion post](https://discuss.mxnet.io/t/mxnet-use-batch-norm-for-input-scaling/3581/3) for details.
+
+We haven't updated these parameters yet, so they should still be as initialized. You can see the default for `beta` is 0 (i.e. no shift) and `gamma` is 1 (i.e. no scale), so the initial behaviour is to keep the distribution unit normalized.
+
+
+```python
+print('beta:', net.beta.data().asnumpy())
+print('gamma:', net.gamma.data().asnumpy())
+```
+
+We can also check the gradient on these parameters. Since we were finding the gradient of the sum of absolute values, we would expect the total gradient on `gamma` (summed over the channels) to be approximately equal to the number of points in the data (i.e. 16). So to minimize the loss we'd decrease the value of `gamma`, which would happen as part of a `trainer.step`.
+
+
+```python
+print('beta gradient:', net.beta.grad().asnumpy())
+print('gamma gradient:', net.gamma.grad().asnumpy())
+```
+
+#### Inference Mode
+
+When it comes to inference, `BatchNorm` uses the global statistics that were calculated during training. Since we're using the same batch of data over and over again (and our global running statistics have converged), we get a very similar result to using training mode. `beta` and `gamma` are also applied by default (unless explicitly removed).
+
+
+```python
+output = net(data)
+print(output)
+```
+
+## Layer Normalization
+
+An alternative to `BatchNorm` that is better suited to recurrent networks (RNNs) is called `LayerNorm`. Unlike `BatchNorm` which normalizes across all samples of a batch per channel, `LayerNorm` normalizes **across all channels of a single sample**.
+
+Some of the disadvantages of `BatchNorm` no longer apply. Small batch sizes are no longer an issue, since normalization statistics are calculated on single samples. And confusion around training and inference modes disappears because `LayerNorm` is the same for both modes.
+
+Warning: similar to having a small batch size in `BatchNorm`, you may have issues with `LayerNorm` if the input channel size is small. Using embeddings with a large enough dimension avoids this (approximately >20).
+
+Warning: currently MXNet Gluon's implementation of `LayerNorm` is applied along a single axis (which should be the channel axis). Other frameworks have the option to apply normalization across multiple axes, which leads to different default behaviour on NCHW input: they can normalize over C, H and W, rather than just C as with MXNet Gluon. See Figure 3.
+
+Remember: `LayerNorm` is intended to be used with data in NTC format so the default normalization axis is set to -1 (corresponding to C for channel). Change this to `axis=1` if you need to apply `LayerNorm` to data in NCHW format.
+
+Figure 3: `LayerNorm` on NCHW data | Figure 4: `LayerNorm` on NTC data
+- | -
+![](./imgs/NCHW_LN.png) | ![](./imgs/NTC_LN.png)
+(e.g. batch of images) overriding the default with `axis=1` | (e.g. batch of sequences) using the default of `axis=-1`
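+
+As a small sketch of the NCHW override mentioned in the note above:
+
+```python
+nchw_data = mx.nd.random.uniform(shape=(2, 3, 4, 4))  # (N, C, H, W)
+layer_norm_nchw = mx.gluon.nn.LayerNorm(axis=1)       # normalize across the channel axis
+layer_norm_nchw.initialize()
+print(layer_norm_nchw(nchw_data).shape)
+```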
+
+As an example, we'll apply `LayerNorm` to a batch of 2 samples, each with 4 time steps and 2 channels (in NTC format).
+
+
+```python
+data = mx.nd.arange(start=0, stop=2*4*2).reshape(2, 4, 2)
+print(data)
+```
+
+With MXNet Gluon we can apply layer normalization with the `mx.gluon.nn.LayerNorm` block. We need to call `initialize` because `LayerNorm` has two learnable parameters by default: `beta` and `gamma`, which are used for post-normalization shifting and scaling of each channel.
+
+
+```python
+net = mx.gluon.nn.LayerNorm()
+net.initialize()
+output = net(data)
+print(output)
+```
+
+We can see that normalization has been applied across all channels for each time step and each sample.
+
+We can also check the parameters `beta` and `gamma` and see that they are per channel (i.e. 2 of each in this example).
+
+
+```python
+print('beta:', net.beta.data().asnumpy())
+print('gamma:', net.gamma.data().asnumpy())
+```
+
+## Instance Normalization
+
+Another less common normalization technique is called `InstanceNorm`, which can be useful for certain tasks such as image stylization. Unlike `BatchNorm` which normalizes across all samples of a batch per channel, `InstanceNorm` normalizes **across all spatial dimensions per channel per sample** (i.e. each sample of a batch is normalized independently).
+
+Watch out: `InstanceNorm` is better suited to convolutional networks (CNNs) than recurrent networks (RNNs). We expect the input distribution to the recurrent cell to change over time, so normalization over time doesn't work well. `LayerNorm` is better suited for this case.
+
+Figure 5: `InstanceNorm` on NCHW data | Figure 6: `InstanceNorm` on NTC data
+- | -
+![](./imgs/NCHW_IN.png) | ![](./imgs/NTC_IN.png)
+(e.g. batch of images) using the default `axis=1` | (e.g. batch of sequences) overriding the default with `axis=2` (or `axis=-1` equivalently)
+
+As an example, we'll apply `InstanceNorm` to a batch of 2 samples, each with 2 channels, and both height and width of 2 (in NCHW format).
+
+
+```python
+data = mx.nd.arange(start=0, stop=2*2*2*2).reshape(2, 2, 2, 2)
+print(data)
+```
+
+With MXNet Gluon we can apply instance normalization with the `mx.gluon.nn.InstanceNorm` block. We need to call `initialize` because `InstanceNorm` has two learnable parameters by default: `beta` and `gamma`, which are used for post-normalization shifting and scaling of each channel.
+
+
+```python
+net = mx.gluon.nn.InstanceNorm()
+net.initialize()
+output = net(data)
+print(output)
+```
+
+We can also check the parameters `beta` and `gamma` and see that they are per channel (i.e. 2 of each in this example).
+
+
+```python
+print('beta:', net.beta.data().asnumpy())
+print('gamma:', net.gamma.data().asnumpy())
+```
diff --git a/docs/python_docs/python/tutorials/packages/gluon/parameters.md b/docs/python_docs/python/tutorials/packages/gluon/parameters.md
new file mode 100644
index 000000000000..57ab5304bf9a
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/gluon/parameters.md
@@ -0,0 +1,243 @@
+# Parameter Management
+
+
+
+The ultimate goal of training deep neural networks is finding good parameter values for a given architecture. The [`nn.Sequential`](http://beta.mxnet.io/api/gluon/_autogen/mxnet.gluon.nn.Sequential.html#mxnet.gluon.nn.Sequential) class is a perfect tool to work with standard models. However, very few models are entirely standard, and most scientists want to build novel things, which requires working with model parameters.
+
+This section shows how to manipulate parameters. In particular we will cover the following aspects:
+
+* How to access parameters in order to debug, diagnose, visualize or save them. It is the first step to understand how to work with custom models.
+* We will learn how to set parameters to specific values, e.g. how to initialize them. We will discuss the structure of parameter initializers.
+* We will show how this knowledge can be used to build networks that share some parameters.
+
+As always, we start with a Multilayer Perceptron with a single hidden layer. We will use it to demonstrate the aspects mentioned above.
+
+```{.python .input n=1}
+from mxnet import init, nd
+from mxnet.gluon import nn
+
+
+net = nn.Sequential()
+net.add(nn.Dense(256, activation='relu'))
+net.add(nn.Dense(10))
+net.initialize() # Use the default initialization method
+
+x = nd.random.uniform(shape=(2, 20))
+net(x) # Forward computation
+```
+
+## Parameter Access
+
+In the case of a Sequential network we can access the parameters simply by indexing each layer of the network. The `params` attribute contains the required data. Let's try this out in practice by inspecting the parameters of the first layer.
+
+```{.python .input n=2}
+print(net[0].params)
+print(net[1].params)
+```
+
+From the output we can see that the layer consists of two sets of parameters: `dense0_weight` and `dense0_bias`. They are both single precision and they have the necessary shapes that we would expect from the first layer, given that the input dimension is 20 and the output dimension 256. The names of the parameters are very useful, because they allow us to identify parameters *uniquely* even in a network of hundreds of layers and with nontrivial structure. The second layer is structured in a similar way.
+
+### Targeted Parameters
+
+In order to do something useful with the parameters we need to access them. There are several ways to do this, ranging from simple to general. Let's look at some of them.
+
+```{.python .input n=3}
+print(net[1].bias)
+print(net[1].bias.data())
+```
+
+The first line returns the bias of the second layer. Since this is an object containing data, gradients, and additional information, we need to request the data explicitly, which we do on the second line by calling the `data` method on the parameter. Note that the bias is all 0 since we initialized the bias to contain all zeros.
+
+We can also access the parameter by name, such as `dense0_weight`. This is possible since each layer comes with its own parameter dictionary that can be accessed directly. Both methods are entirely equivalent, but the first method leads to more readable code.
+
+```{.python .input n=4}
+print(net[0].params['dense0_weight'])
+print(net[0].params['dense0_weight'].data())
+```
+
+Note that the weights are nonzero as they were randomly initialized when we constructed the network.
+
+[`data`](http://beta.mxnet.io/api/gluon/_autogen/mxnet.gluon.Parameter.data.html) is not the only method that we can invoke. For instance, we can compute the gradient with respect to the parameters. It has the same shape as the weight. However, since we did not invoke backpropagation yet, the values are all 0.
+
+```{.python .input n=5}
+net[0].weight.grad()
+```
+
+### All Parameters at Once
+
+Accessing parameters as described above can be a bit tedious, in particular if we have more complex blocks, or blocks of blocks (or even blocks of blocks of blocks), since we need to walk through the entire tree in reverse order to learn how the blocks were constructed. To avoid this, blocks come with a method [`collect_params`](http://beta.mxnet.io/api/gluon/_autogen/mxnet.gluon.nn.Block.collect_params.html#mxnet.gluon.nn.Block.collect_params) which grabs all parameters of a network in one dictionary such that we can traverse it with ease. It does so by iterating over all constituents of a block and calls `collect_params` on sub-blocks as needed. To see the difference, consider the following:
+
+```{.python .input n=6}
+# Parameters only for the first layer
+print(net[0].collect_params())
+# Parameters of the entire network
+print(net.collect_params())
+```
+
+This provides us with the third way of accessing the parameters of the network. If we want to get the value of the bias term of the second layer we could simply use this:
+
+```{.python .input n=7}
+net.collect_params()['dense1_bias'].data()
+```
+
+By adding a regular expression as an argument to the `collect_params` method, we can select only a particular set of parameters whose names are matched by the regular expression.
+
+```{.python .input n=8}
+print(net.collect_params('.*weight'))
+print(net.collect_params('dense0.*'))
+```
+
+### Rube Goldberg strikes again
+
+Let's see how the parameter naming conventions work if we nest multiple blocks inside each other. For that we first define a function that produces blocks (a block factory, so to speak) and then we combine these inside yet larger blocks.
+
+```{.python .input n=20}
+def block1():
+ net = nn.Sequential()
+ net.add(nn.Dense(32, activation='relu'))
+ net.add(nn.Dense(16, activation='relu'))
+ return net
+
+def block2():
+ net = nn.Sequential()
+ for i in range(4):
+ net.add(block1())
+ return net
+
+rgnet = nn.Sequential()
+rgnet.add(block2())
+rgnet.add(nn.Dense(10))
+rgnet.initialize()
+rgnet(x)
+```
+
+Now that we are done designing the network, let's see how it is organized. `collect_params` provides us with this information, both in terms of naming and in terms of logical structure.
+
+```{.python .input}
+print(rgnet.collect_params)
+print(rgnet.collect_params())
+```
+
+We can access layers following the hierarchy in which they are structured. For instance, if we want to access the bias of the first layer of the second subblock of the first major block, we could perform the following:
+
+```{.python .input}
+rgnet[0][1][0].bias.data()
+```
+
+### Saving and loading parameters
+
+In order to save parameters, we can use [`save_parameters`](http://beta.mxnet.io/api/gluon/_autogen/mxnet.gluon.nn.Block.save_parameters.html#mxnet.gluon.nn.Block.save_parameters) method on the whole network or a particular subblock. The only parameter that is needed is the `file_name`. In a similar way, we can load parameters back from the file. We use [`load_parameters`](http://beta.mxnet.io/api/gluon/_autogen/mxnet.gluon.nn.Block.load_parameters.html#mxnet.gluon.nn.Block.load_parameters) method for that:
+
+```{.python .input}
+rgnet.save_parameters('model.params')
+rgnet.load_parameters('model.params')
+```
+
+## Parameter Initialization
+
+Now that we know how to access the parameters, let's look at how to initialize them properly. By default, MXNet initializes the weight matrices uniformly by drawing from $U[-0.07, 0.07]$ and the bias parameters are all set to $0$. However, we often need to use other methods to initialize the weights. MXNet's [`init`](http://beta.mxnet.io/api/gluon-related/mxnet.initializer.html?#module-mxnet.initializer) module provides a variety of preset initialization methods, but if we want something unusual, we need to do a bit of extra work.
+
+### Built-in Initialization
+
+Let's begin with the built-in initializers. The code below initializes all parameters with Gaussian random variables.
+
+```{.python .input n=9}
+# force_reinit ensures that the variables are initialized again,
+# regardless of whether they were already initialized previously
+net.initialize(init=init.Normal(sigma=0.01), force_reinit=True)
+net[0].weight.data()[0]
+```
+
+If we wanted to initialize all parameters to 1, we could do this simply by changing the initializer to [`Constant(1)`](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.initializer.Constant.html#mxnet.initializer.Constant).
+
+```{.python .input n=10}
+net.initialize(init=init.Constant(1), force_reinit=True)
+net[0].weight.data()[0]
+```
+
+If we want to initialize only a specific parameter in a different manner, we can simply set the initializer for the appropriate subblock (or parameter). For instance, below we initialize the second layer to a constant value of 42 and use the [`Xavier`](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.initializer.Xavier.html#mxnet.initializer.Xavier) initializer for the weights of the first layer.
+
+```{.python .input n=11}
+net[1].initialize(init=init.Constant(42), force_reinit=True)
+net[0].weight.initialize(init=init.Xavier(), force_reinit=True)
+print(net[1].weight.data()[0,0])
+print(net[0].weight.data()[0])
+```
+
+### Custom Initialization
+
+Sometimes, the initialization methods we need are not provided in the `init` module. If this is the case, we can implement a subclass of the [`Initializer`](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.initializer.Initializer.html#mxnet.initializer.Initializer) class so that we can use it like any other initialization method. Usually, we only need to implement the `_init_weight` method, which modifies the incoming NDArray in place to contain the desired initial values. In the example below, we pick a nontrivial distribution, just to prove the point. We draw the coefficients from the following distribution:
+
+$$
+\begin{aligned}
+ w \sim \begin{cases}
+ U[5, 10] & \text{ with probability } \frac{1}{4} \\
+ 0 & \text{ with probability } \frac{1}{2} \\
+ U[-10, -5] & \text{ with probability } \frac{1}{4}
+ \end{cases}
+\end{aligned}
+$$
+
+```{.python .input n=12}
+class MyInit(init.Initializer):
+ def _init_weight(self, name, data):
+ print('Init', name, data.shape)
+ data[:] = nd.random.uniform(low=-10, high=10, shape=data.shape)
+ data *= data.abs() >= 5
+
+net.initialize(MyInit(), force_reinit=True)
+net[0].weight.data()[0]
+```
+
+If even this functionality is insufficient, we can set parameters directly. Since `data()` returns an NDArray we can access it just like any other matrix. A note for advanced users - if you want to adjust parameters within an [`autograd`](http://beta.mxnet.io/api/gluon-related/mxnet.autograd.html?#module-mxnet.autograd) scope you need to use [`set_data`](http://beta.mxnet.io/api/gluon/_autogen/mxnet.gluon.Parameter.set_data.html#mxnet.gluon.Parameter.set_data) to avoid confusing the automatic differentiation mechanics.
+
+```{.python .input n=13}
+net[0].weight.data()[:] += 1
+net[0].weight.data()[0,0] = 42
+net[0].weight.data()[0]
+```
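+
+For completeness, here is a short sketch of the `set_data` route mentioned above (the values are illustrative only):
+
+```{.python .input}
+new_weight = net[0].weight.data().copy()
+new_weight[0, 0] = 7
+net[0].weight.set_data(new_weight)
+net[0].weight.data()[0]
+```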
+
+## Tied Parameters
+
+In some cases, we want to share model parameters across multiple layers. For instance, when we want to find good word embeddings we may decide to use the same parameters both for encoding and decoding of words. In the code below, we allocate a dense layer and then use its parameters specifically to set those of another layer.
+
+```{.python .input n=14}
+net = nn.Sequential()
+# We need to give the shared layer a name such that we can reference
+# its parameters
+shared = nn.Dense(8, activation='relu')
+net.add(nn.Dense(8, activation='relu'),
+ shared,
+ nn.Dense(8, activation='relu', params=shared.params),
+ nn.Dense(10))
+net.initialize()
+
+x = nd.random.uniform(shape=(2, 20))
+net(x)
+
+# Check whether the parameters are the same
+print(net[1].weight.data()[0] == net[2].weight.data()[0])
+net[1].weight.data()[0,0] = 100
+# And make sure that they're actually the same object rather
+# than just having the same value
+print(net[1].weight.data()[0] == net[2].weight.data()[0])
+```
+
+The above example shows that the parameters of the second and third layer are tied. They are identical rather than just being equal. That is, by changing one of the parameters the other one changes, too. What happens to the gradients is quite ingenious. Since the model parameters contain gradients, the gradients of the second hidden layer and the third hidden layer are accumulated in the [`shared.params.grad()`](http://beta.mxnet.io/api/gluon/_autogen/mxnet.gluon.Parameter.grad.html) during backpropagation.
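+
+As a quick sketch of this behaviour (assuming `autograd` is imported as shown below), both tied layers expose the very same gradient array after backpropagation:
+
+```{.python .input}
+from mxnet import autograd
+
+with autograd.record():
+    y = net(x)
+y.backward()
+# net[1] and net[2] share the same Parameter object, so their gradients are identical.
+print((net[1].weight.grad() == net[2].weight.grad()).asnumpy().all())
+```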
diff --git a/docs/python_docs/python/tutorials/packages/gluon/text/gnmt.rst b/docs/python_docs/python/tutorials/packages/gluon/text/gnmt.rst
new file mode 100644
index 000000000000..c716c8c7f2a7
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/gluon/text/gnmt.rst
@@ -0,0 +1,489 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Google Neural Machine Translation
+=================================
+
+In this notebook, we are going to train Google NMT on the IWSLT 2015
+English-Vietnamese dataset. The building process includes four steps: 1)
+load and process the dataset, 2) create the sampler and DataLoader, 3)
+build the model, and 4) write the training epochs.
+
+Load MXNet and Gluon
+--------------------
+
+.. code:: python
+
+ import warnings
+ warnings.filterwarnings('ignore')
+
+ import argparse
+ import time
+ import random
+ import os
+ import io
+ import logging
+ import numpy as np
+ import mxnet as mx
+ from mxnet import gluon
+ import gluonnlp as nlp
+ import nmt
+
+Hyper-parameters
+----------------
+
+.. code:: python
+
+ np.random.seed(100)
+ random.seed(100)
+ mx.random.seed(10000)
+ ctx = mx.gpu(0)
+
+ # parameters for dataset
+ dataset = 'IWSLT2015'
+ src_lang, tgt_lang = 'en', 'vi'
+ src_max_len, tgt_max_len = 50, 50
+
+ # parameters for model
+ num_hidden = 512
+ num_layers = 2
+ num_bi_layers = 1
+ dropout = 0.2
+
+ # parameters for training
+ batch_size, test_batch_size = 128, 32
+ num_buckets = 5
+ epochs = 1
+ clip = 5
+ lr = 0.001
+ lr_update_factor = 0.5
+ log_interval = 10
+ save_dir = 'gnmt_en_vi_u512'
+
+ #parameters for testing
+ beam_size = 10
+ lp_alpha = 1.0
+ lp_k = 5
+
+ nmt.utils.logging_config(save_dir)
+
+Load and Preprocess Dataset
+---------------------------
+
+The following shows how to process the dataset and cache the processed
+dataset for future use. The processing steps include: 1) clipping the source
+and target sequences, 2) splitting the string input into a list of tokens, 3)
+mapping each string token to its integer index in the vocabulary, and 4)
+appending the end-of-sentence (EOS) token to the source sentence and adding
+BOS and EOS tokens to the target sentence.
+
+.. code:: python
+
+ def cache_dataset(dataset, prefix):
+        """Cache the processed dataset into an npz file.
+
+ Parameters
+ ----------
+ dataset : gluon.data.SimpleDataset
+        prefix : str
+ """
+ if not os.path.exists(nmt._constants.CACHE_PATH):
+ os.makedirs(nmt._constants.CACHE_PATH)
+ src_data = np.array([ele[0] for ele in dataset])
+ tgt_data = np.array([ele[1] for ele in dataset])
+ np.savez(os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz'), src_data=src_data, tgt_data=tgt_data)
+
+
+ def load_cached_dataset(prefix):
+ cached_file_path = os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz')
+ if os.path.exists(cached_file_path):
+ print('Load cached data from {}'.format(cached_file_path))
+ dat = np.load(cached_file_path)
+ return gluon.data.ArrayDataset(np.array(dat['src_data']), np.array(dat['tgt_data']))
+ else:
+ return None
+
+
+ class TrainValDataTransform(object):
+ """Transform the machine translation dataset.
+
+ Clip source and the target sentences to the maximum length. For the source sentence, append the
+ EOS. For the target sentence, append BOS and EOS.
+
+ Parameters
+ ----------
+ src_vocab : Vocab
+ tgt_vocab : Vocab
+ src_max_len : int
+ tgt_max_len : int
+ """
+ def __init__(self, src_vocab, tgt_vocab, src_max_len, tgt_max_len):
+ self._src_vocab = src_vocab
+ self._tgt_vocab = tgt_vocab
+ self._src_max_len = src_max_len
+ self._tgt_max_len = tgt_max_len
+
+ def __call__(self, src, tgt):
+ if self._src_max_len > 0:
+ src_sentence = self._src_vocab[src.split()[:self._src_max_len]]
+ else:
+ src_sentence = self._src_vocab[src.split()]
+ if self._tgt_max_len > 0:
+ tgt_sentence = self._tgt_vocab[tgt.split()[:self._tgt_max_len]]
+ else:
+ tgt_sentence = self._tgt_vocab[tgt.split()]
+ src_sentence.append(self._src_vocab[self._src_vocab.eos_token])
+ tgt_sentence.insert(0, self._tgt_vocab[self._tgt_vocab.bos_token])
+ tgt_sentence.append(self._tgt_vocab[self._tgt_vocab.eos_token])
+ src_npy = np.array(src_sentence, dtype=np.int32)
+ tgt_npy = np.array(tgt_sentence, dtype=np.int32)
+ return src_npy, tgt_npy
+
+
+ def process_dataset(dataset, src_vocab, tgt_vocab, src_max_len=-1, tgt_max_len=-1):
+ start = time.time()
+ dataset_processed = dataset.transform(TrainValDataTransform(src_vocab, tgt_vocab,
+ src_max_len,
+ tgt_max_len), lazy=False)
+ end = time.time()
+ print('Processing time spent: {}'.format(end - start))
+ return dataset_processed
+
+
+ def load_translation_data(dataset, src_lang='en', tgt_lang='vi'):
+ """Load translation dataset
+
+ Parameters
+ ----------
+ dataset : str
+ src_lang : str, default 'en'
+ tgt_lang : str, default 'vi'
+
+ Returns
+ -------
+ data_train_processed : Dataset
+ The preprocessed training sentence pairs
+ data_val_processed : Dataset
+ The preprocessed validation sentence pairs
+ data_test_processed : Dataset
+ The preprocessed test sentence pairs
+ val_tgt_sentences : list
+ The target sentences in the validation set
+ test_tgt_sentences : list
+ The target sentences in the test set
+ src_vocab : Vocab
+ Vocabulary of the source language
+ tgt_vocab : Vocab
+ Vocabulary of the target language
+ """
+ common_prefix = 'IWSLT2015_{}_{}_{}_{}'.format(src_lang, tgt_lang,
+ src_max_len, tgt_max_len)
+ data_train = nlp.data.IWSLT2015('train', src_lang=src_lang, tgt_lang=tgt_lang)
+ data_val = nlp.data.IWSLT2015('val', src_lang=src_lang, tgt_lang=tgt_lang)
+ data_test = nlp.data.IWSLT2015('test', src_lang=src_lang, tgt_lang=tgt_lang)
+ src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab
+ data_train_processed = load_cached_dataset(common_prefix + '_train')
+ if not data_train_processed:
+ data_train_processed = process_dataset(data_train, src_vocab, tgt_vocab,
+ src_max_len, tgt_max_len)
+ cache_dataset(data_train_processed, common_prefix + '_train')
+ data_val_processed = load_cached_dataset(common_prefix + '_val')
+ if not data_val_processed:
+ data_val_processed = process_dataset(data_val, src_vocab, tgt_vocab)
+ cache_dataset(data_val_processed, common_prefix + '_val')
+ data_test_processed = load_cached_dataset(common_prefix + '_test')
+ if not data_test_processed:
+ data_test_processed = process_dataset(data_test, src_vocab, tgt_vocab)
+ cache_dataset(data_test_processed, common_prefix + '_test')
+ fetch_tgt_sentence = lambda src, tgt: tgt.split()
+ val_tgt_sentences = list(data_val.transform(fetch_tgt_sentence))
+ test_tgt_sentences = list(data_test.transform(fetch_tgt_sentence))
+ return data_train_processed, data_val_processed, data_test_processed, \
+ val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab
+
+
+ def get_data_lengths(dataset):
+        return list(dataset.transform(lambda src, tgt: (len(src), len(tgt))))
+
+
+ data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab\
+ = load_translation_data(dataset=dataset, src_lang=src_lang, tgt_lang=tgt_lang)
+ data_train_lengths = get_data_lengths(data_train)
+ data_val_lengths = get_data_lengths(data_val)
+ data_test_lengths = get_data_lengths(data_test)
+
+ with io.open(os.path.join(save_dir, 'val_gt.txt'), 'w', encoding='utf-8') as of:
+ for ele in val_tgt_sentences:
+ of.write(' '.join(ele) + '\n')
+
+ with io.open(os.path.join(save_dir, 'test_gt.txt'), 'w', encoding='utf-8') as of:
+ for ele in test_tgt_sentences:
+ of.write(' '.join(ele) + '\n')
+
+
+ data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False)
+ data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
+ for i, ele in enumerate(data_val)])
+ data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
+ for i, ele in enumerate(data_test)])
+
+Create Sampler and DataLoader
+-----------------------------
+
+Now, we have obtained ``data_train``, ``data_val``, and ``data_test``.
+The next step is to construct the sampler and DataLoader. We first
+construct the batchify function, which pads and stacks sequences to form
+a mini-batch.
+
+.. code:: python
+
+ train_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(),
+ nlp.data.batchify.Pad(),
+ nlp.data.batchify.Stack(dtype='float32'),
+ nlp.data.batchify.Stack(dtype='float32'))
+ test_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(),
+ nlp.data.batchify.Pad(),
+ nlp.data.batchify.Stack(dtype='float32'),
+ nlp.data.batchify.Stack(dtype='float32'),
+ nlp.data.batchify.Stack())
+
+We can then construct bucketing samplers, which generate batches by
+grouping sequences with similar lengths. Here, the bucketing scheme is
+empirically determined.
+
+.. code:: python
+
+ bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
+ train_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_train_lengths,
+ batch_size=batch_size,
+ num_buckets=num_buckets,
+ shuffle=True,
+ bucket_scheme=bucket_scheme)
+ logging.info('Train Batch Sampler:\n{}'.format(train_batch_sampler.stats()))
+ val_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_val_lengths,
+ batch_size=test_batch_size,
+ num_buckets=num_buckets,
+ shuffle=False)
+ logging.info('Valid Batch Sampler:\n{}'.format(val_batch_sampler.stats()))
+ test_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_test_lengths,
+ batch_size=test_batch_size,
+ num_buckets=num_buckets,
+ shuffle=False)
+ logging.info('Test Batch Sampler:\n{}'.format(test_batch_sampler.stats()))
+
+Given the samplers, we can create the DataLoaders, which are iterable.
+
+.. code:: python
+
+ train_data_loader = gluon.data.DataLoader(data_train,
+ batch_sampler=train_batch_sampler,
+ batchify_fn=train_batchify_fn,
+ num_workers=4)
+ val_data_loader = gluon.data.DataLoader(data_val,
+ batch_sampler=val_batch_sampler,
+ batchify_fn=test_batchify_fn,
+ num_workers=4)
+ test_data_loader = gluon.data.DataLoader(data_test,
+ batch_sampler=test_batch_sampler,
+ batchify_fn=test_batchify_fn,
+ num_workers=4)
+
+Build GNMT Model
+----------------
+
+After obtaining the DataLoaders, we can build the model. The GNMT encoder
+and decoder can be easily constructed by calling the
+``get_gnmt_encoder_decoder`` function. Then, we feed the encoder and
+decoder to ``NMTModel`` to construct the GNMT model. ``model.hybridize``
+allows computation to be done using the symbolic backend.
+
+.. code:: python
+
+ encoder, decoder = nmt.gnmt.get_gnmt_encoder_decoder(hidden_size=num_hidden,
+ dropout=dropout,
+ num_layers=num_layers,
+ num_bi_layers=num_bi_layers)
+ model = nmt.translation.NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder, decoder=decoder,
+ embed_size=num_hidden, prefix='gnmt_')
+ model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
+ static_alloc = True
+ model.hybridize(static_alloc=static_alloc)
+ logging.info(model)
+
+ # Due to the paddings, we need to mask out the losses corresponding to padding tokens.
+ loss_function = nmt.loss.SoftmaxCEMaskedLoss()
+ loss_function.hybridize(static_alloc=static_alloc)
+
+We also build the beam search translator.
+
+.. code:: python
+
+ translator = nmt.translation.BeamSearchTranslator(model=model, beam_size=beam_size,
+ scorer=nlp.model.BeamSearchScorer(alpha=lp_alpha,
+ K=lp_k),
+ max_length=tgt_max_len + 100)
+ logging.info('Use beam_size={}, alpha={}, K={}'.format(beam_size, lp_alpha, lp_k))
+
+We define the evaluation function as follows. The ``evaluate`` function
+uses the beam search translator to generate outputs for the validation
+and testing datasets.
+
+.. code:: python
+
+ def evaluate(data_loader):
+ """Evaluate given the data loader
+
+ Parameters
+ ----------
+ data_loader : gluon.data.DataLoader
+
+ Returns
+ -------
+ avg_loss : float
+ Average loss
+ real_translation_out : list of list of str
+ The translation output
+ """
+ translation_out = []
+ all_inst_ids = []
+ avg_loss_denom = 0
+ avg_loss = 0.0
+ for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \
+ in enumerate(data_loader):
+ src_seq = src_seq.as_in_context(ctx)
+ tgt_seq = tgt_seq.as_in_context(ctx)
+ src_valid_length = src_valid_length.as_in_context(ctx)
+ tgt_valid_length = tgt_valid_length.as_in_context(ctx)
+ # Calculating Loss
+ out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
+ loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar()
+ all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
+ avg_loss += loss * (tgt_seq.shape[1] - 1)
+ avg_loss_denom += (tgt_seq.shape[1] - 1)
+ # Translate
+ samples, _, sample_valid_length =\
+ translator.translate(src_seq=src_seq, src_valid_length=src_valid_length)
+ max_score_sample = samples[:, 0, :].asnumpy()
+ sample_valid_length = sample_valid_length[:, 0].asnumpy()
+ for i in range(max_score_sample.shape[0]):
+ translation_out.append(
+ [tgt_vocab.idx_to_token[ele] for ele in
+ max_score_sample[i][1:(sample_valid_length[i] - 1)]])
+ avg_loss = avg_loss / avg_loss_denom
+ real_translation_out = [None for _ in range(len(all_inst_ids))]
+ for ind, sentence in zip(all_inst_ids, translation_out):
+ real_translation_out[ind] = sentence
+ return avg_loss, real_translation_out
+
+
+ def write_sentences(sentences, file_path):
+ with io.open(file_path, 'w', encoding='utf-8') as of:
+ for sent in sentences:
+ of.write(' '.join(sent) + '\n')
+
+Training Epochs
+---------------
+
+Before entering the training stage, we need to create a trainer for
+updating the parameters. In the following example, we create a trainer
+that uses the ADAM optimizer.
+
+.. code:: python
+
+ trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr})
+
+We can then write the training loop. During training, we evaluate on
+the validation and testing datasets every epoch, and record the
+parameters that give the highest BLEU score on the validation dataset.
+Before performing the forward and backward passes, we first use the
+``as_in_context`` function to copy the mini-batch to the GPU. The statement
+``with mx.autograd.record()`` tells the Gluon backend to compute the
+gradients for the part inside the block.
+
+.. code:: python
+
+ best_valid_bleu = 0.0
+ for epoch_id in range(epochs):
+ log_avg_loss = 0
+ log_avg_gnorm = 0
+ log_wc = 0
+ log_start_time = time.time()
+ for batch_id, (src_seq, tgt_seq, src_valid_length, tgt_valid_length)\
+ in enumerate(train_data_loader):
+ # logging.info(src_seq.context) Context suddenly becomes GPU.
+ src_seq = src_seq.as_in_context(ctx)
+ tgt_seq = tgt_seq.as_in_context(ctx)
+ src_valid_length = src_valid_length.as_in_context(ctx)
+ tgt_valid_length = tgt_valid_length.as_in_context(ctx)
+ with mx.autograd.record():
+ out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
+ loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean()
+ loss = loss * (tgt_seq.shape[1] - 1) / (tgt_valid_length - 1).mean()
+ loss.backward()
+ grads = [p.grad(ctx) for p in model.collect_params().values()]
+ gnorm = gluon.utils.clip_global_norm(grads, clip)
+ trainer.step(1)
+ src_wc = src_valid_length.sum().asscalar()
+ tgt_wc = (tgt_valid_length - 1).sum().asscalar()
+ step_loss = loss.asscalar()
+ log_avg_loss += step_loss
+ log_avg_gnorm += gnorm
+ log_wc += src_wc + tgt_wc
+ if (batch_id + 1) % log_interval == 0:
+ wps = log_wc / (time.time() - log_start_time)
+ logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, gnorm={:.4f}, '
+ 'throughput={:.2f}K wps, wc={:.2f}K'
+ .format(epoch_id, batch_id + 1, len(train_data_loader),
+ log_avg_loss / log_interval,
+ np.exp(log_avg_loss / log_interval),
+ log_avg_gnorm / log_interval,
+ wps / 1000, log_wc / 1000))
+ log_start_time = time.time()
+ log_avg_loss = 0
+ log_avg_gnorm = 0
+ log_wc = 0
+ valid_loss, valid_translation_out = evaluate(val_data_loader)
+ valid_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([val_tgt_sentences], valid_translation_out)
+ logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
+ .format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
+ test_loss, test_translation_out = evaluate(test_data_loader)
+ test_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([test_tgt_sentences], test_translation_out)
+ logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
+ .format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100))
+ write_sentences(valid_translation_out,
+ os.path.join(save_dir, 'epoch{:d}_valid_out.txt').format(epoch_id))
+ write_sentences(test_translation_out,
+ os.path.join(save_dir, 'epoch{:d}_test_out.txt').format(epoch_id))
+ if valid_bleu_score > best_valid_bleu:
+ best_valid_bleu = valid_bleu_score
+ save_path = os.path.join(save_dir, 'valid_best.params')
+ logging.info('Save best parameters to {}'.format(save_path))
+ model.save_parameters(save_path)
+ if epoch_id + 1 >= (epochs * 2) // 3:
+ new_lr = trainer.learning_rate * lr_update_factor
+ logging.info('Learning rate change to {}'.format(new_lr))
+ trainer.set_learning_rate(new_lr)
+
+Summary
+-------
+
+In this notebook, we have shown how to train a GNMT model on the IWSLT 2015
+English-Vietnamese dataset using the Gluon NLP toolkit. The complete training
+script can be found
+`here `__.
+The command to reproduce the result can be found on the `nmt scripts
+page `__.
diff --git a/docs/python_docs/python/tutorials/packages/gluon/text/index.rst b/docs/python_docs/python/tutorials/packages/gluon/text/index.rst
new file mode 100644
index 000000000000..7fe503ae279d
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/gluon/text/index.rst
@@ -0,0 +1,116 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Text Tutorials
+===============
+
+These tutorials will help you learn how to create and use models that work with text and other natural language processing tasks.
+
+Word Embedding
+---------------------
+
+.. container:: cards
+
+ .. card::
+ :title: Pre-trained Word Embeddings
+ :link: http://gluon-nlp.mxnet.io/examples/word_embedding/word_embedding.html
+
+ Basics on how to use word embedding with vocab in GluonNLP and apply it on word similarity and analogy problems.
+
+ .. card::
+ :title: Word Embeddings Training and Evaluation
+ :link: http://gluon-nlp.mxnet.io/examples/word_embedding/word_embedding_training.html
+
+ Learn how to train fastText and word2vec embeddings on your own dataset, and determine embedding quality through intrinsic evaluation.
+
+Language Model
+---------------------
+
+
+.. container:: cards
+
+ .. card::
+ :title: LSTM-based Language Models
+ :link: http://gluon-nlp.mxnet.io/examples/language_model/language_model.html
+
+ Learn what a language model is, what it can do, and how to train a word-level language model with truncated back-propagation-through-time (BPTT).
+
+Machine Translation
+---------------------
+
+.. container:: cards
+
+ .. card::
+ :title: Google Neural Machine Translation
+ :link: http://gluon-nlp.mxnet.io/examples/machine_translation/gnmt.html
+
+ Learn how to train Google Neural Machine Translation, a seq2seq with attention model.
+
+ .. card::
+ :title: Machine Translation with Transformer
+ :link: http://gluon-nlp.mxnet.io/examples/machine_translation/transformer.html
+
+ Learn how to use a pre-trained transformer translation model for English to German translation.
+
+Sentence Embedding
+---------------------
+
+.. container:: cards
+
+ .. card::
+ :title: ELMo: Deep Contextualized Word Representations
+ :link: http://gluon-nlp.mxnet.io/examples/sentence_embedding/elmo_sentence_representation.html
+
+ See how to use GluonNLP’s model API to automatically download the pre-trained ELMo model from NAACL2018 best paper, and extract features with it.
+
+ .. card::
+ :title: A Structured Self-attentive Sentence Embedding
+ :link: http://gluon-nlp.mxnet.io/examples/sentence_embedding/self_attentive_sentence_embedding.html
+
+ See how to use GluonNLP to build more advanced model structure for extracting sentence embeddings to predict Yelp review rating.
+
+ .. card::
+ :title: BERT: Bidirectional Encoder Representations from Transformers
+ :link: http://gluon-nlp.mxnet.io/examples/sentence_embedding/bert.html
+
+ See how to use GluonNLP to fine-tune a sentence pair classification model with pre-trained BERT parameters.
+
+Sentiment Analysis
+---------------------
+
+.. container:: cards
+
+ .. card::
+ :title: Sentiment Analysis by Fine-tuning Word Language Model
+ :link: http://gluon-nlp.mxnet.io/examples/sentiment_analysis/sentiment_analysis.html
+
+ See how to fine-tune a pre-trained language model to perform sentiment analysis on movie reviews.
+
+Sequence Sampling
+---------------------
+
+.. container:: cards
+
+ .. card::
+ :title: Sequence Generation with Sampling and Beam Search
+ :link: http://gluon-nlp.mxnet.io/examples/sequence_sampling/sequence_sampling.html
+
+ Learn how to generate sentence from pre-trained language model through sampling and beam search.
+
+.. toctree::
+ :hidden:
+ :maxdepth: 1
diff --git a/docs/python_docs/python/tutorials/packages/gluon/text/transformer.rst b/docs/python_docs/python/tutorials/packages/gluon/text/transformer.rst
new file mode 100644
index 000000000000..c44fb37f6b8a
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/gluon/text/transformer.rst
@@ -0,0 +1,607 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Machine Translation with Transformer
+====================================
+
+In this notebook, we will show how to train the Transformer introduced in
+[1] and evaluate the pretrained model using GluonNLP. The model is both
+more accurate and lighter to train than previous seq2seq models. We will
+cover the following:
+
+1) Use the state-of-the-art pretrained Transformer model: we will
+   evaluate the pretrained SOTA Transformer model and translate a few
+   sentences ourselves with the ``BeamSearchTranslator``;
+
+2) Train the Transformer yourself: this includes loading and processing
+   the dataset, defining the Transformer model, writing the training
+   script and evaluating the trained model. Note that obtaining
+   state-of-the-art results on the WMT 2014 English-German dataset
+   takes around one day of training. To let you run through the
+   Transformer quickly, we suggest starting with the ``TOY`` dataset
+   sampled from the WMT dataset (the default in this notebook).
+
+Preparation
+-----------
+
+Load MXNet and GluonNLP
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: python
+
+ import warnings
+ warnings.filterwarnings('ignore')
+
+ import random
+ import numpy as np
+ import mxnet as mx
+ from mxnet import gluon
+ import gluonnlp as nlp
+
+Set Environment
+~~~~~~~~~~~~~~~
+
+.. code:: python
+
+ np.random.seed(100)
+ random.seed(100)
+ mx.random.seed(10000)
+ ctx = mx.gpu(0)
+
+Use the SOTA Pretrained Transformer model
+-----------------------------------------
+
+In this subsection, we first load the SOTA Transformer model from the
+GluonNLP model zoo, then load the full WMT 2014 English-German test
+dataset, and finally evaluate the model.
+
+Get the SOTA Transformer
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Next, we load the pretrained SOTA Transformer using the model API in
+GluonNLP. This gives us easy access to the SOTA machine translation
+model so that we can use it in our own applications.
+
+.. code:: python
+
+ import nmt
+
+ wmt_model_name = 'transformer_en_de_512'
+
+ wmt_transformer_model, wmt_src_vocab, wmt_tgt_vocab = \
+ nmt.transformer.get_model(wmt_model_name,
+ dataset_name='WMT2014',
+ pretrained=True,
+ ctx=ctx)
+
+ print(wmt_src_vocab)
+ print(wmt_tgt_vocab)
+
+The Transformer model architecture is shown below:
+
+|transformer|
+
+.. code:: python
+
+ print(wmt_transformer_model)
+
+Load and Preprocess WMT 2014 Dataset
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We then load the WMT 2014 English-German test dataset for evaluation
+purpose.
+
+The following shows how to process the dataset and cache the processed
+dataset for future use. The processing steps include the following (a
+short illustrative sketch follows the list):
+
+1) clip the source and target sequences;
+
+2) split the string input into a list of tokens;
+
+3) map the string tokens into their indices in the vocabulary;
+
+4) append the EOS token to the source sentence and add BOS and EOS
+   tokens to the target sentence.
+
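+As a small illustrative sketch (the sample sentence pair and ``max_len``
+below are made up for illustration; in the tutorial the real work is done
+by ``dataprocessor.TrainValDataTransform``), the four steps applied to a
+single sentence pair look roughly like this:
+
+.. code:: python
+
+    # 1) clip and 2) tokenize (the real pipeline uses BPE tokenization)
+    sample_src_raw, sample_tgt_raw = 'Hello world !', 'Hallo Welt !'
+    max_len = 50
+    sample_src_tokens = sample_src_raw.split()[:max_len]
+    sample_tgt_tokens = sample_tgt_raw.split()[:max_len]
+
+    # 3) map string tokens to their indices in the vocabularies
+    sample_src_ids = wmt_src_vocab[sample_src_tokens]
+    sample_tgt_ids = wmt_tgt_vocab[sample_tgt_tokens]
+
+    # 4) append EOS to the source; add BOS and EOS to the target
+    sample_src_ids = sample_src_ids + [wmt_src_vocab[wmt_src_vocab.eos_token]]
+    sample_tgt_ids = ([wmt_tgt_vocab[wmt_tgt_vocab.bos_token]] + sample_tgt_ids
+                      + [wmt_tgt_vocab[wmt_tgt_vocab.eos_token]])
+    print(sample_src_ids, sample_tgt_ids)
+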
+Let's first look at the WMT 2014 corpus.
+
+.. code:: python
+
+ import hyperparameters as hparams
+
+ wmt_data_test = nlp.data.WMT2014BPE('newstest2014',
+ src_lang=hparams.src_lang,
+ tgt_lang=hparams.tgt_lang,
+ full=False)
+ print('Source language %s, Target language %s' % (hparams.src_lang, hparams.tgt_lang))
+
+ wmt_data_test[0]
+
+.. code:: python
+
+ wmt_test_text = nlp.data.WMT2014('newstest2014',
+ src_lang=hparams.src_lang,
+ tgt_lang=hparams.tgt_lang,
+ full=False)
+ wmt_test_text[0]
+
+We then generate the target gold translations.
+
+.. code:: python
+
+ wmt_test_tgt_sentences = list(wmt_test_text.transform(lambda src, tgt: tgt))
+ wmt_test_tgt_sentences[0]
+
+.. code:: python
+
+ import dataprocessor
+
+ print(dataprocessor.TrainValDataTransform.__doc__)
+
+.. code:: python
+
+ wmt_transform_fn = dataprocessor.TrainValDataTransform(wmt_src_vocab, wmt_tgt_vocab, -1, -1)
+ wmt_dataset_processed = wmt_data_test.transform(wmt_transform_fn, lazy=False)
+ print(*wmt_dataset_processed[0], sep='\n')
+
+Create Sampler and DataLoader for WMT 2014 Dataset
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: python
+
+ wmt_data_test_with_len = gluon.data.SimpleDataset([(ele[0], ele[1], len(
+ ele[0]), len(ele[1]), i) for i, ele in enumerate(wmt_dataset_processed)])
+
+Now that we have obtained the processed test dataset
+``wmt_data_test_with_len``, the next step is to construct the sampler
+and DataLoader. The first step is to construct the batchify function,
+which pads and stacks sequences to form mini-batches.
+
+.. code:: python
+
+ wmt_test_batchify_fn = nlp.data.batchify.Tuple(
+ nlp.data.batchify.Pad(),
+ nlp.data.batchify.Pad(),
+ nlp.data.batchify.Stack(dtype='float32'),
+ nlp.data.batchify.Stack(dtype='float32'),
+ nlp.data.batchify.Stack())
+
+We can then construct bucketing samplers, which generate batches by
+grouping sequences with similar lengths.
+
+.. code:: python
+
+ wmt_bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
+
+.. code:: python
+
+ wmt_test_batch_sampler = nlp.data.FixedBucketSampler(
+ lengths=wmt_dataset_processed.transform(lambda src, tgt: len(tgt)),
+ use_average_length=True,
+ bucket_scheme=wmt_bucket_scheme,
+ batch_size=256)
+ print(wmt_test_batch_sampler.stats())
+
+Given the samplers, we can create a DataLoader, which is iterable.
+
+.. code:: python
+
+ wmt_test_data_loader = gluon.data.DataLoader(
+ wmt_data_test_with_len,
+ batch_sampler=wmt_test_batch_sampler,
+ batchify_fn=wmt_test_batchify_fn,
+ num_workers=8)
+ len(wmt_test_data_loader)
+
+Evaluate Transformer
+~~~~~~~~~~~~~~~~~~~~
+
+Next, we reproduce the SOTA results on the WMT test dataset. As we can
+see from the result, we are able to reach the SOTA BLEU score of 27.35.
+
+We first define the ``BeamSearchTranslator`` to generate the actual
+translations.
+
+.. code:: python
+
+ wmt_translator = nmt.translation.BeamSearchTranslator(
+ model=wmt_transformer_model,
+ beam_size=hparams.beam_size,
+ scorer=nlp.model.BeamSearchScorer(alpha=hparams.lp_alpha, K=hparams.lp_k),
+ max_length=200)
+
+Then we calculate the ``loss`` as well as the ``bleu`` score on the WMT
+2014 English-German test dataset. Note that the following evaluation
+process takes roughly 13 minutes to complete.
+
+.. code:: python
+
+ import time
+ import utils
+
+ eval_start_time = time.time()
+
+ wmt_test_loss_function = nmt.loss.SoftmaxCEMaskedLoss()
+ wmt_test_loss_function.hybridize()
+
+ wmt_detokenizer = nlp.data.SacreMosesDetokenizer()
+
+ wmt_test_loss, wmt_test_translation_out = utils.evaluate(wmt_transformer_model,
+ wmt_test_data_loader,
+ wmt_test_loss_function,
+ wmt_translator,
+ wmt_tgt_vocab,
+ wmt_detokenizer,
+ ctx)
+
+ wmt_test_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([wmt_test_tgt_sentences],
+ wmt_test_translation_out,
+ tokenized=False,
+ tokenizer=hparams.bleu,
+ split_compound_word=False,
+ bpe=False)
+
+ print('WMT14 EN-DE SOTA model test loss: %.2f; test bleu score: %.2f; time cost %.2fs'
+ %(wmt_test_loss, wmt_test_bleu_score * 100, (time.time() - eval_start_time)))
+
+.. code:: python
+
+ print('Sample translations:')
+ num_pairs = 3
+
+ for i in range(num_pairs):
+ print('EN:')
+ print(wmt_test_text[i][0])
+ print('DE-Candidate:')
+ print(wmt_test_translation_out[i])
+ print('DE-Reference:')
+ print(wmt_test_tgt_sentences[i])
+ print('========')
+
+Translation Inference
+~~~~~~~~~~~~~~~~~~~~~
+
+Here we show an actual translation example (EN-DE): given a source
+sentence, we translate it into German with the SOTA Transformer model.
+
+.. code:: python
+
+ import utils
+
+ print('Translate the following English sentence into German:')
+
+ sample_src_seq = 'We love each other'
+
+ print('[\'' + sample_src_seq + '\']')
+
+ sample_tgt_seq = utils.translate(wmt_translator,
+ sample_src_seq,
+ wmt_src_vocab,
+ wmt_tgt_vocab,
+ wmt_detokenizer,
+ ctx)
+
+ print('The German translation is:')
+ print(sample_tgt_seq)
+
+Train Your Own Transformer
+--------------------------
+
+In this subsection, we will go through the whole process of loading the
+translation dataset in a more unified way, creating the data sampler and
+loader, defining the Transformer model, and finally writing the training
+script to train the model yourself.
+
+Load and Preprocess TOY Dataset
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Note that we use demo mode (the ``TOY`` dataset) by default, since
+training on the whole WMT 2014 English-German dataset ``WMT2014BPE``
+would be slow (~1 day). If you really want to train to the SOTA result,
+please set ``demo = False``. To make the data processing blocks execute
+more efficiently, we package them (``transform`` etc.) in the
+``load_translation_data`` function used below. The function also returns
+the gold target sentences as well as the vocabularies.
+
+.. code:: python
+
+ demo = True
+ if demo:
+ dataset = 'TOY'
+ else:
+ dataset = 'WMT2014BPE'
+
+ data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab = \
+ dataprocessor.load_translation_data(
+ dataset=dataset,
+ src_lang=hparams.src_lang,
+ tgt_lang=hparams.tgt_lang)
+
+ data_train_lengths = dataprocessor.get_data_lengths(data_train)
+ data_val_lengths = dataprocessor.get_data_lengths(data_val)
+ data_test_lengths = dataprocessor.get_data_lengths(data_test)
+
+ data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False)
+ data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
+ for i, ele in enumerate(data_val)])
+ data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
+ for i, ele in enumerate(data_test)])
+
+Create Sampler and DataLoader for TOY Dataset
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Now, we have obtained ``data_train``, ``data_val``, and ``data_test``.
+The next step is to construct the sampler and DataLoader. The first step
+is to construct the batchify function, which pads and stacks sequences
+to form mini-batches.
+
+.. code:: python
+
+ train_batchify_fn = nlp.data.batchify.Tuple(
+ nlp.data.batchify.Pad(),
+ nlp.data.batchify.Pad(),
+ nlp.data.batchify.Stack(dtype='float32'),
+ nlp.data.batchify.Stack(dtype='float32'))
+ test_batchify_fn = nlp.data.batchify.Tuple(
+ nlp.data.batchify.Pad(),
+ nlp.data.batchify.Pad(),
+ nlp.data.batchify.Stack(dtype='float32'),
+ nlp.data.batchify.Stack(dtype='float32'),
+ nlp.data.batchify.Stack())
+
+ target_val_lengths = list(map(lambda x: x[-1], data_val_lengths))
+ target_test_lengths = list(map(lambda x: x[-1], data_test_lengths))
+
+We can then construct bucketing samplers, which generate batches by
+grouping sequences with similar lengths.
+
+.. code:: python
+
+ bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
+ train_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_train_lengths,
+ batch_size=hparams.batch_size,
+ num_buckets=hparams.num_buckets,
+ ratio=0.0,
+ shuffle=True,
+ use_average_length=True,
+ num_shards=1,
+ bucket_scheme=bucket_scheme)
+ print('Train Batch Sampler:')
+ print(train_batch_sampler.stats())
+
+
+ val_batch_sampler = nlp.data.FixedBucketSampler(lengths=target_val_lengths,
+ batch_size=hparams.test_batch_size,
+ num_buckets=hparams.num_buckets,
+ ratio=0.0,
+ shuffle=False,
+ use_average_length=True,
+ bucket_scheme=bucket_scheme)
+ print('Validation Batch Sampler:')
+ print(val_batch_sampler.stats())
+
+ test_batch_sampler = nlp.data.FixedBucketSampler(lengths=target_test_lengths,
+ batch_size=hparams.test_batch_size,
+ num_buckets=hparams.num_buckets,
+ ratio=0.0,
+ shuffle=False,
+ use_average_length=True,
+ bucket_scheme=bucket_scheme)
+ print('Test Batch Sampler:')
+ print(test_batch_sampler.stats())
+
+Given the samplers, we can create DataLoaders, which are iterable. Note
+that the data loaders for the validation and test datasets share the
+same batchify function ``test_batchify_fn``.
+
+.. code:: python
+
+ train_data_loader = nlp.data.ShardedDataLoader(data_train,
+ batch_sampler=train_batch_sampler,
+ batchify_fn=train_batchify_fn,
+ num_workers=8)
+ print('Length of train_data_loader: %d' % len(train_data_loader))
+ val_data_loader = gluon.data.DataLoader(data_val,
+ batch_sampler=val_batch_sampler,
+ batchify_fn=test_batchify_fn,
+ num_workers=8)
+ print('Length of val_data_loader: %d' % len(val_data_loader))
+ test_data_loader = gluon.data.DataLoader(data_test,
+ batch_sampler=test_batch_sampler,
+ batchify_fn=test_batchify_fn,
+ num_workers=8)
+ print('Length of test_data_loader: %d' % len(test_data_loader))
+
+Define Transformer Model
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+After obtaining the DataLoaders, we then start to define the Transformer.
+The encoder and decoder of the Transformer can be easily obtained by
+calling the ``get_transformer_encoder_decoder`` function. Then, we use
+the encoder and decoder in ``NMTModel`` to construct the Transformer
+model. ``model.hybridize`` allows computation to be done using the
+symbolic backend. We also use ``label_smoothing``.
+
+.. code:: python
+
+ encoder, decoder = nmt.transformer.get_transformer_encoder_decoder(units=hparams.num_units,
+ hidden_size=hparams.hidden_size,
+ dropout=hparams.dropout,
+ num_layers=hparams.num_layers,
+ num_heads=hparams.num_heads,
+ max_src_length=530,
+ max_tgt_length=549,
+ scaled=hparams.scaled)
+ model = nmt.translation.NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder, decoder=decoder,
+ share_embed=True, embed_size=hparams.num_units, tie_weights=True,
+ embed_initializer=None, prefix='transformer_')
+ model.initialize(init=mx.init.Xavier(magnitude=3.0), ctx=ctx)
+ model.hybridize()
+
+ print(model)
+
+ label_smoothing = nmt.loss.LabelSmoothing(epsilon=hparams.epsilon, units=len(tgt_vocab))
+ label_smoothing.hybridize()
+
+ loss_function = nmt.loss.SoftmaxCEMaskedLoss(sparse_label=False)
+ loss_function.hybridize()
+
+ test_loss_function = nmt.loss.SoftmaxCEMaskedLoss()
+ test_loss_function.hybridize()
+
+ detokenizer = nlp.data.SacreMosesDetokenizer()
+
+Here, we build the translator using beam search.
+
+.. code:: python
+
+ translator = nmt.translation.BeamSearchTranslator(model=model,
+ beam_size=hparams.beam_size,
+ scorer=nlp.model.BeamSearchScorer(alpha=hparams.lp_alpha,
+ K=hparams.lp_k),
+ max_length=200)
+ print('Use beam_size=%d, alpha=%.2f, K=%d' % (hparams.beam_size, hparams.lp_alpha, hparams.lp_k))
+
+Training Loop
+~~~~~~~~~~~~~
+
+Before conducting training, we need to create a trainer for updating the
+parameters. In the following example, we create a trainer that uses the
+ADAM optimizer.
+
+.. code:: python
+
+ trainer = gluon.Trainer(model.collect_params(), hparams.optimizer,
+ {'learning_rate': hparams.lr, 'beta2': 0.98, 'epsilon': 1e-9})
+ print('Use learning_rate=%.2f'
+ % (trainer.learning_rate))
+
+We can then write the training loop. During training, we evaluate on the
+validation and testing datasets every epoch, and record the parameters
+that give the highest BLEU score on the validation dataset. Before
+performing the forward and backward passes, we first use the
+``as_in_context`` function to copy the mini-batch to GPU. The statement
+``with mx.autograd.record()`` tells the Gluon backend to compute the
+gradients for the part inside the block. To quickly observe the
+convergence of the ``Loss``, we set ``epochs = 3``. Notice that, in
+order to obtain the best BLEU score, we would need more epochs and
+larger warmup steps, following the original paper; the SOTA results in
+the first subsection reflect this. Besides, we use Averaging SGD [2] to
+update the parameters, since it is more robust for the machine
+translation task.
+
+.. code:: python
+
+ best_valid_loss = float('Inf')
+ step_num = 0
+ #We use warmup steps as introduced in [1].
+ warmup_steps = hparams.warmup_steps
+ grad_interval = hparams.num_accumulated
+ model.collect_params().setattr('grad_req', 'add')
+ #We use Averaging SGD [2] to update the parameters.
+ average_start = (len(train_data_loader) // grad_interval) * \
+ (hparams.epochs - hparams.average_start)
+ average_param_dict = {k: mx.nd.array([0]) for k, v in
+ model.collect_params().items()}
+ update_average_param_dict = True
+ model.collect_params().zero_grad()
+ for epoch_id in range(hparams.epochs):
+ utils.train_one_epoch(epoch_id, model, train_data_loader, trainer,
+ label_smoothing, loss_function, grad_interval,
+ average_param_dict, update_average_param_dict,
+ step_num, ctx)
+ mx.nd.waitall()
+ # We define evaluation function as follows. The `evaluate` function use beam search translator
+ # to generate outputs for the validation and testing datasets.
+ valid_loss, _ = utils.evaluate(model, val_data_loader,
+ test_loss_function, translator,
+ tgt_vocab, detokenizer, ctx)
+ print('Epoch %d, valid Loss=%.4f, valid ppl=%.4f'
+ % (epoch_id, valid_loss, np.exp(valid_loss)))
+ test_loss, _ = utils.evaluate(model, test_data_loader,
+ test_loss_function, translator,
+ tgt_vocab, detokenizer, ctx)
+ print('Epoch %d, test Loss=%.4f, test ppl=%.4f'
+ % (epoch_id, test_loss, np.exp(test_loss)))
+ if valid_loss < best_valid_loss:
+ best_valid_loss = valid_loss
+ model.save_parameters('{}.{}'.format(hparams.save_dir, 'valid_best.params'))
+ model.save_parameters('{}.epoch{:d}.params'.format(hparams.save_dir, epoch_id))
+ mx.nd.save('{}.{}'.format(hparams.save_dir, 'average.params'), average_param_dict)
+
+ if hparams.average_start > 0:
+ for k, v in model.collect_params().items():
+ v.set_data(average_param_dict[k])
+ else:
+ model.load_parameters('{}.{}'.format(hparams.save_dir, 'valid_best.params'), ctx)
+ valid_loss, _ = utils.evaluate(model, val_data_loader,
+ test_loss_function, translator,
+ tgt_vocab, detokenizer, ctx)
+ print('Best model valid Loss=%.4f, valid ppl=%.4f'
+ % (valid_loss, np.exp(valid_loss)))
+ test_loss, _ = utils.evaluate(model, test_data_loader,
+ test_loss_function, translator,
+ tgt_vocab, detokenizer, ctx)
+ print('Best model test Loss=%.4f, test ppl=%.4f'
+ % (test_loss, np.exp(test_loss)))
+
+Conclusion
+----------
+
+- As showcased with the Transformer, GluonNLP supports deep neural
+  networks for seq2seq tasks and achieves SOTA results on the WMT 2014
+  English-German task.
+- The Gluon NLP Toolkit provides high-level APIs that can drastically
+  simplify the development process of models for NLP tasks sharing
+  the encoder-decoder structure.
+- Low-level APIs in the NLP Toolkit enable easy customization.
+
+Documentation can be found at http://gluon-nlp.mxnet.io/index.html
+
+The code is available at https://github.com/dmlc/gluon-nlp
+
+References
+----------
+
+[1] Vaswani, Ashish, et al. "Attention is all you need." Advances in
+Neural Information Processing Systems. 2017.
+
+[2] Polyak, Boris T, and Anatoli B. Juditsky. "Acceleration of
+stochastic approximation by averaging." SIAM Journal on Control and
+Optimization. 1992.
+
+.. |transformer| image:: transformer.png
+
diff --git a/docs/python_docs/python/tutorials/packages/gluon/trainer.md b/docs/python_docs/python/tutorials/packages/gluon/trainer.md
new file mode 100644
index 000000000000..c154a56bed26
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/gluon/trainer.md
@@ -0,0 +1,168 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Trainer
+
+Training a neural network model consists of iteratively performing three simple steps.
+
+The first step is the forward step which computes the loss. In MXNet Gluon, this first step is achieved by doing a forward pass by calling `net.forward(X)` or simply `net(X)` and then calling the loss function with the result of the forward pass and the labels. For example `l = loss_fn(net(X), y)`.
+
+The second step is the backward step which computes the gradient of the loss with respect to the parameters. In Gluon, this step is achieved by doing the first step in an [`autograd.record()`](http://beta.mxnet.io/guide/packages/autograd/autograd.html) scope to record the computations needed to calculate the loss, and then calling `l.backward()` to compute the gradient of the loss with respect to the parameters.
+
+The final step is to update the neural network model parameters using an optimization algorithm. In Gluon, this step is performed by the [`gluon.Trainer`](http://beta.mxnet.io/api/gluon/mxnet.gluon.Trainer.html) and is the subject of this guide. When creating a Gluon `Trainer` you must provide a collection of parameters that need to be learnt. You also provide an `Optimizer` that will be used to update the parameters every training iteration when `trainer.step` is called.
+
+## Basic Usage
+
+### Network and Trainer
+
+To illustrate how to use the Gluon `Trainer`, we will create a simple perceptron model and create a `Trainer` instance using the perceptron model parameters and a simple optimizer: `sgd` with a learning rate of 1.
+
+```{.python .input}
+from mxnet import nd, autograd, optimizer, gluon
+
+net = gluon.nn.Dense(1)
+net.initialize()
+
+trainer = gluon.Trainer(net.collect_params(),
+ optimizer='sgd', optimizer_params={'learning_rate':1})
+
+```
+
+### Forward and Backward Pass
+
+Before we can use the `trainer` to update model parameters, we must first run the forward and backward passes. Here we implement a function to compute the first two steps (forward step and backward step) of training the perceptron on a random dataset.
+
+```{.python .input}
+batch_size = 8
+X = nd.random.uniform(shape=(batch_size, 4))
+y = nd.random.uniform(shape=(batch_size,))
+
+loss = gluon.loss.L2Loss()
+
+def forward_backward():
+ with autograd.record():
+ l = loss(net(X), y)
+ l.backward()
+
+forward_backward()
+```
+
+**Warning**: It is extremely important that the gradients of the loss function with respect to your model parameters are computed before running `trainer.step()`. A common way to introduce bugs into your model training code is to omit the `loss.backward()` before the update step.
+
+
+
+Before updating, let's check the current network parameters.
+
+```{.python .input}
+curr_weight = net.weight.data().copy()
+print(curr_weight)
+```
+
+### `Trainer` step
+
+Now we will call the `step` method to perform one update. We pass the `batch_size` as an argument to normalize the gradients so that the update is independent of the batch size; otherwise we'd get larger updates with larger batches. After the step, we can see that the network parameters have changed.
+
+```{.python .input}
+trainer.step(batch_size)
+print(net.weight.data())
+```
+
+Since we used plain SGD, the update rule is $w = w - \eta/b \nabla \ell$, where $b$ is the batch size and $\nabla\ell$ is the gradient of the loss function with respect to the weights and $\eta$ is the learning rate.
+
+We can verify it by running the following code snippet which is explicitly performing the SGD update.
+
+```{.python .input}
+print(curr_weight - net.weight.grad() * 1 / batch_size)
+```
+
+
+
+## Advanced Usage
+
+### Using Optimizer Instance
+
+In the previous example, we use the string argument `sgd` to select the optimization method, and `optimizer_params` to specify the optimization method arguments.
+
+All pre-defined optimization methods can be passed in this way and the complete list of implemented optimizers is provided in the [`mxnet.optimizer`](http://beta.mxnet.io/api/gluon-related/mxnet.optimizer.html) module.
+
+However we can also pass an optimizer instance directly to the `Trainer` constructor.
+
+For example:
+
+```{.python .input}
+optim = optimizer.Adam(learning_rate = 1)
+trainer = gluon.Trainer(net.collect_params(), optim)
+```
+
+```{.python .input}
+forward_backward()
+trainer.step(batch_size)
+net.weight.data()
+```
+
+For reference and implementation details about each optimizer, please refer to the [guide](http://beta.mxnet.io/guide/packages/optimizer/optimizer.html) for the `optimizer` module.
+
+### KVStore Options
+
+The `Trainer` constructor also accepts the following keyword arguments (a short sketch follows the list):
+
+- `kvstore` – how key value store should be created for multi-gpu and distributed training. Check out [`mxnet.kvstore.KVStore`](http://beta.mxnet.io/api/gluon-related/mxnet.kvstore.KVStore.html#mxnet.kvstore.KVStore) for more information. String options are any of the following ['local', 'device', 'dist_device_sync', 'dist_device_async'].
+- `compression_params` – Specifies type of gradient compression and additional arguments depending on the type of compression being used. See [`mxnet.KVStore.set_gradient_compression_method`](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.kvstore.KVStore.set_gradient_compression.html#mxnet.kvstore.KVStore.set_gradient_compression) for more details on gradient compression.
+- `update_on_kvstore` – Whether to perform parameter updates on KVStore. If None, then the `Trainer` instance will choose the more suitable option depending on the type of KVStore.
+
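+For example, these arguments can be passed like this (a minimal sketch; the values shown are illustrative assumptions, not recommendations):
+
+```python
+# Sketch: reuse the perceptron from above, but configure the KVStore explicitly.
+# 'device' aggregates gradients on GPU; update_on_kvstore=None lets the Trainer
+# pick the most suitable option for the chosen KVStore.
+trainer = gluon.Trainer(net.collect_params(),
+                        optimizer='sgd', optimizer_params={'learning_rate': 1},
+                        kvstore='device',
+                        update_on_kvstore=None)
+```
+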
+### Changing the Learning Rate
+
+We set the initial learning rate when creating a trainer by passing the learning rate as an `optimizer_param`. However, sometimes we may need to change the learning rate during training, for example when doing an explicit learning rate warmup schedule. The trainer instance provides an easy way to achieve this.
+
+The current learning rate can be accessed through the `learning_rate` attribute.
+
+```{.python .input}
+trainer.learning_rate
+```
+
+We can change it through the `set_learning_rate` method.
+
+```{.python .input}
+trainer.set_learning_rate(0.1)
+trainer.learning_rate
+```
+
+
+
+In addition, there are multiple pre-defined learning rate scheduling methods that are already implemented in the [`mxnet.lr_scheduler`](http://beta.mxnet.io/api/gluon-related/mxnet.lr_scheduler.html) module. The learning rate schedulers can be incorporated into your trainer by passing them in as an `optimizer_param` entry. Please refer to the [LR scheduler guide](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/learning_rate_schedules.html) to learn more.
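+
+As a brief sketch (the scheduler type and its parameters here are illustrative assumptions), a scheduler can be attached through `optimizer_params` like this:
+
+```python
+from mxnet import lr_scheduler
+
+# halve the learning rate every 250 update steps
+schedule = lr_scheduler.FactorScheduler(step=250, factor=0.5)
+trainer = gluon.Trainer(net.collect_params(), optimizer='sgd',
+                        optimizer_params={'learning_rate': 1,
+                                          'lr_scheduler': schedule})
+```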
+
+
+
+## Summary
+
+* The MXNet Gluon `Trainer` API is used to update the parameters of a network with a particular optimization algorithm.
+* After the forward and backward pass, the model update step is done in Gluon using `trainer.step()`.
+* A Gluon `Trainer` can be instantiated by passing in the name of the optimizer to use and the `optimizer_params` for that optimizer or alternatively by passing in an instance of `mxnet.optimizer.Optimizer`.
+* You can change the learning rate for a Gluon `Trainer` by setting the member variable but Gluon also provides a module for learning rate scheduling.
+
+
+
+## Next Steps
+
+While optimization and optimizers play a significant role in deep learning model training, there are still other important components to model training. Here are a few suggestions about where to look next.
+
+* The [Optimizer API](http://beta.mxnet.io/api/gluon-related/mxnet.optimizer.html) and [guide](http://beta.mxnet.io/guide/packages/optimizer/optimizer.html) have information about all the different optimizers implemented in MXNet and their update steps. The [Dive into Deep Learning](http://en.diveintodeeplearning.org/chapter_optimization/index.html) book also has a chapter dedicated to optimization methods and explains various key optimizers in great detail.
+
+- Take a look at the [guide to parameter initialization](http://beta.mxnet.io/guide/packages/gluon/init.html) in MXNet to learn about what initialization schemes are already implemented, and how to implement your custom initialization schemes.
+- Also check out this [guide on parameter management](http://beta.mxnet.io/guide/packages/gluon/parameters.html) to learn about how to manage model parameters in gluon.
+- Make sure to take a look at the [guide to scheduling learning rates](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/learning_rate_schedules.html) to learn how to create learning rate schedules to make your training converge faster.
+- Finally take a look at the [KVStore API](http://beta.mxnet.io/api/gluon-related/mxnet.kvstore.KVStore.html#mxnet.kvstore.KVStore) to learn how parameter values are synchronized over multiple devices.
diff --git a/docs/python_docs/python/tutorials/packages/index.rst b/docs/python_docs/python/tutorials/packages/index.rst
new file mode 100644
index 000000000000..f6aa17e6b6c2
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/index.rst
@@ -0,0 +1,140 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Packages
+========
+
+The documents in this unit dive into the details of how each MXNet module works.
+
+High Level APIs
+---------------
+
+.. container:: cards
+
+ .. card::
+ :title: Gluon
+ :link: gluon/index.html
+
+ MXNet's imperative interface for Python. If you're new to MXNet, start here!
+
+
+Shared APIs
+-----------
+
+.. container:: cards
+
+ .. card::
+ :title: NDArray API
+ :link: ndarray/index.html
+
+ How to use the NDArray API to manipulate data.
+ A useful set of tutorials for beginners.
+
+ .. card::
+ :title: Symbol API
+ :link: symbol/index.html
+
+ How to use MXNet's Symbol API.
+
+ .. card::
+ :title: Autograd API
+ :link: autograd/autograd.html
+
+ How to use Automatic Differentiation with the Autograd API.
+
+ .. card::
+ :title: Learning Rate
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/learning_rate_schedules.html
+
+ How to use the Learning Rate Scheduler.
+
+ .. card::
+ :title: KVStore API
+ :link: kvstore/index.html
+
+ How to use the KVStore API for distributed training.
+
+ .. card::
+ :title: Data APIs
+ :link: data/index.html
+
+ How to use MXNet's data APIs.
+
+ .. card::
+ :title: Visualizations
+ :link: viz/index.html
+
+ How to use MXNet's visualization features.
+
+ .. card::
+ :title: ONNX
+ :link: onnx/index.html
+
+ How to use Open Neural Network Exchange (ONNX) with MXNet.
+
+ ..
+ .. card::
+ :title: Optimizer
+ :link: optimizer.html
+
+ How to use optimizer.
+ ..
+
+Old APIs
+--------
+Currently supported, but not recommended APIs.
+
+.. container:: cards
+
+ .. card::
+ :title: Module
+ :link: module/index.html
+
+ MXNet's symbolic interface for Python.
+
+
+.. toctree::
+ :hidden:
+
+ gluon/index
+ ndarray/index
+ symbol/index
+ autograd/autograd
+ onnx/index
+
+..
+ Basic
+ -----
+
+ .. toctree::
+ :maxdepth: 1
+
+ mxboard
+ gpus
+
+ Advanced
+ --------
+
+
+ .. toctree::
+ :maxdepth: 1
+
+ symbol
+ record-io
+ sparse
+ control-flow
+ distributed-training
diff --git a/docs/python_docs/python/tutorials/packages/kvstore/index.rst b/docs/python_docs/python/tutorials/packages/kvstore/index.rst
new file mode 100644
index 000000000000..4cf15d93ed3f
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/kvstore/index.rst
@@ -0,0 +1,44 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+KVStore
+=======
+
+.. container:: cards
+
+ .. card::
+ :title: Distributed Training Using the KVStore API
+ :link: https://mxnet.incubator.apache.org/api/faq/distributed_training
+
+ How to use the KVStore API to use multiple GPUs when training a model.
+
+ .. card::
+ :title: Distributed Key-Value Store
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/python/kvstore.html
+
+ How to use the KVStore API to share data across different devices.
+
+References
+-----------------
+
+- `KVStore API. <../api/gluon-related/mxnet.kvstore.html>`_
+
+.. toctree::
+ :hidden:
+ :glob:
+
+ *
diff --git a/docs/python_docs/python/tutorials/packages/module/index.rst b/docs/python_docs/python/tutorials/packages/module/index.rst
new file mode 100644
index 000000000000..a485b7f523c3
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/module/index.rst
@@ -0,0 +1,52 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Module
+======
+
+.. container:: cards
+
+ .. card::
+ :title: Module API Intro
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/basic/module.html
+
+ How to use MXNet's Module API.
+
+ .. card::
+ :title: Converting Module API code to the Gluon API
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/python/module_to_gluon.html
+
+ How to transform models written using Module code to Gluon code.
+
+ .. card::
+ :title: Data Augmentation
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/python/data_augmentation.html
+
+ How to augment data with the Module API.
+
+ .. card::
+ :title: Predict with pre-trained models
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/python/predict_image.html
+
+ How to recognize objects in an image with a pre-trained model.
+
+
+.. toctree::
+ :hidden:
+
+ module
+ data_augmentation
diff --git a/docs/python_docs/python/tutorials/packages/ndarray/01-ndarray-intro.md b/docs/python_docs/python/tutorials/packages/ndarray/01-ndarray-intro.md
new file mode 100644
index 000000000000..26e9c48bd8e3
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/ndarray/01-ndarray-intro.md
@@ -0,0 +1,176 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# An Intro: Manipulate Data the MXNet Way with NDArray
+
+## Overview
+This guide
+will introduce you to how data is handled with MXNet. You will learn the basics
+about MXNet's multi-dimensional array format, `ndarray`.
+
+This content was extracted and simplified from the gluon tutorials in
+[Dive Into Deep Learning](http://gluon.io/).
+
+## Prerequisites
+* [MXNet installed in a Python environment](../../../install/index.html?language=Python).
+* Python 2.7.x or Python 3.x
+
+
+## Getting started
+
+In this chapter, we'll get
+you going with the basic functionality. Don't worry if you don't understand any
+of the basic math, like element-wise operations or normal distributions. In the
+next two chapters we'll take another pass at `NDArray`, teaching you both the math
+you'll need and how to realize it in code.
+
+To get started, let's import
+`mxnet`. We'll also import `nd`, MXNet's `ndarray` module, for convenience.
+We'll make a habit of setting a random seed so that you always get the same
+results that we do.
+
+```python
+import mxnet as mx
+from mxnet import nd
+
+mx.random.seed(1)  # fix the random seed for reproducible results
+```
+
+Let's start with a very simple 1-dimensional array created from a Python list.
+
+```python
+x = nd.array([1,2,3])
+print(x)
+```
+
+Now a 2-dimensional array.
+
+```python
+y = nd.array([[1,2,3,4], [1,2,3,4], [1,2,3,4]])
+print(y)
+```
+
+Next, let's see how to create an `NDArray` without initializing any values.
+Specifically, we'll create a 2D array (also called a *matrix*) with 3 rows and 3
+columns using the `.empty` function. We'll also try out `.full`, which takes an
+additional parameter for the value you want to fill the array with.
+
+```python
+x = nd.empty((3, 3))
+print(x)
+x = nd.full((3,3), 7)
+print(x)
+```
+
+`empty` just grabs some memory and hands us back a matrix without setting the
+values of any of its entries. This means that the entries can have any form of
+values, including very big ones! Typically, we'll want our matrices initialized
+and very often we want a matrix of all zeros, so we can use the `.zeros`
+function. If you're feeling experimental, try one of the several [array creation
+functions](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#array-creation-routines).
+
+
+
+```python
+x = nd.zeros((3, 10))
+print(x)
+```
+
+Similarly, `ndarray` has a function to create a matrix of all ones aptly named
+[ones](https://mxnet.incubator.apache.org/api/python/ndarray.html?highlight=random_normal#mxnet.ndarray.ones).
+
+```python
+x = nd.ones((3, 4))
+print(x)
+```
+
+Often, we'll want to create arrays whose values are sampled randomly. This is
+especially common when we intend to use the array as a parameter in a neural
+network. In this snippet, we initialize with values drawn from a standard normal
+distribution with zero mean and unit variance using
+[random_normal](https://mxnet.incubator.apache.org/api/python/ndarray.html?highlight=random_normal#mxnet.ndarray.random_normal).
+
+
+
+```python
+y = nd.random_normal(0, 1, shape=(3, 4))
+print(y)
+```
+
+Sometimes you will want to copy an array by its shape but not its contents. You
+can do this with `.zeros_like`.
+
+```python
+z = nd.zeros_like(y)
+print(z)
+```
+
+As in NumPy, the dimensions of each `NDArray` are accessible via the `.shape`
+attribute.
+
+```python
+y.shape
+```
+
+We can also query its `.size`, which is equal to the product of the components
+of the shape. Together with the precision of the stored values, this tells us
+how much memory the array occupies.
+
+
+```python
+y.size
+```
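+
+For example, since `y` holds float32 values (4 bytes each, as we will see next),
+we can estimate its memory footprint as `size * 4` bytes:
+
+```python
+# 12 elements * 4 bytes per float32 value = 48 bytes
+print(y.size * 4)
+```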
+
+We can query the data type using `.dtype`.
+
+```python
+y.dtype
+```
+
+`float32` is the default data type. Performance can be improved with less
+precision, or you might want to use a different data type. You can force the
+data type when you create the array using a numpy type. This requires you to
+import numpy first.
+
+```python
+import numpy as np
+a = nd.array([1,2,3])
+b = nd.array([1,2,3], dtype=np.int32)
+c = nd.array([1.2, 2.3], dtype=np.float16)
+(a.dtype, b.dtype, c.dtype)
+```
+
+As you will come to learn in detail later, operations and memory storage will
+happen on specific devices that you can set. You can compute on CPU(s), GPU(s), a
+specific GPU, or all of the above depending on your situation and preference.
+Using `.context` reveals the location of the variable.
+
+```python
+y.context
+```
+
+## Next Up
+
+[NDArray Operations](02-ndarray-operations.md)
diff --git a/docs/python_docs/python/tutorials/packages/ndarray/02-ndarray-operations.md b/docs/python_docs/python/tutorials/packages/ndarray/02-ndarray-operations.md
new file mode 100644
index 000000000000..0f2520f2a322
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/ndarray/02-ndarray-operations.md
@@ -0,0 +1,260 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# NDArray Operations
+
+## Overview
+This guide will introduce you to MXNet's array operations.
+
+This content was extracted and simplified from the gluon tutorials in
+[Dive Into Deep Learning](http://gluon.io/).
+
+## Prerequisites
+* [MXNet installed in a Python environment](../../../install/index.html?language=Python).
+* Python 2.7.x or Python 3.x
+
+
+## Operations
+
+NDArray supports a large number of standard mathematical operations,
+such as element-wise addition:
+
+
+```{.python .input}
+import mxnet as mx
+from mxnet import nd
+```
+
+```{.python .input}
+x = nd.ones((3, 4))
+y = nd.random_normal(0, 1, shape=(3, 4))
+print('x=', x)
+print('y=', y)
+x = x + y
+print('x = x + y, x=', x)
+```
+
+Multiplication:
+
+```{.python .input}
+x = nd.array([1, 2, 3])
+y = nd.array([2, 2, 2])
+x * y
+```
+
+And exponentiation:
+
+
+```{.python .input}
+nd.exp(x)
+```
+
+We can also use `nd.dot`: for 1D arrays like these it computes the inner product, and for matrices, grabbing a transpose lets us compute a proper matrix-matrix product.
+
+
+```{.python .input}
+nd.dot(x, y.T)
+```
+
+We'll explain these operations and present even more operators in the [linear
+algebra](P01-C03-linear-algebra.ipynb) chapter. But for now, we'll stick with
+the mechanics of working with NDArrays.
+
+## In-place operations
+
+In the previous
+example, every time we ran an operation, we allocated new memory to host its
+results. For example, if we write `y = x + y`, we will dereference the matrix
+that `y` used to point to and instead point it at the newly allocated memory. We
+can show this using Python's `id()` function, which tells us precisely which
+object a variable refers to.
+
+
+
+```{.python .input}
+print('y=', y)
+print('id(y):', id(y))
+y = y + x
+print('after y=y+x, y=', y)
+print('id(y):', id(y))
+```
+
+We can assign the result to a previously allocated array with slice notation,
+e.g., `result[:] = ...`.
+
+```{.python .input}
+print('x=', x)
+z = nd.zeros_like(x)
+print('z is zeros_like x, z=', z)
+print('id(z):', id(z))
+print('y=', y)
+z[:] = x + y
+print('z[:] = x + y, z=', z)
+print('id(z) is the same as before:', id(z))
+```
+
+However, `x+y` here will still allocate a temporary buffer to store the result
+before copying it to z. To make better use of memory, we can perform operations
+in place, avoiding temporary buffers. To do this we specify the `out` keyword
+argument that every operator supports:
+
+```{.python .input}
+print('x=', x, 'is in id(x):', id(x))
+print('y=', y, 'is in id(y):', id(y))
+print('z=', z, 'is in id(z):', id(z))
+nd.elemwise_add(x, y, out=z)
+print('after nd.elemwise_add(x, y, out=z), x=', x, 'is in id(x):', id(x))
+print('after nd.elemwise_add(x, y, out=z), y=', y, 'is in id(y):', id(y))
+print('after nd.elemwise_add(x, y, out=z), z=', z, 'is in id(z):', id(z))
+```
+
+If we're not planning to re-use ``x``, then we can assign the result to ``x``
+itself. There are two ways to do this in MXNet:
+
+1. By using slice notation: `x[:] = x op y`
+2. By using the op-equals operators like `+=`
+
+```{.python .input}
+print('x=', x, 'is in id(x):', id(x))
+x += y
+print('x=', x, 'is in id(x):', id(x))
+```
+
+## Slicing
+MXNet NDArrays support slicing in all the ridiculous ways you might
+imagine accessing your data. For a quick review:
+
+* items start through end-1: `a[start:end]`
+* items start through the rest of the array: `a[start:]`
+* items from the beginning through end-1: `a[:end]`
+* a copy of the whole array: `a[:]`
+
+Here's an example of slicing the second and third elements of a 1D array,
+and then the second and third rows of a 2D array.
+
+```{.python .input}
+x = nd.array([1, 2, 3])
+print('1D complete array, x=', x)
+s = x[1:3]
+print('slicing the 2nd and 3rd elements, s=', s)
+x = nd.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
+print('multi-D complete array, x=', x)
+s = x[1:3]
+print('slicing the 2nd and 3rd elements, s=', s)
+```
+
+Now let's try writing to a specific element.
+
+```{.python .input}
+print('original x, x=', x)
+x[2] = 9.0
+print('replaced entire row with x[2] = 9.0, x=', x)
+x[0,2] = 9.0
+print('replaced specific element with x[0,2] = 9.0, x=', x)
+x[1:2,1:3] = 5.0
+print('replaced range of elements with x[1:2,1:3] = 5.0, x=', x)
+```
+
+Multi-dimensional slicing is also supported.
+
+```{.python .input}
+x = nd.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
+print('original x, x=', x)
+s = x[1:2,1:3]
+print('plucking specific elements with x[1:2,1:3]', s)
+s = x[:,:1]
+print('first column with x[:,:1]', s)
+s = x[:1,:]
+print('first row with x[:1,:]', s)
+s = x[:,3:]
+print('last column with x[:,3:]', s)
+s = x[2:,:]
+print('last row with x[2:,:]', s)
+```
+
+## Broadcasting
+
+You might wonder, what happens if you add a vector `y` to a
+matrix `X`? These operations, where we compose a low-dimensional array `y` with
+a high-dimensional array `X`, invoke a functionality called broadcasting. First
+we'll introduce `.arange`, which is useful for filling out an array with evenly
+spaced data. Then we can take the low-dimensional array and duplicate it along
+any axis with dimension 1 to match the shape of the high-dimensional array.
+Consider the following example.
+
+```{.python .input}
+x = nd.ones(shape=(3,6))
+print('x = ', x)
+y = nd.arange(6)
+print('y = ', y)
+print('x + y = ', x + y)
+```
+
+While `y` is initially of shape (6,),
+MXNet infers its shape to be (1,6),
+and then broadcasts along the rows to form a (3,6) matrix.
+You might wonder why MXNet chose to interpret `y` as a (1,6) matrix and not (6,1).
+That's because broadcasting prefers to duplicate along the leftmost axis.
+We can alter this behavior by explicitly giving `y` a 2D shape using `.reshape`.
+Note that to broadcast along the columns of `x`, `y` must have shape (3,1), and
+therefore only 3 elements, so below we recreate it with `nd.arange(3)`.
+You can also chain `.arange` and `.reshape` to do this in one step.
+
+```{.python .input}
+# y needs 3 elements to take the shape (3,1) and broadcast along the columns of x
+y = nd.arange(3)
+y = y.reshape((3,1))
+print('y = ', y)
+print('x + y = ', x + y)
+# chaining .arange and .reshape does the same thing in one step
+y = nd.arange(3).reshape((3,1))
+print('y = ', y)
+```
+
+## Converting from MXNet NDArray to NumPy
+Converting MXNet NDArrays to and from
+NumPy is easy. The converted arrays do not share memory.
+
+```{.python .input}
+a = x.asnumpy()
+type(a)
+```
+
+```{.python .input}
+y = nd.array(a)
+print('id(a)=', id(a), 'id(x)=', id(x), 'id(y)=', id(y))
+```
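+
+As a quick sanity check that the converted arrays really do not share memory,
+we can modify the NumPy copy in place and confirm that the original NDArray is
+unchanged:
+
+```{.python .input}
+a[0] = -1.0                 # modify the NumPy copy in place
+print('a[0] =', a[0])       # the NumPy array reflects the change
+print('x[0] =', x[0])       # the original NDArray is untouched
+```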
+
+## Next Up
+
+[NDArray Contexts](03-ndarray-contexts.md)
diff --git a/docs/python_docs/python/tutorials/packages/ndarray/03-ndarray-contexts.md b/docs/python_docs/python/tutorials/packages/ndarray/03-ndarray-contexts.md
new file mode 100644
index 000000000000..f6e365974d64
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/ndarray/03-ndarray-contexts.md
@@ -0,0 +1,89 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# NDArray Contexts
+
+## Overview
+This guide will introduce you to managing CPU versus GPU contexts for handling data.
+
+This content was extracted and simplified from the gluon tutorials in
+[Dive Into Deep Learning](http://gluon.io/).
+
+## Prerequisites
+* [MXNet installed (with GPU support) in a Python environment](../../../install/index.html?language=Python).
+* Python 2.7.x or Python 3.x
+* **One or more GPUs**
+
+
+## Managing Context
+
+In MXNet, every array has a context.
+One context could be the CPU. Other contexts might be various GPUs.
+Things can get even hairier when we deploy jobs across multiple servers.
+By assigning arrays to contexts intelligently, we can minimize
+the time spent transferring data between devices.
+For example, when training neural networks on a server with a GPU,
+we typically prefer for the model's parameters to live on the GPU.
+If you have a GPU, let's try initializing an array on the first GPU.
+Otherwise, use `ctx=mx.cpu()` in place of `ctx=gpu(0)`.
+
+```{.python .input}
+from mxnet import gpu
+from mxnet import nd
+z = nd.ones(shape=(3,3), ctx=gpu(0))
+print(z)
+```
+
+Given an NDArray on a given context, we can copy it to another context by using
+the `copyto()` method. Here we first create an array on the CPU (the default
+context) and then copy it to the GPU. Skip this if you don't have a GPU at the
+moment.
+
+```{.python .input}
+x = nd.ones(shape=(3,3))  # created on the CPU by default
+x_gpu = x.copyto(gpu(0))
+print(x_gpu)
+```
+
+The result of an operator will have the same context as the inputs.
+
+```{.python .input}
+x_gpu + z
+```
+
+## Watch out!
+
+Imagine that your variable z already lives on your first GPU
+(`gpu(0)`). What happens if we call `z.copyto(gpu(0))`? It will make a copy and
+allocate new memory, even though that variable already lives on the desired
+device!
+
+
+Often, we only want to make
+a copy if the variable currently lives in the wrong context. In these cases, we
+can call `as_in_context()`. If the variable is already on `gpu(0)` then this is
+a no-op.
+
+```{.python .input}
+print('id(z):', id(z))
+z = z.copyto(gpu(0))
+print('id(z):', id(z))
+z = z.as_in_context(gpu(0))
+print('id(z):', id(z))
+print(z)
+```
+
+## Next Up
+
+[Back to NDArray API Guides](.)
diff --git a/docs/python_docs/python/tutorials/packages/ndarray/index.rst b/docs/python_docs/python/tutorials/packages/ndarray/index.rst
new file mode 100644
index 000000000000..9f8d04741c6c
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/ndarray/index.rst
@@ -0,0 +1,88 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+NDArray
+========
+
+.. container:: cards
+
+
+ .. card::
+ :title: Introduction to NDArray - Part 1
+ :link: 01-ndarray-intro.html
+
+ Learn how to manipulate data with MXNet's multi-dimensional data format, NDArray.
+
+ .. card::
+ :title: Introduction to NDArray - Part 2: Operations
+ :link: 02-ndarray-operations.html
+
+ Learn basic array operations like math and slicing.
+
+ .. card::
+ :title: Introduction to NDArray - Part 3: Contexts
+ :link: 03-ndarray-contexts.html
+
+ This guide will introduce you to how CPU and GPU contexts are handled with MXNet.
+
+ .. card::
+ :title: NDArray API Quickstart
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/ndarray.html
+
+ If you are familiar with NumPy you can start here.
+
+ .. card::
+ :title: Imperative tensor operations
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/basic/ndarray.html
+
+ A more detailed look at NDArray operations.
+
+ .. card::
+ :title: Reshape vs Transpose
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/basic/ndarray_indexing.html
+
+ A guide on the difference between the reshape and transpose operators.
+
+ .. card::
+ :title: NDArray Indexing
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/basic/ndarray_indexing.html
+
+ How to use the indexing features of NDArray.
+
+ .. card::
+ :title: RowSparseNDArray
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/sparse/row_sparse.html
+
+ How to implement sparse tensors in NDArray.
+
+ .. card::
+ :title: CSRNDArray
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/sparse/csr.html
+
+ How to store and manipulate large sparse matrices.
+
+ .. card::
+ :title: Sparse Symbols
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/sparse/train.html
+
+ How to train a linear regression model with sparse symbols.
+
+.. toctree::
+ :hidden:
+
+ ndarray
+ sparse
diff --git a/docs/python_docs/python/tutorials/packages/onnx/index.rst b/docs/python_docs/python/tutorials/packages/onnx/index.rst
new file mode 100644
index 000000000000..a3483337c9a4
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/onnx/index.rst
@@ -0,0 +1,51 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+ONNX
+======
+
+.. container:: cards
+
+ .. card::
+ :title: Fine-tuning an ONNX model with MXNet/Gluon
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/onnx/fine_tuning_gluon.html
+
+ A tutorial on loading a model in Gluon and fine-tuning it on a dataset.
+
+ .. card::
+ :title: Running inference on MXNet/Gluon from an ONNX model
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/onnx/inference_on_onnx_model.html
+
+ A tutorial on running inference from an ONNX model.
+
+ .. card::
+ :title: Importing an ONNX model into MXNet
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/onnx/super_resolution.html
+
+ How to load a pre-trained ONNX model file into MXNet.
+
+ .. card::
+ :title: Export ONNX Models
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/onnx/export_mxnet_to_onnx.html
+
+ How to export an MXNet model to the ONNX model format.
+
+
+.. toctree::
+ :hidden:
+
+ export_mxnet_to_onnx
diff --git a/docs/python_docs/python/tutorials/packages/optimizer/images/momentum_sgd_animation.gif b/docs/python_docs/python/tutorials/packages/optimizer/images/momentum_sgd_animation.gif
new file mode 100644
index 000000000000..759f35c3aa9f
Binary files /dev/null and b/docs/python_docs/python/tutorials/packages/optimizer/images/momentum_sgd_animation.gif differ
diff --git a/docs/python_docs/python/tutorials/packages/optimizer/images/nesterov_momentum_animation.gif b/docs/python_docs/python/tutorials/packages/optimizer/images/nesterov_momentum_animation.gif
new file mode 100644
index 000000000000..e8ee688f5980
Binary files /dev/null and b/docs/python_docs/python/tutorials/packages/optimizer/images/nesterov_momentum_animation.gif differ
diff --git a/docs/python_docs/python/tutorials/packages/optimizer/images/sgd_animation.gif b/docs/python_docs/python/tutorials/packages/optimizer/images/sgd_animation.gif
new file mode 100644
index 000000000000..12d558239c16
Binary files /dev/null and b/docs/python_docs/python/tutorials/packages/optimizer/images/sgd_animation.gif differ
diff --git a/docs/python_docs/python/tutorials/packages/optimizer/optimizer.md b/docs/python_docs/python/tutorials/packages/optimizer/optimizer.md
new file mode 100644
index 000000000000..dfc56abedd02
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/optimizer/optimizer.md
@@ -0,0 +1,429 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Optimization Algorithms
+
+Deep learning models are composed of a model architecture and the model parameters. The model architecture is chosen based on the task - for example, Convolutional Neural Networks (CNNs) are very successful at image-based tasks and Recurrent Neural Networks (RNNs) are better suited for sequential prediction tasks. However, the values of the model parameters are learned by solving an optimization problem during model training.
+
+To learn the parameters, we start with an initialization scheme and iteratively refine the initial parameter values by moving them along a direction that is opposite to the (approximate) gradient of the loss function. The extent to which the parameters are updated in this direction is governed by a hyperparameter called the learning rate. This process, known as gradient descent, is the backbone of optimization algorithms in deep learning. In MXNet, this functionality is abstracted by the [Optimizer API](http://beta.mxnet.io/api/gluon-related/mxnet.optimizer.html).
+
+When training a deep learning model using the MXNet [gluon API](http://beta.mxnet.io/guide/packages/gluon/index.html), a gluon [Trainer](http://beta.mxnet.io/guide/packages/gluon/trainer.html) is initialized with all the learnable parameters and the optimizer used to learn those parameters. A single step of iterative refinement of the model parameters in MXNet is achieved by calling [`trainer.step`](http://beta.mxnet.io/api/gluon/_autogen/mxnet.gluon.Trainer.step.html), which in turn uses the gradient (and perhaps some state information) to update the parameters by calling `optimizer.update`.
+
+Here is an example of how a trainer with an optimizer is created for a simple linear (Dense) network.
+
+
+```python
+from mxnet import gluon, optimizer
+
+net = gluon.nn.Dense(1)
+net.initialize()
+optim = optimizer.SGD(learning_rate=0.1)
+trainer = gluon.Trainer(net.collect_params(), optimizer=optim)
+```
+
+In model training, the code snippet above would be followed by a training loop which, at every iteration, performs a forward pass (to compute the loss), a backward pass (to compute the gradient of the loss with respect to the parameters), and a trainer step (which updates the parameters using the gradient). See the [gluon Trainer guide](http://beta.mxnet.io/guide/packages/gluon/trainer.html) for a complete example.
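+
+A minimal sketch of such a loop, using a made-up toy dataset and an L2 loss purely for illustration:
+
+
+```python
+from mxnet import autograd, gluon, nd, optimizer
+
+net = gluon.nn.Dense(1)
+net.initialize()
+optim = optimizer.SGD(learning_rate=0.1)
+trainer = gluon.Trainer(net.collect_params(), optimizer=optim)
+loss_fn = gluon.loss.L2Loss()
+
+# Toy data: learn y = 2*x + 1 from random inputs (illustrative only).
+X = nd.random.uniform(shape=(100, 1))
+y = 2 * X + 1
+
+for epoch in range(5):
+    with autograd.record():
+        loss = loss_fn(net(X), y)        # forward pass
+    loss.backward()                      # backward pass
+    trainer.step(batch_size=X.shape[0])  # update parameters with the optimizer
+```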
+
+We can also create the trainer by passing in the optimizer name and optimizer params into the trainer constructor directly, as shown below.
+
+
+```python
+trainer = gluon.Trainer(net.collect_params(), optimizer='adam', optimizer_params={'learning_rate':1})
+```
+
+### What should I use?
+For many deep learning model architectures, the `sgd` and `adam` optimizers are a really good place to start. If you are implementing a deep learning model and trying to pick an optimizer, start with [`'sgd'`](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.SGD.html#mxnet.optimizer.SGD) as you will often get good enough results as long as your learning problem is tractable. If you already have a trainable model and you want to improve convergence, then you can try [`'adam'`](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.Adam.html#mxnet.optimizer.Adam). If you would like to improve your model training process further, there are a number of specialized optimizers out there, many of them already implemented in MXNet. This guide walks through these optimizers in some detail.
+
+## Stochastic Gradient Descent
+[Gradient descent](https://en.wikipedia.org/wiki/Gradient_descent) is a general purpose algorithm for minimizing a function using information from the gradient of the function with respect to its parameters. In deep learning, the function we are interested in minimizing is the [loss function](http://beta.mxnet.io/guide/packages/gluon/loss.html). Our model accepts training data as inputs and the loss function tells us how good our model predictions are. Since the training data can routinely consist of millions of examples, computing the loss gradient on the full batch of training data is very computationally expensive. Luckily, we can effectively approximate the full gradient with the gradient of the loss function on randomly chosen minibatches of our training data. This variant of gradient descent is [stochastic gradient descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent).
+
+Technically, stochastic gradient descent (SGD) refers to an online approximation of the gradient descent algorithm that computes the gradient of the loss function applied to a *single datapoint*, instead of your entire dataset, and uses this approximate gradient to update the model parameter values. However, in MXNet, and other deep learning frameworks, the SGD optimizer is agnostic to how many datapoints the loss function is applied to, and it is more effective to use a mini-batch loss gradient, as described earlier, instead of a single datapoint loss gradient.
+
+### [SGD optimizer](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.SGD.html#mxnet.optimizer.SGD)
+
+For an SGD optimizer initialized with learning rate $lr$, the update function accepts parameters (weights) $w_i$, and their gradients $grad(w_i)$, and performs the single update step:
+
+$$w_{i+1} = w_i + lr\cdot -grad(w_i)$$
+
+visualized in the diagram shown below.
+
+![sgd animation](images/sgd_animation.gif)
+
+
+### Weight decay
+The SGD update step can be modified by introducing an extra term that enforces a penalty on the size of the parameters. This is achieved by subtracting a fraction of the weight $\delta\cdot w$ during the weight update as shown below.
+
+$$w_{i+1} = w_i + lr\cdot (-grad(w_i) -\delta\cdot w_i)$$
+
+Introducing weight decay modifies the objective of the optimization problem by adding an implicit regularization term that penalizes large weights. Weight decay is discussed more extensively in this [paper](https://papers.nips.cc/paper/563-a-simple-weight-decay-can-improve-generalization.pdf).
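+
+For example, a minimal sketch of creating plain SGD with a small weight decay (the value of $10^{-4}$ is purely illustrative):
+
+
+```python
+from mxnet import optimizer
+
+# SGD with weight decay delta = 1e-4 (illustrative value, not a recommendation)
+sgd_wd = optimizer.SGD(learning_rate=0.1, wd=1e-4)
+```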
+
+### Momentum
+The convergence of the SGD optimizer can be accelerated by incorporating momentum. Originally proposed by [Polyak (1964)](https://www.sciencedirect.com/science/article/abs/pii/0041555364901375), SGD with momentum improves the approximation of the gradient term by incorporating the gradients from previous update steps. To achieve this, SGD with momentum stores and 'remembers' the update at each iteration to be included in the next iteration. In the equations below we denote the momentum history as $v$.
+
+For the first update the SGD optimizer with momentum performs the single update step:
+
+$$ v_1= lr\cdot -grad(w_0)$$
+$$ w_1= w_0 + v_1 $$
+
+For subsequent updates, SGD with momentum, with momentum parameter $\gamma$, performs the update step:
+
+$$ v_{i+1} = \gamma \cdot v_{i} + lr\cdot -grad(w_{i}) $$
+$$ w_{i+1} = w_i + v_{i+1} $$
+
+This is also shown in the diagram below.
+
+![momentum sgd animation](images/momentum_sgd_animation.gif)
+
+
+The use of SGD with momentum for learning in neural networks was introduced by Rumelhart, Hinton and Williams in [Learning Internal Representations by Error Propagation](https://dl.acm.org/citation.cfm?id=104279.104293).
+
+To create an SGD optimizer with momentum $\gamma$ and weight decay in MXNet simply use the following code.
+
+
+```python
+sgd_optimizer = optimizer.SGD(learning_rate=0.1, wd=0., momentum=0.8)
+```
+
+### [Nesterov Accelerated Stochastic Gradient Descent](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.NAG.html#mxnet.optimizer.NAG)
+
+The momentum method of [Nesterov](https://goo.gl/M5xbuX) is a modification to SGD with momentum that allows for even faster convergence in practice. With Nesterov accelerated gradient (NAG) descent, the update term is derived from the gradient of the loss function with respect to *refined parameter values*. These refined parameter values are computed by performing an SGD update step using the momentum history as the gradient term.
+
+Alternatively, you can think of the NAG optimizer as performing two update steps:
+* The first (internal) update step uses the current momentum history $v_i$ to calculate the refined parameter values $(w_i + \gamma \cdot v_i)$. This is also known as the lookahead step.
+* The second (actual) step uses the gradient of the loss function with respect to the lookahead parameter values from the first step and the current momentum history $v_i$ to obtain a new direction to update our original parameter values, like classical momentum.
+
+The NAG optimizer with momentum parameter $\gamma$ performs the update step:
+
+$$ v_{i+1} = \gamma \cdot v_{i} + lr\cdot -grad(w_{i} + \gamma \cdot v_i) $$
+$$ w_{i+1} = w_i + v_{i+1} $$
+
+
+![nesterov momentum animation](images/nesterov_momentum_animation.gif)
+
+
+The effects of using NAG over SGD and classical momentum are discussed in this [paper](http://proceedings.mlr.press/v28/sutskever13.pdf) by Sutskever et al.
+
+The NAG optimizer can be initialized in MXNet by using the code snippet below or by creating a trainer with argument `optimizer='nag'`.
+
+
+```python
+nag_optimizer = optimizer.NAG(learning_rate=0.1, momentum=0.8)
+```
+
+## Adaptive Learning Rate Methods
+
+The gradient methods implemented by the optimizers described above use a global learning rate hyperparameter for all parameter updates. This has a well-documented shortcoming in that it makes the training process and convergence of the optimization algorithm very sensitive to the choice of the global learning rate. Adaptive learning rate methods avoid this pitfall by using some history of the gradients observed in earlier iterations to scale the step size (learning rate) for each learnable parameter in the model.
+
+### [AdaGrad](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.AdaGrad.html)
+
+The AdaGrad optimizer, which implements the optimization method originally described by [Duchi et al](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf), divides the global learning rate by the $L_2$ norm of the preceding gradient estimates for each parameter to obtain the per-parameter learning rate. To achieve this, AdaGrad introduces a new term which we'll denote as $g^2$ - the accumulated square of the gradient of the loss function with respect to the parameters.
+
+Thus the AdaGrad optimizer update function performs the update steps below to obtain the $(i+1)$th refinement.
+
+$$ g^2_{i+1} = g^2_{i} + grad(w_i)^2 $$
+$$ w_{i+1} = w_i + \dfrac{lr}{\sqrt{g^2_{i+1} + \epsilon}}\cdot -grad(w_i)$$
+
+The $\epsilon$ term is a tiny positive value introduced to avoid division by zero due to floating point issues.
+
+The overarching benefit of AdaGrad over SGD is that it ensures the overall convergence is more resilient to the choice of the global learning rate $lr$, especially in tasks such as natural language processing, where some data is sparse but the parameters influenced by the sparse data are quite informative.
+
+To instantiate the Adagrad optimizer in MXNet you can use the following line of code.
+
+
+```python
+adagrad_optimizer = optimizer.AdaGrad(learning_rate=0.1, eps=1e-07)
+```
+
+### [RMSProp](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.RMSProp.html)
+
+RMSProp, introduced by [Tieleman and Hinton](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf), is similar to AdaGrad described above, but, instead of accumulating the sum of historical square gradients, it maintains an exponentially decaying average of the historical square gradients, in order to give more weight to more recent gradients.
+
+For RMSProp, we introduce the term $\mathbb{E}[g^2]$ - the decaying average over past squared gradients - and $\beta$ as the forgetting factor. The RMSProp optimizer performs the update given below.
+
+
+$$ \mathbb{E}[g^2]_{i+1} = \beta\cdot\mathbb{E}[g^2]_{i} + (1-\beta)\cdot [grad(w_{i})]^2 $$
+$$ w_{i+1} = w_i + \dfrac{lr}{\sqrt{\mathbb{E}[g^2]_{i+1} + \epsilon}}\cdot -grad(w_i) $$
+
+The $\epsilon$ term is included, as in AdaGrad, for numerical stability.
+
+RMSProp was derived independently of AdaGrad and the name RMSProp derives from a combination of [RProp](https://en.wikipedia.org/wiki/Rprop) and the RMS, root mean square, operation in the denominator of the weight update.
+
+
+#### RMSProp (Centered)
+The MXNet RMSProp optimizer with the `centered=True` argument implements a variant of the RMSProp update described by [Alex Graves](https://arxiv.org/pdf/1308.0850v5.pdf), which centers the second moment $\mathbb{E}[g^2]$, or decaying average of square gradients, by subtracting the square of the decaying average of gradients. It also adds an explicit momentum term to weight past update steps. Representing the decaying average of gradients as $\mathbb{E}[g]$ and the momentum parameter as $\gamma$, we add another equation to the non-centered RMSProp update described above.
+
+The centered RMSProp optimizer performs the update step:
+
+$$ \mathbb{E}[g]_{i+1} = \beta\cdot\mathbb{E}[g]_{i} + (1-\beta)\cdot [grad(w_{i})] $$
+$$ \mathbb{E}[g^2]_{i+1} = \beta\cdot\mathbb{E}[g^2]_{i} + (1-\beta)\cdot [grad(w_{i})]^2 $$
+$$ v_{i+1} = \gamma \cdot v_{i} + \dfrac{lr}{\sqrt{\mathbb{E}[g^2]_{i+1} - \mathbb{E}[g]^2_{i+1}+ \epsilon}}\cdot -grad(w_{i}) $$
+$$ w_{i+1} = w_i + v_{i+1} $$
+
+Here is an example snippet creating the RMSProp optimizer in MXNet.
+
+
+```python
+rmsprop_optimizer = optimizer.RMSProp(learning_rate=0.001, gamma1=0.9, gamma2=0.9, epsilon=1e-07, centered=False)
+```
+
+In the code snippet above, `gamma1` is $\beta$ in the equations above and `gamma2` is $\gamma$, which is only used when `centered=True`.
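+
+To use the centered variant described above, you could pass `centered=True`; a small sketch (hyperparameter values are illustrative):
+
+
+```python
+from mxnet import optimizer
+
+# Centered RMSProp: gamma1 is the forgetting factor, gamma2 the momentum parameter.
+centered_rmsprop = optimizer.RMSProp(learning_rate=0.001, gamma1=0.9, gamma2=0.9, centered=True)
+```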
+
+### [AdaDelta](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.AdaDelta.html)
+
+AdaDelta was introduced to address a lingering issue with AdaGrad and RMSProp - the selection of a global learning rate. AdaGrad and RMSProp assign each parameter its own learning rate, but the per-parameter learning rates are still calculated using the global learning rate. In contrast, AdaDelta does not require a global learning rate; instead, it tracks the square of previous update steps, represented below as $\mathbb{E}[\Delta w^2]$, and uses the root mean square of the previous update steps as an estimate of the learning rate.
+
+The AdaDelta optimizer performs the following equations in its update step:
+
+$$ \mathbb{E}[\Delta w^2]_{i+1} = \beta\cdot\mathbb{E}[\Delta w^2]_i + (1 - \beta) \cdot (w_i - w_{i-1})^2 $$
+$$ \mathbb{E}[g^2]_{i+1} = \beta\cdot\mathbb{E}[g^2]_{i} + (1-\beta)\cdot [grad(w_{i})]^2 $$
+$$ w_{i+1} = w_i + \dfrac{\sqrt{\mathbb{E}[\Delta w^2] + \epsilon}}{\sqrt{\mathbb{E}[g^2]_{i+1} + \epsilon}} \cdot -grad(w_i)$$
+
+As evident from the above equations, AdaDelta is similar to RMSProp but does not require you to specify $lr$ and instead uses $\sqrt{\mathbb{E}[\Delta w^2] + \epsilon}$ as the estimated learning rate. AdaDelta was introduced by Zeiler in this [paper](https://arxiv.org/abs/1212.5701).
+
+Here is the code snippet creating the AdaDelta optimizer in MXNet. The argument `rho` in the code is $\beta$ in the update equations. Notice there is no learning rate argument in the code.
+
+
+```python
+adadelta_optimizer = optimizer.AdaDelta(rho=0.9, epsilon=1e-07)
+```
+
+### [Adam](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.Adam.html)
+Adam, introduced by [Kingma and Ba](https://arxiv.org/abs/1412.6980), is one of the most popular adaptive algorithms in deep learning. It combines elements of RMSProp with momentum SGD. Like RMSProp, Adam uses a decaying average of the historical squared gradients, but it also explicitly keeps track of a decaying average of the gradients (momentum) and uses that for the update step direction. Thus, Adam accepts two hyperparameters, $\beta_1$ and $\beta_2$, for momentum weighting and gradient RMS weighting respectively. Adam also accepts a global learning rate that is adaptively tuned for each parameter using the gradient root mean square. Finally, Adam includes bias correction steps within the update that transform the biased estimates of the first and second order moments, $v_{i+1}$ and $\mathbb{E}[g^2]_{i+1}$, into their unbiased counterparts $\tilde{v}_{i+1}$ and $\tilde{\mathbb{E}[g^2]}_{i+1}$.
+
+The Adam optimizer performs the update step described by the following equations:
+
+$$ v_{i+1} = \beta_1 \cdot v_{i} + (1 - \beta_1) \cdot grad(w_i) $$
+$$ \mathbb{E}[g^2]_{i+1} = \beta_2\cdot\mathbb{E}[g^2]_{i} + (1-\beta_2)\cdot [grad(w_{i})]^2 $$
+$$ \tilde{v}_{i+1} = \dfrac{v_{i+1}}{1 - (\beta_1)^{i+1}} $$
+$$ \tilde{\mathbb{E}[g^2]}_{i+1} = \dfrac{\mathbb{E}[g^2]_{i+1}}{1 - (\beta_2)^{i+1}} $$
+$$ w_{i+1} = w_i + \dfrac{lr}{\sqrt{\tilde{\mathbb{E}[g^2]}_{i+1}} + \epsilon} \cdot -\tilde{v}_{i+1} $$
+
+In MXNet, you can construct the Adam optimizer with the following line of code.
+
+
+```python
+adam_optimizer = optimizer.Adam(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08)
+```
+
+### [Adamax](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.Adamax.html)
+Adamax is a variant of Adam also included in the original paper by [Kingma and Ba](https://arxiv.org/abs/1412.6980). Like Adam, Adamax maintains a moving average of the first and second moments, but Adamax uses the $L_{\infty}$ norm for the exponentially weighted average of the gradients, instead of the $L_2$ norm used in Adam to keep track of the gradient second moment. The $L_{\infty}$ norm of a vector is equivalent to taking the maximum absolute value of the elements in that vector.
+
+$$ v_{i+1} = \beta_1 \cdot v_{i} + (1 - \beta_1) \cdot grad(w_i) $$
+$$ g^\infty_{i+1} = \mathtt{max}(\beta_2\cdot g^\infty_{i}, |{grad(w_i)}|) $$
+$$ \tilde{v}_{i+1} = \dfrac{v_{i+1}}{1 - \beta_1^{i+1}} $$
+$$ w_{i+1} = w_i + \dfrac{lr}{g^\infty_{i+1} + \epsilon} \cdot - \tilde{v}_{i+1} $$
+
+See the code snippet below for how to construct Adamax in MXNet.
+
+
+```python
+adamax_optimizer = optimizer.Adamax(learning_rate=0.002, beta1=0.9, beta2=0.999)
+```
+
+### [Nadam](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.Nadam.html)
+Nadam is also a variant of Adam and draws from the perspective that Adam can be viewed as a combination of RMSProp and classical momentum (or Polyak momentum). Nadam replaces the classical momentum component of Adam with Nesterov momentum (see this [paper](http://cs229.stanford.edu/proj2015/054_report.pdf) by Dozat). The consequence of this is that the gradient used to update the weighted average of the momentum term is a lookahead gradient, as is the case with NAG.
+
+The Nadam optimizer performs the update step:
+
+$$ v_{i+1} = \beta_1 \cdot v_{i} + (1 - \beta_1) \cdot grad(w_i + \beta_1 \cdot v_{i}) $$
+$$ \mathbb{E}[g^2]_{i+1} = \beta_2\cdot\mathbb{E}[g^2]_{i} + (1-\beta_2)\cdot [grad(w_{i})]^2 $$
+$$ \tilde{v}_{i+1} = \dfrac{v_{i+1}}{1 - \beta_1^{i+1}} $$
+$$ \tilde{\mathbb{E}[g^2]}_{i+1} = \dfrac{\mathbb{E}[g^2]_{i+1}}{1 - \beta_2^{i+1}} $$
+$$ w_{i+1} = w_i + \dfrac{lr}{\sqrt{\tilde{\mathbb{E}[g^2]}_{i+1}} + \epsilon}\cdot - \tilde{v}_{i+1} $$
+
+Here is the line of code to create the NAdam optimizer in MXNet.
+
+
+```python
+nadam_optimizer = optimizer.Nadam(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08)
+```
+
+## SGD optimized for large scale distributed training
+
+Training very deep neural networks can be time consuming, so it is now very common for practitioners to turn to distributed training, on multiple processors on the same machine or even across a fleet of machines, to parallelize network training, since this can reduce training time from days to minutes.
+
+While all the preceding optimizers, from SGD to Adam, can be readily used in the distributed setting, the following optimizers in MXNet provide extra features targeted at alleviating some of the problems associated with distributed training.
+
+### [Signum](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.Signum.html)
+In distributed training, communicating gradients across multiple worker nodes can be expensive and create a performance bottleneck. The Signum optimizer addresses this problem by transmitting just the sign of each minibatch gradient instead of the full precision gradient. In MXNet, the Signum optimizer implements two variants of compressed gradients described in the paper by [Bernstein et al](https://arxiv.org/pdf/1802.04434.pdf).
+
+The first variant, achieved by constructing the Signum optimizer with `momentum=0`, implements the SignSGD update, which performs the step below.
+
+$$ w_{i+1} = w_i - lr \cdot sign(grad(w_i)) $$
+
+The second variant, achieved by passing a non-zero momentum parameter, implements the Signum update, which is equivalent to SignSGD with momentum. For a momentum parameter $0 < \gamma < 1$, the Signum optimizer performs the following update:
+
+$$ v_{i+1} = \gamma \cdot v_i + (1 - \gamma) \cdot grad(w_i) $$
+$$ w_{i+1} = w_i - lr \cdot sign(v_{i+1}) $$
+
+Here is how to create the Signum optimizer in MXNet.
+
+
+```python
+signum_optimizer = optimizer.Signum(learning_rate=0.01, momentum=0.9, wd_lh=0.0)
+```
+
+### [LBSGD](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.LBSGD.html)
+LBSGD stands for Large Batch Stochastic Gradient Descent and implements a technique where Layer-wise Adaptive Rate Scaling (LARS) is used to maintain a separate learning rate for each layer of the neural network. Apart from the layer-wise learning rates and the warmup schedule, LBSGD performs the same parameter update steps as the SGD optimizer described above.
+
+LBSGD was introduced by [You et al](https://arxiv.org/pdf/1708.03888.pdf) for distributed training with data-parallel synchronous SGD across multiple worker nodes to overcome the issue of reduced model accuracy when the number of workers, and by extension effective batch size, is increased.
+
+Here is how to initialize the LBSGD optimizer in MXNet.
+
+
+```python
+lbsgd_optimizer = optimizer.LBSGD(momentum=0.0,
+ multi_precision=False,
+ warmup_strategy='linear',
+ warmup_epochs=5,
+ batch_scale=1,
+ updates_per_epoch=32,
+ begin_epoch=0,
+ num_epochs=60)
+```
+
+LBSGD has a number of extra keyword arguments, described below:
+* `multi_precision` - When True, performs updates with float32-precision weights regardless of whether the weights were initialized with lower precision. When False, performs updates with the same precision as the weights at initialization. Set to True to improve performance when training with low-precision weight representations.
+* `warmup_strategy` - The warmup is a period during which the learning rate is increased through the first few epochs. The following strategies are supported: ['linear', 'power2', 'sqrt', 'lars'].
+* `warmup_epochs` - How many epochs to perform warmup for.
+* `batch_scale` - Use batch size * number of workers.
+* `updates_per_epoch` - How many updates to the learning rate to perform per epoch. For example, during warmup the warmup strategy is applied to increase the learning rate a total of `warmup_epochs*updates_per_epoch` times.
+* `begin_epoch` - The epoch at which to start warmup.
+
+### [DCASGD](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.DCASGD.html)
+
+The DCASGD optimizer implements Delay Compensated Asynchronous Stochastic Gradient Descent by [Zheng et al](https://arxiv.org/pdf/1609.08326.pdf). In asynchronous distributed SGD, it is possible that a training worker node adds its gradients too late to the global (parameter) server, resulting in a delayed gradient being used to update the current parameters. DCASGD addresses this issue of delayed gradients by compensating for the delay in the parameter update steps.
+
+If $grad(w_i)$ denotes the delayed gradient, $w_{i+\tau}$ denotes the parameter values at the current iteration, and $\lambda$ is the delay scale factor, the DCASGD optimizer update function performs the update:
+
+$$ w_{i+\tau+1} = w_{i+\tau} - lr \cdot (grad(w_i) + \lambda \cdot grad(w_i)^2 \cdot (w_{i+\tau} - w_i)) $$
+
+The DCASGD optimizer in MXNet can be initialized using the code below.
+
+
+```python
+dcasgd_optimizer = optimizer.DCASGD(momentum=0.0, lamda=0.04)
+```
+
+## Online Learning Algorithms
+Before deep neural networks became popular post-2012, people were already solving large-scale optimization problems to train (shallow) machine learning models. One particular area where this was done is online learning, where the model continually learns and updates its parameters after it is deployed to production. In online learning, the model has to make predictions on new inputs, but moments later it may learn the true value of what it tried to predict and use this information to update its parameters.
+
+The class of optimization algorithms designed to tackle online learning problems has also seen some success in the offline training of deep neural models. The following optimizers are algorithms from online learning that have been implemented in MXNet.
+
+### [FTRL](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.Ftrl.html)
+
+FTRL stands for Follow the Regularized Leader and describes a family of algorithms originally designed for online learning tasks.
+
+At each iteration, an FTRL algorithm finds the next parameter by solving the following optimization problem, which minimizes the total regret, i.e. the sum of the inner products of all preceding gradients with the next parameter. The optimization objective is regularized so that the next parameter is close (proximal) in $L_2$ norm to the preceding parameter values and is sparse, which is enforced by the $L_1$ norm.
+
+$$ w_{i+1} = \texttt{argmin}_{w} \left[\sum_{j=1}^{i} grad(w_j)\cdot w + \dfrac{1}{2}\sum_{j=1}^{i} \sigma_j \cdot ||w - w_j||_2^2 + \lambda ||w||_1\right]$$
+
+Due to the similarity of online learning and neural network training, there is an [equivalence](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/37013.pdf) between variants of gradient descent and FTRL algorithms. In fact, the $w$ that minimizes FTRL with only $L_2$ regularization (i.e. $\lambda$ in the equation above is set to 0) is exactly the $w$ derived from the stochastic gradient descent update.
+
+The version of FTRL implemented as an MXNet optimizer is from [McMahan et al](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41159.pdf) and encourages sparse parameters due to $L_1$ regularization. It performs the following update:
+
+$$ z_{i+1} = z_i + \dfrac{\left(\sqrt{\eta_i + grad(w_i)^2} - \sqrt{\eta_i}\right) \cdot w_i}{lr}$$
+$$ \eta_{i+1} = \eta_i + grad(w_i)^2$$
+$$ w_{i+1} = (|z_{i+1}| > \lambda) \cdot \left[ \dfrac{-lr}{\beta + \sqrt{\eta_{i+1}}} (z_{i+1} - \lambda \cdot sign(z_{i+1}))\right] $$
+
+Here is how to initialize the FTRL optimizer in MXNet
+
+
+```python
+ftrl_optimizer = optimizer.Ftrl(lamda1=0.01, learning_rate=0.1, beta=1)
+```
+
+### [FTML](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.FTML.html)
+
+FTML stands for Follow the Moving Leader and is a variant of the FTRL family of algorithms adapted specifically to deep learning. Regular FTRL algorithms, described above, solve an optimization problem at every update that involves the sum of all previous gradients. This is not well suited to the non-convex loss functions in deep learning. In non-convex settings, older gradients are likely uninformative, as the parameter updates can converge towards different local minima at different iterations. FTML addresses this problem by reweighting the learning subproblems at each iteration, as shown below.
+
+
+$$ w_{i+1} = \texttt{argmin}_{w} \left[\sum_{j=1}^{i} (1 - \beta_1)\beta_1^{i-j} grad(w_j)\cdot w + \dfrac{1}{2}\sum_{j=1}^{i} \sigma_j \cdot ||w - w_j||_2^2 \right]$$
+
+$\beta_1$ is introduced to compute the exponential moving average of the previously accumulated gradients. The improvements of FTML over FTRL can be compared to the improvements of RMSProp/Adam over AdaGrad. According to [Zheng et al](http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf), FTML enjoys some of the nice properties of RMSProp and Adam while avoiding their pitfalls.
+
+The FTML optimizer performs the following update:
+
+$$ v_{i+1} = \beta_2 \cdot v_i + (1 - \beta_2) \cdot grad(w_i)^2$$
+$$ d_{i+1} = \dfrac{1 - \beta_1^{i+1}}{lr} \big(\sqrt{\dfrac{v_{i+1}}{1 - \beta_2^{i+1}}} + \epsilon\big)$$
+$$ z_{i+1} = \beta_1 \cdot z_i + (1 - \beta_1)\cdot grad(w_i) - (d_{i+1} - \beta_1 \cdot d_i) \cdot w_i$$
+$$ w_{i+1} = \dfrac{-z_{i+1}}{d_{i+1}} $$
+
+In MXNet, you can initialize the FTML optimizer using
+
+
+```python
+ftml_optimizer = optimizer.FTML(beta1=0.6, beta2=0.999, epsilon=1e-08)
+```
+
+Here `beta1` and `beta2` are similar to the arguments in the Adam optimizer.
+
+## Bayesian SGD
+A notable shortcoming of deep learning is that the model parameters learned after training are only point estimates; therefore, deep learning model predictions carry no information about uncertainty or confidence bounds. This is in contrast to a fully Bayesian approach, which incorporates prior distributions on the model parameters and estimates the model parameters as belonging to a posterior distribution. This approach allows the predictions of a Bayesian model to carry information about uncertainty, as you can sample different values from the posterior distribution to obtain different model parameters. One approach to closing the Bayesian gap in deep learning is to endow the gradient descent algorithm with properties that allow the model parameters to converge to a distribution instead of a single value or point estimate.
+
+### [SGLD](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.SGLD.html)
+Stochastic Gradient Langevin Dynamics or SGLD was introduced to allow uncertainties around model parameters to be captured directly during model training. With every update in SGLD, the learning rate is decreased towards zero and Gaussian noise of known variance is injected into the SGD step. This has the effect of making the training parameters converge to a sufficient statistic for a posterior distribution instead of simply a point estimate of the model parameters.
+
+SGLD performs the parameter update:
+
+$$ w_{i+1} = w_i + \dfrac{lr_{i+1}}{2}\cdot -grad(w_i) + \eta_{i+1}$$
+
+where $\eta_{i+1} \sim N(0, lr_{i+1})$, i.e. $\eta_{i+1}$ is drawn from a zero-centered Gaussian with variance $lr_{i+1}$.
+
+SGLD was introduced by [Patterson and Teh](https://papers.nips.cc/paper/4883-stochastic-gradient-riemannian-langevin-dynamics-on-the-probability-simplex.pdf) and the optimizer can be created in MXNet with the following line of code.
+
+
+```python
+sgld_optimizer = optimizer.SGLD()
+```
+
+## Custom Optimizer
+
+If you would like to use a particular optimizer that is not yet implemented in MXNet or you have a custom optimization algorithm of your own that you would like to use to train your model, it is very straightforward to create a custom optimizer.
+
+Step 1: First create a function that is able to perform your desired updates given the weights, gradients and other state information.
+
+Step 2: You will have to write your own optimizer class that extends the [base optimizer class](http://beta.mxnet.io/api/gluon-related/_autogen/mxnet.optimizer.Optimizer.html#mxnet.optimizer.Optimizer) and overrides the following functions:
+* `__init__`: accepts the parameters of your optimizer algorithm as inputs and saves them as member variables.
+* `create_state`: If your custom optimizer uses some additional state information besides the gradient, then you should implement a function that accepts the weights and returns the state.
+* `update`: Implement your optimizer update function using the function in Step 1
+
+Step 3: Register your optimizer with `@register` decorator on your optimizer class.
+
+See the [source code](http://beta.mxnet.io/_modules/mxnet/optimizer/optimizer.html#NAG) for the NAG optimizer for a concrete example.
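+
+As an illustration, here is a minimal sketch of a custom optimizer that re-implements plain SGD with weight decay; it assumes the MXNet 1.x `Optimizer` base class API (`create_state`, `update`, and the `register` decorator), and omits details such as gradient rescaling and clipping:
+
+
+```python
+import mxnet as mx
+
+@mx.optimizer.Optimizer.register
+class MySGD(mx.optimizer.Optimizer):
+    """A hypothetical custom optimizer implementing plain SGD."""
+
+    def create_state(self, index, weight):
+        # Plain SGD needs no extra state besides the gradient.
+        return None
+
+    def update(self, index, weight, grad, state):
+        self._update_count(index)
+        lr = self._get_lr(index)
+        wd = self._get_wd(index)
+        # In-place SGD update with weight decay.
+        weight[:] -= lr * (grad + wd * weight)
+```
+
+Once registered, the optimizer can be selected by its (lowercased) class name, for example `gluon.Trainer(net.collect_params(), optimizer='mysgd', optimizer_params={'learning_rate': 0.1})`.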
+
+## Summary
+* MXNet implements many state-of-the-art optimizers which can be passed directly into a gluon trainer object. Calling `trainer.step` during model training uses the optimizers to update the model parameters.
+* Gradient descent algorithms minimize the loss function by using information from the gradient of the loss function and a learning rate hyperparameter.
+* Stochastic Gradient Descent is the backbone of deep learning optimization algorithms and simple SGD optimizers can be made really powerful by incorporating momentum, for example `sgd` with momentum and `nag`.
+* Adaptive learning rate methods compute per-parameter learning rates to make optimization less sensitive to the choice of global learning rate. `adam` is a popular adaptive learning rate optimizer.
+* Certain MXNet optimizers like `Signum` and Large Batch SGD are well suited for large scale distributed training as they address challenges specific to these tasks.
+* MXNet also implements optimizers from online learning like `FTML` and `FTRL`, and optimizers for Bayesian learning like `SGLD`.
+* Finally, it is easy to create a custom optimizer by following the patterns in the source code implementation for the optimizers that already exist in MXNet.
+
+## Next Steps
+While optimization and optimizers play a significant role in deep learning model training, there are still other important components to model training. Here are a few suggestions about where to look next.
+* The [trainer API](http://beta.mxnet.io/api/gluon/mxnet.gluon.Trainer.html) and [guide](http://beta.mxnet.io/guide/packages/gluon/trainer.html) have information about how to construct the trainer that encapsulates the optimizer and will actually be used in your model training loop.
+* Check out the guide to MXNet gluon [Loss functions](http://beta.mxnet.io/guide/packages/gluon/loss.html) and [custom losses](http://beta.mxnet.io/guide/packages/gluon/custom-loss/custom-loss.html) to learn about the loss functions optimized by these optimizers, see what loss functions are already implemented in MXNet and understand how to write your own custom loss functions.
+* Take a look at the [guide to parameter initialization](http://beta.mxnet.io/guide/packages/gluon/init.html) in MXNet to learn about what initialization schemes are already implemented, and how to implement your custom initialization schemes.
+* Also check out the [autograd guide](http://beta.mxnet.io/guide/packages/autograd/autograd.html) to learn about automatic differentiation and how gradients are automatically computed in MXNet.
+* Make sure to take a look at the [guide to scheduling learning rates](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/learning_rate_schedules.html) to learn how to create learning rate schedules to supercharge the convergence of your optimizer.
+* Finally take a look at the [KVStore API](http://beta.mxnet.io/api/gluon-related/mxnet.kvstore.KVStore.html#mxnet.kvstore.KVStore) to learn how parameter values are synchronized over multiple devices.
diff --git a/docs/python_docs/python/tutorials/packages/symbol/index.rst b/docs/python_docs/python/tutorials/packages/symbol/index.rst
new file mode 100644
index 000000000000..20af8b0ca8a6
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/symbol/index.rst
@@ -0,0 +1,41 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Symbol
+======
+
+.. container:: cards
+
+ .. card::
+ :title: Symbol API Intro
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/basic/symbol.html
+
+ How to use MXNet's Symbol API.
+
+
+ .. card::
+ :title: Data Loading
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/basic/data.html
+
+ How to load data with the Symbol API.
+
+
+.. toctree::
+ :hidden:
+
+ symbol
+ data
diff --git a/docs/python_docs/python/tutorials/packages/viz/index.rst b/docs/python_docs/python/tutorials/packages/viz/index.rst
new file mode 100644
index 000000000000..5bcb2ac1ca8b
--- /dev/null
+++ b/docs/python_docs/python/tutorials/packages/viz/index.rst
@@ -0,0 +1,38 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Visualization
+=============
+
+.. container:: cards
+
+ .. card::
+ :title: How to Visualize Neural Networks as Computation Graph
+ :link: https://mxnet.incubator.apache.org/api/faq/visualize_graph
+
+      A demonstration of how to use ``mx.viz.plot_network`` for visualizing your neural networks.
+
+References
+----------
+
+- `mxnet.viz <../api/symbol-related/mxnet.visualization.html>`_
+
+.. toctree::
+ :hidden:
+ :glob:
+
+ *
diff --git a/docs/python_docs/python/tutorials/performance/backend/MKLDNN_README.md b/docs/python_docs/python/tutorials/performance/backend/MKLDNN_README.md
new file mode 100644
index 000000000000..d8f78ecf71a7
--- /dev/null
+++ b/docs/python_docs/python/tutorials/performance/backend/MKLDNN_README.md
@@ -0,0 +1,291 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Build/Install MXNet with MKL-DNN
+
+Better training and inference performance can be achieved on Intel-architecture CPUs when MXNet is built with [Intel MKL-DNN](https://github.com/intel/mkl-dnn), on multiple operating systems including Linux, Windows, and MacOS.
+In the following sections, you will find build instructions for MXNet with Intel MKL-DNN on Linux, MacOS, and Windows.
+
+Please find MKL-DNN optimized operators and other features in the [MKL-DNN operator list](../mkldnn/operator_list.md).
+
+The detailed performance data collected on Intel Xeon CPU with MXNet built with Intel MKL-DNN can be found [here](https://mxnet.incubator.apache.org/faq/perf.html#intel-cpu).
+
+
+## Linux
+
+### Prerequisites
+
+```
+sudo apt-get update
+sudo apt-get install -y build-essential git
+sudo apt-get install -y libopenblas-dev liblapack-dev
+sudo apt-get install -y libopencv-dev
+sudo apt-get install -y graphviz
+```
+
+### Clone MXNet sources
+
+```
+git clone --recursive https://github.com/apache/incubator-mxnet.git
+cd incubator-mxnet
+```
+
+### Build MXNet with MKL-DNN
+
+```
+make -j $(nproc) USE_OPENCV=1 USE_MKLDNN=1 USE_BLAS=mkl USE_INTEL_PATH=/opt/intel
+```
+
+If you don't have the full [MKL](https://software.intel.com/en-us/intel-mkl) library installed, you can use OpenBLAS as the BLAS library by setting `USE_BLAS=openblas`.
+
+
+## MacOS
+
+### Prerequisites
+
+Install the dependencies required for MXNet with the following commands:
+
+- [Homebrew](https://brew.sh/)
+- llvm (clang in macOS does not support OpenMP)
+- OpenCV (for computer vision operations)
+
+```
+# Paste this command in Mac terminal to install Homebrew
+/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
+
+# install dependency
+brew update
+brew install pkg-config
+brew install graphviz
+brew tap homebrew/core
+brew install opencv
+brew tap homebrew/versions
+brew install llvm
+```
+
+### Clone MXNet sources
+
+```
+git clone --recursive https://github.com/apache/incubator-mxnet.git
+cd incubator-mxnet
+```
+
+### Build MXNet with MKL-DNN
+
+```
+LIBRARY_PATH=$(brew --prefix llvm)/lib/ make -j $(sysctl -n hw.ncpu) CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ USE_OPENCV=1 USE_OPENMP=1 USE_MKLDNN=1 USE_BLAS=apple
+```
+
+
+## Windows
+
+On Windows, you can use [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) and [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/) to compile MXNet with Intel MKL-DNN.
+[Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is recommended.
+
+**Visual Studio 2015**
+
+To build and install MXNet yourself, you need the following dependencies. Install the required dependencies:
+
+1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is not already installed, download and install it. You can download and install the free community edition.
+2. Download and Install [CMake 3](https://cmake.org/files/v3.14/cmake-3.14.0-win64-x64.msi) if it is not already installed.
+3. Download [OpenCV 3](https://sourceforge.net/projects/opencvlibrary/files/3.4.5/opencv-3.4.5-vc14_vc15.exe/download), and unzip the OpenCV package, set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (e.g.,```OpenCV_DIR = C:\opencv\build ```). Also, add the OpenCV bin directory (```C:\opencv\build\x64\vc14\bin``` for example) to the ``PATH`` variable.
+4. If you have Intel Math Kernel Library (Intel MKL) installed, set ```MKL_ROOT``` to point to the ```MKL``` directory that contains ```include``` and ```lib```. If you want to use MKL BLAS, you should set ```-DUSE_BLAS=mkl``` when running cmake. Typically, you can find the directory in ```C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl```.
+5. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBLAS](http://sourceforge.net/projects/openblas/files/v0.2.14/), or build the latest version of OpenBLAS from source. Note that you should also download ```mingw64.dll.zip``` along with openBLAS and add them to PATH.
+6. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Downloads\OpenBLAS\```.
+
+After you have installed all of the required dependencies, build the MXNet source code:
+
+1. Start a Visual Studio command prompt by clicking Windows Start menu >> Visual Studio 2015 >> VS2015 X64 Native Tools Command Prompt, and download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet) with the command:
+```
+git clone --recursive https://github.com/apache/incubator-mxnet.git
+cd C:\incubator-mxnet
+```
+2. Enable Intel MKL-DNN with ```-DUSE_MKLDNN=1```. Use [CMake 3](https://cmake.org/) to create a Visual Studio solution in ```./build```. Make sure to specify the architecture in the
+command:
+```
+>mkdir build
+>cd build
+>cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release
+```
+3. Enable Intel MKL-DNN and Intel MKL as BLAS library by the command:
+```
+>"C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl\bin\mklvars.bat" intel64
+>cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=mkl -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release -DMKL_ROOT="C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl"
+```
+4. After CMake completes successfully, open the solution file ```.sln``` in Visual Studio and compile it, or compile the MXNet source code with the following command:
+```
+msbuild mxnet.sln /p:Configuration=Release;Platform=x64 /maxcpucount
+```
+ These commands produce an MXNet library called ```libmxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder. ```libmkldnn.dll``` will also be in ```./build/3rdparty/mkldnn/src/Release/```.
+
+5. Make sure that all the DLL files used above (such as `libmkldnn.dll`, `libmklml*.dll`, `libiomp5.dll`, `libopenblas*.dll`, etc.) are added to the system PATH. For convenience, you can put all of them into ```\windows\system32```. Otherwise you will come across `Not Found Dependencies` errors when loading MXNet.
+
+**Visual Studio 2017**
+
+Users can follow the same steps as for Visual Studio 2015 to build MXNet with MKL-DNN, but should change the version-related paths, for example ```C:\opencv\build\x64\vc15\bin```, and use the build command below:
+
+```
+>cmake -G "Visual Studio 15 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=mkl -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release -DMKL_ROOT="C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl"
+
+```
+
+
+## Verify MXNet with python
+
+Preinstall python and some dependent modules:
+```
+pip install numpy graphviz
+set PYTHONPATH=[workdir]\incubator-mxnet\python
+```
+or install mxnet
+```
+cd python
+sudo python setup.py install
+python -c "import mxnet as mx;print((mx.nd.ones((2, 3))*2).asnumpy());"
+```
+Expected Output:
+```
+[[ 2. 2. 2.]
+ [ 2. 2. 2.]]
+```
+### Verify whether MKL-DNN works
+
+After MXNet is installed, you can verify whether the MKL-DNN backend works well with a single convolution layer.
+```
+import mxnet as mx
+import numpy as np
+
+num_filter = 32
+kernel = (3, 3)
+pad = (1, 1)
+shape = (32, 32, 256, 256)
+
+x = mx.sym.Variable('x')
+w = mx.sym.Variable('w')
+y = mx.sym.Convolution(data=x, weight=w, num_filter=num_filter, kernel=kernel, no_bias=True, pad=pad)
+exe = y.simple_bind(mx.cpu(), x=shape)
+
+exe.arg_arrays[0][:] = np.random.normal(size=exe.arg_arrays[0].shape)
+exe.arg_arrays[1][:] = np.random.normal(size=exe.arg_arrays[1].shape)
+
+exe.forward(is_train=False)
+o = exe.outputs[0]
+t = o.asnumpy()
+```
+
+More detailed debugging and profiling information can be logged by setting the environment variable 'MKLDNN_VERBOSE':
+```
+export MKLDNN_VERBOSE=1
+```
+For example, running the above code snippet produces the following debugging logs, which provide more insight into the MKL-DNN primitives `convolution` and `reorder`, including the memory layout, inferred shapes, and the time cost of primitive execution.
+```
+mkldnn_verbose,exec,reorder,jit:uni,undef,in:f32_nchw out:f32_nChw16c,num:1,32x32x256x256,6.47681
+mkldnn_verbose,exec,reorder,jit:uni,undef,in:f32_oihw out:f32_OIhw16i16o,num:1,32x32x3x3,0.0429688
+mkldnn_verbose,exec,convolution,jit:avx512_common,forward_inference,fsrc:nChw16c fwei:OIhw16i16o fbia:undef fdst:nChw16c,alg:convolution_direct,mb32_g1ic32oc32_ih256oh256kh3sh1dh0ph1_iw256ow256kw3sw1dw0pw1,9.98193
+mkldnn_verbose,exec,reorder,jit:uni,undef,in:f32_oihw out:f32_OIhw16i16o,num:1,32x32x3x3,0.0510254
+mkldnn_verbose,exec,reorder,jit:uni,undef,in:f32_nChw16c out:f32_nchw,num:1,32x32x256x256,20.4819
+```
+
+
+## Enable MKL BLAS
+
+With MKL BLAS, performance is expected to improve further, by an amount that depends on the computational load of the model.
+You can redistribute not only the dynamic libraries but also the headers, examples, and static libraries on accepting the [Intel Simplified Software License](https://software.intel.com/en-us/license/intel-simplified-software-license).
+Installing the full MKL installation enables MKL support for all operators under the linalg namespace.
+
+ 1. Download and install the latest full MKL version following instructions on the [intel website.](https://software.intel.com/en-us/mkl) You can also install MKL through [YUM](https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-yum-repo) or [APT](https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-apt-repo) Repository.
+
+ 2. Run `make -j $(nproc) USE_BLAS=mkl`
+
+ 3. Navigate into the python directory
+
+ 4. Run `sudo python setup.py install`
+
+### Verify whether MKL works
+
+After MXNet is installed, you can verify whether MKL BLAS works well with a single dot layer.
+
+```
+import mxnet as mx
+import numpy as np
+
+shape_x = (1, 10, 8)
+shape_w = (1, 12, 8)
+
+x_npy = np.random.normal(0, 1, shape_x)
+w_npy = np.random.normal(0, 1, shape_w)
+
+x = mx.sym.Variable('x')
+w = mx.sym.Variable('w')
+y = mx.sym.batch_dot(x, w, transpose_b=True)
+exe = y.simple_bind(mx.cpu(), x=x_npy.shape, w=w_npy.shape)
+
+exe.forward(is_train=False)
+o = exe.outputs[0]
+t = o.asnumpy()
+```
+
+You can enable the `MKL_VERBOSE` flag by setting the environment variable:
+```
+export MKL_VERBOSE=1
+```
+Then, running the above code snippet should produce output like the following, which shows that the `SGEMM` primitive from MKL is called. Layout information and primitive execution performance are also shown in the log message.
+```
+Numpy + Intel(R) MKL: THREADING LAYER: (null)
+Numpy + Intel(R) MKL: setting Intel(R) MKL to use INTEL OpenMP runtime
+Numpy + Intel(R) MKL: preloading libiomp5.so runtime
+MKL_VERBOSE Intel(R) MKL 2019.0 Update 3 Product build 20190125 for Intel(R) 64 architecture Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) enabled processors, Lnx 2.40GHz lp64 intel_thread NMICDev:0
+MKL_VERBOSE SGEMM(T,N,12,10,8,0x7f7f927b1378,0x1bc2140,8,0x1ba8040,8,0x7f7f927b1380,0x7f7f7400a280,12) 8.93ms CNR:OFF Dyn:1 FastMM:1 TID:0 NThr:40 WDiv:HOST:+0.000
+```
+
+
+## Enable graph optimization
+
+Graph optimization with subgraphs is available and enabled by default in the master branch. For MXNet release v1.5, you can enable it manually by:
+
+```
+export MXNET_SUBGRAPH_BACKEND=MKLDNN
+```
+
+The limitations of this experimental feature are:
+
+- Use this feature only for inference. When training, be sure to turn the feature off by unsetting the `MXNET_SUBGRAPH_BACKEND` environment variable.
+
+- This feature will only run on the CPU, even if you're using a GPU-enabled build of MXNet.
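+
+With those caveats in mind, here is a minimal inference sketch; the ResNet-18 model and input shape are illustrative, and it assumes that setting the variable from Python before the graph is constructed has the same effect as `export`:
+
+```
+import os
+# Assumed equivalent to `export MXNET_SUBGRAPH_BACKEND=MKLDNN` when set before graph construction.
+os.environ['MXNET_SUBGRAPH_BACKEND'] = 'MKLDNN'
+
+import mxnet as mx
+from mxnet.gluon.model_zoo import vision
+
+net = vision.resnet18_v1(pretrained=False)
+net.initialize()
+net.hybridize(static_alloc=True, static_shape=True)
+
+x = mx.nd.random.uniform(shape=(1, 3, 224, 224))
+y = net(x)
+y.wait_to_read()
+```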
+
+
+
+## Quantization and Inference with INT8
+
+MXNet built with Intel MKL-DNN brings outstanding performance improvements for quantization and INT8 inference on the Intel Xeon Scalable platform.
+
+- [CNN Quantization Examples](https://github.com/apache/incubator-mxnet/tree/master/example/quantization).
+
+- [Model Quantization for Production-Level Neural Network Inference](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Graph+Optimization+and+Quantization+based+on+subgraph+and+MKL-DNN).
+
+
+## Next Steps and Support
+
+- For questions or support specific to MKL, visit the [Intel MKL](https://software.intel.com/en-us/mkl) website.
+
+- For questions or support specific to MKL-DNN, visit the [Intel MKLDNN](https://github.com/intel/mkl-dnn) website.
+
+- If you find bugs, please open an issue on GitHub for [MXNet with MKL](https://github.com/apache/incubator-mxnet/labels/MKL) or [MXNet with MKLDNN](https://github.com/apache/incubator-mxnet/labels/MKLDNN).
diff --git a/docs/python_docs/python/tutorials/performance/backend/index.rst b/docs/python_docs/python/tutorials/performance/backend/index.rst
new file mode 100644
index 000000000000..fbe40c0ed684
--- /dev/null
+++ b/docs/python_docs/python/tutorials/performance/backend/index.rst
@@ -0,0 +1,52 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Accelerated Backend Tools
+=========================
+The following tutorials will help you learn how to use backend tools to boost performance.
+
+.. container:: cards
+
+ .. card::
+ :title: TensorRT
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/tensorrt/inference_with_trt.html
+
+ How to use NVIDIA's TensorRT to boost inference performance.
+
+
+ .. card::
+ :title: MKL-DNN
+ :link: mkl-dnn.html
+
+ How to get the most from your CPU by using Intel's MKL-DNN.
+
+ .. card::
+ :title: TVM
+ :link: tvm.html
+
+ How to use TVM to boost performance.
+..
+
+.. toctree::
+ :hidden:
+ :maxdepth: 1
+
+ tensorRt
+
+ ..
+ mkl-dnn
+ tvm
\ No newline at end of file
diff --git a/docs/python_docs/python/tutorials/performance/backend/mkl-dnn.rst b/docs/python_docs/python/tutorials/performance/backend/mkl-dnn.rst
new file mode 100644
index 000000000000..1841c1c05b32
--- /dev/null
+++ b/docs/python_docs/python/tutorials/performance/backend/mkl-dnn.rst
@@ -0,0 +1,33 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Use Intel MKL-DNN to accelerate CPU performance
+===============================================
+
+.. container:: cards
+
+ .. card::
+ :title: MKL-DNN Installation and Verification
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/mkldnn/MKLDNN_README.html
+
+ A guide on using MKL-DNN with MXNet.
+
+ .. card::
+ :title: MKL-DNN Operators
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/mkldnn/operator_list.html
+
+ A list of operators that MKL-DNN supports.
diff --git a/docs/python_docs/python/tutorials/performance/backend/mkldnn_quantization.md b/docs/python_docs/python/tutorials/performance/backend/mkldnn_quantization.md
new file mode 100644
index 000000000000..b2eaa2e44d53
--- /dev/null
+++ b/docs/python_docs/python/tutorials/performance/backend/mkldnn_quantization.md
@@ -0,0 +1,258 @@
+# Quantize custom models with MKL-DNN backend
+
+This document describes how to quantize custom models from FP32 to INT8 using the Apache MXNet toolkit and APIs on Intel CPUs.
+
+If you are not familiar with the Apache MXNet quantization flow, please read the [quantization blog](https://medium.com/apache-mxnet/model-quantization-for-production-level-neural-network-inference-f54462ebba05) first. Performance numbers are shown for the [Apache MXNet C++ interface](https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) and [GluonCV](https://gluon-cv.mxnet.io/build/examples_deployment/int8_inference.html).
+
+## Installation and Prerequisites
+
+Installing MXNet with the MKL-DNN backend is straightforward. You can follow [How to build and install MXNet with MKL-DNN backend](https://mxnet.incubator.apache.org/tutorials/mkldnn/MKLDNN_README.html) to build and install MXNet from source, or install the release or nightly version directly from PyPI with pip:
+
+```
+# release version
+pip install mxnet-mkl
+# nightly version
+pip install mxnet-mkl --pre
+```
+
+## Image Classification Demo
+
+A quantization script [imagenet_gen_qsym_mkldnn.py](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/imagenet_gen_qsym_mkldnn.py) has been designed to launch quantization for image-classification models. This script is integrated with the [Gluon-CV model zoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that all pre-trained models can be downloaded from Gluon-CV and then converted for quantization. For details, refer to [Model Quantization with Calibration Examples](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md).
+
+## Integrate Quantization Flow to Your Project
+
+The quantization flow works for both symbolic and Gluon models. If you're using Gluon, you can first refer to [Saving and Loading Gluon Models](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/save_load_params.html) to hybridize your computation graph and export it as a symbol before running quantization.
+
+In general, the quantization flow consists of four steps. Steps 1 to 3 are enough to reach an acceptable accuracy with minimal effort; most of this stage works out of the box, so data scientists and researchers only need to focus on how to represent the data and layers in their model. Once a quantized model is generated and you want to deploy it online, performance becomes the next key concern. Step 4, calibration, can then improve performance significantly by removing a large amount of runtime computation.
+
+![quantization flow](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/quantization.png)
+
+Now, we are going to take Gluon ResNet18 as an example to show how each step works.
+
+### Initialize Model
+
+```python
+import logging
+import mxnet as mx
+from mxnet.gluon.model_zoo import vision
+from mxnet.contrib.quantization import *
+
+logging.basicConfig()
+logger = logging.getLogger('logger')
+logger.setLevel(logging.INFO)
+
+batch_shape = (1, 3, 224, 224)
+resnet18 = vision.resnet18_v1(pretrained=True)
+resnet18.hybridize()
+resnet18.forward(mx.nd.zeros(batch_shape))
+resnet18.export('resnet18_v1')
+sym, arg_params, aux_params = mx.model.load_checkpoint('resnet18_v1', 0)
+# (optional) visualize float32 model
+mx.viz.plot_network(sym)
+```
+First, we download the resnet18-v1 model from the Gluon model zoo and export it as a symbol. You can visualize the float32 model; below is a raw residual block.
+
+![float32 model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/fp32_raw.png)
+
+#### Model Fusion
+
+```python
+sym = sym.get_backend_symbol('MKLDNN_QUANTIZE')
+# (optional) visualize fused float32 model
+mx.viz.plot_network(sym)
+```
+It's important to add this line so that graph fusion is applied before quantization, which gives better performance. Below is a fused residual block: BatchNorm, Activation and elemwise_add are fused into Convolution.
+
+![float32 fused model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/fp32_fusion.png)
+
+### Quantize Model
+
+A Python interface, `quantize_graph`, is provided so that data scientists can flexibly construct the quantized models they need for different requirements in a real deployment.
+
+```python
+# quantize configs
+# set exclude layers
+excluded_names = []
+# set calib mode.
+calib_mode = 'none'
+# set calib_layer
+calib_layer = None
+# set quantized_dtype
+quantized_dtype = 'auto'
+logger.info('Quantizing FP32 model Resnet18-V1')
+qsym, qarg_params, aux_params, collector = quantize_graph(sym=sym, arg_params=arg_params, aux_params=aux_params,
+ excluded_sym_names=excluded_names,
+ calib_mode=calib_mode, calib_layer=calib_layer,
+ quantized_dtype=quantized_dtype, logger=logger)
+# (optional) visualize quantized model
+mx.viz.plot_network(qsym)
+# save quantized model
+mx.model.save_checkpoint('quantized-resnet18_v1', 0, qsym, qarg_params, aux_params)
+```
+
+Applying `quantize_graph` to the symbolic model generates a new quantized model, `qsym`, along with its parameters. We can see that `_contrib_requantize` operators are inserted after `Convolution` to convert the INT32 output to FP32.
+
+![none calibrated model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/none_calib.png)
+
+The table below describes these parameters; an illustrative configuration follows the table.
+
+| param | type | description|
+|--------------------|-----------------|-------------|
+| excluded_sym_names | list of strings | A list of strings representing the names of the symbols that users want to exclude from being quantized.|
+| calib_mode | str | If calib_mode='none', no calibration will be used and the thresholds for requantization after the corresponding layers will be calculated at runtime by calling min and max operators. The quantized models generated in this mode are normally 10-20% slower than those with calibrations during inference. If calib_mode='naive', the min and max values of the layer outputs from a calibration dataset will be directly taken as the thresholds for quantization. If calib_mode='entropy', the thresholds for quantization will be derived such that the KL divergence between the distributions of FP32 layer outputs and quantized layer outputs is minimized based upon the calibration dataset. |
+| calib_layer | function | Given a layer's output name in string, return True or False for deciding whether to calibrate this layer. If yes, the statistics of the layer's output will be collected; otherwise, no information of the layer's output will be collected. If not provided, all the layers' outputs that need requantization will be collected.|
+| quantized_dtype | str | The quantized destination type for input data. Currently support 'int8', 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result.|
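+
+For illustration, a hypothetical configuration combining these parameters could look like the following (the layer name is made up; inspect your own symbol for real names):
+
+```python
+# Hypothetical example values for the parameters described above.
+excluded_names = ['resnetv10_dense0_fwd']             # e.g. skip quantizing the final FC layer
+calib_mode = 'entropy'                                # derive thresholds by minimizing KL divergence
+calib_layer = lambda name: name.endswith('_output')   # collect statistics for every layer output
+quantized_dtype = 'auto'                              # let calibration choose int8/uint8
+```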
+
+### Evaluate & Tune
+
+Now you have a quantized symbol and a params file for inference. For Gluon inference, the only difference is that the model and params are loaded through a `SymbolBlock`, as in the example below:
+
+```python
+quantized_net = mx.gluon.SymbolBlock.imports('quantized-resnet18_v1-symbol.json', 'data', 'quantized-resnet18_v1-0000.params')
+quantized_net.hybridize(static_shape=True, static_alloc=True)
+batch_size = 1
+data = mx.nd.ones((batch_size,3,224,224))
+quantized_net(data)
+```
+
+Now you can measure the accuracy of the quantized network. Furthermore, you can try excluding different layers or operators via the `excluded_sym_names` parameter until you reach an acceptable accuracy.
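+
+As a rough sketch of such an evaluation (assuming a validation `DataLoader` named `val_data` that yields preprocessed 224x224 image batches and labels):
+
+```python
+# Minimal accuracy loop for the quantized SymbolBlock.
+metric = mx.metric.Accuracy()
+for data, label in val_data:
+    outputs = quantized_net(data)
+    metric.update(label, outputs)
+print(metric.get())
+```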
+
+### Calibrate Model (optional for performance)
+
+The quantized model generated in the previous steps can be very slow during inference since it computes the min and max thresholds at runtime. We recommend using offline calibration for better performance by setting `calib_mode` to `naive` or `entropy`, and then calling the `set_monitor_callback` API to collect layer statistics on a subset of the validation dataset before INT8 inference.
+
+```python
+# quantization configs
+# set exclude layers
+excluded_names = []
+# set calib mode.
+calib_mode = 'naive'
+# set calib_layer
+calib_layer = None
+# set quantized_dtype
+quantized_dtype = 'auto'
+logger.info('Quantizing FP32 model resnet18-V1')
+cqsym, cqarg_params, aux_params, collector = quantize_graph(sym=sym, arg_params=arg_params, aux_params=aux_params,
+ excluded_sym_names=excluded_names,
+ calib_mode=calib_mode, calib_layer=calib_layer,
+ quantized_dtype=quantized_dtype, logger=logger)
+
+# download imagenet validation dataset
+mx.test_utils.download('http://data.mxnet.io/data/val_256_q90.rec', 'dataset.rec')
+# set rgb info for data
+mean_std = {'mean_r': 123.68, 'mean_g': 116.779, 'mean_b': 103.939, 'std_r': 58.393, 'std_g': 57.12, 'std_b': 57.375}
+# set batch size
+batch_size = 16
+# create DataIter
+data = mx.io.ImageRecordIter(path_imgrec='dataset.rec', batch_size=batch_size, data_shape=batch_shape[1:], rand_crop=False, rand_mirror=False, **mean_std)
+# create module
+mod = mx.mod.Module(symbol=sym, label_names=None, context=mx.cpu())
+mod.bind(for_training=False, data_shapes=data.provide_data, label_shapes=None)
+mod.set_params(arg_params, aux_params)
+
+# calibration configs
+# set num_calib_batches
+num_calib_batches = 5
+max_num_examples = num_calib_batches * batch_size
+# monitor FP32 Inference
+mod._exec_group.execs[0].set_monitor_callback(collector.collect, monitor_all=True)
+num_batches = 0
+num_examples = 0
+for batch in data:
+ mod.forward(data_batch=batch, is_train=False)
+ num_batches += 1
+ num_examples += batch_size
+ if num_examples >= max_num_examples:
+ break
+if logger is not None:
+ logger.info("Collected statistics from %d batches with batch_size=%d"
+ % (num_batches, batch_size))
+```
+
+After that, the layer statistics are collected into the `collector` returned by the `quantize_graph` API. You then write this information into the INT8 model by calling the `calib_graph` API.
+
+
+```python
+# write scaling factor into quantized symbol
+cqsym, cqarg_params, aux_params = calib_graph(qsym=cqsym, arg_params=arg_params, aux_params=aux_params,
+ collector=collector, calib_mode=calib_mode,
+ quantized_dtype=quantized_dtype, logger=logger)
+# (optional) visualize quantized model
+mx.viz.plot_network(cqsym)
+```
+
+Below is a quantized residual block with naive calibration. We can see `min_calib_range` and `max_calib_range` are written into `_contrib_requantize` operators.
+
+![naive calibrated model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/naive_calib.png)
+
+Once you have a calibrated quantized model, make sure to call the fusion API again, since it can fuse some `requantize` or `dequantize` operators for a further performance improvement.
+
+```python
+# perform post-quantization fusion
+cqsym = cqsym.get_backend_symbol('MKLDNN_QUANTIZE')
+# (optional) visualize post-quantized model
+mx.viz.plot_network(cqsym)
+# save quantized model
+mx.model.save_checkpoint('quantized-resnet18_v1', 0, cqsym, cqarg_params, aux_params)
+```
+
+Below is a post-quantized residual block. We can see `_contrib_requantize` operators are fused into `Convolution` operators.
+
+![post-quantized model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/post_quantize.png)
+
+You can also modify the `min_calib_range` and `max_calib_range` values in the saved JSON file directly.
+
+```
+ {
+ "op": "_sg_mkldnn_conv",
+ "name": "quantized_sg_mkldnn_conv_bn_act_6",
+ "attrs": {
+ "max_calib_range": "3.562147",
+ "min_calib_range": "0.000000",
+ "quantized": "true",
+ "with_act": "true",
+ "with_bn": "true"
+ },
+......
+```
+
+### Tips for Model Calibration
+
+#### Accuracy Tuning
+
+- Try the `entropy` calibration mode.
+
+- Try excluding layers that cause an obvious accuracy drop.
+
+- Change the calibration dataset by setting a different `num_calib_batches`, or shuffle your validation dataset.
+
+#### Performance Tuning
+
+- Make sure to perform graph fusion before quantization.
+
+- If many `requantize` layers exist, make sure to perform post-quantization fusion after calibration.
+
+- Compare the MXNet profiler output or `MKLDNN_VERBOSE` logs of float32 and int8 inference (see the sketch below).
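+
+For the last point, one quick way to capture MKL-DNN primitive logs from Python is sketched below (an assumption here is that the variable is set before `mxnet` is imported):
+
+```python
+import os
+
+# MKL-DNN prints one log line per executed primitive when this is set.
+os.environ['MKLDNN_VERBOSE'] = '1'
+
+import mxnet as mx
+# Run the same inference with the float32 and the int8 model and compare
+# the primitive logs (and/or the MXNet profiler output) between the two runs.
+```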
+
+## Deploy with Python/C++
+
+MXNet also supports deploying quantized models with C++. Refer to the [MXNet C++ Package](https://github.com/apache/incubator-mxnet/blob/master/cpp-package/README.md) for more details.
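+
+For a pure Python deployment, a minimal sketch using the calibrated model saved above could look like this (the input shape and dummy data are assumptions for illustration):
+
+```python
+# Load the calibrated INT8 model and run a single inference pass on CPU.
+sym, arg_params, aux_params = mx.model.load_checkpoint('quantized-resnet18_v1', 0)
+mod = mx.mod.Module(symbol=sym, label_names=None, context=mx.cpu())
+mod.bind(for_training=False, data_shapes=[('data', (1, 3, 224, 224))])
+mod.set_params(arg_params, aux_params)
+mod.forward(mx.io.DataBatch([mx.nd.ones((1, 3, 224, 224))]), is_train=False)
+print(mod.get_outputs()[0].shape)
+```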
+
+
diff --git a/docs/python_docs/python/tutorials/performance/backend/profiler.md b/docs/python_docs/python/tutorials/performance/backend/profiler.md
new file mode 100644
index 000000000000..91a74e4f49cf
--- /dev/null
+++ b/docs/python_docs/python/tutorials/performance/backend/profiler.md
@@ -0,0 +1,333 @@
+# Profiling MXNet Models
+
+It is often helpful to check the execution time of each operation in a neural network. You can then determine where to focus your effort to speed up model training or inference. In this tutorial, we will learn how to profile MXNet models to measure their running time and memory consumption using the MXNet profiler.
+
+## The incorrect way to profile
+
+If you have just started to use MXNet, you might be tempted to measure the execution time of your model using Python's `time` module, as shown below:
+
+```python
+from time import time
+from mxnet import autograd, nd
+import mxnet as mx
+
+start = time()
+x = nd.random_uniform(shape=(2000,2000))
+y = nd.dot(x, x)
+print('Time for matrix multiplication: %f sec\n' % (time() - start))
+
+start = time()
+y_np = y.asnumpy()
+print('Time for converting to numpy: %f sec' % (time() - start))
+```
+
+**Time for matrix multiplication: 0.005051 sec**
+
+**Time for converting to numpy: 0.167693 sec**
+
+From the timings above, it seems as if converting to numpy takes a lot more time than multiplying two large matrices. That doesn't seem right.
+
+This is because, in MXNet, all operations are executed asynchronously. So, when `nd.dot(x, x)` returns, the matrix multiplication is not complete; it has only been queued for execution. However, [`asnumpy`](http://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html?highlight=asnumpy#mxnet.ndarray.NDArray.asnumpy) has to wait for the result to be calculated in order to convert it to a numpy array on the CPU, and hence takes longer. Other examples of 'blocking' operations include [`asscalar`](http://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html?highlight=asscalar#mxnet.ndarray.NDArray.asscalar) and [`wait_to_read`](http://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html?highlight=wait_to_read#mxnet.ndarray.NDArray.wait_to_read).
+
+While it is possible to use [`NDArray.waitall()`](http://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html?highlight=waitall#mxnet.ndarray.waitall) before and after operations to get running time of operations, it is not a scalable method to measure running time of multiple sets of operations, especially in a [`Sequential`](http://mxnet.incubator.apache.org/api/python/gluon/gluon.html?highlight=sequential#mxnet.gluon.nn.Sequential) or hybridized network.
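+
+For completeness, a coarse measurement with explicit synchronization looks like this (a sketch reusing the arrays from the snippet above):
+
+```python
+# Flush any pending work, then time the operation end to end.
+mx.nd.waitall()
+start = time()
+y = nd.dot(x, x)
+mx.nd.waitall()   # block until the multiplication has actually finished
+print('Time for matrix multiplication: %f sec' % (time() - start))
+```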
+
+## The correct way to profile
+
+The correct way to measure running time of MXNet models is to use MXNet profiler. In the rest of this tutorial, we will learn how to use the MXNet profiler to measure the running time and memory consumption of MXNet models. You can import the profiler and configure it from Python code.
+
+```python
+from mxnet import profiler
+
+profiler.set_config(profile_all=True,
+ aggregate_stats=True,
+ filename='profile_output.json')
+```
+
+`profile_all` enables all types of profiling. You can also individually enable the following types of profiling:
+
+- `profile_symbolic` (boolean): whether to profile symbolic operators
+- `profile_imperative` (boolean): whether to profile imperative operators
+- `profile_memory` (boolean): whether to profile memory usage
+- `profile_api` (boolean): whether to profile the C API
+
+`aggregate_stats` aggregates statistics in memory which can then be printed to console by calling `profiler.dumps()`.
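+
+For example, a narrower configuration that skips C API profiling might look like this (a sketch; the argument names follow the list above):
+
+```python
+profiler.set_config(profile_symbolic=True,
+                    profile_imperative=True,
+                    profile_memory=True,
+                    profile_api=False,
+                    aggregate_stats=True,
+                    filename='profile_output.json')
+```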
+
+### Setup: Build a model
+
+Let's build a small convolutional neural network that we can use to demonstrate profiling.
+
+```python
+from mxnet import gluon
+
+net = gluon.nn.HybridSequential()
+with net.name_scope():
+ net.add(gluon.nn.Conv2D(channels=20, kernel_size=5, activation='relu'))
+ net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
+ net.add(gluon.nn.Conv2D(channels=50, kernel_size=5, activation='relu'))
+ net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
+ net.add(gluon.nn.Flatten())
+ net.add(gluon.nn.Dense(512, activation="relu"))
+ net.add(gluon.nn.Dense(10))
+```
+
+We need data that we can run through the network for profiling. We'll use the MNIST dataset.
+
+```python
+from mxnet.gluon.data.vision import transforms
+
+dataset = gluon.data.vision.MNIST(train=True)
+dataset = dataset.transform_first(transforms.ToTensor())
+dataloader = gluon.data.DataLoader(dataset, batch_size=64, shuffle=True)
+```
+
+Let's define a function that will run a single training iteration given `data` and `label`.
+
+```python
+# Use GPU if available
+if mx.context.num_gpus():
+ ctx=mx.gpu()
+else:
+ ctx=mx.cpu()
+
+# Initialize the parameters with random weights
+net.collect_params().initialize(mx.init.Xavier(), ctx=ctx)
+
+# Use SGD optimizer
+trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})
+
+# Softmax Cross Entropy is a frequently used loss function for multi-class classification
+softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
+
+# A helper function to run one training iteration
+def run_training_iteration(data, label):
+ # Load data and label into the right context
+ data = data.as_in_context(ctx)
+ label = label.as_in_context(ctx)
+ # Run the forward pass
+ with autograd.record():
+ output = net(data)
+ loss = softmax_cross_entropy(output, label)
+ # Run the backward pass
+ loss.backward()
+ # Apply changes to parameters
+ trainer.step(data.shape[0])
+```
+
+### Starting and stopping the profiler from Python
+
+When the first forward pass is run on a network, MXNet does a number of housekeeping tasks including inferring the shapes of various parameters, allocating memory for intermediate and final outputs, etc. For these reasons, profiling the first iteration doesn't provide representative results for the rest of training. We will, therefore, skip the first iteration.
+
+```python
+# Run the first iteration without profiling
+itr = iter(dataloader)
+run_training_iteration(*next(itr))
+```
+
+We'll run the next iteration with the profiler turned on.
+
+```python
+data, label = next(itr)
+
+# Ask the profiler to start recording
+profiler.set_state('run')
+
+run_training_iteration(data, label)
+
+# Make sure all operations have completed
+mx.nd.waitall()
+# Ask the profiler to stop recording
+profiler.set_state('stop')
+# Dump all collected results to file
+profiler.dump()
+```
+
+Between running and stopping the profiler, you can also pause and resume the profiler using `profiler.pause()` and `profiler.resume()` respectively to profile only parts of the code you want to profile.
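+
+A small sketch of this pattern (assuming the profiler has already been configured as above):
+
+```python
+profiler.set_state('run')
+
+profiler.pause()
+data, label = next(itr)                 # data loading: excluded from the profile
+profiler.resume()
+
+run_training_iteration(data, label)     # training step: included in the profile
+
+mx.nd.waitall()
+profiler.set_state('stop')
+```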
+
+### Starting the profiler automatically using an environment variable
+
+The method described above requires code changes to start and stop the profiler. You can also start the profiler automatically and profile the entire code without any code changes using the `MXNET_PROFILER_AUTOSTART` environment variable.
+
+`$ MXNET_PROFILER_AUTOSTART=1 python my_script.py`
+
+MXNet will start the profiler automatically if you run your code with the environment variable `MXNET_PROFILER_AUTOSTART` set to `1`. The profiler output is stored in `profile.json` inside the current directory.
+
+Note that the profiler output could be large depending on your code. It might be helpful to profile only sections of your code using the `set_state` API described in the previous section.
+
+### Increasing granularity of the profiler output
+
+MXNet executes computation graphs in 'bulk mode' which reduces kernel launch gaps in between symbolic operators for faster execution. This could reduce the granularity of the profiler output. If you need profiling result of every operator, please set the environment variables `MXNET_EXEC_BULK_EXEC_INFERENCE` and `MXNET_EXEC_BULK_EXEC_TRAIN` to `0` to disable the bulk execution mode.
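+
+The variables can be exported in the shell, or set from Python before `mxnet` is imported (a sketch, assuming no prior import of `mxnet`):
+
+```python
+import os
+
+# Disable bulk execution so that every operator shows up individually.
+os.environ['MXNET_EXEC_BULK_EXEC_INFERENCE'] = '0'
+os.environ['MXNET_EXEC_BULK_EXEC_TRAIN'] = '0'
+
+import mxnet as mx
+```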
+
+When working with networks created using the Gluon API, you will get more granular profiling output if you profile networks that haven't been hybridized. Operations can appear fused together in the profiling output after hybridization, which can make debugging tricky.
+
+### Viewing profiler output
+
+There are a few ways to view the information collected by the profiler. You can view it in the console, you can view a more graphical version in a browser, or you can use a vendor tool such as Intel VTune or Nvidia NVProf to view output. For most scenarios the information you need can be obtained with MXNet's built in profiler support, but if you want to investigate the performance of operators alongside extra context about your hardware (e.g. cache hit rates, or CUDA kernel timings) then profiling jointly with vendor tools is recommended.
+
+#### 1. View in console
+
+You can use the `profiler.dumps()` method to view the information collected by the profiler in the console. The collected information contains time taken by each operator, time taken by each C API and memory consumed in both CPU and GPU.
+
+```python
+print(profiler.dumps())
+```
+
+![Profile Statistics](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/python/profiler/profile_stats.png)
+
+#### 2. View in browser
+
+You can also dump the information collected by the profiler into a `json` file using the `profiler.dump()` function and view it in a browser.
+
+```python
+profiler.dump(finished=False)
+```
+
+`dump()` creates a `json` file which can be viewed using a trace consumer like `chrome://tracing` in the Chrome browser. Here is a snapshot that shows the output of the profiling we did above. Note that setting the `finished` parameter to `False` will prevent the profiler from finishing dumping to file. If you just use `profiler.dump()`, you will no longer be able to profile the remaining sections of your model.
+
+![Tracing Screenshot](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/python/profiler/profiler_output_chrome.png)
+
+Let's zoom in to check the time taken by operators.
+
+![Operator profiling](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/python/profiler/profile_operators.png)
+
+The above picture visualizes the sequence in which the operators were executed and the time taken by each operator.
+
+### Profiling Custom Operators
+Should the existing NDArray operators fail to meet all your model's needs, MXNet supports [Custom Operators](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/customop.html) that you can define in Python. In `forward()` and `backward()` of a custom operator, there are two kinds of code: "pure Python" code (NumPy operators included) and "sub-operators" (NDArray operators called within `forward()` and `backward()`). With that said, MXNet can profile the execution time of both kinds without additional setup. Specifically, the MXNet profiler will break a single custom operator call into a pure Python event and several sub-operator events if there are any. Furthermore, all of those events will have a prefix in their names, which is, conveniently, the name of the custom operator you called.
+
+Let's try profiling custom operators with the following code example:
+
+```python
+class MyAddOne(mx.operator.CustomOp):
+ def forward(self, is_train, req, in_data, out_data, aux):
+ self.assign(out_data[0], req[0], in_data[0]+1)
+
+ def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
+ self.assign(in_grad[0], req[0], out_grad[0])
+
+@mx.operator.register('MyAddOne')
+class CustomAddOneProp(mx.operator.CustomOpProp):
+ def __init__(self):
+ super(CustomAddOneProp, self).__init__(need_top_grad=True)
+
+ def list_arguments(self):
+ return ['data']
+
+ def list_outputs(self):
+ return ['output']
+
+ def infer_shape(self, in_shape):
+ return [in_shape[0]], [in_shape[0]], []
+
+ def create_operator(self, ctx, shapes, dtypes):
+ return MyAddOne()
+
+
+inp = mx.nd.zeros(shape=(500, 500))
+
+profiler.set_config(profile_all=True, continuous_dump=True, \
+ aggregate_stats=True)
+profiler.set_state('run')
+
+w = nd.Custom(inp, op_type="MyAddOne")
+
+mx.nd.waitall()
+
+profiler.set_state('stop')
+print(profiler.dumps())
+profiler.dump(finished=False)
+```
+
+Here, we have created a custom operator called `MyAddOne`, and within its `forward()` function, we simply add one to the input. We can visualize the dump file in `chrome://tracing/`:
+
+![Custom Operator Profiling Screenshot](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/python/profiler/profiler_output_custom_operator_chrome.png)
+
+As shown in the screenshot, in the **Custom Operator** domain, where all custom operator-related events fall, we can easily visualize the execution time of each segment of `MyAddOne`. We can tell that `MyAddOne::pure_python` is executed first. We can also see that `CopyCPU2CPU` and `_plus_scalar` are two "sub-operators" of `MyAddOne`, and the sequence in which they are executed.
+
+Please note that to be able to see the previously described information, you need to set `profile_imperative` to `True` even when you are using custom operators in [symbolic mode](https://mxnet.incubator.apache.org/versions/master/tutorials/basic/symbol.html) (refer to the code snippet below, which is the symbolic-mode equivalent of the code example above). The reason is that, within custom operators, pure Python code and sub-operators are still called imperatively.
+
+```python
+# Set profile_all to True
+profiler.set_config(profile_all=True, aggregate_stats=True, continuous_dump=True)
+# OR, Explicitly Set profile_symbolic and profile_imperative to True
+profiler.set_config(profile_symbolic=True, profile_imperative=True, \
+ aggregate_stats=True, continuous_dump=True)
+
+profiler.set_state('run')
+# Use Symbolic Mode
+a = mx.symbol.Variable('a')
+b = mx.symbol.Custom(data=a, op_type='MyAddOne')
+c = b.bind(mx.cpu(), {'a': inp})
+y = c.forward()
+mx.nd.waitall()
+profiler.set_state('stop')
+print(profiler.dumps())
+profiler.dump()
+```
+
+### Some Rules to Pay Attention to
+1. Always use `profiler.dump(finished=False)` if you do not intend to finish dumping to file. Otherwise, calling `profiler.dump()` in the middle of your model may lead to unexpected behaviors; and if you subsequently call `profiler.set_config()`, the program will error out.
+
+2. You can only dump to one file. Do not change the target file by calling `profiler.set_config(filename='new_name.json')` in the middle of your model. This will lead to incomplete dump outputs.
+
+## Advanced: Using NVIDIA Profiling Tools
+
+MXNet's Profiler is the recommended starting point for profiling MXNet code, but NVIDIA also provides a couple of tools for low-level profiling of CUDA code: [NVProf](https://devblogs.nvidia.com/cuda-pro-tip-nvprof-your-handy-universal-gpu-profiler/), [Visual Profiler](https://developer.nvidia.com/nvidia-visual-profiler) and [Nsight Compute](https://developer.nvidia.com/nsight-compute). You can use these tools to profile all kinds of executables, so they can be used for profiling Python scripts running MXNet. And you can use these in conjunction with the MXNet Profiler to see high-level information from MXNet alongside the low-level CUDA kernel information.
+
+#### NVProf and Visual Profiler
+
+NVProf and Visual Profiler are available in CUDA 9 and CUDA 10 toolkits. You can get a timeline view of CUDA kernel executions, and also analyse the profiling results to get automated recommendations. It is useful for profiling end-to-end training but the interface can sometimes become slow and unresponsive.
+
+You can initiate the profiling directly from inside Visual Profiler or from the command line with `nvprof` which wraps the execution of your Python script. If it's not on your path already, you can find `nvprof` inside your CUDA directory. See [this discussion post](https://discuss.mxnet.io/t/using-nvidia-profiling-tools-visual-profiler-and-nsight-compute/) for more details on setup.
+
+`$ nvprof -o my_profile.nvvp python my_profiler_script.py`
+
+`==11588== NVPROF is profiling process 11588, command: python my_profiler_script.py`
+
+`==11588== Generated result file: /home/user/Development/incubator-mxnet/ci/my_profile.nvvp`
+
+We specified an output file called `my_profile.nvvp` and this will be annotated with NVTX ranges (for MXNet operations) that will be displayed alongside the standard NVProf timeline. This can be very useful when you're trying to find patterns between operators run by MXNet, and their associated CUDA kernel calls.
+
+You can open this file in Visual Profiler to visualize the results.
+
+![Operator profiling](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/python/profiler/profiler_nvprof.png)
+
+At the top of the plot we have CPU tasks such as driver operations, memory copy calls, MXNet engine operator invocations, and imperative MXNet API calls. Below we see the kernels active on the GPU during the same time period.
+
+![Operator profiling](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/python/profiler/profiler_nvprof_zoomed.png)
+
+Zooming in on a backwards convolution operator, we can see that it is in fact made up of a number of different GPU kernel calls, including a cuDNN Winograd convolution call and a fast Fourier transform call.
+
+![Operator profiling](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/python/profiler/profiler_winograd.png)
+
+Selecting any of these kernel calls (the winograd convolution call shown here) will get you some interesting GPU performance information such as occupancy rates (vs theoretical), shared memory usage and execution duration.
+
+#### Nsight Compute
+
+Nsight Compute is available in the CUDA 10 toolkit, but it can also be used to profile code running on CUDA 9. You don't get a timeline view, but you get many low-level statistics about each individual kernel executed and can compare multiple runs (i.e. create a baseline).
+
+![Nsight Compute](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/python/profiler/profile_nsight_compute.png)
+
+### Further reading
+
+- [Examples using MXNet profiler.](https://github.com/apache/incubator-mxnet/tree/master/example/profiler)
+- [Some tips for improving MXNet performance.](https://mxnet.incubator.apache.org/faq/perf.html)
+
+
+
diff --git a/docs/python_docs/python/tutorials/performance/backend/tvm.rst b/docs/python_docs/python/tutorials/performance/backend/tvm.rst
new file mode 100644
index 000000000000..8da277bd119f
--- /dev/null
+++ b/docs/python_docs/python/tutorials/performance/backend/tvm.rst
@@ -0,0 +1,21 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Use TVM
+=======
+
+Contributions welcome!
diff --git a/docs/python_docs/python/tutorials/performance/compression/index.rst b/docs/python_docs/python/tutorials/performance/compression/index.rst
new file mode 100644
index 000000000000..e2e4f2c73250
--- /dev/null
+++ b/docs/python_docs/python/tutorials/performance/compression/index.rst
@@ -0,0 +1,59 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Compression
+===========
+The following tutorials will help you learn how to use compression techniques with MXNet.
+
+.. container:: cards
+
+ .. card::
+ :title: Compression: float16
+ :link: https://mxnet.incubator.apache.org/api/faq/float16
+
+ How to use float16 in your model to boost training speed.
+
+ .. card::
+ :title: Gradient Compression
+ :link: https://mxnet.incubator.apache.org/api/faq/gradient_compression
+
+ How to use gradient compression to reduce communication bandwidth and increase speed.
+
+ .. card::
+ :title: Inference with Quantized Models
+ :link: https://gluon-cv.mxnet.io/build/examples_deployment/int8_inference.html
+
+ How to use quantized GluonCV models for inference on Intel Xeon Processors to gain higher performance.
+
+ ..
+ TBD Content
+ .. card::
+ :title: Compression: int8
+ :link: int8.html
+
+ How to use int8 in your model to boost training speed.
+ ..
+
+.. toctree::
+ :hidden:
+ :glob:
+
+ *
+
+ ..
+ int8
+ ..
diff --git a/docs/python_docs/python/tutorials/performance/compression/int8.rst b/docs/python_docs/python/tutorials/performance/compression/int8.rst
new file mode 100644
index 000000000000..a0cad0f03800
--- /dev/null
+++ b/docs/python_docs/python/tutorials/performance/compression/int8.rst
@@ -0,0 +1,21 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Deploy with int-8
+=================
+
+Contributions welcome!
diff --git a/docs/python_docs/python/tutorials/performance/index.rst b/docs/python_docs/python/tutorials/performance/index.rst
new file mode 100644
index 000000000000..6d9159eb2ace
--- /dev/null
+++ b/docs/python_docs/python/tutorials/performance/index.rst
@@ -0,0 +1,130 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Performance
+===========
+The following tutorials will help you learn how to tune MXNet or use tools that will improve training and inference performance.
+
+Essential
+---------
+
+.. container:: cards
+
+ .. card::
+ :title: Improving Performance
+ :link: https://mxnet.incubator.apache.org/versions/master/faq/perf.html
+
+ How to get the best performance from MXNet.
+
+ .. card::
+ :title: Profiler
+ :link: backend/profiler.html
+
+ How to profile MXNet models.
+
+ .. card::
+ :title: Tuning NumPy Operations
+ :link: https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/gotchas_numpy_in_mxnet.html
+
+ Gotchas using NumPy in MXNet.
+
+Compression
+-----------
+
+.. container:: cards
+
+ .. card::
+ :title: Compression: float16
+ :link: compression/float16.html
+
+ How to use float16 in your model to boost training speed.
+
+ .. card::
+ :title: Gradient Compression
+ :link: compression/gradient_compression.html
+
+ How to use gradient compression to reduce communication bandwidth and increase speed.
+ ..
+ .. card::
+ :title: Compression: int8
+ :link: compression/int8.html
+
+ How to use int8 in your model to boost training speed.
+ ..
+
+
+Accelerated Backend
+-------------------
+
+.. container:: cards
+
+ .. card::
+ :title: TensorRT
+ :link: backend/tensorRt.html
+
+ How to use NVIDIA's TensorRT to boost inference performance.
+
+ ..
+ TBD Content
+ .. card::
+ :title: MKL-DNN
+ :link: backend/mkl-dnn.html
+
+ How to get the most from your CPU by using Intel's MKL-DNN.
+
+ .. card::
+ :title: TVM
+ :link: backend/tvm.html
+
+ How to use TVM to boost performance.
+ ..
+
+Distributed Training
+--------------------
+
+.. container:: cards
+
+ .. card::
+ :title: Distributed Training Using the KVStore API
+ :link: https://mxnet.incubator.apache.org/versions/master/faq/distributed_training.html
+
+ How to use the KVStore API to use multiple GPUs when training a model.
+
+ .. card::
+ :title: Training with Multiple GPUs Using Model Parallelism
+ :link: https://mxnet.incubator.apache.org/versions/master/faq/model_parallel_lstm.html
+
+ An overview of using multiple GPUs when training an LSTM.
+
+ .. card::
+ :title: Data Parallelism in MXNet
+ :link: https://mxnet.incubator.apache.org/versions/master/faq/multi_devices.html
+
+ An overview of distributed training strategies.
+
+ .. card::
+ :title: MXNet with Horovod
+ :link: https://github.com/apache/incubator-mxnet/tree/master/example/distributed_training-horovod
+
+ A set of example scripts demonstrating MNIST and ImageNet training with Horovod as the distributed training backend.
+
+.. toctree::
+ :hidden:
+ :maxdepth: 1
+
+ compression/index
+ backend/index
diff --git a/docs/python_docs/themes/.babelrc b/docs/python_docs/themes/.babelrc
new file mode 100644
index 000000000000..69f50d59a72b
--- /dev/null
+++ b/docs/python_docs/themes/.babelrc
@@ -0,0 +1,3 @@
+{
+ "presets": ["env"]
+}
\ No newline at end of file
diff --git a/docs/python_docs/themes/.circleci/config.yml b/docs/python_docs/themes/.circleci/config.yml
new file mode 100644
index 000000000000..edc02a35d7f0
--- /dev/null
+++ b/docs/python_docs/themes/.circleci/config.yml
@@ -0,0 +1,37 @@
+version: 2
+jobs:
+ build:
+ working_directory: ~/sphinx_materialdesign_theme
+ docker:
+ - image: circleci/python:3.6.4
+ steps:
+ - checkout
+ - run: sudo chown -R circleci:circleci /usr/local/bin
+ - run: sudo chown -R circleci:circleci /usr/local/lib/python3.6/site-packages
+ - run:
+ name: install dependencies
+ command: pip install -r requirements.txt
+ - run:
+ name: build
+ command: sphinx-build -b html ./example ./_build
+ - run:
+ name: deploy
+ command: |
+ remote=$(git config remote.origin.url)
+ pushd _build > /dev/null
+ git config --global user.email "$GH_EMAIL" > /dev/null 2>&1
+ git config --global user.name "$GH_NAME" > /dev/null 2>&1
+ touch .nojekyll
+ git init
+ git add .
+ git commit -m "Deploy to GitHub Pages. [skip ci]"
+ git push --force --quiet $remote master:gh-pages
+ popd > /dev/null
+workflows:
+ version: 2
+ build_flow:
+ jobs:
+ - build:
+ filters:
+ branches:
+ only: master
\ No newline at end of file
diff --git a/docs/python_docs/themes/.gitignore b/docs/python_docs/themes/.gitignore
new file mode 100644
index 000000000000..fd123f1c8549
--- /dev/null
+++ b/docs/python_docs/themes/.gitignore
@@ -0,0 +1,13 @@
+.idea/
+.cache/
+.vscode/
+*.egg-info/
+dist/
+_build/
+build/
+example/_build/
+node_modules/
+*.log
+**/*.pyc
+**/__pycache__
+.sass-cache/
diff --git a/docs/python_docs/themes/.sassrc b/docs/python_docs/themes/.sassrc
new file mode 100644
index 000000000000..ac7d2c97c6e8
--- /dev/null
+++ b/docs/python_docs/themes/.sassrc
@@ -0,0 +1,5 @@
+{
+ "includePaths": [
+ "node_modules"
+ ]
+}
\ No newline at end of file
diff --git a/docs/python_docs/themes/mx-theme/LICENSE b/docs/python_docs/themes/mx-theme/LICENSE
new file mode 100644
index 000000000000..74e4dc3ad961
--- /dev/null
+++ b/docs/python_docs/themes/mx-theme/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2016 myyasuda
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/docs/python_docs/themes/mx-theme/MANIFEST.in b/docs/python_docs/themes/mx-theme/MANIFEST.in
new file mode 100644
index 000000000000..b5ab8d3cb11c
--- /dev/null
+++ b/docs/python_docs/themes/mx-theme/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include mxtheme *
diff --git a/docs/python_docs/themes/mx-theme/README.md b/docs/python_docs/themes/mx-theme/README.md
new file mode 100644
index 000000000000..cea79f43a9e5
--- /dev/null
+++ b/docs/python_docs/themes/mx-theme/README.md
@@ -0,0 +1,65 @@
+# Material Design HTML Theme for Sphinx
+
+## How to use
+
+- Install the theme by
+
+```bash
+pip install mxtheme
+```
+
+- Modify the `conf.py` for your Sphinx project by
+
+creating a submodule of this repo in the same folder as your project's `conf.py`, and then setting the theme in `conf.py`:
+
+```python
+html_theme = 'mxtheme'
+```
+
+In addition, to use the `card` directive in rst, add the following two lines to your `def setup(app)` function:
+
+```python
+def setup(app):
+ ...
+ import mxtheme
+ app.add_directive('card', mxtheme.CardDirective)
+```
+
+## How to build
+
+
+Install `npm` first,
+
+On Ubuntu:
+
+```
+wget -qO- https://deb.nodesource.com/setup_8.x | sudo -E bash -
+sudo apt-get install -y nodejs
+```
+
+On macOS:
+
+```
+brew install nodejs
+```
+
+Then install the packages:
+
+```
+npm install
+```
+
+Finally, build the CSS and JS:
+
+
+```
+npm run build
+```
+
+## Acknowledgment
+
+
+This is a fork of
+[sphinx_materialdesign_theme](https://github.com/myyasuda/sphinx_materialdesign_theme), with
+some CSS/JS modifications. Please refer to the original project for more
+documentation.
diff --git a/docs/python_docs/themes/mx-theme/mxtheme/__init__.py b/docs/python_docs/themes/mx-theme/mxtheme/__init__.py
new file mode 100644
index 000000000000..a1c4e0783b6b
--- /dev/null
+++ b/docs/python_docs/themes/mx-theme/mxtheme/__init__.py
@@ -0,0 +1,13 @@
+from os import path
+from .card import CardDirective
+
+__version__ = '0.3.9'
+__version_full__ = __version__
+
+package_dir = path.dirname(path.abspath(__file__))
+
+def get_path():
+ return package_dir
+
+def setup(app):
+ app.add_html_theme('mxtheme', package_dir)
diff --git a/docs/python_docs/themes/mx-theme/mxtheme/card.py b/docs/python_docs/themes/mx-theme/mxtheme/card.py
new file mode 100644
index 000000000000..4d389a939ee8
--- /dev/null
+++ b/docs/python_docs/themes/mx-theme/mxtheme/card.py
@@ -0,0 +1,42 @@
+from sphinx.locale import _
+from docutils import nodes
+from docutils.parsers.rst import Directive, directives
+
+class card(nodes.General, nodes.Element):
+ pass
+
+class CardDirective(Directive):
+
+ # defines the parameter the directive expects
+ # directives.unchanged means you get the raw value from RST
+ required_arguments = 0
+ optional_arguments = 0
+ final_argument_whitespace = True
+ option_spec = {'title': directives.unchanged,
+ 'link': directives.unchanged,
+ 'is_head': directives.unchanged}
+ has_content = True
+ add_index = False
+
+ def run(self):
+ # gives you access to the options of the directive
+ options = self.options
+
+ cid = nodes.make_id("card-{}".format(options['title']))
+
+ classes = ['mx-card']
+ if options.get('is_head', 'False').lower() == 'true':
+ classes.append('head-card')
+ container = nodes.container(ids=[cid], classes=classes)
+
+ container += nodes.inline('', options['title'], classes=['mx-card-title'])
+ link = options.get('link')
+ if link:
+ container += nodes.inline('', link, classes=['mx-card-link'])
+
+ para = nodes.paragraph(classes=['mx-card-text'])
+ self.state.nested_parse(self.content, self.content_offset, para)
+ container += para
+
+ # we return the result
+ return [container]
diff --git a/docs/python_docs/themes/mx-theme/mxtheme/drawer.html b/docs/python_docs/themes/mx-theme/mxtheme/drawer.html
new file mode 100644
index 000000000000..f5f4a016c18c
--- /dev/null
+++ b/docs/python_docs/themes/mx-theme/mxtheme/drawer.html
@@ -0,0 +1,17 @@
+
+ {% block menu %}
+
+ {{ _('Table Of Contents') }}
+ {% set toctree = toctree(maxdepth=3, collapse=False, includehidden=True, titles_only=True) %}
+ {% if toctree %}
+ {% set lines = toctree.split('\n') %}
+
+ {% else %}
+
+
+ {% endif %}
+
+ {% endblock %}
+
diff --git a/docs/python_docs/themes/mx-theme/mxtheme/footer.html b/docs/python_docs/themes/mx-theme/mxtheme/footer.html
new file mode 100644
index 000000000000..4926f4fcb5d3
--- /dev/null
+++ b/docs/python_docs/themes/mx-theme/mxtheme/footer.html
@@ -0,0 +1,46 @@
+
+
+
\ No newline at end of file
diff --git a/docs/python_docs/themes/mx-theme/mxtheme/header.html b/docs/python_docs/themes/mx-theme/mxtheme/header.html
new file mode 100644
index 000000000000..606c6f985f6c
--- /dev/null
+++ b/docs/python_docs/themes/mx-theme/mxtheme/header.html
@@ -0,0 +1,45 @@
+
+
+
diff --git a/docs/python_docs/themes/mx-theme/mxtheme/localtoc.html b/docs/python_docs/themes/mx-theme/mxtheme/localtoc.html
new file mode 100644
index 000000000000..7ae810fcbd25
--- /dev/null
+++ b/docs/python_docs/themes/mx-theme/mxtheme/localtoc.html
@@ -0,0 +1,8 @@
+{%- if display_toc %}
+
+
+ {{ _('Table Of Contents') }}
+
+ {{ toc }}
+
+{%- endif %}
\ No newline at end of file
diff --git a/docs/python_docs/themes/mx-theme/mxtheme/relations.html b/docs/python_docs/themes/mx-theme/mxtheme/relations.html
new file mode 100644
index 000000000000..e6922d48550a
--- /dev/null
+++ b/docs/python_docs/themes/mx-theme/mxtheme/relations.html
@@ -0,0 +1,20 @@
+
diff --git a/docs/python_docs/themes/mx-theme/mxtheme/search.html b/docs/python_docs/themes/mx-theme/mxtheme/search.html
new file mode 100644
index 000000000000..bea072f264fb
--- /dev/null
+++ b/docs/python_docs/themes/mx-theme/mxtheme/search.html
@@ -0,0 +1,56 @@
+{%- extends "layout.html" %}
+{% set title = _('Search') %}
+{% block extrahead %}
+
+
+ {# this is used when loading the search index using $.ajax fails,
+ such as on Chrome for documents on localhost #}
+
+ {{ super() }}
+{% endblock %}
+{% block body %}
+
{{ _('Search') }}
+
+
+
+ {% trans %}Please activate JavaScript to enable the search
+ functionality.{% endtrans %}
+
+
+
+ {% trans %}From here you can search these documents. Enter your search
+ words into the box below and click "search". Note that the search
+ function will automatically search for all of the words. Pages
+ containing fewer words won't appear in the result list.{% endtrans %}
+
+
+ {% if search_performed %}
+
{{ _('Search Results') }}
+ {% if not search_results %}
+
{{ _('Your search did not match any documents. Please make sure that all words are spelled correctly and that you\'ve selected enough categories.') }}
+ {% endif %}
+ {% endif %}
+
+ {% if search_results %}
+
+ {% for href, caption, context in search_results %}
+
+ {% markdown %}{% include /get_started/devices/raspberry_pi.md %}{% endmarkdown %}
+
+
+
+
+ {% markdown %}{% include /get_started/devices/nvidia-jetson.md %}{% endmarkdown %}
+
+
+
+
+
diff --git a/docs/static_site/src/_includes/get_started/gpu_snippet.md b/docs/static_site/src/_includes/get_started/gpu_snippet.md
new file mode 100644
index 000000000000..1154507db2e8
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/gpu_snippet.md
@@ -0,0 +1,11 @@
+CUDA should be installed first.
+
+**Important:** Make sure your installed CUDA version matches the CUDA version in the pip package.
+
+Check your CUDA version with the following command:
+
+{% highlight bash %}
+nvcc --version
+{% endhighlight %}
+
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/clojure/cpu.md b/docs/static_site/src/_includes/get_started/linux/clojure/cpu.md
new file mode 100644
index 000000000000..c16eb9ada34e
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/clojure/cpu.md
@@ -0,0 +1,15 @@
+You can use the Maven packages defined in the following dependency to include MXNet in your Clojure
+project. The Clojure package is built on top of the existing Scala package. Please
+refer to the [MXNet-Scala setup guide]({{'/get_started/scala_setup'|relative_url}}) for a detailed set of instructions
+to help you with the setup process that is required to use the Clojure dependency.
+
+
+
+{% highlight html %}
+<dependency>
+    <groupId>org.apache.mxnet.contrib.clojure</groupId>
+    <artifactId>clojure-mxnet-linux-cpu</artifactId>
+</dependency>
+{% endhighlight %}
diff --git a/docs/static_site/src/_includes/get_started/linux/clojure/gpu.md b/docs/static_site/src/_includes/get_started/linux/clojure/gpu.md
new file mode 100644
index 000000000000..26293f6e077f
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/clojure/gpu.md
@@ -0,0 +1,15 @@
+You can use the Maven packages defined in the following dependency to include MXNet in your Clojure
+project. The Clojure package is built on top of the existing Scala package. Please
+refer to the [MXNet-Scala setup guide]({{'/get_started/scala_setup'|relative_url}}) for a detailed set of instructions
+to help you with the setup process that is required to use the Clojure dependency.
+
+
+
+{% highlight html %}
+<dependency>
+    <groupId>org.apache.mxnet.contrib.clojure</groupId>
+    <artifactId>clojure-mxnet-linux-gpu</artifactId>
+</dependency>
+{% endhighlight %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/cpp/cpp.md b/docs/static_site/src/_includes/get_started/linux/cpp/cpp.md
new file mode 100644
index 000000000000..ac664185a61a
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/cpp/cpp.md
@@ -0,0 +1,4 @@
+To enable the C++ package, build from source using `make USE_CPP_PACKAGE=1`.
+Refer to the [MXNet C++ setup guide](c_plus_plus)
+for full instructions.
+
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/java/cpu.md b/docs/static_site/src/_includes/get_started/linux/java/cpu.md
new file mode 100644
index 000000000000..2140a34af7ad
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/java/cpu.md
@@ -0,0 +1,17 @@
+You can use the Maven packages defined in the following dependency to include MXNet in your Java
+project. The Java API is provided as a subset of the Scala API and is intended for inference only.
+Please refer to the MXNet-Java setup guide for a detailed set of
+instructions to help you with the setup process.
+
+
+
+
+
+{% highlight html %}
+<dependency>
+    <groupId>org.apache.mxnet</groupId>
+    <artifactId>mxnet-full_2.11-linux-x86_64-cpu</artifactId>
+    <version>[1.5.0, )</version>
+</dependency>
+{% endhighlight %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/java/gpu.md b/docs/static_site/src/_includes/get_started/linux/java/gpu.md
new file mode 100644
index 000000000000..cbab5e60ca47
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/java/gpu.md
@@ -0,0 +1,17 @@
+You can use the Maven packages defined in the following dependency to include MXNet in your Java
+project. The Java API is provided as a subset of the Scala API and is intended for inference only.
+Please refer to the [MXNet-Java setup guide]({{'/get_started/java_setup'|relative_url}}) for a detailed set of
+instructions to help you with the setup process.
+
+
+
+
+
+{% highlight html %}
+<dependency>
+    <groupId>org.apache.mxnet</groupId>
+    <artifactId>mxnet-full_2.11-linux-x86_64-gpu</artifactId>
+    <version>[1.5.0, )</version>
+</dependency>
+{% endhighlight %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md b/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md
new file mode 100644
index 000000000000..4b39ac633e0d
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md
@@ -0,0 +1,2 @@
+Refer to the [Julia section of the MXNet Ubuntu installation guide](ubuntu_setup#install-the-mxnet-package-for-julia).
+
diff --git a/docs/static_site/src/_includes/get_started/linux/julia/pkg.md b/docs/static_site/src/_includes/get_started/linux/julia/pkg.md
new file mode 100644
index 000000000000..35971305ee2d
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/julia/pkg.md
@@ -0,0 +1,10 @@
+Install a pinned version of MXNet like this:
+
+{% highlight julia %}
+]add MXNet#v1.5.0
+{% endhighlight %}
+
+Or directly install the latest release:
+{% highlight julia %}
+]add MXNet
+{% endhighlight %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/perl/perl.md b/docs/static_site/src/_includes/get_started/linux/perl/perl.md
new file mode 100644
index 000000000000..d9fc8d5328bc
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/perl/perl.md
@@ -0,0 +1 @@
+Refer to the [Perl section of the MXNet Ubuntu installation guide](ubuntu_setup.html#install-the-mxnet-package-for-perl).
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/python/cpu/build-from-source.md b/docs/static_site/src/_includes/get_started/linux/python/cpu/build-from-source.md
new file mode 100644
index 000000000000..4adf039739d5
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/python/cpu/build-from-source.md
@@ -0,0 +1 @@
+To build from source, refer to the [MXNet Ubuntu installation guide]({{'/get_started/ubuntu_setup' | relative_url}}).
diff --git a/docs/static_site/src/_includes/get_started/linux/python/cpu/docker.md b/docs/static_site/src/_includes/get_started/linux/python/cpu/docker.md
new file mode 100644
index 000000000000..1980a0999f92
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/python/cpu/docker.md
@@ -0,0 +1,43 @@
+Docker images with *MXNet* are available at [DockerHub](https://hub.docker.com/r/mxnet/).
+
+**Step 1** Install Docker on your machine by following the [docker installation
+instructions](https://docs.docker.com/engine/installation/linux/ubuntu/#install-using-the-repository).
+
+*Note* - You can install Community Edition (CE) to get started with *MXNet*.
+
+**Step 2** [Optional] Post installation steps to manage Docker as a non-root user.
+
+Follow the four steps in this [docker
+documentation](https://docs.docker.com/engine/installation/linux/linux-postinstall/#manage-docker-as-a-non-root-user)
+to allow managing docker containers without *sudo*.
+
+If you skip this step, you need to use *sudo* each time you invoke Docker.
+
+**Step 3** Pull the MXNet docker image.
+
+{% highlight bash %}
+$ docker pull mxnet/python # Use sudo if you skip Step 2
+{% endhighlight %}
+
+You can list docker images to see if the mxnet/python docker image pull was successful.
+
+{% highlight bash %}
+$ docker images # Use sudo if you skip Step 2
+
+REPOSITORY TAG IMAGE ID CREATED SIZE
+mxnet/python latest 00d026968b3c 3 weeks ago 1.41 GB
+{% endhighlight %}
+
+Using the latest MXNet with [Intel MKL-DNN](https://github.com/intel/mkl-dnn) is
+recommended for the
+fastest inference speeds with MXNet.
+
+{% highlight bash %}
+$ docker pull mxnet/python:1.3.0_cpu_mkl # Use sudo if you skip Step 2
+$ docker images # Use sudo if you skip Step 2
+
+REPOSITORY TAG IMAGE ID CREATED SIZE
+mxnet/python 1.3.0_cpu_mkl deaf9bf61d29 4 days ago 678 MB
+{% endhighlight %}
+
+**Step 4** Validate the installation.
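+
+As a quick check (a minimal sketch; it assumes you start a Python shell inside the container, for example with `docker run -it mxnet/python python`), run a small NDArray computation:
+
+{% highlight python %}
+import mxnet as mx
+
+# Create a small array and do a trivial computation on it.
+a = mx.nd.ones((2, 3))
+b = a * 2 + 1
+
+# If MXNet is installed correctly, this prints a 2x3 array of 3.0 values.
+print(b.asnumpy())
+{% endhighlight %}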
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/python/cpu/pip.md b/docs/static_site/src/_includes/get_started/linux/python/cpu/pip.md
new file mode 100644
index 000000000000..b05480e1c967
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/python/cpu/pip.md
@@ -0,0 +1,121 @@
+Run the following command:
+
+
+{% highlight bash %}
+$ pip install mxnet
+{% endhighlight %}
+
+MKL-DNN enabled pip packages are optimized for Intel hardware. You can find
+performance numbers
+in the MXNet tuning guide.
+
+{% highlight bash %}
+$ pip install mxnet-mkl
+{% endhighlight %}
+
+
+
+
+
+{% highlight bash %}
+$ pip install mxnet==1.4.1
+{% endhighlight %}
+
+MKL-DNN enabled pip packages are optimized for Intel hardware. You can find
+performance numbers
+in the MXNet tuning guide.
+
+{% highlight bash %}
+$ pip install mxnet-mkl==1.4.1
+{% endhighlight %}
+
+
+
+
+{% highlight bash %}
+$ pip install mxnet==1.3.1
+{% endhighlight %}
+
+MKL-DNN enabled pip packages are optimized for Intel hardware. You can find
+performance numbers
+in the MXNet tuning guide.
+
+{% highlight bash %}
+$ pip install mxnet-mkl==1.3.1
+{% endhighlight %}
+
+
+
+
+{% highlight bash %}
+$ pip install mxnet==1.2.1
+{% endhighlight %}
+
+MKL-DNN enabled pip packages are optimized for Intel hardware. You can find
+performance numbers
+in the MXNet tuning guide.
+
+{% highlight bash %}
+$ pip install mxnet-mkl==1.2.1
+{% endhighlight %}
+
+
+
+{% highlight bash %}
+$ pip install mxnet --pre
+{% endhighlight %}
+
+MKL-DNN enabled pip packages are optimized for Intel hardware. You can find
+performance numbers
+in the MXNet tuning guide.
+
+{% highlight bash %}
+$ pip install mxnet-mkl --pre
+{% endhighlight %}
+
+
+
+
+
+{% include /get_started/pip_snippet.md %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/python/gpu/build-from-source.md b/docs/static_site/src/_includes/get_started/linux/python/gpu/build-from-source.md
new file mode 100644
index 000000000000..4adf039739d5
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/python/gpu/build-from-source.md
@@ -0,0 +1 @@
+To build from source, refer to the [MXNet Ubuntu installation guide]({{'/get_started/ubuntu_setup' | relative_url}}).
diff --git a/docs/static_site/src/_includes/get_started/linux/python/gpu/docker.md b/docs/static_site/src/_includes/get_started/linux/python/gpu/docker.md
new file mode 100644
index 000000000000..d4d8cf295458
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/python/gpu/docker.md
@@ -0,0 +1,48 @@
+Docker images with *MXNet* are available at [DockerHub](https://hub.docker.com/r/mxnet/).
+
+**Step 1** Install Docker on your machine by following the [docker installation
+instructions](https://docs.docker.com/engine/installation/linux/ubuntu/#install-using-the-repository).
+
+*Note* - You can install Community Edition (CE) to get started with *MXNet*.
+
+**Step 2** [Optional] Post installation steps to manage Docker as a non-root user.
+
+Follow the four steps in this [docker
+documentation](https://docs.docker.com/engine/installation/linux/linux-postinstall/#manage-docker-as-a-non-root-user)
+to allow managing docker containers without *sudo*.
+
+If you skip this step, you need to use *sudo* each time you invoke Docker.
+
+**Step 3** Install *nvidia-docker-plugin* following the [installation
+instructions](https://github.com/NVIDIA/nvidia-docker/wiki). *nvidia-docker-plugin*
+is required to
+enable the usage of GPUs from the docker containers.
+
+**Step 4** Pull the MXNet docker image.
+
+{% highlight bash %}
+$ docker pull mxnet/python:gpu # Use sudo if you skip Step 2
+{% endhighlight %}
+
+You can list docker images to see if the mxnet/python docker image pull was successful.
+
+{% highlight bash %}
+$ docker images # Use sudo if you skip Step 2
+
+REPOSITORY TAG IMAGE ID CREATED SIZE
+mxnet/python gpu 493b2683c269 3 weeks ago 4.77 GB
+{% endhighlight %}
+
+Using the latest MXNet with [Intel MKL-DNN](https://github.com/intel/mkl-dnn) is
+recommended for the
+fastest inference speeds with MXNet.
+
+{% highlight bash %}
+$ docker pull mxnet/python:1.3.0_gpu_cu92_mkl # Use sudo if you skip Step 2
+$ docker images # Use sudo if you skip Step 2
+
+REPOSITORY TAG IMAGE ID CREATED SIZE
+mxnet/python 1.3.0_gpu_cu92_mkl adcb3ab19f50 4 days ago 4.23 GB
+{% endhighlight %}
+
+**Step 5** Validate the installation.
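+
+As a quick check of GPU support (a minimal sketch; it assumes the container was started with GPU access enabled, for example through nvidia-docker), run a small computation on the GPU context:
+
+{% highlight python %}
+import mxnet as mx
+
+# Allocate an array directly on the first GPU and compute on it.
+a = mx.nd.ones((2, 3), ctx=mx.gpu(0))
+b = a * 2 + 1
+
+# If the GPU build and the driver setup are correct, this prints a 2x3 array of 3.0 values.
+print(b.asnumpy())
+{% endhighlight %}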
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/python/gpu/pip.md b/docs/static_site/src/_includes/get_started/linux/python/gpu/pip.md
new file mode 100644
index 000000000000..249cd5b54052
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/python/gpu/pip.md
@@ -0,0 +1,74 @@
+Run the following command:
+
+
+
+
+
+{% include /get_started/pip_snippet.md %}
+{% include /get_started/gpu_snippet.md %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/r/cpu.md b/docs/static_site/src/_includes/get_started/linux/r/cpu.md
new file mode 100644
index 000000000000..0ccbd9f66efa
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/r/cpu.md
@@ -0,0 +1,9 @@
+The default version of R that is installed with `apt-get` is insufficient. You will need
+to first [install R v3.4.4+ and build MXNet from source](ubuntu_setup.html#install-the-mxnet-package-for-r).
+
+After you have set up R v3.4.4+ and MXNet, you can build and install the MXNet R bindings with the following commands, assuming that `incubator-mxnet` is the source directory you used to build MXNet:
+
+{% highlight bash %}
+$ cd incubator-mxnet
+$ make rpkg
+{% endhighlight %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/r/gpu.md b/docs/static_site/src/_includes/get_started/linux/r/gpu.md
new file mode 100644
index 000000000000..3e26d319e4f0
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/r/gpu.md
@@ -0,0 +1,16 @@
+The default version of R that is installed with `apt-get` is insufficient. You will need
+to first
+[install R v3.4.4+ and build MXNet from
+source](ubuntu_setup.html#install-the-mxnet-package-for-r).
+
+After you have set up R v3.4.4+ and MXNet, you can build and install the MXNet R bindings
+with the following commands, assuming that `incubator-mxnet` is the source directory you
+used to build MXNet:
+
+{% highlight bash %}
+$ cd incubator-mxnet
+$ make rpkg
+{% endhighlight %}
+
+{% include /get_started/gpu_snippet.md %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/scala/cpu.md b/docs/static_site/src/_includes/get_started/linux/scala/cpu.md
new file mode 100644
index 000000000000..3cc96bade7df
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/scala/cpu.md
@@ -0,0 +1,14 @@
+You can use the Maven packages defined in the following dependency to include MXNet in your Scala
+project. Please refer to the [MXNet-Scala setup guide]({{'/get_started/scala_setup'|relative_url}}) for
+a detailed set of instructions to help you with the setup process.
+
+
+
+{% highlight html %}
+<dependency>
+    <groupId>org.apache.mxnet</groupId>
+    <artifactId>mxnet-full_2.11-linux-x86_64-cpu</artifactId>
+</dependency>
+{% endhighlight %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/linux/scala/gpu.md b/docs/static_site/src/_includes/get_started/linux/scala/gpu.md
new file mode 100644
index 000000000000..9f458112a0ae
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/linux/scala/gpu.md
@@ -0,0 +1,16 @@
+You can use the Maven packages defined in the following dependency to include MXNet in
+your Scala project. Please refer to the [MXNet-Scala setup guide]({{'/get_started/scala_setup'|relative_url}}) for
+a detailed set of instructions to help you with the setup process.
+
+
+
+{% highlight html %}
+<dependency>
+    <groupId>org.apache.mxnet</groupId>
+    <artifactId>mxnet-full_2.11-linux-x86_64-gpu</artifactId>
+</dependency>
+{% endhighlight %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/clojure/cpu.md b/docs/static_site/src/_includes/get_started/macos/clojure/cpu.md
new file mode 100644
index 000000000000..1e27af5e5dd7
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/clojure/cpu.md
@@ -0,0 +1,17 @@
+
+You can use the Maven packages defined in the following dependency to include MXNet in
+your Clojure project. To maximize code reuse, the Clojure package is built on top of the existing Scala
+package. Please refer to the [MXNet-Scala setup guide](scala_setup.html) for a detailed set
+of instructions to help you with the setup process that is required to use the Clojure dependency.
+
+
+
+{% highlight html %}
+<dependency>
+    <groupId>org.apache.mxnet.contrib.clojure</groupId>
+    <artifactId>clojure-mxnet-osx-cpu</artifactId>
+</dependency>
+{% endhighlight %}
+
diff --git a/docs/static_site/src/_includes/get_started/macos/clojure/gpu.md b/docs/static_site/src/_includes/get_started/macos/clojure/gpu.md
new file mode 100644
index 000000000000..ccbc24db96e7
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/clojure/gpu.md
@@ -0,0 +1 @@
+Not available at this time.
diff --git a/docs/static_site/src/_includes/get_started/macos/cpp/cpp.md b/docs/static_site/src/_includes/get_started/macos/cpp/cpp.md
new file mode 100644
index 000000000000..b2e632b0f117
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/cpp/cpp.md
@@ -0,0 +1,3 @@
+To enable the C++ package, build from source using `make USE_CPP_PACKAGE=1`.
+Refer to the [MXNet C++ setup guide](c_plus_plus) for full instructions.
+
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/java/cpu.md b/docs/static_site/src/_includes/get_started/macos/java/cpu.md
new file mode 100644
index 000000000000..e8c320e9462d
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/java/cpu.md
@@ -0,0 +1,16 @@
+You can use the Maven packages defined in the following dependency to include MXNet in
+your Java project. The Java API is provided as a subset of the Scala API and is intended for
+inference only.
+Please refer to the [MXNet-Java setup guide](java_setup.html) for a detailed set of instructions to help you with the setup process.
+
+
+
+{% highlight html %}
+<dependency>
+    <groupId>org.apache.mxnet</groupId>
+    <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
+    <version>[1.5.0, )</version>
+</dependency>
+{% endhighlight %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/java/gpu.md b/docs/static_site/src/_includes/get_started/macos/java/gpu.md
new file mode 100644
index 000000000000..b17ef33478ea
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/java/gpu.md
@@ -0,0 +1 @@
+Not available at this time.
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/julia/build-from-source.md b/docs/static_site/src/_includes/get_started/macos/julia/build-from-source.md
new file mode 100644
index 000000000000..85045cacf8f6
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/julia/build-from-source.md
@@ -0,0 +1 @@
+Refer to the [Julia section of the MXNet macOS installation guide](osx_setup.html#install-the-mxnet-package-for-julia).
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/julia/pkg.md b/docs/static_site/src/_includes/get_started/macos/julia/pkg.md
new file mode 100644
index 000000000000..35971305ee2d
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/julia/pkg.md
@@ -0,0 +1,10 @@
+Install a pinned version of MXNet like this:
+
+{% highlight julia %}
+]add MXNet#v1.5.0
+{% endhighlight %}
+
+Or directly install the latest release:
+{% highlight julia %}
+]add MXNet
+{% endhighlight %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/perl/perl.md b/docs/static_site/src/_includes/get_started/macos/perl/perl.md
new file mode 100644
index 000000000000..c9b04e328edc
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/perl/perl.md
@@ -0,0 +1 @@
+Refer to the [Perl section of installation guide](osx_setup.html#install-the-mxnet-package-for-perl).
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/python/cpu/build-from-source.md b/docs/static_site/src/_includes/get_started/macos/python/cpu/build-from-source.md
new file mode 100644
index 000000000000..b8b5d2119bd0
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/python/cpu/build-from-source.md
@@ -0,0 +1,2 @@
+To build from source, refer to the [MXNet macOS installation guide](osx_setup.html).
+MXNet developers should refer to the MXNet wiki's [Developer Setup on Mac](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Developer+Setup+on+Mac).
diff --git a/docs/static_site/src/_includes/get_started/macos/python/cpu/docker.md b/docs/static_site/src/_includes/get_started/macos/python/cpu/docker.md
new file mode 100644
index 000000000000..2f0f04ae432c
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/python/cpu/docker.md
@@ -0,0 +1,35 @@
+Docker images with *MXNet* are available at [DockerHub](https://hub.docker.com/r/mxnet/).
+
+**Step 1** Install Docker on your machine by following the [docker installation
+instructions](https://docs.docker.com/docker-for-mac/install/#install-and-run-docker-for-mac).
+
+*Note* - You can install Community Edition (CE) to get started with *MXNet*.
+
+**Step 2** Pull the MXNet docker image.
+
+{% highlight bash %}
+$ docker pull mxnet/python
+{% endhighlight %}
+
+You can list docker images to see if the mxnet/python docker image pull was successful.
+
+{% highlight bash %}
+$ docker images
+
+REPOSITORY TAG IMAGE ID CREATED SIZE
+mxnet/python latest 00d026968b3c 3 weeks ago 1.41 GB
+{% endhighlight %}
+
+Using the latest MXNet with [Intel MKL-DNN](https://github.com/intel/mkl-dnn) is
+recommended for the
+fastest inference speeds with MXNet.
+
+{% highlight bash %}
+$ docker pull mxnet/python:1.3.0_cpu_mkl
+$ docker images
+
+REPOSITORY TAG IMAGE ID CREATED SIZE
+mxnet/python 1.3.0_cpu_mkl deaf9bf61d29 4 days ago 678 MB
+{% endhighlight %}
+
+**Step 3** Validate the installation.
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/python/cpu/pip.md b/docs/static_site/src/_includes/get_started/macos/python/cpu/pip.md
new file mode 100644
index 000000000000..beb5eb4fb797
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/python/cpu/pip.md
@@ -0,0 +1,73 @@
+Run the following command:
+
+
+
+{% include /get_started/pip_snippet.md %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/python/gpu/build-from-source.md b/docs/static_site/src/_includes/get_started/macos/python/gpu/build-from-source.md
new file mode 100644
index 000000000000..7cb61a59c8bf
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/python/gpu/build-from-source.md
@@ -0,0 +1,2 @@
+Refer to the [MXNet macOS installation guide](osx_setup.html).
+MXNet developers should refer to the MXNet wiki's [Developer Setup on Mac](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Developer+Setup+on+Mac).
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/python/gpu/pip_docker.md b/docs/static_site/src/_includes/get_started/macos/python/gpu/pip_docker.md
new file mode 100644
index 000000000000..b4585922e8e0
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/python/gpu/pip_docker.md
@@ -0,0 +1 @@
+This option is only available by building from source. Refer to the [MXNet macOS installation guide](osx_setup.html).
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/r/cpu.md b/docs/static_site/src/_includes/get_started/macos/r/cpu.md
new file mode 100644
index 000000000000..beca0d414730
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/r/cpu.md
@@ -0,0 +1,28 @@
+To run MXNet you should also have OpenCV and OpenBLAS installed. You can install them with `brew` as follows:
+
+{% highlight bash %}
+brew install opencv
+brew install openblas
+{% endhighlight %}
+
+To ensure the MXNet R package runs with the installed version of OpenBLAS, create a symbolic link as follows:
+
+{% highlight bash %}
+ln -sf /usr/local/opt/openblas/lib/libopenblas.dylib /usr/local/opt/openblas/lib/libopenblasp-r0.3.1.dylib
+{% endhighlight %}
+
+Note: packages for R 3.6.x are not yet available.
+
+Install R 3.5.x from [CRAN](https://cran.r-project.org/bin/macosx/). The latest is
+[v3.5.3](https://cran.r-project.org/bin/macosx/R-3.5.3.pkg).
+
+You can [build MXNet-R from source](osx_setup.html#install-the-mxnet-package-for-r), or
+you can use a pre-built binary:
+
+{% highlight r %}
+cran <- getOption("repos")
+cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/"
+options(repos = cran)
+install.packages("mxnet")
+{% endhighlight %}
diff --git a/docs/static_site/src/_includes/get_started/macos/r/gpu.md b/docs/static_site/src/_includes/get_started/macos/r/gpu.md
new file mode 100644
index 000000000000..3fc556a8dd91
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/r/gpu.md
@@ -0,0 +1 @@
+Be the first one to contribute this guide!
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/scala/cpu.md b/docs/static_site/src/_includes/get_started/macos/scala/cpu.md
new file mode 100644
index 000000000000..623a8a56fb4d
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/scala/cpu.md
@@ -0,0 +1,14 @@
+You can use the Maven packages defined in the following dependency to include MXNet in your Scala
+project. Please refer to the [MXNet-Scala setup guide](scala_setup.html) for a detailed set
+of instructions to help you with the setup process.
+
+
+
+{% highlight html %}
+<dependency>
+    <groupId>org.apache.mxnet</groupId>
+    <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
+</dependency>
+{% endhighlight %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/macos/scala/gpu.md b/docs/static_site/src/_includes/get_started/macos/scala/gpu.md
new file mode 100644
index 000000000000..b17ef33478ea
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/macos/scala/gpu.md
@@ -0,0 +1 @@
+Not available at this time.
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/pip_snippet.md b/docs/static_site/src/_includes/get_started/pip_snippet.md
new file mode 100644
index 000000000000..e67c3331c91c
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/pip_snippet.md
@@ -0,0 +1,14 @@
+MXNet offers MKL pip packages that will be much faster when running on Intel hardware.
+Check the chart below for other options, refer to PyPI for
+other MXNet pip packages, or validate your MXNet installation.
+
+
+
+
+
+**NOTES:**
+
+*mxnet-cu101mkl* means the package is built with CUDA/cuDNN and MKL-DNN enabled and the CUDA version is 10.1.
+
+All MKL pip packages are experimental prior to version 1.3.0.
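+
+As a quick check of which build you ended up with (a minimal sketch; it assumes MXNet 1.5.0 or later, where the `mxnet.runtime` feature-detection API is available), print the version and query the compiled-in features:
+
+{% highlight python %}
+import mxnet as mx
+from mxnet.runtime import Features
+
+print(mx.__version__)
+
+# Check whether this build was compiled with MKL-DNN support.
+print(Features().is_enabled("MKLDNN"))
+{% endhighlight %}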
diff --git a/docs/static_site/src/_includes/get_started/windows/clojure/clojure.md b/docs/static_site/src/_includes/get_started/windows/clojure/clojure.md
new file mode 100644
index 000000000000..0b25ab9018d3
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/clojure/clojure.md
@@ -0,0 +1 @@
+MXNet-Clojure for Windows is not yet available.
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/windows/cpp/cpp.md b/docs/static_site/src/_includes/get_started/windows/cpp/cpp.md
new file mode 100644
index 000000000000..b2e632b0f117
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/cpp/cpp.md
@@ -0,0 +1,3 @@
+To enable the C++ package, build from source using `make USE_CPP_PACKAGE=1`.
+Refer to the [MXNet C++ setup guide](c_plus_plus) for full instructions.
+
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/windows/java/java.md b/docs/static_site/src/_includes/get_started/windows/java/java.md
new file mode 100644
index 000000000000..0db1f50590a2
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/java/java.md
@@ -0,0 +1 @@
+MXNet-Java for Windows is not yet available.
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/windows/julia/build-from-source.md b/docs/static_site/src/_includes/get_started/windows/julia/build-from-source.md
new file mode 100644
index 000000000000..cdc185416329
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/julia/build-from-source.md
@@ -0,0 +1 @@
+Refer to the [Julia section of the MXNet Windows installation guide](windows_setup.html#install-the-mxnet-package-for-julia).
diff --git a/docs/static_site/src/_includes/get_started/windows/julia/pkg.md b/docs/static_site/src/_includes/get_started/windows/julia/pkg.md
new file mode 100644
index 000000000000..cb79177e5bbe
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/julia/pkg.md
@@ -0,0 +1,10 @@
+Install a pinned version of MXNet like this:
+
+{% highlight julia %}
+]add MXNet#v1.5.0
+{% endhighlight %}
+
+Or directly install the latest release:
+{% highlight julia %}
+]add MXNet
+{% endhighlight %}
diff --git a/docs/static_site/src/_includes/get_started/windows/perl/perl.md b/docs/static_site/src/_includes/get_started/windows/perl/perl.md
new file mode 100644
index 000000000000..2655f7281486
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/perl/perl.md
@@ -0,0 +1 @@
+Refer to the [Perl section of the MXNet Windows installation guide](windows_setup.html#install-the-mxnet-package-for-perl).
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/windows/python/cpu/build-from-source.md b/docs/static_site/src/_includes/get_started/windows/python/cpu/build-from-source.md
new file mode 100644
index 000000000000..a527f51d919d
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/python/cpu/build-from-source.md
@@ -0,0 +1 @@
+Refer to the [MXNet Windows installation guide](windows_setup.html)
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/windows/python/cpu/docker.md b/docs/static_site/src/_includes/get_started/windows/python/cpu/docker.md
new file mode 100644
index 000000000000..a3df5a92aa99
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/python/cpu/docker.md
@@ -0,0 +1,34 @@
+Docker images with *MXNet* are available at [Docker Hub](https://hub.docker.com/r/mxnet/).
+
+**Step 1** Install Docker on your machine by following the [docker installation instructions](https://docs.docker.com/docker-for-windows/install/).
+
+*Note* - You can install Community Edition (CE) to get started with *MXNet*.
+
+**Step 2** Pull the MXNet docker image.
+
+{% highlight bash %}
+$ docker pull mxnet/python
+{% endhighlight %}
+
+You can list docker images to see if the mxnet/python docker image pull was successful.
+
+{% highlight bash %}
+$ docker images
+
+REPOSITORY TAG IMAGE ID CREATED SIZE
+mxnet/python latest 00d026968b3c 3 weeks ago 1.41 GB
+{% endhighlight %}
+
+Using the latest MXNet with [Intel MKL-DNN](https://github.com/intel/mkl-dnn) is
+recommended for the
+fastest inference speeds with MXNet.
+
+{% highlight bash %}
+$ docker pull mxnet/python:1.3.0_cpu_mkl
+$ docker images
+
+REPOSITORY TAG IMAGE ID CREATED SIZE
+mxnet/python 1.3.0_cpu_mkl deaf9bf61d29 4 days ago 678 MB
+{% endhighlight %}
+
+**Step 3** Validate the installation.
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/windows/python/cpu/pip.md b/docs/static_site/src/_includes/get_started/windows/python/cpu/pip.md
new file mode 100644
index 000000000000..d5c7f1fd08f0
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/python/cpu/pip.md
@@ -0,0 +1,73 @@
+Run the following command:
+
+
+
+{% include /get_started/pip_snippet.md %}
+{% include /get_started/gpu_snippet.md %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/windows/python/gpu/build-from-source.md b/docs/static_site/src/_includes/get_started/windows/python/gpu/build-from-source.md
new file mode 100644
index 000000000000..27e90fad4f5e
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/python/gpu/build-from-source.md
@@ -0,0 +1 @@
+To build from source, refer to the [MXNet Windows installation guide](windows_setup.html).
diff --git a/docs/static_site/src/_includes/get_started/windows/python/gpu/pip.md b/docs/static_site/src/_includes/get_started/windows/python/gpu/pip.md
new file mode 100644
index 000000000000..cbcd9d44d6af
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/python/gpu/pip.md
@@ -0,0 +1,74 @@
+Run the following command:
+
+
+
+
+{% include /get_started/pip_snippet.md %}
+{% include /get_started/gpu_snippet.md %}
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/windows/r/cpu.md b/docs/static_site/src/_includes/get_started/windows/r/cpu.md
new file mode 100644
index 000000000000..8b7151f9b94a
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/r/cpu.md
@@ -0,0 +1,15 @@
+Note: packages for R 3.6.x are not yet available.
+Install R 3.5.x from [CRAN](https://cran.r-project.org/bin/windows/base/old/).
+
+You can [build MXNet-R from source](windows_setup.html#install-mxnet-package-for-r), or
+you can use a
+pre-built binary:
+
+{% highlight r %}
+cran <- getOption("repos")
+cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/"
+options(repos = cran)
+install.packages("mxnet")
+{% endhighlight %}
+
+To run MXNet you should also have OpenCV and OpenBLAS installed.
diff --git a/docs/static_site/src/_includes/get_started/windows/r/gpu.md b/docs/static_site/src/_includes/get_started/windows/r/gpu.md
new file mode 100644
index 000000000000..b08a56986cca
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/r/gpu.md
@@ -0,0 +1,16 @@
+You can [build MXNet-R from source](windows_setup.html#install-mxnet-package-for-r), or
+you can use a
+pre-built binary:
+
+{% highlight r %}
+cran <- getOption("repos")
+cran["dmlc"] <-
+"https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU/cu92"
+options(repos = cran)
+install.packages("mxnet")
+{% endhighlight %}
+
+Change cu92 to cu90, cu91, or cuda100 to match your CUDA toolkit version; these are the CUDA versions MXNet currently supports.
+Note: You also need cuDNN installed on Windows. Check out this
+[guide](https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installwindows)
+for the installation steps.
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/windows/scala/scala.md b/docs/static_site/src/_includes/get_started/windows/scala/scala.md
new file mode 100644
index 000000000000..74b7d45c6d79
--- /dev/null
+++ b/docs/static_site/src/_includes/get_started/windows/scala/scala.md
@@ -0,0 +1 @@
+MXNet-Scala for Windows is not yet available.
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/google-analytics.html b/docs/static_site/src/_includes/google-analytics.html
new file mode 100644
index 000000000000..e9afb2093b55
--- /dev/null
+++ b/docs/static_site/src/_includes/google-analytics.html
@@ -0,0 +1,12 @@
+
+
diff --git a/docs/static_site/src/_includes/head.html b/docs/static_site/src/_includes/head.html
new file mode 100644
index 000000000000..9a565c756c07
--- /dev/null
+++ b/docs/static_site/src/_includes/head.html
@@ -0,0 +1,18 @@
+
+
+
+
+
+ {%- seo -%}
+
+
+ {%- feed_meta -%}
+ {%- if jekyll.environment == 'production' and site.google_analytics -%}
+ {%- include google-analytics.html -%}
+ {%- endif -%}
+
+
+
+
+
+
diff --git a/docs/static_site/src/_includes/header.html b/docs/static_site/src/_includes/header.html
new file mode 100644
index 000000000000..314506476985
--- /dev/null
+++ b/docs/static_site/src/_includes/header.html
@@ -0,0 +1,49 @@
+
+
+
+
+
+
+
+
diff --git a/docs/static_site/src/_includes/icon-github.html b/docs/static_site/src/_includes/icon-github.html
new file mode 100644
index 000000000000..e501a16b1878
--- /dev/null
+++ b/docs/static_site/src/_includes/icon-github.html
@@ -0,0 +1 @@
+{% include icon-github.svg %}{{ include.username }}
diff --git a/docs/static_site/src/_includes/icon-github.svg b/docs/static_site/src/_includes/icon-github.svg
new file mode 100644
index 000000000000..e6c5f6dfd6c4
--- /dev/null
+++ b/docs/static_site/src/_includes/icon-github.svg
@@ -0,0 +1 @@
+
diff --git a/docs/static_site/src/_includes/icon-twitter.html b/docs/static_site/src/_includes/icon-twitter.html
new file mode 100644
index 000000000000..e623dbd6efc5
--- /dev/null
+++ b/docs/static_site/src/_includes/icon-twitter.html
@@ -0,0 +1 @@
+{% include icon-twitter.svg %}{{ include.username }}
diff --git a/docs/static_site/src/_includes/icon-twitter.svg b/docs/static_site/src/_includes/icon-twitter.svg
new file mode 100644
index 000000000000..efc0ecf6781d
--- /dev/null
+++ b/docs/static_site/src/_includes/icon-twitter.svg
@@ -0,0 +1 @@
+
diff --git a/docs/static_site/src/_includes/social.html b/docs/static_site/src/_includes/social.html
new file mode 100644
index 000000000000..474404ec8cdd
--- /dev/null
+++ b/docs/static_site/src/_includes/social.html
@@ -0,0 +1,14 @@
+
MXNet provides a comprehensive and flexible Python API to serve a broad community of developers with different levels of experience and wide ranging requirements. Current efforts are focused on the
+ Gluon API. Gluon provides a clear, concise, and simple API for deep learning. It makes it easy to prototype, build, and train deep learning models without sacrificing training speed.
While most of the usability improvements around training are focused on the Python API, the performance of MXNet is accessible through a variety of different language bindings; check out their respective APIs and guides below!
+
+
+
+ {%- endif -%}
+{%- endfor -%}
+
Other Bindings
+
+{%- for doc in page.docs -%}
+ {%- if doc.tag != 'python' -%}
+
+ Building a high-performance deep learning library
+ requires many systems-level design decisions.
+ In this design note, we share the rationale
+ for the specific choices made when designing _MXNet_.
+ We imagine that these insights may be useful
+ to both deep learning practitioners
+ and builders of other deep learning systems.
+
+
Deep Learning System Design Concepts
+
+ The following pages address general design concepts for deep learning systems.
+ Mainly, they focus on the following 3 areas:
+ abstraction, optimization, and trade-offs between efficiency and flexibility.
+ Additionally, we provide an overview of the complete MXNet system.
+
+
+ {%- for p in site.pages -%}
+ {%- if p.category == 'architecture' -%}
+
diff --git a/docs/static_site/src/pages/api/architecture/exception_handling.md b/docs/static_site/src/pages/api/architecture/exception_handling.md
new file mode 100644
index 000000000000..b985b27b285e
--- /dev/null
+++ b/docs/static_site/src/pages/api/architecture/exception_handling.md
@@ -0,0 +1,130 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Exception Handling in MXNet
+category: architecture
+permalink: /api/architecture/exception_handling
+---
+
+# Exception Handling in MXNet
+
+This tutorial explains the exception handling support in MXNet,
+and provides examples of how to throw and handle exceptions in a multithreaded context.
+Although the examples are in Python, they can easily be extended to other MXNet
+language bindings.
+
+MXNet exceptions can be thrown from two areas:
+- The MXNet main thread, for example in InferShape and InferType.
+- Spawned threads:
+ * By dependency engine for operator execution in parallel
+ * By the iterators, during the data loading, text parsing phase etc.
+
+In the first case, the exception is thrown and can be handled in the main thread.
+In the second case, the exception is thrown in a spawned thread, caught and transported to the
+main thread, where it is rethrown. This tutorial will give more explanation and examples on how
+to handle exceptions for the second case.
+
+## Prerequisites
+
+To complete this tutorial, we need:
+- MXNet [7b24137](https://github.com/apache/incubator-mxnet/commit/7b24137ed45df605defa4ce72ec91554f6e445f0). See Instructions in [Setup and Installation](http://mxnet.io/install/index.html).
+
+## Exception Handling for Iterators
+
+The below example shows how to handle exceptions for iterators. In this example,
+we populate the data and label files so that there are fewer labels than
+samples. This should throw an exception.
+
+CSVIter uses PrefetcherIter for loading and parsing data.
+The PrefetcherIter spawns a producer thread in the background which prefetches
+the data while the main thread consumes the data. The exception is thrown in the spawned
+producer thread during prefetching, when no label corresponding to a specific sample is found.
+
+The exception is transported to the main thread, where it is rethrown when Next is
+called as part of the following line: `for batch in iter(data_train)`.
+
+In general, the exception may be rethrown as part of the `Next` and `BeforeFirst` calls, which correspond to the `next()` and `reset()` methods of `MXDataIter` in the Python language bindings.
+
+```python
+import os
+import mxnet as mx
+
+cwd = os.getcwd()
+data_path = os.path.join(cwd, "data.csv")
+label_path = os.path.join(cwd, "label.csv")
+
+with open(data_path, "w") as fout:
+ for i in range(8):
+ fout.write("1,2,3,4,5,6,7,8,9,10\n")
+
+with open(label_path, "w") as fout:
+ for i in range(7):
+ fout.write("label"+str(i))
+
+try:
+ data_train = mx.io.CSVIter(data_csv=data_path, label_csv=label_path, data_shape=(1, 10),
+ batch_size=4)
+
+ for batch in iter(data_train):
+ print(data_train.getdata().asnumpy())
+except mx.base.MXNetError as ex:
+ print("Exception handled")
+ print(ex)
+```
+
+### Limitation
+
+There is a race condition when your last `next()` call doesn't reach the batch in your dataset where the exception occurs. The exception may or may not be thrown in this case, depending on which thread wins the race. To avoid this situation, you should try to iterate through your full dataset if you think it can throw exceptions that need to be handled.
+
+
+## Exception Handling for Operators
+
+The below example shows how to handle exceptions for operators in the imperative mode.
+
+For the operator case, the dependency engine spawns a number of threads if it is running in the `ThreadedEnginePool` or `ThreadedEnginePerDevice` mode. The final operator is executed in one of the spawned threads.
+
+If an operator throws an exception during execution, this exception is propagated
+down the dependency chain. Once there is a synchronizing call i.e. WaitToRead for a variable in the dependency chain, the propagated exception is rethrown.
+
+In the below example, we illustrate how an exception that occurs in the operation writing to the variable d is propagated down the dependency chain, and finally is rethrown when we make a synchronizing call to `WaitToRead`.
+
+```python
+import mxnet as mx
+a = mx.nd.random.normal(0, 1, (2, 2))
+b = mx.nd.random.normal(0, 2, (2, 2))
+c = mx.nd.dot(a, b)
+d = mx.nd.random.normal(0, -1, (2, 2))
+e = mx.nd.dot(c, d)
+e.wait_to_read()
+```
+
+Although the above exception occurs when executing the operation which writes to the variable d in one of the child threads, it is thrown only when the synchronization happens as part of the line: `e.wait_to_read()`.
+
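+If you want to handle the error instead of letting it terminate the program, you can wrap the synchronizing call in a `try`/`except` block, just as in the iterator example above (a minimal sketch that reuses `e` from the snippet above):
+
+```python
+try:
+    e.wait_to_read()
+except mx.base.MXNetError as ex:
+    print("Exception handled")
+    print(ex)
+```
+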
+Let us take another example. In the following case, we write to two variables and then wait on both of them (each `asnumpy()` call synchronizes on its variable). This example shows that any particular exception will not be thrown more than once.
+
+```python
+import mxnet as mx
+a = mx.nd.random.normal(0, 1, (2, 2))
+b = mx.nd.random.normal(0, -1, (2, 2))
+c, d = mx.nd.dot(a, b)
+try:
+ c.asnumpy()
+except mx.base.MXNetError as ex:
+ print("Exception handled")
+d.asnumpy()
+```
diff --git a/docs/static_site/src/pages/api/architecture/note_data_loading.md b/docs/static_site/src/pages/api/architecture/note_data_loading.md
new file mode 100644
index 000000000000..63d90024a80c
--- /dev/null
+++ b/docs/static_site/src/pages/api/architecture/note_data_loading.md
@@ -0,0 +1,267 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Efficient Data Loaders
+category: architecture
+permalink: /api/architecture/note_data_loading
+---
+
+# Designing Efficient Data Loaders for Deep Learning
+
+Data loading is an important component of any machine learning system.
+When we work with tiny datasets, we can get away with loading an entire dataset into GPU memory.
+With larger datasets, we must store examples in main memory.
+And when datasets grow too large to fit into main memory,
+data loading can become performance-critical.
+In designing a data loader,
+we aim to achieve more efficient data loading,
+to spend less effort on data preparation,
+and to present a clean and flexible interface.
+
+We organize this design note as follows:
+
+* **IO Design Insight:** Guiding principles in data loading design.
+* **Data Format:** Our solution using dmlc-core's binary recordIO implementation.
+* **Data Loading:** Our method to reduce IO cost by utilizing the threaded iterator provided by dmlc-core.
+* **Interface Design:** Our approach to facilitate writing MXNet data iterators in just a few lines of Python.
+* **Future Extension:** Prospective ideas for making data loading more flexible.
+
+Our analysis will motivate several requirements that an effective IO system should fulfill.
+
+***List of Key Requirements***
+- Small file size.
+- Parallel (distributed) packing of data.
+- Fast data loading and online augmentation.
+- Quick reads from arbitrary parts of the dataset in the distributed setting.
+
+## Design Insight
+To design an IO system, we must address two kinds of tasks:
+data preparation and data loading.
+Data preparation is usually performed offline,
+whereas data loading influences the online performance.
+In this section, we will introduce our insights on IO design for these two phases.
+
+### Data Preparation
+Data preparation describes the process of packing data
+into a desired format for later processing.
+When working with large datasets like ImageNet, this process can be time-consuming.
+In these cases, there are several heuristics we ought to follow:
+
+- Pack the dataset into a small number of files. A dataset may contain millions of data instances. Packed data distributes easily from machine to machine.
+- Do the packing once. We don't want to repack data every time run-time settings, like the number of machines, are changed.
+- Process the packing in parallel to save time.
+- Be able to access arbitrary parts of the data easily. This is crucial for distributed machine learning when data parallelism is introduced. Things may get tricky when the data has been packed into several physical data files. The desired behavior could be: the packed data can be logically separated into arbitrary numbers of partitions, no matter how many physical data files there are. For example, if we pack 1000 images into 4 physical files, then each file contains 250 images. If we then use 10 machines to train a DNN, we should be able to load approximately 100 images per machine. Some machines may need images from different physical files.
+
+### Data Loading
+The next step to consider is how to load the packed data into RAM.
+Our goal is to load the data as quickly as possible.
+There are several heuristics we try to follow:
+- **Read continuously:** We can read faster when reading from contiguous locations on disk.
+- **Reduce the bytes to be loaded:** We can achieve this by storing data in a compact way, e.g. saving images in JPEG format.
+- **Load and train in different threads:** This avoids computational bottlenecks while loading data.
+- **Save RAM:** Judiciously decide whether to load entire files into RAM.
+
+## Data Format
+
+Since the training of deep neural networks often involves large amounts of data,
+the format we choose should be both efficient and convenient.
+To achieve our goals, we need to pack binary data into a splittable format.
+In MXNet, we rely on the binary recordIO format implemented in dmlc-core.
+
+### Binary Record
+
+![baserecordio](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/baserecordio.jpg)
+In MXNet's binary RecordIO, we store each data instance as a record.
+**kMagic** is a *magic number* indicating the start of a record.
+**Lrecord** encodes length and a continue flag.
+In lrecord,
+- cflag == 0: this is a complete record
+- cflag == 1: start of a multi-record entry
+- cflag == 2: middle of a multi-record entry
+- cflag == 3: end of a multi-record entry
+
+**Data** is the space to save data content.
+**Pad** is simply a padding space to align the record to 4 bytes.
+
+After we pack the data, each file contains multiple records.
+Then, loading can be continuous.
+This avoids the low performance that can result
+from reading random locations on disk.
+
+One advantage of storing data via records
+is that each record can vary in length.
+This allows us to save data compactly
+when good compression algorithms are available for our data.
+For example, we can use JPEG format to save image data.
+The packed data will be much smaller
+compared with storing uncompressed RGB values for each pixel.
+
+Take the ImageNet_1K dataset as an example.
+If we store the data as 3 * 256 * 256 arrays of raw RGB values,
+the dataset would occupy more than **200G**.
+But after compressing the images using JPEG,
+they only occupy about **35G** of disk space.
+This significantly reduces the cost of reading from disk.
+
+Here's an example of binary recordIO:
+![baserecordio](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/ImageRecordIO.jpg)
+We first resize the image into 256 * 256,
+then compress into JPEG format.
+After that, we save a header that indicates the index and label
+for that image to be used when constructing the *Data* field for that record.
+We then pack several images together into a file.
+You may want to also review the [example using im2rec.py to create a RecordIO dataset](https://mxnet.incubator.apache.org/tutorials/basic/data.html#loading-data-using-image-iterators).
+
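+As a concrete illustration, MXNet exposes this format through the `mx.recordio` module in Python. The following minimal sketch (the file name `example.rec` is just an illustration) writes a few records and reads them back:
+
+```python
+import mxnet as mx
+
+# Write three records; each record is an arbitrary byte string,
+# e.g. a JPEG-compressed image together with a packed header.
+record = mx.recordio.MXRecordIO("example.rec", "w")
+for i in range(3):
+    record.write(("record_%d" % i).encode("utf-8"))
+record.close()
+
+# Read the records back sequentially.
+record = mx.recordio.MXRecordIO("example.rec", "r")
+while True:
+    item = record.read()
+    if item is None:
+        break
+    print(item)
+record.close()
+```
+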
+### Access Arbitrary Parts Of Data
+
+One desirable property for a data loader might be:
+The packed data can be logically sliced into an arbitrary number of partitions,
+no matter how many physical packed data files there are.
+Since binary recordIO can easily locate
+the start and end of a record using the Magic Number,
+we can achieve the above goal using the InputSplit
+functionality provided by dmlc-core.
+
+InputSplit takes the following parameters:
+- FileSystem *filesys*: dmlc-core wrapper around the IO operations for different file systems, like HDFS, S3, or the local file system. Users shouldn't need to worry about the differences between file systems anymore.
+- Char *uri*: The URI of files. Note that it could be a list of files because we may pack the data into several physical parts. File URIs are separated by ';'.
+- Unsigned *nsplit*: The number of logical splits. *nsplit* could be different from the number of physical files.
+- Unsigned *rank*: Which split to load in this process.
+
+The splitting process is demonstrated below:
+- Determine the size of each partition.
+
+![beforepartition](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/beforepartition.jpg)
+
+- Approximately partition the records according to file size. Note that the boundary of each part may be located in the middle of a record.
+
+![approxipartition](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/approximatepartition.jpg)
+
+- Set the beginning of partitions in such a way as to avoid splitting records across partitions.
+
+![afterpartition](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/afterpartition.jpg)
+
+By conducting the above operations,
+we now identify the records belonging to each part,
+and the physical data files needed by each logical part.
+InputSplit greatly simplifies data parallelism,
+where each process only reads part of the data.
+
+Since our partitioning scheme does not depend on the number of physical data files,
+we can process a huge dataset like ImageNet_22K in a parallel fashion, as illustrated below.
+We don't need to consider distributed loading issues at preparation time;
+we just select the most efficient number of physical files
+according to the dataset size and the computing resources available.
+![parallelprepare](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/parallelprepare.jpg)
+
+## Data Loading and Preprocessing
+
+When the speed of loading and preprocessing can't keep up
+with the speed of training or evaluation,
+IO can bottleneck the speed of the whole system.
+In this section, we will introduce a few tricks
+to achieve greater efficiency when loading
+and preprocessing data packed in binary recordIO format.
+When applied to the ImageNet dataset, our approach achieves
+the IO speed of **3000** images/sec **with a normal HDD**.
+
+### Loading and preprocessing on the fly
+
+When training deep neural networks,
+we sometimes must load and preprocess the data
+while simultaneously training for the following reasons:
+- When the total size of the dataset exceeds the available RAM, we can't load it in advance;
+- Sometimes, to make models robust to things like translations, rotations, and small amounts of color shift or noise, we introduce randomness into the training process. In these cases, we must re-preprocess the data each time we revisit an example.
+
+In service of efficiency, we also employ multi-threading techniques. Taking ImageNet training as an example, after loading a bunch of image records, we can start multiple threads to simultaneously perform image decoding and image augmentation. We depict this process in the following illustration:
+![process](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/process.jpg)
+
+### Hide IO Cost Using Threadediter
+
+One way to lower IO cost is to pre-fetch the data for the next batch on a separate thread,
+while the main thread performs the forward and backward passes for training.
+To support more complicated training schemes,
+MXNet provides a more general IO processing pipeline
+using *threadediter* provided by dmlc-core.
+The key of *threadediter* is to start a stand-alone thread that acts as a data provider,
+while the main thread acts as a data consumer as illustrated below.
+
+The threadediter maintains a buffer of a certain size
+and automatically fills the buffer when it's not full.
+After the consumer finishes consuming part of the data in the buffer,
+threadediter reuses the space to store the next part of the data.
+![threadediter](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/threadediter.png)
+
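+The threadediter itself lives in dmlc-core and is written in C++, but the producer/consumer idea can be sketched in a few lines of Python with a bounded queue (a simplified sketch of the idea, not the actual implementation):
+
+```python
+import threading
+import queue
+
+def prefetch_iter(batch_source, buffer_size=2):
+    """Wrap an iterable of batches with a background producer thread."""
+    buf = queue.Queue(maxsize=buffer_size)  # bounded buffer, like threadediter
+    sentinel = object()
+
+    def producer():
+        for batch in batch_source:
+            buf.put(batch)      # blocks when the buffer is full
+        buf.put(sentinel)       # signal the end of the data
+
+    threading.Thread(target=producer, daemon=True).start()
+    while True:
+        batch = buf.get()       # blocks until a batch is ready
+        if batch is sentinel:
+            return
+        yield batch
+
+# The main (consumer) thread keeps working while the producer fills the buffer.
+for batch in prefetch_iter(range(10)):
+    print(batch)
+```
+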
+## MXNet IO Python Interface
+We expose the IO object as an iterator, just like in numpy.
+This way, the user can easily access the data
+using a for-loop or by calling the next() function.
+Defining a data iterator is very similar to defining a symbolic operator in MXNet.
+
+The following example code demonstrates a Cifar data iterator.
+
+```python
+dataiter = mx.io.ImageRecordIter(
+ # Dataset Parameter, indicating the data file, please check the data is already there
+ path_imgrec="data/cifar/train.rec",
+ # Dataset Parameter, indicating the image size after preprocessing
+ data_shape=(3,28,28),
+ # Batch Parameter, tells how many images in a batch
+ batch_size=100,
+ # Augmentation Parameter, when offers mean_img, each image will subtract the mean value at each pixel
+ mean_img="data/cifar/cifar10_mean.bin",
+ # Augmentation Parameter, randomly crop a patch of the data_shape from the original image
+ rand_crop=True,
+ # Augmentation Parameter, randomly mirror the image horizontally
+ rand_mirror=True,
+ # Augmentation Parameter, randomly shuffle the data
+ shuffle=False,
+ # Backend Parameter, preprocessing thread number
+ preprocess_threads=4,
+ # Backend Parameter, prefetch buffer size
+ prefetch_buffer=1,
+ # Optional, the device context which data loader optimized for, could be 'gpu' or 'cpu'
+ ctx="gpu",
+ # The out data type, could be 'float32' 'int8' or 'uint8'
+ dtype="float32")
+```
+
+Generally, to create a data iterator, you need to provide five kinds of parameters:
+
+* **Dataset Param:** Information needed to access the dataset, e.g. file path, input shape.
+* **Batch Param:** Specifies how to form a batch, e.g. batch size.
+* **Augmentation Param:** Which augmentation operations (e.g. crop, mirror) should be taken on an input image.
+* **Backend Param:** Controls the behavior of the backend threads to hide data loading cost.
+* **Auxiliary Param:** Provides options to help with debugging.
+
+Usually, **Dataset Param** and **Batch Param** MUST be given,
+otherwise the data batch can't be created.
+Other parameters can be given as needed.
+Ideally, we should separate the MX Data IO into modules,
+some of which might be useful to expose to users, for example:
+
+* **Efficient prefetcher:** allows the user to write a data loader that reads their customized binary format that automatically gets multi-threaded prefetcher support.
+* **Data transformer:** image random cropping, mirroring, etc. Allows the users to use those tools, or plug in their own customized transformers (maybe they want to add some specific kind of coherent random noise to data, etc.)
+
+## Future Extensions
+
+In the future, there are some extensions to our data IO
+that we might consider adding.
+Specifically, we might add specialized support
+for applications including image segmentation, object localization, and speech recognition.
+More detail will be provided when such applications have been running on MXNet.
diff --git a/docs/static_site/src/pages/api/architecture/note_engine.md b/docs/static_site/src/pages/api/architecture/note_engine.md
new file mode 100644
index 000000000000..cb626649e5ff
--- /dev/null
+++ b/docs/static_site/src/pages/api/architecture/note_engine.md
@@ -0,0 +1,391 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Dependency Engine
+category: architecture
+permalink: /api/architecture/note_engine
+---
+
+# Dependency Engine for Deep Learning
+
+We always want deep learning libraries
+to run faster and scale to larger datasets.
+One natural approach is to see if we can benefit
+from throwing more hardware at the problem,
+such as by using multiple GPUs simultaneously.
+
+Library designers then ask:
+How can we *parallelize* computation across devices?
+And, more importantly, how can we *synchronize* computation
+when we introduce multi-threading?
+A runtime dependency engine is a generic solution to these problems.
+
+In this document, we examine approaches for using
+runtime dependency scheduling to accelerate deep learning.
+We aim to explain how runtime dependency scheduling
+can both speed up and simplify multi-device deep learning.
+We also explore potential designs for a generic dependency engine
+that could be both library- and operation-independent.
+
+Most of the discussion on this page draws inspiration
+from the MXNet dependency engine.
+The dependency tracking algorithm we discuss
+was primarily developed by [Yutian Li](https://github.com/hotpxl)
+and [Mingjie Wang](https://github.com/jermainewang).
+
+## Dependency Scheduling
+
+Although most users want to take advantage of parallel computation,
+most of us are more familiar with serial programs.
+So one natural question is: how can we write serial programs
+and build a library to automatically parallelize our programs
+in an asynchronous way?
+
+For example, in the following code, we can run `B = A + 1`
+and `C = A + 2` in any order, or in parallel:
+
+```python
+ A = 2
+ B = A + 1
+ C = A + 2
+ D = B * C
+```
+
+However, it's quite hard to code the sequence manually
+because the last operation, `D = B * C`, needs to wait
+for both of the preceding operations to complete before it starts.
+The following dependency graph/data flow graph illustrates this.
+
+![Dep Simple](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_simple.png)
+
+
+A dependency engine is a library that takes a sequence of operations
+and schedules them according to the dependency pattern, potentially in parallel.
+So in this example, a dependency library
+could run ```B = A + 1``` and ```C = A + 2``` in parallel,
+and run ```D = B * C``` after those operations complete.
+
+## Problems in Dependency Scheduling
+
+A dependency engine relieves the burden of writing concurrent programs.
+However, as operations become parallelized,
+new dependency tracking problems arise.
+In this section, we discuss those problems.
+
+### Data Flow Dependency
+Data flow dependency describes how the outcome of one computation
+can be used in other computations.
+Every dependency engine has to solve the data flow dependency problem.
+
+![Dep Simple](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_simple.png)
+
+Because we discussed this issue in the preceding section,
+we include the same figure here. Libraries that have
+data flow tracking engines include Minerva and Purine2.
+
+### Memory Recycling
+When should we recycle the memory that we allocated to the arrays?
+In serial processing, this is easy to determine.
+We simply recycle the memory after the variable goes out of scope.
+However, as the following figure shows, this is a bit harder in parallel processing.
+
+![Dep Del](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_del.png)
+
+In this example, because both computations need to use values from A,
+we can't recycle the memory until both complete.
+The engine must schedule the memory recycling operations according to the dependencies,
+and ensure that they are executed after both ```B = A + 1``` and ```C = A + 2``` complete.
+
+
+### Random Number Generation
+Random number generators, which are commonly used in machine learning,
+pose interesting challenges for dependency engines.
+Consider the following example:
+
+![Dep Rand](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_rand.png)
+
+In this example, we are generating random numbers in a sequence.
+Although it seems that the two random number generations can be parallelized,
+this is usually not the case. A pseudo-random number generator (PRNG)
+is not thread-safe because it might cause some internal state
+to mutate when generating a new number.
+Even if the PRNG is thread-safe,
+it is preferable to serialize number generation,
+so we can get reproducible random numbers.
+
+## Case Study: A Dependency Engine for a Multi-GPU Neural Network
+
+In the last section, we discussed the problems
+we might face in designing a dependency engine.
+Before thinking about how to design a generic engine to solve those problems,
+let's consider how a dependency engine can help in multi-GPU training of a neural network.
+The following pseudocode Python program illustrates
+training one batch on a two-layer neural network.
+
+```python
+    # Example of one iteration on a two-GPU neural net
+    data = next_batch()
+    data[gpu0].copyfrom(data[0:50])
+    data[gpu1].copyfrom(data[50:100])
+    # forward, backprop on GPU 0
+    fc1[gpu0] = FullcForward(data[gpu0], fc1_weight[gpu0])
+    fc2[gpu0] = FullcForward(fc1[gpu0], fc2_weight[gpu0])
+    fc2_ograd[gpu0] = LossGrad(fc2[gpu0], label[0:50])
+    fc1_ograd[gpu0], fc2_wgrad[gpu0] = \
+        FullcBackward(fc2_ograd[gpu0], fc2_weight[gpu0])
+    _, fc1_wgrad[gpu0] = FullcBackward(fc1_ograd[gpu0], fc1_weight[gpu0])
+    # forward, backprop on GPU 1
+    fc1[gpu1] = FullcForward(data[gpu1], fc1_weight[gpu1])
+    fc2[gpu1] = FullcForward(fc1[gpu1], fc2_weight[gpu1])
+    fc2_ograd[gpu1] = LossGrad(fc2[gpu1], label[50:100])
+    fc1_ograd[gpu1], fc2_wgrad[gpu1] = \
+        FullcBackward(fc2_ograd[gpu1], fc2_weight[gpu1])
+    _, fc1_wgrad[gpu1] = FullcBackward(fc1_ograd[gpu1], fc1_weight[gpu1])
+    # aggregate gradient and update
+    fc1_wgrad[cpu] = fc1_wgrad[gpu0] + fc1_wgrad[gpu1]
+    fc2_wgrad[cpu] = fc2_wgrad[gpu0] + fc2_wgrad[gpu1]
+    fc1_weight[cpu] -= lr * fc1_wgrad[cpu]
+    fc2_weight[cpu] -= lr * fc2_wgrad[cpu]
+    fc1_weight[cpu].copyto(fc1_weight[gpu0], fc1_weight[gpu1])
+    fc2_weight[cpu].copyto(fc2_weight[gpu0], fc2_weight[gpu1])
+```
+In this program, data samples 0 to 50 are copied to GPU 0,
+and samples 50 to 100 are copied to GPU 1.
+The calculated gradients are aggregated in the CPU,
+which then performs a simple SGD update,
+and copies the updated weight back to each GPU.
+This is a common way to write a parallel program in a serial manner.
+The following dependency graph shows how it can be parallelized:
+
+![Dep Net](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_net.png)
+
+***Notes:***
+
+- The gradient can be copied to the CPU as soon as we get the gradient of a layer.
+- The weight can be copied back as soon as it is updated.
+- In the forward pass, we have a dependency on ```fc1_weight[cpu].copyto(fc1_weight[gpu0] , fc1_weight[gpu1])```
+ from the previous iteration.
+- There is a delay in computation between the last backward pass to layer k and the next forward call to layer k. We can synchronize the weight of layer k *in parallel* with other computation during this delay.
+
+This approach to optimization is used by multi-GPU deep learning libraries, such as CXXNet.
+The point is to overlap weight synchronization (communication) with computation.
+However, it's not easy to do that, because the copy operation needs to be triggered
+as soon as the backward pass of the layer completes,
+which then triggers the reduction, updates, etc.
+
+A dependency engine can schedule these operations and perform multi-threading
+and dependency tracking.
+
+## Designing a Generic Dependency Engine
+
+We hope that you're convinced that a dependency engine is useful
+for scaling deep learning programs to multiple devices.
+Now let's discuss how we can design and implement
+a generic interface for a dependency engine.
+This solution isn't the only possible design for a dependency engine.
+It's an example that we think is useful in most cases.
+
+Our goal is to create a dependency engine that is *generic* and *lightweight*.
+Ideally, we'd like an engine that easily plugs into existing deep learning code,
+and that can scale up to multiple machines with minor modifications.
+To do that, we need to focus only on dependency tracking,
+not on assumptions about what users can or can't do.
+
+Here's a summary of goals for the engine:
+
+- The engine should not be aware of what operations it performs, so that users can perform any operations they define.
+- It should not be restricted in what type of objects it can schedule.
+ - We should be able to schedule dependencies on GPU and CPU memory.
+ - We should be able to track dependencies on the random number generator, etc.
+- The engine should not allocate resources. It should only track dependencies. Users can allocate their own memory, PRNG, etc.
+
+The following Python snippet provides an engine interface that might help us reach our goal. Note that a real implementation will be closer to the metal, typically in C++.
+
+```python
+    class DepEngine(object):
+        def new_variable(self):
+            """Return a new variable tag.
+
+            Returns
+            -------
+            vtag : Variable Tag
+                The token used by the engine to represent dependencies.
+            """
+            pass
+
+        def push(self, exec_func, read_vars, mutate_vars):
+            """Push an operation to the engine.
+
+            Parameters
+            ----------
+            exec_func : callable
+                The real operation to be performed.
+            read_vars : list of Variable Tags
+                The variables this operation will read from.
+            mutate_vars : list of Variable Tags
+                The variables this operation will mutate.
+            """
+            pass
+```
+
+Because we can't make assumptions about what objects we are scheduling, we ask the user to allocate a
+_virtual tag_ that is associated with each object to represent what we need to schedule.
+So, at the beginning, the user can allocate the variable tag,
+and attach it to each of the objects that we want to schedule.
+
+![Dep Net](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/tag_var.png)
+
+The user then calls `push` to tell the engine about the function to execute.
+The user also needs to specify the dependencies of the operation,
+using `read_vars` and `mutate_vars`:
+
+- `read_vars` are variable tags for objects that the operation will _read from_, without changing their internal state.
+- `mutate_vars` are variable tags for objects whose internal states the operation will mutate.
+
+![Push Op](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/push_var.png)
+
+The preceding figure shows how to push operation `B = A + 1` to the dependency engine. `B.data` and
+`A.data` are the allocated space. Note that the engine is *only aware of variable tags*.
+Any execution function can be processed.
+This interface is generic for the operations and resources we want to schedule.
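+
+To make this concrete, here is a minimal, hypothetical usage sketch of the `DepEngine` interface above; the `engine` object, the `*_tag` names, and the NumPy buffers are illustrative assumptions, not a real MXNet API.
+
+```python
+    import numpy as np
+
+    engine = DepEngine()
+
+    # attach a variable tag to every object we want to schedule
+    A, B, C, D = (np.empty(1024) for _ in range(4))
+    A_tag, B_tag, C_tag, D_tag = (engine.new_variable() for _ in range(4))
+    A[:] = 2
+
+    # push operations together with what they read and mutate;
+    # the first two pushes may run in parallel, the third must wait for both
+    engine.push(lambda: np.add(A, 1, out=B), read_vars=[A_tag], mutate_vars=[B_tag])
+    engine.push(lambda: np.add(A, 2, out=C), read_vars=[A_tag], mutate_vars=[C_tag])
+    engine.push(lambda: np.multiply(B, C, out=D),
+                read_vars=[B_tag, C_tag], mutate_vars=[D_tag])
+```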
+
+For fun, let's look at how the engine internals work with the tags by considering the following code snippet:
+
+```
+ B = A + 1
+ C = A + 2
+ A = C * 2
+ D = A + 3
+```
+
+The first line reads variable `A` and mutates variable `B`. The second line reads variable `A` and mutates variable `C`. And so on.
+
+The engine maintains a queue for each variable, as the following animation shows for each of the four lines. Green blocks represent read actions, while red blocks represent mutations.
+
+![Dependency Queue](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_queue.gif)
+
+Upon building this queue, the engine sees that the first two green blocks at the beginning of `A`'s queue could actually be run in parallel because they are both read actions and won't conflict with each other. The following graph illustrates this point.
+
+![Dependency Parallelism](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_parallel.png)
+
+One cool thing about all this scheduling is that it's not confined to numerical calculations.
+Because everything that is scheduled is only a tag, the engine could schedule everything!
+
+The following figure gives a complete push sequence of the programs we mentioned in previous sections.
+
+![Push Seq](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/push_seq.png)
+
+### Porting Existing Code to the Dependency Engine
+Because the generic interface doesn't control things like memory allocation and which operation to execute,
+most existing code can be scheduled by the dependency engine in two steps:
+
+
+1. Allocate variable tags for the resources involved, such as memory blobs and PRNGs.
+2. Call `push` with the original code as the execution function, and place the variable tags of the
+   corresponding resources correctly in `read_vars` and `mutate_vars`, as the sketch below illustrates.
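+
+For example, a hypothetical porting of a plain NumPy snippet that draws random noise and adds it to a buffer might look like this, reusing the hypothetical `engine` object from the earlier sketch (none of these names are a real API):
+
+```python
+    import numpy as np
+
+    rng = np.random.default_rng()
+    noise = np.empty(100)
+    out = np.zeros(100)
+
+    # step 1: one tag per resource the code touches, including the PRNG state
+    rng_tag, noise_tag, out_tag = (engine.new_variable() for _ in range(3))
+
+    def draw_noise():                       # original code, unchanged
+        noise[:] = rng.standard_normal(noise.shape)
+
+    def add_noise():                        # original code, unchanged
+        np.add(out, noise, out=out)
+
+    # step 2: push, declaring what each function reads and mutates;
+    # every draw mutates rng_tag, so pushed draws are serialized automatically
+    engine.push(draw_noise, read_vars=[], mutate_vars=[rng_tag, noise_tag])
+    engine.push(add_noise, read_vars=[noise_tag], mutate_vars=[out_tag])
+```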
+
+## Implementing the Generic Dependency Engine
+
+We have described the generic engine interface and
+how it can be used to schedule various operations.
+In this section, we provide a high-level discussion
+of how to implement such an engine.
+
+The general idea is as follows:
+
+- Use a queue to track all of the pending dependencies on each variable tag.
+- Use a counter on each operation to track how many dependencies are yet to be fulfilled.
+- When operations are completed, update the state of the queue and dependency counters to schedule new operations.
+
+The following figure illustrates the scheduling algorithm
+and might give you a better sense of what is going on in the engine.
+
+![Dep Tracking](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/engine_queue_step.png)
+
+Below, we show another example involving random number generators.
+
+![Dep Rand](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/engine_queue_rand.png)
+
+As you can see, the purpose of the algorithm is to update pending queues
+of operations and to make the right state transition when an operation has completed.
+More care should be taken to make sure the state transitions
+are done in a way that's safe for threads.
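+
+As a thought experiment, the following single-threaded Python sketch shows one way the counter-and-queue bookkeeping described above could be wired together. It is illustrative only and glosses over the thread-safety concerns just mentioned; none of the names are real MXNet internals.
+
+```python
+    from collections import deque
+
+    class ToyEngine(object):
+        """Single-threaded sketch of the bookkeeping above.  The real engine
+        additionally lets consecutive reads of a tag run in parallel and
+        performs every state transition in a thread-safe way."""
+
+        def __init__(self):
+            self.queues = {}                      # variable tag -> deque of pending ops
+
+        def new_variable(self):
+            tag = object()
+            self.queues[tag] = deque()
+            return tag
+
+        def push(self, exec_func, read_vars, mutate_vars):
+            tags = list(dict.fromkeys(list(read_vars) + list(mutate_vars)))
+            op = {'fn': exec_func, 'tags': tags, 'wait': 0}
+            for tag in tags:
+                if self.queues[tag]:              # an earlier op on this tag is pending
+                    op['wait'] += 1
+                self.queues[tag].append(op)
+            if op['wait'] == 0:                   # all dependencies already fulfilled
+                self._complete(op)
+
+        def _complete(self, op):
+            op['fn']()                            # run the operation
+            for tag in op['tags']:                # retire it from every queue it is in
+                q = self.queues[tag]
+                q.popleft()                       # op was at the head of this queue
+                if q:                             # wake the next op waiting on this tag
+                    q[0]['wait'] -= 1
+                    if q[0]['wait'] == 0:
+                        self._complete(q[0])
+```
+
+Substituting `ToyEngine` for the `DepEngine` placeholder, the pushes from the earlier snippets execute in dependency order (with reads conservatively serialized in this toy version).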
+
+### Separating Dependency Tracking from the Running Policy
+If you're reading carefully, you might have noticed
+that the preceding section shows only the algorithm
+for deciding when an operation can be executed.
+We didn't show how to actually run an operation.
+In practice, there can be many different policies.
+For example, we can either use a global thread-pool to run all operations,
+or use a specific thread to run operations on each device.
+
+This running policy is usually independent of dependency tracking,
+and can be separated out either as an independent module
+or as a virtual interface of the base dependency-tracking module.
+Developing an elegant runtime policy that is fair
+to all operations and schedules is an interesting systems problem in itself.
+
+## Discussion
+
+The design that we discussed in this article
+isn't the only solution to the dependency tracking problem.
+It's just one example of how we might approach this.
+To be sure, some of these design choices are debatable.
+We'll discuss some of them in this section.
+
+### Dynamic vs. Static
+The dependency engine interface discussed in this topic is somewhat dynamic
+in the sense that the user can push operations one by one,
+instead of declaring the entire dependency graph (static).
+Dynamic scheduling can require more overhead
+than static declarations, in terms of data structure.
+However, it also enables more flexibility, such as supporting auto parallelism
+for imperative programs or a mixture of imperative and symbolic programs.
+You can also add some level of predeclared operations
+to the interface to enable data structure reuse.
+
+### Mutation vs. Immutable
+The generic engine interface presented in this page
+supports explicit scheduling for mutation.
+In a typical data flow engine, the data are usually immutable.
+Working with immutable data has a lot of benefits.
+For example, immutable data is generally more suitable for parallelization,
+and facilitates better fault tolerance in a distributed setting (by way of re-computation).
+
+However, immutability presents several challenges:
+
+- It's harder to handle resource contention problems, such as those that arise when dealing with random number generation and memory deletion.
+- The engine usually needs to manage resources (memory, random number) to avoid conflicts. It's harder to plug in user-allocated space, etc.
+- Preallocated static memory isn't available, again because the usual pattern is to write to a preallocated layer space, which is not supported if data is immutable.
+
+Allowing mutation mitigates these issues.
+
+
+## Source Code of the Generic Dependency Engine
+[MXNet](https://github.com/dmlc/mxnet) provides an implementation
+of the generic dependency engine described in this page.
+You can find more details in [this topic](http://mxnet.io/architecture/note_engine.html).
+We welcome your contributions.
+
+## Next Steps
+
+* [Squeeze the Memory Consumption of Deep Learning](http://mxnet.io/architecture/note_memory.html)
+* [Efficient Data Loading Module for Deep Learning](http://mxnet.io/architecture/note_data_loading.html)
+* [Survey of RNN Interface](http://mxnet.io/architecture/rnn_interface.html)
diff --git a/docs/static_site/src/pages/api/architecture/note_memory.md b/docs/static_site/src/pages/api/architecture/note_memory.md
new file mode 100644
index 000000000000..771d6a5ff012
--- /dev/null
+++ b/docs/static_site/src/pages/api/architecture/note_memory.md
@@ -0,0 +1,351 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Memory Consumption
+category: architecture
+permalink: /api/architecture/note_memory
+---
+
+# Optimizing Memory Consumption in Deep Learning
+
+Over the last ten years, a constant trend in deep learning
+is towards deeper and larger networks.
+Despite rapid advances in hardware performance,
+cutting-edge deep learning models continue to push the limits of GPU RAM.
+So even today, it's always desirable to find ways
+to train larger models while consuming less memory.
+Doing so enables us to train faster with larger batch sizes,
+and consequently to achieve a higher GPU utilization rate.
+
+In this document, we explore techniques for optimizing
+memory allocation for deep neural networks.
+We discuss a few candidate solutions.
+While our proposals are by no means exhaustive,
+these solutions are instructive and allow us to
+introduce the major design issues at play.
+
+## Computation Graph
+
+First, let's revisit the idea of the computation graph.
+A computation graph describes the (data flow) dependencies
+between the operations in the deep network.
+The operations performed in the graph
+can be either fine-grained or coarse-grained.
+The following figure shows two examples of computation graphs.
+
+![Comp Graph Example](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/comp_graph_example.png)
+
+The concept of a computation graph is explicitly encoded in packages like Theano and CGT.
+In other libraries, computation graphs appear implicitly as network configuration files.
+The major difference in these libraries comes down to how they calculate gradients.
+There are mainly two ways: performing back-propagation on the _same_ graph
+or explicitly representing a _backwards path_ to calculate the required gradients.
+
+![Backward Graph](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_graph.png)
+
+Libraries like Caffe, CXXNet, and Torch take the former approach,
+performing back-prop on the original graph.
+Libraries like Theano and CGT take the latter approach,
+explicitly representing the backward path.
+In this discussion, we adopt the *explicit backward path* approach
+because it has several advantages for optimization.
+
+However, we should emphasize that choosing the explicit backward path approach doesn't restrict us
+to symbolic libraries, such as Theano and CGT. We can also use the explicit backward path for gradient calculation in
+layer-based libraries (which tie the forward and backward passes together). The following graph shows how to do this.
+Basically, we introduce a backward node that links to the forward node of the graph and calls the ```layer.backward```
+in the backward operations.
+
+![Backward Layer](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/explicit_back_layer.png)
+
+This discussion applies to almost all existing deep learning libraries.
+(There are differences between libraries, e.g., higher-order differentiation, which is beyond the scope of this topic.)
+
+Why is the explicit backward path better? Let's explain it with two examples.
+The first reason is that the explicit backward path
+clearly describes the dependency between computations.
+Consider the following case, where we want to get
+the gradient of A and B. As we can see clearly from the graph,
+the computation of the ```d(C)``` gradient doesn't depend on F.
+This means that we can free the memory of ```F```
+right after the forward computation is done.
+Similarly, the memory of ```C``` can be recycled.
+
+![Backward Prune](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_dep_prune.png)
+
+Another advantage of the explicit backward path
+is the ability to have a different backward path,
+instead of a mirror of the forward one.
+A common example is the split connection case,
+as shown in the following figure.
+
+![Backward Agg](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_agg_grad.png)
+
+In this example, the output of B is referenced by two operations.
+If we want to do the gradient calculation in the same
+network, we need to introduce an explicit split layer.
+This means we need to do the split for the forward pass, too.
+In this figure, the forward pass doesn't contain a split layer,
+but the graph will automatically insert a gradient
+aggregation node before passing the gradient back to B.
+This helps us to save the memory cost of allocating the output of the split layer,
+and the operation cost of replicating the data in the forward pass.
+
+If we adopt the explicit backward approach,
+there's no difference between the forward pass and the backward pass.
+We simply step through the computation graph in chronological order
+and carry out computations.
+This makes the explicit backward approach easy to analyze.
+We just need to answer the question:
+how do we allocate memory for each output node of a computation graph?
+
+
+## What Can Be Optimized?
+
+As you can see, the computation graph is a useful way
+to discuss memory allocation optimization techniques.
+Already, we've shown how you can save some memory
+by using the explicit backward graph.
+Now let's explore further optimizations,
+and see how we might determine reasonable baselines for benchmarking.
+
+Assume that we want to build a neural network with `n` layers.
+Typically, when implementing a neural network,
+we need to allocate node space for both the output of each layer
+and the gradient values used during back-propagation.
+This means we need roughly `2 n` memory cells.
+We face the same requirement when using the explicit backward graph approach
+because the number of nodes in a backward pass
+is roughly the same as in a forward pass.
+
+### In-place Operations
+One of the simplest techniques we can employ
+is _in-place memory sharing_ across operations.
+For neural networks, we can usually apply this technique
+for the operations corresponding to activation functions.
+Consider the following case, where we want
+to compute the value of three chained sigmoid functions.
+
+![Inplace op](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline.png)
+
+Because we can compute sigmoid ```in-place```,
+using the same memory for input and output,
+we can compute an arbitrary-length chain
+of sigmoid functions using constant memory.
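+
+As an illustration (not MXNet code), a NumPy sketch of such an in-place chain might look like this; the whole chain touches only the single buffer `x`:
+
+```python
+    import numpy as np
+
+    def sigmoid_inplace(x):
+        """Compute sigmoid(x) using x itself as the output buffer."""
+        np.negative(x, out=x)
+        np.exp(x, out=x)
+        x += 1.0
+        np.reciprocal(x, out=x)
+        return x
+
+    x = np.random.randn(1024).astype(np.float32)
+    for _ in range(3):          # three chained sigmoids, constant memory
+        sigmoid_inplace(x)
+```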
+
+Note: it's easy to make mistakes when implementing in-place optimization.
+Consider the following case, where the value of B is used not only by C, but also by F.
+
+![In-place trap](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline_trap.png)
+
+We can't perform in-place optimization because the value of B
+is still needed after ```C=sigmoid(B)``` is computed.
+An algorithm that simply does in-place optimization
+for every sigmoid operation might fall into such a trap,
+so we need to be careful about when we can use it.
+
+### Standard Memory Sharing
+In-place operations are not the only places where we can share memory.
+In the following example, because the value of B is no longer needed
+after we compute E, we can reuse B's memory to hold the result of E.
+
+![Normal Sharing](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_normal.png)
+
+*Memory sharing doesn't necessarily require the same data shape*.
+Note that in the preceding example, the shapes of `B` and `E` can differ.
+To handle such a situation, we can allocate a memory region
+of size equal to the maximum of that required by `B` and `E` and share it between them.
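+
+A minimal NumPy sketch of this kind of sharing, with assumed shapes for `B` and `E`, is shown below; one raw buffer sized for the larger of the two is viewed under either shape:
+
+```python
+    import numpy as np
+
+    shape_B, shape_E = (128, 256), (64, 100)          # assumed shapes
+    pool = np.empty(max(np.prod(shape_B), np.prod(shape_E)), dtype=np.float32)
+
+    B = pool[:np.prod(shape_B)].reshape(shape_B)      # B lives in the shared region
+    # ... B is produced and consumed here ...
+    E = pool[:np.prod(shape_E)].reshape(shape_E)      # after B's last use, E reuses it
+```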
+
+### Example of Real Neural Network Allocation
+Of course, these are only toy examples and they address only the computation of the forward pass.
+But the same ideas apply to real neural networks.
+The following figure shows an allocation plan for a two-layer perceptron.
+
+![Net Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_mlp.png)
+
+In this example:
+
+- In-place optimization is applied when computing ```act1```, ```d(fc1)```, ```out``` and ```d(fc2)```.
+- Memory is shared between ```d(act1)``` and ```d(A)```.
+
+## Memory Allocation Algorithm
+
+So far, we've discussed general techniques for optimizing memory allocation.
+We've seen that there are traps to avoid,
+as demonstrated in the case of in-place memory optimization.
+So, how can we allocate memory correctly?
+This is not a new problem.
+For example, it is very similar
+to the problem with register allocation in compilers.
+There might be techniques that we can borrow.
+We're not attempting to give a comprehensive review of techniques here,
+but rather to introduce some simple
+but useful tricks to attack the problem.
+
+The key problem is that we need to place resources
+so that they don't conflict with each other.
+More specifically, each variable has a *lifetime*
+spanning from the time it is computed to the last time it is used.
+In the case of the multi-layer perceptron,
+the *lifetime* of ```fc1``` ends after ```act1``` gets computed.
+
+![Net Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_mlp.png)
+
+The principle is *to allow memory sharing only between variables whose lifetimes don't overlap*.
+There are multiple ways to do this.
+You can construct the conflict graph,
+with each variable as a node and an edge
+between any two variables with overlapping lifetimes,
+and then run a graph-coloring algorithm.
+This likely has `O(n^2)` complexity,
+where ```n``` is the number of nodes in the graph,
+which might be too costly.
+
+Let's consider another simple heuristic.
+The idea is to simulate the procedure of traversing the graph,
+and keep a count of the future operations that depend on each node.
+
+![Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_step.png)
+
+- An in-place optimization can be performed when only the current operation depends on the source (i.e., ```count==1```).
+- Memory can be recycled into the box on the upper right corner when the ```count``` goes to 0.
+- When we need new memory, we can either get it from the box or allocate a new one.
+
+***Note:*** During the simulation, no memory is allocated.
+Instead, we keep a record of how much memory each node needs,
+and allocate the maximum of the shared parts in the final memory plan.
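+
+To make the heuristic concrete, the following Python sketch (illustrative only, not MXNet's actual planner) simulates exactly this: it walks the graph once, recycles a node's storage when its counter reaches zero, and records the maximum size ever assigned to each shared block.
+
+```python
+    def plan_memory(nodes, inputs, size_of):
+        """Sketch of the counting heuristic.
+
+        nodes   : operations in topological order
+        inputs  : dict mapping a node to the nodes it reads
+        size_of : dict mapping a node to the bytes its output needs
+        Returns a storage id for every node and the bytes each shared block
+        must hold; no real memory is allocated during the walk.
+        """
+        remaining = {n: 0 for n in nodes}          # future readers per node
+        for n in nodes:
+            for src in inputs[n]:
+                remaining[src] += 1
+
+        free_pool, storage_of, block_bytes = [], {}, {}
+        next_id = 0
+        for n in nodes:
+            if free_pool:                          # reuse a recycled block
+                sid = free_pool.pop()
+            else:                                  # or open a brand-new one
+                sid, next_id = next_id, next_id + 1
+            storage_of[n] = sid
+            block_bytes[sid] = max(block_bytes.get(sid, 0), size_of[n])
+            for src in inputs[n]:                  # n consumes each input once
+                remaining[src] -= 1
+                if remaining[src] == 0:            # no future reader: recycle
+                    free_pool.append(storage_of[src])
+        return storage_of, block_bytes
+```
+
+For a simple chain `A -> B -> C`, the simulation lets `C` reuse `A`'s block once `A`'s counter reaches zero, so the final plan needs only two blocks instead of three.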
+
+## Static vs. Dynamic Allocation
+
+The preceding strategy exactly simulates
+the dynamic memory allocation procedure
+in imperative languages, such as Python.
+The ```count``` is the reference counter for each memory object,
+and the object gets garbage collected
+when the reference counter goes to 0.
+In that sense,
+we are simulating dynamic memory allocation once
+to create a static allocation plan.
+Can we simply use an imperative language
+that dynamically allocates and deallocates memory?
+
+The major difference is that static allocation is only done once,
+so we can afford to use more complicated algorithms.
+For example, we can search for memory sizes
+that are similar to the required memory block.
+The allocation can also be made graph-aware.
+We'll talk about that in the next section.
+Dynamic allocation puts more pressure
+on fast memory allocation and garbage collection.
+
+There is also one takeaway for users
+who want to rely on dynamic memory allocations:
+*do not unnecessarily reference objects*.
+For example, if we organize all of the nodes in a list
+and store them in a Net object,
+these nodes will never get dereferenced, and we gain no space.
+Unfortunately, this is a common way to organize code.
+
+
+## Memory Allocation for Parallel Operations
+
+In the previous section, we discussed
+how we can *simulate* running the procedure
+for a computation graph to get a static allocation plan.
+However, optimizing for parallel computation presents other challenges
+because resource sharing and parallelization are two ends of a trade-off.
+Let's look at the following two allocation plans for the same graph:
+
+![Parallel Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/parallel_alloc.png)
+
+Both allocation plans are valid
+if we run the computation serially,
+from ```A[1]``` to ```A[8]```.
+However, the allocation plan on the left
+introduces additional dependencies,
+which means we can't run the computations of ```A[2]``` and ```A[5]``` in parallel,
+while the plan on the right can.
+To parallelize computation, we need to take greater care.
+
+### Be Correct and Safe First
+Being correct is our first principle.
+This means executing in a way that takes the implicit dependencies
+introduced by memory sharing into consideration.
+You can do this by adding the implicit dependency edge to the execution graph.
+Or, even simpler, if the execution engine is mutation aware,
+as described in [our discussion of dependency engine design](note_engine),
+push the operation in sequence
+and write to the same variable tag
+that represents the same memory region.
+
+Always produce a safe memory allocation plan.
+This means never allocate the same memory
+to nodes that can be parallelized.
+This might not be ideal when memory reduction is more desirable,
+and when we gain only limited benefit
+from multiple computing streams simultaneously executing on the same GPU.
+
+### Try to Allow More Parallelization
+Now we can safely perform some optimizations.
+The general idea is to try and encourage memory sharing between nodes that can't be parallelized.
+You can do this by creating an ancestor-relationship
+graph and querying it during allocation,
+which costs approximately `O(n^2)` time to construct.
+We can also use a heuristic here,
+for example, coloring paths in the graph.
+As shown in the following figure,
+you try to find the longest path in the graph,
+color its nodes with one color, and continue with the rest.
+
+![Path Color](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/graph_color.png)
+
+After you get the color of the node,
+you allow sharing (or encourage sharing)
+only between nodes of the same color.
+This is a stricter version of the ancestor relationship,
+but it costs only `O(n)` time
+if you search for only the first `k` paths.
+
+This is by no means the only solution;
+more sophisticated approaches might exist.
+
+## How Much Can You Save?
+
+We've discussed the techniques and algorithms you can use
+to squeeze memory usage for deep learning.
+How much can you really save by using these techniques?
+
+On coarse-grained operation graphs
+that are already optimized for big operations,
+you can reduce memory consumption roughly *by half*.
+You can reduce memory usage even more
+if you are optimizing a fine-grained computation network
+used by symbolic libraries, such as Theano. Most of the ideas in this article inspired the design of _MXNet_.
+
+Also, you will notice that the memory cost of forward-only execution is extremely low compared to running both the forward and backward passes. This is simply because there's more memory reuse if you run only the forward pass.
+
+So here are two takeaways:
+
+- Use a computation graph to allocate memory.
+- For deep learning models, prediction consumes much less memory than training.
+
+
+## Next Steps
+
+* [Efficient Data Loading Module for Deep Learning](http://mxnet.io/architecture/note_data_loading.html)
+* [Survey of RNN Interface](http://mxnet.io/architecture/rnn_interface.html)
diff --git a/docs/static_site/src/pages/api/architecture/overview.md b/docs/static_site/src/pages/api/architecture/overview.md
new file mode 100644
index 000000000000..d2ad51c87776
--- /dev/null
+++ b/docs/static_site/src/pages/api/architecture/overview.md
@@ -0,0 +1,874 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: MXNet System Architecture
+category: architecture
+permalink: /api/architecture/overview
+---
+
+# MXNet System Architecture
+
+![System Overview](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/system/overview.png)
+
+This figure shows the major modules and components of the MXNet system and their interaction. The modules are:
+
+- Runtime Dependency Engine: Schedules and executes the
+ operations according to their read/write dependency.
+- Storage Allocator: Efficiently allocates and recycles memory blocks
+ on host (CPU) and devices (GPUs).
+- Resource Manager: Manages global resources, such as the random number generator
+  and temporary space.
+- NDArray: Dynamic, asynchronous n-dimensional arrays,
+ which provide flexible imperative programs for MXNet.
+- Symbolic Execution: Static symbolic graph executor,
+ which provides efficient symbolic graph execution and optimization.
+- Operator: Operators that define static forward and gradient
+ calculation (backprop).
+- SimpleOp: Operators that extend NDArray operators and symbolic operators
+ in a unified fashion.
+- Symbol Construction: Symbolic construction, which provides a way to construct
+ a computation graph (net configuration).
+- KVStore: Key-value store interface for efficient parameter synchronization.
+- Data Loading (IO): Efficient distributed data loading and augmentation.
+
+# MXNet System Components
+
+## Execution Engine
+
+You can use MXNet's engine not only for deep learning,
+but for any domain-specific problem.
+It's designed to solve a general problem:
+execute a bunch of functions following their dependencies.
+Execution of any two functions with dependencies should be serialized.
+To boost performance, functions with no dependencies *can* be executed in parallel.
+For a general discussion of this topic,
+see our [notes on the dependency engine](note_engine.md).
+
+### Interface
+
+The following API is the core interface for the execution engine:
+
+```c++
+    virtual void PushSync(Fn exec_fun, Context exec_ctx,
+                          std::vector<VarHandle> const& const_vars,
+                          std::vector<VarHandle> const& mutate_vars) = 0;
+```
+This API allows you to push a function (`exec_fun`),
+along with its context information and dependencies, to the engine.
+`exec_ctx` is the context information in which the `exec_fun` should be executed,
+`const_vars` denotes the variables that the function reads from,
+and `mutate_vars` are the variables to be modified.
+The engine provides the following guarantee:
+
+>*The execution of any two functions
+that modify a common variable
+is serialized in their push order.*
+
+### Function
+
+The function type of the engine is:
+
+```c++
+    using Fn = std::function<void(RunContext)>;
+```
+`RunContext` contains runtime information, which is determined by the engine:
+
+```c++
+ struct RunContext {
+ // stream pointer which could be safely cast to
+ // cudaStream_t* type
+ void *stream;
+ };
+```
+Alternatively, you could use `mxnet::engine::DAGEngine::Fn`, which has the same type definition.
+
+All of the functions are executed by the engine's internal threads.
+In such a model, it's usually not a good idea to push *blocking* functions
+to the engine (usually for dealing with I/O tasks like disk, web service, UI, etc.)
+because they will occupy the execution threads and reduce total throughput.
+In that case, we provide another *asynchronous* function type:
+
+```c++
+    using Callback = std::function<void()>;
+    using AsyncFn = std::function<void(RunContext, Callback)>;
+```
+In the `AsyncFn` function, you can pass the heavy part to your own threads
+and safely exit the body of the function.
+The engine doesn't consider the function finished
+until the `Callback` function is called.
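+
+For illustration, a minimal sketch of wrapping a blocking task as an `AsyncFn` (assuming the type aliases above) might look like this; the engine thread returns immediately, and the engine marks the operation finished only when `on_complete` runs:
+
+```c++
+    #include <thread>
+
+    // Hypothetical example: move blocking I/O off the engine thread.
+    AsyncFn async_load = [](RunContext rctx, Callback on_complete) {
+      std::thread([on_complete]() {
+        // ... perform the blocking disk/network read here ...
+        on_complete();   // notify the engine that this operation is done
+      }).detach();
+    };
+```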
+
+### Context
+
+You can specify the `Context` of the function to be executed within.
+This usually includes whether the function should be run on a CPU or a GPU,
+and if you specify a GPU, which GPU to use.
+`Context` is different from `RunContext`.
+`Context` contains device type (GPU/CPU) and device id,
+ while `RunContext` contains information that can be decided only during runtime,
+ for example, on which stream the function should be executed.
+
+### VarHandle
+
+`VarHandle` is used to specify the dependencies of functions.
+The MXNet engine is designed to be decoupled from other MXNet modules.
+So `VarHandle` is like an engine-provided token you use
+to represent the external resources the functions can use or modify.
+It's designed to be lightweight, so creating,
+deleting, or copying a variable incurs little overhead.
+Upon pushing the functions, you need to specify the variables
+that will be used (immutable) in the `const_vars` vector,
+and the variables that will be modified (mutable) in the `mutate_vars` vector.
+The engine uses one rule for resolving the dependencies among functions:
+
+>*The execution of any two functions when one of them modifies at least one common variable is serialized in their push order.*
+
+For example, if `Fn1` and `Fn2` both mutate `V2` then `Fn2`
+is guaranteed to be executed after `Fn1`
+if `Fn2` is pushed after `Fn1`.
+On the other hand, if `Fn1` and `Fn2` both use `V2`,
+their actual execution order could be random.
+
+This design allows the engine to schedule *state-mutating* operations in a manner
+that minimizes calls to allocate new memory.
+For example, the weight update function in DNN
+can now use the `+=` operator
+to update the weights in place,
+rather than generating a new weight array each time.
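+
+As a hedged pseudo-C++ sketch (using the `PushSync` interface above; `engine`, `exec_ctx`, the array-like `weight` and `grad`, their `.var()` tags, and `lr` are all assumed for illustration), such an in-place update could be pushed like this:
+
+```c++
+    // grad is only read; weight is updated in place, so only weight.var()
+    // goes into mutate_vars.  All pushed updates of weight stay serialized.
+    engine->PushSync(
+        [&weight, &grad, lr](RunContext rctx) {
+          weight -= grad * lr;          // in-place update, no new weight array
+        },
+        exec_ctx,                       // e.g. the CPU context
+        {grad.var()},                   // const_vars : read only
+        {weight.var()});                // mutate_vars: modified in place
+```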
+
+To create a variable, use the `NewVar()` API.
+To delete a variable, use the `PushDelete` API.
+
+### Push and Wait
+
+*All `Push` APIs are asynchronous.* The API call returns immediately
+regardless of whether the pushed `Fn` is finished or not.
+This allows the engine to start computing at the same time
+as the user thread is pushing functions.
+`Push` APIs are not thread-safe.
+To be specific, only one thread should make engine API calls at a time.
+
+If you want to wait for a specific `Fn` to finish,
+include a callback function in the closure,
+and call the function at the end of your `Fn`.
+
+If you want to wait for all `Fn`s
+that involve (use or mutate) a certain variable to finish,
+use the `WaitForVar(var)` API.
+
+If you want to wait for all pushed `Fn`s to finish,
+use the `WaitForAll()` API.
+
+### Save Object Creation Cost
+
+In some cases, you need to push several functions to the engine for a long period of time.
+If the computation of these functions is light,
+the overhead of copying lambdas and creating use/mutate variable lists becomes relatively high.
+We provide an API to create an `OprHandle` beforehand:
+
+```c++
+    virtual OprHandle NewOperator(AsyncFn fn,
+                                  std::vector<VarHandle> const& const_vars,
+                                  std::vector<VarHandle> const& mutate_vars) = 0;
+```
+You can keep pushing the `OprHandle` without repeatedly creating them:
+
+```c++
+ virtual void Push(OprHandle op, Context exec_ctx) = 0;
+```
+To delete it, call the `DeleteOperator(OprHandle op)` API.
+Ensure that the operator has finished computing before calling this API.
+
+
+## Operators in MXNet
+
+In MXNet, an operator is a class that contains both actual computation logic
+and auxiliary information that can aid the system in performing optimizations,
+like in-place updates and auto-derivatives.
+To understand the remainder of the document,
+we recommend that you familiarize yourself with the `mshadow` library,
+because all operators compute on the tensor-like structure `mshadow::TBlob`
+provided by the system during runtime.
+
+MXNet's operator interface allows you to:
+
+* Reduce memory allocation cost by specifying in-place updates.
+* Hide some internal arguments from Python to make it cleaner.
+* Define the relationships among input tensors and output tensors,
+which allows the system to perform shape checking for you.
+* Acquire additional temporary spaces from the system
+to perform computation (e.g., calling `cudnn` routines).
+
+### Operator Interface
+
+`Forward` is the core operator interface:
+
+```c++
+    virtual void Forward(const OpContext &ctx,
+                         const std::vector<TBlob> &in_data,
+                         const std::vector<OpReqType> &req,
+                         const std::vector<TBlob> &out_data,
+                         const std::vector<TBlob> &aux_states) = 0;
+```
+The `OpContext` structure is:
+
+```c++
+    struct OpContext {
+      int is_train;
+      RunContext run_ctx;
+      std::vector<Resource> requested;
+    };
+```
+It describes whether the operator is in the train or test phase,
+which device the operator should be run on (in `run_ctx`),
+and requested resources (covered in the following sections).
+
+- `in_data` and `out_data` represent the input and output tensors, respectively.
+All of the tensor spaces have been allocated by the system.
+- `req` denotes how the computation results are written into the `out_data`.
+In other words, `req.size() == out_data.size()` and `req[i]`
+corresponds to the write type of `out_data[i]`.
+
+- The `OpReqType` is defined as:
+
+```c++
+ enum OpReqType {
+ kNullOp,
+ kWriteTo,
+ kWriteInplace,
+ kAddTo
+ };
+```
+ Normally, the types of all `out_data` should be `kWriteTo`,
+ meaning that the provided `out_data` tensor is a *raw* memory block,
+ so the operator should write results directly into it.
+ In some cases, for example when calculating the `gradient` tensor,
+ it would be great if we could accumulate the result
+ rather than directly overwrite the tensor contents,
+ so that no extra space needs to be allocated each time.
+ In such a case, the corresponding `req` type is set to `kAddTo`,
+ indicating that a `+=` should be performed (see the sketch after this list).
+
+- `aux_states` is intentionally designed for auxiliary tensors used to help computation. Currently, it is useless.
+
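+To make the `req` semantics concrete, a hypothetical kernel might dispatch on it as sketched below; real MXNet operators typically use helper macros (such as `ASSIGN_DISPATCH`, described later) instead of writing this switch by hand.
+
+```c++
+    // Illustrative only: how a kernel could honor req[i] when producing
+    // `result` for out_data[i] (here written to a tensor `out`).
+    switch (req[i]) {
+      case kNullOp:                        break;  // do not write anything
+      case kWriteTo:
+      case kWriteInplace: out  = result;   break;  // overwrite the output
+      case kAddTo:        out += result;   break;  // accumulate into the output
+    }
+```
+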
+Aside from the `Forward` operator, you could optionally implement the `Backward` interface:
+
+```c++
+    virtual void Backward(const OpContext &ctx,
+                          const std::vector<TBlob> &out_grad,
+                          const std::vector<TBlob> &in_data,
+                          const std::vector<TBlob> &out_data,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<TBlob> &in_grad,
+                          const std::vector<TBlob> &aux_states);
+```
+This interface follows the same design principle as the `Forward` interface,
+except that `out_grad`, `in_data`, and `out_data` are given,
+and the operator computes `in_grad` as the results.
+ The naming strategy is similar to Torch's convention,
+ and can be summarized in following figure:
+
+[input/output semantics figure]
+
+Some operators might not require all of the following:
+`out_grad`, `in_data` and `out_data`.
+You can specify these dependencies with the `DeclareBackwardDependency` interface in `OperatorProperty`.
+
+### Operator Property
+
+One convolution might have several implementations,
+and you might want to switch among them to achieve the best performance.
+Therefore, we separate the operator *semantic* interfaces
+from the implementation interface (`Operator` class)
+into the `OperatorProperty` class.
+The `OperatorProperty` interface consists of:
+
+* **InferShape:**
+
+```c++
+ virtual bool InferShape(mxnet::ShapeVector *in_shape,
+ mxnet::ShapeVector *out_shape,
+ mxnet::ShapeVector *aux_shape) const = 0;
+```
+
+This interface has two purposes:
+* Tell the system the size of each input and output tensor,
+ so it can allocate space for them before the `Forward` and `Backward` call.
+* Perform a size check to make sure that there isn't an obvious error before running.
+ The shape in `in_shape` is set by the system
+ (from the `out_shape` of the previous operators).
+ It returns `false` when there is not enough information
+ to infer shapes or throws an error when the shape is inconsistent.
+
+* **Request Resources:** Operations like `cudnnConvolutionForward` need a work space for computation.
+If the system can manage that, it could then perform optimizations,
+like reuse the space, and so on.
+MXNet defines two interfaces to achieve this:
+
+```c++
+    virtual std::vector<ResourceRequest> ForwardResource(
+        const mxnet::ShapeVector &in_shape) const;
+    virtual std::vector<ResourceRequest> BackwardResource(
+        const mxnet::ShapeVector &in_shape) const;
+```
+ The `ResourceRequest` structure (in `resource.h`) currently contains only a type flag:
+
+```c++
+ struct ResourceRequest {
+ enum Type {
+ kRandom, // get a mshadow::Random object
+ kTempSpace, // request temporary space
+ };
+ Type type;
+ };
+```
+ If `ForwardResource` and `BackwardResource` return non-empty arrays,
+ the system offers the corresponding resources through the `ctx` parameter
+ in the `Forward` and `Backward` interface of `Operator`.
+ Basically, to access those resources, simply write:
+
+```c++
+ auto tmp_space_res = ctx.requested[kTempSpace].get_space(some_shape, some_stream);
+ auto rand_res = ctx.requested[kRandom].get_random(some_stream);
+```
+ For an example, see `src/operator/cudnn_convolution-inl.h`.
+
+* **Backward dependency:** Let's look at two different operator signatures
+(we name all of the arguments for demonstration purposes):
+
+```c++
+ void FullyConnectedForward(TBlob weight, TBlob in_data, TBlob out_data);
+ void FullyConnectedBackward(TBlob weight, TBlob in_data, TBlob out_grad, TBlob in_grad);
+
+ void PoolingForward(TBlob in_data, TBlob out_data);
+ void PoolingBackward(TBlob in_data, TBlob out_data, TBlob out_grad, TBlob in_grad);
+```
+ Note that `out_data` in `FullyConnectedForward`
+ is not used by `FullyConnectedBackward`,
+ while `PoolingBackward` requires all of the arguments of `PoolingForward`.
+ Therefore, for `FullyConnectedForward`,
+ the `out_data` tensor once consumed could be safely freed
+ because the backward function will not need it.
+ This provides a chance for the system to collect some tensors
+ as garbage as soon as possible.
+ To specify this situation, we provide an interface:
+
+```c++
+    virtual std::vector<int> DeclareBackwardDependency(
+        const std::vector<int> &out_grad,
+        const std::vector<int> &in_data,
+        const std::vector<int> &out_data) const;
+```
+ The `int` element of the argument vector is an ID
+ to distinguish different arrays.
+ Let's see how this interface specifies different dependencies
+ for `FullyConnected` and `Pooling`:
+
+ ```c++
+    std::vector<int> FullyConnectedProperty::DeclareBackwardDependency(
+        const std::vector<int> &out_grad,
+        const std::vector<int> &in_data,
+        const std::vector<int> &out_data) const {
+      return {out_grad[0], in_data[0]};  // NOTE: out_data[0] is NOT included
+    }
+    std::vector<int> PoolingProperty::DeclareBackwardDependency(
+        const std::vector<int> &out_grad,
+        const std::vector<int> &in_data,
+        const std::vector<int> &out_data) const {
+      return {out_grad[0], in_data[0], out_data[0]};
+    }
+```
+
+* **In-place Option:** To further save the cost of memory allocation,
+you can use in-place updates.
+They are appropriate for element-wise operations
+when the input tensor and output tensor have the same shape.
+You specify an in-place update with the following interface:
+
+```c++
+    std::vector<std::pair<int, void*>> ElewiseOpProperty::ForwardInplaceOption(
+        const std::vector<int> &in_data,
+        const std::vector<void*> &out_data) const {
+      return { {in_data[0], out_data[0]} };
+    }
+    std::vector<std::pair<int, void*>> ElewiseOpProperty::BackwardInplaceOption(
+        const std::vector<int> &out_grad,
+        const std::vector<int> &in_data,
+        const std::vector<int> &out_data,
+        const std::vector<void*> &in_grad) const {
+      return { {out_grad[0], in_grad[0]} };
+    }
+```
+ This tells the system that the `in_data[0]` and `out_data[0]` tensors could share the same memory spaces during `Forward`, and so do `out_grad[0]` and `in_grad[0]` during `Backward`.
+
+ >**Important:** Even if you use the preceding specification, it's *not* guaranteed that the input and output tensors will share the same space. In fact, this is only a suggestion for the system, which makes the final decision. However, in either case, the decision is completely transparent to you, so the actual `Forward` and `Backward` implementation does not need to consider that.
+
+* **Expose Operator to Python:** Because of the restrictions of C++, you need to implement the following interfaces:
+
+```c++
+    // initialize the property class from a list of key-value string pairs
+    virtual void Init(const vector<pair<string, string>> &kwargs) = 0;
+    // return the parameters in a key-value string map
+    virtual map<string, string> GetParams() const = 0;
+    // return the names of arguments (for generating signature in python)
+    virtual vector<string> ListArguments() const;
+    // return the names of output values
+    virtual vector<string> ListOutputs() const;
+    // return the names of auxiliary states
+    virtual vector<string> ListAuxiliaryStates() const;
+    // return the number of output values
+    virtual int NumOutputs() const;
+    // return the number of visible outputs
+    virtual int NumVisibleOutputs() const;
+```
+
+### Create an Operator from the Operator Property
+
+ `OperatorProperty` includes all *semantic* attributes of an operation. It's also responsible for creating the `Operator` pointer for actual computation.
+
+#### Create Operator
+Implement the following interface in `OperatorProperty`:
+
+```c++
+ virtual Operator* CreateOperator(Context ctx) const = 0;
+```
+For example:
+
+```c++
+ class ConvolutionOp {
+ public:
+ void Forward( ... ) { ... }
+ void Backward( ... ) { ... }
+ };
+ class ConvolutionOpProperty : public OperatorProperty {
+ public:
+ Operator* CreateOperator(Context ctx) const {
+ return new ConvolutionOp;
+ }
+ };
+```
+
+#### Parametrize Operator
+When implementing a convolution operator, you need to know the kernel size,
+the stride size, padding size, and so on.
+These parameters should be passed to the operator
+before any `Forward` or `Backward` interface is called.
+To do so, you could define a `ConvolutionParam` structure, as follows:
+
+```c++
+    #include <dmlc/parameter.h>
+
+    struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
+      mxnet::TShape kernel, stride, pad;
+      uint32_t num_filter, num_group, workspace;
+      bool no_bias;
+    };
+```
+Put it in `ConvolutionOpProperty`, and pass it to the operator class during construction:
+
+```c++
+    class ConvolutionOp {
+     public:
+      ConvolutionOp(ConvolutionParam p): param_(p) {}
+      void Forward( ... ) { ... }
+      void Backward( ... ) { ... }
+     private:
+      ConvolutionParam param_;
+    };
+    class ConvolutionOpProperty : public OperatorProperty {
+     public:
+      void Init(const vector<pair<string, string>>& kwargs) {
+        // initialize param_ using kwargs
+      }
+      Operator* CreateOperator(Context ctx) const {
+        return new ConvolutionOp(param_);
+      }
+     private:
+      ConvolutionParam param_;
+    };
+```
+
+#### Register the Operator Property Class and the Parameter Class to MXNet
+Use the following macros to register the parameter structure and the operator property class to MXNet:
+
+```c++
+ DMLC_REGISTER_PARAMETER(ConvolutionParam);
+ MXNET_REGISTER_OP_PROPERTY(Convolution, ConvolutionOpProperty);
+```
+The first argument is the name string, the second is the property class name.
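+
+Once registered, the operator becomes available by that name from the front ends. For example, in the Python symbolic API it would typically appear as follows (assuming the registration above):
+
+```python
+    import mxnet as mx
+
+    data = mx.symbol.Variable('data')
+    # the registered name "Convolution" becomes the symbol constructor
+    conv = mx.symbol.Convolution(data=data, kernel=(3, 3), num_filter=32)
+```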
+
+### Interface Summary
+
+We've almost covered the entire interface required to define a new operator. Let's do a recap:
+
+* Use the `Operator` interface to write your computation logic (`Forward` and `Backward`).
+* Use the `OperatorProperty` interface to:
+ - Pass the parameter to the operator class (you can use the `Init` interface).
+ - Create an operator using the `CreateOperator` interface.
+ - Correctly implement the operator description interface, such as the names of arguments, etc.
+ - Correctly implement the `InferShape` interface to set the output tensor shape.
+ - [Optional] If additional resources are needed, check `ForwardResource` and `BackwardResource`.
+ - [Optional] If `Backward` doesn't need all of the input and output of `Forward`, check `DeclareBackwardDependency`.
+ - [Optional] If in-place update is supported, check `ForwardInplaceOption` and `BackwardInplaceOption`.
+* Register the `OperatorProperty` class and the parameter class.
+
+## Unifying the NDArray Operator and Symbolic Operator
+NDArray operations are similar to symbolic operations,
+except that sometimes you can't write in place to the operands
+without a complete dependency graph.
+However, the logic underlying NDArray and symbolic operations are almost identical.
+*SimpleOp*, a new unified operator API,
+unifies different invoking processes
+and returns to the fundamental elements of operators.
+Because most mathematical operators attend to one or two operands,
+and more operands make dependency-related optimization useful,
+the unified operator is specifically designed for unary and binary operations.
+
+Consider the elements of an operation.
+Ideally, you need only functions and derivatives
+to describe an operation.
+Let's restrict that to the space of unary and binary operations.
+How do we classify all operations to maximize the possibility
+of in-place write optimization?
+Note that you can separate functions by the number of operands.
+Derivatives are a bit more complex.
+To construct a dependency graph, you need to know whether the output value,
+the input data, or neither is needed alongside the head gradient.
+Gradient functions in the unified API are differentiated
+by the types of operands they take for calculation.
+
+Before you learn more about the SimpleOp interface,
+ we recommend that you review the
+ [mshadow library guide](https://github.com/dmlc/mshadow/tree/master/guide)
+ because calculations will be done in the `mshadow::TBlob` structure.
+
+In the following example, we'll create an operator
+functioning as a smooth l1 loss,
+which is a mixture of l1 loss and l2 loss. The loss itself can be written as:
+
+```
+ loss = outside_weight .* f(inside_weight .* (data - label))
+ grad = outside_weight .* inside_weight .* f'(inside_weight .* (data - label))
+```
+ `.*` stands for element-wise multiplication, and `f` and `f'` are the smooth l1 loss function and its derivative,
+which we assume are available in `mshadow` for now.
+At first glance, it's impossible to implement
+this particular loss as a unary or binary operator.
+But we have automatic differentiation in symbolic execution.
+That simplifies the loss to `f` and `f'` directly.
+This loss is no more complex than a `sin` or an `abs` function,
+and can certainly be implemented as a unary operator.
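+
+For reference, the concrete smooth l1 form we assume in the rest of this example (the standard definition with a scalar `sigma`, matching the `sigma2 = env.scalar * env.scalar` used in the kernels below) is:
+
+```
+    f(x)  = 0.5 * (sigma * x)^2      if |x| < 1 / sigma^2
+          = |x| - 0.5 / sigma^2      otherwise
+    f'(x) = sigma^2 * x              if |x| < 1 / sigma^2
+          = sign(x)                  otherwise
+```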
+
+## SimpleOp: The Unified Operator API
+### Define Shapes
+The `mshadow` library requires explicit memory allocation.
+As a consequence, all data shapes
+must be provided before any calculation occurs.
+ Before we proceed with defining functions and gradient,
+let's check input data shape consistency and provide output shape.
+
+```cpp
+ typedef mxnet::TShape (*UnaryShapeFunction)(const mxnet::TShape& src,
+ const EnvArguments& env);
+ typedef mxnet::TShape (*BinaryShapeFunction)(const mxnet::TShape& lhs,
+ const mxnet::TShape& rhs,
+ const EnvArguments& env);
+```
+You can use `mxnet::TShape` to check input data shape and designate output data shape.
+If you don't define this function, the default output shape is the same as the input shape.
+In the case of a binary operator, the shape of `lhs` and `rhs` is checked as the same by default.
+
+You can also use shape functions to check if any additional arguments and resources are present.
+Refer to the additional usages of `EnvArguments` to accomplish this.
+
+Before we start on our smooth l1 loss example, we define `XPU` to be `cpu` or `gpu` in the header
+`smooth_l1_unary-inl.h` so that we can reuse the same code in `smooth_l1_unary.cc` and
+`smooth_l1_unary.cu`.
+
+```cpp
+    #include <mxnet/operator_util.h>
+    #if defined(__CUDACC__)
+    #define XPU gpu
+    #else
+    #define XPU cpu
+    #endif
+```
+In our smooth l1 loss example, it's okay to use the default behavior whereby the output has the same shape as the source.
+Written explicitly, it is:
+
+```cpp
+ inline mxnet::TShape SmoothL1Shape_(const mxnet::TShape& src,
+ const EnvArguments& env) {
+ return mxnet::TShape(src);
+ }
+```
+
+### Define Functions
+Create a unary or binary function with one output: `mshadow::TBlob`.
+
+```cpp
+ typedef void (*UnaryFunction)(const TBlob& src,
+ const EnvArguments& env,
+ TBlob* ret,
+ OpReqType req,
+ RunContext ctx);
+ typedef void (*BinaryFunction)(const TBlob& lhs,
+ const TBlob& rhs,
+ const EnvArguments& env,
+ TBlob* ret,
+ OpReqType req,
+ RunContext ctx);
+```
+* Functions are differentiated by the types of input arguments.
+* `RunContext ctx` contains information needed during runtime for execution.
+
+```cpp
+    struct RunContext {
+      void *stream;  // the stream of the device, can be NULL or Stream<gpu>* in GPU mode
+      template<typename xpu> inline mshadow::Stream<xpu>* get_stream()  // get mshadow stream from Context
+    }  // namespace mxnet
+```
+ `mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();` is an example of obtaining a stream from `ctx`.
+* `OpReqType req` denotes how computation results are written into `ret`.
+
+```cpp
+ enum OpReqType {
+ kNullOp, // no operation, do not write anything
+ kWriteTo, // write gradient to provided space
+ kWriteInplace, // perform an in-place write
+ kAddTo // add to the provided space
+ };
+```
+ A macro is defined in `operator_util.h` for a simplified use of `OpReqType`.
+ `ASSIGN_DISPATCH(out, req, exp)` checks `req` and performs an assignment.
+
+In our smooth l1 loss example, we use `UnaryFunction` to define the function of this operator.
+
+```cpp
+    template<typename xpu>
+    void SmoothL1Forward_(const TBlob& src,
+                          const EnvArguments& env,
+                          TBlob *ret,
+                          OpReqType req,
+                          RunContext ctx) {
+      using namespace mshadow;
+      using namespace mshadow::expr;
+      mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+      real_t sigma2 = env.scalar * env.scalar;
+      MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+        mshadow::Tensor<xpu, 2, DType> out = ret->get<xpu, 2, DType>(s);
+        mshadow::Tensor<xpu, 2, DType> in = src.get<xpu, 2, DType>(s);
+        ASSIGN_DISPATCH(out, req,
+                        F<mshadow_op::smooth_l1_loss>(in, ScalarExp<DType>(sigma2)));
+      });
+    }
+```
+After obtaining `mshadow::Stream<xpu>` from `RunContext`, we get `mshadow::Tensor<xpu, 2, DType>` from `mshadow::TBlob`.
+`mshadow::F` is a shortcut to initiate a `mshadow` expression. The macro `MSHADOW_TYPE_SWITCH(type, DType, ...)`
+handles details on different types, and the macro `ASSIGN_DISPATCH(out, req, exp)` checks `OpReqType` and
+performs actions accordingly. `sigma2` is a special parameter in this loss, which we will cover later.
+
+### Define Gradients (Optional)
+Create a gradient function with various types of inputs.
+
+```cpp
+ // depending only on out_grad
+ typedef void (*UnaryGradFunctionT0)(const OutputGrad& out_grad,
+ const EnvArguments& env,
+ TBlob* in_grad,
+ OpReqType req,
+ RunContext ctx);
+ // depending only on out_value
+ typedef void (*UnaryGradFunctionT1)(const OutputGrad& out_grad,
+ const OutputValue& out_value,
+ const EnvArguments& env,
+ TBlob* in_grad,
+ OpReqType req,
+ RunContext ctx);
+ // depending only on in_data
+ typedef void (*UnaryGradFunctionT2)(const OutputGrad& out_grad,
+ const Input0& in_data0,
+ const EnvArguments& env,
+ TBlob* in_grad,
+ OpReqType req,
+ RunContext ctx);
+```
+Gradient functions of binary operators have similar structures, except that `Input`, `TBlob`, and `OpReqType`
+are doubled.
+
+`GradFunctionArgument`
+
+ `Input0`, `Input1`, `OutputValue`, and `OutputGrad` all share the structure of `GradFunctionArgument`,
+ which is defined as:
+
+ ```cpp
+ struct GradFunctionArgument {
+     TBlob data;
+ };
+ ```
+
+In our smooth l1 loss example, the gradient `f'(x)` depends on the input `x`,
+so `UnaryGradFunctionT2` is suitable.
+To enable the chain rule of the gradient,
+we also need to multiply `out_grad` from the layer above into the result `in_grad`.
+
+```cpp
+ template<typename xpu>
+ void SmoothL1BackwardUseIn_(const OutputGrad& out_grad,
+                             const Input0& in_data0,
+                             const EnvArguments& env,
+                             TBlob *in_grad,
+                             OpReqType req,
+                             RunContext ctx) {
+   using namespace mshadow;
+   using namespace mshadow::expr;
+   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+   real_t sigma2 = env.scalar * env.scalar;
+   MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
+     mshadow::Tensor<xpu, 2, DType> src = in_data0.data.get<xpu, 2, DType>(s);
+     mshadow::Tensor<xpu, 2, DType> ograd = out_grad.data.get<xpu, 2, DType>(s);
+     mshadow::Tensor<xpu, 2, DType> igrad = in_grad->get<xpu, 2, DType>(s);
+     ASSIGN_DISPATCH(igrad, req,
+                     ograd * F<mshadow_op::smooth_l1_gradient>(src, ScalarExp<DType>(sigma2)));
+   });
+ }
+```
+
+### Register SimpleOp to MXNet
+After creating the shape, function, and gradient, register them as both an NDArray operator and
+a symbolic operator. To simplify this process, use the registration macro defined in `operator_util.h`.
+
+```cpp
+ MXNET_REGISTER_SIMPLE_OP(Name, DEV)
+ .set_shape_function(Shape)
+ .set_function(DEV::kDevMask, Function, SimpleOpInplaceOption)
+ .set_gradient(DEV::kDevMask, Gradient, SimpleOpInplaceOption)
+ .describe("description");
+```
+`SimpleOpInplaceOption` is defined as:
+
+```cpp
+ enum SimpleOpInplaceOption {
+ kNoInplace, // do not allow inplace in arguments
+ kInplaceInOut, // allow inplace in with out (unary)
+ kInplaceOutIn, // allow inplace out_grad with in_grad (unary)
+ kInplaceLhsOut, // allow inplace left operand with out (binary)
+ kInplaceOutLhs // allow inplace out_grad with lhs_grad (binary)
+ };
+```
+
+In our example, we have a gradient function that relies on input data, so the function can't be written in
+place. The output gradient has no purpose after gradient computation, so the gradient can be written in place.
+
+```cpp
+ MXNET_REGISTER_SIMPLE_OP(smooth_l1, XPU)
+ .set_function(XPU::kDevMask, SmoothL1Forward_<XPU>, kNoInplace)
+ .set_gradient(XPU::kDevMask, SmoothL1BackwardUseIn_<XPU>, kInplaceOutIn)
+ .set_enable_scalar(true)
+ .describe("Calculate Smooth L1 Loss(lhs, scalar)");
+```
+Remember from the discussion of shape functions that a default behavior without `set_shape_function` forces the inputs
+(if they're binary) to be the same shape and yield the same shape for output. We'll discuss `set_enable_scalar` later.
+
+### NDArray Operator Summary
+* Create a shape function for determining the output shape.
+* Create a function as the forward routine by choosing a suitable function type.
+* Create a gradient as the backward routine by choosing a suitable gradient type.
+* Register the operator using the registration process.
+
+## Additional Information on SimpleOp
+### Using SimpleOp on EnvArguments
+Some operations might need a scalar as input, such as a gradient scale, a set of keyword arguments
+controlling behavior, or a temporary space to speed up calculations. `EnvArguments` provides additional arguments and resources to make calculations more scalable
+and efficient.
+
+```cpp
+ struct EnvArguments {
+ real_t scalar; // scalar argument, if enabled
+ std::vector<std::pair<std::string, std::string> > kwargs; // keyword arguments
+ std::vector<Resource> resource; // pointer to the resources requested
+ };
+```
+
+More registration parameters are required to enable these additional features. To prevent confusion on parameters, `scalar` and `kwargs`
+can't be present at the same time. To enable `scalar`, use
+`set_enable_scalar(bool enable_scalar)` in registration. Then, in forward functions and gradients, the `scalar` can be accessed from `env.scalar` as in the function parameter `EnvArguments env`.
+
+To enable `kwargs`, use `set_enable_kwargs(bool enable_kwargs)` in registration. Then, in forward
+functions and gradients, additional arguments are contained in `env.kwargs`, which is defined as
+`std::vector<std::pair<std::string, std::string> >`. Use the DMLC parameter structure to
+simplify parsing keyword arguments. For more details, see the [guide on parameter structure](https://github.com/dmlc/dmlc-core/blob/master/doc/parameter.md).
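+
+As a hedged sketch (the struct and field names below are illustrative, not taken from the MXNet source),
+a DMLC parameter structure for parsing `env.kwargs` might look like this:
+
+```cpp
+ #include <dmlc/parameter.h>
+
+ struct SmoothL1Param : public dmlc::Parameter<SmoothL1Param> {
+   float sigma;
+   DMLC_DECLARE_PARAMETER(SmoothL1Param) {
+     DMLC_DECLARE_FIELD(sigma).set_default(1.0f)
+       .describe("Turning point of the smooth l1 loss.");
+   }
+ };
+ DMLC_REGISTER_PARAMETER(SmoothL1Param);  // goes in a .cc file
+
+ // Inside a forward function or gradient:
+ //   SmoothL1Param param;
+ //   param.Init(env.kwargs);  // parses the vector of (key, value) string pairs
+```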
+
+Additional resources like `mshadow::Random` and temporary memory space can also be requested and
+accessed from `EnvArguments.resource`. The registration routine is `set_resource_request(ResourceRequest req)`
+or `set_resource_request(const std::vector<ResourceRequest>)`, where `mxnet::ResourceRequest` is defined as:
+
+```cpp
+ struct ResourceRequest {
+ enum Type { // Resource type, indicating what the pointer type is
+ kRandom, // mshadow::Random object
+ kTempSpace // A dynamic temp space that can be arbitrary size
+ };
+ Type type; // type of resources
+ };
+```
+Registration will request the declared resources from `mxnet::ResourceManager`, and place them
+in `std::vector<Resource> resource` in `EnvArguments`. To access the resources, use the following:
+
+```cpp
+ auto tmp_space_res = env.resource[0].get_space(some_shape, some_stream);
+ auto rand_res = env.resource[0].get_random(some_stream);
+```
+For an example, see `src/operator/loss_binary_op-inl.h`.
+
+In our smooth l1 loss example, a scalar input is needed to mark the turning point of a loss function. Therefore,
+in the registration process, we use `set_enable_scalar(true)`, and use `env.scalar` in function and gradient
+declarations.
+
+### Crafting a Tensor Operation
+Because computation utilizes the `mshadow` library and we sometimes don't have functions readily available, we
+can craft tensor operations in operator implementations. If you define such functions as element-wise, you
+can implement them as a `mxnet::op::mshadow_op`. For example, `src/operator/mshadow_op.h` contains many
+`mshadow_op` definitions. `mshadow_op`s are expression mappers that handle the scalar case of the desired functions. For details, see the
+[mshadow expression API guide](https://github.com/dmlc/mshadow/tree/master/doc).
+
+If an operation can't be done in an element-wise way, like the softmax loss and gradient, then you need to create a new tensor operation: write a `mshadow` function and a `mshadow::cuda`
+function directly. For details, see the `mshadow` library. For an example, see `src/operator/roi_pooling.cc`.
+
+In our smooth l1 loss example, we create two mappers, namely the scalar cases of smooth l1 loss and gradient.
+
+```cpp
+ namespace mshadow_op {
+ struct smooth_l1_loss {
+ // a is x, b is sigma2
+ MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+ if (a > 1.0f / b) {
+ return a - 0.5f / b;
+ } else if (a < -1.0f / b) {
+ return -a - 0.5f / b;
+ } else {
+ return 0.5f * a * a * b;
+ }
+ }
+ };
+ }
+```
+The gradient, which can be found in `src/operator/smooth_l1_unary-inl.h`, is similar.
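+
+For reference, the scalar function realized by the mapper above (with `b` playing the role of `sigma2`,
+written here as \sigma^2) and its gradient are:
+
+```latex
+ f(x) =
+   \begin{cases}
+     0.5\,\sigma^2 x^2    & \text{if } |x| < 1/\sigma^2 \\
+     |x| - 0.5/\sigma^2   & \text{otherwise}
+   \end{cases}
+ \qquad
+ f'(x) =
+   \begin{cases}
+     \sigma^2 x              & \text{if } |x| < 1/\sigma^2 \\
+     \operatorname{sign}(x)  & \text{otherwise}
+   \end{cases}
+```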
+
+### Beyond Two Operands
+The new unified API is designed to fulfill the fundamentals of an operation. For operators with more than two inputs,
+more than one output, or that need more features, see the original [Operator API](http://mxnet.io/architecture/overview.html#operators-in-mxnet).
diff --git a/docs/static_site/src/pages/api/architecture/program_model.md b/docs/static_site/src/pages/api/architecture/program_model.md
new file mode 100644
index 000000000000..be291d24b19e
--- /dev/null
+++ b/docs/static_site/src/pages/api/architecture/program_model.md
@@ -0,0 +1,629 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Deep Learning Programming Paradigm
+category: architecture
+permalink: /api/architecture/program_model
+---
+
+# Deep Learning Programming Paradigm
+
+However much we might ultimately care about performance,
+we first need working code before we can start worrying about optimization.
+Writing clear, intuitive deep learning code can be challenging,
+and the first thing any practitioner must deal with is the language syntax itself.
+Complicating matters, of the many deep learning libraries out there,
+each has its own approach to programming style.
+
+In this document, we focus on two of the most important high-level design decisions:
+1. Whether to embrace the _symbolic_ or _imperative_ paradigm for mathematical computation.
+2. Whether to build networks with bigger (more abstract) or more atomic operations.
+
+Throughout, we'll focus on the programming models themselves.
+When programming style decisions may impact performance, we point this out,
+but we don't dwell on specific implementation details.
+
+
+## Symbolic vs. Imperative Programs
+
+If you are a Python or C++ programmer, then you're already familiar with imperative programs.
+Imperative-style programs perform computation as you run them.
+Most code you write in Python is imperative, as is the following NumPy snippet.
+
+```python
+ import numpy as np
+ a = np.ones(10)
+ b = np.ones(10) * 2
+ c = b * a
+ d = c + 1
+```
+When the program executes ```c = b * a```, it runs the actual numerical computation.
+
+Symbolic programs are a bit different. With symbolic-style programs,
+we first define a (potentially complex) function abstractly.
+When defining the function, no actual numerical computation takes place.
+We define the abstract function in terms of placeholder values.
+Then we can compile the function, and evaluate it given real inputs.
+In the following example, we rewrite the imperative program from above
+as a symbolic-style program:
+
+```python
+ A = Variable('A')
+ B = Variable('B')
+ C = B * A
+ D = C + Constant(1)
+ # compiles the function
+ f = compile(D)
+ d = f(A=np.ones(10), B=np.ones(10)*2)
+```
+As you can see, in the symbolic version, when ```C = B * A``` is executed, no computation occurs.
+Instead, this operation generates a _computation graph_ (also called a _symbolic graph_)
+that represents the computation.
+The following figure shows a computation graph to compute ```D```.
+
+![Comp Graph](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph.png)
+
+Most symbolic-style programs contain, either explicitly or implicitly, a *compile* step.
+This converts the computation graph into a function that we can later call.
+In the above example, numerical computation only occurs in the last line of code.
+The defining characteristic of symbolic programs is their clear separation
+between building the computation graph and executing it.
+For neural networks, we typically define the entire model as a single compute graph.
+
+Among other popular deep learning libraries, Torch, Chainer, and Minerva embrace the imperative style.
+Examples of symbolic-style deep learning libraries include Theano, CGT, and TensorFlow.
+We might also view libraries like CXXNet and Caffe, which rely on configuration files, as symbolic-style libraries.
+In this interpretation, we'd consider the content of the configuration file as defining the computation graph.
+
+Now that you understand the difference between these two programming models, let's compare the advantages of each.
+
+
+### Imperative Programs Tend to be More Flexible
+
+When you're using an imperative-style library from Python, you are writing in Python.
+Nearly anything that would be intuitive to write in Python, you can accelerate by calling down to the imperative deep learning library in the appropriate places.
+On the other hand, when you write a symbolic program, you may not have access to all the familiar Python constructs, like iteration.
+Consider the following imperative program, and think about how you can translate this into a symbolic program.
+
+```python
+ a = 2
+ b = a + 1
+ d = np.zeros(10)
+ for i in range(len(d)):
+ d += np.zeros(10)
+```
+This wouldn't be so easy if the Python for-loop weren't supported by the symbolic API.
+When you write a symbolic program in Python, you're *not* writing in Python.
+Instead, you're writing in a domain-specific language (DSL) defined by the symbolic API.
+The symbolic APIs found in deep learning libraries
+are powerful DSLs that generate callable computation graphs for neural networks.
+
+
+Intuitively, you might say that imperative programs
+are more *native* than symbolic programs.
+It's easier to use native language features.
+For example, it's straightforward to print out the values
+in the middle of computation or to use native control flow and loops
+at any point in the flow of computation.
+
+### Symbolic Programs Tend to be More Efficient
+
+As we've seen, imperative programs tend to be flexible
+and fit nicely into the programming flow of a host language.
+So you might wonder, why do so many deep learning libraries
+embrace the symbolic paradigm?
+The main reason is efficiency, both in terms of memory and speed.
+Let's revisit our toy example from before.
+
+```python
+ import numpy as np
+ a = np.ones(10)
+ b = np.ones(10) * 2
+ c = b * a
+ d = c + 1
+ ...
+```
+
+![Comp Graph](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph.png)
+
+Assume that each cell in the array occupies 8 bytes of memory.
+How much memory do you need to execute this program in the Python console?
+
+As an imperative program we need to allocate memory at each line.
+That leaves us allocating 4 arrays of size 10.
+So we'll need `4 * 10 * 8 = 320` bytes.
+On the other hand, if we built a computation graph,
+and knew in advance that we only needed `d`,
+we could reuse the memory originally allocated for intermediate values.
+For example, by performing computations in-place,
+we might recycle the bits allocated for ```b``` to store `c`.
+And we might recycle the bits allocated for `c` to store `d`.
+In the end we could cut our memory requirement in half,
+requiring just `2 * 10 * 8 = 160` bytes.
+
+Symbolic programs are more *restricted*.
+When we call `compile` on D, we tell the system
+that only the value of `d` is needed.
+The intermediate values of the computation,
+in this case ```c```, are then invisible to us.
+
+We benefit because the symbolic programs
+can then safely reuse the memory for in-place computation.
+But on the other hand, if we later decide that we need to access `c`, we're out of luck.
+So imperative programs are better prepared to encounter all possible demands.
+If we ran the imperative version of the code in a Python console,
+we could inspect any of the intermediate variables in the future.
+
+
+
+Symbolic programs can also perform another kind of optimization, called operation folding.
+Returning to our toy example, the multiplication and addition operations
+can be folded into one operation, as shown in the following graph.
+If the computation runs on a GPU processor,
+one GPU kernel will be executed, instead of two.
+In fact, this is one way we hand-craft operations
+in optimized libraries, such as CXXNet and Caffe.
+Operation folding improves computation efficiency.
+
+![Comp Graph Folded](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph_fold.png)
+
+Note, you can't perform operation folding in imperative programs,
+because the intermediate values might be referenced in the future.
+Operation folding is possible in symbolic programs
+because you get the entire computation graph,
+and a clear specification of which values will be needed and which are not.
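+
+To make the idea concrete, here is a hedged sketch (in C++, and not code from any of the libraries
+mentioned) of what a folded kernel does: rather than one pass for the multiply and another for the add,
+with a temporary buffer for `c`, a single loop computes `d = b * a + 1` directly.
+
+```cpp
+ #include <cstddef>
+
+ // One traversal of the data, no intermediate buffer for c.
+ void FusedMulAddOne(const float *a, const float *b, float *d, std::size_t n) {
+   for (std::size_t i = 0; i < n; ++i) {
+     d[i] = b[i] * a[i] + 1.0f;
+   }
+ }
+```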
+
+
+### Case Study: Backprop and AutoDiff
+
+In this section, we compare the two programming models
+on the problem of auto differentiation, or backpropagation.
+Differentiation is of vital importance in deep learning
+because it's the mechanism by which we train our models.
+In any deep learning model, we define a _loss function_.
+A _loss function_ measures how far the model is from the desired output.
+We then typically pass over training examples (pairs of inputs and ground-truth outputs).
+At each step we update the model's _parameters_ to minimize the loss.
+To determine the direction in which to update the parameters,
+we need to take the derivative of the loss function with respect to the parameters.
+
+In the past, whenever someone defined a new model,
+they had to work out the derivative calculations by hand.
+While the math is reasonably straightforward,
+for complex models, it can be time-consuming and tedious work.
+All modern deep learning libraries make the practitioner/researcher's job
+much easier, by automatically solving the problem of gradient calculation.
+
+Both imperative and symbolic programs can perform gradient calculation.
+So let's take a look at how you might perform automatic differentiation with each.
+
+Let's start with imperative programs.
+The following example Python code performs automatic differentiation using our toy example:
+
+```python
+ class array(object):
+ """Simple Array object that support autodiff."""
+ def __init__(self, value, name=None):
+ self.value = value
+ if name:
+ self.grad = lambda g : {name : g}
+
+ def __add__(self, other):
+ assert isinstance(other, int)
+ ret = array(self.value + other)
+ ret.grad = lambda g : self.grad(g)
+ return ret
+
+ def __mul__(self, other):
+ assert isinstance(other, array)
+ ret = array(self.value * other.value)
+ def grad(g):
+ x = self.grad(g * other.value)
+ x.update(other.grad(g * self.value))
+ return x
+ ret.grad = grad
+ return ret
+
+ # some examples
+ a = array(1, 'a')
+ b = array(2, 'b')
+ c = b * a
+ d = c + 1
+ print(d.value)
+ print(d.grad(1))
+ # Results
+ # 3
+ # {'a': 2, 'b': 1}
+```
+
+In this code, each array object contains a grad function (it is actually a closure).
+When you run ```d.grad```, it recursively invokes the grad function of its inputs,
+backprops the gradient value back, and
+returns the gradient value of each input.
+
+This might look a bit complicated, so let's consider
+the gradient calculation for symbolic programs.
+The following program performs symbolic gradient calculation for the same task.
+
+```python
+ A = Variable('A')
+ B = Variable('B')
+ C = B * A
+ D = C + Constant(1)
+ # get gradient node.
+ gA, gB = D.grad(wrt=[A, B])
+ # compiles the gradient function.
+ f = compile([gA, gB])
+ grad_a, grad_b = f(A=np.ones(10), B=np.ones(10)*2)
+```
+
+The grad function of ```D``` generates a backward computation graph,
+and returns a gradient node, ```gA, gB```,
+which correspond to the red nodes in the following figure.
+
+![Comp Graph Folded](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph_backward.png)
+
+The imperative program actually does the same thing as the symbolic program.
+It implicitly saves a backward computation graph in the grad closure.
+When you invoke ```d.grad```, you start from ```d(D)```,
+backtrack through the graph to compute the gradient, and collect the results.
+
+The gradient calculations in both symbolic
+and imperative programming follow the same pattern.
+What's the difference then?
+Recall the *be prepared to encounter all possible demands* requirement of imperative programs.
+If you are creating an array library that supports automatic differentiation,
+you have to keep the grad closure along with the computation.
+This means that none of the history variables can be
+garbage-collected because they are referenced by variable `d` by way of function closure.
+
+What if you want to compute only the value of `d`,
+and don't want the gradient value?
+In symbolic programming, you declare this with `f=compile([D])`.
+This also declares the boundary of computation,
+telling the system that you want to compute only the forward pass.
+As a result, the system can free the memory of previous results,
+and share the memory between inputs and outputs.
+
+Imagine running a deep neural network with ```n``` layers.
+If you are running only the forward pass,
+not the backward (gradient) pass,
+you need to allocate only two copies of
+temporal space to store the values of the intermediate layers,
+instead of ```n``` copies of them.
+However, because imperative programs need to be prepared
+to encounter all possible demands of getting the gradient,
+they have to store the intermediate values,
+which requires ```n``` copies of temporal space.
+
+As you can see, the level of optimization depends
+on the restrictions on what you can do.
+Symbolic programs ask you to clearly specify
+these restrictions when you compile the graph.
+On the other hand, imperative programs
+must be prepared for a wider range of demands.
+Symbolic programs have a natural advantage
+because they know more about what you do and don't want.
+
+There are ways in which we can modify imperative programs
+to incorporate similar restrictions.
+For example, one solution to the preceding
+problem is to introduce a context variable.
+You can introduce a no-gradient context variable
+to turn gradient calculation off.
+
+```python
+ with context.NoGradient():
+ a = array(1, 'a')
+ b = array(2, 'b')
+ c = b * a
+ d = c + 1
+```
+
+
+
+However, this example still must be prepared to encounter all possible demands,
+which means that you can't perform the in-place calculation
+to reuse memory in the forward pass (a trick commonly used to reduce GPU memory usage).
+The techniques we've discussed generate an explicit backward pass.
+Some of the libraries such as Caffe and CXXNet perform backprop implicitly on the same graph.
+The approach we've discussed in this section also applies to them.
+
+Most configuration-file-based libraries,
+such as CXXNet and Caffe are designed
+to meet one or two generic requirements:
+get the activation of each layer,
+or get the gradient of all of the weights.
+These libraries have the same problem:
+the more generic operations the library has to support,
+the less optimization (memory sharing) you can do,
+based on the same data structure.
+
+As you can see, the trade-off between restriction
+and flexibility is the same for most cases.
+
+### Model Checkpoint
+
+It's important to be able to save a model and load it back later.
+There are different ways to *save* your work.
+Normally, to save a neural network,
+you need to save two things: a net configuration
+for the structure of the neural network and the weights of the neural network.
+
+The ability to check the configuration is a plus for symbolic programs.
+Because the symbolic construction phase does not perform computation,
+you can directly serialize the computation graph, and load it back later.
+This solves the problem of saving the configuration
+without introducing an additional layer.
+
+```python
+ A = Variable('A')
+ B = Variable('B')
+ C = B * A
+ D = C + Constant(1)
+ D.save('mygraph')
+ ...
+ D2 = load('mygraph')
+ f = compile([D2])
+ # more operations
+ ...
+```
+
+Because an imperative program executes as it describes the computation,
+you have to save the code itself as the ```configuration```,
+or build another configuration layer on top of the imperative language.
+
+### Parameter Updates
+
+Most symbolic programs are data flow (computation) graphs.
+Data flow graphs describe computation.
+But it's not obvious how to use graphs to describe parameter updates.
+That's because parameter updates introduce mutation,
+which is not a data flow concept.
+Most symbolic programs introduce a special update statement
+to update persistent state in the programs.
+
+It's usually easier to write parameter updates in an imperative style,
+especially when you need multiple updates that relate to each other.
+For symbolic programs, the update statement is also executed as you call it.
+So in that sense, most symbolic deep learning libraries
+fall back on the imperative approach to perform updates,
+while using the symbolic approach to perform gradient calculation.
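+
+As a hedged sketch (in C++, purely illustrative and not tied to any particular library), an imperative
+parameter update is just an in-place mutation of persistent state, which is awkward to express as a
+data flow graph:
+
+```cpp
+ #include <cstddef>
+
+ // Plain SGD step: mutate the weights in place using the gradients.
+ void SgdUpdate(float *weights, const float *grads, std::size_t n, float lr) {
+   for (std::size_t i = 0; i < n; ++i) {
+     weights[i] -= lr * grads[i];
+   }
+ }
+```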
+
+### There Is No Strict Boundary
+
+In comparing the two programming styles,
+some of our arguments might not be strictly true,
+i.e., it's possible to make an imperative program
+more like a traditional symbolic program or vice versa.
+However, the two archetypes are useful abstractions,
+especially for understanding the differences between deep learning libraries.
+We might reasonably conclude that there is no clear boundary between programming styles.
+For example, you can create a just-in-time (JIT) compiler in Python
+to compile imperative Python programs,
+which provides some of the advantages of global
+information held in symbolic programs.
+
+
+## Big vs. Small Operations
+
+When designing a deep learning library, another important programming model decision
+is precisely what operations to support.
+In general, there are two families of operations supported by most deep learning libraries:
+
+- Big operations - typically for computing neural network layers (e.g. FullyConnected and BatchNormalize).
+- Small operations - mathematical functions like matrix multiplication and element-wise addition.
+
+Libraries like CXXNet and Caffe support layer-level operations.
+Libraries like Theano and Minerva support fine-grained operations.
+
+### Smaller Operations Can Be More Flexible
+It's quite natural to use smaller operations to compose bigger operations.
+For example, the sigmoid unit can simply be composed of division, addition and an exponentiation:
+
+```python
+ sigmoid(x) = 1.0 / (1.0 + exp(-x))
+```
+Using smaller operations as building blocks, you can express nearly anything you want.
+If you're more familiar with CXXNet- or Caffe-style layers,
+note that these operations don't differ from a layer, except that they are smaller.
+
+```python
+ SigmoidLayer(x) = EWiseDivisionLayer(1.0, AddScalarLayer(ExpLayer(-x), 1.0))
+```
+This expression composes three layers,
+with each defining its forward and backward (gradient) function.
+Using smaller operations gives you the advantage of building new layers quickly,
+because you only need to compose the components.
+
+### Big Operations Are More Efficient
+Directly composing sigmoid layers requires three layers of operation, instead of one.
+
+```python
+ SigmoidLayer(x) = EWiseDivisionLayer(1.0, AddScalarLayer(ExpLayer(-x), 1.0))
+```
+This code creates overhead for computation and memory (which could be optimized, with cost).
+
+Libraries like CXXNet and Caffe take a different approach.
+To support coarse-grained operations,
+such as BatchNormalization and SigmoidLayer, directly,
+the calculation kernel for each layer is hand crafted
+with one or only a few CUDA kernel launches.
+This makes these implementations more efficient.
+
+### Compilation and Optimization
+
+Can small operations be optimized? Of course, they can.
+Let's look at the system optimization part of the compilation engine.
+Two types of optimization can be performed on the computation graph:
+
+- Memory allocation optimization, to reuse the memory of the intermediate computations.
+- Operator fusion, to detect sub-graph patterns, such as the sigmoid, and fuse them into a bigger operation kernel.
+
+Memory allocation optimization isn't restricted to small-operation graphs.
+You can use it with bigger-operation graphs, too.
+However, this optimization might not be essential
+for bigger-operation libraries like CXXNet and Caffe,
+because there is no explicit compilation step in them.
+Instead, these libraries contain an implicit (and simple) ```compilation step```
+that basically translates the layers into a fixed forward and
+backprop execution plan, by running each operation one by one.
+
+For computation graphs with smaller operations,
+these optimizations are crucial to performance.
+Because the operations are small,
+there are many sub-graph patterns that can be matched.
+Also, because the final, generated operations
+might not be enumerable,
+an explicit recompilation of the kernels is required,
+as opposed to the fixed amount of precompiled kernels
+in the big operation libraries.
+This creates compilation overhead for the symbolic libraries
+that support small operations.
+Requiring compilation optimization also creates engineering overhead
+for the libraries that solely support smaller operations.
+
+As in the case of symbolic vs. imperative,
+the bigger operation libraries "cheat"
+by asking you to provide restrictions (to the common layer),
+so that you actually perform the sub-graph matching.
+This moves the compilation overhead to the real brain, which is usually not too bad.
+
+### Expression Template and Statically Typed Language
+There is always a need to write small operations and compose them into bigger blocks.
+Libraries like Caffe use hand-crafted kernels to build these bigger blocks;
+otherwise, you would have to compose the smaller operations from Python.
+
+There's a third choice that works pretty well.
+This is called the expression template.
+Basically, you use template programming to
+generate generic kernels from an expression tree at compile time.
+For details, see [Expression Template Tutorial](https://github.com/dmlc/mshadow/blob/master/guide/exp-template/README.md).
+CXXNet makes extensive use of an expression template,
+which enables creating much shorter and more readable code that matches
+the performance of hand-crafted kernels.
+
+The difference between using an expression template and Python kernel generation
+is that expression evaluation is done at compile time for C++ with an existing type,
+so there is no additional runtime overhead.
+In principle, this is also possible with other statically typed languages that support templates,
+but we've seen this trick used only in C++.
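+
+As a hedged illustration (a minimal toy, not mshadow's actual implementation), the core trick looks
+like this: `operator+` builds a lightweight expression object instead of computing anything, and the
+whole expression is evaluated in a single fused loop when it is assigned to a vector.
+
+```cpp
+ #include <cstddef>
+ #include <iostream>
+ #include <vector>
+
+ // Expression node representing lhs + rhs; evaluation is deferred until assignment.
+ template <typename Lhs, typename Rhs>
+ struct AddExpr {
+   const Lhs& lhs;
+   const Rhs& rhs;
+   float Eval(std::size_t i) const { return lhs.Eval(i) + rhs.Eval(i); }
+ };
+
+ struct Vec {
+   std::vector<float> data;
+   explicit Vec(std::size_t n, float v = 0.0f) : data(n, v) {}
+   float Eval(std::size_t i) const { return data[i]; }
+   // Assigning an expression runs one fused loop over all elements (no temporaries).
+   template <typename E>
+   Vec& operator=(const E& e) {
+     for (std::size_t i = 0; i < data.size(); ++i) data[i] = e.Eval(i);
+     return *this;
+   }
+ };
+
+ template <typename Lhs, typename Rhs>
+ AddExpr<Lhs, Rhs> operator+(const Lhs& lhs, const Rhs& rhs) {
+   return AddExpr<Lhs, Rhs>{lhs, rhs};
+ }
+
+ int main() {
+   Vec a(5, 1.0f), b(5, 2.0f), c(5, 3.0f), out(5);
+   out = a + b + c;  // one loop computes a[i] + b[i] + c[i]; no intermediate vectors
+   std::cout << out.data[0] << std::endl;  // prints 6
+ }
+```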
+
+Expression template libraries create a middle ground between Python operations
+and hand-crafted big kernels by allowing C++ users to craft efficient big
+operations by composing smaller operations. It's an option worth considering.
+
+## Mix the Approaches
+
+Now that we've compared the programming models, which one should you choose?
+Before delving into that, we should emphasize that depending on the problems you're trying to solve,
+our comparison might not necessarily have a big impact.
+
+Remember [Amdahl's law](https://en.wikipedia.org/wiki/Amdahl%27s_law):
+If you are optimizing a non-performance-critical part of your problem,
+you won't get much of a performance gain.
+
+As you've seen, there usually is a trade-off between efficiency,
+flexibility, and engineering complexity.
+The more suitable programming style depends on the problem you are trying to solve.
+For example, imperative programs are better for parameter updates,
+and symbolic programs for gradient calculation.
+
+We advocate *mixing* the approaches.
+Sometimes the part that we want to be flexible
+isn't crucial to performance.
+In these cases, it's okay to leave some efficiency on the table
+to support more flexible interfaces.
+In machine learning, combining methods usually works better than using just one.
+
+If you can combine the programming models correctly,
+you can get better results than when using a single programming model.
+In this section, we discuss how to do so.
+
+### Symbolic and Imperative Programs
+There are two ways to mix symbolic and imperative programs:
+
+- Use imperative programs within symbolic programs as callbacks
+- Use symbolic programs as part of imperative programs
+
+We've observed that it's usually helpful to write parameter updates imperatively,
+and perform gradient calculations in symbolic programs.
+
+Symbolic libraries already mix programs because Python itself is imperative.
+For example, the following program mixes the symbolic approach with NumPy, which is imperative.
+
+```python
+ A = Variable('A')
+ B = Variable('B')
+ C = B * A
+ D = C + Constant(1)
+ # compiles the function
+ f = compile(D)
+ d = f(A=np.ones(10), B=np.ones(10)*2)
+ d = d + 1.0
+```
+The symbolic graphs are compiled into a function that can be executed imperatively.
+The internals are a black box to the user.
+This is exactly like writing C++ programs and exposing them to Python, which we commonly do.
+
+Because parameter memory resides on the GPU,
+you might not want to use NumPy as an imperative component.
+A better choice might be to support a GPU-compatible imperative library
+that interacts with symbolically compiled functions,
+or to provide a limited amount of update syntax
+in the update statement of symbolic program execution.
+
+### Small and Big Operations
+
+There might be a good reason to combine small and big operations.
+Consider applications that perform tasks such as changing
+a loss function or adding a few customized layers to an existing structure.
+Usually, you can use big operations to compose existing
+components, and use smaller operations to build the new parts.
+
+Recall Amdahl's law. Often, the new components
+are not the cause of the computation bottleneck.
+Because the performance-critical part is already optimized by
+the bigger operations, it's okay to forgo optimizing the additional small operations,
+or to do a limited amount of memory optimization instead
+of operation fusion, and simply run them directly.
+
+### Choose Your Own Approach
+
+In this document, we compared multiple approaches
+to developing programming environments for deep learning.
+We compared both the usability and efficiency implications of each,
+finding that many of these trade-offs (like imperative vs. symbolic) aren't necessarily black and white.
+You can choose your approach, or combine the approaches
+to create more interesting and intelligent deep learning libraries.
+
+## Contribute to MXNet
+
+This document is part of our effort to provide [open-source system design notes](index.md)
+for deep learning libraries. If you're interested in contributing to _MXNet_ or its
+documentation, [fork us on GitHub](http://github.com/dmlc/mxnet).
+
+## Next Steps
+
+* [Dependency Engine for Deep Learning](http://mxnet.io/architecture/note_engine.html)
+* [Squeeze the Memory Consumption of Deep Learning](http://mxnet.io/architecture/note_memory.html)
+* [Efficient Data Loading Module for Deep Learning](http://mxnet.io/architecture/note_data_loading.html)
+* [Survey of RNN Interface](http://mxnet.io/architecture/rnn_interface.html)
diff --git a/docs/static_site/src/pages/api/clojure/docs/tutorials/index.md b/docs/static_site/src/pages/api/clojure/docs/tutorials/index.md
new file mode 100644
index 000000000000..a424c87b335a
--- /dev/null
+++ b/docs/static_site/src/pages/api/clojure/docs/tutorials/index.md
@@ -0,0 +1,23 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_landing_tutorials
+title: Clojure Tutorials
+action: Get Started
+tag: clojure
+permalink: /api/clojure/docs/tutorials
+---
diff --git a/docs/static_site/src/pages/api/clojure/docs/tutorials/kvstore.md b/docs/static_site/src/pages/api/clojure/docs/tutorials/kvstore.md
new file mode 100644
index 000000000000..96eb03abdeb7
--- /dev/null
+++ b/docs/static_site/src/pages/api/clojure/docs/tutorials/kvstore.md
@@ -0,0 +1,109 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: KVStore API
+is_tutorial: true
+permalink: /api/clojure/docs/tutorials/kvstore
+tag: clojure
+---
+# KVStore API
+
+Topics:
+
+* [Basic Push and Pull](#basic-push-and-pull)
+* [List Key-Value Pairs](#list-key-value-pairs)
+* [API Reference](http://mxnet.incubator.apache.org/api/clojure/docs/org.apache.clojure-mxnet.kvstore.html)
+
+To follow along with this documentation, you can use this namespace with the needed requires:
+
+```clojure
+(ns docs.kvstore
+ (:require [org.apache.clojure-mxnet.kvstore :as kvstore]
+ [org.apache.clojure-mxnet.ndarray :as ndarray]
+ [org.apache.clojure-mxnet.context :as context]))
+```
+
+## Basic Push and Pull
+
+KVStore provides basic push and pull operations over multiple devices (GPUs) on a single machine.
+
+### Initialization
+
+Let's consider a simple example. It initializes
+a (`string`, `NDArray`) pair in the store, and then pulls the value out.
+
+```clojure
+(def kv (kvstore/create "local")) ;; create a local kvstore
+(def shape [2 3])
+;;; init the kvstore with a vector of keys (strings) and ndarrays
+(kvstore/init kv ["3"] [(ndarray/* (ndarray/ones shape) 2)])
+(def a (ndarray/zeros shape))
+(kvstore/pull kv ["3"] [a])
+(ndarray/->vec a) ;=> [2.0 2.0 2.0 2.0 2.0 2.0]
+```
+
+### Push, Aggregation, and Updater
+
+For any key that's been initialized, you can push a new value with the same shape to the key, as follows:
+
+```clojure
+(kvstore/push kv ["3"] [(ndarray/* (ndarray/ones shape) 8)])
+(kvstore/pull kv ["3"] [a])
+(ndarray/->vec a);=>[8.0 8.0 8.0 8.0 8.0 8.0]
+```
+
+The data that you want to push can be stored on any device. Furthermore, you can push multiple
+values into the same key, where KVStore first sums all of these
+values, and then pushes the aggregated value, as follows (here we use multiple CPUs):
+
+```clojure
+(def cpus [(context/cpu 0) (context/cpu 1) (context/cpu 2)])
+(def b [(ndarray/ones shape {:ctx (nth cpus 0)})
+ (ndarray/ones shape {:ctx (nth cpus 1)})
+ (ndarray/ones shape {:ctx (nth cpus 2)})])
+(kvstore/push kv ["3" "3" "3"] b)
+(kvstore/pull kv "3" a)
+(ndarray/->vec a) ;=> [3.0 3.0 3.0 3.0 3.0 3.0]
+```
+
+
+### Pull
+
+You've already seen how to pull a single key-value pair. Similar to the way that you use the push command, you can
+pull the value into several devices with a single call.
+
+```clojure
+(def b [(ndarray/ones shape {:ctx (context/cpu 0)})
+ (ndarray/ones shape {:ctx (context/cpu 1)})])
+(kvstore/pull kv ["3" "3"] b)
+(map ndarray/->vec b) ;=> ([3.0 3.0 3.0 3.0 3.0 3.0] [3.0 3.0 3.0 3.0 3.0 3.0])
+```
+
+## List Key-Value Pairs
+
+All of the operations that we've discussed so far are performed on a single key. KVStore also provides
+the interface for generating a list of key-value pairs. For a single device, use the following:
+
+```clojure
+(def ks ["5" "7" "9"])
+(kvstore/init kv ks [(ndarray/ones shape) (ndarray/ones shape) (ndarray/ones shape)])
+(kvstore/push kv ks [(ndarray/ones shape) (ndarray/ones shape) (ndarray/ones shape)])
+(def b [(ndarray/zeros shape) (ndarray/zeros shape) (ndarray/zeros shape)])
+(kvstore/pull kv ks b)
+(map ndarray/->vec b);=> ([1.0 1.0 1.0 1.0 1.0 1.0] [1.0 1.0 1.0 1.0 1.0 1.0] [1.0 1.0 1.0 1.0 1.0 1.0])
+```
diff --git a/docs/static_site/src/pages/api/clojure/docs/tutorials/module.md b/docs/static_site/src/pages/api/clojure/docs/tutorials/module.md
new file mode 100644
index 000000000000..013e6ec44820
--- /dev/null
+++ b/docs/static_site/src/pages/api/clojure/docs/tutorials/module.md
@@ -0,0 +1,259 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Module API
+is_tutorial: true
+tag: clojure
+permalink: /api/clojure/docs/tutorials/module
+---
+
+# Module API
+The module API provides an intermediate and high-level interface for performing computation with neural networks in MXNet. A module wraps a Symbol and one or more Executors.
+
+
+Topics:
+
+* [Prepare the Data](#prepare-the-data)
+* [Preparing a Module for Computation](#preparing-a-module-for-computation)
+* [Training and Predicting](#training-and-predicting)
+* [Saving and Loading](#saving-and-loading)
+* [API Reference](http://mxnet.incubator.apache.org/api/clojure/docs/org.apache.clojure-mxnet.module.html)
+
+
+To follow along with this documentation, you can use this namespace with the needed requires:
+
+```clojure
+(ns docs.module
+ (:require [clojure.java.io :as io]
+ [clojure.java.shell :refer [sh]]
+ [org.apache.clojure-mxnet.eval-metric :as eval-metric]
+ [org.apache.clojure-mxnet.io :as mx-io]
+ [org.apache.clojure-mxnet.module :as m]
+ [org.apache.clojure-mxnet.symbol :as sym]
+ [org.apache.clojure-mxnet.ndarray :as ndarray]))
+```
+
+## Prepare the Data
+
+In this example, we are going to use the MNIST data set. If you have cloned the MXNet repo and are in the `contrib/clojure-package` directory, you can run some helper scripts to download the data.
+
+```clojure
+(def data-dir "data/")
+
+(when-not (.exists (io/file (str data-dir "train-images-idx3-ubyte")))
+ (sh "../../scripts/get_mnist_data.sh"))
+```
+
+MXNet provides functions in the `io` namespace to load the MNIST dataset into training and test data iterators that we can use with our module.
+
+```clojure
+(def train-data (mx-io/mnist-iter {:image (str data-dir "train-images-idx3-ubyte")
+ :label (str data-dir "train-labels-idx1-ubyte")
+ :label-name "softmax_label"
+ :input-shape [784]
+ :batch-size 10
+ :shuffle true
+ :flat true
+ :silent false
+ :seed 10}))
+
+(def test-data (mx-io/mnist-iter {:image (str data-dir "t10k-images-idx3-ubyte")
+ :label (str data-dir "t10k-labels-idx1-ubyte")
+ :input-shape [784]
+ :batch-size 10
+ :flat true
+ :silent false}))
+```
+
+
+## Preparing a Module for Computation
+
+To construct a module, we need a symbol as input. This symbol takes the input data in the first layer, passes it through subsequent fully connected and ReLU activation layers, and ends with a softmax layer for output.
+
+```clojure
+(let [data (sym/variable "data")
+ fc1 (sym/fully-connected "fc1" {:data data :num-hidden 128})
+ act1 (sym/activation "relu1" {:data fc1 :act-type "relu"})
+ fc2 (sym/fully-connected "fc2" {:data act1 :num-hidden 64})
+ act2 (sym/activation "relu2" {:data fc2 :act-type "relu"})
+ fc3 (sym/fully-connected "fc3" {:data act2 :num-hidden 10})
+ out (sym/softmax-output "softmax" {:data fc3})]
+ out)
+ ;=>#object[org.apache.mxnet.Symbol 0x1f43a406 "org.apache.mxnet.Symbol@1f43a406"]
+```
+
+You can also write this with the `as->` threading macro.
+
+```clojure
+(def out (as-> (sym/variable "data") data
+ (sym/fully-connected "fc1" {:data data :num-hidden 128})
+ (sym/activation "relu1" {:data data :act-type "relu"})
+ (sym/fully-connected "fc2" {:data data :num-hidden 64})
+ (sym/activation "relu2" {:data data :act-type "relu"})
+ (sym/fully-connected "fc3" {:data data :num-hidden 10})
+ (sym/softmax-output "softmax" {:data data})))
+;=> #'tutorial.module/out
+```
+
+
+By default, `context` is the CPU. If you need data parallelization, you can specify a GPU context or an array of GPU contexts like this `(m/module out {:contexts [(context/gpu)]})`
+
+Before you can compute with a module, you need to call `bind` to allocate the device memory and `init-params` or `set-params` to initialize the parameters. If you simply want to fit a module, you don’t need to call `bind` and `init-params` explicitly, because the `fit` function automatically calls them if they are needed.
+
+```clojure
+(let [mod (m/module out)]
+ (-> mod
+ (m/bind {:data-shapes (mx-io/provide-data train-data)
+ :label-shapes (mx-io/provide-label train-data)})
+ (m/init-params)))
+```
+
+Now you can compute with the module using functions like `forward`, `backward`, etc.
+
+## Training and Predicting
+
+Modules provide high-level APIs for training, predicting, and evaluating. To fit a module, call the `fit` function with some data iterators:
+
+```clojure
+(def mod (m/fit (m/module out) {:train-data train-data :eval-data test-data :num-epoch 1}))
+;; Epoch 0 Train- [accuracy 0.12521666]
+;; Epoch 0 Time cost- 8392
+;; Epoch 0 Validation- [accuracy 0.2227]
+```
+
+You can pass in batch-end callbacks using `batch-end-callback` and epoch-end callbacks using `epoch-end-callback` in the `fit-params`. You can also set other parameters in the `fit-params`, such as the optimizer and the eval-metric. To learn more about `fit-params`, see the `fit-params` function options. To predict with a module, call `predict` with a DataIter:
+
+```clojure
+(def results (m/predict mod {:eval-data test-data}))
+(first results) ;=>#object[org.apache.mxnet.NDArray 0x3540b6d3 "org.apache.mxnet.NDArray@a48686ec"]
+
+(first (ndarray/->vec (first results))) ;=>0.08261358
+```
+
+The module collects and returns all of the prediction results. For more details about the format of the return values, see the documentation for the [`predict`](docs/org.apache.clojure-mxnet.module.html#var-fit-params) function.
+
+When prediction results might be too large to fit in memory, use the [`predict-every-batch`](docs/org.apache.clojure-mxnet.module.html#predict-every-batch) API.
+
+```clojure
+(let [preds (m/predict-every-batch mod {:eval-data test-data})]
+ (mx-io/reduce-batches test-data
+ (fn [i batch]
+ (println (str "pred is " (first (get preds i))))
+ (println (str "label is " (mx-io/batch-label batch)))
+ ;;; do something
+ (inc i))))
+```
+
+If you need to evaluate on a test set and don’t need the prediction output, call the `score` function with a data iterator and an eval metric:
+
+```clojure
+(m/score mod {:eval-data test-data :eval-metric (eval-metric/accuracy)}) ;=>["accuracy" 0.2227]
+```
+
+This runs predictions on each batch in the provided data iterator and computes the evaluation score using the provided eval metric. The evaluation results are stored in the `eval-metric` object itself so that you can query them later.
+
+
+## Saving and Loading
+
+To save the module parameters in each training epoch, use the `save-checkpoint` function:
+
+```clojure
+(let [save-prefix "my-model"]
+  (doseq [epoch-num (range 3)]
+    (mx-io/do-batches train-data
+                      (fn [batch]
+                        ;; do something with the batch
+                        ))
+    (m/save-checkpoint mod {:prefix save-prefix :epoch epoch-num :save-opt-states true})))
+
+;; INFO org.apache.mxnet.module.Module: Saved checkpoint to my-model-0000.params
+;; INFO org.apache.mxnet.module.Module: Saved optimizer state to my-model-0000.states
+;; INFO org.apache.mxnet.module.Module: Saved checkpoint to my-model-0001.params
+;; INFO org.apache.mxnet.module.Module: Saved optimizer state to my-model-0001.states
+;; INFO org.apache.mxnet.module.Module: Saved checkpoint to my-model-0002.params
+;; INFO org.apache.mxnet.module.Module: Saved optimizer state to my-model-0002.states
+
+```
+
+To load the saved module parameters, call the `load-checkpoint` function:
+
+```clojure
+(def new-mod (m/load-checkpoint {:prefix "my-model" :epoch 1 :load-optimizer-states true}))
+
+new-mod ;=> #object[org.apache.mxnet.module.Module 0x5304d0f4 "org.apache.mxnet.module.Module@5304d0f4"]
+```
+
+To initialize parameters, first bind the symbols to construct executors with the `bind` function. Then, initialize the parameters and auxiliary states by calling the `init-params` function.
+
+```clojure
+(-> new-mod
+ (m/bind {:data-shapes (mx-io/provide-data train-data) :label-shapes (mx-io/provide-label train-data)})
+ (m/init-params))
+```
+
+To get the current parameters, use `params`:
+
+```clojure
+
+(let [[arg-params aux-params] (m/params new-mod)]
+ {:arg-params arg-params
+ :aux-params aux-params})
+
+;; {:arg-params
+;; {"fc3_bias"
+;; #object[org.apache.mxnet.NDArray 0x39adc3b0 "org.apache.mxnet.NDArray@49caf426"],
+;; "fc2_weight"
+;; #object[org.apache.mxnet.NDArray 0x25baf623 "org.apache.mxnet.NDArray@a6c8f9ac"],
+;; "fc1_bias"
+;; #object[org.apache.mxnet.NDArray 0x6e089973 "org.apache.mxnet.NDArray@9f91d6eb"],
+;; "fc3_weight"
+;; #object[org.apache.mxnet.NDArray 0x756fd109 "org.apache.mxnet.NDArray@2dd0fe3c"],
+;; "fc2_bias"
+;; #object[org.apache.mxnet.NDArray 0x1dc69c8b "org.apache.mxnet.NDArray@d128f73d"],
+;; "fc1_weight"
+;; #object[org.apache.mxnet.NDArray 0x20abc769 "org.apache.mxnet.NDArray@b8e1c5e8"]},
+;; :aux-params {}}
+
+```
+
+To assign parameter and aux state values, use the `set-params` function.
+
+```clojure
+(m/set-params new-mod {:arg-params (m/arg-params new-mod) :aux-params (m/aux-params new-mod)})
+;=> #object[org.apache.mxnet.module.Module 0x5304d0f4 "org.apache.mxnet.module.Module@5304d0f4"]
+```
+
+To resume training from a saved checkpoint, pass the loaded parameters to the `fit` function. This will prevent `fit` from initializing randomly.
+
+Create fit-params and then use it to set `begin-epoch` so that `fit` knows to resume from a saved epoch.
+
+```clojure
+;; reset the training data before calling fit or you will get an error
+(mx-io/reset train-data)
+(mx-io/reset test-data)
+
+(m/fit new-mod {:train-data train-data :eval-data test-data :num-epoch 2
+ :fit-params (-> (m/fit-params {:begin-epoch 1}))})
+
+```
+
+
+## Next Steps
+* See [Symbolic API](symbol.md) for symbolic operations that assemble neural networks from layers.
+* See [NDArray API](ndarray.md) for vector/matrix/tensor operations.
+* See [KVStore API](kvstore.md) for multi-GPU and multi-host distributed training.
diff --git a/docs/static_site/src/pages/api/clojure/docs/tutorials/ndarray.md b/docs/static_site/src/pages/api/clojure/docs/tutorials/ndarray.md
new file mode 100644
index 000000000000..a989d7482ecf
--- /dev/null
+++ b/docs/static_site/src/pages/api/clojure/docs/tutorials/ndarray.md
@@ -0,0 +1,143 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: NDArray
+is_tutorial: true
+tag: clojure
+permalink: /api/clojure/docs/tutorials/ndarray
+---
+
+# NDArray API
+
+
+The NDArray API contains tensor operations similar to `numpy.ndarray`. The syntax is also similar, except for some additional calls for dealing with I/O and multiple devices.
+
+Topics:
+
+* [Create NDArray](#create-ndarray)
+* [NDArray Operations](#ndarray-operations)
+* [NDArray API Reference](http://mxnet.incubator.apache.org/api/clojure/docs/org.apache.clojure-mxnet.ndarray.html)
+
+To follow along with this documentation, you can use this namespace with the needed requires:
+
+```clojure
+(ns docs.ndarray
+ (:require [org.apache.clojure-mxnet.ndarray :as ndarray]
+ [org.apache.clojure-mxnet.context :as context]))
+```
+
+
+## Create NDArray
+
+Create `mxnet.ndarray` as follows:
+
+```clojure
+
+(def a (ndarray/zeros [100 50])) ;; all-zero array of dimension 100 x 50
+(def b (ndarray/ones [256 32 128 1])) ;; all-one array of dimension 256 x 32 x 128 x 1
+(def c (ndarray/array [1 2 3 4 5 6] [2 3])) ;; array with contents of a shape 2 x 3
+```
+
+There are also ways to convert an NDArray to a Clojure vector, and to get its shape as an object or as a vector:
+
+```clojure
+(ndarray/->vec c) ;=> [1.0 2.0 3.0 4.0 5.0 6.0]
+(ndarray/shape c) ;=> #object[org.apache.mxnet.Shape 0x583c865 "(2,3)"]
+(ndarray/shape-vec c) ;=> [2 3]
+```
+
+
+## NDArray Operations
+
+There are some basic NDArray operations, like arithmetic and slice operations.
+
+### Arithmetic Operations
+
+```clojure
+(def a (ndarray/ones [1 5]))
+(def b (ndarray/ones [1 5]))
+(-> (ndarray/+ a b) (ndarray/->vec)) ;=> [2.0 2.0 2.0 2.0 2.0]
+
+;; original ndarrays are unchanged
+(ndarray/->vec a) ;=> [1.0 1.0 1.0 1.0 1.0]
+(ndarray/->vec b) ;=> [1.0 1.0 1.0 1.0 1.0]
+
+;;inplace operators
+(ndarray/+= a b)
+(ndarray/->vec a) ;=> [2.0 2.0 2.0 2.0 2.0]
+```
+
+Other arithmetic operations are similar.
+
+
+### Slice Operations
+
+```clojure
+(def a (ndarray/array [1 2 3 4 5 6] [3 2]))
+(def a1 (ndarray/slice a 1))
+(ndarray/shape-vec a1) ;=> [1 2]
+(ndarray/->vec a1) ;=> [3.0 4.0]
+
+(def a2 (ndarray/slice a 1 3))
+(ndarray/shape-vec a2) ;=>[2 2]
+(ndarray/->vec a2) ;=> [3.0 4.0 5.0 6.0]
+```
+
+### Dot Product
+
+```clojure
+(def arr1 (ndarray/array [1 2] [1 2]))
+(def arr2 (ndarray/array [3 4] [2 1]))
+(def res (ndarray/dot arr1 arr2))
+(ndarray/shape-vec res) ;=> [1 1]
+(ndarray/->vec res) ;=> [11.0]
+```
+
+### Save and Load NDArray
+
+You can use MXNet functions to save and load a list or dictionary of NDArrays from file systems, as follows:
+
+```clojure
+(ndarray/save "filename" {"arr1" arr1 "arr2" arr2})
+;; you can also do "s3://path" or "hdfs"
+```
+
+To load:
+
+```clojure
+(def from-file (ndarray/load "filename"))
+from-file
+;=>{"arr1" #object["org.apache.mxnet.NDArray@43d85753"], "arr2" #object["org.apache.mxnet.NDArray@5c93def4"]}
+```
+
+The good thing about using the `save` and `load` interface is that you can use the format across all `mxnet` language bindings. They also already support Amazon S3 and HDFS.
+
+### Multi-Device Support
+
+Device information is stored in the `mxnet.Context` structure. When creating an NDArray in MXNet, you can use the context argument (the default is the CPU context) to create arrays on specific devices as follows:
+
+```clojure
+(def cpu-a (ndarray/zeros [100 200]))
+(ndarray/context cpu-a) ;=> #object[org.apache.mxnet.Context 0x3f376123 "cpu(0)"]
+
+(def gpu-b (ndarray/zeros [100 200] {:ctx (context/gpu 0)})) ;; to use with gpu
+
+```
+
+## Next Steps
+* See [KVStore API](kvstore.md) for multi-GPU and multi-host distributed training.
diff --git a/docs/static_site/src/pages/api/clojure/docs/tutorials/symbol.md b/docs/static_site/src/pages/api/clojure/docs/tutorials/symbol.md
new file mode 100644
index 000000000000..352b0406bd60
--- /dev/null
+++ b/docs/static_site/src/pages/api/clojure/docs/tutorials/symbol.md
@@ -0,0 +1,156 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Symbolic API
+is_tutorial: true
+tag: clojure
+permalink: /api/clojure/docs/tutorials/symbol
+---
+
+# MXNet Clojure Symbolic API
+
+Topics:
+
+* [How to Compose Symbols](#how-to-compose-symbols)
+* [More Complicated Compositions](#more-complicated-compositions)
+* [Group Multiple Symbols](#group-multiple-symbols)
+* [Serialization](#serialization)
+* [Executing Symbols](#executing-symbols)
+* [Symbol API Reference](http://mxnet.incubator.apache.org/api/clojure/docs/org.apache.clojure-mxnet.symbol.html)
+
+
+We also highly encourage you to read [Symbolic Configuration and Execution in Pictures](symbol_in_pictures.md).
+
+To follow along with this documentation, you can use this namespace with the needed requires:
+
+```clojure
+(ns docs.symbol
+ (:require [org.apache.clojure-mxnet.executor :as executor]
+ [org.apache.clojure-mxnet.ndarray :as ndarray]
+ [org.apache.clojure-mxnet.symbol :as sym]
+ [org.apache.clojure-mxnet.context :as context]))
+```
+
+
+## How to Compose Symbols
+
+The Symbolic API provides a way to configure computation graphs.
+You can configure the graphs either at the level of neural network layer operations or as fine-grained operations.
+
+The following example configures a two-layer neural network.
+
+```clojure
+(def data (sym/variable "data"))
+(def fc1 (sym/fully-connected "fc1" {:data data :num-hidden 128}))
+(def act1 (sym/activation "act1" {:data fc1 :act-type "relu"}))
+(def fc2 (sym/fully-connected "fc2" {:data act1 :num-hidden 64}))
+(def net (sym/softmax-output "out" {:data fc2}))
+```
+
+This can also be written more dynamically with the `as->` Clojure threading macro:
+
+```clojure
+(def net (as-> (sym/variable "data") data
+           (sym/fully-connected "fc1" {:data data :num-hidden 128})
+           (sym/activation "act1" {:data data :act-type "relu"})
+           (sym/fully-connected "fc2" {:data data :num-hidden 64})
+           (sym/softmax-output "out" {:data data})))
+
+net ;=> #object[org.apache.mxnet.Symbol 0x5c78c8c2 "org.apache.mxnet.Symbol@5c78c8c2"]
+```
+
+The basic arithmetic operators (plus, minus, div, multiplication) work as expected.
+
+The following example creates a computation graph that adds two inputs together.
+
+```clojure
+(def a (sym/variable "a"))
+(def b (sym/variable "b"))
+(def c (sym/+ a b))
+```
+
+## More Complicated Compositions
+
+MXNet provides well-optimized symbols for layers commonly used in deep learning (see src/operator). We can also define new operators. The following example first performs an element-wise add between two symbols, then feeds them to the fully connected operator:
+
+```clojure
+(def lhs (sym/variable "data1"))
+(def rhs (sym/variable "data2"))
+(def net (sym/fully-connected "fc1" {:data (sym/+ lhs rhs) :num-hidden 128}))
+(sym/list-arguments net) ;=> ["data1" "data2" "fc1_weight" "fc1_bias"]
+```
+
+## Group Multiple Symbols
+
+To construct neural networks with multiple loss layers, we can use `group` to group multiple symbols together. The following example groups two outputs:
+
+```clojure
+(def net (sym/variable "data"))
+(def fc1 (sym/fully-connected {:data net :num-hidden 128}))
+(def net2 (sym/activation {:data fc1 :act-type "relu"}))
+(def out1 (sym/softmax-output {:data net2}))
+(def out2 (sym/linear-regression-output {:data net2}))
+(def group (sym/group [out1 out2]))
+(sym/list-outputs group)
+;=> ["softmaxoutput0_output" "linearregressionoutput0_output"]
+```
+
+## Serialization
+You can use the [`save`](docs/org.apache.clojure-mxnet.symbol.html#var-save) and [`load`](docs/org.apache.clojure-mxnet.symbol.html#var-load) functions to serialize Symbol objects. The advantage of using `save` and `load` is that they are language agnostic and cloud friendly. The symbol is saved in JSON format. You can also get a JSON string directly using `sym/to-json`. Refer to the API documentation for more details.
+
+The following example shows how to save a symbol to a file, load it back, and compare two symbols using their JSON strings. You can also save to S3.
+
+```clojure
+(def a (sym/variable "a"))
+(def b (sym/variable "b"))
+(def c (sym/+ a b))
+(sym/save c "symbol-c.json")
+(def c2 (sym/load "symbol-c.json"))
+(= (sym/to-json c) (sym/to-json c2)) ;=>true
+```
+
+
+## Executing Symbols
+
+To execute symbols, we first need to define the data that they should run on. We can do this with the `bind` function, which accepts a device context and a map of free variable names to NDArrays, and returns an executor. The executor provides a `forward` function for evaluation and an `outputs` function for retrieving all of the results.
+
+```clojure
+(def a (sym/variable "a"))
+(def b (sym/variable "b"))
+(def c (sym/+ a b))
+
+(def ex (sym/bind c {"a" (ndarray/ones [2 2]) "b" (ndarray/ones [2 2])}))
+(-> (executor/forward ex)
+ (executor/outputs)
+ (first)
+ (ndarray/->vec));=> [2.0 2.0 2.0 2.0]
+```
+
+We can evaluate the same symbol on a GPU with different data.
+_To do this you must have the correct native library jar defined as a dependency._
+
+**Note: To execute the following section on a CPU, replace `(context/gpu 0)` with `(context/cpu 0)`.**
+
+
+```clojure
+(def ex (sym/bind c (context/gpu 0) {"a" (ndarray/ones [2 2]) "b" (ndarray/ones [2 2])}))
+```
+
+## Next Steps
+* See [NDArray API](ndarray.md) for vector/matrix/tensor operations.
+* See [KVStore API](kvstore.md) for multi-GPU and multi-host distributed training.
diff --git a/docs/static_site/src/pages/api/clojure/docs/tutorials/symbol_in_pictures.md b/docs/static_site/src/pages/api/clojure/docs/tutorials/symbol_in_pictures.md
new file mode 100644
index 000000000000..09c9cfce02f7
--- /dev/null
+++ b/docs/static_site/src/pages/api/clojure/docs/tutorials/symbol_in_pictures.md
@@ -0,0 +1,101 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Symbolic API with Pictures
+is_tutorial: true
+tag: clojure
+permalink: /api/clojure/docs/tutorials/symbol_in_pictures
+---
+
+# Symbolic Configuration and Execution in Pictures
+
+This topic explains symbolic construction and execution in pictures.
+
+We recommend that you read the [Symbolic API](symbol.md) as another useful reference.
+
+## Compose Symbols
+
+Symbols are a description of the computation that you want to perform. The symbolic construction API generates the computation
+graph that describes the computation. The following picture shows how you compose symbols to describe basic computations.
+
+![Symbol Compose](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/compose_basic.png)
+
+- The ```mxnet.Symbol.Variable``` function creates argument nodes that represent input to the computation.
+- The symbol is overloaded with basic element-wise mathematical operations.
+
+## Configure Neural Networks
+
+In addition to supporting fine-grained operations, MXNet provides a way to perform big operations that are analogous to layers in neural networks.
+You can use operators to describe the configuration of a neural network.
+
+![Net Compose](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/compose_net.png)
+
+
+## Example of a Multi-Input Network
+
+The following example shows how to configure multiple input neural networks.
+
+![Multi Input](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/compose_multi_in.png)
+
+
+## Bind and Execute Symbol
+
+When you need to execute a symbol graph, you call the bind function to bind ```NDArrays``` to the argument nodes
+in order to obtain an ```Executor```.
+
+![Bind](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/bind_basic.png)
+
+To get the output results, given the bound NDArrays as input, you can call ```Executor.Forward```.
+
+![Forward](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_forward.png)
+
+
+## Bind Multiple Outputs
+
+To group symbols and then bind them to get the outputs of all of them, use ```mx.symbol.Group```.
+
+![MultiOut](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_multi_out.png)
+
+Remember: Bind only what you need, so that the system can perform more optimizations.
+
+
+## Calculate the Gradient
+
+In the bind function, you can specify NDArrays that will hold gradients. Calling ```Executor.backward``` after ```Executor.forward``` gives you the corresponding gradients.
+
+![Gradient](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_backward.png)
+
+
+## Simple Bind Interface for Neural Networks
+
+It can be tedious to pass the argument NDArrays to the bind function, especially when you are binding a big
+graph. ```Symbol.simple_bind``` provides a way to simplify
+the procedure. You need to specify only input data shapes. The function allocates the arguments, and binds
+the Executor for you.
+
+![SimpleBind](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_simple_bind.png)
+
+## Auxiliary States
+
+Auxiliary states are just like arguments, except that you can't take the gradient of them. Although auxiliary states might not be part of the computation, they can be helpful for tracking. You can pass auxiliary states in the same way that you pass arguments.
+
+![SimpleBind](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_aux_state.png)
+
+## Next Steps
+
+See [Symbolic API](symbol.md) and the [Clojure API documentation](index.md).
diff --git a/docs/static_site/src/pages/api/clojure/index.md b/docs/static_site/src/pages/api/clojure/index.md
new file mode 100644
index 000000000000..af535c8614fa
--- /dev/null
+++ b/docs/static_site/src/pages/api/clojure/index.md
@@ -0,0 +1,47 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Clojure Guide
+action: Get Started
+action_url: /get_started
+permalink: /api/clojure
+tag: clojure
+---
+
+# MXNet - Clojure API
+
+MXNet supports the Clojure programming language. The MXNet Clojure package brings flexible and efficient GPU
+computing and state-of-the-art deep learning to Clojure. It enables you to write seamless tensor/matrix computation with multiple GPUs in Clojure. It also lets you construct and customize state-of-the-art deep learning models in Clojure, and apply them to tasks such as image classification and data science challenges.
+
+
+## Tensor and Matrix Computations
+You can perform tensor or matrix computation in pure Clojure:
+
+```clojure
+(def arr (ndarray/ones [2 3]))
+
+arr ;=> #object[org.apache.mxnet.NDArray 0x597d72e "org.apache.mxnet.NDArray@e35c3ba9"]
+
+(ndarray/shape-vec arr) ;=> [2 3]
+
+(-> (ndarray/* arr 2)
+ (ndarray/->vec)) ;=> [2.0 2.0 2.0 2.0 2.0 2.0]
+
+(ndarray/shape-vec (ndarray/* arr 2)) ;=> [2 3]
+
+```
diff --git a/docs/static_site/src/pages/api/cpp/docs/tutorials/basics.md b/docs/static_site/src/pages/api/cpp/docs/tutorials/basics.md
new file mode 100644
index 000000000000..aec71d013ee6
--- /dev/null
+++ b/docs/static_site/src/pages/api/cpp/docs/tutorials/basics.md
@@ -0,0 +1,222 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Basics
+action: Get Started
+action_url: /get_started
+permalink: /api/cpp/docs/tutorials/basics
+is_tutorial: true
+tag: cpp
+---
+
+Basics
+======
+
+This tutorial provides basic usage of the C++ package through the classic handwritten digit
+recognition dataset, [MNIST](http://yann.lecun.com/exdb/mnist/).
+
+The following contents assume that the working directory is `/path/to/mxnet/cpp-package/example`.
+
+Load Data
+--------
+Before going into the code, we need to fetch the MNIST data. You can either use the script `/path/to/mxnet/cpp-package/example/get_data.sh`,
+or download the MNIST data yourself from LeCun's [website](http://yann.lecun.com/exdb/mnist/)
+and decompress it into the `data/mnist_data` folder.
+
+Apart from linking against the MXNet shared library, the C++ package itself is header-only,
+which means all you need to do is include the header files. Among the header files,
+`op.h` is special since it is generated dynamically. The generation should be done when
+[building the C++ package](http://mxnet.incubator.apache.org/versions/master/api/c++/index.html).
+It is important to note that you need to **copy the shared library** (`libmxnet.so` on Linux and macOS,
+`libmxnet.dll` on Windows) from `/path/to/mxnet/lib` to the working directory.
+We do not recommend using pre-built binaries because MXNet is under heavy development,
+so the operator definitions in `op.h` may be incompatible with a pre-built version.
+
+In order to use the functionality provided by the C++ package, we first include the general
+header file `MxNetCpp.h` and specify the namespaces.
+
+```c++
+#include "mxnet-cpp/MxNetCpp.h"
+
+using namespace std;
+using namespace mxnet::cpp;
+```
+
+Next we can use data iterators to load the MNIST data (separated into training and validation sets).
+The digits in MNIST are 2-dimensional arrays, so we should set `flat` to true to flatten the data.
+
+```c++
+auto train_iter = MXDataIter("MNISTIter")
+ .SetParam("image", "./data/mnist_data/train-images-idx3-ubyte")
+ .SetParam("label", "./data/mnist_data/train-labels-idx1-ubyte")
+ .SetParam("batch_size", batch_size)
+ .SetParam("flat", 1)
+ .CreateDataIter();
+auto val_iter = MXDataIter("MNISTIter")
+ .SetParam("image", "./data/mnist_data/t10k-images-idx3-ubyte")
+ .SetParam("label", "./data/mnist_data/t10k-labels-idx1-ubyte")
+ .SetParam("batch_size", batch_size)
+ .SetParam("flat", 1)
+ .CreateDataIter();
+```
+
+The data have been successfully loaded. We can now easily construct various models to identify
+the digits with the help of the C++ package.
+
+
+Multilayer Perceptron
+---------------------
+If you are not familiar with multilayer perceptrons, you can get some basic information
+[here](http://mxnet.io/tutorials/python/mnist.html#multilayer-perceptron). In this tutorial we focus only on
+the implementation.
+
+Constructing a multilayer perceptron model is straightforward. Assume we store the hidden size
+of each layer in `layers`, and that each layer uses the
+[ReLU](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)) activation function.
+
+```c++
+Symbol mlp(const vector<int> &layers) {
+  auto x = Symbol::Variable("X");
+  auto label = Symbol::Variable("label");
+
+  vector<Symbol> weights(layers.size());
+  vector<Symbol> biases(layers.size());
+  vector<Symbol> outputs(layers.size());
+
+  for (int i = 0; i < layers.size(); ++i) {
+    weights[i] = Symbol::Variable("w" + to_string(i));
+    biases[i] = Symbol::Variable("b" + to_string(i));
+    Symbol fc = FullyConnected(i == 0 ? x : outputs[i - 1],  // data
+                               weights[i], biases[i], layers[i]);
+    outputs[i] = i == layers.size() - 1 ? fc : Activation(fc, ActivationActType::kRelu);
+  }
+  return SoftmaxOutput(outputs.back(), label);
+}
+
+// Hyper-parameters (values follow the accompanying mlp_cpu.cpp example)
+const int image_size = 28;
+const vector<int> layers{128, 64, 10};
+const int batch_size = 100;
+const int max_epoch = 10;
+const float learning_rate = 0.1;
+const float weight_decay = 1e-2;
+
+// Build the network symbol and set up the training context and arguments
+auto net = mlp(layers);
+Context ctx = Context::cpu();  // Use CPU for training
+
+std::map<string, NDArray> args;
+args["X"] = NDArray(Shape(batch_size, image_size * image_size), ctx);
+args["label"] = NDArray(Shape(batch_size), ctx);
+// Let MXNet infer shapes of the other parameters such as weights
+net.InferArgsMap(ctx, &args, args);
+
+// Initialize all parameters with uniform distribution U(-0.01, 0.01)
+auto initializer = Uniform(0.01);
+for (auto& arg : args) {
+ // arg.first is parameter name, and arg.second is the value
+ initializer(arg.first, &arg.second);
+}
+```
+
+All that remains is to train the model with an optimizer.
+```c++
+// Create sgd optimizer
+Optimizer* opt = OptimizerRegistry::Find("sgd");
+opt->SetParam("rescale_grad", 1.0/batch_size);
+
+// Start training
+for (int iter = 0; iter < max_epoch; ++iter) {
+ train_iter.Reset();
+
+ while (train_iter.Next()) {
+ auto data_batch = train_iter.GetDataBatch();
+ // Set data and label
+ args["X"] = data_batch.data;
+ args["label"] = data_batch.label;
+
+ // Create executor by binding parameters to the model
+ auto *exec = net.SimpleBind(ctx, args);
+ // Compute gradients
+ exec->Forward(true);
+ exec->Backward();
+ // Update parameters
+ exec->UpdateAll(opt, learning_rate, weight_decay);
+ // Remember to free the memory
+ delete exec;
+ }
+}
+```
+
+We also want to see how well our model performs. The C++ package provides convenient APIs for
+evaluation. Here we use accuracy as the metric. Inference is almost the same as training,
+except that we don't need to compute gradients.
+
+```c++
+Accuracy acc;
+val_iter.Reset();
+while (val_iter.Next()) {
+ auto data_batch = val_iter.GetDataBatch();
+ args["X"] = data_batch.data;
+ args["label"] = data_batch.label;
+ auto *exec = net.SimpleBind(ctx, args);
+ // Forward pass is enough as no gradient is needed when evaluating
+ exec->Forward(false);
+ acc.Update(data_batch.label, exec->outputs[0]);
+ delete exec;
+}
+```
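+
+After the loop, the accumulated metric can be read back and printed. A minimal sketch, following the accompanying `mlp_cpu.cpp` example:
+
+```c++
+// Report the validation accuracy accumulated by the Accuracy metric above.
+std::cout << "Validation Accuracy: " << acc.Get() << std::endl;
+```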
+
+You can find the complete code in `mlp_cpu.cpp`. Use `make mlp_cpu` to compile it,
+and `./mlp_cpu` to run it. If it complains that the shared library `libmxnet.so` cannot be found
+when you run `./mlp_cpu`, you will need to add the path to the shared library to
+the environment variable `LD_LIBRARY_PATH` on Linux or `DYLD_LIBRARY_PATH`
+on macOS. For example, on macOS, running
+`DYLD_LIBRARY_PATH+=. ./mlp_cpu` solves the problem: it tells the system
+to look for the shared library in the current directory, since we have just copied it there.
+
+GPU Support
+-----------
+It's worth noting that changing the context from `Context::cpu()` to `Context::gpu()` is not enough:
+the data read by the data iterators are stored in CPU memory, so we cannot assign them directly to
+parameters that live on the GPU. To bridge this gap, NDArray provides data synchronization between
+GPU and CPU. We will illustrate it by making the MLP code run on a GPU.
+
+In the previous code, the data are used like this:
+
+```c++
+args["X"] = data_batch.data;
+args["label"] = data_batch.label;
+```
+
+This is problematic if the other parameters were created in the GPU context. We can use
+`NDArray::CopyTo` to solve the problem.
+
+```c++
+// Data provided by DataIter are stored in memory, should be copied to GPU first.
+data_batch.data.CopyTo(&args["X"]);
+data_batch.label.CopyTo(&args["label"]);
+// CopyTo is imperative, need to wait for it to complete.
+NDArray::WaitAll();
+```
+
+By replacing the former code with the latter, we successfully port the code to the GPU.
+You can find the complete code in `mlp_gpu.cpp`. Compilation is similar to the CPU version;
+note that the shared library must be built with GPU support enabled.
diff --git a/docs/static_site/src/pages/api/cpp/docs/tutorials/index.md b/docs/static_site/src/pages/api/cpp/docs/tutorials/index.md
new file mode 100644
index 000000000000..82a5330044a8
--- /dev/null
+++ b/docs/static_site/src/pages/api/cpp/docs/tutorials/index.md
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_landing_tutorials
+title: C++ Tutorials
+permalink: /api/cpp/docs/tutorials/
+tag: cpp
+---
diff --git a/docs/static_site/src/pages/api/cpp/docs/tutorials/mxnet_cpp_inference_tutorial.md b/docs/static_site/src/pages/api/cpp/docs/tutorials/mxnet_cpp_inference_tutorial.md
new file mode 100644
index 000000000000..72a56240523a
--- /dev/null
+++ b/docs/static_site/src/pages/api/cpp/docs/tutorials/mxnet_cpp_inference_tutorial.md
@@ -0,0 +1,293 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: C++ API inference tutorial
+action: Get Started
+action_url: /get_started
+permalink: /api/cpp/docs/tutorials/cpp_inference
+is_tutorial: true
+tag: cpp
+---
+
+# C++ API inference tutorial
+
+## Overview
+MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily.
+Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python](https://mxnet.incubator.apache.org/api/python/module/module.html), [Java](https://mxnet.incubator.apache.org/api/java/index.html), [Scala](https://mxnet.incubator.apache.org/api/scala/index.html), and [C++](https://mxnet.incubator.apache.org/api/c++/index.html) APIs.
+
+This tutorial is a continuation of the [Gluon end to end tutorial](https://mxnet.apache.org/versions/master/tutorials/gluon/gluon_from_experiment_to_deployment.html); here we focus on the MXNet C++ API. We have slightly modified the code in the [C++ Inference Example](https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) for our use case.
+
+## Prerequisites
+
+To complete this tutorial, you need to:
+- Complete the training part of the [Gluon end to end tutorial](https://mxnet.apache.org/versions/master/tutorials/gluon/gluon_from_experiment_to_deployment.html)
+- Learn the basics of the [MXNet C++ API](https://github.com/apache/incubator-mxnet/tree/master/cpp-package)
+
+
+## Setup the MXNet C++ API
+To use the C++ API in MXNet, you need to build MXNet from source with the C++ package. Please follow the [build from source guide](https://mxnet.incubator.apache.org/install/ubuntu_setup.html) and the [C++ Package documentation](https://github.com/apache/incubator-mxnet/tree/master/cpp-package)
+to enable the C++ API.
+In summary, you need to build MXNet from source with the `USE_CPP_PACKAGE` flag set to 1. For example: `make -j USE_CPP_PACKAGE=1`.
+
+## Load the model and run inference
+
+After you complete [the previous tutorial](https://mxnet.apache.org/versions/master/tutorials/gluon/gluon_from_experiment_to_deployment.html), you will get the following output files:
+1. Model Architecture stored in `flower-recognition-symbol.json`
+2. Model parameter values stored in `flower-recognition-0040.params` (`0040` corresponds to the 40 epochs we trained)
+3. Label names stored in `synset.txt`
+4. Mean and standard deviation values stored in `mean_std_224` for image normalization.
+
+
+Now we need to write the C++ code to load them and run prediction on a test image.
+The full code is available in the [C++ Inference Example](https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference); we will walk you through it and point out the changes needed for our use case.
+
+
+
+### Write a predictor using the MXNet C++ API
+
+In general, the C++ inference code follows the four steps below. We can implement them in a `Predictor` class.
+1. Load the pre-trained model
+2. Load the parameters of the pre-trained model
+3. Load the image to be classified into an NDArray and apply the image transformations we used in training
+4. Run the forward pass and predict the class of the input image
+
+```c++
+class Predictor {
+ public:
+ Predictor() {}
+ Predictor(const std::string& model_json_file,
+ const std::string& model_params_file,
+ const Shape& input_shape,
+ bool gpu_context_type = false,
+ const std::string& synset_file = "",
+ const std::string& mean_image_file = "");
+ void PredictImage(const std::string& image_file);
+ ~Predictor();
+
+ private:
+ void LoadModel(const std::string& model_json_file);
+ void LoadParameters(const std::string& model_parameters_file);
+ void LoadSynset(const std::string& synset_file);
+ NDArray LoadInputImage(const std::string& image_file);
+ void LoadMeanImageData();
+ void LoadDefaultMeanImageData();
+ void NormalizeInput(const std::string& mean_image_file);
+ inline bool FileExists(const std::string& name) {
+ struct stat buffer;
+ return (stat(name.c_str(), &buffer) == 0);
+ }
+ NDArray mean_img;
+ std::map<std::string, NDArray> args_map;
+ std::map<std::string, NDArray> aux_map;
+ std::vector<std::string> output_labels;
+ Symbol net;
+ Executor *executor;
+ Shape input_shape;
+ NDArray mean_image_data;
+ NDArray std_dev_image_data;
+ Context global_ctx = Context::cpu();
+ std::string mean_image_file;
+};
+```
+
+### Load the model, synset file, and normalization values
+
+In the Predictor constructor, you need to provide paths to the saved JSON and parameter files. After that, add the methods `LoadModel` and `LoadParameters` to load the network and its parameters. This part is the same as in [the example](https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/inception_inference.cpp).
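+
+For reference, the sketch below shows roughly what those two methods look like. It closely follows the `inception_inference.cpp` example linked above and relies on the `net`, `args_map`, `aux_map`, and `global_ctx` members declared in the `Predictor` class; treat it as illustrative rather than the exact example code.
+
+```c++
+// Load the model architecture from the saved symbol (JSON) file.
+void Predictor::LoadModel(const std::string& model_json_file) {
+  LG << "Loading the model from " << model_json_file;
+  net = Symbol::Load(model_json_file);
+}
+
+// Load the trained parameters and split them into argument and auxiliary maps.
+void Predictor::LoadParameters(const std::string& model_parameters_file) {
+  LG << "Loading the model parameters from " << model_parameters_file;
+  std::map<std::string, NDArray> parameters = NDArray::LoadToMap(model_parameters_file);
+  for (const auto& kv : parameters) {
+    if (kv.first.substr(0, 4) == "aux:") {
+      aux_map[kv.first.substr(4)] = kv.second.Copy(global_ctx);
+    } else if (kv.first.substr(0, 4) == "arg:") {
+      args_map[kv.first.substr(4)] = kv.second.Copy(global_ctx);
+    }
+  }
+  // Make sure the copies have finished before the arrays are used.
+  NDArray::WaitAll();
+}
+```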
+
+Next, we need to load the synset file and the normalization values. We have made the following changes, since our synset file contains flower names and we use both mean and standard deviation for image normalization.
+
+```c++
+/*
+ * The following function loads the synset file.
+ * This information will be used later to report the label of input image.
+ */
+void Predictor::LoadSynset(const std::string& synset_file) {
+ if (!FileExists(synset_file)) {
+ LG << "Synset file " << synset_file << " does not exist";
+ throw std::runtime_error("Synset file does not exist");
+ }
+ LG << "Loading the synset file.";
+ std::ifstream fi(synset_file.c_str());
+ if (!fi.is_open()) {
+ std::cerr << "Error opening synset file " << synset_file << std::endl;
+ throw std::runtime_error("Error in opening the synset file.");
+ }
+ std::string lemma;
+ while (getline(fi, lemma)) {
+ output_labels.push_back(lemma);
+ }
+ fi.close();
+}
+
+/*
+ * The following function loads the mean and standard deviation values.
+ * This data will be used for normalizing the image before running the forward
+ * pass.
+ * The output data has the same shape as that of the input image data.
+ */
+void Predictor::LoadMeanImageData() {
+ LG << "Load the mean image data that will be used to normalize "
+ << "the image before running forward pass.";
+ mean_image_data = NDArray(input_shape, global_ctx, false);
+ mean_image_data.SyncCopyFromCPU(
+ NDArray::LoadToMap(mean_image_file)["mean_img"].GetData(),
+ input_shape.Size());
+ NDArray::WaitAll();
+ std_dev_image_data = NDArray(input_shape, global_ctx, false);
+ std_dev_image_data.SyncCopyFromCPU(
+ NDArray::LoadToMap(mean_image_file)["std_img"].GetData(),
+ input_shape.Size());
+ NDArray::WaitAll();
+}
+```
+
+
+
+### Load input image
+
+Now let's add a method that loads the input image we want to predict and converts it to an NDArray.
+```c++
+NDArray Predictor::LoadInputImage(const std::string& image_file) {
+ if (!FileExists(image_file)) {
+ LG << "Image file " << image_file << " does not exist";
+ throw std::runtime_error("Image file does not exist");
+ }
+ LG << "Loading the image " << image_file << std::endl;
+ std::vector<float> array;
+ cv::Mat mat = cv::imread(image_file);
+ /*resize pictures to (224, 224) according to the pretrained model*/
+ int height = input_shape[2];
+ int width = input_shape[3];
+ int channels = input_shape[1];
+ cv::resize(mat, mat, cv::Size(height, width));
+ for (int c = 0; c < channels; ++c) {
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ array.push_back(static_cast<float>(mat.data[(i * height + j) * 3 + c]));
+ }
+ }
+ }
+ NDArray image_data = NDArray(input_shape, global_ctx, false);
+ image_data.SyncCopyFromCPU(array.data(), input_shape.Size());
+ NDArray::WaitAll();
+ return image_data;
+}
+```
+
+### Predict the image
+
+Finally, let's run the inference. It essentially uses the MXNet executor to do a forward pass. To run predictions on multiple images, you can load the images into a list of NDArrays and run prediction in batches. Note that the Predictor class may not be thread safe; calling it in multi-threaded environments has not been tested. To utilize multi-threaded prediction, you need to use the C predict API. Please follow the [C predict example](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/predict-cpp).
+
+An additional step is to normalize the image NDArray's values to `(0, 1)` and to apply the mean and standard deviation we just loaded.
+
+```c++
+/*
+ * The following function runs the forward pass on the model.
+ * The executor is created in the constructor.
+ *
+ */
+void Predictor::PredictImage(const std::string& image_file) {
+ // Load the input image
+ NDArray image_data = LoadInputImage(image_file);
+
+ // Normalize the image
+ image_data.Slice(0, 1) /= 255.0;
+ image_data -= mean_image_data;
+ image_data /= std_dev_image_data;
+
+ LG << "Running the forward pass on model to predict the image";
+ /*
+ * The executor->arg_arrays represent the arguments to the model.
+ *
+ * Copying the image_data that contains the NDArray of input image
+ * to the arg map of the executor. The input is stored with the key "data" in the map.
+ *
+ */
+ image_data.CopyTo(&(executor->arg_dict()["data"]));
+ NDArray::WaitAll();
+
+ // Run the forward pass.
+ executor->Forward(false);
+
+ // The output is available in executor->outputs.
+ auto array = executor->outputs[0].Copy(global_ctx);
+ NDArray::WaitAll();
+
+ /*
+ * Find out the maximum accuracy and the index associated with that accuracy.
+ * This is done by using the argmax operator on NDArray.
+ */
+ auto predicted = array.ArgmaxChannel();
+ NDArray::WaitAll();
+
+ int best_idx = predicted.At(0, 0);
+ float best_accuracy = array.At(0, best_idx);
+
+ if (output_labels.empty()) {
+ LG << "The model predicts the highest accuracy of " << best_accuracy << " at index "
+ << best_idx;
+ } else {
+ LG << "The model predicts the input image to be a [" << output_labels[best_idx]
+ << " ] with Accuracy = " << best_accuracy << std::endl;
+ }
+}
+```
+
+### Compile and run the inference code
+
+You can find the [full code for the inference example](https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) in the `cpp-package` folder of the project,
+and to compile it use this [Makefile](https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/Makefile).
+
+Make a copy of the example code, rename it to `flower_inference`, and apply the changes we mentioned above. Now you will be able to compile and run inference. Run `make all`. Once this is complete, run inference with the following parameters. Remember to set your `LD_LIBRARY_PATH` to point to the MXNet library if you have not done so already.
+
+```bash
+make all
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/incubator-mxnet/lib
+./flower_inference --symbol flower-recognition-symbol.json --params flower-recognition-0040.params --synset synset.txt --mean mean_std_224.nd --image ./data/test/lotus/image_01832.jpg
+```
+
+Then it will predict your image:
+
+```bash
+[17:38:51] resnet.cpp:150: Loading the model from flower-recognition-symbol.json
+
+[17:38:51] resnet.cpp:163: Loading the model parameters from flower-recognition-0040.params
+
+[17:38:52] resnet.cpp:190: Loading the synset file.
+[17:38:52] resnet.cpp:211: Load the mean image data that will be used to normalize the image before running forward pass.
+[17:38:52] resnet.cpp:263: Loading the image ./data/test/lotus/image_01832.jpg
+
+[17:38:52] resnet.cpp:299: Running the forward pass on model to predict the image
+[17:38:52] resnet.cpp:331: The model predicts the input image to be a [lotus ] with Accuracy = 8.63046
+```
+
+
+
+## What's next
+
+Now you can explore more ways to run inference and deploy your models:
+1. [Java Inference examples](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer)
+2. [Scala Inference examples](https://mxnet.incubator.apache.org/tutorials/scala/)
+3. [ONNX model inference examples](https://mxnet.incubator.apache.org/tutorials/onnx/inference_on_onnx_model.html)
+4. [MXNet Model Server Examples](https://github.com/awslabs/mxnet-model-server/tree/master/examples)
+
+## References
+
+1. [Gluon end to end tutorial](https://mxnet.apache.org/versions/master/tutorials/gluon/gluon_from_experiment_to_deployment.html)
+2. [Gluon C++ inference example](https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/)
+3. [Gluon C++ package](https://github.com/apache/incubator-mxnet/tree/master/cpp-package)
diff --git a/docs/static_site/src/pages/api/cpp/docs/tutorials/subgraphAPI.md b/docs/static_site/src/pages/api/cpp/docs/tutorials/subgraphAPI.md
new file mode 100644
index 000000000000..bd5450e0bf7a
--- /dev/null
+++ b/docs/static_site/src/pages/api/cpp/docs/tutorials/subgraphAPI.md
@@ -0,0 +1,175 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Subgraph API
+action: Get Started
+action_url: /get_started
+permalink: /api/cpp/docs/tutorials/subgraph_api
+is_tutorial: true
+tag: cpp
+---
+
+## Subgraph API
+
+The subgraph API has been proposed and implemented as the default mechanism for integrating backend libraries with MXNet. It is a very flexible interface. Although it was proposed as an integration mechanism, it has also been used as a tool for manipulating NNVM graphs for graph-level optimizations, such as operator fusion.
+
+The subgraph API works in the following steps:
+
+* Search for particular patterns in a graph.
+* Group the operators/nodes with particular patterns into a subgraph and shrink the subgraph into a single node.
+* Replace the subgraph in the original graph with the subgraph node.
+
+The figure below illustrates the subgraph mechanism.
+
+![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/subgraph/subgraph.png)
+
+The subgraph API allows the backend developers to customize the subgraph mechanism in two places:
+
+* Subgraph searching: define a subgraph selector to search for particular patterns in a computation graph.
+* Subgraph node creation: attach an operator to run the computation in the subgraph. We can potentially manipulate the subgraph here.
+
+
+The following is a demonstration of how the subgraph API can be applied to a simple task: replacing a `Convolution` followed by a `BatchNorm` with a single fused node, as illustrated in the figure above.
+
+The first step is to define a subgraph selector to find the required pattern. To find a pattern that has `Convolution` and `BatchNorm`, we can start the search on the node with `Convolution`. Then from the `Convolution` node, we search for `BatchNorm` along the outgoing edge.
+
+```c++
+class SgSelector : public SubgraphSelector {
+ public:
+ SgSelector() {
+ find_bn = false;
+ }
+ bool Select(const nnvm::Node &n) override {
+ // Here we start on the Convolution node to search for a subgraph.
+ return n.op() && n.op()->name == "Convolution";
+ }
+ bool SelectInput(const nnvm::Node &n, const nnvm::Node &new_node) override {
+ // We don't need to search on the incoming edge.
+ return false;
+ }
+ bool SelectOutput(const nnvm::Node &n, const nnvm::Node &new_node) override {
+ // We search on the outgoing edge. Once we find a BatchNorm node, we won't
+ // accept any more BatchNorm nodes.
+ if (new_node.op() && new_node.op()->name == "BatchNorm" && !find_bn) {
+ find_bn = true;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ std::vector<nnvm::Node*> Filter(const std::vector<nnvm::Node*> &candidates) override {
+ // We might have found a Convolution node, but we might have failed to find a BatchNorm
+ // node that uses the output of the Convolution node. If we failed, we should skip
+ // the Convolution node as well.
+ if (find_bn)
+ return candidates;
+ else
+ return std::vector<nnvm::Node*>();
+ }
+ private:
+ bool find_bn;
+};
+```
+
+The second step is to define a subgraph property that uses the subgraph selector above to customize the subgraph search. By defining this class, we can also customize subgraph node creation, i.e. specify which operator runs the subgraph on the created node. In this example, we use `CachedOp`, which is itself a graph executor, to run the subgraph containing `Convolution` and `BatchNorm`. In practice, a single operator from a backend library would most likely replace the two operators for execution.
+
+{% raw %}
+```c++
+class SgProperty : public SubgraphProperty {
+ public:
+ static SubgraphPropertyPtr Create() {
+ return std::make_shared<SgProperty>();
+ }
+ nnvm::NodePtr CreateSubgraphNode(
+ const nnvm::Symbol &sym, const int subgraph_id = 0) const override {
+ // We can use CachedOp to execute the subgraph.
+ nnvm::NodePtr n = nnvm::Node::Create();
+ n->attrs.op = Op::Get("_CachedOp");
+ n->attrs.name = "ConvBN" + std::to_string(subgraph_id);
+ n->attrs.subgraphs.push_back(std::make_shared<nnvm::Symbol>(sym));
+ std::vector<std::pair<std::string, std::string> > flags{{"static_alloc", "true"}};
+ n->attrs.parsed = CachedOpPtr(new CachedOp(sym, flags));
+ return n;
+ }
+ SubgraphSelectorPtr CreateSubgraphSelector() const override {
+ auto property = std::make_shared<SgSelector>();
+ property->SetAttr("property_name", "subgraph example pass"); // Optional, better to have it.
+ property->SetAttr("inference_only", true); // Optional, only for inference_only pass.
+ return property;
+ }
+};
+```
+{% endraw %}
+`SetAttr` is optional, and developers can define their own attributes to control property behavior.
+There are two built-in attributes that are used by the MXNet executor.
+
+`property_name` : `std::string`, the name of this property.
+
+`inference_only` : `bool`, apply this property only for inference. The property will be skipped when `need_grad=True`. Defaults to `false` if this attribute isn't defined.
+
+After defining the subgraph property, we need to register it under a backend in a `.cc` file.
+
+First, we register the backend:
+
+```C++
+MXNET_REGISTER_SUBGRAPH_BACKEND(SgTest);
+```
+
+Then we register the property under it:
+
+```c++
+MXNET_REGISTER_SUBGRAPH_PROPERTY(SgTest, SgProperty);
+```
+
+It's possible to register multiple properties for the same backend. In practice, we recommend putting each property definition into its own `.h` file and registering the backend in a single `.cc` file. Properties will be executed according to their registration order.
+
+```c++
+#include "SgProperty.h" // Define SgProperty class
+#include "SgProperty2.h" // Define SgProperty2 class
+#include "SgProperty3.h" // Define SgProperty3 class
+
+MXNET_REGISTER_SUBGRAPH_BACKEND(SgTest);
+MXNET_REGISTER_SUBGRAPH_PROPERTY(SgTest, SgProperty); // Execution order 1.
+MXNET_REGISTER_SUBGRAPH_PROPERTY(SgTest, SgProperty2); // Execution order 2.
+MXNET_REGISTER_SUBGRAPH_PROPERTY(SgTest, SgProperty3); // Execution order 3.
+```
+
+After compiling this subgraph mechanism into MXNet, we can use the environment variable `MXNET_SUBGRAPH_BACKEND` to activate it during symbol bind.
+
+```bash
+export MXNET_SUBGRAPH_BACKEND=SgTest
+```
+
+Alternatively, you can use the Python symbol API `get_backend_symbol` to run all properties registered for this backend and get the returned symbol.
+
+```python
+sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
+sym = sym.get_backend_symbol('SgTest')
+```
+
+When `SgProperty` is activated, a message like the following will be shown in the terminal:
+
+```bash
+start to execute subgraph example pass.
+```
+
+This tutorial shows a simple example of how to use the subgraph API to search for patterns in an NNVM graph.
+Interested users can try different pattern matching rules (i.e., define their own `SubgraphSelector`) and
+attach different operators to execute the subgraphs.
+
+
diff --git a/docs/static_site/src/pages/api/cpp/index.md b/docs/static_site/src/pages/api/cpp/index.md
new file mode 100644
index 000000000000..f3b65a366078
--- /dev/null
+++ b/docs/static_site/src/pages/api/cpp/index.md
@@ -0,0 +1,61 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: C++ Guide
+action: Get Started
+action_url: /get_started
+permalink: /api/cpp
+tag: cpp
+---
+
+# MXNet - C++ API
+
+The MXNet C++ Package provides C++ API bindings for MXNet users. Currently, these bindings are not available as a standalone package.
+Users of these bindings are required to build the package as described below.
+
+## Building C++ Package
+
+The `cpp-package` directory contains the implementation of the C++ API. As mentioned above, users are required to build this package before using it.
+**The cpp-package is built while building the MXNet shared library, *libmxnet.so*.**
+
+### Steps to build the C++ package:
+1. Building the MXNet C++ package requires building MXNet from source.
+2. Clone the MXNet GitHub repository **recursively** to ensure the code in submodules is available for building MXNet.
+ ```
+ git clone --recursive https://github.com/apache/incubator-mxnet mxnet
+ ```
+
+3. Install the [prerequisites](), desired [BLAS libraries]() and optional [OpenCV, CUDA, and cuDNN]() for building MXNet from source.
+4. There is a configuration file for make, [make/config.mk]() that contains all the compilation options. You can edit this file and set the appropriate options prior to running the **make** command.
+5. Please refer to the [platform specific build instructions]() and the available [build configurations](https://mxnet.incubator.apache.org/install/build_from_source#build-configurations) for more details.
+6. To enable the build of the C++ package, set **USE\_CPP\_PACKAGE = 1** in [make/config.mk](). Optionally, the compilation flag can also be specified on the **make** command line as follows.
+ ```
+ make -j USE_CPP_PACKAGE=1
+ ```
+
+## Usage
+
+In order to consume the C++ API, please follow the steps below.
+
+1. Ensure that the MXNet shared library is built from source with **USE\_CPP\_PACKAGE = 1**.
+2. Include [MxNetCpp.h]() in the program that is going to consume the MXNet C++ API.
+ ```c++
+ #include <mxnet-cpp/MxNetCpp.h>
+ ```
+3. While building the program, ensure that the correct paths to the directories containing the header files and the MXNet shared library are specified.
+4. The program links the MXNet shared library dynamically, so the library needs to be accessible to the program at runtime. This can be achieved by including the path to the shared library in the environment variable **LD\_LIBRARY\_PATH** on Linux and macOS, and **PATH** on Windows.
diff --git a/docs/static_site/src/pages/api/faq/add_op_in_backend.md b/docs/static_site/src/pages/api/faq/add_op_in_backend.md
new file mode 100644
index 000000000000..f633e57ce645
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/add_op_in_backend.md
@@ -0,0 +1,692 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Implementing Operators in MXNet Backend
+category: faq
+faq_c: Extend and Contribute to MXNet
+question: How do I implement operators in MXNet backend?
+permalink: /api/faq/add_op_in_backend
+---
+
+# A Beginner's Guide to Implementing Operators in MXNet Backend
+
+## Introduction
+Operators are essential elements for constructing neural networks. They define the mathematical formulas
+that transform input data (tensors) into outputs. MXNet has a rich set of operators, from simple ones,
+such as element-wise sum, to complicated ones, such as convolution, that are
+capable of constructing most popular neural networks. You may have noticed
+that many operators implemented in MXNet have their equivalent forms in Numpy, such as
+[repeat](https://docs.scipy.org/doc/numpy/reference/generated/numpy.repeat.html),
+[tile](https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html),
+etc., and wondered why we could not simply use those Numpy operators in MXNet. One of the
+major reasons is that we need to support both CPU and GPU computing for the operators in MXNet,
+while Numpy operators do not possess GPU computing capability.
+In addition, we have performed plenty of
+optimizations for various components in MXNet, such as tensor data structure (`NDArray`),
+execution engine, computational graph and so on, for maximizing memory and runtime efficiency.
+An operator implemented within the MXNet operator framework greatly
+leverages those optimizations for maximum memory and runtime efficiency.
+
+In this tutorial, we are going to practice implementing an operator using
+C++ in the MXNet backend. After finishing the implementation,
+we will add unit tests using Python for the operator we just implemented.
+
+## Implementation
+### An Operator Example
+Let's take the [quadratic function](https://en.wikipedia.org/wiki/Quadratic_function)
+as an example: `f(x) = ax^2+bx+c`. We want to implement an operator called `quadratic`
+that takes a tensor `x` as input and generates an output tensor `y`
+satisfying `y.shape=x.shape`, where each element of `y` is calculated by feeding the
+corresponding element of `x` into the quadratic function `f`.
+Here `a`, `b`, and `c` are user input parameters.
+In the frontend, the operator works like this:
+```python
+x = [[1, 2], [3, 4]]
+y = quadratic(data=x, a=1, b=2, c=3)
+y = [[6, 11], [18, 27]]
+```
+To implement this, we first create three files: `quadratic_op-inl.h`,
+`quadratic_op.cc`, and `quadratic_op.cu`. The header file's name
+is prefixed by the operator name and followed by `op` and `-inl`
+indicating that this is an operator implementation with inline
+functions shared by CPU and GPU computing. The CPU and GPU
+specific implementations reside in their own `.cc` and `.cu` files,
+respectively. We normally put pure tensor related operators
+(e.g. `tile`, `repeat`, etc.) under
+the directory `src/operator/tensor`, and neural network operators
+(e.g. `Convolution`, `Pooling`, etc.) under `src/operator/nn`.
+You may have noticed that many neural network operators including
+`Convolution` and `Pooling` are currently saved under `src/operator`.
+We plan to move them to `src/operator/nn` for better file organization
+and clearer hierarchy in the future.
+
+Next, we are going to
+1. Define the parameter struct
+for registering `a`, `b`, and `c` in `quadratic_op-inl.h`.
+2. Define type and shape inference functions in `quadratic_op-inl.h`.
+3. Define forward and backward functions in `quadratic_op-inl.h`.
+4. Register the operator using [nnvm](https://docs.tvm.ai/dev/nnvm_overview.html)
+in `quadratic_op.cc` and `quadratic_op.cu` for
+CPU and GPU computing, respectively.
+
+Now let's walk through the process step by step.
+
+### Parameter Registration
+We first define `struct QuadraticParam` as a placeholder for the
+parameters `a`, `b`, and `c` in `quadratic_op-inl.h`.
+The struct inherits from a base template
+struct named `dmlc::Parameter`, where the template argument is the derived struct
+`QuadraticParam`. This technique, which is called [curiously recurring template
+pattern](https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern),
+achieves static polymorphism. It is similar to using a virtual function,
+but without the cost associated with dynamic polymorphism.
+
+```cpp
+struct QuadraticParam : public dmlc::Parameter<QuadraticParam> {
+ float a, b, c;
+ DMLC_DECLARE_PARAMETER(QuadraticParam) {
+ DMLC_DECLARE_FIELD(a)
+ .set_default(0.0)
+ .describe("Coefficient of the quadratic term in the quadratic function.");
+ DMLC_DECLARE_FIELD(b)
+ .set_default(0.0)
+ .describe("Coefficient of the linear term in the quadratic function.");
+ DMLC_DECLARE_FIELD(c)
+ .set_default(0.0)
+ .describe("Constant term in the quadratic function.");
+ }
+};
+```
+
+The function calls in the above parameter struct are self-explanatory by their names.
+Note that for each parameter, we set the default value to `0.0` such that users can
+skip passing 0-value parameters through the quadratic operator interface. You
+can choose not to define the default value for a parameter if it is required
+at runtime. Meanwhile, adding brief descriptions to the parameters enables
+the documentation engine to display them on
+[MXNet documentation web page](https://mxnet.incubator.apache.org/api/python/index.html).
+
+### Attribute Inference
+Attribute inference is the process of deducing the properties of `NDArray`s
+in neural networks from user-provided information. The two most common attributes
+of an `NDArray` are data shape and data type.
+Let's take a look at the following example.
+Given an input `NDArray` called `data`, you invoke the `quadratic` operator
+like this: `output = mx.nd.quadratic(data, a=1, b=2, c=3)`. Before calculating
+the `output` values, its shape and data type are inferred from the input
+`data`'s shape and type following
+the rules you defined in order to allocate memory space for the output tensor.
+
+One important thing to note is that inference functions should be capable of
+performing **mutual inference**, i.e.
+inferring one argument's attribute from another argument's attribute if
+possible according to the definition of the operator.
+This is very useful for a computational graph to deduce unknown attributes
+for a neural network in symbolic programming. Users can view the computational
+graph as a symbol with every element initialized for running data
+throughout the neural network, including memory allocation for each tensor,
+device placement for each operator, etc. Users normally just need
+to provide minimum necessary information, such as input data shapes, etc.,
+to the computational graph, and the graph will fill up the unknown attributes
+using the attribute inference functions defined in the operators building up
+the neural network.
+
+Let's consider the following example.
+```python
+>>> import mxnet as mx
+>>> a = mx.sym.Variable('a', shape=(2, 0))
+>>> b = mx.sym.Variable('b')
+>>> c = mx.sym.Variable('c', shape=(0, 3))
+>>> d = a * b + b * c
+>>> print d.infer_shape()
+([(2L, 3L), (2L, 3L), (2L, 3L)], [(2L, 3L)], [])
+```
+The last line of the above code snippet is a tuple of three lists returned
+by `d.infer_shape()`. The first list contains all the argument shapes
+of `a`, `b`, and `c`. The second contains the output shape of `d`. The
+third one represents the shapes of auxiliary states, which is not used
+in this case, and thus is empty.
+In this example, we only specified values for variable `a`'s first dimension
+and `c`'s second dimension. The `0` in shape `(2, 0)` indicates that the size
+of the second dimension is unknown; the same holds for shape `(0, 3)`.
+However, the symbol `d` still successfully inferred the shapes
+for all the variables and final output. This is a result of mutual
+inference. In MXNet, the whole process can be interpreted as follows:
+1. `a` and `b` are combined via an element-wise multiplication operator,
+so the shapes of `a` and `b` are the same and `b`'s first dimension size is `2`.
+2. `b` and `c` are combined via an element-wise multiplication operator too,
+so the shapes of `b` and `c` are the same and `b`'s second dimension size is `3`.
+3. Now `b`'s shape is completely known, so the missing dimension sizes of `a` and `c`
+are known as well.
+4. `d` is the result of adding `a * b` and `b * c`, so `d` should also
+have the same shape as `b`.
+
+The above four steps illustrate how shape inference logic works in MXNet.
+It is actually implemented in the shape inference functions of the operators for
+element-wise multiplication and addition.
+
+For our `quadratic` operator, shape inference follows quite similar logic.
+```cpp
+inline bool QuadraticOpShape(const nnvm::NodeAttrs& attrs,
+ mxnet::ShapeVector* in_attrs,
+ mxnet::ShapeVector* out_attrs) {
+ CHECK_EQ(in_attrs->size(), 1U);
+ CHECK_EQ(out_attrs->size(), 1U);
+
+ SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+ SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+ return out_attrs->at(0).ndim() != 0U && out_attrs->at(0).Size() != 0U;
+}
+```
+Here are a few things to note about the above function:
+
+1. `attrs` contains parameters `a`, `b`, and `c` from user input.
+It's not used here since we don't rely on that information for shape inference.
+2. `in_attrs` is a vector containing all input shapes. Since there is
+only one input argument for operator `quadratic`, we use the macro `CHECK_EQ`
+to verify that the vector contains exactly one element.
+3. `out_attrs` is a vector containing all output shapes. We also used
+`CHECK_EQ` to verify the size of the vector since there is only one output.
+4. We call the macro `SHAPE_ASSIGN_CHECK` twice for mutual inference: once for
+inferring the output shape from the input shape, and once for inferring
+the input shape from the output shape.
+If there are any unequal non-zero values in the same
+dimension of two shapes, such as `(2, 3)` and `(3, 3)`, the macro would throw an
+exception with an error message for shape inference.
+5. At the end of the function body, we checked whether the output shape
+is completely known by testing whether the shape is not empty and
+the shape's size is greater than `0`. Note that in MXNet, an empty shape
+means that the shape is unknown, and
+a `0` in a shape means that the size of that dimension is unknown. In both
+situations, the missing shape information must
+be inferred from other shapes. If it cannot be inferred,
+the function should return `false` to notify the caller about shape inference failure.
+6. MXNet provides a convenience function implementing the logic of mutual inference
+for general element-wise operators with the following interface. Users can
+instantiate this function with `n_in=1` and `n_out=1` to replace the above
+function `QuadraticOpShape` in operator registration (explained later).
+The function `QuadraticOpShape` posted here is for the purpose of illustration only.
+```cpp
+template<int n_in, int n_out>
+inline bool ElemwiseShape(const nnvm::NodeAttrs& attrs,
+                          mxnet::ShapeVector *in_attrs,
+                          mxnet::ShapeVector *out_attrs);
+```
+
+The same logic goes for data type inference. We will leave the analysis of
+the following code sample to users. Note that `-1` means the data type
+is unknown and must be inferred from other input or output data types.
+```cpp
+inline bool QuadraticOpType(const nnvm::NodeAttrs& attrs,
+                            std::vector<int>* in_attrs,
+                            std::vector<int>* out_attrs) {
+ CHECK_EQ(in_attrs->size(), 1U);
+ CHECK_EQ(out_attrs->size(), 1U);
+
+ TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+ TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+ return out_attrs->at(0) != -1;
+}
+```
+
+Again, MXNet provides the following convenience function for mutual
+type inference of element-wise operators. Users can use that
+in operator registration (explained later).
+```cpp
+template<int n_in, int n_out>
+inline bool ElemwiseType(const nnvm::NodeAttrs& attrs,
+                         std::vector<int>* in_attrs,
+                         std::vector<int>* out_attrs);
+```
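+
+As an illustration (assuming `n_in = 1` and `n_out = 1` for our single-input, single-output operator), the two helpers above would replace `QuadraticOpShape` and `QuadraticOpType` in the registration described later in this tutorial, roughly like this:
+
+```cpp
+// Registration fragment (sketch): reuse the element-wise helpers instead of
+// the hand-written QuadraticOpShape and QuadraticOpType functions.
+.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+```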
+
+### Forward Function
+The forward function defines the operator's behavior in the forward pass
+of neural networks. For our `quadratic` operator, it simply implements
+the logic of running a tensor through the quadratic function by performing
+a few element-wise operations. The forward function's signature is fixed
+in MXNet as follows:
+```cpp
+void (const nnvm::NodeAttrs& attrs,
+      const OpContext& ctx,
+      const std::vector<TBlob>& inputs,
+      const std::vector<OpReqType>& req,
+      const std::vector<TBlob>& outputs);
+```
+We first paste the whole forward function code here
+and then go through it line by line.
+```cpp
+template<typename xpu>                                                    // 1
+void QuadraticOpForward(const nnvm::NodeAttrs& attrs,                     // 2
+                        const OpContext& ctx,                             // 3
+                        const std::vector<TBlob>& inputs,                 // 4
+                        const std::vector<OpReqType>& req,                // 5
+                        const std::vector<TBlob>& outputs) {              // 6
+  CHECK_EQ(inputs.size(), 1U);                                            // 7
+  CHECK_EQ(outputs.size(), 1U);                                           // 8
+  CHECK_EQ(req.size(), 1U);                                               // 9
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();                        // 10
+  const TBlob& in_data = inputs[0];                                       // 11
+  const TBlob& out_data = outputs[0];                                     // 12
+  const QuadraticParam& param = nnvm::get<QuadraticParam>(attrs.parsed);  // 13
+  using namespace mxnet_op;                                               // 14
+  MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {                       // 15
+    MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {                           // 16
+      Kernel<quadratic_forward<req_type>, xpu>::Launch(                   // 17
+          s, out_data.Size(), out_data.dptr<DType>(), in_data.dptr<DType>(),  // 18
+          param.a, param.b, param.c);                                     // 19
+    });                                                                   // 20
+  });                                                                     // 21
+}                                                                         // 22
+```
+- Line 1: `xpu` stands for a generic device type so that the function can be instantiated
+for both CPU and GPU computing using concrete types `cpu` and `gpu`. The instantiation happens
+at the time when the operator is registered in `.cc` and `.cu` files.
+- Line 2: `attrs` is a node attribute containing the user input parameters `a`, `b`, and `c`.
+Here the node represents a placeholder for the operator in the whole computational graph for
+the neural network.
+- Line 3: `ctx` holds something called `stream` for
+serializing asynchronous executions. Let's consider
+this example for understanding the functionality of `stream`.
+We want to launch several GPU kernels with the same `stream` from CPU.
+Even though the launching operation is non-blocking, the `stream` guarantees
+that the kernels execute in the same order on GPU as they are launched from CPU.
+- Line 4: `inputs` is a vector of input tensors (only one input tensor
+for the `quadratic` operator).
+- Line 5: `req` is a vector of `OpReqType` values. Each value defines
+the way of writing calculated values to the output tensors.
+Therefore, the number of `req`s must be the same as the number of output tensors.
+MXNet currently supports three types of `req` in frontend: `null`, `write`, and `add`.
+`null` means skipping calculating the corresponding output tensor,
+`write` means overwriting the values in the output tensor with the ones
+calculated by this operator, and `add` means adding the calculated values
+to the existing ones in the output tensor. Note that `null` and `add` are usually
+seen in backward passes. The former is for skipping calculating
+the gradients of un-learnable parameters (such as index arrays),
+and the latter is for accumulating gradients throughout networks.
+- Line 6: `outputs` is a vector of output tensors (only one
+output tensor for the `quadratic` operator).
+- Lines 7-9: Verify that the size of each vector is as expected.
+Otherwise, stop and print an error message.
+- Line 10: Get the `stream` from the `ctx` for launching kernels.
+- Lines 11-12: Define the references of the input and output tensors
+for later coding convenience. Note that `TBlob` can be understood
+as a uniform data structure for tensors of various dimensions, such
+that tensors of different dimensions can be put in a homogeneous container,
+such as `std::vector` and `std::list`. You can still
+get tensors of desired dimensions from a `TBlob` object through
+the interface `get_with_shape`.
+- Line 13: Get user input parameters from the node attribute.
+- Lines 15-21: This is the place where the mathematical formula of the operator
+is implemented. The macros `MSHADOW_TYPE_SWITCH` and `MXNET_ASSIGN_REQ_SWITCH` enable
+the code block to work for all the supported data types and `req` types in MXNet.
+Inside the inner-most macro, we launch the kernel for calculating
+the output tensor such that each thread takes an element from
+the input tensor, feeds it into the quadratic function, and assigns
+the output element to the output tensor based on `req` type. Note that
+`Kernel::Launch` serves as a universal interface for launching
+parallel computation on both CPU and GPU. This allows most of
+the simple operators to share the same piece of code for CPU and GPU as
+parallelization approaches are often identical on both types of devices.
+The kernel function is defined as the following, where the function
+`Map` is executed by each thread for each input element. The `out_data.Size()` argument
+in the `Kernel::Launch` call corresponds to the factor by which the
+workload is parallelized among the different threads, which here
+corresponds to the size of the output array. To explain a little
+bit more on the two macros used in the kernel struct: (1) `MSHADOW_XINLINE` is
+a consolidated macro for inlining functions compiled by both CPU and GPU
+compilers. It enables CPU and GPU computing to share the same piece of code.
+(2) `KERNEL_ASSIGN` is a macro for unifying the statements of different `req`s
+into the same line of code. It's named `KERNEL_ASSIGN` because we call
+the code blocks running parallel computation kernels.
+On CPUs, the kernels are normally wrapped by the OpenMP `parallel` directive;
+while on GPUs, they are the kernel functions launched by CUDA library.
+
+```cpp
+template<int req>
+struct quadratic_forward {
+ template<typename DType>
+ MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data,
+ const float a, const float b, const float c) {
+ KERNEL_ASSIGN(out_data[i], req, in_data[i] * (a * in_data[i] + b) + c);
+ }
+};
+```
+
+### Backward Function
+Backward functions play the role of propagating derivatives of loss function
+with respect to the outputs of the last layer throughout the network to the first
+layer. The whole process is often known as backward propagation. We are not
+going to delineate the principle of backward propagation here since users can find
+great details covered in other resources, such as
+[CS231n](http://cs231n.github.io/optimization-2/) and
+[How the backpropagation algorithm works](http://neuralnetworksanddeeplearning.com/chap2.html).
+The problem we are going to solve here for the `quadratic` operator is that
+given a tensor representing the gradient of the loss function with respect
+to the output of the operator, calculate the gradient with respect to
+the input of the operator. There is no need to calculate the derivatives
+of the loss function with respect to the user input parameters `a`, `b`, and `c`
+since they are not learnable parameters in the network. To formulate the problem:
+given `dL/dy` and `y = a*x^2 + b*x + c`, where `L` represents the loss function and
+`y` stands for the output of the quadratic operator, we need to solve for
+`dL/dx`. Using the chain rule, we find that
+```
+dL/dx = dL/dy * dy/dx = dL/dy * (2*a*x + b).
+```
+The above equation indicates that `dL/dx` depends on the gradient
+of the output tensor and value of the input tensor.
+The backward function's signature is the same as the forward function's.
+With the aforementioned information in mind,
+let's break down the following backward function line by line.
+```cpp
+template<typename xpu>                                                        // 1
+void QuadraticOpBackward(const nnvm::NodeAttrs& attrs,                        // 2
+                         const OpContext& ctx,                                // 3
+                         const std::vector<TBlob>& inputs,                    // 4
+                         const std::vector<OpReqType>& req,                   // 5
+                         const std::vector<TBlob>& outputs) {                 // 6
+  CHECK_EQ(inputs.size(), 2U);                                                // 7
+  CHECK_EQ(outputs.size(), 1U);                                               // 8
+  CHECK_EQ(req.size(), 1U);                                                   // 9
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();                            // 10
+  const TBlob& out_grad = inputs[0];                                          // 11
+  const TBlob& in_data = inputs[1];                                           // 12
+  const TBlob& in_grad = outputs[0];                                          // 13
+  const QuadraticParam& param = nnvm::get<QuadraticParam>(attrs.parsed);      // 14
+  using namespace mxnet_op;                                                   // 15
+  MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, {                           // 16
+    MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {                               // 17
+      Kernel<quadratic_backward<req_type>, xpu>::Launch(                      // 18
+          s, in_grad.Size(), in_grad.dptr<DType>(), out_grad.dptr<DType>(),   // 19
+          in_data.dptr<DType>(), param.a, param.b);                           // 20
+    });                                                                       // 21
+  });                                                                         // 22
+}                                                                             // 23
+```
+- Lines 1-6: The backward function has the same signature as the forward function.
+- Lines 7-9: Check the sizes of the function arguments. Note
+that since the gradient of the input depends on both the gradient of the output and
+the input tensor itself, `inputs` must contain two `TBlob` objects.
+- Line 10: Get the `stream` of the context for serializing asynchronous executions.
+- Lines 11-13: Convenience reference variables for later use. We name `out_grad`
+as the gradient of the operator output, `in_data` as the input of the operator,
+and `in_grad` as the gradient of the operator input.
+- Line 14: Get the parameter object of `QuadraticParam`.
+- Lines 16-22: Same as in the forward function, this is where parallel
+computation for `in_grad` happens. The struct `quadratic_backward` implements
+the formula for calculating each element of `in_grad`, one element per thread, as follows.
+
+```cpp
+template<int req>
+struct quadratic_backward {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType* in_grad, const DType* out_grad,
+                                  const DType* in_data, const float a, const float b) {
+    KERNEL_ASSIGN(in_grad[i], req, out_grad[i] * (2 * a * in_data[i] + b));
+  }
+};
+```
+
+### Operator Registration
+So far, we have implemented the necessary data structures and functions for the operator `quadratic`.
+Now let's register them using `nnvm` to expose the operator `quadratic`
+to the frontend. You can think of the registration process as creating an operator
+instance, saving it in the operator manager (a singleton),
+and setting attributes for the operator instance.
+
+The following code is from `quadratic_op.cc`, which is responsible
+for registering the operator working on CPU.
+
+{% raw %}
+
+```cpp
+DMLC_REGISTER_PARAMETER(QuadraticParam);                                // 1
+
+NNVM_REGISTER_OP(quadratic)                                             // 2
+.describe(R"code(This operator implements the quadratic function:      // 3
+.. math::
+
+    f(x) = ax^2+bx+c
+
+where :math:`x` is an input tensor and all operations
+in the function are element-wise.
+
+Example:
+
+  .. code-block:: python
+     :emphasize-lines: 1,3
+     x = [[1, 2], [3, 4]]
+     y = quadratic(data=x, a=1, b=2, c=3)
+     y = [[6, 11], [18, 27]]
+
+)code" ADD_FILELINE)                                                    // 4
+.set_attr_parser(ParamParser<QuadraticParam>)                           // 5
+.set_num_inputs(1)                                                      // 6
+.set_num_outputs(1)                                                     // 7
+.set_attr<nnvm::FListInputNames>("FListInputNames",                     // 8
+  [](const NodeAttrs& attrs) {                                          // 9
+    return std::vector<std::string>{"data"};                            // 10
+  })                                                                    // 11
+.set_attr<mxnet::FInferShape>("FInferShape", QuadraticOpShape)          // 12
+.set_attr<nnvm::FInferType>("FInferType", QuadraticOpType)              // 13
+.set_attr<FCompute>("FCompute<cpu>", QuadraticOpForward<cpu>)           // 14
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_quadratic"})  // 15
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",                       // 16
+  [](const NodeAttrs& attrs) {                                          // 17
+    return std::vector<std::pair<int, int> >{{0, 0}};                   // 18
+  })                                                                    // 19
+.add_argument("data", "NDArray-or-Symbol", "Input ndarray")             // 20
+.add_arguments(QuadraticParam::__FIELDS__());                           // 21
+
+NNVM_REGISTER_OP(_backward_quadratic)                                   // 22
+.set_attr_parser(ParamParser<QuadraticParam>)                           // 23
+.set_num_inputs(2)                                                      // 24
+.set_num_outputs(1)                                                     // 25
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)                       // 26
+.set_attr<FCompute>("FCompute<cpu>", QuadraticOpBackward<cpu>);         // 27
+```
+
+{% endraw %}
+
+- Line 1: Register the parameter struct.
+- Line 2: Register an operator named `quadratic` by creating an instance
+of the `Op` type, saving it in the operator manager, and returning a reference
+to the newly created operator object.
+- Lines 3-4: Add the operator description, including examples, as an operator attribute.
+The documentation engine will extract
+this description and display it on the documentation web page.
+`emphasize-lines` is optional.
+For more examples and troubleshooting with doc strings, refer to the [MXNet
+developer wiki's Documentation Guide](https://cwiki.apache.org/confluence/display/MXNET/Documentation+Guide).
+- Line 5: Set the parameter struct parser for the operator. It is used for parsing
+the parameters `a`, `b`, and `c` passed in from the frontend.
+- Line 6: Set the number of inputs for the operator.
+- Line 7: Set the number of outputs for the operator.
+- Lines 8-11: Define a function that generates a vector of names of
+the operator's input arguments. This function is used to add missing
+arguments that users did not specify when creating a symbolic operator.
+For example, `quad_func=mx.sym.quadratic()` is still a valid symbol
+since we have added the attribute `FListInputNames` to the operator node
+in the computational graph. MXNet would
+add the missing argument with the name `quadratic0_data`, where the prefix
+`quadratic0` is the operator name appended with an index and the postfix
+`data` comes from the return value of the user-defined `FListInputNames` function.
+Users can still generate an executor for `quad_func` as follows:
+```python
+quad_exe = quad_func.simple_bind(ctx=mx.cpu(), quadratic0_data=(1,))
+```
+- Line 12: Register shape inference function.
+- Line 13: Register type inference function.
+- Line 14: Register forward function.
+- Line 15: Register the function for creating the node of the operator in
+a backward pass. Note that we used a convenience functor struct `ElemwiseGradUseIn`.
+As you can tell from the name, the registered functor creates the node for gradient computation
+with dependencies on the output gradient node and input node. Similarly, there are
+three other functors, `ElemwiseGradUseOut`, `ElemwiseGradUseInOut`,
+and `ElemwiseGradUseNone` for developers' convenience. In order to add
+this attribute, we also need to register a backward operator for `quadratic` with
+several basic attributes, as it can share attribute inference
+functions with the forward operator and is not exposed to the frontend.
+- Lines 16-19: This registered function indicates which output tensor can reuse
+which input tensor's memory space instead of allocating a new memory space for the output.
+In the operator `quadratic`, there is only one input and one output, and the output can reuse
+the input memory space, so we store a pair of zeros in the returned vector,
+indicating that `inputs[0]`'s memory space can be reused by `outputs[0]`.
+Note that this function just provides a hint to the computational graph initializer.
+If there are other nodes depending on the input tensor, the memory space
+of the input tensor will not be overwritten by the output.
+- Line 20: Define the input argument name as `data` for the operator.
+- Line 21: Add user input parameters `a`, `b`, and `c` as the attributes of the operator.
+- Line 22: Register an operator named `_backward_quadratic` for the backward pass
+of the operator `quadratic`. The underscore prefix in the operator name indicates
+that this is an operator not exposed to users. The convention
+of naming an internally used backward operator is prepending the prefix `_backward_`
+to the corresponding forward operator name.
+- Line 23: Set the parameter parser for the operator `_backward_quadratic`.
+- Line 24: Set the number of inputs.
+- Line 25: Set the number of outputs.
+- Line 26: Add `TIsBackward` attribute for the operator. The shape and type
+inference passes use this attribute to determine whether a node in the graph is a
+forward or backward node.
+- Line 27: Register backward function.
+
+So far, we have an operator working on CPU that is exposed to the frontend.
+In order to register the operator for GPUs, we just need to add the following
+code to `quadratic_op.cu`. Note that forward and backward functions
+are registered with attribute key `FCompute<gpu>`, rather than `FCompute<cpu>`.
+```cpp
+NNVM_REGISTER_OP(quadratic)
+.set_attr<FCompute>("FCompute<gpu>", QuadraticOpForward<gpu>);
+
+NNVM_REGISTER_OP(_backward_quadratic)
+.set_attr<FCompute>("FCompute<gpu>", QuadraticOpBackward<gpu>);
+```
+
+### Unit Test
+Now we have finished implementing the operator `quadratic` in MXNet backend.
+If you use Python, when you type `import mxnet as mx`, two Python
+functions for invoking your backend implementation are
+generated on the fly: one for imperative programming,
+registered as `mxnet.ndarray.quadratic` (`mx.nd.quadratic` for short);
+the other for symbolic programming,
+registered as `mxnet.symbol.quadratic` (`mx.sym.quadratic` for short).
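+
+For instance, a quick check from the Python frontend might look like the following sketch
+(the exact namespace depends on where you placed the source files; the operator in this
+tutorial's source lives under `src/operator/contrib`, so it appears as `mx.nd.contrib.quadratic`
+and `mx.sym.contrib.quadratic`, as in the test below):
+
+```python
+import mxnet as mx
+
+# imperative (NDArray) call
+x = mx.nd.array([[1, 2], [3, 4]])
+y = mx.nd.contrib.quadratic(x, a=1, b=2, c=3)   # -> [[6, 11], [18, 27]]
+
+# symbolic call
+data = mx.sym.Variable('data')
+quad = mx.sym.contrib.quadratic(data=data, a=1, b=2, c=3)
+```
+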
+
+In order to unit test the operator in the frontend, we need to add the following code
+to the Python file `test_operator.py`. A typical operator implementation
+is tested against both the `symbol` API and the `ndarray` API, and the
+following test covers both. The imperative test exercises the `ndarray` API,
+`mx.nd.contrib.quadratic`. The `symbol` test exercises the complete
+functionality of the operator - the forward pass and the backward
+pass. To facilitate the testing of these functionalities we use three
+helper functions available in the `mxnet.test_utils` module:
+ - `check_symbolic_forward`
+ - `check_symbolic_backward`
+ - `check_numeric_gradient`
+
+```python
+def test_quadratic_function():
+    def f(x, a, b, c):
+        return a * x**2 + b * x + c
+
+    a = np.random.random_sample()
+    b = np.random.random_sample()
+    c = np.random.random_sample()
+    data = mx.symbol.Variable('data')
+    quad_sym = mx.sym.contrib.quadratic(data=data, a=a, b=b, c=c)
+    for dtype in [np.float16, np.float32, np.float64]:
+        for ndim in range(1, 6):
+            shape = rand_shape_nd(ndim, 5)
+            data_np = np.random.randn(*shape).astype(dtype)
+            expected = f(data_np, a, b, c)
+            backward_expected = 2 * a * data_np + b
+
+            # check imperative forward
+            output = mx.nd.contrib.quadratic(mx.nd.array(data_np), a=a, b=b, c=c)
+            assert_almost_equal(output.asnumpy(), expected,
+                                rtol=1e-2 if dtype is np.float16 else 1e-5,
+                                atol=1e-2 if dtype is np.float16 else 1e-5)
+            # check forward
+            check_symbolic_forward(quad_sym, [data_np], [expected],
+                                   rtol=1e-2 if dtype is np.float16 else 1e-5,
+                                   atol=1e-2 if dtype is np.float16 else 1e-5)
+            # check backward
+            check_symbolic_backward(quad_sym, [data_np], [np.ones(expected.shape)],
+                                    [backward_expected],
+                                    rtol=1e-2 if dtype is np.float16 else 1e-5,
+                                    atol=1e-2 if dtype is np.float16 else 1e-5)
+            # check backward using finite difference
+            check_numeric_gradient(quad_sym, [data_np], atol=0.001)
+```
+
+In the above test we create a `quadratic` symbol and feed it into the three
+utility functions. The `check_symbolic_forward` and `check_symbolic_backward`
+functions test the computed values against the expected values that we pass
+as arguments. The `check_numeric_gradient` utility function
+performs [gradient checking](http://ufldl.stanford.edu/tutorial/supervised/DebuggingGradientChecking/)
+to verify the implementation for the backward function of the operator.
+It perturbs the input and estimates the response
+of the output using the
+[finite difference method](https://en.wikipedia.org/wiki/Finite_difference_method),
+then compares the gradient from the backward pass with the finite-difference
+estimate. All three of these tests succeed
+once the comparison satisfies the user-specified `rtol` and `atol` values. Here `rtol`
+and `atol` stand for relative tolerance and absolute tolerance, respectively. They
+specify how far the computed values can deviate from the expected values,
+according to the following rule:
+```
+abs(Expected_Value - Computed_Value) < RTOL * abs(Expected_Value) + ATOL
+```
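+
+The same rule can be written as a small NumPy sketch (for illustration only; this is not the
+actual implementation used in `mxnet.test_utils`):
+
+```python
+import numpy as np
+
+def within_tolerance(expected, computed, rtol=1e-5, atol=1e-5):
+    # mirrors the comparison rule above, element-wise
+    return bool(np.all(np.abs(expected - computed) < rtol * np.abs(expected) + atol))
+
+print(within_tolerance(np.float64(1.5623145), np.float64(1.56231455)))  # True
+print(within_tolerance(np.float64(1.5623145), np.float64(1.5624)))      # False
+```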
+
+For example, if `rtol` is `1e-5` and `atol` is `1e-5` and the expected value is
+`1.5623145`, then the computed value should lie within the range of
+`(1.562288876855, 1.562340123145)` else the test will fail. Make sure you
+tune the `rtol` and `atol` values accordingly. Giving very low values for `rtol`
+and `atol` will likely make the test very flaky. It is recommended that you
+use the flakiness checker tool to check if the test you have written is flaky
+or not. You can run the flakiness checker tool for the above test with the
+following command -
+
+```bash
+python tools/flakiness_checker.py test_operator.test_quadratic_function
+```
+
+Please note that for `check_symbolic_forward` and `check_symbolic_backward` we pass
+both the operator symbols and the expected results for comparison, while for
+`check_numeric_gradient` we only pass the operator symbol, because
+`check_numeric_gradient` computes the expected values using the finite difference
+method. This is why it is highly recommended to add a `check_numeric_gradient`
+test for every operator with a backward function implemented, as it eliminates
+the possibility of passing incorrect expected results into `check_symbolic_backward`.
+
+
+## Summary
+In this tutorial, we practiced implementing the operator `quadratic` in MXNet backend
+and unit testing the implementation in frontend. More specifically, we added parameter
+struct for user-input parameters, walked through shape and type inference workflow,
+implemented forward and backward functions, and registered the operator
+using nnvm. Congratulations! You now know how to add operators.
+We welcome your contributions to MXNet.
+
+**Note**: Source code in the tutorial can be found in
+[quadratic_op-inl.h](https://github.com/apache/incubator-mxnet/blob/master/src/operator/contrib/quadratic_op-inl.h),
+[quadratic_op.cc](https://github.com/apache/incubator-mxnet/blob/master/src/operator/contrib/quadratic_op.cc),
+[quadratic_op.cu](https://github.com/apache/incubator-mxnet/blob/master/src/operator/contrib/quadratic_op.cu),
+and
+[test_operator.py](https://github.com/apache/incubator-mxnet/blob/master/tests/python/unittest/test_operator.py#L6514).
+
+## Additional Resources
+- [Use TensorInspector to Help Debug Operators](tensor_inspector_tutorial.md)
diff --git a/docs/static_site/src/pages/api/faq/caffe.md b/docs/static_site/src/pages/api/faq/caffe.md
new file mode 100644
index 000000000000..62b204e85dde
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/caffe.md
@@ -0,0 +1,208 @@
+---
+layout: page_category
+title: Convert from Caffe to MXNet
+category: faq
+faq_c: Deployment Environments
+question: How to convert a Caffe model to MXNet?
+permalink: /api/faq/caffe
+---
+# How to | Convert from Caffe to MXNet
+
+Key topics covered include the following:
+
+- [Converting Caffe trained models to MXNet](#converting-caffe-trained-models-to-mxnet)
+- [Calling Caffe operators in MXNet](#calling-caffe-operators-in-mxnet)
+
+## Converting Caffe trained models to MXNet
+
+The conversion tools are available at
+[tools/caffe_converter](https://github.com/dmlc/mxnet/tree/master/tools/caffe_converter). In
+the rest of this section, we assume we are in the `tools/caffe_converter`
+directory.
+
+### How to build
+
+If Caffe's Python package is installed, namely if we can run `import caffe` in
+Python, then we are ready to go.
+
+For example, we can use the
+[AWS Deep Learning AMI](https://aws.amazon.com/marketplace/pp/B06VSPXKDX) with
+both Caffe and MXNet installed.
+
+Otherwise, we can install the
+[Google protobuf](https://developers.google.com/protocol-buffers/?hl=en)
+compiler and its Python binding. This is easier to install, but the conversion
+may run more slowly.
+
+1. Install the compiler:
+ - Linux: install `protobuf-compiler` e.g. `sudo apt-get install
+ protobuf-compiler` for Ubuntu and `sudo yum install protobuf-compiler` for
+ Redhat/Fedora.
+ - Windows: Download the win32 build of
+ [protobuf](https://github.com/google/protobuf/releases). Make sure to
+ download the version that corresponds to the version of the python binding
+ on the next step. Extract to any location then add that location to your
+ `PATH`
+ - Mac OS X: `brew install protobuf`
+
+2. Install the python binding by either `conda install -c conda-forge protobuf`
+ or `pip install protobuf`.
+
+3. Compile the Caffe proto definition. Run `make` on Linux or Mac OS X, or
+   `make_win32.bat` on Windows.
+
+### How to use
+
+There are three tools:
+
+- `convert_symbol.py` : convert Caffe model definition in protobuf into MXNet's
+ Symbol in JSON format.
+- `convert_model.py` : convert Caffe model parameters into MXNet's NDArray format
+- `convert_mean.py` : convert Caffe input mean file into MXNet's NDArray format
+
+In addition, there are two tools:
+- `convert_caffe_modelzoo.py` : download and convert models from Caffe model
+ zoo.
+- `test_converter.py` : test the converted models by checking the prediction
+ accuracy.
+
+## Calling Caffe operators in MXNet
+
+Besides converting Caffe models, MXNet supports calling most Caffe operators directly,
+including network layers, data layers, and loss functions. This is
+particularly useful when there are custom operators implemented in Caffe:
+we do not need to re-implement them in MXNet.
+
+### How to install
+
+This feature requires Caffe. In particular, until
+[PR #4527](https://github.com/BVLC/caffe/pull/4527) is merged into Caffe, we need to
+re-compile Caffe with the patch applied. Here are the steps to rebuild Caffe:
+
+1. Download [Caffe](https://github.com/BVLC/caffe). E.g. `git clone
+ https://github.com/BVLC/caffe`
+2. Download the
+ [patch for the MXNet interface](https://github.com/BVLC/caffe/pull/4527.patch)
+ and apply to Caffe. E.g.
+ ```bash
+ cd caffe && wget https://github.com/BVLC/caffe/pull/4527.patch && git apply 4527.patch
+ ```
+3. Build and install Caffe by following the
+ [official guide](http://caffe.berkeleyvision.org/installation.html).
+
+Next, we need to compile MXNet with Caffe support:
+
+1. Copy `make/config.mk` (for Linux) or `make/osx.mk`
+ (for Mac) into the MXNet root folder as `config.mk` if you have not done it yet
+2. Open the copied `config.mk` and uncomment these two lines
+ ```bash
+ CAFFE_PATH = $(HOME)/caffe
+ MXNET_PLUGINS += plugin/caffe/caffe.mk
+ ```
+ Modify `CAFFE_PATH` to your Caffe installation, if necessary.
+3. Then build with 8 threads `make clean && make -j8`.
+
+### How to use
+
+This Caffe plugin adds three components into MXNet:
+
+- `sym.CaffeOp` : Caffe neural network layer
+- `sym.CaffeLoss` : Caffe loss functions
+- `io.CaffeDataIter` : Caffe data layer
+
+#### Use `sym.CaffeOp`
+The following example shows the definition of a 10-class multi-layer perceptron:
+
+```Python
+data = mx.sym.Variable('data')
+fc1 = mx.sym.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }")
+act1 = mx.sym.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}")
+fc2 = mx.sym.CaffeOp(data_0=act1, num_weight=2, name='fc2', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }")
+act2 = mx.sym.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}")
+fc3 = mx.sym.CaffeOp(data_0=act2, num_weight=2, name='fc3', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}")
+mlp = mx.sym.SoftmaxOutput(data=fc3, name='softmax')
+```
+
+Let's break it down. First, `data = mx.sym.Variable('data')` defines a variable
+as a placeholder for input. Then, it's fed through Caffe operators with `fc1 =
+mx.sym.CaffeOp(...)`. `CaffeOp` accepts several arguments:
+
+- The inputs to Caffe operators are named as `data_i` for *i=0, ..., num_data-1*
+- `num_data` is the number of inputs. By default it is 1, and it is therefore
+omitted in the above example.
+- `num_out` is the number of outputs. By default it is 1 and also omitted.
+- `num_weight` is the number of weights (`blobs_`). Its default value is 0. We
+need to specify it explicitly when it is non-zero.
+- `prototxt` is the protobuf configuration string.
+
+#### Use `sym.CaffeLoss`
+
+Using a Caffe loss is similar: we can replace the MXNet loss with a Caffe loss.
+Replacing the last line of the above example with the following two lines, we
+call the Caffe loss instead of the MXNet loss.
+
+```Python
+label = mx.sym.Variable('softmax_label')
+mlp = mx.sym.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}")
+```
+
+Similar to `CaffeOp`, `CaffeLoss` has the arguments `num_data` (2 by default) and
+`num_out` (1 by default). But there are two differences:
+
+1. The inputs are `data` and `label`, and we need to explicitly create a variable
+   placeholder for the label, which is done implicitly in the MXNet loss.
+2. `grad_scale` is the weight of this loss.
+
+#### Use `io.CaffeDataIter`
+
+We can also wrap a Caffe data layer into MXNet's data iterator. Below is an
+example of creating a data iterator for MNIST:
+
+```python
+train = mx.io.CaffeDataIter(
+    prototxt =
+    'layer { \
+        name: "mnist" \
+        type: "Data" \
+        top: "data" \
+        top: "label" \
+        include { \
+            phase: TEST \
+        } \
+        transform_param { \
+            scale: 0.00390625 \
+        } \
+        data_param { \
+            source: "caffe/examples/mnist/mnist_test_lmdb" \
+            batch_size: 100 \
+            backend: LMDB \
+        } \
+    }',
+    flat = flat,
+    num_examples = 60000,
+)
+```
+
+### Put it all together
+
+The complete example is available at
+[example/caffe](https://github.com/dmlc/mxnet/blob/master/example/caffe/)
diff --git a/docs/static_site/src/pages/api/faq/cloud.md b/docs/static_site/src/pages/api/faq/cloud.md
new file mode 100644
index 000000000000..1782b3fc33a7
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/cloud.md
@@ -0,0 +1,208 @@
+---
+layout: page_category
+title: MXNet on the Cloud
+category: faq
+faq_c: Deployment Environments
+question: How to run MXNet on AWS?
+permalink: /api/faq/cloud
+---
+
+# MXNet on the Cloud
+
+Deep learning can require extremely powerful hardware, often for unpredictable durations of time.
+Moreover, _MXNet_ can benefit from both multiple GPUs and multiple machines.
+Accordingly, cloud computing, as offered by AWS and others,
+is especially well suited to training deep learning models.
+Using AWS, we can rapidly fire up multiple machines with multiple GPUs each at will
+and maintain the resources for precisely the amount of time needed.
+
+## Set Up an AWS GPU Cluster from Scratch
+
+In this document, we provide a step-by-step guide that will teach you
+how to set up an AWS cluster with _MXNet_. We show how to:
+
+- [Use Amazon S3 to host data](#use-amazon-s3-to-host-data)
+- [Set up an EC2 GPU instance with all dependencies installed](#set-up-an-ec2-gpu-instance)
+- [Build and run MXNet on a single computer](#build-and-run-mxnet-on-a-gpu-instance)
+- [Set up an EC2 GPU cluster for distributed training](#set-up-an-ec2-gpu-cluster-for-distributed-training)
+
+### Use Amazon S3 to Host Data
+
+Amazon S3 provides distributed data storage which proves especially convenient for hosting large datasets.
+To use S3, you need [AWS credentials](http://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSGettingStartedGuide/AWSCredentials.html),
+including an `ACCESS_KEY_ID` and a `SECRET_ACCESS_KEY`.
+
+To use _MXNet_ with S3, set the environment variables `AWS_ACCESS_KEY_ID` and
+`AWS_SECRET_ACCESS_KEY` by adding the following two lines in
+`~/.bashrc` (replacing the strings with the correct ones):
+
+```bash
+export AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE
+export AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
+```
+
+There are several ways to upload data to S3. One simple way is to use
+[s3cmd](http://s3tools.org/s3cmd). For example:
+
+```bash
+wget http://data.mxnet.io/mxnet/data/mnist.zip
+unzip mnist.zip && s3cmd put t*-ubyte s3://dmlc/mnist/
+```
+
+### Use Pre-installed EC2 GPU Instance
+The [Deep Learning AMI](https://aws.amazon.com/marketplace/pp/B01M0AXXQB?qid=1475211685369&sr=0-1&ref_=srh_res_product_title) is an Amazon Linux image
+supported and maintained by Amazon Web Services for use on Amazon Elastic Compute Cloud (Amazon EC2).
+It contains [MXNet-v0.9.3 tag](https://github.com/dmlc/mxnet) and the necessary components to get going with deep learning,
+including Nvidia drivers, CUDA, cuDNN, Anaconda, Python2 and Python3.
+The AMI IDs are the following:
+
+* us-east-1: ami-e7c96af1
+* us-west-2: ami-dfb13ebf
+* eu-west-1: ami-6e5d6808
+
+Now you can launch _MXNet_ directly on an EC2 GPU instance.
+You can also use a [Jupyter](http://jupyter.org) notebook on the EC2 machine.
+Here is a [good tutorial](https://github.com/dmlc/mxnet-notebooks)
+on how to connect to a Jupyter notebook running on an EC2 instance.
+
+### Set Up an EC2 GPU Instance from Scratch
+
+_MXNet_ requires the following libraries:
+
+- C++ compiler with C++11 support, such as `gcc >= 4.8`
+- `CUDA` (`CUDNN` is optional) for GPU linear algebra
+- `BLAS` (cblas, open-blas, atlas, mkl, or others) for CPU linear algebra
+- `opencv` for image augmentations
+- `curl` and `openssl` for the ability to read/write to Amazon S3
+
+Installing `CUDA` on EC2 instances requires some effort. Caffe has a good
+[tutorial](https://github.com/BVLC/caffe/wiki/Install-Caffe-on-EC2-from-scratch-(Ubuntu,-CUDA-7,-cuDNN-3))
+on how to install CUDA 7.0 on Ubuntu 14.04.
+
+***Note:*** We tried CUDA 7.5 on Nov 7, 2015, but found it problematic.
+
+You can install the rest using the package manager. For example, on Ubuntu:
+
+```
+sudo apt-get update
+sudo apt-get install -y build-essential git libcurl4-openssl-dev libatlas-base-dev libopencv-dev python-numpy
+```
+
+The Amazon Machine Image (AMI) [ami-12fd8178](https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#LaunchInstanceWizard:ami=ami-12fd8178) has the packages listed above installed.
+
+
+### Build and Run MXNet on a GPU Instance
+
+The following commands build _MXNet_ with CUDA/CUDNN, Amazon S3, and distributed
+training.
+
+```bash
+git clone --recursive https://github.com/dmlc/mxnet
+cd mxnet; cp make/config.mk .
+echo "USE_CUDA=1" >>config.mk
+echo "USE_CUDA_PATH=/usr/local/cuda" >>config.mk
+echo "USE_CUDNN=1" >>config.mk
+echo "USE_BLAS=atlas" >> config.mk
+echo "USE_DIST_KVSTORE = 1" >>config.mk
+echo "USE_S3=1" >>config.mk
+make -j$(nproc)
+```
+
+To test whether everything is installed properly, we can try training a convolutional neural network (CNN) on the MNIST dataset using a GPU:
+
+```bash
+python example/image-classification/train_mnist.py
+```
+
+If you've placed the MNIST data on `s3://dmlc/mnist`, you can make the training script read the data directly from Amazon S3 with the following command:
+
+```bash
+sed -i.bak "s!data_dir = 'data'!data_dir = 's3://dmlc/mnist'!" example/image-classification/train_mnist.py
+```
+
+***Note:*** You can use `sudo ln /dev/null /dev/raw1394` to fix the opencv error `libdc1394 error: Failed to initialize libdc1394`.
+
+### Set Up an EC2 GPU Cluster for Distributed Training
+
+A cluster consists of multiple computers.
+You can use one computer with _MXNet_ installed as the root computer for submitting jobs, and then launch several
+slave computers to run the jobs. For example, launch multiple instances using an
+AMI, e.g.,
+[ami-12fd8178](https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#LaunchInstanceWizard:ami=ami-12fd8178),
+with dependencies installed. There are two options:
+
+- Make all slaves' ports accessible (same for the root) by setting type: All TCP,
+ Source: Anywhere in Configure Security Group.
+
+- Use the same `pem` as the root computer to access all slave computers, and
+ then copy the `pem` file into the root computer's `~/.ssh/id_rsa`. If you do this, all slave computers can be accessed with SSH from the root.
+
+Now, run the CNN on multiple computers. Assume that we are in a working
+directory on the root computer, such as `~/train`, and that MXNet is built under `~/mxnet`.
+
+1. Pack the _MXNet_ Python library into this working directory for easy
+ synchronization:
+
+ ```bash
+ cp -r ~/mxnet/python/mxnet .
+ cp ~/mxnet/lib/libmxnet.so mxnet/
+ ```
+
+ And then copy the training program:
+
+ ```bash
+ cp ~/mxnet/example/image-classification/*.py .
+ cp -r ~/mxnet/example/image-classification/common .
+ ```
+
+2. Prepare a host file with all slaves' private IPs. For example, `cat hosts`:
+
+ ```bash
+ 172.30.0.172
+ 172.30.0.171
+ ```
+
+3. Assuming that there are two computers, train the CNN using two workers:
+
+ ```bash
+ ../../tools/launch.py -n 2 -H hosts --sync-dir /tmp/mxnet python train_mnist.py --kv-store dist_sync
+ ```
+
+***Note:*** Sometimes the jobs linger at the slave computers even though you've pressed `Ctrl-c`
+at the root node. To terminate them, use the following command:
+
+```bash
+cat hosts | xargs -I{} ssh -o StrictHostKeyChecking=no {} 'uname -a; pgrep python | xargs kill -9'
+```
+
+***Note:*** The preceding example is very simple to train and therefore isn't a good
+benchmark for distributed training. Consider using other [examples](https://github.com/dmlc/mxnet/tree/master/example/image-classification).
+
+### More Options
+#### Use Multiple Data Shards
+It is common to pack a dataset into multiple files, especially when working in a distributed environment.
+_MXNet_ supports direct loading from multiple data shards.
+Put all of the record files into a folder, and point the data path to the folder.
+
+#### Use YARN and SGE
+Although using SSH can be simple when you don't have a cluster scheduling framework,
+_MXNet_ is designed to be portable to various platforms.
+We provide scripts available in [tracker](https://github.com/dmlc/dmlc-core/tree/master/tracker)
+to allow running on other cluster frameworks, including Hadoop (YARN) and SGE.
+We welcome contributions from the community with examples of running _MXNet_ on your favorite distributed platform.
diff --git a/docs/static_site/src/pages/api/faq/distributed_training.md b/docs/static_site/src/pages/api/faq/distributed_training.md
new file mode 100644
index 000000000000..85e04d69cf30
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/distributed_training.md
@@ -0,0 +1,332 @@
+---
+layout: page_category
+title: Distributed Training in MXNet
+category: faq
+faq_c: Deployment Environments
+question: How to do distributed training using MXNet on AWS?
+permalink: /api/faq/distributed_training
+---
+
+# Distributed Training in MXNet
+MXNet supports distributed training, enabling us to leverage multiple machines for faster training.
+In this document, we describe how it works, how to launch a distributed training job, and
+some environment variables which provide more control.
+
+## Types of Parallelism
+There are two ways in which we can distribute the workload of training a neural network across multiple devices (either GPUs or CPUs).
+The first way is *data parallelism*, which refers to the case where each device stores a complete copy of the model.
+Each device works with a different part of the dataset, and the devices collectively update a shared model.
+These devices can be located on a single machine or across multiple machines.
+In this document, we describe how to train a model with devices distributed across machines in a data parallel way.
+
+When models are so large that they don't fit into device memory, then a second way called *model parallelism* is useful.
+Here, different devices are assigned the task of learning different parts of the model.
+Currently, MXNet supports model parallelism on a single machine only. Refer to [Training with multiple GPUs using model parallelism](https://mxnet.incubator.apache.org/versions/master/faq/model_parallel_lstm.html) for more on this.
+
+## How Does Distributed Training Work?
+The following concepts are key to understanding distributed training in MXNet:
+### Types of Processes
+MXNet has three types of processes which communicate with each other to accomplish training of a model.
+- Worker: A worker node actually performs training on a batch of training samples.
+Before processing each batch, the workers pull weights from servers.
+The workers also send gradients to the servers after each batch.
+Depending on the workload for training a model, it might not be a good idea to run multiple worker processes on the same machine.
+- Server: There can be multiple servers which store the model's parameters, and communicate with workers.
+A server may or may not be co-located with the worker processes.
+- Scheduler: There is only one scheduler. The role of the scheduler is to set up the cluster. This includes waiting for messages that each node has come up and which port the node is listening on.
+The scheduler then lets all processes know about every other node in the cluster, so that they can communicate with each other.
+
+### KV Store
+MXNet provides a key-value store, which is a critical component used for multi-device training. The communication of parameters across devices on a single machine, as well as across multiple machines, is relayed through one or more servers with a key-value store for the parameters. Each parameter array in the network is assigned a key in this store, and the corresponding value holds the weights of that parameter array. Workers `push` gradients after processing a batch, and `pull` updated weights before processing a new batch.
+We can also pass in optimizers for the KVStore to use while updating each weight. Optimizers like Stochastic Gradient Descent define an update rule,
+essentially a mathematical formula to compute the new weight based on the old weight, gradient, and some parameters.
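+
+As a minimal illustration of the push/pull pattern (using a local kvstore here for simplicity;
+a distributed job would create a `dist_*` kvstore instead):
+
+```python
+import mxnet as mx
+
+kv = mx.kv.create('local')           # 'dist_sync' or 'dist_async' in a distributed job
+shape = (2, 3)
+kv.init(3, mx.nd.ones(shape))        # key 3 holds one parameter array
+kv.push(3, mx.nd.ones(shape) * 2)    # workers push gradients (or new values)
+out = mx.nd.zeros(shape)
+kv.pull(3, out=out)                  # workers pull the current value
+print(out.asnumpy())
+```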
+
+If you are using a Gluon Trainer object or the Module API,
+it uses a kvstore object internally to aggregate gradients from multiple devices on the same machine as well as across different machines.
+
+Although the API remains the same whether or not multiple machines are being used,
+the notion of kvstore server exists only during distributed training.
+In this case, each `push` and `pull` involves communication with the kvstore servers. When there are multiple devices on a single machine, gradients from these devices are first aggregated on the machine and then sent to the servers.
+Note that we need to compile MXNet with the build flag `USE_DIST_KVSTORE=1` to use distributed training.
+
+The distributed mode of KVStore is enabled by calling the `mxnet.kvstore.create` function
+with a string argument that contains the word `dist`, as follows:
+> kv = mxnet.kvstore.create('dist_sync')
+
+Refer to [KVStore API](https://mxnet.incubator.apache.org/versions/master/api/python/kvstore/kvstore.html) for more information about KVStore.
+
+### Distribution of Keys
+Each server doesn't necessarily store all the keys or parameter arrays.
+Parameters are distributed across different servers. The decision of which server stores a particular key is made at random.
+This distribution of keys across different servers is handled transparently by the KVStore.
+It ensures that when a key is pulled, that request is sent to the server which has the corresponding value.
+If the value of some key is very large, it may be sharded across different servers. This means that different servers hold different parts of the value.
+Again, this is handled transparently so that the worker does not have to do anything different.
+The threshold for this sharding can be controlled with the environment variable `MXNET_KVSTORE_BIGARRAY_BOUND`.
+See [environment variables](#environment-variables) for more details.
+
+### Split training data
+When running distributed training in data parallel mode, we want each machine to be working on different parts of the dataset.
+
+For data parallel training on a single worker,
+we can use `mxnet.gluon.utils.split_and_load` to split a batch of samples provided by the data iterator, and then load each part of the batch on the device which will process it.
+
+In the case of distributed training though, we would need to divide the dataset into `n` parts at the beginning, so that each worker gets a different part. Each worker can then use `split_and_load` to again divide that part of the dataset across different devices on a single machine.
+
+Typically, this split of data for each worker happens through the data iterator,
+by passing the number of parts and the index of the part to iterate over.
+Some iterators in MXNet that support this feature are [mxnet.io.MNISTIter](https://mxnet.incubator.apache.org/versions/master/api/python/io/io.html#mxnet.io.MNISTIter) and [mxnet.io.ImageRecordIter](https://mxnet.incubator.apache.org/versions/master/api/python/io/io.html#mxnet.io.ImageRecordIter).
+If you are using a different iterator, you can look at how the above iterators implement this.
+We can use the kvstore object to get the number of workers (`kv.num_workers`) and rank of the current worker (`kv.rank`).
+These can be passed as arguments to the iterator.
+You can look at [example/gluon/image_classification.py](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py)
+to see an example usage.
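+
+As a minimal sketch, assuming an `ImageRecordIter`-style iterator (the record file path below is
+just a placeholder):
+
+```python
+import mxnet as mx
+
+kv = mx.kv.create('dist_sync')        # created once per worker in a launched distributed job
+
+# each worker reads only its own part of the dataset
+train_iter = mx.io.ImageRecordIter(
+    path_imgrec='data/train.rec',     # placeholder path to a RecordIO file
+    data_shape=(3, 224, 224),
+    batch_size=128,
+    num_parts=kv.num_workers,         # total number of workers
+    part_index=kv.rank)               # index of the shard this worker reads
+```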
+
+### Updating weights
+The KVStore server supports two modes: one in which it aggregates the gradients and updates the weights using those gradients, and a second in which the server only aggregates gradients. In the latter case, when a worker process pulls from the kvstore, it gets the aggregated gradients. The worker then uses these gradients to update the weights locally.
+
+When using Gluon there is an option to choose between these modes by passing `update_on_kvstore` variable when you create the [Trainer](https://mxnet.incubator.apache.org/versions/master/api/python/gluon/gluon.html#mxnet.gluon.Trainer) object like this:
+
+```
+trainer = gluon.Trainer(net.collect_params(), optimizer='sgd',
+                        optimizer_params={'learning_rate': opt.lr,
+                                          'wd': opt.wd,
+                                          'momentum': opt.momentum,
+                                          'multi_precision': True},
+                        kvstore=kv,
+                        update_on_kvstore=True)
+```
+
+When using the symbolic interface, it performs the weight updates on the server without the user having to do anything special.
+
+### Different Modes of Distributed Training
+Distributed training itself is enabled when the kvstore creation string contains the word `dist`.
+
+Different modes of distributed training can be enabled by using different types of kvstore.
+
+- `dist_sync`: In synchronous distributed training, all workers use the same synchronized set of model parameters at the start of every batch.
+This means that after each batch, the server waits to receive gradients from each worker before it updates the model parameters.
+This synchronization comes at a cost because the worker pulling parameters would have to wait till the server finishes this process.
+In this mode, if a worker crashes, then it halts the progress of all workers.
+
+- `dist_async`: In asynchronous distributed training, the server receives gradients from one worker and immediately updates its store, which it uses to respond to any future pulls.
+This means that a worker who finishes processing a batch can pull the current parameters from server and start the next batch,
+even if other workers haven't finished processing the earlier batch.
+This is faster than `dist_sync` because there is no cost of synchronization, but can take more epochs to converge.
+The update of weights is atomic, meaning no two updates happen on the same weight at the same time. However, the order of updates is not guaranteed.
+In `async` mode, it is required to pass an optimizer because in the absence of an optimizer kvstore would replace the stored weights with received weights and this doesn't make sense for training in asynchronous mode. Hence, when using Gluon with `async` mode we need to set `update_on_kvstore` to `True`.
+
+- `dist_sync_device`: Same as `dist_sync` except that when there are multiple GPUs being used on each node,
+this mode aggregates gradients and updates weights on GPU while dist_sync does so on CPU memory.
+This is faster than `dist_sync` because it reduces expensive communication between GPU and CPU, but it increases memory usage on GPU.
+
+- `dist_async_device` : The analogue of `dist_sync_device` but in asynchronous mode.
+
+
+### Gradient Compression
+When communication is expensive, and the ratio of computation time to communication time is low, communication can become a bottleneck.
+In such cases, gradient compression can be used to reduce the cost of communication, thereby speeding up training.
+Refer to [Gradient compression](https://mxnet.incubator.apache.org/versions/master/faq/gradient_compression.html) for more details.
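+
+For example, with the Gluon `Trainer`, 2-bit gradient compression can be enabled roughly as in
+the following sketch (the network and the threshold value are illustrative):
+
+```python
+import mxnet as mx
+from mxnet import gluon
+
+net = gluon.nn.Dense(10)
+net.initialize()
+
+kv = mx.kv.create('dist_sync')        # inside a job launched with launch.py
+trainer = gluon.Trainer(net.collect_params(), 'sgd',
+                        {'learning_rate': 0.1},
+                        kvstore=kv,
+                        compression_params={'type': '2bit', 'threshold': 0.5})
+```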
+
+Note: For small models when the cost of computation is much lower than cost of communication,
+distributed training might actually be slower than training on a single machine because of the overhead of communication and synchronization.
+
+## How to Start Distributed Training?
+MXNet provides a script tools/launch.py to make it easy to launch a distributed training job. This supports various types of cluster resource managers like `ssh`, `mpirun`, `yarn` and `sge`.
+If you already have one of these clusters set up, you can skip the next section on setting up a cluster.
+If you want to use a type of cluster not mentioned above, skip ahead to the section on manually launching jobs.
+
+### Setting up the Cluster
+An easy way to set up a cluster of EC2 instances for distributed deep learning is by using the [AWS CloudFormation template](https://github.com/awslabs/deeplearning-cfn).
+If you cannot use the above, this section will help you manually set up a cluster of instances
+to enable you to use `ssh` for launching a distributed training job.
+Let us denote one machine as the `master` of the cluster through which we will launch and monitor the distributed training on all machines.
+
+If the machines in your cluster are a part of a cloud computing platform like AWS EC2, then your instances should be using key-based authentication already.
+Ensure that you create all instances using the same key, say `mxnet-key` and in the same security group.
+Next, we need to ensure that master has access to all other machines in the cluster through `ssh` by
+adding this key to [ssh-agent](https://en.wikipedia.org/wiki/Ssh-agent) and forwarding it to master when we log in. This will make `mxnet-key` the default key on master.
+
+```
+ssh-add .ssh/mxnet-key
+ssh -A user@MASTER_IP_ADDRESS
+```
+
+
+If your machines use passwords for authentication, see [here](https://help.ubuntu.com/community/SSH/OpenSSH/Keys) for instructions on setting up password-less authentication between machines.
+
+
+It is easier if all these machines have a shared file system so that they can access the training script. One way is to use [Amazon Elastic File System](https://aws.amazon.com/efs) to create your network file system.
+The options in the following command are the recommended options when mounting an AWS Elastic File System.
+
+```
+sudo mkdir efs && sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 NETWORK_FILE_SYSTEM_IP:/ efs
+```
+
+Tip: You might find it helpful to store large datasets on S3 for easy access from all machines in the cluster. Refer to [Using data from S3 for training](https://mxnet.incubator.apache.org/versions/master/faq/s3_integration.html) for more information.
+
+### Using Launch.py
+MXNet provides a script [tools/launch.py](https://github.com/apache/incubator-mxnet/blob/master/tools/launch.py) to make it easy to launch distributed training on a cluster with `ssh`, `mpi`, `sge` or `yarn`.
+You can fetch this script by cloning the mxnet repository.
+
+```
+git clone --recursive https://github.com/apache/incubator-mxnet
+```
+
+#### Example
+Let us consider training a VGG11 model on the CIFAR10 dataset using [example/gluon/image_classification.py](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py).
+```
+cd example/gluon/
+```
+On a single machine, we can run this script as follows:
+```
+python image_classification.py --dataset cifar10 --model vgg11 --epochs 1
+```
+
+For distributed training of this example, we would do the following:
+
+If the mxnet directory which contains the script `image_classification.py` is accessible to all machines in the cluster (for example if they are on a network file system), we can run:
+```
+../../tools/launch.py -n 3 -H hosts --launcher ssh python image_classification.py --dataset cifar10 --model vgg11 --epochs 1 --kvstore dist_sync
+```
+
+If the directory with the script is not accessible from the other machines in the cluster, then we can synchronize the current directory to all machines.
+```
+../../tools/launch.py -n 3 -H hosts --launcher ssh --sync-dst-dir /tmp/mxnet_job/ python image_classification.py --dataset cifar10 --model vgg11 --epochs 1 --kvstore dist_sync
+```
+
+> Tip: If you don't have a cluster ready and still want to try this out, pass the option `--launcher local` instead of `ssh`
+
+#### Options
+Here, launch.py is used to submit the distributed training job. It takes the following options:
+- `-n` denotes the number of worker nodes to be launched.
+- `-s` denotes the number of server nodes to be launched.
+If it is not specified, it is taken to be equal to the number of worker nodes.
+The script tries to cycle through the hosts file to launch the servers and workers.
+For example, if you have 5 hosts in the hosts file and you pass `n` as 3 (and nothing for `s`),
+the script will launch a total of 3 server processes,
+one each on the first three hosts, and a total of 3 worker processes, one each on the fourth, fifth, and first host.
+If the hosts file has exactly `n` number of worker nodes, it will launch a server process and a worker process on each of the `n` hosts.
+- `--launcher` denotes the mode of communication. The options are:
+ - `ssh` if machines can communicate through ssh without passwords. This is the default launcher mode.
+ - `mpi` if Open MPI is available
+ - `sge` for Sun Grid Engine
+ - `yarn` for Apache Yarn
+ - `local` for launching all processes on the same local machine. This can be used for debugging purposes.
+- `-H` requires the path of the hosts file.
+ This file contains the IPs of the machines in the cluster. These machines should be able to communicate with each other without using passwords.
+ This file is only applicable and required when the launcher mode is `ssh` or `mpi`.
+ An example of the contents of the hosts file would be:
+ ```
+ 172.30.0.172
+ 172.31.0.173
+ 172.30.1.174
+ ```
+- `--sync-dst-dir` takes the path of a directory on all hosts to which the current working directory will be synchronized. This only supports `ssh` launcher mode.
+This is necessary when the working directory is not accessible to all machines in the cluster. Setting this option synchronizes the current directory using rsync before the job is launched.
+If you have not installed MXNet system-wide
+then you have to copy the folder `python/mxnet` and the file `lib/libmxnet.so` into the current directory before running `launch.py`.
+For example if you are in `example/gluon`, you can do this with `cp -r ../../python/mxnet ../../lib/libmxnet.so .`. This would work if your `lib` folder contains `libmxnet.so`, as would be the case when you use make. If you use CMake, this file would be in your `build` directory.
+
+- `python image_classification.py --dataset cifar10 --model vgg11 --epochs 1 --kvstore dist_sync`
+is the command for the training job on each machine. Note the use of `dist_sync` for the kvstore used in the script.
+
+#### Terminating Jobs
+If the training job crashes due to an error or if we try to terminate the launch script while training is running,
+jobs on all machines might not have terminated. In such a case, we would need to terminate them manually.
+If we are using `ssh` launcher, this can be done by running the following command where `hosts` is the path of the hostfile.
+```
+while read -u 10 host; do ssh -o "StrictHostKeyChecking no" $host "pkill -f python" ; done 10<hosts
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Environment Variables
+category: faq
+faq_c: Deployment Environments
+question: What are MXNet environment variables?
+permalink: /api/faq/env_var
+---
+
+Environment Variables
+=====================
+MXNet has several settings that you can change with environment variables.
+Typically, you wouldn't need to change these settings, but they are listed here for reference.
+
+For example, you can set these environment variables in Linux or macOS as follows:
+```
+export MXNET_GPU_WORKER_NTHREADS=3
+```
+
+Or in PowerShell:
+```
+$env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+```
+
+## Variables controlling the execution environment
+
+* MXNET_LIBRARY_PATH
+ Absolute path indicating where the MXNet dynamic library is expected to be located. This would be the absolute
+ path to `libmxnet.so` or `libmxnet.dll`, depending on the platform. The logic for loading the
+ library is in `python/mxnet/libinfo.py`.
+
+## Set the Number of Threads
+
+* MXNET_GPU_WORKER_NTHREADS
+ - Values: Int ```(default=2)```
+ - The maximum number of threads to use on each GPU. This parameter is used to parallelize the computation within a single GPU card.
+* MXNET_GPU_COPY_NTHREADS
+ - Values: Int ```(default=2)```
+ - The maximum number of concurrent threads that do the memory copy job on each GPU.
+* MXNET_CPU_WORKER_NTHREADS
+ - Values: Int ```(default=1)```
+ - The maximum number of scheduling threads on CPU. It specifies how many operators can be run in parallel. Note that most CPU operators are parallelized by OpenMP. To change the number of threads used by individual operators, please set `OMP_NUM_THREADS` instead.
+* MXNET_CPU_PRIORITY_NTHREADS
+ - Values: Int ```(default=4)```
+ - The number of threads given to prioritized CPU jobs.
+* MXNET_CPU_NNPACK_NTHREADS
+ - Values: Int ```(default=4)```
+ - The number of threads used for NNPACK. NNPACK package aims to provide high-performance implementations of some layers for multi-core CPUs. Checkout [NNPACK](http://mxnet.io/faq/nnpack.html) to know more about it.
+* MXNET_MP_WORKER_NTHREADS
+ - Values: Int ```(default=1)```
+ - The number of scheduling threads on CPU given to multiprocess workers. Enlarging this number allows more operators to run in parallel in individual workers, but please consider reducing the overall `num_workers` to avoid thread contention (not available on Windows).
+* MXNET_MP_OPENCV_NUM_THREADS
+ - Values: Int ```(default=0)```
+ - The number of OpenCV execution threads given to multiprocess workers. OpenCV multithreading is disabled if `MXNET_MP_OPENCV_NUM_THREADS` < 1 (default). Enlarging this number may boost the performance of individual workers when executing underlying OpenCV functions, but please consider reducing the overall `num_workers` to avoid thread contention (not available on Windows).
+
+## Memory Options
+
+* MXNET_EXEC_ENABLE_INPLACE
+ - Values: true or false ```(default=true)```
+ - Whether to enable in-place optimization in symbolic execution. Checkout [in-place optimization](http://mxnet.io/architecture/note_memory.html#in-place-operations) to know more about it.
+* NNVM_EXEC_MATCH_RANGE
+ - Values: Int ```(default=16)```
+ - The approximate matching scale in the symbolic execution memory allocator.
+ - Set this to 0 if you don't want to enable memory sharing between graph nodes (for debugging purposes).
+ - This variable affects the result of memory planning, so MXNet sweeps the range [1, NNVM_EXEC_MATCH_RANGE] and selects the best value.
+* MXNET_EXEC_NUM_TEMP
+ - Values: Int ```(default=1)```
+ - The maximum number of temporary workspaces to allocate to each device. This controls space replicas and in turn reduces the memory usage.
+ - Setting this to a small number can save GPU memory. It will also likely decrease the level of parallelism, which is usually acceptable.
+ - MXNet internally uses graph coloring algorithm to [optimize memory consumption](http://mxnet.io/architecture/note_memory.html).
+ - This parameter is also used to get number of matching colors in graph and in turn how much parallelism one can get in each GPU. Color based match usually costs more memory but also enables more parallelism.
+* MXNET_GPU_MEM_POOL_RESERVE
+ - Values: Int ```(default=5)```
+ - The percentage of GPU memory to reserve for things other than the GPU array, such as kernel launch or cudnn handle space.
+ - If you see a strange out-of-memory error from the kernel launch, after multiple iterations, try setting this to a larger value.
+
+* MXNET_GPU_MEM_POOL_TYPE
+ - Values: String ```(default=Naive)```
+ - The type of memory pool.
+ - Choices:
+ - Naive: A simple memory pool that allocates memory of the exact requested size and caches memory buffers. If a buffered memory chunk matches the size of a new request, the chunk from the memory pool will be returned and reused.
+ - Round: A memory pool that always rounds the requested memory size and allocates memory of the rounded size. MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF defines how to round up a memory size. Caching and allocating buffered memory works in the same way as the naive memory pool.
+ - Unpooled: No memory pool is used.
+
+* MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF
+ - Values: Int ```(default=24)```
+ - The cutoff threshold that decides the rounding strategy. Let's denote the threshold as T. If the memory size is smaller than `2 ** T` (by default, it's 2 ** 24 = 16MB), it rounds to the smallest `2 ** n` that is larger than the requested memory size; if the memory size is larger than `2 ** T`, it rounds up to the next multiple of `2 ** T` (see the sketch after this list).
+
+* MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE
+ - Values: Int ```(default=2097152)```
+ - When using the naive pool type, memory allocations larger than this threshold are rounded up to a multiple of this value.
+ - The default was chosen to minimize global memory fragmentation within the GPU driver. Set this to 1 to disable.
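+
+To make the rounding rule for `MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF` concrete, here is a small
+illustrative sketch (plain Python, not MXNet code) of how a requested allocation size would be rounded:
+
+```python
+def rounded_alloc_size(nbytes, T=24):
+    # illustrative rounding rule described above, with cutoff 2**T (16 MB for T=24)
+    cutoff = 1 << T
+    if nbytes < cutoff:
+        size = 1
+        while size < nbytes:
+            size <<= 1                                      # smallest power of two >= nbytes
+        return size
+    return ((nbytes + cutoff - 1) // cutoff) * cutoff       # next multiple of 2**T
+
+print(rounded_alloc_size(5_000_000))    # -> 8388608  (2**23)
+print(rounded_alloc_size(20_000_000))   # -> 33554432 (2 * 2**24)
+```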
+
+## Engine Type
+
+* MXNET_ENGINE_TYPE
+ - Values: String ```(default=ThreadedEnginePerDevice)```
+ - The type of underlying execution engine of MXNet.
+ - Choices:
+ - NaiveEngine: A very simple engine that uses the master thread to do the computation synchronously. Setting this engine disables multi-threading. You can use this type for debugging in case of any error; a backtrace will then give you the series of calls that led to the error (see the example after this list). Remember to set MXNET_ENGINE_TYPE back to empty after debugging.
+ - ThreadedEngine: A threaded engine that uses a global thread pool to schedule jobs.
+ - ThreadedEnginePerDevice: A threaded engine that allocates a thread per GPU and executes jobs asynchronously.
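+
+For example, to debug with the synchronous engine, the variable can be set from Python before
+`mxnet` is imported (a sketch; the variable must be set before the library creates its engine):
+
+```python
+import os
+
+# must be set before importing mxnet so the engine type is picked up at engine creation
+os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
+
+import mxnet as mx
+print((mx.nd.ones((2, 2)) + 1).asnumpy())   # executed synchronously on the NaiveEngine
+```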
+
+## Execution Options
+
+* MXNET_EXEC_BULK_EXEC_INFERENCE
+ - Values: 0(false) or 1(true) ```(default=1)```
+ - If set to `1`, during inference MXNet executes the entire computation graph in bulk mode, which reduces kernel launch gaps in between symbolic operators.
+* MXNET_EXEC_BULK_EXEC_TRAIN
+ - Values: 0(false) or 1(true) ```(default=1)```
+ - If set to `1`, during training MXNet executes the computation graph as several subgraphs in bulk mode.
+* MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN
+ - Values: Int ```(default=15)```
+ - The maximum number of nodes in the subgraph executed in bulk during training (not inference). Setting this to a larger number may reduce the degree of parallelism for multi-GPU training.
+* MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD
+ - Values: Int ```(default=)```
+ - The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the forward pass.
+* MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD
+ - Values: Int ```(default=)```
+ - The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the backward pass.
+
+## Control the Data Communication
+
+* MXNET_KVSTORE_REDUCTION_NTHREADS
+ - Values: Int ```(default=4)```
+ - The number of CPU threads used for summing up big arrays on a single machine
+ - This will also be used for `dist_sync` kvstore to sum up arrays from different contexts on a single machine.
+ - This does not affect summing up of arrays from different machines on servers.
+ - Summing up of arrays for `dist_sync_device` kvstore is also unaffected as that happens on GPUs.
+
+* MXNET_KVSTORE_BIGARRAY_BOUND
+ - Values: Int ```(default=1000000)```
+ - The minimum size of a "big array".
+ - When the array size is bigger than this threshold, MXNET_KVSTORE_REDUCTION_NTHREADS threads are used for reduction.
+ - This parameter is also used as a load balancer in kvstore. It controls when to partition a single weight to all the servers. If the size of a single weight is less than MXNET_KVSTORE_BIGARRAY_BOUND then, it is sent to a single randomly picked server otherwise it is partitioned to all the servers.
+
+* MXNET_KVSTORE_USETREE
+ - Values: 0(false) or 1(true) ```(default=0)```
+ - If true, MXNet tries to use tree reduction for Push and Pull communication.
+ - Otherwise, MXNet uses the default Push and Pull implementation.
+ - Tree reduction technology has been shown to be faster than the standard ```--kv-store device``` Push/Pull and ```--kv-store nccl``` Push/Pull for small batch sizes.
+
+* MXNET_KVSTORE_LOGTREE
+ - Values: 0(false) or 1(true) ```(default=0)```
+ - If true and MXNET_KVSTORE_USETREE is set to 1, MXNet will log the reduction trees that have been generated.
+
+* MXNET_KVSTORE_TREE_ARRAY_BOUND
+ - Values: Int ```(default=10000000)```
+ - The minimum size of a "big array".
+ - When the array size is bigger than this threshold and MXNET_KVSTORE_USETREE is set to 1, multiple trees are used to load balance the big gradient being communicated in order to better saturate link bandwidth.
+ - Note: This environmental variable only takes effect if Tree KVStore is being used (MXNET_KVSTORE_USETREE=1).
+
+* MXNET_KVSTORE_TREE_BACKTRACK
+ - Values: 0(false) or 1(true) ```(default=0)```
+ - If true and MXNET_KVSTORE_USETREE is set to 1, MXNet tries to use backtracking to generate the trees required for tree reduction.
+ - If false and MXNET_KVSTORE_USETREE is set to 1, MXNet tries to use the Kernighan-Lin heuristic to generate the trees required for tree reduction.
+
+* MXNET_KVSTORE_TREE_LINK_USAGE_PENALTY
+ - Values: Float ```(default=0.7)```
+ - The multiplicative penalty term to a link being used once.
+
+* MXNET_ENABLE_GPU_P2P
+ - Values: 0(false) or 1(true) ```(default=1)```
+ - If true, MXNet tries to use GPU peer-to-peer communication, if available on your device,
+ when kvstore's type is `device`.
+
+* MXNET_UPDATE_ON_KVSTORE
+ - Values: 0(false) or 1(true) ```(default=1)```
+ - If true, weight updates are performed during the communication step, if possible.
+
+## Memonger
+
+* MXNET_BACKWARD_DO_MIRROR
+ - Values: 0(false) or 1(true) ```(default=0)```
+ - MXNet uses the mirroring concept to save memory. Normally, the backward pass needs some of the forward inputs, which are kept in memory; you can instead choose to release these saved inputs and recompute them during the backward pass when needed. This trades extra computation for lower memory consumption.
+ - This parameter decides whether to use `mirror` during training to save device memory.
+ - When set to `1`, during forward propagation the graph executor will `mirror` some layers' feature maps and drop others, re-computing the dropped feature maps when they are needed.
+ - `MXNET_BACKWARD_DO_MIRROR=1` will save 30%~50% of device memory, but retains about 95% of running speed.
+ - One extension of `mirror` in MXNet is the [memonger technique](https://arxiv.org/abs/1604.06174), which uses only O(sqrt(N)) memory at about 75% of the running speed. Check out the code [here](https://github.com/dmlc/mxnet-memonger).
+
+## Control the profiler
+
+The following environment variables can be used to profile the application without changing code. Execution options may affect the granularity of the profiling results. If you need profiling results for every operator, set `MXNET_EXEC_BULK_EXEC_INFERENCE`, `MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN` and `MXNET_EXEC_BULK_EXEC_TRAIN` to 0 (a minimal example follows the list below).
+
+* MXNET_PROFILER_AUTOSTART
+ - Values: 0(false) or 1(true) ```(default=0)```
+ - If set to 1, MXNet starts the profiler automatically. The profiling result is stored in profile.json in the working directory.
+
+* MXNET_PROFILER_MODE
+ - Values: 0(false) or 1(true) ```(default=0)```
+ - If set to '0', the profiler records only the events of the symbolic operators.
+ - If set to '1', the profiler records the events of all operators.
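+
+A minimal sketch of enabling the profiler without code changes, using only the environment variables above (assuming they are read when mxnet is imported):
+
+```python
+import os
+
+# Set before importing mxnet; profile.json is written to the working directory.
+os.environ['MXNET_PROFILER_AUTOSTART'] = '1'
+os.environ['MXNET_PROFILER_MODE'] = '1'           # record all operators
+os.environ['MXNET_EXEC_BULK_EXEC_TRAIN'] = '0'    # per-operator granularity
+os.environ['MXNET_EXEC_BULK_EXEC_INFERENCE'] = '0'
+
+import mxnet as mx
+
+x = mx.nd.random.uniform(shape=(1024, 1024))
+y = mx.nd.dot(x, x)
+mx.nd.waitall()   # wait so that the profiled work is captured
+```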
+
+## Interface between Python and the C API
+
+* MXNET_ENABLE_CYTHON
+ - Values: 0(false), 1(true) ```(default=1)```
+ - If set to 0, MXNet uses ctypes to interface with the C API.
+ - If set to 1, MXNet tries to use the cython modules for ndarray and symbol. If that fails, either ctypes is used or an error is raised, depending on MXNET_ENFORCE_CYTHON.
+
+* MXNET_ENFORCE_CYTHON
+ - Values: 0(false) or 1(true) ```(default=0)```
+ - This has an effect only if MXNET_ENABLE_CYTHON is 1.
+ - If set to 0, MXNet falls back to ctypes if importing the cython modules fails.
+ - If set to 1, MXNet raises an error if importing the cython modules fails.
+
+If cython modules are used, `mx.nd._internal.NDArrayBase` must be `mxnet._cy3.ndarray.NDArrayBase` for python 3 or `mxnet._cy2.ndarray.NDArrayBase` for python 2.
+If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
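+
+A quick way to see which backend is active is to inspect the class mentioned above directly:
+
+```python
+import mxnet as mx
+
+# Prints the concrete base class, revealing whether the cython or ctypes backend is in use.
+print(mx.nd._internal.NDArrayBase)
+```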
+
+## Other Environment Variables
+
+* MXNET_GPU_WORKER_NSTREAMS
+ - Values: 1, or 2 ```(default=1)```
+ - Determines the number of GPU streams available to operators for their functions.
+ - Setting this to 2 may yield a modest performance increase, since ops like the cuDNN convolution op can then calculate their data- and weight-gradients in parallel.
+ - Setting this to 2 may also increase a model's demand for GPU global memory.
+
+* MXNET_CUDNN_AUTOTUNE_DEFAULT
+ - Values: 0, 1, or 2 ```(default=1)```
+ - The default value of cudnn auto tuning for convolution layers.
+ - Value of 0 means there is no auto tuning to pick the convolution algo
+ - Performance tests are run to pick the convolution algo when value is 1 or 2
+ - Value of 1 chooses the best algo in a limited workspace
+ - Value of 2 chooses the fastest algo whose memory requirements may be larger than the default workspace threshold
+
+* MXNET_CUDA_ALLOW_TENSOR_CORE
+ - 0(false) or 1(true) ```(default=1)```
+ - If set to '0', disallows Tensor Core use in CUDA ops.
+ - If set to '1', allows Tensor Core use in CUDA ops.
+ - This variable can only be set once in a session.
+
+* MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION
+ - 0(false) or 1(true) ```(default=0)```
+ - If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores
+ - If set to '1', allows CUDA ops like RNN and Convolution to use TensorCores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`.
+
+* MXNET_CUDA_LIB_CHECKING
+ - 0(false) or 1(true) ```(default=1)```
+ - If set to '0', disallows various runtime checks of the cuda library version and associated warning messages.
+ - If set to '1', permits these checks (e.g. compile vs. link mismatch, old version no longer CI-tested)
+
+* MXNET_CUDNN_LIB_CHECKING
+ - 0(false) or 1(true) ```(default=1)```
+ - If set to '0', disallows various runtime checks of the cuDNN library version and associated warning messages.
+ - If set to '1', permits these checks (e.g. compile vs. link mismatch, old version no longer CI-tested)
+
+* MXNET_GLUON_REPO
+ - Values: String ```(default='https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/')```
+ - The repository url to be used for Gluon datasets and pre-trained models.
+
+* MXNET_HOME
+ - The data directory in the filesystem used for storage, for example when downloading Gluon models.
+ - The default is `.mxnet` in the user's home directory on *nix and `APPDATA/mxnet` on Windows.
+
+* MXNET_MKLDNN_ENABLED
+ - Values: 0, 1 ```(default=1)```
+ - Flag to enable or disable MKLDNN accelerator. On by default.
+ - Only applies to mxnet that has been compiled with MKLDNN (```pip install mxnet-mkl``` or built from source with ```USE_MKLDNN=1```)
+
+* MXNET_MKLDNN_CACHE_NUM
+ - Values: Int ```(default=-1)```
+ - Sets the number of elements that the MKLDNN cache can hold. The default of -1 means the cache size is unbounded. Set this only if your model has variable input shapes, since otherwise the cache may grow without bound. The value is the number of items in the cache, which is proportional to the number of layers that use MKLDNN and the number of distinct input shapes.
+
+* MXNET_ENFORCE_DETERMINISM
+ - Values: 0(false) or 1(true) ```(default=0)```
+ - If set to true, MXNet will only use deterministic algorithms in forward and backward computation.
+ If no such algorithm exists given other constraints, MXNet will error out. This variable affects the choice
+ of CUDNN convolution algorithms. Please see [CUDNN developer guide](https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html) for more details.
+
+* MXNET_CPU_PARALLEL_COPY_SIZE
+ - Values: Int ```(default=200000)```
+ - The minimum size to call parallel copy by OpenMP in CPU2CPU mode.
+ - When the array size is bigger than or equal to this threshold, NDArray::Copy(from, to) is implemented with OpenMP using the recommended OMP thread count.
+ - When the array size is less than this threshold, NDArray::Copy(from, to) is implemented by memcpy in a single thread.
+
+* MXNET_OPTIMIZER_AGGREGATION_SIZE
+ - Values: Int ```(default=4)```
+ - Maximum value is 60.
+ - This variable controls how many weights will be updated in a single call to optimizer (for optimizers that support aggregation, currently limited to SGD).
+
+* MXNET_CPU_TEMP_COPY
+ - Values: Int ```(default=4)```
+ - This variable controls how many temporary memory resources to create for all CPU contexts for use in operators.
+
+* MXNET_GPU_TEMP_COPY
+ - Values: Int ```(default=1)```
+ - This variable controls how many temporary memory resources to create for each GPU context for use in operators.
+
+* MXNET_CPU_PARALLEL_RAND_COPY
+ - Values: Int ```(default=1)```
+ - This variable controls how many parallel random number generator resources to create for all CPU contexts for use in operators.
+
+* MXNET_GPU_PARALLEL_RAND_COPY
+ - Values: Int ```(default=4)```
+ - This variable controls how many parallel random number generator resources to create for each GPU context for use in operators.
+
+* MXNET_GPU_CUDNN_DROPOUT_STATE_COPY
+ - Values: Int ```(default=4)```
+ - This variable controls how many CuDNN dropout state resources to create for each GPU context for use in operators.
+
+* MXNET_SUBGRAPH_BACKEND
+ - Values: String ```(default="MKLDNN")``` if MKLDNN is available, otherwise ```(default="")```
+ - This variable controls the subgraph partitioning in MXNet.
+ - This variable is used to perform MKL-DNN FP32 operator fusion and quantization. Please refer to the [MKL-DNN operator list](../tutorials/mkldnn/operator_list.md) for how this variable is used and the list of fusion passes.
+ - Set ```MXNET_SUBGRAPH_BACKEND=NONE``` to disable subgraph backend.
+
+* MXNET_SAFE_ACCUMULATION
+ - Values: 0(false) or 1(true) ```(default=0)```
+ - If this variable is set to 1, accumulation enters safe mode, meaning accumulation is done in a data type of higher precision than
+ the input data type. This leads to more accurate accumulation results, at the cost of a possible performance loss and loss of backward compatibility.
+ For example, when the variable is set to 1(true) and the input data type is float16, the accumulation is done
+ with float32 (a small illustration follows below).
+ - Model accuracies do not necessarily improve with this environment variable turned on.
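+
+The effect of accumulating in a higher-precision type can be illustrated with plain NumPy (a conceptual sketch of the idea, not MXNet's implementation):
+
+```python
+import numpy as np
+
+x = np.full(10000, 0.01, dtype=np.float16)
+
+acc16 = np.float16(0.0)
+acc32 = np.float32(0.0)
+for v in x:
+    acc16 += v   # float16 accumulator: small increments are eventually rounded away
+    acc32 += v   # float32 accumulator keeps the small increments
+
+print(acc16)  # far below the true sum of ~100
+print(acc32)  # close to 100
+```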
+
+Settings for Minimum Memory Usage
+---------------------------------
+- Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
+ - The default setting satisfies this.
+
+Settings for More GPU Parallelism
+---------------------------------
+- Set ```MXNET_GPU_WORKER_NTHREADS``` to a larger number (e.g., 2)
+ - To reduce memory usage, consider setting ```MXNET_EXEC_NUM_TEMP```.
+ - This might not speed things up, especially for image applications, because GPU is usually fully utilized even with serialized jobs.
+
+Settings for controlling OMP tuning
+---------------------------------
+- Set ```MXNET_USE_OPERATOR_TUNING=0``` to disable the operator tuning code, which decides whether or not to use OMP for an operator.
+ - Values: String representation of the MXNET_ENABLE_OPERATOR_TUNING environment variable:
+ - 0=disable all
+ - 1=enable all
+ - a comma-separated list of types (e.g. ```float32, float16```)=enable tuning only for the listed types and disable it for the rest
+ - For details, refer to https://github.com/apache/incubator-mxnet/blob/master/src/operator/operator_tune-inl.h#L444
+
+- Set ```MXNET_USE_NUM_CORES_OPERATOR_TUNING``` to define the number of cores to be used by the operator tuning code.
+ - This reduces operator tuning overhead when multiple instances of MXNet are running on the same system and each instance uses only part of the cores available on the system.
+ - For details, refer to https://github.com/apache/incubator-mxnet/pull/13602
diff --git a/docs/static_site/src/pages/api/faq/float16.md b/docs/static_site/src/pages/api/faq/float16.md
new file mode 100644
index 000000000000..d6b6210940eb
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/float16.md
@@ -0,0 +1,256 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Float16
+category: faq
+faq_c: Speed
+question: How do I use mixed precision (float16) with MXNet or Gluon?
+permalink: /api/faq/float16
+---
+
+# Mixed precision training using float16
+
+In this tutorial we will walk through how one can train deep learning neural networks with mixed precision on supported hardware. We will first see how to use float16 (both with Gluon and Symbolic APIs) and then some techniques on achieving good performance and accuracy.
+
+## Background
+
+The computational resources required for training deep neural networks have been increasing because of growing model complexity and size. Mixed precision training reduces resource usage by using lower precision arithmetic, which is computationally cheaper and requires less memory. In this approach you train using 16 bit floating point (half precision) while using 32 bit floating point (single precision) for the output buffers of float16 computation. This allows you to achieve the same accuracy as training with single precision, while decreasing the required memory and the training or inference time.
+
+The float16 data type is a 16 bit floating point representation according to the [IEEE 754 standard](https://ieeexplore.ieee.org/document/4610935). Its precision varies across its dynamic range: the spacing between representable values is as small as 0.0000000596046 for values closest to 0 and as large as 32 for values in the range 32768-65536. Despite the reduced precision compared to single precision float (float32), using float16 has many advantages. The most obvious advantages are that you can reduce the size of the model by half, allowing the training of larger models and the use of larger batch sizes. The reduced memory footprint also helps in reducing the pressure on memory bandwidth and lowering communication costs. On hardware with specialized support for float16 computation you can also greatly improve the speed of training and inference. The Volta range of Graphics Processing Units (GPUs) from Nvidia has [Tensor Cores](https://www.nvidia.com/en-us/data-center/tensorcore/) which perform efficient float16 computation. A tensor core allows accumulation of half precision products into single or half precision outputs. For the rest of this tutorial we assume that we are working with Nvidia's Tensor Cores on a Volta GPU.
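+
+You can inspect these limits directly with NumPy; this small snippet is only illustrative and is not required for the rest of the tutorial:
+
+```python
+import numpy as np
+
+info = np.finfo(np.float16)
+print(info.max)    # 65504.0, the largest representable float16 value
+print(info.tiny)   # ~6.1e-05, the smallest positive normal value
+print(info.eps)    # ~0.000977, spacing of representable values around 1.0
+
+# The spacing between adjacent representable values grows with magnitude:
+print(np.spacing(np.float16(1.0)))      # ~0.000977
+print(np.spacing(np.float16(40000.0)))  # 32.0
+```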
+
+## Prerequisites
+
+- [Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) range of Nvidia GPUs (e.g. AWS P3 instance)
+- CUDA 9 or higher
+- cuDNN v7 or higher
+
+This tutorial also assumes understanding of how to train a network with float32 (the default). Please refer to [logistic regression tutorial](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/logistic_regression_explained.html) to get started with Apache MXNet and Gluon API. This tutorial focuses on the changes needed to switch from float32 to mixed precision and tips on achieving the best performance with mixed precision.
+
+## Using the Gluon API
+
+### Training or Inference
+
+With the Gluon API, you need to take care of three things to convert a model to support computation with float16.
+
+1. Cast Gluon `Block`'s parameters and expected input type to float16 by calling the [cast](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.cast) method of the `Block` representing the network.
+
+```python
+net.cast('float16')
+```
+
+2. Ensure the data input to the network is of float16 type. If your `DataLoader` or `Iterator` produces output in another datatype, then you would have to cast your data. There are different ways you can do this. The easiest would be to use the [astype](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.astype) method of NDArrays.
+
+```python
+data = data.astype('float16', copy=False)
+```
+
+If you are using images and DataLoader, you can also use a [Cast transform](https://mxnet.incubator.apache.org/api/python/gluon/data.html#mxnet.gluon.data.vision.transforms.Cast).
+
+3. It is preferable to use **multi_precision mode of optimizer** when training in float16. This mode of optimizer maintains a master copy of the weights in float32 even when the training (i.e. forward and backward pass) is in float16. This helps increase precision of the weight updates and can lead to faster convergence in some scenarios.
+
+```python
+optimizer = mx.optimizer.create('sgd', multi_precision=True, lr=0.01)
+```
+
+You can play around with mixed precision using the image classification [example](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py). We suggest using the Caltech101 dataset option in that example and using a ResNet50V1 network so you can quickly see the performance improvement and how the accuracy is unaffected. Here's the starter command to run this example.
+
+```bash
+python image_classification.py --model resnet50_v1 --dataset caltech101 --gpus 0 --num-worker 30 --dtype float16
+```
+
+### Fine-tuning
+
+You can also fine-tune a model, which was originally trained in float32, to use float16. Below is an example of how to fine-tune a pretrained model from the Model Zoo. You would first need to fetch the pretrained network and then cast that network to float16.
+
+```python
+import numpy as np
+import mxnet as mx
+from mxnet.gluon.model_zoo.vision import get_model
+
+
+pretrained_net = get_model(name='resnet50_v2', ctx=mx.cpu(),
+ pretrained=True, classes=1000)
+pretrained_net.cast('float16')
+```
+
+Then, if you have another Resnet50V2 model you want to fine-tune, you can just assign the features to that network and then cast it.
+
+```python
+net = get_model(name='resnet50_v2', ctx=mx.cpu(),
+ pretrained=False, classes=101)
+net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=mx.cpu())
+net.features = pretrained_net.features
+net.cast('float16')
+```
+
+You can check the parameters of the model by calling [summary](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.summary) with some fake data. Notice the `dtype=np.float16` argument in the line below. As mentioned earlier, we have to provide the data as float16 as well.
+
+```python
+net.summary(mx.nd.uniform(shape=(1, 3, 224, 224), dtype=np.float16))
+```
+
+## Using the Symbolic API
+
+Training a network in float16 with the Symbolic API involves the following steps.
+
+1. Add a layer at the beginning of the network, to cast the data to float16. This will ensure that all the following layers compute in float16.
+2. It is advisable to cast the output of the layers before softmax to float32, so that the softmax computation is done in float32. This is because softmax involves large reductions and it helps to keep that computation in float32 for a more precise answer.
+3. It is advisable to use the multi-precision mode of the optimizer for more precise weight updates. Here's how you would enable this mode when creating an optimizer.
+
+```python
+optimizer = mx.optimizer.create('sgd', multi_precision=True, lr=0.01)
+```
+
+For a full example, please refer to [resnet.py](https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/symbols/resnet.py) file on GitHub. A small, relevant excerpt from that file is presented below.
+
+```python
+data = mx.sym.Variable(name="data")
+
+if dtype == 'float16':
+ data = mx.sym.Cast(data=data, dtype=np.float16)
+
+# ... the rest of the network
+net_out = net(data)
+
+if dtype == 'float16':
+ net_out = mx.sym.Cast(data=net_out, dtype=np.float32)
+
+output = mx.sym.SoftmaxOutput(data=net_out, name='softmax')
+```
+
+If you would like to train the ResNet50 model on ImageNet using float16 precision, you can find the full script [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/train_imagenet.py).
+
+If you don't have ImageNet dataset at your disposal, you can still run the script above using synthetic float16 data by providing the following command:
+
+```bash
+python train_imagenet.py --network resnet-v1 --num-layers 50 --benchmark 1 --gpus 0 --batch-size 256 --dtype float16
+```
+
+There's a similar example for float16 fine-tuning of selected models (Inception v3, Inception v4, ResNetV1, ResNet50, ResNext, or VGG) [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/fine-tune.py). The command below shows how to use that script to fine-tune a Resnet50 model trained on ImageNet on the Caltech 256 dataset using float16.
+
+```bash
+python fine-tune.py --network resnet --num-layers 50 --pretrained-model imagenet1k-resnet-50 --data-train ~/.mxnet/dataset/caltech-256/caltech256-train.rec --data-val ~/data/caltech-256/caltech256-val.rec --num-examples 15420 --num-classes 256 --gpus 0 --batch-size 64 --dtype float16
+```
+
+If you don't have the `Caltech256` dataset, you can download it using the script below, and convert it into .rec file format using [im2rec utility file](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py)
+
+```python
+import os
+from os.path import expanduser
+import tarfile
+import mxnet as mx
+
+
+data_folder = expanduser("~/.mxnet/datasets/")
+dataset_name = "256_ObjectCategories"
+archive_file = "{}.tar".format(dataset_name)
+archive_path = os.path.join(data_folder, archive_file)
+data_url = "http://www.vision.caltech.edu/Image_Datasets/Caltech256/"
+
+if not os.path.isfile(archive_path):
+ mx.test_utils.download("{}{}".format(data_url, archive_file),
+ dirname=data_folder)
+ print('Extracting {} in {}...'.format(archive_file, data_folder))
+ tar = tarfile.open(archive_path)
+ tar.extractall(data_folder)
+ tar.close()
+ print('Data extracted.')
+```
+
+## Example training results
+
+Let us consider training a Resnet50V1 model on the ImageNet 2012 dataset. For this model, the GPU memory usage is close to the capacity of a V100 GPU with a batch size of 128 when using float32. Using float16 allows the use of a 256 batch size. Shared below are results using 8 V100 GPUs on an [AWS p3.16xlarge](https://aws.amazon.com/ec2/instance-types/p3/#Amazon_EC2_P3_Instance_Product_Details) instance.
+
+Let us compare the three scenarios that arise here: float32 with 1024 batch size, float16 with 1024 batch size, and float16 with 2048 batch size. These jobs trained for 90 epochs using a learning rate of 0.4 for the 1024 batch size and 0.8 for the 2048 batch size. This learning rate was decayed by a factor of 0.1 at the 30th, 60th and 80th epochs. The only changes made for the float16 jobs compared to the float32 job were that the network and data were cast to float16, and the multi-precision mode was used for the optimizer. The final accuracy at the 90th epoch and the time to train are tabulated below for these three scenarios. The top-1 validation errors at the end of each epoch are also plotted below.
+
+Batch size | Data type | Top 1 Validation accuracy | Time to train | Speedup |
+--- | --- | --- | --- | --- |
+1024 | float32 | 76.18% | 11.8 hrs | 1 |
+1024 | float16 | 76.34% | 7.3 hrs | 1.62x |
+2048 | float16 | 76.29% | 6.5 hrs | 1.82x |
+
+![Training curves of Resnet50V1 on Imagenet 2012](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/mixed-precision/resnet50v1b_imagenet_fp16_fp32_training.png)
+
+The differences in accuracy above are within normal random variation, and there is no reason to expect float16 to have better accuracy than float32 in general. As the plot indicates, training behaves similarly for these cases, even though we didn't have to change any other hyperparameters. We can also see from the table that using float16 helps train faster, both through faster float16 computation and by allowing the use of larger batch sizes.
+
+## Things to keep in mind
+
+### For performance
+
+Performance gains from float16 typically range from 1.6x-2x for convolutional networks like Resnet, and can reach about 3x for networks with LSTMs. The gain you see depends on several factors, which this section introduces.
+
+1. Nvidia Tensor Cores essentially perform the computation `D = A * B + C`, where A and B are half precision matrices, while C and D could be either half precision or full precision. The tensor cores are most efficient when dimensions of these matrices are multiples of 8. This means that Tensor Cores can not be used in all cases for fast float16 computation. When training models like Resnet50 on the Cifar10 dataset, the tensors involved are sometimes smaller, and Tensor Cores can not always be used. The computation in that case falls back to slower algorithms and using float16 turns out to be slower than float32 on a single GPU. Note that when using multiple GPUs, using float16 can still be faster than float32 because of reduction in communication costs.
+
+2. When you scale up the batch size ensure that IO and data pre-processing is not your bottleneck. If you see a slowdown this would be the first thing to check.
+
+3. For the reason above, it is advisable to use batch sizes that are multiples of 8 when training with float16. As always, batch sizes that are powers of 2 tend to work best compared to nearby values.
+
+4. You can check whether your program is using Tensor cores for fast float16 computation by profiling with `nvprof`. The operations with `s884cudnn` in their names represent the use of Tensor cores.
+
+5. When not limited by GPU memory, it can help to set the environment variable `MXNET_CUDNN_AUTOTUNE_DEFAULT` to `2`. This configures MXNet to run tuning tests and choose the fastest convolution algorithm whose memory requirements may exceed the default memory of CUDA workspace.
+
+6. Please note that float16 on CPU might not be supported for all operators, as in most cases float16 on CPU is much slower than float32.
+
+### For accuracy
+
+#### Multi precision mode
+
+When training in float16, it is advisable to still store the master copy of the weights in float32 for better accuracy. The higher precision of float32 helps overcome cases where a gradient update would become 0 if represented in float16. This mode can be activated by setting the parameter `multi_precision` of the optimizer params to `True` as in the above example. It has been found that this is not required for all networks to achieve the same accuracy as with float32, but it is nevertheless recommended. Note that for distributed training, this is currently slightly slower than without `multi_precision`, but still much faster than using float32 for training.
+
+#### Large reductions
+
+Since float16 has low precision for large numbers, it is best to leave layers which perform large reductions in float32. This includes BatchNorm and Softmax. Ensuring that BatchNorm performs its reduction in float32 is handled by default in both the Gluon and Module APIs. While softmax is set to use float32 even during float16 training in Gluon, in the Module API the input needs to be cast to float32 before the softmax, as the symbolic example above shows.
+
+#### Loss scaling
+
+For some networks, just switching the training to float16 mode is not enough to reach the same accuracy as training with float32, because the computed activation gradients are too small to be represented in the float16 range. Such networks can still achieve float32 accuracy with a couple of changes.
+
+Activation gradients generally occupy only a small part of the float16 representable range. You can therefore shift the gradients into the float16 range by scaling up the loss by a factor `S`; by the chain rule, this scales up the gradients computed in the backward pass by the same factor. You then scale the gradients back down before updating the weights. This ensures that training in float16 can use the same hyperparameters as used during float32 training.
+
+Here's how you can configure the loss to be scaled up by 128 and rescale the gradient down before updating the weights.
+
+*Gluon API*
+
+```python
+loss = gluon.loss.SoftmaxCrossEntropyLoss(weight=128)
+optimizer = mx.optimizer.create('sgd',
+ multi_precision=True,
+ rescale_grad=1.0/128)
+```
+
+*Module API*
+
+```python
+mxnet.sym.SoftmaxOutput(other_args, grad_scale=128.0)
+optimizer = mx.optimizer.create('sgd',
+ multi_precision=True,
+ rescale_grad=1.0/128)
+```
+
+Networks like Multibox SSD, R-CNN, bigLSTM and Seq2seq were found to exhibit such behavior.
+Choose a constant scaling factor such that the absolute values of the gradients, when multiplied by this factor, remain within the float16 range. Generally powers of 2 like 64, 128, 256, 512 are chosen. Refer to the linked articles below for more details on this.
+
+## References
+
+1. [Training with Mixed Precision User Guide](http://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
+2. [Mixed Precision Training at ICLR 2018](https://arxiv.org/pdf/1710.03740.pdf)
+3. [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/)
+
+## Recommended Next Steps
+
+* Check out our video tutorial on [Using Mixed Precision with MXNet](https://www.youtube.com/watch?v=pR4KMh1lGC0)
diff --git a/docs/static_site/src/pages/api/faq/gradient_compression.md b/docs/static_site/src/pages/api/faq/gradient_compression.md
new file mode 100644
index 000000000000..74d35537508c
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/gradient_compression.md
@@ -0,0 +1,132 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Gradient Compression
+category: Speed
+faq_c: Speed
+question: How do I use gradient compression with distributed training?
+permalink: /api/faq/gradient_compression
+---
+
+# Gradient Compression
+
+Gradient Compression reduces communication bandwidth, and in some scenarios, it can make training more scalable and efficient without significant loss in convergence rate or accuracy. Example implementations with GPUs, CPUs, and distributed training are provided in this document.
+
+
+## Benefits
+
+**Increased Speed**
+
+For architectures with fully connected layers, gradient compression is observed to speed up training by about 2x, depending on the size of the model and the network bandwidth of the instance. Bigger models see a larger speedup with gradient compression.
+
+**Minimal Accuracy Loss**
+
+Gradient compression delays the synchronization of weight updates that are small. Although small weight updates might not be sent for that batch, this information is not discarded. Once the weight updates for that location accumulate to a larger value, they are propagated. Since there is no information loss, only delayed updates, gradient compression does not lead to a significant loss in accuracy or convergence rate. In distributed training experiments[1], the accuracy loss observed due to gradient compression was as low as 1%.
+
+
+## When to Use Gradient Compression
+
+When training models whose architectures include large fully connected components, it can be helpful to use gradient compression. For larger models, as well as recurrent neural networks, the communication cost becomes a major factor. Such models stand to benefit greatly from gradient compression.
+
+
+### GPU versus CPU
+
+The greatest benefits from gradient compression are realized when using multi-node (single or multi-GPU) distributed training. Training on CPU would provide a lower compute density per compute node as compared to the massive compute density per compute node on a GPU. Due to this, the required communication bandwidth for CPU-based nodes during training is not as high as for GPU-based nodes. Hence, the benefits of gradient compression are lower for CPU-based nodes as compared to GPU-based nodes.
+
+
+### Network Latency
+
+Benefits of gradient compression can be found when using distributed training with network-connected nodes. Depending on the network latency between nodes and the size of the model, communication can slow training enough that gradient compression provides a speed improvement.
+
+You may not want to use gradient compression if you have low latency network communication.
+
+
+### Model Size
+
+Distributed training involves synchronization of weights after each batch. Larger models have much higher communication costs during training, hence such models stand to benefit much more from gradient compression.
+When running distributed training with gradient compression, the quantize and dequantize operations happen on CPU parallelized with OpenMP. For smaller models, when training on GPUs, it helps to set `OMP_NUM_THREADS=1` on each node, so that the overhead of launching OMP threads doesn't cause the compression and decompression to be slow.
+
+### Model Architecture
+
+The communication bandwidth requirements during training vary across various neural network architectures and hence the benefits of gradient compression vary accordingly.
+
+In networks which have significant fully connected components, since such layers have low compute cost on GPUs, communication becomes a bottleneck limiting the speed of distributed training. Gradient compression can help reduce the communication cost, and thus speed up training in such cases. We have observed speedup of about 2x on large fully connected neural networks. Models like AlexNet and VGG have large fully connected components as part of the network, hence stand to benefit from gradient compression. As with these models, Long Short-Term Memory architectures require more communication bandwidth, so they also exhibit speed improvements with gradient compression.
+
+Architectures like Convolutional Neural Networks on the other hand have a higher compute cost, in which case some communication can be parallelized with computation. Since communication is not the bottleneck in such networks, gradient compression doesn't help much.
+
+
+### Single Node Gradient Compression
+
+When the training is configured to use device to device communication on a single node with multiple GPUs, gradient compression can be used to reduce the cost of communication. This can provide about 20% speedup for large models using older generation architectures. However, speed benefits may be negligible on a machine with a newer generation architecture where GPUs can communicate at low latency.
+
+
+## Approach
+
+The idea behind gradient compression comes from two observations:
+
+First, when training large neural networks, the gradients of weights computed for a small mini-batch of training data are typically sparse. Only a small fraction of the weights have significant updates after each mini-batch. The synchronization of updates that are near zero can be safely delayed longer than the typical mini-batch size. This essentially means that the rate of weight-update can vary depending on the value of an individual weight.
+
+Secondly, gradients can be compressed significantly by considering only those gradient elements whose absolute values exceed a threshold, and then quantizing them to use lower bits per gradient value. By compressing the gradients, we can reduce communication bandwidth. The delayed gradient values, in the form of quantization error and values that don't meet the threshold, are aggregated into a gradient residual which is communicated when it reaches the threshold.
+
+## Technical Implementation
+
+### Two Bit Quantization
+
+Currently the supported type of quantization uses two bits for each gradient value. Any positive value greater than or equal to the threshold sets the two bits to `11`, any negative value whose absolute value is greater than or equal to the threshold sets the two bits to `10`, and all other values are set to `00`. This enables us to store 16 quantized gradients as one float. The quantization error, which is `original_value - quantized_value`, is stored in the form of a gradient residual.
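+
+The idea can be sketched in NumPy as follows (a conceptual illustration only, not MXNet's compression kernel; the function name and sample values are made up):
+
+```python
+import numpy as np
+
+def two_bit_quantize(grad, residual, threshold=0.5):
+    """Clip values to {-threshold, 0, +threshold} and carry the error in a residual."""
+    to_send = grad + residual                      # add previously delayed updates
+    quantized = np.where(to_send >= threshold, threshold,
+                np.where(to_send <= -threshold, -threshold, 0.0))
+    residual = to_send - quantized                 # quantization error is carried over
+    return quantized, residual
+
+grad = np.array([0.9, -0.7, 0.2, -0.1])
+residual = np.zeros_like(grad)
+q, residual = two_bit_quantize(grad, residual)
+print(q)         # [ 0.5 -0.5  0.   0. ]
+print(residual)  # [ 0.4 -0.2  0.2 -0.1]
+```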
+
+### Types of Kvstore
+
+Supported types of `kvstore` are `device` and all distributed kvstores such as `dist_sync`, `dist_async`, and `dist_sync_device`. When `kvstore` is `device`, the communication between GPUs is compressed. Please note that this increases the memory usage of GPUs because of the additional residual stored. When using a distributed kvstore, worker-to-server communication is compressed. In this case, compression and decompression happen on the CPU, and gradient residuals will be stored on the CPU. Server-to-worker communication and device-to-device communication are not compressed to avoid multiple levels of compression.
+
+## Enabling the Gradient Compression in MXNet
+
+Gradient compression is a run-time configuration parameter to be enabled during training. Here are the MXNet APIs to enable gradient compression:
+
+**Gluon API**:
+
+```python
+trainer = gluon.Trainer(..., compression_params={'type':'2bit', 'threshold':0.5})
+```
+A reference `gluon` implementation with a gradient compression option can be found in the [train.py script from a word-level language modeling RNN example](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/word_language_model/train.py).
+
+**Module API**:
+
+```python
+mod = mx.mod.Module(..., compression_params={'type':'2bit', 'threshold':0.5})
+```
+
+A `module` example is provided with [this guide for setting up MXNet with distributed training](https://mxnet.incubator.apache.org/versions/master/faq/multi_devices.html#distributed-training-with-multiple-machines). It comes with the option of turning on gradient compression as an argument to the [train_mnist.py script](https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/train_mnist.py).
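+
+If you create the `kvstore` yourself rather than going through `Trainer` or `Module`, compression can also be enabled on it directly. A short sketch, assuming the `set_gradient_compression` method is available in your MXNet version:
+
+```python
+import mxnet as mx
+
+# 'device' compresses GPU-to-GPU communication on a single machine;
+# distributed types such as 'dist_sync' compress worker-to-server communication.
+kv = mx.kv.create('device')
+kv.set_gradient_compression({'type': '2bit', 'threshold': 0.5})
+```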
+
+### Configuration Details
+
+**Threshold**
+
+A default `threshold` value of `0.5` is good for most use cases, but to get the most benefit from gradient compression for a particular scenario, it can be beneficial to experiment. If the threshold is set to a very large value, say `10.0`, then the updates become too infrequent and the training will converge slower. Setting the threshold automatically is expected in a future release.
+
+**Quantization**
+
+This release supports 2-bit quantization for encoding of gradients to reduce the communication bandwidth during training. Future releases will support 1-bit quantization and other approaches for encoding of gradients based on experimental evidence of benefits and user demand.
+
+**Sparse Format**
+
+We believe that the data will need to be very sparse (i.e., more than about 90% zeros) to reap the benefits of the sparse format. However, this is an area of experimentation that will be explored in a future release.
+
+
+## References
+
+1. [Nikko Storm, Amazon.com, Scalable Distributed Training using commodity GPU cloud computing.](https://s3-us-west-2.amazonaws.com/amazon.jobs-public-documents/strom_interspeech2015.pdf)
diff --git a/docs/static_site/src/pages/api/faq/model_parallel_lstm.md b/docs/static_site/src/pages/api/faq/model_parallel_lstm.md
new file mode 100644
index 000000000000..37a617042ae4
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/model_parallel_lstm.md
@@ -0,0 +1,90 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Model Parallel
+category: faq
+faq_c: Model
+question: How can I train using multiple GPUs with model parallelism?
+permalink: /api/faq/model_parallel_lstm
+---
+
+
+# Training with Multiple GPUs Using Model Parallelism
+Training deep learning models can be resource intensive.
+Even with a powerful GPU, some models can take days or weeks to train.
+Large long short-term memory (LSTM) recurrent neural networks
+can be especially slow to train,
+with each layer, at each time step, requiring eight matrix multiplications.
+Fortunately, given cloud services like AWS,
+machine learning practitioners often have access
+to multiple machines and multiple GPUs.
+One key strength of _MXNet_ is its ability to leverage
+powerful heterogeneous hardware environments to achieve significant speedups.
+
+There are two primary ways that we can spread a workload across multiple devices.
+In a previous document, [we addressed data parallelism](./multi_devices.md),
+an approach in which samples within a batch are divided among the available devices.
+With data parallelism, each device stores a complete copy of the model.
+Here, we explore _model parallelism_, a different approach.
+Instead of splitting the batch among the devices, we partition the model itself.
+Most commonly, we achieve model parallelism by assigning the parameters (and computation)
+of different layers of the network to different devices.
+
+In particular, we will focus on LSTM recurrent networks.
+LSTMs are powerful sequence models that have proven especially useful
+for [natural language translation](https://arxiv.org/pdf/1409.0473.pdf), [speech recognition](https://arxiv.org/abs/1512.02595),
+and working with [time series data](https://arxiv.org/abs/1511.03677).
+For a general high-level introduction to LSTMs,
+see the excellent [tutorial](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) by Christopher Olah.
+
+
+## Model Parallelism: Using Multiple GPUs As a Pipeline
+Model parallelism in deep learning was first proposed
+for the _extraordinarily large_ convolutional layer in GoogleNet.
+From this implementation, we take the idea of placing each layer on a separate GPU.
+Using model parallelism in such a layer-wise fashion
+provides the benefit that no GPU has to maintain all of the model parameters in memory.
+
+
+
+In the preceding figure, each LSTM layer is assigned to a different GPU.
+After GPU 1 finishes computing layer 1 for the first sentence, it passes its output to GPU 2.
+At the same time, GPU 1 fetches the next sentence and starts training.
+This differs significantly from data parallelism.
+Here, there is no contention to update the shared model at the end of each iteration,
+and most of the communication happens when passing intermediate results between GPUs.
+
+
+## Workload Partitioning
+
+Implementing model parallelism requires knowledge of the training task.
+Here are some general heuristics that we find useful (a minimal placement sketch follows this list):
+
+- To minimize communication time, place neighboring layers on the same GPUs.
+- Be careful to balance the workload between GPUs.
+- Remember that different kinds of layers have different computation-memory properties.
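+
+As a minimal placement sketch with the symbolic API (assuming at least two GPUs; the group names `stage1`/`stage2` are arbitrary labels chosen for this illustration):
+
+```python
+import mxnet as mx
+
+# Assign symbols created inside each scope to a named context group.
+with mx.AttrScope(ctx_group='stage1'):
+    data = mx.sym.Variable('data')
+    fc1 = mx.sym.FullyConnected(data, num_hidden=512, name='fc1')
+
+with mx.AttrScope(ctx_group='stage2'):
+    fc2 = mx.sym.FullyConnected(fc1, num_hidden=10, name='fc2')
+    net = mx.sym.SoftmaxOutput(fc2, name='softmax')
+
+# Map each group to a device when binding the executor.
+exe = net.simple_bind(ctx=mx.gpu(0),
+                      group2ctx={'stage1': mx.gpu(0), 'stage2': mx.gpu(1)},
+                      data=(32, 128))
+```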
+
+
+
+Let's take a quick look at the two pipelines in the preceding diagram.
+They both have eight layers with a decoder and an encoder layer.
+Based on our first principle, it's unwise to place all neighboring layers on separate GPUs.
+We also want to balance the workload across GPUs.
+Although the LSTM layers consume less memory than the decoder/encoder layers, they consume more computation time because of the sequential dependencies of the unrolled LSTM.
+Thus, the partition on the left will be faster than the one on the right
+because the workload is more evenly distributed.
diff --git a/docs/static_site/src/pages/api/faq/new_op.md b/docs/static_site/src/pages/api/faq/new_op.md
new file mode 100644
index 000000000000..360e7050efb5
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/new_op.md
@@ -0,0 +1,400 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Create New Operators
+category: faq
+faq_c: Extend and Contribute to MXNet
+question: How do I create new operators in MXNet with Python?
+permalink: /api/faq/new_op
+---
+
+# How to Create New Operators (Layers)
+
+This tutorial walks you through the process of creating new MXNet operators (or layers).
+We've done our best to provide high-speed operators for most common use cases.
+However, if you're engaged in research,
+there's a good chance you'll want to define custom layers,
+like a novel loss function. In these cases, you have two options:
+
+* Use CustomOp to write new operators using a front-end language (e.g., Python) that run on CPUs or GPUs.
+Depending on your implementation, this can range from very fast (if you only use operators under mx.nd) to very slow (if you copy out the data, using `.asnumpy()`).
+
+* Use C++/mshadow (CUDA). This provides the best performance, but can be difficult
+if you're not familiar with MXNet, mshadow, or CUDA.
+
+## CustomOp
+Implementing an operator in Python is simple.
+As an example, let's create a softmax operator.
+Start by subclassing `mxnet.operator.CustomOp`,
+and then override a few methods:
+
+```python
+import os
+import mxnet as mx
+import numpy as np
+
+class Softmax(mx.operator.CustomOp):
+ def forward(self, is_train, req, in_data, out_data, aux):
+ x = in_data[0].asnumpy()
+ y = np.exp(x - x.max(axis=1).reshape((x.shape[0], 1)))
+ y /= y.sum(axis=1).reshape((x.shape[0], 1))
+ self.assign(out_data[0], req[0], mx.nd.array(y))
+```
+
+We defined the computation for the forward pass of our operator.
+The forward function takes a list of input and a list of output NDArrays.
+For convenience, we call `.asnumpy()` on the first input NDArray,
+converting it to a CPU-based NumPy array.
+This can be very slow. If you want the best performance,
+keep data in the NDArray format and use operators under mx.nd to do the computation.
+
+At the end, we used CustomOp.assign to assign the resulting array y to out_data[0]. It handles assignment based on the value of req, which can be 'write', 'add', or 'null'.
+
+Then do the same for the backward pass:
+
+```python
+def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
+ l = in_data[1].asnumpy().ravel().astype(np.int)
+ y = out_data[0].asnumpy()
+ y[np.arange(l.shape[0]), l] -= 1.0
+ self.assign(in_grad[0], req[0], mx.nd.array(y))
+```
+
+Softmax defines the computation of our custom operator,
+but you still need to define its input/output format
+by subclassing mx.operator.CustomOpProp.
+First, register the new operator with the name 'softmax':
+
+```python
+@mx.operator.register("softmax")
+class SoftmaxProp(mx.operator.CustomOpProp):
+```
+
+Then, call the base constructor with `need_top_grad=False`
+because softmax is a loss layer and you don't need gradient input from preceding layers:
+
+```python
+def __init__(self):
+ super(SoftmaxProp, self).__init__(need_top_grad=False)
+```
+
+Then declare the input and output:
+
+```python
+def list_arguments(self):
+ return ['data', 'label']
+
+def list_outputs(self):
+ return ['output']
+```
+
+Note that list_arguments declares both the inputs and the parameters.
+We recommend ordering them as follows: `['input1', 'input2', ... , 'weight1', 'weight2', ...]`
+
+Next, provide `infer_shape` to declare the shape of the output/weight
+and check the consistency of the input shapes:
+
+```python
+def infer_shape(self, in_shape):
+ data_shape = in_shape[0]
+ label_shape = (in_shape[0][0],)
+ output_shape = in_shape[0]
+ return [data_shape, label_shape], [output_shape], []
+```
+The first axis of an input/output tensor corresponds to different examples within the batch.
+The label is a set of integers, one for each data entry,
+and the output has the same shape as the input.
+The `infer_shape` function should always return three lists in this order:
+inputs, outputs, and auxiliary states (which we don't have here),
+even if one of them is empty.
+
+Optionally, you can also define `infer_type` to declare the input and output data type of your operator. Supported types are `np.float32`, `np.float64`, `np.float16`, `np.uint8`, and `np.int32`.
+
+```python
+def infer_type(self, in_type):
+ dtype = in_type[0]
+ return [dtype, dtype], [dtype], []
+```
+
+Finally, define a create_operator function that will be called by the back end to create an instance of softmax:
+
+```python
+def create_operator(self, ctx, shapes, dtypes):
+ return Softmax()
+```
+
+To use the custom operator, create a mx.sym.Custom symbol with op_type as the registered name:
+
+```python
+mlp = mx.symbol.Custom(data=fc3, name='softmax', op_type='softmax')
+```
+
+Please see the full code for this example [here](https://github.com/dmlc/mxnet/blob/master/example/numpy-ops/custom_softmax.py).
+
+## C++
+With MXNet v0.9 (the NNVM refactor) or later, creating new operators has become easier.
+Operators are now registered with NNVM.
+The following code is an example of how to register an operator (check out [src/operator/tensor](https://github.com/dmlc/mxnet/tree/master/src/operator/tensor) for more examples):
+
+```c++
+NNVM_REGISTER_OP(abs)
+.MXNET_DESCRIBE("Take absolute value of the src")
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1,1>);
+```
+
+The syntax is quite simple: we register the operator with a name,
+then set the number of inputs and outputs.
+You can register attributes with any key (`FInferShape` for example) to any operator,
+without having to modify a central class interface definition.
+
+### Operator Attribute System
+
+One of the biggest improvements brought by NNVM is the operator attribute system.
+This is like traits for types in common languages like C++.
+We can register any attribute to any operator, with the syntax
+
+``` c++
+NNVM_REGISTER_OP(op-name)
+.set_attr<AttributeType>("AttributeKey", CorrespondingAttributeObject);
+```
+
+These attributes can be retrieved later for various purposes.
+For example, `FInferShape` is used for shape inference, `FCompute` is used for carrying out actual computation on CPU.
+
+As long as all attributes registered with the same key have the same type,
+we can register any attributes to operators.
+The more attributes an operator provides,
+the more information the system can use for optimization.
+
+### List of basic attributes
+
+In this section, we will go through the basic attributes MXNet expects from all operators.
+You can find the definition for them in the following two files:
+
+- [nnvm/op_attr_types.h](https://github.com/dmlc/nnvm/blob/master/include/nnvm/op_attr_types.h)
+- [mxnet/op_attr_types.h](https://github.com/dmlc/mxnet/blob/master/include/mxnet/op_attr_types.h)
+
+#### Descriptions (Optional)
+
+`.describe(comment)` adds a comment to the operator. Use `.MXNET_DESCRIBE(comment)` to add the current file name and line number to comment.
+
+#### Attribute Parser (Optional)
+
+Set the attribute parser with `.set_attr_parser(PARSER)`, where PARSER is a function with prototype `void(nnvm::NodeAttrs* attrs)`. This function should parse the keyword arguments in `attrs->dict` and store the result in `attrs->parsed`.
+
+Simple arguments can be parsed like
+```c++
+NNVM_REGISTER_OP(scalar_op)
+.set_attr_parser(
+ [](NodeAttrs* attrs) {
+ attrs->parsed = std::stod(attrs->dict["scalar"]);
+ })
+```
+
+The parsed arguments can then be accessed in other attribute functions with
+```c++
+double alpha = nnvm::get<double>(attrs.parsed);
+```
+
+More complex ops can use `dmlc::Parameter` and `ParamParser` (defined in operator_common.h) for parsing:
+
+```c++
+#include <dmlc/parameter.h>
+#include <operator_common.h>
+struct ActivationParam : public dmlc::Parameter<ActivationParam> {
+ // use int for enumeration
+ int act_type;
+ DMLC_DECLARE_PARAMETER(ActivationParam) {
+ DMLC_DECLARE_FIELD(act_type)
+ .add_enum("relu", activation::kReLU)
+ .add_enum("sigmoid", activation::kSigmoid)
+ .add_enum("tanh", activation::kTanh)
+ .add_enum("softrelu", activation::kSoftReLU)
+ .describe("Activation function to be applied.");
+ }
+};
+NNVM_REGISTER_OP(Activation)
+.set_attr_parser(ParamParser<ActivationParam>);
+// access with:
+// const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+```
+
+#### Inputs & Outputs
+
+Number of inputs/outputs can be set with `.set_num_inputs(n_in)` and `.set_num_outputs(n_out)`
+where n_in and n_out are integers.
+
+Alternatively, if the number of inputs/outputs is variable and depends on arguments,
+you can set `n_in`/`n_out` to functions with prototype `uint32_t(const nnvm::NodeAttrs& attrs)`
+that return the number of inputs/outputs based on parsed arguments.
+
+Outputs can be made invisible to other operators by registering `FNumVisibleOutputs`
+and returning an integer smaller than `n_out`.
+
+Inputs/outputs can be named by registering `FListInputNames` and `FListOutputNames` with prototype `std::vector<std::string>(const NodeAttrs& attrs)`.
+
+
+#### Argument Descriptions
+
+Set argument descriptions with `.add_argument(name, type, comment)`.
+This is necessary for operators to be properly called imperatively.
+
+First, add NDArray arguments `num_inputs` times with type "NDArray"
+or one time with type "NDArray[]" for ops with variable length inputs.
+
+Then add key-word arguments with proper type (float, string, etc).
+Operators that parse key-word arguments with `dmlc::Parameter`
+can add argument descriptions in bulk with `.add_arguments(ActivationParam::__FIELDS__())`
+(NDArray arguments still need to be manually added with type "NDArray").
+
+#### FInferShape or TIsBackward (for Backward Only Ops)
+
+Normally operators need to have `FInferShape` with prototype `bool(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_attrs, mxnet::ShapeVector *out_attrs)`. `FInferShape` fills unknown shapes (`shape.ndim() == 0`) in in_attrs/out_attrs based on known shapes in in_attrs/out_attrs. Use `ElemwiseShape<n_in, n_out>` for simple operators with uniform shapes.
+
+Operators that are only used for a backward pass can instead register `.set_attr<nnvm::TIsBackward>("TIsBackward", true)`
+and their shapes will be copied from the corresponding forward operators.
+
+#### FInferType
+
+Similar to `FInferShape`, `FInferType` fills unknown types (-1) based on known types. Use `ElemwiseType<n_in, n_out>` for simple operators with uniform types. Operators that registered `TIsBackward` don't need to register this.
+
+
+#### FInplaceOption (Optional)
+
+`FInplaceOption` with prototype `std::vector<std::pair<int, int>>(const NodeAttrs& attrs)`
+specifies which input/output pairs can be computed in-place
+and share memory with each other.
+Each pair (i, j) in the returned list means
+that the i-th input can share memory with the j-th output.
+
+
+#### FGradient (Optional for imperative use, required for symbolic use)
+
+If an operator has gradient, it can be described with `FGradient` with prototype
+
+```c++
+std::vector<nnvm::NodeEntry>(const nnvm::NodePtr& n,
+                             const std::vector<nnvm::NodeEntry>& ograds)
+```
+
+Use the utility functions `ElemwiseGradUseIn{op_name}`, `ElemwiseGradUseOut{op_name}`, and `ElemwiseGradUseNone{op_name}` for ops that need the corresponding forward op's input,
+output, or nothing to calculate the gradient.
+
+For more complicated patterns, use `MakeGradNode(op_name, n, heads, dict)` to create gradient entries,
+where heads are input entries to the backward op, composed from ograds and n->inputs.
+
+When assembling a return vector of `std::vector<nnvm::NodeEntry> ret;`, a common pattern is to
+either create nodes in place, as in:
+
+```c++
+ret.emplace_back(MakeNode("zeros_like", n->attrs.name + "_xyz_backward",
+                          {n->inputs[1]}, nullptr, &n));
+```
+
+Or create the node, modify it, and then move it into NodeEntry's constructor if the node is not to be used
+again. This avoids unnecessary copies of the shared_ptr.
+
+```c++
+for (size_t i = 0; i < n->inputs.size(); ++i) {
+ nnvm::NodePtr node = nnvm::Node::Create();
+ node->attrs.op = copy_op;
+ node->inputs = {ograds[0]};
+ ret.emplace_back(std::move(node));
+}
+```
+
+The first case relies on RVO, and the second uses in-place construction.
+
+#### FCompute<xpu>
+
+Simple operators can register FCompute with `.set_attr<FCompute>("FCompute<cpu>", ...)` and `.set_attr<FCompute>("FCompute<gpu>", ...)` for both CPU and (optionally) GPU computation.
+
+FCompute has prototype
+
+```c++
+void(const nnvm::NodeAttrs& attrs,
+     const OpContext& ctx,
+     const std::vector<TBlob>& inputs,
+     const std::vector<OpReqType>& req,
+     const std::vector<TBlob>& outputs)
+```
+
+`req` has the same length as `outputs`.
+Each entry of `req` specifies
+how the corresponding `output` should be written to.
+`OpReqType` is defined as:
+
+```c++
+enum OpReqType {
+ kNullOp,
+ kWriteTo,
+ kWriteInplace,
+ kAddTo
+};
+```
+
+Normally, the `req` of all `outputs` should be `kWriteTo`,
+meaning that the provided `outputs` tensor is a *raw* memory block,
+so the operator should write results directly into it.
+In some cases, for example, when calculating the gradient tensor,
+it would be great if we could accumulate the result,
+rather than directly overwrite the tensor contents
+so that no extra space needs to be created each time.
+In such cases, the corresponding `req` is set to `kAddTo`,
+indicating that a `+=` should be used.
+
+### Example: abs operator
+
+{% raw %}
+
+```c++
+NNVM_REGISTER_OP(abs)
+.MXNET_DESCRIBE("Take absolute value of the src")
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs){
+    return std::vector<std::pair<int, int> >{{0, 0}};
+  })
+.set_attr<FCompute>("FCompute<cpu>", UnaryCompute<cpu, mshadow_op::abs>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_abs"})
+.add_argument("data", "NDArray", "Source input");
+
+NNVM_REGISTER_OP(_backward_abs)
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<2, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<2, 1>)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs){
+    return std::vector<std::pair<int, int> >{{0, 0}, {1, 0}};
+  })
+.set_attr<FCompute>("FCompute<cpu>", BinaryCompute<cpu, backward_grad<mshadow_op::sign> >);
+```
+
+{% endraw %}
+
+### Legacy Operators
+
+For the legacy (pre 0.9) way of defining operators with C++, please see:
+- [Developer Guide - Operators](http://mxnet.io/architecture/overview.html#operators-in-mxnet)
+- [Developer Guide - SimpleOp](http://mxnet.io/architecture/overview.html#simpleop-the-unified-operator-api)
diff --git a/docs/static_site/src/pages/api/faq/nnpack.md b/docs/static_site/src/pages/api/faq/nnpack.md
new file mode 100644
index 000000000000..f03b08646f62
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/nnpack.md
@@ -0,0 +1,162 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: NNPACK for Multi-Core CPU Support in MXNet
+category: faq
+faq_c: Speed
+question: Can I use nnpack to improve the CPU performance of MXNet?
+permalink: /api/faq/nnpack
+---
+
+### NNPACK for Multi-Core CPU Support in MXNet
+[NNPACK](https://github.com/Maratyszcza/NNPACK) is an acceleration package
+for neural network computations, which can run on x86-64, ARMv7, or ARM64 architecture CPUs.
+Using NNPACK, higher-level libraries like _MXNet_ can speed up
+the execution on multi-core CPU computers, including laptops and mobile devices.
+
+_MXNet_ supports NNPACK for forward propagation (inference only) in convolution, max-pooling, and fully-connected layers.
+In this document, we give a high level overview of how to use NNPACK with _MXNet_.
+
+
+### Conditions
+The underlying implementation of NNPACK utilizes several acceleration methods,
+including [fft](https://arxiv.org/abs/1312.5851) and [winograd](https://arxiv.org/abs/1509.09308).
+These algorithms work better for some special `batch size`, `kernel size`, and `stride` settings than for others,
+so depending on the context, not all convolution, max-pooling, or fully-connected layers can be powered by NNPACK.
+When the conditions for running NNPACK are not met,
+_MXNet_ will fall back to the default implementation automatically.
+
+NNPACK only supports Linux and OS X systems. Windows is not supported at present.
+The following table explains under which conditions NNPACK will work.
+
+| operation | conditions |
+|:--------- |:---------- |
+|convolution |2d convolution `and` no-bias=False `and` dilate=(1,1) `and` num_group=1 `and` batch-size = 1 or batch-size > 1 && stride = (1,1);|
+|pooling | max-pooling `and` kernel=(2,2) `and` stride=(2,2) `and` pooling_convention=full |
+|fully-connected| without any restrictions |
+
+### Build/Install NNPACK with MXNet
+
+If your trained model meets the conditions for using NNPACK,
+you can build MXNet with NNPACK support.
+Follow these simple steps:
+* Build NNPACK shared library with the following commands. _MXNet_ will link NNPACK dynamically.
+
+Note: The following NNPACK installation instructions have been tested on Ubuntu 14.04 and 16.04.
+
+```bash
+# Install Pip
+$ sudo apt-get update
+$ sudo apt-get install -y python-pip
+$ sudo pip install --upgrade pip
+
+# Install PeachPy
+$ git clone https://github.com/Maratyszcza/PeachPy.git
+$ cd PeachPy
+$ sudo pip install --upgrade -r requirements.txt
+$ python setup.py generate
+$ sudo pip install --upgrade .
+
+# Install Ninja Build System
+$ sudo apt-get install ninja-build
+$ pip install ninja-syntax
+
+# Build NNPack shared library
+$ cd ~
+$ git clone --recursive https://github.com/Maratyszcza/NNPACK.git
+$ cd NNPACK
+# The latest NNPACK does not support building NNPACK as a shared library using the --enable-shared flag.
+# Reset to a commit that supports it.
+$ git reset --hard 9c6747d7b80051b40e6f92d6828e2ed997529cd2
+$ git submodule init && git submodule update --recursive
+$ python ./configure.py --enable-shared
+$ ninja
+$ cd ~
+
+```
+
+* Add the NNPACK library path to the `LD_LIBRARY_PATH` environment variable, e.g. `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$YOUR_NNPACK_INSTALL_PATH/lib`
+* Add the NNPACK include directory and its third-party pthreadpool include directory to `ADD_CFLAGS` in config.mk, e.g. `ADD_CFLAGS = -I$(YOUR_NNPACK_INSTALL_PATH)/include/ -I$(YOUR_NNPACK_INSTALL_PATH)/third-party/pthreadpool/include/`
+* Set `USE_NNPACK = 1` in config.mk.
+* Build MXNet from source following the [install guide](http://mxnet.io/install/index.html).
+
+### NNPACK Performance
+
+Though not all convolutional, pooling, and fully-connected layers can make full use of NNPACK,
+for some popular models it provides significant speedups. These include the most popular image recognition networks: Alexnet, VGG, and Inception-bn.
+
+To benchmark NNPACK, we use `example/image-classification/benchmark_score.py` (modified to cover a wider range of batch sizes). The numbers below were measured on an Intel Xeon E5-2670 CPU with `MXNET_CPU_NNPACK_NTHREADS=4`.
+
+The log for MXNet built without NNPACK is:
+```
+INFO:root:network: alexnet
+INFO:root:device: cpu(0)
+INFO:root:batch size 1, image/sec: 6.389429
+INFO:root:batch size 2, image/sec: 7.961457
+INFO:root:batch size 4, image/sec: 8.950112
+INFO:root:batch size 8, image/sec: 9.578176
+INFO:root:batch size 16, image/sec: 9.701248
+INFO:root:batch size 32, image/sec: 9.839940
+INFO:root:batch size 64, image/sec: 10.075369
+INFO:root:batch size 128, image/sec: 10.053556
+INFO:root:batch size 256, image/sec: 9.972228
+INFO:root:network: vgg
+INFO:root:device: cpu(0)
+INFO:root:batch size 1, image/sec: 1.223822
+INFO:root:batch size 2, image/sec: 1.322814
+INFO:root:batch size 4, image/sec: 1.383586
+INFO:root:batch size 8, image/sec: 1.402376
+INFO:root:batch size 16, image/sec: 1.415972
+INFO:root:batch size 32, image/sec: 1.428377
+INFO:root:batch size 64, image/sec: 1.443987
+INFO:root:batch size 128, image/sec: 1.427531
+INFO:root:batch size 256, image/sec: 1.435279
+```
+
+The log for MXNet built with NNPACK is:
+
+```
+INFO:root:network: alexnet
+INFO:root:device: cpu(0)
+INFO:root:batch size 1, image/sec: 19.027215
+INFO:root:batch size 2, image/sec: 12.879975
+INFO:root:batch size 4, image/sec: 17.424076
+INFO:root:batch size 8, image/sec: 21.283966
+INFO:root:batch size 16, image/sec: 24.469325
+INFO:root:batch size 32, image/sec: 25.910348
+INFO:root:batch size 64, image/sec: 27.441672
+INFO:root:batch size 128, image/sec: 28.009156
+INFO:root:batch size 256, image/sec: 28.918950
+INFO:root:network: vgg
+INFO:root:device: cpu(0)
+INFO:root:batch size 1, image/sec: 3.980907
+INFO:root:batch size 2, image/sec: 2.392069
+INFO:root:batch size 4, image/sec: 3.610553
+INFO:root:batch size 8, image/sec: 4.994450
+INFO:root:batch size 16, image/sec: 6.396612
+INFO:root:batch size 32, image/sec: 7.614288
+INFO:root:batch size 64, image/sec: 8.826084
+INFO:root:batch size 128, image/sec: 9.193653
+INFO:root:batch size 256, image/sec: 9.991472
+```
+
+The results show that NNPACK can confer a speedup of about 2X~7X as compared to the original _MXNet_ CPU implementation.
+
+### Tips
+
+NNPACK aims to provide high-performance implementations of some layers for multi-core CPUs. You can set the number of threads it uses with the environment variable `MXNET_CPU_NNPACK_NTHREADS`. However, we found that performance does not scale proportionally with the number of threads, so we suggest using 4~8 threads when using NNPACK.
diff --git a/docs/static_site/src/pages/api/faq/perf.md b/docs/static_site/src/pages/api/faq/perf.md
new file mode 100644
index 000000000000..21eefb6c3fc7
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/perf.md
@@ -0,0 +1,315 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Some Tips for Improving MXNet Performance
+category: faq
+faq_c: Speed
+question: What are the best setup and data-handling tips and tricks for improving speed?
+permalink: /api/faq/perf
+---
+
+
+# Some Tips for Improving MXNet Performance
+Even after fixing the training or deployment environment and parallelization scheme,
+a number of configuration settings and data-handling choices can impact _MXNet_ performance.
+In this document, we present some tips for improving _MXNet_ performance.
+
+Performance is mainly affected by the following 4 factors:
+
+1. Implementation of operators (Convolution, Pooling, ..)
+ - [Intel CPU](#intel-cpu)
+ - [Nvidia GPU](#nvidia-gpu)
+2. Input data loading and augmentation
+ - [Input Data](#input-data)
+3. Workloads (computation graph) optimization and scheduling
+ - [Profiler](#profiler)
+4. Communication for multi-devices training
+ - [Multiple Devices](#multiple-devices)
+
+## Intel CPU
+
+When using Intel Xeon CPUs for training and inference, the `mxnet-mkl` package is recommended. Adding `--pre` installs a nightly build from master. Without it you will install the latest patched release of MXNet:
+
+```
+$ pip install mxnet-mkl [--pre]
+```
+
+Or build MXNet from source code with `USE_MKLDNN=1`. For Linux users, `USE_MKLDNN=1` will be turned on by default.
+
+We also find that setting the following environment variables can help:
+
+
+| Variable | Description |
+| :-------- | :---------- |
+| `OMP_NUM_THREADS` | Suggested value: `vCPUs / 2` in which `vCPUs` is the number of virtual CPUs. For more information, please see the guide for [setting the number of threads using an OpenMP environment variable](https://software.intel.com/en-us/mkl-windows-developer-guide-setting-the-number-of-threads-using-an-openmp-environment-variable) |
+| `KMP_AFFINITY` | Suggested value: `granularity=fine,compact,1,0`. For more information, please see the guide for [Thread Affinity Interface (Linux* and Windows*)](https://software.intel.com/en-us/node/522691). |
+| `MXNET_SUBGRAPH_BACKEND` | Set to MKLDNN to enable the [subgraph feature](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Graph+Optimization+and+Quantization+based+on+subgraph+and+MKL-DNN) for better performance. For more information please see [Build/Install MXNet with MKL-DNN](https://github.com/apache/incubator-mxnet/blob/master/docs/tutorials/mkldnn/MKLDNN_README.md)|
+
+Note that _MXNet_ treats all CPUs on a single machine as a single device.
+So whether you specify `cpu(0)` or `cpu()`, _MXNet_ will use all CPU cores on the machine.
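+
+For example, one way to apply the settings from the table above is to export them before MXNet is imported. The sketch below is only illustrative: the thread count assumes a 36-vCPU machine, so substitute values appropriate to your hardware.
+
+```python
+import os
+
+# Apply the suggested settings before importing mxnet so they are picked up at startup.
+os.environ["OMP_NUM_THREADS"] = "18"                         # vCPUs / 2 on a 36-vCPU host (illustrative)
+os.environ["KMP_AFFINITY"] = "granularity=fine,compact,1,0"
+os.environ["MXNET_SUBGRAPH_BACKEND"] = "MKLDNN"              # enable the MKL-DNN subgraph backend
+
+import mxnet as mx
+print(mx.cpu())  # all CPU cores are treated as a single device, cpu(0)
+```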
+
+### Scoring results
+The following table shows performance of [MXNet-1.2.0.rc1](https://github.com/apache/incubator-mxnet/releases/download/1.2.0.rc1/apache-mxnet-src-1.2.0.rc1-incubating.tar.gz),
+namely number of images that can be predicted per second.
+We used [example/image-classification/benchmark_score.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/benchmark_score.py)
+to measure the performance on different AWS EC2 machines.
+
+AWS EC2 C5.18xlarge:
+
+
+| Batch | Alexnet | VGG 16 | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|--------|--------------|--------------|-----------|------------|
+| 1 | 390.53 | 81.57 | 124.13 | 62.26 | 76.22 | 32.92 |
+| 2 | 596.45 | 100.84 | 206.58 | 93.36 | 119.55 | 46.80 |
+| 4 | 710.77 | 119.04 | 275.55 | 127.86 | 148.62 | 59.36 |
+| 8 | 921.40 | 120.38 | 380.82 | 157.11 | 167.95 | 70.78 |
+| 16 | 1018.43 | 115.30 | 411.67 | 168.71 | 178.54 | 75.13 |
+| 32 | 1290.31 | 107.19 | 483.34 | 179.38 | 193.47 | 85.86 |
+
+
+
+AWS EC2 C5.9xlarge:
+
+
+| Batch | Alexnet | VGG 16 | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|-------|--------------|--------------|-----------|------------|
+| 1 | 257.77 | 50.61 | 130.99 | 66.95 | 75.38 | 32.33 |
+| 2 | 410.60 | 63.02 | 195.14 | 87.84 | 102.67 | 41.57 |
+| 4 | 462.59 | 62.64 | 263.15 | 109.87 | 127.15 | 50.69 |
+| 8 | 573.79 | 63.95 | 309.99 | 121.36 | 140.84 | 59.01 |
+| 16 | 709.47 | 67.79 | 350.19 | 128.26 | 147.41 | 64.15 |
+| 32 | 831.46 | 69.58 | 354.91 | 129.92 | 149.18 | 64.25 |
+
+
+AWS EC2 C5.4xlarge:
+
+| Batch | Alexnet | VGG 16 | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|-------|--------------|--------------|-----------|------------|
+| 1 | 214.15 | 29.32 | 114.97 | 47.96 | 61.01 | 23.92 |
+| 2 | 310.04 | 34.81 | 150.09 | 60.89 | 71.16 | 27.92 |
+| 4 | 330.69 | 34.56 | 186.63 | 74.15 | 86.86 | 34.37 |
+| 8 | 378.88 | 35.46 | 204.89 | 77.05 | 91.10 | 36.93 |
+| 16 | 424.00 | 36.49 | 211.55 | 78.39 | 91.23 | 37.34 |
+| 32 | 481.95 | 37.23 | 213.71 | 78.23 | 91.68 | 37.26 |
+
+
+AWS EC2 C5.2xlarge:
+
+| Batch | Alexnet | VGG 16 | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|-------|--------------|--------------|-----------|------------|
+| 1 | 131.01 | 15.67 | 78.75 | 31.12 | 37.30 | 14.75 |
+| 2 | 182.29 | 18.01 | 98.59 | 39.13 | 45.98 | 17.84 |
+| 4 | 189.31 | 18.25 | 110.26 | 41.35 | 49.21 | 19.32 |
+| 8 | 211.75 | 18.57 | 115.46 | 42.53 | 49.98 | 19.81 |
+| 16 | 236.06 | 19.11 | 117.18 | 42.59 | 50.20 | 19.92 |
+| 32 | 261.13 | 19.46 | 116.20 | 42.72 | 49.95 | 19.80 |
+
+
+AWS EC2 C5.xlarge:
+
+| Batch | Alexnet | VGG 16 | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|------|--------------|--------------|-----------|------------|
+| 1 | 36.64 | 3.93 | 27.06 | 10.09 | 12.98 | 5.06 |
+| 2 | 49.21 | 4.49 | 29.67 | 10.80 | 12.94 | 5.14 |
+| 4 | 50.12 | 4.50 | 30.31 | 10.83 | 13.17 | 5.19 |
+| 8 | 54.71 | 4.58 | 30.22 | 10.89 | 13.19 | 5.20 |
+| 16 | 60.23 | 4.70 | 30.20 | 10.91 | 13.23 | 5.19 |
+| 32 | 66.37 | 4.76 | 30.10 | 10.90 | 13.22 | 5.15 |
+
+
+## Other CPU
+
+If you are using other CPUs (including ARM), NNPACK can improve the running performance by 2x~7x; please check [nnpack.md](./nnpack.md) for details.
+
+## Nvidia GPU
+
+`cuDNN` typically accelerates _MXNet_ performance on NVIDIA GPUs significantly,
+especially for convolution layers.
+We suggest always checking to make sure that a recent cuDNN version is used.
+
+Setting the environment variable `MXNET_CUDNN_AUTOTUNE_DEFAULT=1` (for example, `export MXNET_CUDNN_AUTOTUNE_DEFAULT=1`) sometimes also helps.
+
+We show results when using various GPUs including K80 (EC2 p2.2xlarge), M60 (EC2 g3.4xlarge),
+and V100 (EC2 p3.2xlarge).
+
+### Scoring results
+
+Based on
+[example/image-classification/benchmark_score.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/benchmark_score.py)
+and [MXNet-1.2.0.rc1](https://github.com/apache/incubator-mxnet/releases/download/1.2.0.rc1/apache-mxnet-src-1.2.0.rc1-incubating.tar.gz), with cuDNN 7.0.5
+
+- K80 (single GPU)
+
+| Batch | Alexnet | VGG 16 | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|--------|--------------|--------------|-----------|------------|
+| 1 | 243.93 | 43.59 | 68.62 | 35.52 | 67.41 | 23.65 |
+| 2 | 338.16 | 49.14 | 113.41 | 56.29 | 93.35 | 33.88 |
+| 4 | 478.92 | 53.44 | 159.61 | 74.43 | 119.18 | 45.23 |
+| 8 | 683.52 | 70.50 | 190.49 | 86.23 | 131.32 | 50.54 |
+| 16 | 1004.66 | 109.01 | 254.20 | 105.70 | 155.40 | 62.55 |
+| 32 | 1238.55 | 114.98 | 285.49 | 116.79 | 159.42 | 64.99 |
+| 64 | 1346.72 | 123.56 | 308.73 | 122.21 | 167.58 | 70.21 |
+| 128 | 1416.91 | OOM | 320.98 | 123.11 | 171.55 | 71.85 |
+| 256 | 1462.97 | OOM | 329.16 | 127.53 | 153.01 | 57.23 |
+
+- M60
+
+| Batch | Alexnet | VGG 16 | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|--------|--------------|--------------|-----------|------------|
+| 1 | 243.49 | 59.95 | 101.97 | 48.30 | 95.46 | 39.29 |
+| 2 | 491.04 | 69.14 | 170.35 | 80.27 | 142.61 | 60.17 |
+| 4 | 711.54 | 78.94 | 257.89 | 123.09 | 182.36 | 76.51 |
+| 8 | 1077.73 | 109.34 | 343.42 | 152.82 | 208.74 | 87.27 |
+| 16 | 1447.21 | 144.93 | 390.25 | 166.32 | 220.73 | 92.41 |
+| 32 | 1797.66 | 151.86 | 416.69 | 176.56 | 230.19 | 97.03 |
+| 64 | 1779.38 | 150.18 | 427.51 | 183.47 | 239.12 | 101.59 |
+| 128 | 1787.36 | OOM | 439.04 | 185.29 | 243.31 | 103.39 |
+| 256 | 1899.10 | OOM | 450.22 | 183.42 | 242.36 | 100.98 |
+
+
+- V100
+
+| Batch | Alexnet | VGG 16 | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|--------|--------------|--------------|-----------|------------|
+| 1 | 659.51 | 205.16 | 157.37 | 87.71 | 162.15 | 61.38 |
+| 2 | 1248.21 | 265.40 | 297.34 | 159.24 | 293.74 | 116.30 |
+| 4 | 2122.41 | 333.97 | 520.91 | 279.84 | 479.14 | 195.17 |
+| 8 | 3894.30 | 420.26 | 898.09 | 455.03 | 699.39 | 294.19 |
+| 16 | 5815.58 | 654.16 | 1430.97 | 672.54 | 947.45 | 398.79 |
+| 32 | 7906.09 | 708.43 | 1847.26 | 814.59 | 1076.81 | 451.82 |
+| 64 | 9486.26 | 701.59 | 2134.89 | 899.01 | 1168.37 | 480.44 |
+| 128 | 10177.84 | 703.30 | 2318.32 | 904.33 | 1233.15 | 511.79 |
+| 256 | 10990.46 | 473.62 | 2425.28 | 960.20 | 1155.07 | 449.35 |
+
+Below is the performance result on V100 using float 16.
+
+| Batch | VGG 16 | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+| ----- | ------- | ------------ | ------------ | --------- | ---------- |
+| 1 | 276.29 | 155.53 | 150.99 | 270.89 | 96.79 |
+| 2 | 476.91 | 296.45 | 282.02 | 493.99 | 176.88 |
+| 4 | 711.92 | 525.05 | 492.45 | 851.15 | 321.52 |
+| 8 | 1047.11 | 900.26 | 807.94 | 1282.36 | 517.66 |
+| 16 | 1299.88 | 1441.41 | 1192.21 | 1722.97 | 724.57 |
+| 32 | 1486.63 | 1854.30 | 1512.08 | 2085.51 | 887.34 |
+| 64 | 1219.65 | 2138.61 | 1687.35 | 2341.67 | 1002.90 |
+| 128 | 1169.81 | 2317.39 | 1818.26 | 2355.04 | 1046.98 |
+| 256 | 764.16 | 2425.16 | 1653.74 | 1991.88 | 976.73 |
+
+### Training results
+
+Based on
+[example/image-classification/train_imagenet.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/train_imagenet.py)
+and [MXNet-1.2.0.rc1](https://github.com/apache/incubator-mxnet/releases/download/1.2.0.rc1/apache-mxnet-src-1.2.0.rc1-incubating.tar.gz), with cuDNN 7.0.5. The benchmark script is available
+[here](https://github.com/mli/mxnet-benchmark/blob/master/run_vary_batch.sh),
+where the batch size for Alexnet is increased by 16x.
+
+- K80 (single GPU)
+
+| Batch | Alexnet(\*16) | Inception-v3 | Resnet 50 |
+| --- | --- | --- | --- |
+| 1 | 300.30 | 10.48 | 15.61 |
+| 2 | 406.08 | 16.00 | 23.88 |
+| 4 | 461.01 | 22.10 | 32.26 |
+| 8 | 484.00 | 26.80 | 39.42 |
+| 16 | 490.45 | 31.62 | 46.69 |
+| 32 | 414.72 | 33.78 | 49.48 |
+
+- M60
+
+| Batch | Alexnet(\*16) | Inception-v3 | Resnet 50 |
+| --- | --- | --- | --- |
+| 1 | 380.96 | 14.06 | 20.55 |
+| 2 | 530.53 | 21.90 | 32.65 |
+| 4 | 600.17 | 31.96 | 45.57 |
+| 8 | 633.60 | 40.58 | 54.92 |
+| 16 | 639.37 | 46.88 | 64.44 |
+| 32 | 576.54 | 50.05 | 68.34 |
+
+- V100
+
+| Batch | Alexnet(\*16) | Inception-v3 | Resnet 50 |
+| --- | --- | --- | --- |
+| 1 | 1629.52 | 21.83 | 34.54 |
+| 2 | 2359.73 | 40.11 | 65.01 |
+| 4 | 2687.89 | 72.79 | 113.49 |
+| 8 | 2919.02 | 118.43 | 174.81 |
+| 16 | 2994.32 | 173.15 | 251.22 |
+| 32 | 2585.61 | 214.48 | 298.51 |
+| 64 | 1984.21 | 247.43 | 343.19 |
+| 128 | OOM | 253.68 | 363.69 |
+
+## Multiple Devices
+
+If more than one GPU or machine is used, MXNet uses `kvstore` to communicate data.
+It's critical to use the proper type of `kvstore` to get the best performance.
+Refer to [multi_device.md](http://mxnet.io/faq/multi_devices.html) for more
+details.
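+
+As a rough illustration, the kvstore type is just a string chosen when the store is created (or passed to `Module.fit`/`gluon.Trainer`); the choice of `'device'` below is only an example:
+
+```python
+import mxnet as mx
+
+# 'device' aggregates gradients on GPU instead of CPU ('local');
+# multi-machine training would use 'dist_sync' or 'dist_async' instead.
+kv = mx.kvstore.create('device')
+print(kv.type)
+
+# The same string can be passed as Module.fit(..., kvstore='device')
+# or gluon.Trainer(params, 'sgd', kvstore=kv).
+```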
+
+In addition, we can use [tools/bandwidth](https://github.com/dmlc/mxnet/tree/master/tools/bandwidth)
+to find the communication cost per batch.
+Ideally, the communication cost should be less than the time to compute a batch.
+To reduce the communication cost, we can consider:
+
+- Exploring different `--kv-store` options.
+- Increasing the batch size to improve the computation to communication ratio.
+
+## Input Data
+
+To make sure you're handling input data in a reasonable way, consider the following (a concrete iterator sketch follows this list):
+
+* Data format: If you are using the `rec` format, then everything should be fine.
+* Decoding: By default, _MXNet_ uses 4 CPU threads for decoding images.
+This is often sufficient to decode more than 1K images per second.
+If you are using a low-end CPU or your GPUs are very powerful, you can increase the number of threads.
+* Storage location. Any local or distributed file system (HDFS, Amazon S3) should be fine.
+If multiple devices read the data from the shared network file system (NFS) at the same time, problems might occur.
+* Use a large batch size. We often choose the largest one that fits into GPU memory.
+A value that's too large can slow down convergence.
+For example, the safe batch size for CIFAR 10 is approximately 200, while for ImageNet 1K, the batch size can exceed 1K.
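+
+To make these points concrete, here is a minimal sketch of an `ImageRecordIter` configured with extra decoding threads and a large batch size; the `.rec` path and the exact values are placeholders:
+
+```python
+import mxnet as mx
+
+train_iter = mx.io.ImageRecordIter(
+    path_imgrec="data/train.rec",    # placeholder path to a RecordIO file
+    data_shape=(3, 224, 224),
+    batch_size=256,                  # pick the largest batch that fits in memory
+    shuffle=True,
+    preprocess_threads=8)            # more than the default 4 decoding threads
+```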
+
+## Profiler
+
+As of v0.9.1 (with the NNVM merge), _MXNet_ has a built-in profiler
+that gives detailed information about execution time at the symbol level.
+This feature complements general profiling tools like _nvprof_ and _gprof_
+by summarizing at the operator level, instead of a function, kernel, or instruction level.
+
+The profiler can be turned on with an [environment variable](http://mxnet.io/faq/env_var.html#control-the-profiler)
+for an entire program run, or programmatically for just part of a run.
+See [example/profiler](https://github.com/dmlc/mxnet/tree/master/example/profiler)
+for complete examples of how to use the profiler in code, but briefly, the Python code looks like:
+
+```python
+ mx.profiler.set_config(profile_all=True, filename='profile_output.json')
+ mx.profiler.set_state('run')
+
+ # Code to be profiled goes here...
+
+ mx.profiler.set_state('stop')
+```
+
+The `mode` parameter can be set to
+
+* `symbolic` to only include symbolic operations
+* `all` to include all operations
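+
+With the newer `set_config` API used above, a similar effect can be obtained with boolean flags; a minimal sketch (flag names as in recent MXNet 1.x releases; adjust to your version):
+
+```python
+import mxnet as mx
+
+# Profile symbolic operators only, instead of profile_all=True.
+mx.profiler.set_config(profile_symbolic=True,
+                       profile_imperative=False,
+                       filename='profile_symbolic.json')
+mx.profiler.set_state('run')
+# ... code to be profiled ...
+mx.profiler.set_state('stop')
+```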
+
+After the program finishes, navigate to your browser's tracing (Example - chrome://tracing in a Chrome browser) and load the `profile_output.json` file output by the profiler to inspect the results.
+
+![MLP Profile](https://cloud.githubusercontent.com/assets/17693755/18035938/0a43484a-6d93-11e6-80d4-241c6ca552ea.png)
+
+Note that the output file can grow extremely large, so this approach is not recommended for general use.
diff --git a/docs/static_site/src/pages/api/faq/recordio.md b/docs/static_site/src/pages/api/faq/recordio.md
new file mode 100644
index 000000000000..93389a1da400
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/recordio.md
@@ -0,0 +1,109 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Create a Dataset Using RecordIO
+category: faq
+faq_c: Speed
+question: How can I create a .rec dataset ?
+permalink: /api/faq/recordio
+---
+
+
+## Create a Dataset Using RecordIO
+
+RecordIO implements a file format for a sequence of records. We recommend storing images as records and packing them together. The benefits include:
+
+* Storing images in a compact format (e.g., JPEG) greatly reduces the size of the dataset on disk.
+* Packing data together allows continuous reading on the disk.
+* RecordIO has a simple way of partitioning, which simplifies distributed settings. We provide an example later.
+
+We provide two tools for creating a RecordIO dataset.
+
+* [im2rec.cc](https://github.com/dmlc/mxnet/blob/master/tools/im2rec.cc) - implements the tool using the C++ API.
+* [im2rec.py](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py) - implements the tool using the Python API.
+
+Both provide the same output: a RecordIO dataset.
+You may want to also review the [example using real-world data with im2rec.py.](https://mxnet.incubator.apache.org/tutorials/basic/data.html#loading-data-using-image-iterators)
+
+### Prerequisites
+
+Download the data. You don't need to resize the images manually. You can use ```im2rec``` to resize them automatically. For details, see "Extension: Multiple Labels for a Single Image," later in this topic.
+
+### Step 1. Make an Image List File
+
+* Note that `im2rec.py` provides a `--list` parameter to generate the list for you, but `im2rec.cc` does not support it.
+
+After you download the data, you need to make an image list file. The format is:
+
+```
+integer_image_index \t label_index \t path_to_image
+```
+Typically, the program takes a list of the names of all the images, shuffles them, and then separates them into two lists: a training filename list and a testing filename list. Write the lists in this format.
+This is an example file:
+
+```bash
+95099 464.000000 n04467665_17283.JPEG
+10025081 412.000000 ILSVRC2010_val_00025082.JPEG
+74181 789.000000 n01915811_2739.JPEG
+10035553 859.000000 ILSVRC2010_val_00035554.JPEG
+10048727 929.000000 ILSVRC2010_val_00048728.JPEG
+94028 924.000000 n01980166_4956.JPEG
+1080682 650.000000 n11807979_571.JPEG
+972457 633.000000 n07723039_1627.JPEG
+7534 11.000000 n01630670_4486.JPEG
+1191261 249.000000 n12407079_5106.JPEG
+```
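+
+For illustration only, here is a small Python sketch that shuffles image paths and writes them in this format; the directory layout, label assignment, and output file names below are hypothetical:
+
+```python
+import os
+import random
+
+# Hypothetical layout: images/<class_name>/<file>.jpg, classes mapped to integer labels.
+image_dir = "images"
+label_map = {name: i for i, name in enumerate(sorted(os.listdir(image_dir)))}
+entries = []
+for name, label in label_map.items():
+    for fname in os.listdir(os.path.join(image_dir, name)):
+        entries.append((label, os.path.join(name, fname)))
+
+random.shuffle(entries)
+split = int(0.9 * len(entries))
+for lst_name, subset in [("train.lst", entries[:split]), ("val.lst", entries[split:])]:
+    with open(lst_name, "w") as f:
+        for idx, (label, path) in enumerate(subset):
+            # integer_image_index \t label_index \t path_to_image
+            f.write("%d\t%f\t%s\n" % (idx, label, path))
+```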
+
+### Step 2. Create the Binary File
+
+To generate the binary RecordIO file, use `im2rec` in the tools folder. `im2rec` takes the path of the `image list file` you generated, the `root path` of the images, and the `output file path` as input. This process usually takes several hours, so be patient.
+
+Sample command:
+
+```bash
+./bin/im2rec image.lst image_root_dir output.bin resize=256
+```
+For more details, run ```./bin/im2rec```.
+
+### Extension: Multiple Labels for a Single Image
+
+The `im2rec` tool and `mx.io.ImageRecordIter` have multi-label support for a single image.
+For example, if you have four labels for a single image, you can use the following procedure to use the RecordIO tools.
+
+1. Write the image list files as follows:
+
+```
+integer_image_index \t label_1 \t label_2 \t label_3 \t label_4 \t path_to_image
+```
+
+2. Run `im2rec`, adding `label_width=4` to the command arguments, for example:
+
+```bash
+./bin/im2rec image.lst image_root_dir output.bin resize=256 label_width=4
+```
+
+3. In the iterator generation code, set `label_width=4` and `path_imglist=<>`, for example:
+
+```python
+dataiter = mx.io.ImageRecordIter(
+ path_imgrec="data/cifar/train.rec",
+ data_shape=(3,28,28),
+ path_imglist="data/cifar/image.lst",
+ label_width=4
+)
+```
diff --git a/docs/static_site/src/pages/api/faq/s3_integration.md b/docs/static_site/src/pages/api/faq/s3_integration.md
new file mode 100644
index 000000000000..7c144c267c0a
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/s3_integration.md
@@ -0,0 +1,125 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Use data from S3 for training
+category: faq
+faq_c: Deployment Environments
+question: How to use data from S3 for training?
+permalink: /api/faq/s3_integration
+---
+
+
+# Use data from S3 for training
+
+AWS S3 is a cloud-based object storage service that allows storage and retrieval of large amounts of data at a very low cost. This makes it an attractive option to store large training datasets. MXNet is deeply integrated with S3 for this purpose.
+
+An S3 protocol URL (like `s3://bucket-name/training-data`) can be provided as a parameter for any data iterator that takes a file path as input. For example,
+
+```
+data_iter = mx.io.ImageRecordIter(
+ path_imgrec="s3://bucket-name/training-data/caltech_train.rec",
+ data_shape=(3, 227, 227),
+ batch_size=4,
+ resize=256)
+```
+Following are detailed instructions on how to use data from S3 for training.
+
+## Step 1: Build MXNet with S3 integration enabled
+
+Follow instructions [here](http://mxnet.io/install/index.html) to install MXNet from source with the following additional steps to enable S3 integration.
+
+1. Install `libcurl4-openssl-dev` and `libssl-dev` before building MXNet. These packages are required to read/write from AWS S3.
+2. Append `USE_S3=1` to `config.mk` before building MXNet.
+ ```
+ echo "USE_S3=1" >> config.mk
+ ```
+
+## Step 2: Configure S3 authentication tokens
+
+MXNet requires the S3 environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` to be set. [Here](https://aws.amazon.com/blogs/security/wheres-my-secret-access-key/) are instructions to get the access keys from AWS console.
+
+```
+export AWS_ACCESS_KEY_ID=<your_aws_access_key_id>
+export AWS_SECRET_ACCESS_KEY=<your_aws_secret_access_key>
+```
+
+## Step 3: Upload data to S3
+
+There are several ways to upload data to S3. One easy way is to use the AWS command line utility. For example, the following `sync` command will recursively copy contents from a local directory to a directory in S3.
+
+```
+aws s3 sync ./training-data s3://bucket-name/training-data
+```
+
+## Step 4: Train with data from S3
+
+Once the data is in S3, it is very straightforward to use it from MXNet. Any data iterator that can read/write data from a local drive can also read/write data from S3.
+
+Let's modify an existing example code in MXNet repository to read data from S3 instead of local disk. [`mxnet/tests/python/train/test_conv.py`](https://github.com/dmlc/mxnet/blob/master/tests/python/train/test_conv.py) trains a convolutional network using MNIST data from local disk. We'll do the following change to read the data from S3 instead.
+
+```
+~/mxnet$ sed -i -- 's/data\//s3:\/\/bucket-name\/training-data\//g' ./tests/python/train/test_conv.py
+
+~/mxnet$ git diff ./tests/python/train/test_conv.py
+diff --git a/tests/python/train/test_conv.py b/tests/python/train/test_conv.py
+index 039790e..66a60ce 100644
+--- a/tests/python/train/test_conv.py
++++ b/tests/python/train/test_conv.py
+@@ -39,14 +39,14 @@ def get_iters():
+
+ batch_size = 100
+ train_dataiter = mx.io.MNISTIter(
+- image="data/train-images-idx3-ubyte",
+- label="data/train-labels-idx1-ubyte",
++ image="s3://bucket-name/training-data/train-images-idx3-ubyte",
++ label="s3://bucket-name/training-data/train-labels-idx1-ubyte",
+ data_shape=(1, 28, 28),
+ label_name='sm_label',
+ batch_size=batch_size, shuffle=True, flat=False, silent=False, seed=10)
+ val_dataiter = mx.io.MNISTIter(
+- image="data/t10k-images-idx3-ubyte",
+- label="data/t10k-labels-idx1-ubyte",
++ image="s3://bucket-name/training-data/t10k-images-idx3-ubyte",
++ label="s3://bucket-name/training-data/t10k-labels-idx1-ubyte",
+ data_shape=(1, 28, 28),
+ label_name='sm_label',
+ batch_size=batch_size, shuffle=True, flat=False, silent=False)
+```
+
+After the above change, `test_conv.py` will fetch data from S3 instead of the local disk.
+
+```
+python ./tests/python/train/test_conv.py
+[21:59:19] src/io/s3_filesys.cc:878: No AWS Region set, using default region us-east-1
+[21:59:21] src/io/iter_mnist.cc:94: MNISTIter: load 60000 images, shuffle=1, shape=(100,1,28,28)
+[21:59:21] src/io/iter_mnist.cc:94: MNISTIter: load 10000 images, shuffle=1, shape=(100,1,28,28)
+INFO:root:Start training with [cpu(0)]
+Start training with [cpu(0)]
+INFO:root:Epoch[0] Resetting Data Iterator
+Epoch[0] Resetting Data Iterator
+INFO:root:Epoch[0] Time cost=11.277
+Epoch[0] Time cost=11.277
+INFO:root:Epoch[0] Validation-accuracy=0.955100
+Epoch[0] Validation-accuracy=0.955100
+INFO:root:Finish fit...
+Finish fit...
+INFO:root:Finish predict...
+Finish predict...
+INFO:root:final accuracy = 0.955100
+final accuracy = 0.955100
+```
diff --git a/docs/static_site/src/pages/api/faq/security.md b/docs/static_site/src/pages/api/faq/security.md
new file mode 100644
index 000000000000..6a47244ea9ef
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/security.md
@@ -0,0 +1,49 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: MXNet Security Best Practices
+category: faq
+faq_c: Security
+question: How to run MXNet securely?
+permalink: /api/faq/security
+---
+
+# MXNet Security Best Practices
+
+The MXNet framework has no built-in security protections. It assumes that the MXNet entities involved in model training and inference (hosting) are fully trusted. It also assumes that their communications cannot be eavesdropped on or tampered with. MXNet consumers shall ensure that the above assumptions are met.
+
+In particular, the following threat vectors exist when training using MXNet:
+
+* When running distributed training using MXNet there is no built-in support for authenticating cluster nodes participating in the training job.
+* Data exchanged between cluster nodes is in plain text.
+* Using `kvstore.set_optimizer` one can use a custom optimizer to combine gradients. This optimizer code is sent to the server nodes as a pickle file. A server does not perform any further validation of the pickle file and simply executes the code trusting the sender (worker).
+* Since there is no authentication between nodes, a malicious actor running on the same network can launch a Denial of Service (DoS) attack by sending data that can overwhelm/crash a scheduler or other server nodes.
+
+It is highly recommended that the following best practices be followed when using MXNet:
+
+* Run MXNet with least privilege, i.e. not as root.
+* Run MXNet training jobs inside a secure and isolated environment. If you are using a cloud provider like Amazon AWS, running your training job inside a [private VPC](https://aws.amazon.com/vpc/) is a good way to accomplish this. Additionally, configure your network security settings so as to only allow connections that the cluster nodes require.
+* Make sure no unauthorized actors have physical or remote access to the nodes participating in MXNet training.
+* During training, one can configure MXNet to periodically save model checkpoints. To protect these model checkpoints from unauthorized access, make sure the checkpoints are written out to an encrypted storage volume, and have a provision to delete checkpoints that are no longer needed.
+* When sharing trained models, or when receiving trained models from other parties, ensure that model artifacts are authenticated and integrity protected using cryptographic signatures, thus ensuring that the data received comes from trusted sources and has not been maliciously (or accidentally) modified in transit.
+* By default, mx.random uses a static and fixed seed value. The random utilities in MXNet should therefore never be used to implement any type of security critical functionality where cryptographically secure pseudorandom number generation is required.
+
+# Deployment Considerations
+The following are not MXNet framework specific threats but are applicable to Machine Learning models in general.
+
+* When deploying high-value, proprietary models for inference, care should be taken to prevent an adversary from stealing the model. The research paper [Stealing Machine Learning Models via Prediction APIs](https://arxiv.org/pdf/1609.02943.pdf) outlines experiments performed to show how an attacker can use a prediction API to leak the ML model or construct a nearly identical replica. A simple way to thwart such an attack is to not expose the prediction probabilities to a high degree of precision in the API response.
diff --git a/docs/static_site/src/pages/api/faq/smart_device.md b/docs/static_site/src/pages/api/faq/smart_device.md
new file mode 100644
index 000000000000..5ed9ebc31bec
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/smart_device.md
@@ -0,0 +1,120 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Deep Learning at the Edge
+category: faq
+faq_c: Deployment Environments
+question: How to run MXNet on smart or edge devices?
+permalink: /api/faq/smart_device
+---
+
+# Deep Learning in a Single File for Smart Devices
+
+Deep learning (DL) systems are complex and often depend on a number of libraries.
+Porting a DL library to different platforms can be painful, especially for smart devices.
+One simple solution to this problem is to provide a light interface to the library, complete with all required code in a single file with minimal dependencies.
+In this document, we explain how to amalgamate all necessary code into a single file,
+and demonstrate the approach with an example in which we run image recognition on a mobile device.
+
+## Amalgamation: Making the Whole System a Single File
+
+We come to the idea of amalgamation following the example of SQLite,
+which packs all the code needed to run a simple database into a single source file.
+All that's necessary to create the library is to compile that single file.
+This simplifies the problem of porting to various platforms.
+
+Thanks to [Jack Deng](https://github.com/jdeng),
+MXNet provides an [amalgamation](https://github.com/dmlc/mxnet/tree/master/amalgamation) script
+that compiles all code needed for prediction based on trained DL models into a single `.cc` file,
+containing approximately 30K lines of code. This code only depends on the BLAS library.
+Moreover, we've also created an even more minimal version,
+with the BLAS dependency removed.
+You can compile the single file into JavaScript by using [emscripten](https://github.com/kripken/emscripten).
+
+The compiled library can be used by any other programming language.
+The `.h` file contains a light prediction API.
+Porting to another language with a C foreign function interface requires little effort.
+
+For examples, see the following examples on GitHub:
+
+- Go: [https://github.com/jdeng/gomxnet](https://github.com/jdeng/gomxnet)
+- Java: [https://github.com/dmlc/mxnet/tree/master/amalgamation/jni](https://github.com/dmlc/mxnet/tree/master/amalgamation/jni)
+- Python: [https://github.com/dmlc/mxnet/tree/master/amalgamation/python](https://github.com/dmlc/mxnet/tree/master/amalgamation/python)
+
+
+If you plan to amalgamate your system, there are a few guidelines you ought to observe when building the project:
+
+- Minimize dependence on other libraries.
+- Use namespaces to encapsulate the types and operators.
+- Avoid directives such as ```using namespace xyz``` at global scope.
+- Avoid cyclic include dependencies.
+
+
+## Image Recognition Demo on Mobile Devices
+
+With amalgamation, deploying the system on smart devices (such as Android or iOS) is simple. But there are two additional considerations:
+
+- The model should be small enough to fit into the device's memory.
+- The model shouldn't be too expensive to run given the relatively low computational power of these devices.
+
+Let's use image recognition as an example.
+We start with the state-of-the-art inception model.
+We train it on an ImageNet dataset,
+using multiple servers with GTX 980 cards.
+The resulting model fits into memory,
+but it's too expensive to run.
+We remove some layers, but now the results are poor.
+
+Finally, we show an Android example, thanks to Leliana, [https://github.com/Leliana/WhatsThis](https://github.com/Leliana/WhatsThis) to demonstrate how to run on Android.
+
+
+
+
+By using amalgamation, we can easily port the prediction library to mobile devices, with nearly no dependencies.
+After compiling the library for smart platforms, the last thing we must do is call the C API from the target language (Java/Swift).
+
+
+
+Besides this pre-trained Inception-BatchNorm network, we've provided two pre-trained models.
+
+We tested our model on a Nexus 5:
+
+
+| | Top-1 Validation on ILSVRC2012 | Time | App Size | Runtime Temp Memory Req |
+| ---------------- | ----------------------------------- | ----- | --- | ------------ |
+| FastPoorNet | around 52%, similar to 2011 winner | 1s | <10MB | <5MB |
+| Sub InceptionBN | around 64%, similar to 2013 winner | 2.7s | <40MB | <10MB |
+| InceptionBN | around 70% | 4s-5s | <60MB | 10MB |
+
+These models are for demonstration only.
+They aren't fine-tuned for mobile devices,
+and there is definitely room for improvement.
+We believe that making a lightweight, portable,
+and fast deep learning library is fun and interesting,
+and hope you enjoy using the library.
+
+## Source Code
+[https://github.com/Leliana/WhatsThis](https://github.com/Leliana/WhatsThis)
+
+
+## Demo APK Download
+
+- [FastPoorNet](https://github.com/dmlc/web-data/blob/master/mxnet/apk/fastpoornet.apk?raw=true)
+
+
+- [SubInception](https://github.com/dmlc/web-data/blob/master/mxnet/apk/subinception.apk?raw=true)
diff --git a/docs/static_site/src/pages/api/faq/visualize_graph.md b/docs/static_site/src/pages/api/faq/visualize_graph.md
new file mode 100644
index 000000000000..e520bc71d640
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/visualize_graph.md
@@ -0,0 +1,88 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Visualize Neural Networks
+category: faq
+faq_c: Model
+question: How do I visualize neural networks as computation graphs?
+permalink: /api/faq/visualize_graph
+---
+# How to visualize Neural Networks as computation graphs
+
+Here, we'll demonstrate how to use ```mx.viz.plot_network```
+for visualizing your neural networks. ```mx.viz.plot_network```
+represents the neural network as a computation graph consisting of nodes and edges.
+The visualizations make clear which nodes correspond to inputs,
+where the computation starts,
+and which correspond to output nodes,
+from which the result can be read.
+
+## Prerequisites
+You need the [Jupyter Notebook](http://jupyter.readthedocs.io/en/latest/)
+and [Graphviz](http://www.graphviz.org/) libraries to visualize the network.
+Please make sure you have followed the [installation instructions](http://mxnet.io/install/index.html)
+to set up the above dependencies along with MXNet.
+
+## Visualize the sample Neural Network
+
+```mx.viz.plot_network``` takes a [Symbol](http://mxnet.io/api/python/symbol/symbol.html) containing your network definition, along with optional `node_attrs` (parameters for the shape of the nodes in the graph), and generates a computation graph.
+
+We will now try to visualize a sample Neural Network for linear matrix factorization:
+- Start Jupyter notebook server
+```bash
+ $ jupyter notebook
+```
+- Access Jupyter notebook in your browser - http://localhost:8888/.
+- Create a new notebook - "File -> New Notebook -> Python 2"
+- Copy and run the code below to visualize a simple network.
+
+```python
+import mxnet as mx
+user = mx.symbol.Variable('user')
+item = mx.symbol.Variable('item')
+score = mx.symbol.Variable('score')
+
+# Set dummy dimensions
+k = 64
+max_user = 100
+max_item = 50
+
+# user feature lookup
+user = mx.symbol.Embedding(data = user, input_dim = max_user, output_dim = k)
+
+# item feature lookup
+item = mx.symbol.Embedding(data = item, input_dim = max_item, output_dim = k)
+
+# predict by the inner product, which is elementwise product and then sum
+net = user * item
+net = mx.symbol.sum_axis(data = net, axis = 1)
+net = mx.symbol.Flatten(data = net)
+
+# loss layer
+net = mx.symbol.LinearRegressionOutput(data = net, label = score)
+
+# Visualize your network
+mx.viz.plot_network(net)
+```
+You should see a computation graph similar to the following image:
+
+
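+If you are working outside a notebook, the object returned by `mx.viz.plot_network` is a `graphviz` digraph, so it can also be rendered to a file; a minimal sketch (the output file name is arbitrary, and the Graphviz binaries must be installed):
+
+```python
+graph = mx.viz.plot_network(net)
+graph.render("matrix_factorization")  # writes matrix_factorization.pdf (and the DOT source) to disk
+```
+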
+# References
+* [Example MXNet Matrix Factorization](https://github.com/dmlc/mxnet/blob/master/example/recommenders/demo1-MF.ipynb)
+* [Visualizing CNN Architecture of MXNet Tutorials](http://josephpcohen.com/w/visualizing-cnn-architectures-side-by-side-with-mxnet/)
diff --git a/docs/static_site/src/pages/api/faq/why_mxnet.md b/docs/static_site/src/pages/api/faq/why_mxnet.md
new file mode 100644
index 000000000000..779458ed0c9c
--- /dev/null
+++ b/docs/static_site/src/pages/api/faq/why_mxnet.md
@@ -0,0 +1,207 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_category
+title: Why MXNet came to be?
+category: faq
+faq_c: Extend and Contribute to MXNet
+question: Why was MXNet developed in the first place ?
+permalink: /api/faq/why_mxnet
+---
+
+# Why was MXNet developed in the first place ?
+
+Probably, if you've stumbled upon this page, you've heard of _deep learning_.
+Deep learning denotes the modern incarnation of neural networks,
+and it's the technology behind recent breakthroughs
+in self-driving cars, machine translation, speech recognition and more.
+Since widespread interest in deep learning took off in 2012,
+it has become an indispensable tool for countless industries.
+
+![alt text](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/get-started/image-classification.png)
+
+It might not come as a surprise that researchers
+have investigated neural networks for decades.
+Warren McCulloch and Walter Pitts
+suggested the forerunner of today's artificial neurons back in 1943.
+Each neuron is connected to other neurons along _edges_, analogous to the synapses that connect real neurons.
+And associated with each edge is a _weight_ that indicates whether the connection is excitatory or inhibitory, as well as the strength of the connection.
+
+![alt_text](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/get-started/artificial-neuron-2.png)
+
+In the 1980s, the modern version of neural networks took shape.
+Researchers arranged artificial neurons into _layers_.
+Neurons in any layer get input from the neurons in the layers below them.
+And, in turn, their output feeds into the neurons in the layer above.
+Typically, the lowest layer represents the _input_ to a neural network.
+After computing the values of each layer, the _output_ values are read out from the topmost layer.
+The behavior of the network is determined by the setting of the weights.
+And the process of _learning_ in neural networks
+is precisely the process of searching for good settings of these _weights_.
+
+All that we need is an algorithm that tells us how to perform this search.
+And since David Rumelhart and colleagues
+introduced the _backpropagation_ learning algorithm to train neural networks,
+nearly all the major ideas have been in place.
+Still, for many years neural networks took a backseat
+to classical statistical methods like logistic regression and support vector machines (SVMs).
+So you might reasonably ask, what's changed to garner such interest?
+
+## Scale and Computation
+The two biggest factors driving innovation in deep learning now are data and computation.
+With distributed cloud computing and parallelism across GPU cores,
+we can train models millions of times faster than researchers could in the 1980s.
+The availability of large, high-quality datasets is another factor driving the field forward.
+In the 1990s, the best datasets in computer vision had thousands of low-resolution images and ground truth assignments to a small number of classes.
+Today, researchers cut their teeth on ImageNet, a massive dataset containing millions of high-resolution images from a thousand distinct classes.
+The falling price of storage and high network bandwidth
+make it affordable to work with big data at will.
+
+In this new world, with bigger datasets and abundant computation,
+neural networks dominate on most pattern recognition problems.
+Over the last five years, neural networks have come to dominate on nearly every problem in computer vision,
+replacing classical models and hand-engineered features.
+Similarly, nearly every production speech recognition system now relies on neural networks,
+replacing the hidden Markov models that previously held sway.
+
+![alt text](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/get-started/nvidia-gpus.jpg)
+
+While GPUs and clusters present a huge opportunity for accelerating neural network training,
+adapting traditional machine learning code
+to take advantage of these resources can be challenging.
+The familiar scientific computing stacks (Matlab, R, or NumPy & SciPy)
+give no straightforward way to exploit these distributed resources.
+
+Acceleration libraries like _MXNet_ offer powerful tools
+to help developers exploit the full capabilities of GPUs and cloud computing.
+While these tools are generally useful and applicable to any mathematical computation, _MXNet_ places a special emphasis on speeding up the development and deployment of large-scale deep neural networks. In particular, we offer the following capabilities:
+* __Device Placement:__ With _MXNet_, it's easy to specify where each data structure should live.
+* __Multi-GPU training__: _MXNet_ makes it easy to scale computation with the number of available GPUs.
+* __Automatic differentiation__: _MXNet_ automates the derivative calculations that once bogged down neural network research.
+* __Optimized Predefined Layers__: While you can code up your own layers in _MXNet_, the predefined layers are optimized for speed, outperforming competing libraries.
+
+## Deep Nets on Fast Computers
+While MXNet can accelerate any numerical computation,
+we developed the library with neural networks in mind.
+However you plan to use MXNet, neural networks make for a powerful motivating example to display MXNet's capabilities.
+
+Neural networks are just functions for transforming input arrays `X` into output arrays `Y`.
+In the case of image classification, `X` might represent the pixel values of an image, and `Y` might represent the corresponding probabilities that the image belongs to each of `10` classes.
+For language translation, `X` and `Y` both might denote sequences of words. We'll revisit the way you might represent sequences in subsequent tutorials - so for now it's safe to think of `X` and `Y` as fixed length vectors.
+
+To perform this mapping, neural networks stack _layers_ of computation. Each layer consists of a linear function followed by a nonlinear transformation. In _MXNet_ we might express this as:
+```python
+hidden_linear = mx.sym.dot(X, W)
+hidden_activation = mx.sym.tanh(hidden_linear)
+```
+The linear transformations consist of multiplication by parameter arrays (`W` above).
+When we talk about learning we mean finding the right set of values for `W`.
+With just one layer, we can implement the familiar family of linear models,
+including linear and logistic regression, linear support vector machines (SVMs), and the perceptron algorithm.
+With more layers and a few clever constraints, we can implement all of today's state-of-the-art deep learning techniques.
+
+Of course, tens or hundreds of matrix multiplications can be computationally taxing.
+Generally, these linear operations are the computational bottleneck.
+Fortunately, linear operators can be parallelized trivially across the thousands of cores on a GPU.
+But low-level GPU programming requires specialized skills that are not common even among leading researchers in the ML community. Moreover, even for CUDA experts, implementing a new neural network architecture shouldn't require weeks of programming to implement low-level linear algebra operations. That's where _MXNet_ comes in.
+* _MXNet_ provides optimized numerical computation for GPUs and distributed ecosystems, from the comfort of high-level environments like Python and R
+* _MXNet_ automates common workflows, so standard neural networks can be expressed concisely in just a few lines of code
+
+Now let's take a closer look at the computational demands of neural networks
+and give a sense of how _MXNet_ helps us to write better, faster, code.
+Say we have a neural network trained to recognize spam from the content of emails.
+The emails may be streaming from an online service (at inference time),
+or from a large offline dataset __D__ (at training time).
+In either case, the dataset typically must be managed by the CPU.
+
+![alt text](https://raw.githubusercontent.com/kevinthesun/web-data/master/mxnet/get-started/architecture.png)
+
+To compute the transformation of a neural network quickly, we need both the parameters and data points to make it into GPU memory. For any example _X_, the parameters _W_ are the same. Moreover the size of the model tends to dwarf the size of an individual example. So we might arrive at the natural insight that parameters should always live on the GPU, even if the dataset itself must live on the CPU or stream in. This prevents IO from becoming the bottleneck during training or inference.
+
+Fortunately, _MXNet_ makes this kind of assignment easy.
+```python
+import mxnet as mx
+import mxnet.ndarray as nd
+
+X = nd.zeros((10000, 40000), ctx=mx.cpu(0))        # Allocate an array to store 10000 datapoints (of 40k dimensions) that lives on the CPU
+W1 = nd.zeros(shape=(40000, 1024), ctx=mx.gpu(0))  # Allocate a 40k x 1024 weight matrix on GPU for the 1st layer of the net
+W2 = nd.zeros(shape=(1024, 10), ctx=mx.gpu(0))     # Allocate a 1024 x 10 weight matrix on GPU for the 2nd layer of the net
+```
+
+
+Similarly, _MXNet_ makes it easy to specify the computing device
+
+```python
+with mx.Context(mx.gpu()): # Absent this statement, by default, MXNet will execute on CPU
+ h = nd.tanh(nd.dot(X, W1))
+ y = nd.sigmoid(nd.dot(h, W2))
+```
+
+Thus, with only a high-level understanding of how our numerical computation maps onto an execution environment, _MXNet_ allows us to exert fine-grained control when needed.
+
+## Nuts and Bolts
+
+MXNet supports two styles of programming: _imperative programming_ (supported by the _NDArray_ API) and _symbolic programming_ (supported by the _Symbol_ API). In short, imperative programming is the style that you're likely to be most familiar with. Here if A and B are variables denoting matrices, then `C = A + B` is a piece of code that _when executed_ sums the values referenced by `A` and `B` and stores their sum `C` in a new variable. Symbolic programming, on the other hand, allows functions to be defined abstractly through computation graphs. In the symbolic style, we first express complex functions in terms of placeholder values. Then, we can execute these functions by _binding them_ to real values.
+
+
+### Imperative Programming with _NDArray_
+If you're familiar with NumPy, then the mechanics of _NDArray_ should be old hat. Like the corresponding `numpy.ndarray`, `mxnet.ndarray` (`mxnet.nd` for short) allows us to represent and manipulate multi-dimensional, homogenous arrays of fixed-size components. Converting between the two is effortless:
+
+```python
+import numpy as np
+import mxnet.ndarray as nd
+
+# Create an mxnet NDArray from a numpy array
+A_np = np.array([[0,1,2,3,4],[5,6,7,8,9]])
+A_nd = nd.array(A_np)
+
+# Convert back to a numpy array
+A2_np = A_nd.asnumpy()
+```
+
+Other deep learning libraries tend to rely exclusively on NumPy and its syntax for imperative programming.
+So you might reasonably wonder, why do we need to bother with _NDArray_?
+Put simply, other libraries only reap the advantages of GPU computing when executing symbolic functions. By using _NDArray_, _MXNet_ users can specify device context and run on GPUs. In other words, _MXNet_ gives you access to the high-speed computation for imperative operations that Tensorflow and Theano only give for symbolic operations.
+
+
+```python
+X = mx.nd.array([[1,2],[3,4]])
+Y = mx.nd.array([[5,6],[7,8]])
+result = X + Y
+```
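+
+And, assuming a GPU build of MXNet and an available GPU, the same computation can be placed on the GPU simply by specifying the context when the arrays are created; a small sketch:
+
+```python
+# Assumes a GPU is available; otherwise keep the default cpu() context.
+X_gpu = mx.nd.array([[1, 2], [3, 4]], ctx=mx.gpu(0))
+Y_gpu = mx.nd.array([[5, 6], [7, 8]], ctx=mx.gpu(0))
+result_gpu = X_gpu + Y_gpu  # executed on the GPU
+```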
+
+
+### Symbolic Programming in _MXNet_
+
+In addition to providing fast math operations through NDArray, _MXNet_ provides an interface for defining operations abstractly via a computation graph.
+With `mxnet.symbol`, we define operations abstractly in terms of place holders. For example, in the following code `a` and `b` stand in for real values that will be supplied at run time.
+When we call `c = a+b`, no numerical computation is performed. This operation simply builds a graph that defines the relationship between `a`, `b` and `c`. In order to perform a real calculation, we need to bind `c` to real values.
+
+```python
+a = mx.sym.Variable('a')
+b = mx.sym.Variable('b')
+c = a + b
+executor = c.bind(mx.cpu(), {'a': X, 'b': Y})
+result = executor.forward()
+```
+
+Symbolic computation is useful for several reasons. First, because we define a full computation graph before executing it, _MXNet_ can perform sophisticated optimizations to eliminate unnecessary or repeated work. This tends to give better performance than imperative programming. Second, because we store the relationships between different variables in the computation graph, _MXNet_ can then perform efficient auto-differentiation.
+
+**However**, symbolic programming is error-prone and slow to iterate with, as the whole graph needs to be constructed before it can be executed.
+
+### Gluon for bridging the gap between the two
+
+[MXNet Gluon]({{'/api/python'|relative_url}}) aims to bridge the gap between the imperative nature of MXNet and its symbolic capabilities and keep the advantages of both through [hybridization](http://d2l.ai/chapter_computational-performance/hybridize.html).
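+
+As a rough sketch of what hybridization looks like in practice (the layer sizes and input shape below are arbitrary):
+
+```python
+from mxnet import nd
+from mxnet.gluon import nn
+
+net = nn.HybridSequential()
+net.add(nn.Dense(128, activation='relu'),
+        nn.Dense(10))
+net.initialize()
+
+net.hybridize()                       # compile the imperative block into a symbolic graph
+x = nd.random.uniform(shape=(1, 20))
+y = net(x)                            # the first call builds the graph; later calls reuse it
+```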
+
+## Conclusions
+Given its combination of high performance, clean code, access to a high-level API, and low-level control, _MXNet_ stands out as a unique choice among deep learning frameworks.
diff --git a/docs/static_site/src/pages/api/java/docs/tutorials/index.md b/docs/static_site/src/pages/api/java/docs/tutorials/index.md
new file mode 100644
index 000000000000..00806e7911c9
--- /dev/null
+++ b/docs/static_site/src/pages/api/java/docs/tutorials/index.md
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_landing_tutorials
+title: Java Tutorials
+permalink: /api/java/docs/tutorials
+tag: java
+---
diff --git a/docs/static_site/src/pages/api/java/docs/tutorials/mxnet_java_on_intellij.md b/docs/static_site/src/pages/api/java/docs/tutorials/mxnet_java_on_intellij.md
new file mode 100644
index 000000000000..4bfc8ab419bf
--- /dev/null
+++ b/docs/static_site/src/pages/api/java/docs/tutorials/mxnet_java_on_intellij.md
@@ -0,0 +1,188 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Java with IntelliJ
+permalink: /api/java/docs/tutorials/mxnet_java_on_intellij
+is_tutorial: true
+tag: java
+---
+
+
+# Run MXNet Java Examples Using the IntelliJ IDE (macOS)
+
+This tutorial guides you through setting up a simple Java project in IntelliJ IDE on macOS and demonstrates usage of the MXNet Java APIs.
+
+## Prerequisites
+To use this tutorial you need the following prerequisites:
+
+- [Java 8 JDK](http://www.oracle.com/technetwork/java/javase/downloads/index.html)
+- [Maven](https://maven.apache.org/install.html)
+- [OpenCV](https://opencv.org/)
+- [IntelliJ IDEA](https://www.jetbrains.com/idea/) (One can download the community edition from [here](https://www.jetbrains.com/idea/download))
+
+### macOS Prerequisites
+
+Run the following commands to install the prerequisites on macOS.
+```
+/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
+brew update
+brew tap caskroom/versions
+brew cask install java8
+brew install maven
+brew install opencv
+```
+
+### Ubuntu Prerequisites
+
+Run the following commands to install the prerequisites on Ubuntu.
+
+```
+sudo apt-get install openjdk-8-jdk maven
+```
+
+
+## Set Up Your Project
+
+**Step 1.** Install and setup [IntelliJ IDEA](https://www.jetbrains.com/idea/)
+
+**Step 2.** Create a new Project:
+
+![intellij welcome](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/scala/intellij-welcome.png)
+
+From the IntelliJ welcome screen, select "Create New Project".
+
+Choose the Maven project type.
+
+Select the checkbox for `Create from archetype`, then choose `org.apache.maven.archetypes:maven-archetype-quickstart` from the list below. More on this can be found on a Maven tutorial : [Maven in 5 Minutes](https://maven.apache.org/guides/getting-started/maven-in-five-minutes.html).
+
+![maven project type - archetype](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/java/project-archetype.png)
+
+Click `Next`.
+
+![project metadata](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/java/intellij-project-metadata.png)
+
+Set the project's metadata. For this tutorial, use the following:
+
+**GroupId**
+```
+mxnet
+```
+**ArtifactId**
+```
+javaMXNet
+```
+**Version**
+```
+1.0-SNAPSHOT
+```
+
+![project properties](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/java/intellij-project-properties.png)
+
+Review the project's properties. The settings can be left at their defaults.
+
+![project location](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/java/intellij-project-location.png)
+
+Set the project's location. The rest of the settings can be left at their defaults.
+
+![project 1](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/java/intellij-project-pom.png)
+
+After clicking Finish, you will be presented with the project's first view.
+The project's `pom.xml` will be open for editing.
+
+**Step 3.** The Java packages are currently available on Maven. Add the following under the `dependencies` tag:
+
+```xml
+<dependency>
+  <groupId>org.apache.mxnet</groupId>
+  <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
+  <version>1.4.0</version>
+</dependency>
+```
+The official Java Packages have been released as part of MXNet 1.4 and are available on the [MXNet Maven package repository](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.mxnet%22).
+
+Note:
+- Change `osx-x86_64` to `linux-x86_64` if your platform is Linux.
+- Change `cpu` to `gpu` if you have a GPU-backed machine and want to use the GPU.
+
+
+**Step 4.** Import dependencies with Maven:
+
+ - Note the prompt in the lower right corner that states "Maven projects need to be imported". If this is not visible, click on the little green balloon that appears in the lower right corner.
+
+![import_dependencies](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/java/project-import-changes.png)
+
+Click "Import Changes" in this prompt.
+
+**Step 5.** Build the project:
+- To build the project, from the menu choose Build, and then choose Build Project.
+
+**Step 6.** Navigate to the App.java class in the project and paste in the code from the `main` method of HelloWorld.java from the [Java Demo project](https://github.com/apache/incubator-mxnet/blob/java-api/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/HelloWorld.java) in the MXNet repository, overwriting the original hello world code.
+You can also grab the entire [Java Demo project](https://github.com/apache/incubator-mxnet/tree/java-api/scala-package/mxnet-demo/java-demo) and run it by following the instructions on the [README](https://github.com/apache/incubator-mxnet/blob/java-api/scala-package/mxnet-demo/java-demo/README.md).
+
+**Step 7.** Now run the App.java.
+
+The result should be something similar to this:
+
+```
+Hello World!
+(1,2)
+Process finished with exit code 0
+```
+
+### Troubleshooting
+
+If you get an error, check the dependencies at the beginning of this tutorial. For example, you might see the following in the middle of the error messages, where `x.x` would be the version it's looking for.
+
+```
+...
+Library not loaded: /usr/local/opt/opencv/lib/libopencv_calib3d.x.x.dylib
+...
+```
+
+This can be resolved by installing OpenCV.
+
+### Command Line Build Option
+
+- You can also compile the project by using the following command at the command line. Change directories to this project's root folder, then run the following:
+
+```bash
+mvn clean install dependency:copy-dependencies
+```
+If the command succeeds, you should see a lot of info and some warning messages, followed by:
+
+```bash
+[INFO] ------------------------------------------------------------------------
+[INFO] BUILD SUCCESS
+[INFO] ------------------------------------------------------------------------
+[INFO] Total time: 3.475 s
+[INFO] Finished at: 2018-11-08T05:06:31-08:00
+[INFO] ------------------------------------------------------------------------
+```
+The build generates a new jar file in the `target` folder called `javaMXNet-1.0-SNAPSHOT.jar`.
+
+To run App.java, use the following command from the project's root folder. You should see the same output as when the project was run from IntelliJ.
+```bash
+java -cp "target/javaMXNet-1.0-SNAPSHOT.jar:target/dependency/*" mxnet.App
+```
+
+## Next Steps
+For more information about MXNet Java resources, see the following:
+
+* [Java Inference API](/api/java/index.html)
+* [Java Inference Examples](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer)
+* [MXNet Tutorials Index](http://mxnet.io/tutorials/index.html)
diff --git a/docs/static_site/src/pages/api/java/docs/tutorials/ssd_inference.md b/docs/static_site/src/pages/api/java/docs/tutorials/ssd_inference.md
new file mode 100644
index 000000000000..3f30402628c0
--- /dev/null
+++ b/docs/static_site/src/pages/api/java/docs/tutorials/ssd_inference.md
@@ -0,0 +1,210 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: SSD Inference
+permalink: /api/java/docs/tutorials/ssd_inference
+is_tutorial: true
+tag: java
+---
+
+# Multi Object Detection using pre-trained SSD Model via Java Inference APIs
+
+This tutorial shows how to use MXNet Java Inference APIs to run inference on a pre-trained Single Shot Detector (SSD) Model.
+
+The SSD model is trained on the Pascal VOC 2012 dataset. The network is an SSD model built with ResNet-50 as the base network to extract image features. The model is trained to detect the following entities (classes): ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']. For more details about the model, you can refer to the [MXNet SSD example](https://github.com/apache/incubator-mxnet/tree/master/example/ssd).
+
+## Prerequisites
+
+To complete this tutorial, you need the following:
+* [MXNet Java Setup on IntelliJ IDEA](mxnet_java_on_intellij.md) (Optional)
+* [wget](https://www.gnu.org/software/wget/) to download the model artifacts
+* SSD Model artifacts
+ * Use the following script to get the SSD Model files :
+```bash
+data_path=/tmp/resnet50_ssd
+mkdir -p "$data_path"
+wget https://s3.amazonaws.com/model-server/models/resnet50_ssd/resnet50_ssd_model-symbol.json -P $data_path
+wget https://s3.amazonaws.com/model-server/models/resnet50_ssd/resnet50_ssd_model-0000.params -P $data_path
+wget https://s3.amazonaws.com/model-server/models/resnet50_ssd/synset.txt -P $data_path
+```
+* Test images : A few sample images to run inference on.
+ * Use the following script to download sample images :
+```bash
+image_path=/tmp/resnet50_ssd/images
+mkdir -p "$image_path"
+cd $image_path
+wget https://cloud.githubusercontent.com/assets/3307514/20012567/cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg -O dog.jpg
+wget https://cloud.githubusercontent.com/assets/3307514/20012563/cbb41382-a27d-11e6-92a9-18dab4fd1ad3.jpg -O person.jpg
+```
+
+Alternatively, you can get all of the SSD model artifacts and images in a single step by running the [get_ssd_data.sh script](https://github.com/apache/incubator-mxnet/blob/master/scala-package/examples/scripts/infer/objectdetector/get_ssd_data.sh) from the MXNet repository.
+
+## Time to code!
+1\. Following the [MXNet Java Setup on IntelliJ IDEA](mxnet_java_on_intellij.md) tutorial, in the same project `JavaMXNet`, create a new empty class called `ObjectDetectionTutorial.java`.
+
+2\. In the `main` function of `ObjectDetectionTutorial.java`, define the downloaded model path and the image data path. These are the same paths where we downloaded the model artifacts and images in the previous step.
+
+```java
+String modelPathPrefix = "/tmp/resnet50_ssd/resnet50_ssd_model";
+String inputImagePath = "/tmp/resnet50_ssd/images/dog.jpg";
+```
+
+3\. We can run the inference code in this example on either CPU or GPU (if you have a GPU backed machine) by choosing the appropriate context.
+
+```java
+
+List<Context> context = getContext();
+...
+
+private static List<Context> getContext() {
+    List<Context> ctx = new ArrayList<>();
+    ctx.add(Context.cpu()); // Choosing CPU Context here
+
+    return ctx;
+}
+```
+
+4\. To provide an input to the model, define the input shape to the model and the Input Data Descriptor (DataDesc) as shown below:
+
+```java
+Shape inputShape = new Shape(new int[] {1, 3, 512, 512});
+List<DataDesc> inputDescriptors = new ArrayList<>();
+inputDescriptors.add(new DataDesc("data", inputShape, DType.Float32(), "NCHW"));
+```
+
+The input shape can be interpreted as follows: the input has a batch size of 1, with 3 RGB channels in the image, and the height and width of the image are 512 each.
+
+5\. To run an actual inference on the given image, add the following lines to the `ObjectDetectionTutorial.java` class:
+
+```java
+BufferedImage img = ObjectDetector.loadImageFromFile(inputImagePath);
+ObjectDetector objDet = new ObjectDetector(modelPathPrefix, inputDescriptors, context, 0);
+List<List<ObjectDetectorOutput>> output = objDet.imageObjectDetect(img, 3); // Top 3 objects detected will be returned
+```
+
+6\. Let's piece all of the above steps together by showing the final contents of `ObjectDetectionTutorial.java`:
+
+```java
+package mxnet;
+
+import org.apache.mxnet.infer.javaapi.ObjectDetector;
+import org.apache.mxnet.infer.javaapi.ObjectDetectorOutput;
+import org.apache.mxnet.javaapi.Context;
+import org.apache.mxnet.javaapi.DType;
+import org.apache.mxnet.javaapi.DataDesc;
+import org.apache.mxnet.javaapi.Shape;
+
+import java.awt.image.BufferedImage;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class ObjectDetectionTutorial {
+
+ public static void main(String[] args) {
+
+ String modelPathPrefix = "/tmp/resnet50_ssd/resnet50_ssd_model";
+
+ String inputImagePath = "/tmp/resnet50_ssd/images/dog.jpg";
+
+ List<Context> context = getContext();
+
+ Shape inputShape = new Shape(new int[] {1, 3, 512, 512});
+
+ List<DataDesc> inputDescriptors = new ArrayList<>();
+ inputDescriptors.add(new DataDesc("data", inputShape, DType.Float32(), "NCHW"));
+
+ BufferedImage img = ObjectDetector.loadImageFromFile(inputImagePath);
+ ObjectDetector objDet = new ObjectDetector(modelPathPrefix, inputDescriptors, context, 0);
+ List<List<ObjectDetectorOutput>> output = objDet.imageObjectDetect(img, 3);
+
+ printOutput(output, inputShape);
+ }
+
+
+ private static List<Context> getContext() {
+ List<Context> ctx = new ArrayList<>();
+ ctx.add(Context.cpu());
+
+ return ctx;
+ }
+
+ private static void printOutput(List<List<ObjectDetectorOutput>> output, Shape inputShape) {
+
+ StringBuilder outputStr = new StringBuilder();
+
+ int width = inputShape.get(3);
+ int height = inputShape.get(2);
+
+ for (List<ObjectDetectorOutput> ele : output) {
+ for (ObjectDetectorOutput i : ele) {
+ outputStr.append("Class: " + i.getClassName() + "\n");
+ outputStr.append("Probabilties: " + i.getProbability() + "\n");
+
+ List<Float> coord = Arrays.asList(i.getXMin() * width,
+ i.getXMax() * height, i.getYMin() * width, i.getYMax() * height);
+ StringBuilder sb = new StringBuilder();
+ for (float c: coord) {
+ sb.append(", ").append(c);
+ }
+ outputStr.append("Coord:" + sb.substring(2)+ "\n");
+ }
+ }
+ System.out.println(outputStr);
+
+ }
+}
+```
+
+7\. To compile and run this code, change directories to this project's root folder, then run the following:
+```bash
+mvn clean install dependency:copy-dependencies
+```
+
+The build generates a new jar file in the `target` folder called `javaMXNet-1.0-SNAPSHOT.jar`.
+
+To run ObjectDetectionTutorial.java, use the following command from the project's root folder.
+```bash
+java -cp "target/javaMXNet-1.0-SNAPSHOT.jar:target/dependency/*" mxnet.ObjectDetectionTutorial
+```
+
+You should see output similar to the following for the dog image that we used:
+```bash
+Class: car
+Probabilities: 0.99847263
+Coord:312.21335, 72.02908, 456.01443, 150.66176
+Class: bicycle
+Probabilities: 0.9047381
+Coord:155.9581, 149.96365, 383.83694, 418.94516
+Class: dog
+Probabilities: 0.82268167
+Coord:83.82356, 179.14001, 206.63783, 476.78754
+```
+
+![dog_1](https://cloud.githubusercontent.com/assets/3307514/20012567/cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg)
+
+The results returned by the inference call translate into the regions in the image where the model detected objects.
+
+![dog_2](https://cloud.githubusercontent.com/assets/3307514/19171063/91ec2792-8be0-11e6-983c-773bd6868fa8.png)
+
+## Next Steps
+For more information about MXNet Java resources, see the following:
+
+* [Java Inference API](/api/java/index.md)
+* [Java Inference Examples](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer)
+* [MXNet Tutorials Index](/tutorials/index.md)
diff --git a/docs/static_site/src/pages/api/java/index.md b/docs/static_site/src/pages/api/java/index.md
new file mode 100644
index 000000000000..56d779b30703
--- /dev/null
+++ b/docs/static_site/src/pages/api/java/index.md
@@ -0,0 +1,41 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Java Guide
+action: Get Started
+action_url: /get_started
+permalink: /api/java
+tag: java
+---
+
+
+# MXNet - Java Inference API
+
+MXNet supports Java for performing inference on a trained model. The MXNet Java Inference API is an extension of the [Scala Infer API](../../api/scala/infer.html) which provides model loading and inference functionality.
+
+The goal of the MXNet Java package is to provide an efficient and easy to use inference API.
+The MXNet Java package makes it easy to quickly deploy an existing model into a production level Java ecosystem.
+
+## Installation
+* [MXNet Java Inference API setup instructions](../../install/java_setup.md)
+
+## Tutorials
+See the [tutorial page](../../tutorials/index.html#java-tutorials) for detailed tutorials and examples using the Java Inference API.
+
+## Java Inference API Reference
+The [Java Infer API javadocs](docs/index.html#org.apache.mxnet.infer.package) provides detailed API information.
diff --git a/docs/static_site/src/pages/api/julia/index.md b/docs/static_site/src/pages/api/julia/index.md
new file mode 100644
index 000000000000..10c08da3ec90
--- /dev/null
+++ b/docs/static_site/src/pages/api/julia/index.md
@@ -0,0 +1,39 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Julia Guide
+action: Get Started
+action_url: /get_started
+permalink: /api/julia
+tag: julia
+---
+
+
+# MXNet - Julia API
+
+
+MXNet supports the Julia programming language. The MXNet Julia package brings flexible and efficient GPU
+computing and state-of-the-art deep learning to Julia.
+
+- It enables you to write seamless tensor/matrix computation with multiple GPUs in Julia.
+- It also enables you to construct and customize state-of-the-art deep learning models in Julia,
+ and apply them to tasks such as image classification and data science challenges.
+
+## Installation
+* [Ubuntu installation guide]({{'/get_started' | relative_url}})
+* Mac / Windows guides are not available (contributions welcome!)
diff --git a/docs/static_site/src/pages/api/perl/docs/tutorials/index.md b/docs/static_site/src/pages/api/perl/docs/tutorials/index.md
new file mode 100644
index 000000000000..e98a3e4b4613
--- /dev/null
+++ b/docs/static_site/src/pages/api/perl/docs/tutorials/index.md
@@ -0,0 +1,23 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_landing_tutorials
+title: Perl Tutorials
+action: Get Started
+tag: perl
+permalink: /api/perl/docs/tutorials
+---
diff --git a/docs/static_site/src/pages/api/perl/docs/tutorials/io.md b/docs/static_site/src/pages/api/perl/docs/tutorials/io.md
new file mode 100644
index 000000000000..b9ce200c5c2e
--- /dev/null
+++ b/docs/static_site/src/pages/api/perl/docs/tutorials/io.md
@@ -0,0 +1,136 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Data Loading API
+is_tutorial: true
+tag: perl
+permalink: /api/perl/docs/tutorials/io
+---
+
+# Data Loading API
+
+## Overview
+
+A data iterator reads data batch by batch.
+
+```perl
+pdl> $data = mx->nd->ones([100,10])
+pdl> $nd_iter = mx->io->NDArrayIter($data, batch_size=>25)
+pdl> for my $batch (@{ $nd_iter }) { print $batch->data->[0],"\n" }
+<AI::MXNet::NDArray 25x10 @cpu(0)>
+<AI::MXNet::NDArray 25x10 @cpu(0)>
+<AI::MXNet::NDArray 25x10 @cpu(0)>
+<AI::MXNet::NDArray 25x10 @cpu(0)>
+```
+
+If `$nd_iter->reset()` is called, the data is read again from the beginning.
+
+In addition, an iterator provides information about the batch, including the
+shapes and name.
+
+```perl
+pdl> $nd_iter = mx->io->NDArrayIter(data=>{data => mx->nd->ones([100,10])}, label=>{softmax_label => mx->nd->ones([100])}, batch_size=>25)
+pdl> print($nd_iter->provide_data->[0],"\n")
+DataDesc[data,25x10,float32,NCHW]
+pdl> print($nd_iter->provide_label->[0],"\n")
+DataDesc[softmax_label,25,float32,NCHW]
+```
+
+So this iterator can be used to train a symbol whose input data variable has
+name `data` and input label variable has name `softmax_label`.
+
+
+```perl
+pdl> $data = mx->sym->Variable('data')
+pdl> $label = mx->sym->Variable('softmax_label')
+pdl> $fullc = mx->sym->FullyConnected(data=>$data, num_hidden=>1)
+pdl> $loss = mx->sym->SoftmaxOutput(data=>$fullc, label=>$label)
+pdl> $mod = mx->mod->Module($loss)
+pdl> print($mod->data_names->[0])
+data
+pdl> print($mod->label_names->[0])
+softmax_label
+pdl> $mod->bind(data_shapes=>$nd_iter->provide_data, label_shapes=>$nd_iter->provide_label)
+```
+
+Then we can call `$mod->fit($nd_iter, num_epoch=>2)` to train the model for 2 epochs.
+
+## Predefined Data iterators
+
+```perl
+mx->io->NDArrayIter
+mx->io->CSVIter
+mx->io->ImageRecordIter
+mx->io->ImageRecordInt8Iter
+mx->io->ImageRecordUInt8Iter
+mx->io->MNISTIter
+mx->recordio->MXRecordIO
+mx->recordio->MXIndexedRecordIO
+mx->image->ImageIter
+```
+
+## Helper classes and functions
+
+Data structures and other iterators provided in the `AI::MXNet::IO` package.
+
+```perl
+AI::MXNet::DataDesc
+AI::MXNet::DataBatch
+AI::MXNet::DataIter
+AI::MXNet::ResizeIter
+AI::MXNet::MXDataIter
+```
+
+A list of image modification functions provided by `AI::MXNet::Image`.
+
+```perl
+mx->image->imdecode
+mx->image->scale_down
+mx->image->resize_short
+mx->image->fixed_crop
+mx->image->random_crop
+mx->image->center_crop
+mx->image->color_normalize
+mx->image->random_size_crop
+mx->image->ResizeAug
+mx->image->RandomCropAug
+mx->image->RandomSizedCropAug
+mx->image->CenterCropAug
+mx->image->RandomOrderAug
+mx->image->ColorJitterAug
+mx->image->LightingAug
+mx->image->ColorNormalizeAug
+mx->image->HorizontalFlipAug
+mx->image->CastAug
+mx->image->CreateAugmenter
+```
+
+Functions to read and write RecordIO files.
+
+```perl
+mx->recordio->pack
+mx->recordio->unpack
+mx->recordio->unpack_img
+```
+
+## Develop a new iterator
+
+Writing a new data iterator in Perl is straightforward. Most MXNet
+training/inference programs accept an iterator object that exposes the ``provide_data``
+and ``provide_label`` properties.
+Please refer to the AI-MXNet/examples directory for examples of custom iterators.
diff --git a/docs/static_site/src/pages/api/perl/docs/tutorials/kvstore.md b/docs/static_site/src/pages/api/perl/docs/tutorials/kvstore.md
new file mode 100644
index 000000000000..ea0343f7162f
--- /dev/null
+++ b/docs/static_site/src/pages/api/perl/docs/tutorials/kvstore.md
@@ -0,0 +1,134 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: KVStore API
+is_tutorial: true
+tag: perl
+permalink: /api/perl/docs/tutorials/kvstore
+---
+
+
+# KVStore API
+
+Topics:
+* [Basic Push and Pull](#basic-push-and-pull)
+* [List Key-Value Pairs](#list-key-value-pairs)
+
+## Basic Push and Pull
+
+Provides basic push and pull operations over multiple devices (GPUs) on a single machine.
+
+### Initialization
+
+Let's consider a simple example. It initializes
+an (int, NDArray) pair in the store, and then pulls the value out.
+
+```perl
+pdl> $kv = mx->kv->create('local')
+pdl> $shape = [2,3]
+pdl> $kv->init(3, mx->nd->ones($shape)*2)
+pdl> $a = mx->nd->zeros($shape)
+pdl> $kv->pull(3, out => $a)
+pdl> print $a->aspdl
+[
+ [2 2 2]
+ [2 2 2]
+]
+```
+
+### Push, Aggregation, and Updater
+
+For any key that's been initialized, you can push a new value with the same shape to the key, as follows:
+
+```perl
+pdl> $kv->push(3, mx->nd->ones($shape)*8)
+pdl> $a = mx->nd->zeros($shape)
+pdl> $kv->pull(3, out => $a)
+pdl> print $a->aspdl
+[
+ [8 8 8]
+ [8 8 8]
+]
+```
+
+The data that you want to push can be stored on any device. Furthermore, you can push multiple
+values into the same key, where KVStore first sums all of these
+values, and then you pull the aggregated value, as follows:
+
+```perl
+pdl> $kv->push(3, [mx->nd->ones($shape, ctx=>mx->cpu(0)), mx->nd->ones($shape, ctx=>mx->cpu(1))])
+pdl> $kv->pull(3, out => $a)
+pdl> print $a->aspdl
+[
+ [2 2 2]
+ [2 2 2]
+]
+```
+
+For each push command, KVStore applies the pushed value to the value stored by an
+`updater`. The default updater is `ASSIGN`. You can replace the default to
+control how data is merged.
+
+```perl
+pdl> $updater = sub { my ($key, $input, $stored) = @_; print "update on key: $key\n"; $stored += $input * 3; }
+pdl> $kv->_set_updater($updater)
+pdl> $kv->push(3, [mx->nd->ones($shape, ctx=>mx->cpu(0)), mx->nd->ones($shape, ctx=>mx->cpu(1))])
+update on key: 3
+pdl> $kv->pull(3, out => $a)
+pdl> print $a->aspdl
+[
+ [8 8 8]
+ [8 8 8]
+]
+```
+
+### Pull
+
+You've already seen how to pull a single key-value pair. Similar to the way that you use the push command, you can
+pull the value into several devices with a single call.
+
+```perl
+pdl> $b = [mx->nd->zeros($shape, ctx=>mx->cpu(0)), mx->nd->zeros($shape, ctx=>mx->cpu(1))]
+pdl> $kv->pull(3, out => $b)
+pdl> print $b->[1]->aspdl
+[
+ [8 8 8]
+ [8 8 8]
+]
+```
+
+## List Key-Value Pairs
+
+All of the operations that we've discussed so far are performed on a single key. KVStore also provides
+the interface for generating a list of key-value pairs. For a single device, use the following:
+
+```perl
+pdl> $keys = [5,7,9]
+pdl> $kv->init($keys, [map { mx->nd->ones($shape) } 0..@$keys-1])
+pdl> $kv->push($keys, [map { mx->nd->ones($shape) } 0..@$keys-1])
+update on key: 5
+update on key: 7
+update on key: 9
+pdl> $b = [map { mx->nd->ones($shape) } 0..@$keys-1]
+pdl> $kv->pull($keys, out => $b)
+pdl> print $b->[1]->aspdl
+[
+ [4 4 4]
+ [4 4 4]
+]
+```
diff --git a/docs/static_site/src/pages/api/perl/docs/tutorials/module.md b/docs/static_site/src/pages/api/perl/docs/tutorials/module.md
new file mode 100644
index 000000000000..deb25ad92348
--- /dev/null
+++ b/docs/static_site/src/pages/api/perl/docs/tutorials/module.md
@@ -0,0 +1,70 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Module API
+is_tutorial: true
+tag: perl
+permalink: /api/perl/docs/tutorials/module
+---
+
+
+# Module API
+
+## Overview
+
+The module API, defined in the `module` (or simply `mod`) package (`AI::MXNet::Module` under the hood), provides an
+intermediate and high-level interface for performing computation with an
+`AI::MXNet::Symbol` (or simply `mx->sym`). One can roughly think of a module as a machine that can execute a
+program defined by a `Symbol`.
+
+The class `AI::MXNet::Module` is a commonly used module, which accepts an `AI::MXNet::Symbol` as
+the input:
+
+```perl
+pdl> $data = mx->symbol->Variable('data')
+pdl> $fc1 = mx->symbol->FullyConnected($data, name=>'fc1', num_hidden=>128)
+pdl> $act1 = mx->symbol->Activation($fc1, name=>'relu1', act_type=>"relu")
+pdl> $fc2 = mx->symbol->FullyConnected($act1, name=>'fc2', num_hidden=>10)
+pdl> $out = mx->symbol->SoftmaxOutput($fc2, name => 'softmax')
+pdl> $mod = mx->mod->Module($out) # create a module from the given Symbol
+```
+
+Assume there is a valid MXNet data iterator `$data_iter`. We can initialize the
+module:
+
+```perl
+pdl> $mod->bind(data_shapes=>$data_iter->provide_data,
+ label_shapes=>$data_iter->provide_label) # allocate memory for the given input shapes
+pdl> $mod->init_params() # initialize parameters with the default random initializer
+```
+
+Now the module is ready to compute. We can call the high-level API to train and
+predict:
+
+```perl
+pdl> $mod->fit($data_iter, num_epoch=>10, ...) # train
+pdl> $mod->predict($new_data) # predict on new data
+```
+
+or use the intermediate-level APIs to perform step-by-step computations:
+
+```perl
+pdl> $mod->forward($data_batch, is_train => 1) # forward on the provided data batch
+pdl> $mod->backward() # backward to calculate the gradients
+pdl> $mod->update() # update parameters using the default optimizer
+```
diff --git a/docs/static_site/src/pages/api/perl/docs/tutorials/ndarray.md b/docs/static_site/src/pages/api/perl/docs/tutorials/ndarray.md
new file mode 100644
index 000000000000..18ae72300c07
--- /dev/null
+++ b/docs/static_site/src/pages/api/perl/docs/tutorials/ndarray.md
@@ -0,0 +1,66 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: NDArray API
+is_tutorial: true
+tag: perl
+permalink: /api/perl/docs/tutorials/ndarray
+---
+
+# NDArray API
+
+## Overview
+
+A `AI::MXNet::NDArray` is a multidimensional container of items of the same type and
+size. Various methods for data manipulation and computation are provided.
+
+```perl
+pdl> $x = mx->nd->array([[1, 2, 3], [4, 5, 6]])
+pdl> print $x->aspdl->shape
+[3, 2]
+pdl> $y = $x + mx->nd->ones($x->shape)*3
+pdl> print $y->aspdl
+[
+ [4 5 6]
+ [7 8 9]
+]
+pdl> $z = $y->as_in_context(mx->gpu(0))
+pdl> print $z,"\n"
+<AI::MXNet::NDArray 2x3 @gpu(0)>
+```
+
+A detailed tutorial is available at
+[http://mxnet.io/tutorials/basic/ndarray.html](http://mxnet.io/tutorials/basic/ndarray.html).
+
+Note: AI::MXNet::NDArray is similar to numpy.ndarray in some aspects, but the differences are not negligible. For example:
+
+- AI::MXNet::NDArray->T does a real data transpose and returns a new, copied array, instead
+ of returning a view of the input array.
+- AI::MXNet::NDArray->dot performs a dot product between the last axis of the first input array
+ and the first axis of the second input, while numpy.dot uses the second to
+ last axis of the input array.
+
+In addition, NDArray supports GPU computation and various neural
+network layers.
+
+AI::MXNet::NDArray also provides almost the same routines as AI::MXNet::Symbol. Most
+routines shared between these two packages are backed by the same C++ operator source
+code. But AI::MXNet::NDArray differs from AI::MXNet::Symbol in the following aspect:
+
+- AI::MXNet::NDArray adopts imperative programming, namely statements are executed
+ step-by-step so that the results can be obtained immediately.
diff --git a/docs/static_site/src/pages/api/perl/docs/tutorials/symbol.md b/docs/static_site/src/pages/api/perl/docs/tutorials/symbol.md
new file mode 100644
index 000000000000..94d416a2d0b2
--- /dev/null
+++ b/docs/static_site/src/pages/api/perl/docs/tutorials/symbol.md
@@ -0,0 +1,151 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Symbol API
+is_tutorial: true
+tag: perl
+permalink: /api/perl/docs/tutorials/symbol
+---
+
+# MXNet Perl Symbolic API
+
+Topics:
+
+* [How to Compose Symbols](#how-to-compose-symbols) introduces operator overloading of symbols.
+* [Symbol Attributes](#symbol-attributes) describes how to attach attributes to symbols.
+* [Serialization](#serialization) explains how to save and load symbols.
+* [Executing Symbols](#executing-symbols) explains how to evaluate the symbols with data.
+* [Multiple Outputs](#multiple-outputs) explains how to configure multiple outputs.
+
+## How to Compose Symbols
+
+The symbolic API provides a way to configure computation graphs.
+You can configure the graphs either at the level of neural network layer operations or as fine-grained operations.
+
+The following example configures a two-layer neural network.
+
+```perl
+pdl> use AI::MXNet qw(mx)
+pdl> $data = mx->symbol->Variable("data")
+pdl> $fc1 = mx->symbol->FullyConnected(data => $data, name => "fc1", num_hidden => 128)
+pdl> $act1 = mx->symbol->Activation(data => $fc1, name => "relu1", act_type => "relu")
+pdl> $fc2 = mx->symbol->FullyConnected(data => $act1, name => "fc2", num_hidden => 64)
+pdl> $net = mx->symbol->SoftmaxOutput(data => $fc2, name => "out")
+```
+
+The basic arithmetic operators (plus, minus, div, multiplication) are overloaded for
+*element-wise operations* of symbols.
+
+The following example creates a computation graph that adds two inputs together.
+
+```perl
+pdl> use AI::MXNet qw(mx)
+pdl> $a = mx->symbol->Variable("a")
+pdl> $b = mx->symbol->Variable("b")
+pdl> $c = $a + $b
+```
+
+## Symbol Attributes
+
+You can add an attribute to a symbol by providing an attribute hash when you create a symbol.
+
+```perl
+$data = mx->symbol->Variable("data", attr => { mood => "angry" })
+$op = mx->symbol->Convolution(data => $data, kernel => [1, 1], num_filter => 1, attr => { mood => "so so" })
+```
+
+For proper communication with the C++ backend, both the key and values of the attribute dictionary should be strings. To retrieve the attributes, use `->attr($key)`:
+
+```perl
+$data->attr("mood")
+```
+
+To attach attributes, you can use ```AI::MXNet::AttrScope```. ```AI::MXNet::AttrScope``` automatically adds
+the specified attributes to all of the symbols created within that scope.
+The user can also inherit this object to change naming behavior. For example:
+
+```perl
+use AI::MXNet qw(mx);
+use Test::More tests => 3;
+my ($data, $gdata);
+{
+ local($mx::AttrScope) = mx->AttrScope(group=>4, data=>'great');
+ $data = mx->sym->Variable("data", attr => { dtype => "data", group => "1" });
+ $gdata = mx->sym->Variable("data2");
+}
+ok($gdata->attr("group") == 4);
+ok($data->attr("group") == 1);
+
+my $exceedScopeData = mx->sym->Variable("data3");
+ok((not defined $exceedScopeData->attr("group")), "No group attr in global attr scope");
+```
+
+## Serialization
+
+There are two ways to save and load the symbols. You can use the `mx->symbol->save` and `mx->symbol->load` functions to serialize the ```AI::MXNet::Symbol``` objects.
+The advantage of using the `save` and `load` functions is that they are language agnostic and cloud friendly.
+The symbol is saved in JSON format. You can also get a JSON string directly using `$symbol->tojson`.
+
+The following example shows how to save a symbol to an S3 bucket, load it back, and compare two symbols using a JSON string.
+
+```perl
+pdl> use AI::MXNet qw(mx)
+pdl> $a = mx->sym->Variable("a")
+pdl> $b = mx->sym->Variable("b")
+pdl> $c = $a + $b
+pdl> $c->save("s3://my-bucket/symbol-c.json")
+pdl> $c2 = $c->load("s3://my-bucket/symbol-c.json")
+pdl> ok($c->tojson eq $c2->tojson)
+ok 1
+```
+
+## Executing Symbols
+
+After you have assembled a set of symbols into a computation graph, the MXNet engine can evaluate them.
+If you are training a neural network, this is typically
+handled by the high-level [AI::MXNet::Module package](module.md) and the `fit()` function.
+
+For neural networks used in "feed-forward", "prediction", or "inference" mode (all terms for the same
+thing: running a trained network), the input arguments are the
+input data, and the weights of the neural network that were learned during training.
+
+To manually execute a set of symbols, you need to create an `AI::MXNet::Executor` object,
+which is typically constructed by calling the `simple_bind()` method on an AI::MXNet::Symbol.
+
+## Multiple Outputs
+
+To group the symbols together, use the `AI::MXNet::Symbol->Group` function.
+
+```perl
+pdl> use AI::MXNet qw(mx)
+pdl> use Data::Dumper
+pdl> $data = mx->sym->Variable("data")
+pdl> $fc1 = mx->sym->FullyConnected($data, name => "fc1", num_hidden => 128)
+pdl> $act1 = mx->sym->Activation($fc1, name => "relu1", act_type => "relu")
+pdl> $fc2 = mx->sym->FullyConnected($act1, name => "fc2", num_hidden => 64)
+pdl> $net = mx->sym->SoftmaxOutput($fc2, name => "softmax")
+pdl> $group = mx->sym->Group([$fc1, $net])
+pdl> print Dumper($group->list_outputs())
+$VAR1 = [
+ 'fc1_output',
+ 'softmax_output'
+];
+```
+
+After you get the ```Group```, you can bind on ```$group``` instead.
+The resulting executor will have two outputs, one for `fc1_output` and one for `softmax_output`.
diff --git a/docs/static_site/src/pages/api/perl/index.md b/docs/static_site/src/pages/api/perl/index.md
new file mode 100644
index 000000000000..79bc22dcb0c9
--- /dev/null
+++ b/docs/static_site/src/pages/api/perl/index.md
@@ -0,0 +1,91 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Perl Guide
+action: Get Started
+action_url: /get_started
+permalink: /api/perl
+tag: perl
+---
+
+# MXNet - Perl API
+
+MXNet supports the Perl programming language. The MXNet Perl package brings flexible and efficient GPU
+computing and state-of-the-art deep learning to Perl. It enables you to write seamless tensor/matrix computation with multiple GPUs in Perl.
+It also lets you construct and customize state-of-the-art deep learning models in Perl,
+ and apply them to tasks such as image classification and data science challenges.
+
+One important thing to internalize is that the Perl interface is written to be as close as possible to the Python API,
+so most, if not all, of Python's documentation and examples should just work in Perl after a few
+changes to make the code a bit more Perlish. In a nutshell, just add $ sigils and replace . = \n with -> => ; and in 99% of cases
+that's all that is needed.
+In addition please refer to [excellent metacpan doc interface](https://metacpan.org/release/AI-MXNet) and to very detailed
+[MXNet Python API Documentation]({{'/api/python' | relative_url}}).
+
+AI::MXNet supports the new imperative, PyTorch-like Gluon MXNet interface. Please get acquainted with this new interface
+at [Dive into Deep Learning](https://www.d2l.ai/).
+
+For specific Perl Gluon usage, please refer to the Perl examples and tests directories on GitHub; the Python and Perl usage
+are kept extremely close in order to make the Python Gluon docs and examples as easy to reuse as possible.
+
+AI::MXNet is seamlessly glued with [PDL](https://metacpan.org/release/PDL): the C++ level state can be easily initialized from PDL, and the results can be
+transferred to PDL objects so that you can use all the glory and power of PDL!
+
+Here is how you can perform tensor or matrix computation in Perl with AI::MXNet and PDL:
+
+```perl
+pdl> use AI::MXNet qw(mx); # creates 'mx' module on the fly with the interface close to the Python's API
+
+pdl> print $arr = mx->nd->ones([2, 3])
+<AI::MXNet::NDArray 2x3 @cpu(0)>
+
+pdl> print Data::Dumper::Dumper($arr->shape)
+$VAR1 = [
+ 2,
+ 3
+ ];
+
+pdl> print (($arr*2)->aspdl) ## converts AI::MXNet::NDArray object to PDL object
+
+[
+ [2 2 2]
+ [2 2 2]
+]
+
+pdl> print $arr = mx->nd->array([[1,2],[3,4]]) ## init the NDArray from Perl array ref given in PDL::pdl constructor format
+<AI::MXNet::NDArray 2x2 @cpu(0)>
+pdl> print $arr->aspdl
+
+[
+ [1 2]
+ [3 4]
+]
+
+## init the NDArray from PDL but be aware that PDL methods expect the dimensions order in column major format
+## AI::MXNet::NDArray is row major
+pdl> print mx->nd->array(sequence(2,3))->aspdl ## 3 rows, 2 columns
+
+[
+ [0 1]
+ [2 3]
+ [4 5]
+]
+```
+
+Export/import to/from sparse MXNet tensors are supported via [PDL::CCS](https://metacpan.org/release/PDL-CCS).
+Please check out the examples directory for the examples on how to use the sparse matrices.
diff --git a/docs/static_site/src/pages/api/python/index.md b/docs/static_site/src/pages/api/python/index.md
new file mode 100644
index 000000000000..bbd82e5c384f
--- /dev/null
+++ b/docs/static_site/src/pages/api/python/index.md
@@ -0,0 +1,44 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Python Guide
+action: Get Started
+action_url: /get_started
+permalink: /api/python
+tag: python
+---
+
+## MXNet - Python API
+
+MXNet provides a comprehensive and flexible Python API to serve a broad community of developers with different levels of experience and wide ranging requirements. In this section, we provide an in-depth discussion of the functionality provided by various MXNet Python packages.
+
+
+MXNet’s Python API has two primary high-level packages: the Gluon API and the Module API. We recommend that new users start with the Gluon API as it’s more flexible and easier to debug. Underlying these high-level packages are the core packages of NDArray and Symbol.
+
+
+NDArray works with arrays in an imperative fashion, i.e. you define how arrays will be transformed to get to an end result. Symbol works with arrays in a declarative fashion, i.e. you define the end result that is required (via a symbolic graph) and the MXNet engine will use various optimizations to determine the steps required to obtain this. With NDArray you have a great deal of flexibility when composing operations (as you can use Python control flow), and you can easily step through your code and inspect the values of arrays, which helps with debugging. Unfortunately, this comes at a performance cost when compared to Symbol, which can perform optimizations on the symbolic graph.
+
+
+The Module API is backed by Symbol, so, although it’s very performant, it’s also a little more restrictive. With the Gluon API, you can get the best of both worlds. You can develop and test your model imperatively using NDArray, and then switch to Symbol for faster model training and inference (if Symbol equivalents exist for your operations).
+
+
+Code examples are placed throughout the API documentation and these can be run after importing MXNet as follows:
+
+```python
+>>> import mxnet as mx
+```
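+
+For instance, a minimal sketch of an imperative NDArray computation (the values here are arbitrary, chosen only for illustration) looks like this:
+
+```python
+>>> x = mx.nd.ones((2, 3))
+>>> y = (x * 2 + 1).asnumpy()   # imperative computation, result copied to NumPy
+>>> y.shape
+(2, 3)
+```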
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/callback_function.md b/docs/static_site/src/pages/api/r/docs/tutorials/callback_function.md
new file mode 100644
index 000000000000..e124483a3c5f
--- /dev/null
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/callback_function.md
@@ -0,0 +1,278 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Callback Function
+is_tutorial: true
+tag: r
+permalink: /api/r/docs/tutorials/callback_function
+---
+
+Callback Function
+======================================
+
+This tutorial provides guidelines for using and writing callback functions,
+which can be very useful in model training.
+
+Model Training Example
+----------
+
+Let's begin with a small example. We can build and train a model with the following code:
+
+
+ ```r
+ library(mxnet)
+ data(BostonHousing, package="mlbench")
+ train.ind = seq(1, 506, 3)
+ train.x = data.matrix(BostonHousing[train.ind, -14])
+ train.y = BostonHousing[train.ind, 14]
+ test.x = data.matrix(BostonHousing[-train.ind, -14])
+ test.y = BostonHousing[-train.ind, 14]
+ data <- mx.symbol.Variable("data")
+ fc1 <- mx.symbol.FullyConnected(data, num_hidden=1)
+ lro <- mx.symbol.LinearRegressionOutput(fc1)
+ mx.set.seed(0)
+ model <- mx.model.FeedForward.create(
+ lro, X=train.x, y=train.y,
+ eval.data=list(data=test.x, label=test.y),
+ ctx=mx.cpu(), num.round=10, array.batch.size=20,
+ learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse)
+ ```
+
+ ```
+ ## Auto detect layout of input matrix, use row major..
+ ## Start training with 1 devices
+ ## [1] Train-rmse=16.063282524034
+ ## [1] Validation-rmse=10.1766446093622
+ ## [2] Train-rmse=12.2792375712573
+ ## [2] Validation-rmse=12.4331776190813
+ ## [3] Train-rmse=11.1984634005885
+ ## [3] Validation-rmse=10.3303041888193
+ ## [4] Train-rmse=10.2645236892904
+ ## [4] Validation-rmse=8.42760407903415
+ ## [5] Train-rmse=9.49711005504284
+ ## [5] Validation-rmse=8.44557808483234
+ ## [6] Train-rmse=9.07733734175182
+ ## [6] Validation-rmse=8.33225500266177
+ ## [7] Train-rmse=9.07884450847991
+ ## [7] Validation-rmse=8.38827833418459
+ ## [8] Train-rmse=9.10463850277417
+ ## [8] Validation-rmse=8.37394452365264
+ ## [9] Train-rmse=9.03977049028532
+ ## [9] Validation-rmse=8.25927979725672
+ ## [10] Train-rmse=8.96870685004475
+ ## [10] Validation-rmse=8.19509291481822
+ ```
+
+We also provide two optional parameters, `batch.end.callback` and `epoch.end.callback`, which can provide great flexibility in model training.
+
+How to Use Callback Functions
+---------
+
+This package provides two callback functions:
+
+- `mx.callback.save.checkpoint` saves a checkpoint to files after every `period` iterations.
+
+```r
+ model <- mx.model.FeedForward.create(
+ lro, X=train.x, y=train.y,
+ eval.data=list(data=test.x, label=test.y),
+ ctx=mx.cpu(), num.round=10, array.batch.size=20,
+ learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
+ epoch.end.callback = mx.callback.save.checkpoint("boston"))
+```
+
+```
+ ## Auto detect layout of input matrix, use row major..
+ ## Start training with 1 devices
+ ## [1] Train-rmse=19.1621424021617
+ ## [1] Validation-rmse=20.721515592165
+ ## Model checkpoint saved to boston-0001.params
+ ## [2] Train-rmse=13.5127391952367
+ ## [2] Validation-rmse=14.1822123675007
+ ## Model checkpoint saved to boston-0002.params
+```
+
+
+- `mx.callback.log.train.metric` logs a training metric each period. You can use it either as a `batch.end.callback` or an
+`epoch.end.callback`.
+
+
+```r
+ model <- mx.model.FeedForward.create(
+ lro, X=train.x, y=train.y,
+ eval.data=list(data=test.x, label=test.y),
+ ctx=mx.cpu(), num.round=10, array.batch.size=20,
+ learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
+ batch.end.callback = mx.callback.log.train.metric(5))
+ ```
+
+```
+ ## Auto detect layout of input matrix, use row major..
+ ## Start training with 1 devices
+ ## Batch [5] Train-rmse=17.6514558545416
+ ## [1] Train-rmse=15.2879610219001
+ ## [1] Validation-rmse=12.3332062820921
+ ## Batch [5] Train-rmse=11.939392828565
+ ## [2] Train-rmse=11.4382242547217
+ ## [2] Validation-rmse=9.91176550103181
+ ............
+```
+
+You also can save the training and evaluation errors for later use by passing a reference class:
+
+
+ ```r
+ logger <- mx.metric.logger$new()
+ model <- mx.model.FeedForward.create(
+ lro, X=train.x, y=train.y,
+ eval.data=list(data=test.x, label=test.y),
+ ctx=mx.cpu(), num.round=10, array.batch.size=20,
+ learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
+ epoch.end.callback = mx.callback.log.train.metric(5, logger))
+ ```
+
+ ```
+ ## Auto detect layout of input matrix, use row major..
+ ## Start training with 1 devices
+ ## [1] Train-rmse=19.1083228733256
+ ## [1] Validation-rmse=12.7150687428974
+ ## [2] Train-rmse=15.7684378116157
+ ## [2] Validation-rmse=14.8105319420491
+ ............
+ ```
+
+ ```r
+ head(logger$train)
+ ```
+
+ ```
+ ## [1] 19.108323 15.768438 13.531470 11.386050 9.555477 9.351324
+ ```
+
+ ```r
+ head(logger$eval)
+ ```
+
+ ```
+ ## [1] 12.715069 14.810532 15.840361 10.898733 9.349706 9.363087
+ ```
+
+How to Write Your Own Callback Functions
+----------
+
+You can find the source code for the two callback functions on [GitHub](https://github.com/dmlc/mxnet/blob/master/R-package/R/callback.R) and use it as a template:
+
+Basically, all callback functions follow the following structure:
+
+
+ ```r
+ mx.callback.fun <- function() {
+ function(iteration, nbatch, env) {
+ }
+ }
+ ```
+
+The following `mx.callback.save.checkpoint` function is stateless. It gets the model from the environment and saves it:
+
+
+ ```r
+ mx.callback.save.checkpoint <- function(prefix, period=1) {
+ function(iteration, nbatch, env) {
+ if (iteration %% period == 0) {
+ mx.model.save(env$model, prefix, iteration)
+ cat(sprintf("Model checkpoint saved to %s-%04d.params\n", prefix, iteration))
+ }
+ return(TRUE)
+ }
+ }
+ ```
+
+The `mx.callback.log.train.metric` is a little more complex. It holds a reference class and updates it during the training
+process:
+
+
+ ```r
+ mx.callback.log.train.metric <- function(period, logger=NULL) {
+ function(iteration, nbatch, env) {
+ if (nbatch %% period == 0 && !is.null(env$metric)) {
+ result <- env$metric$get(env$train.metric)
+ if (nbatch != 0)
+ cat(paste0("Batch [", nbatch, "] Train-", result$name, "=", result$value, "\n"))
+ if (!is.null(logger)) {
+ if (class(logger) != "mx.metric.logger") {
+ stop("Invalid mx.metric.logger.")
+ }
+ logger$train <- c(logger$train, result$value)
+ if (!is.null(env$eval.metric)) {
+ result <- env$metric$get(env$eval.metric)
+ if (nbatch != 0)
+ cat(paste0("Batch [", nbatch, "] Validation-", result$name, "=", result$value, "\n"))
+ logger$eval <- c(logger$eval, result$value)
+ }
+ }
+ }
+ return(TRUE)
+ }
+ }
+ ```
+
+Now you might be curious why both callback functions `return(TRUE)`.
+
+Can we `return(FALSE)`?
+
+Yes! You can stop the training early with `return(FALSE)`. See the following example.
+
+
+ ```r
+ mx.callback.early.stop <- function(eval.metric) {
+ function(iteration, nbatch, env) {
+ if (!is.null(env$metric)) {
+ if (!is.null(eval.metric)) {
+ result <- env$metric$get(env$eval.metric)
+ if (result$value < eval.metric) {
+ return(FALSE)
+ }
+ }
+ }
+ return(TRUE)
+ }
+ }
+ model <- mx.model.FeedForward.create(
+ lro, X=train.x, y=train.y,
+ eval.data=list(data=test.x, label=test.y),
+ ctx=mx.cpu(), num.round=10, array.batch.size=20,
+ learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
+ epoch.end.callback = mx.callback.early.stop(10))
+ ```
+
+ ```
+ ## Auto detect layout of input matrix, use row major..
+ ## Start training with 1 devices
+ ## [1] Train-rmse=18.5897984387033
+ ## [1] Validation-rmse=13.5555213820571
+ ## [2] Train-rmse=12.5867564040256
+ ## [2] Validation-rmse=9.76304967080928
+ ```
+
+When the validation metric dips below the threshold we set, the training process stops.
+
+## Next Steps
+* [Neural Networks with MXNet in Five Minutes](http://mxnet.io/tutorials/r/fiveMinutesNeuralNetwork.html)
+* [Classify Real-World Images with a Pretrained Model](http://mxnet.io/tutorials/r/classifyRealImageWithPretrainedModel.html)
+* [Handwritten Digits Classification Competition](http://mxnet.io/tutorials/r/mnistCompetition.html)
+* [Character Language Model Using RNN](http://mxnet.io/tutorials/r/charRnnModel.html)
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/char_rnn_model.md b/docs/static_site/src/pages/api/r/docs/tutorials/char_rnn_model.md
new file mode 100644
index 000000000000..2c516608a1d2
--- /dev/null
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/char_rnn_model.md
@@ -0,0 +1,317 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Char RNN Model
+is_tutorial: true
+tag: r
+permalink: /api/r/docs/tutorials/char_rnn_model
+---
+
+
+# Character-level Language Model using RNN
+
+This tutorial demonstrates how to build a character-level language model with an RNN using the MXNet R package. You will need the following R packages to run this tutorial:
+ - readr
+ - stringr
+ - stringi
+ - mxnet
+
+We will use the [tinyshakespeare](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare) dataset to build this model.
+
+
+```R
+library("readr")
+library("stringr")
+library("stringi")
+library("mxnet")
+```
+
+## Preprocess and prepare the data
+
+Download the data:
+
+
+```R
+download.data <- function(data_dir) {
+ dir.create(data_dir, showWarnings = FALSE)
+ if (!file.exists(paste0(data_dir,'input.txt'))) {
+ download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
+ destfile=paste0(data_dir,'input.txt'), method='wget')
+ }
+}
+```
+
+Next, we transform the text into feature vectors that are fed into the RNN model. The `make_data` function reads the dataset, cleans it of any non-alphanumeric characters, splits it into individual characters and groups it into sequences of length `seq.len`.
+
+
+```R
+make_data <- function(path, seq.len = 32, dic=NULL) {
+
+ text_vec <- read_file(file = path)
+ text_vec <- stri_enc_toascii(str = text_vec)
+ text_vec <- str_replace_all(string = text_vec, pattern = "[^[:print:]]", replacement = "")
+ text_vec <- strsplit(text_vec, '') %>% unlist
+
+ if (is.null(dic)) {
+ char_keep <- sort(unique(text_vec))
+ } else char_keep <- names(dic)[!dic == 0]
+
+ # Remove terms not part of dictionary
+ text_vec <- text_vec[text_vec %in% char_keep]
+
+ # Build dictionary
+ dic <- 1:length(char_keep)
+ names(dic) <- char_keep
+
+ # reverse dictionary
+ rev_dic <- names(dic)
+ names(rev_dic) <- dic
+
+ # Adjust by -1 to have a 1-lag for labels
+ num.seq <- (length(text_vec) - 1) %/% seq.len
+
+ features <- dic[text_vec[1:(seq.len * num.seq)]]
+ labels <- dic[text_vec[1:(seq.len*num.seq) + 1]]
+
+ features_array <- array(features, dim = c(seq.len, num.seq))
+ labels_array <- array(labels, dim = c(seq.len, num.seq))
+
+ return (list(features_array = features_array, labels_array = labels_array, dic = dic, rev_dic = rev_dic))
+}
+
+
+seq.len <- 100
+data_prep <- make_data(path = "input.txt", seq.len = seq.len, dic=NULL)
+```
+
+Fetch the features and labels for training the model, and split the data into training and evaluation sets in a 9:1 ratio.
+
+
+```R
+X <- data_prep$features_array
+Y <- data_prep$labels_array
+dic <- data_prep$dic
+rev_dic <- data_prep$rev_dic
+vocab <- length(dic)
+
+samples <- tail(dim(X), 1)
+train.val.fraction <- 0.9
+
+X.train.data <- X[, 1:as.integer(samples * train.val.fraction)]
+X.val.data <- X[, -(1:as.integer(samples * train.val.fraction))]
+
+X.train.label <- Y[, 1:as.integer(samples * train.val.fraction)]
+X.val.label <- Y[, -(1:as.integer(samples * train.val.fraction))]
+
+train_buckets <- list("100" = list(data = X.train.data, label = X.train.label))
+eval_buckets <- list("100" = list(data = X.val.data, label = X.val.label))
+
+train_buckets <- list(buckets = train_buckets, dic = dic, rev_dic = rev_dic)
+eval_buckets <- list(buckets = eval_buckets, dic = dic, rev_dic = rev_dic)
+```
+
+Create iterators for training and evaluation datasets.
+
+
+```R
+vocab <- length(eval_buckets$dic)
+
+batch.size <- 32
+
+train.data <- mx.io.bucket.iter(buckets = train_buckets$buckets, batch.size = batch.size,
+ data.mask.element = 0, shuffle = TRUE)
+
+eval.data <- mx.io.bucket.iter(buckets = eval_buckets$buckets, batch.size = batch.size,
+ data.mask.element = 0, shuffle = FALSE)
+```
+
+## Train the Model
+
+
+This model is a multi-layer RNN for sampling from character-level language models. It has a one-to-one model configuration since, for each character, we want to predict the next one. For a sequence of length 100, there are also 100 labels, corresponding to the same sequence of characters but offset by a position of +1. The parameter `output_last_state` is set to TRUE in order to access the state of the RNN cells when performing inference.
+
+
+```R
+rnn_graph_one_one <- rnn.graph(num_rnn_layer = 3,
+ num_hidden = 96,
+ input_size = vocab,
+ num_embed = 64,
+ num_decode = vocab,
+ dropout = 0.2,
+ ignore_label = 0,
+ cell_type = "lstm",
+ masking = F,
+ output_last_state = T,
+ loss_output = "softmax",
+ config = "one-to-one")
+
+graph.viz(rnn_graph_one_one, type = "graph", direction = "LR",
+ graph.height.px = 180, shape=c(100, 64))
+
+devices <- mx.cpu()
+
+initializer <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)
+
+optimizer <- mx.opt.create("adadelta", rho = 0.9, eps = 1e-5, wd = 1e-8,
+ clip_gradient = 5, rescale.grad = 1/batch.size)
+
+logger <- mx.metric.logger()
+epoch.end.callback <- mx.callback.log.train.metric(period = 1, logger = logger)
+batch.end.callback <- mx.callback.log.train.metric(period = 50)
+
+mx.metric.custom_nd <- function(name, feval) {
+ init <- function() {
+ c(0, 0)
+ }
+ update <- function(label, pred, state) {
+ m <- feval(label, pred)
+ state <- c(state[[1]] + 1, state[[2]] + m)
+ return(state)
+ }
+ get <- function(state) {
+ list(name=name, value = (state[[2]] / state[[1]]))
+ }
+ ret <- (list(init = init, update = update, get = get))
+ class(ret) <- "mx.metric"
+ return(ret)
+}
+
+mx.metric.Perplexity <- mx.metric.custom_nd("Perplexity", function(label, pred) {
+ label <- mx.nd.reshape(label, shape = -1)
+ label_probs <- as.array(mx.nd.choose.element.0index(pred, label))
+ batch <- length(label_probs)
+ NLL <- -sum(log(pmax(1e-15, as.array(label_probs)))) / batch
+ Perplexity <- exp(NLL)
+ return(Perplexity)
+})
+
+model <- mx.model.buckets(symbol = rnn_graph_one_one,
+ train.data = train.data, eval.data = eval.data,
+ num.round = 20, ctx = devices, verbose = TRUE,
+ metric = mx.metric.Perplexity,
+ initializer = initializer, optimizer = optimizer,
+ batch.end.callback = NULL,
+ epoch.end.callback = epoch.end.callback)
+
+mx.model.save(model, prefix = "one_to_one_seq_model", iteration = 20)
+```
+
+ Start training with 1 devices
+ [1] Train-Perplexity=13.7040474322178
+ [1] Validation-Perplexity=7.94617194460922
+ [2] Train-Perplexity=6.57039815554525
+ [2] Validation-Perplexity=6.60806110658011
+ [3] Train-Perplexity=5.65360504501481
+ [3] Validation-Perplexity=6.18932770630876
+ [4] Train-Perplexity=5.32547285727298
+ [4] Validation-Perplexity=6.02198756798859
+ [5] Train-Perplexity=5.14373631472579
+ [5] Validation-Perplexity=5.8095658243407
+ [6] Train-Perplexity=5.03077673487379
+ [6] Validation-Perplexity=5.72582993567431
+ [7] Train-Perplexity=4.94453383291536
+ [7] Validation-Perplexity=5.6445258528126
+ [8] Train-Perplexity=4.88635290100261
+ [8] Validation-Perplexity=5.6730024536433
+ [9] Train-Perplexity=4.84205646230548
+ [9] Validation-Perplexity=5.50960780230982
+ [10] Train-Perplexity=4.80441673535513
+ [10] Validation-Perplexity=5.57002263750006
+ [11] Train-Perplexity=4.77763413242626
+ [11] Validation-Perplexity=5.55152143269169
+ [12] Train-Perplexity=4.74937775290777
+ [12] Validation-Perplexity=5.44968305351486
+ [13] Train-Perplexity=4.72824849541467
+ [13] Validation-Perplexity=5.50889348298234
+ [14] Train-Perplexity=4.70980846981694
+ [14] Validation-Perplexity=5.51473225859859
+ [15] Train-Perplexity=4.69685776886122
+ [15] Validation-Perplexity=5.45391985233811
+ [16] Train-Perplexity=4.67837107034824
+ [16] Validation-Perplexity=5.46636764997829
+ [17] Train-Perplexity=4.66866961934873
+ [17] Validation-Perplexity=5.44267086113492
+ [18] Train-Perplexity=4.65611469144194
+ [18] Validation-Perplexity=5.4290169469462
+ [19] Train-Perplexity=4.64614689879405
+ [19] Validation-Perplexity=5.44221549833917
+ [20] Train-Perplexity=4.63764001963654
+ [20] Validation-Perplexity=5.42114250842862
+
+
+## Inference on the Model
+
+We now use the saved model to do inference and sample text character by character; the sampled text will look like the original training data.
+
+
+```R
+set.seed(0)
+model <- mx.model.load(prefix = "one_to_one_seq_model", iteration = 20)
+
+internals <- model$symbol$get.internals()
+sym_state <- internals$get.output(which(internals$outputs %in% "RNN_state"))
+sym_state_cell <- internals$get.output(which(internals$outputs %in% "RNN_state_cell"))
+sym_output <- internals$get.output(which(internals$outputs %in% "loss_output"))
+symbol <- mx.symbol.Group(sym_output, sym_state, sym_state_cell)
+
+infer_raw <- c("Thou ")
+infer_split <- dic[strsplit(infer_raw, '') %>% unlist]
+infer_length <- length(infer_split)
+
+infer.data <- mx.io.arrayiter(data = matrix(infer_split), label = matrix(infer_split),
+ batch.size = 1, shuffle = FALSE)
+
+infer <- mx.infer.rnn.one(infer.data = infer.data,
+ symbol = symbol,
+ arg.params = model$arg.params,
+ aux.params = model$aux.params,
+ input.params = NULL,
+ ctx = devices)
+
+pred_prob <- as.numeric(as.array(mx.nd.slice.axis(
+ infer$loss_output, axis = 0, begin = infer_length-1, end = infer_length)))
+pred <- sample(length(pred_prob), prob = pred_prob, size = 1) - 1
+predict <- pred  # start the vector of sampled character indices
+
+for (i in 1:200) {
+
+ infer.data <- mx.io.arrayiter(data = as.matrix(pred), label = as.matrix(pred),
+ batch.size = 1, shuffle = FALSE)
+
+ infer <- mx.infer.rnn.one(infer.data = infer.data,
+ symbol = symbol,
+ arg.params = model$arg.params,
+ aux.params = model$aux.params,
+ input.params = list(rnn.state = infer[[2]],
+ rnn.state.cell = infer[[3]]),
+ ctx = devices)
+
+ pred_prob <- as.numeric(as.array(infer$loss_output))
+ pred <- sample(length(pred_prob), prob = pred_prob, size = 1, replace = T) - 1
+ predict <- c(predict, pred)
+}
+
+predict_txt <- paste0(rev_dic[as.character(predict)], collapse = "")
+predict_txt_tot <- paste0(infer_raw, predict_txt, collapse = "")
+print(predict_txt_tot)
+```
+
+ [1] "Thou NAknowledge thee my Comfort and his late she.FRIAR LAURENCE:Nothing a groats waterd forth. The lend he thank that;When she I am brother draw London: and not hear that know.BENVOLIO:How along, makes your "
+
+
+
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/classify_real_image_with_pretrained_model.md b/docs/static_site/src/pages/api/r/docs/tutorials/classify_real_image_with_pretrained_model.md
new file mode 100644
index 000000000000..228ecb848a2d
--- /dev/null
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/classify_real_image_with_pretrained_model.md
@@ -0,0 +1,208 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Classify Images with a PreTrained Model
+is_tutorial: true
+tag: r
+permalink: /api/r/docs/tutorials/classify_real_image_with_pretrained_model
+---
+
+Classify Images with a PreTrained Model
+=================================================
+MXNet is a flexible and efficient deep learning framework. One of the interesting things that a deep learning
+algorithm can do is classify real-world images.
+
+In this tutorial, we show how to use a pre-trained Inception-BatchNorm network to predict the class of an
+image. For information about the network architecture, see [1].
+
+The pre-trained Inception-BatchNorm network can be downloaded from [this link](http://data.mxnet.io/mxnet/data/Inception.zip).
+This model gives state-of-the-art prediction accuracy on the ImageNet dataset.
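+
+For example, the archive can be fetched and extracted directly from R; here is a minimal sketch (assuming the URL above is reachable and the working directory is writable):
+
+```r
+# Download and unzip the pre-trained Inception-BN model into the current folder
+download.file("http://data.mxnet.io/mxnet/data/Inception.zip", destfile = "Inception.zip")
+unzip("Inception.zip")
+```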
+
+Load the MXNet Package
+---------------
+To get started, load the mxnet package:
+
+ ```r
+ require(mxnet)
+ ```
+
+ ```
+ ## Loading required package: mxnet
+ ## Loading required package: methods
+ ```
+
+Now load the imager package to load and preprocess the images in R:
+
+
+ ```r
+ require(imager)
+ ```
+
+ ```
+ ## Loading required package: imager
+ ## Loading required package: plyr
+ ## Loading required package: magrittr
+ ## Loading required package: stringr
+ ## Loading required package: png
+ ## Loading required package: jpeg
+ ##
+ ## Attaching package: 'imager'
+ ##
+ ## The following object is masked from 'package:magrittr':
+ ##
+ ## add
+ ##
+ ## The following object is masked from 'package:plyr':
+ ##
+ ## liply
+ ##
+ ## The following objects are masked from 'package:stats':
+ ##
+ ## convolve, spectrum
+ ##
+ ## The following object is masked from 'package:graphics':
+ ##
+ ## frame
+ ##
+ ## The following object is masked from 'package:base':
+ ##
+ ## save.image
+ ```
+
+Load the PreTrained Model
+-------------------------
+Make sure you unzip the pre-trained model in the current folder. Use the model
+loading function to load the model into R:
+
+ ```r
+ model = mx.model.load("Inception/Inception_BN", iteration=39)
+ ```
+
+Load the mean image, which is used for preprocessing:
+
+
+ ```r
+ mean.img = as.array(mx.nd.load("Inception/mean_224.nd")[["mean_img"]])
+ ```
+
+Load and Preprocess the Image
+-----------------------------
+Now, we are ready to classify a real image. In this example, we simply take the parrots image
+from the imager package. You can use another image, if you prefer.
+
+Load and plot the image:
+
+
+ ```r
+ im <- load.image(system.file("extdata/parrots.png", package="imager"))
+ plot(im)
+ ```
+
+![plot of chunk unnamed-chunk-5](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/knitr/classifyRealImageWithPretrainedModel-unnamed-chunk-5-1.png)
+
+Before feeding the image to the deep network, we need to perform some preprocessing
+so that the image meets the network's input requirements. Preprocessing
+includes cropping and subtracting the mean.
+Because MXNet is deeply integrated with R, we can do all the processing in an R function:
+
+
+ ```r
+ preproc.image <- function(im, mean.image) {
+ # crop the image
+ shape <- dim(im)
+ short.edge <- min(shape[1:2])
+ xx <- floor((shape[1] - short.edge) / 2)
+ yy <- floor((shape[2] - short.edge) / 2)
+ cropped <- crop.borders(im, xx, yy)
+ # resize to 224 x 224, as required by the model input
+ resized <- resize(cropped, 224, 224)
+ # convert to array (x, y, channel) and scale to [0, 255]
+ arr <- as.array(resized) * 255
+ dim(arr) <- c(224, 224, 3)
+ # subtract the mean image passed in as an argument
+ normed <- arr - mean.image
+ # Reshape to format needed by mxnet (width, height, channel, num)
+ dim(normed) <- c(224, 224, 3, 1)
+ return(normed)
+ }
+ ```
+
+Use the defined preprocessing function to get the normalized image:
+
+
+ ```r
+ normed <- preproc.image(im, mean.img)
+ ```
+
+Classify the Image
+------------------
+Now we are ready to classify the image! Use the ```predict``` function
+to get the probability over classes:
+
+
+ ```r
+ prob <- predict(model, X=normed)
+ dim(prob)
+ ```
+
+ ```
+ ## [1] 1000 1
+ ```
+
+As you can see, ```prob``` is a 1000-by-1 array, which gives the probabilities
+over the 1000 image classes for the input image.
+
+Use ```max.col``` on the transpose of ```prob``` to get the class index:
+
+ ```r
+ max.idx <- max.col(t(prob))
+ max.idx
+ ```
+
+ ```
+ ## [1] 89
+ ```
+
+The index doesn't make much sense, so let's see what it really means.
+Read the names of the classes from the following file:
+
+
+ ```r
+ synsets <- readLines("Inception/synset.txt")
+ ```
+
+Let's see what the image really is:
+
+
+ ```r
+ print(paste0("Predicted Top-class: ", synsets[[max.idx]]))
+ ```
+
+ ```
+ ## [1] "Predicted Top-class: n01818515 macaw"
+ ```
+
+It's a macaw!
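+
+If you want more than the single top class, a small sketch like the following lists the five most probable synsets (it only assumes the `prob` and `synsets` objects created above):
+
+ ```r
+ # Order the 1000 class probabilities and show the top five
+ top5 <- order(prob[, 1], decreasing = TRUE)[1:5]
+ data.frame(class = synsets[top5], probability = prob[top5, 1])
+ ```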
+
+Reference
+---------
+[1] Ioffe, Sergey, and Christian Szegedy. "Batch normalization: Accelerating deep network training by reducing internal covariate shift." arXiv preprint arXiv:1502.03167 (2015).
+
+## Next Steps
+* [Handwritten Digits Classification Competition](http://mxnet.io/tutorials/r/mnistCompetition.html)
+* [Character Language Model using RNN](http://mxnet.io/tutorials/r/charRnnModel.html)
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/custom_iterator.md b/docs/static_site/src/pages/api/r/docs/tutorials/custom_iterator.md
new file mode 100644
index 000000000000..5ecfb47b7ce9
--- /dev/null
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/custom_iterator.md
@@ -0,0 +1,227 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Custom Iterator Tutorial
+is_tutorial: true
+tag: r
+permalink: /api/r/docs/tutorials/custom_iterator
+---
+
+
+Custom Iterator Tutorial
+========================
+
+This tutorial provides a guideline on how to use and write custom iterators, which can be very useful when you have a dataset that does not fit into memory.
+
+Getting the data
+----------
+The data we are going to use is the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) in CSV format; it can be found on this [website](http://pjreddie.com/projects/mnist-in-csv/).
+
+To download the data:
+
+```bash
+wget http://pjreddie.com/media/files/mnist_train.csv
+wget http://pjreddie.com/media/files/mnist_test.csv
+```
+
+You'll get two files: `mnist_train.csv`, which contains 60,000 examples of handwritten digits, and `mnist_test.csv`, which contains 10,000 examples. The first element of each line in the CSV is the label, a number between 0 and 9. The rest of the line contains 784 numbers between 0 and 255, corresponding to the grey levels of a 28x28 matrix. Therefore, each line holds a 28x28-pixel image of a handwritten digit together with its true label.
+
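+A quick way to convince yourself of this layout is to peek at a single row; a small sketch (assuming the files above have been downloaded):
+
+```r
+# Read only the first row of the training CSV: 1 label followed by 784 pixel values
+first_row <- read.csv("mnist_train.csv", header = FALSE, nrows = 1)
+first_row[1, 1]        # the label, a digit between 0 and 9
+ncol(first_row) - 1    # 784 pixel values in [0, 255]
+```
+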
+Custom CSV Iterator
+----------
+Next we are going to create a custom CSV Iterator based on the [C++ CSVIterator class](https://github.com/dmlc/mxnet/blob/master/src/io/iter_csv.cc).
+
+For that we are going to use the R function `mx.io.CSVIter` as a base class. This class takes the parameters `data.csv`, `data.shape`, and `batch.size`, and has two main functions: `iter.next()`, which advances the iterator to the next batch of data, and `value()`, which returns the training data and the label.
+
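+Before wrapping it, you can try the base iterator on its own; a short sketch (785 = 1 label column + 784 pixels, and the method names are the ones described above):
+
+```r
+library(mxnet)
+
+# Build the built-in CSV iterator and pull one batch from it
+csv_iter <- mx.io.CSVIter(data.csv = "mnist_train.csv", data.shape = c(785), batch.size = 100)
+csv_iter$reset()
+csv_iter$iter.next()          # advance to the first batch
+batch <- csv_iter$value()     # a list holding the batch as an NDArray under $data
+dim(as.array(batch$data))     # 785 x 100
+```
+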
+The R custom iterator needs to inherit from the C++ data iterator class; for that we use the class `Rcpp_MXArrayDataIter`, which is exposed through Rcpp. It also needs to have the same parameters: `data.csv`, `data.shape`, `batch.size`. Apart from that, we add the field `iter`, which is the CSV iterator that we are going to extend.
+
+```r
+CustomCSVIter <- setRefClass("CustomCSVIter",
+ fields=c("iter", "data.csv", "data.shape", "batch.size"),
+ contains = "Rcpp_MXArrayDataIter",
+ #...
+ )
+```
+
+The next step is to initialize the class. For that we call the base `mx.io.CSVIter` and fill the rest of the fields.
+
+```r
+CustomCSVIter <- setRefClass("CustomCSVIter",
+ fields=c("iter", "data.csv", "data.shape", "batch.size"),
+ contains = "Rcpp_MXArrayDataIter",
+ methods=list(
+ initialize=function(iter, data.csv, data.shape, batch.size){
+ feature_len <- data.shape*data.shape + 1
+ csv_iter <- mx.io.CSVIter(data.csv=data.csv, data.shape=c(feature_len), batch.size=batch.size)
+ .self$iter <- csv_iter
+ .self$data.csv <- data.csv
+ .self$data.shape <- data.shape
+ .self$batch.size <- batch.size
+ .self
+ },
+ #...
+ )
+ )
+```
+
+So far there is no difference between the original class and the custom class. Let's implement the function `value()`. Here we transform the data coming from the original class, an array of 785 numbers per example, into a 28x28 matrix and a label. We will also normalize the training data to be between 0 and 1.
+
+```r
+CustomCSVIter <- setRefClass("CustomCSVIter",
+ fields=c("iter", "data.csv", "data.shape", "batch.size"),
+ contains = "Rcpp_MXArrayDataIter",
+ methods=list(
+ initialize=function(iter, data.csv, data.shape, batch.size){
+ feature_len <- data.shape*data.shape + 1
+ csv_iter <- mx.io.CSVIter(data.csv=data.csv, data.shape=c(feature_len), batch.size=batch.size)
+ .self$iter <- csv_iter
+ .self$data.csv <- data.csv
+ .self$data.shape <- data.shape
+ .self$batch.size <- batch.size
+ .self
+ },
+ value=function(){
+ val <- as.array(.self$iter$value()$data)
+ val.x <- val[-1,]
+ val.y <- val[1,]
+ val.x <- val.x/255
+ dim(val.x) <- c(data.shape, data.shape, 1, ncol(val.x))
+ val.x <- mx.nd.array(val.x)
+ val.y <- mx.nd.array(val.y)
+ list(data=val.x, label=val.y)
+ },
+ #...
+ )
+ )
+```
+Finally we are going to add the rest of the functions needed for the training to work correctly. The final `CustomCSVIter` looks like this:
+
+```r
+CustomCSVIter <- setRefClass("CustomCSVIter",
+ fields=c("iter", "data.csv", "data.shape", "batch.size"),
+ contains = "Rcpp_MXArrayDataIter",
+ methods=list(
+ initialize=function(iter, data.csv, data.shape, batch.size){
+ feature_len <- data.shape*data.shape + 1
+ csv_iter <- mx.io.CSVIter(data.csv=data.csv, data.shape=c(feature_len), batch.size=batch.size)
+ .self$iter <- csv_iter
+ .self$data.csv <- data.csv
+ .self$data.shape <- data.shape
+ .self$batch.size <- batch.size
+ .self
+ },
+ value=function(){
+ val <- as.array(.self$iter$value()$data)
+ val.x <- val[-1,]
+ val.y <- val[1,]
+ val.x <- val.x/255
+ dim(val.x) <- c(data.shape, data.shape, 1, ncol(val.x))
+ val.x <- mx.nd.array(val.x)
+ val.y <- mx.nd.array(val.y)
+ list(data=val.x, label=val.y)
+ },
+ iter.next=function(){
+ .self$iter$iter.next()
+ },
+ reset=function(){
+ .self$iter$reset()
+ },
+ num.pad=function(){
+ .self$iter$num.pad()
+ },
+ finalize=function(){
+ .self$iter$finalize()
+ }
+ )
+ )
+```
+
+To call the class we can just do:
+
+```r
+batch.size <- 100
+train.iter <- CustomCSVIter$new(iter = NULL, data.csv = "mnist_train.csv", data.shape = 28, batch.size = batch.size)
+```
+
+CNN Model
+----------
+
+For this tutorial we are going to use the well-known LeNet architecture:
+
+```r
+lenet.model <- function(){
+ data <- mx.symbol.Variable('data')
+ conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=20) #first conv
+ tanh1 <- mx.symbol.Activation(data=conv1, act_type="tanh")
+ pool1 <- mx.symbol.Pooling(data=tanh1, pool_type="max", kernel=c(2,2), stride=c(2,2))
+ conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=50)# second conv
+ tanh2 <- mx.symbol.Activation(data=conv2, act_type="tanh")
+ pool2 <- mx.symbol.Pooling(data=tanh2, pool_type="max", kernel=c(2,2), stride=c(2,2))
+ flatten <- mx.symbol.Flatten(data=pool2)
+ fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=100) # first fullc
+ tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh")
+ fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10) # second fullc
+ network <- mx.symbol.SoftmaxOutput(data=fc2) # loss
+ network
+}
+network <- lenet.model()
+```
+
+Training with the Custom Iterator
+----------
+Finally, we can directly add the custom iterator as the training data source.
+
+```r
+model <- mx.model.FeedForward.create(symbol=network,
+ X=train.iter,
+ ctx=mx.gpu(0),
+ num.round=10,
+ array.batch.size=batch.size,
+ learning.rate=0.1,
+ momentum=0.9,
+ eval.metric=mx.metric.accuracy,
+ wd=0.00001,
+ batch.end.callback=mx.callback.log.speedometer(batch.size, frequency = 100)
+ )
+```
+
+The output of the last few training rounds on a K80 GPU looks like this:
+
+```bash
+[8] Train-accuracy=0.998866666666667
+Batch [100] Speed: 15413.0104454713 samples/sec Train-accuracy=0.999
+Batch [200] Speed: 16629.3412459049 samples/sec Train-accuracy=0.99935
+Batch [300] Speed: 18412.6900509319 samples/sec Train-accuracy=0.9995
+Batch [400] Speed: 16757.2882328335 samples/sec Train-accuracy=0.999425
+Batch [500] Speed: 17116.6529207406 samples/sec Train-accuracy=0.99946
+Batch [600] Speed: 19627.589505195 samples/sec Train-accuracy=0.99945
+[9] Train-accuracy=0.9991
+Batch [100] Speed: 18971.5745536982 samples/sec Train-accuracy=0.9992
+Batch [200] Speed: 15554.8822435383 samples/sec Train-accuracy=0.99955
+Batch [300] Speed: 18327.6950115053 samples/sec Train-accuracy=0.9997
+Batch [400] Speed: 17103.0705411788 samples/sec Train-accuracy=0.9997
+Batch [500] Speed: 15104.8656902394 samples/sec Train-accuracy=0.99974
+Batch [600] Speed: 13818.7899518255 samples/sec Train-accuracy=0.99975
+[10] Train-accuracy=0.99975
+```
+
+Conclusion
+----------
+
+We have shown how to create a custom CSV iterator by extending the class `mx.io.CSVIter`. In our class, we iteratively read a batch of data from a CSV file, transform it, and feed it to the stochastic gradient descent optimization. That way, we are able to manage CSV files that are bigger than the memory of the machine we are using.
+
+Based on this custom iterator, we can also create data loaders that internally transform or expand the data, allowing us to handle files of any size.
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/custom_loss_function.md b/docs/static_site/src/pages/api/r/docs/tutorials/custom_loss_function.md
new file mode 100644
index 000000000000..dbf1280462a9
--- /dev/null
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/custom_loss_function.md
@@ -0,0 +1,231 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Custom Loss Function
+is_tutorial: true
+tag: r
+permalink: /api/r/docs/tutorials/custom_loss_function
+---
+
+
+Customized loss function
+======================================
+
+This tutorial provides guidelines for using a customized loss function in network construction.
+
+Model Training Example
+----------------------
+
+Let's begin with a small regression example. We can build and train a regression model with the following code:
+
+``` r
+data(BostonHousing, package = "mlbench")
+BostonHousing[, sapply(BostonHousing, is.factor)] <-
+ as.numeric(as.character(BostonHousing[, sapply(BostonHousing, is.factor)]))
+BostonHousing <- data.frame(scale(BostonHousing))
+
+test.ind = seq(1, 506, 5) # 1 pt in 5 used for testing
+train.x = data.matrix(BostonHousing[-test.ind,-14])
+train.y = BostonHousing[-test.ind, 14]
+test.x = data.matrix(BostonHousing[test.ind, -14])
+test.y = BostonHousing[test.ind, 14]
+
+require(mxnet)
+```
+
+ ## Loading required package: mxnet
+
+``` r
+data <- mx.symbol.Variable("data")
+label <- mx.symbol.Variable("label")
+fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1")
+tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1")
+fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2")
+lro <- mx.symbol.LinearRegressionOutput(fc2, name = "lro")
+
+mx.set.seed(0)
+model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y,
+ ctx = mx.cpu(),
+ num.round = 5,
+ array.batch.size = 60,
+ optimizer = "rmsprop",
+ verbose = TRUE,
+ array.layout = "rowmajor",
+ batch.end.callback = NULL,
+ epoch.end.callback = NULL)
+```
+
+ ## Start training with 1 devices
+
+``` r
+pred <- predict(model, test.x)
+```
+
+ ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
+
+``` r
+sum((test.y - pred[1,])^2) / length(test.y)
+```
+
+ ## [1] 0.2485236
+
+Besides the `LinearRegressionOutput`, we also provide `LogisticRegressionOutput` and `MAERegressionOutput`. However, this might not be enough for real-world models. You can provide your own loss function by using `mx.symbol.MakeLoss` when constructing the network.
+
+How to Use Your Own Loss Function
+---------------------------------
+
+We still use our previous example, but this time we use `mx.symbol.MakeLoss` to minimize `(pred-label)^2`.
+
+``` r
+data <- mx.symbol.Variable("data")
+label <- mx.symbol.Variable("label")
+fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1")
+tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1")
+fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2")
+lro2 <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fc2, shape = 0) - label), name="lro2")
+```
+
+Then we can train the network just as usual.
+
+``` r
+mx.set.seed(0)
+model2 <- mx.model.FeedForward.create(lro2, X = train.x, y = train.y,
+ ctx = mx.cpu(),
+ num.round = 5,
+ array.batch.size = 60,
+ optimizer = "rmsprop",
+ verbose = TRUE,
+ array.layout = "rowmajor",
+ batch.end.callback = NULL,
+ epoch.end.callback = NULL)
+```
+
+ ## Start training with 1 devices
+
+We would expect very similar results because we are actually minimizing the same loss function. However, the test error below is quite different.
+
+``` r
+pred2 <- predict(model2, test.x)
+```
+
+ ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
+
+``` r
+sum((test.y - pred2)^2) / length(test.y)
+```
+
+ ## [1] 1.234584
+
+This is because the output of `mx.symbol.MakeLoss` is the gradient of the loss with respect to the input data, not the prediction itself. We can extract the real prediction as shown below.
+
+``` r
+internals = internals(model2$symbol)
+fc_symbol = internals[[match("fc2_output", outputs(internals))]]
+
+model3 <- list(symbol = fc_symbol,
+ arg.params = model2$arg.params,
+ aux.params = model2$aux.params)
+
+class(model3) <- "MXFeedForwardModel"
+
+pred3 <- predict(model3, test.x)
+```
+
+ ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
+
+``` r
+sum((test.y - pred3[1,])^2) / length(test.y)
+```
+
+ ## [1] 0.248294
+
+MXNet provides many operations on symbols. An example using the absolute error `|pred-label|` can be found below.
+
+``` r
+lro_abs <- mx.symbol.MakeLoss(mx.symbol.abs(mx.symbol.Reshape(fc2, shape = 0) - label))
+mx.set.seed(0)
+model4 <- mx.model.FeedForward.create(lro_abs, X = train.x, y = train.y,
+ ctx = mx.cpu(),
+ num.round = 20,
+ array.batch.size = 60,
+ optimizer = "sgd",
+ learning.rate = 0.001,
+ verbose = TRUE,
+ array.layout = "rowmajor",
+ batch.end.callback = NULL,
+ epoch.end.callback = NULL)
+```
+
+ ## Start training with 1 devices
+
+``` r
+internals = internals(model4$symbol)
+fc_symbol = internals[[match("fc2_output", outputs(internals))]]
+
+model5 <- list(symbol = fc_symbol,
+ arg.params = model4$arg.params,
+ aux.params = model4$aux.params)
+
+class(model5) <- "MXFeedForwardModel"
+
+pred5 <- predict(model5, test.x)
+```
+
+ ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
+
+``` r
+sum(abs(test.y - pred5[1,])) / length(test.y)
+```
+
+ ## [1] 0.7056902
+
+``` r
+lro_mae <- mx.symbol.MAERegressionOutput(fc2, name = "lro")
+mx.set.seed(0)
+model6 <- mx.model.FeedForward.create(lro_mae, X = train.x, y = train.y,
+ ctx = mx.cpu(),
+ num.round = 20,
+ array.batch.size = 60,
+ optimizer = "sgd",
+ learning.rate = 0.001,
+ verbose = TRUE,
+ array.layout = "rowmajor",
+ batch.end.callback = NULL,
+ epoch.end.callback = NULL)
+```
+
+ ## Start training with 1 devices
+
+``` r
+pred6 <- predict(model6, test.x)
+```
+
+ ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
+
+``` r
+sum(abs(test.y - pred6[1,])) / length(test.y)
+```
+
+ ## [1] 0.7056902
+
+
+## Next Steps
+* [Neural Networks with MXNet in Five Minutes](http://mxnet.io/tutorials/r/fiveMinutesNeuralNetwork.html)
+* [Classify Real-World Images with a PreTrained Model](http://mxnet.io/tutorials/r/classifyRealImageWithPretrainedModel.html)
+* [Handwritten Digits Classification Competition](http://mxnet.io/tutorials/r/mnistCompetition.html)
+* [Character Language Model Using RNN](http://mxnet.io/tutorials/r/charRnnModel.html)
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/five_minutes_neural_network.md b/docs/static_site/src/pages/api/r/docs/tutorials/five_minutes_neural_network.md
new file mode 100644
index 000000000000..429240e2eda3
--- /dev/null
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/five_minutes_neural_network.md
@@ -0,0 +1,341 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Five Minutes Neural Network
+is_tutorial: true
+tag: r
+permalink: /api/r/docs/tutorials/five_minutes_neural_network
+---
+
+Develop a Neural Network with MXNet in Five Minutes
+=============================================
+
+This tutorial is designed for new users of the `mxnet` package for R. It shows how to construct neural networks for classification and regression tasks in five minutes. The data we use comes from the `mlbench` package. Instructions for installing R and MXNet's R package in different environments can be found [here](http://mxnet.incubator.apache.org/install/index.html?platform=Linux&language=R&processor=CPU).
+
+## Classification
+
+ ```r
+ if (!require(mlbench)) {
+   install.packages('mlbench')
+ }
+ ```
+
+ ```
+ ## Loading required package: mlbench
+ ```
+
+ ```r
+ require(mxnet)
+ ```
+
+ ```
+ ## Loading required package: mxnet
+ ```
+
+ ```r
+ data(Sonar, package="mlbench")
+
+ Sonar[,61] = as.numeric(Sonar[,61])-1
+ train.ind = c(1:50, 100:150)
+ train.x = data.matrix(Sonar[train.ind, 1:60])
+ train.y = Sonar[train.ind, 61]
+ test.x = data.matrix(Sonar[-train.ind, 1:60])
+ test.y = Sonar[-train.ind, 61]
+ ```
+
+We are going to use a multi-layer perceptron as our classifier. In `mxnet`, we have a function called `mx.mlp` for building a general multi-layer neural network to do classification or regression.
+
+`mx.mlp` requires the following parameters:
+
+- Training data and label
+- Number of hidden nodes in each hidden layer
+- Number of nodes in the output layer
+- Type of the activation
+- Type of the output loss
+- The device to train (GPU or CPU)
+- Other parameters for `mx.model.FeedForward.create`
+
+The following code shows an example usage of `mx.mlp`:
+
+
+ ```r
+ mx.set.seed(0)
+ model <- mx.mlp(train.x, train.y, hidden_node=10, out_node=2, out_activation="softmax",
+ num.round=20, array.batch.size=15, learning.rate=0.07, momentum=0.9,
+ eval.metric=mx.metric.accuracy)
+ ```
+
+ ```
+ ## Auto detect layout of input matrix, use rowmajor..
+ ## Start training with 1 devices
+ ## [1] Train-accuracy=0.488888888888889
+ ## [2] Train-accuracy=0.514285714285714
+ ## [3] Train-accuracy=0.514285714285714
+ ## [4] Train-accuracy=0.514285714285714
+ ## [5] Train-accuracy=0.514285714285714
+ ## [6] Train-accuracy=0.523809523809524
+ ## [7] Train-accuracy=0.619047619047619
+ ## [8] Train-accuracy=0.695238095238095
+ ## [9] Train-accuracy=0.695238095238095
+ ## [10] Train-accuracy=0.761904761904762
+ ## [11] Train-accuracy=0.828571428571429
+ ## [12] Train-accuracy=0.771428571428571
+ ## [13] Train-accuracy=0.742857142857143
+ ## [14] Train-accuracy=0.733333333333333
+ ## [15] Train-accuracy=0.771428571428571
+ ## [16] Train-accuracy=0.847619047619048
+ ## [17] Train-accuracy=0.857142857142857
+ ## [18] Train-accuracy=0.838095238095238
+ ## [19] Train-accuracy=0.838095238095238
+ ## [20] Train-accuracy=0.838095238095238
+ ```
+
+Note that `mx.set.seed` controls the random process in `mxnet`. You can see the accuracy in each round during training. It's also easy to make predictions and evaluate.
+
+To get an idea of what is happening, view the computation graph from R:
+
+ ```r
+ graph.viz(model$symbol)
+ ```
+
+
+ ```r
+ preds = predict(model, test.x)
+ ```
+
+ ```
+ ## Auto detect layout of input matrix, use rowmajor.
+ ```
+
+ ```r
+ pred.label = max.col(t(preds))-1
+ table(pred.label, test.y)
+ ```
+
+ ```
+ ## test.y
+ ## pred.label 0 1
+ ## 0 24 14
+ ## 1 36 33
+ ```
+
+Note that for multi-class predictions, mxnet outputs an `nclass` x `nexamples` matrix, where each row corresponds to one class and each column holds the class probabilities for one example.
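+
+From the confusion table you can also compute the overall test accuracy; a one-line sketch using the `pred.label` and `test.y` objects defined above:
+
+ ```r
+ # Proportion of test examples whose predicted label matches the true label
+ mean(pred.label == test.y)
+ ```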
+
+## Regression
+
+Again, let us preprocess the data:
+
+
+ ```r
+ data(BostonHousing, package="mlbench")
+
+ train.ind = seq(1, 506, 3)
+ train.x = data.matrix(BostonHousing[train.ind, -14])
+ train.y = BostonHousing[train.ind, 14]
+ test.x = data.matrix(BostonHousing[-train.ind, -14])
+ test.y = BostonHousing[-train.ind, 14]
+ ```
+
+Although we can use `mx.mlp` again to do regression by changing the `out_activation`, this time we are going to introduce a flexible way to configure neural networks in `mxnet`. Configuration is done by the "Symbol" system in `mxnet`. The Symbol system takes care of the links among nodes, activation, dropout ratio, etc. Configure a multi-layer neural network as follows:
+
+
+ ```r
+ # Define the input data
+ data <- mx.symbol.Variable("data")
+ # A fully connected hidden layer
+ # data: input source
+ # num_hidden: number of neurons in this hidden layer
+ fc1 <- mx.symbol.FullyConnected(data, num_hidden=1)
+
+ # Use linear regression for the output layer
+ lro <- mx.symbol.LinearRegressionOutput(fc1)
+ ```
+
+What matters most for a regression task is the last function: it makes the network optimize for the squared loss. Now let's train on this simple data set. In this configuration, we dropped the hidden layer so that the input layer is directly connected to the output layer.
+
+Next, train the model with this structure and the other parameters using `mx.model.FeedForward.create`:
+
+
+ ```r
+ mx.set.seed(0)
+ model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
+ ctx=mx.cpu(), num.round=50, array.batch.size=20,
+ learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse)
+ ```
+
+ ```
+ ## Auto detect layout of input matrix, use rowmajor.
+ ## Start training with 1 devices
+ ## [1] Train-rmse=16.063282524034
+ ## [2] Train-rmse=12.2792375712573
+ ## [3] Train-rmse=11.1984634005885
+ ## [4] Train-rmse=10.2645236892904
+ ## [5] Train-rmse=9.49711005504284
+ ## [6] Train-rmse=9.07733734175182
+ ## [7] Train-rmse=9.07884450847991
+ ## [8] Train-rmse=9.10463850277417
+ ## [9] Train-rmse=9.03977049028532
+ ## [10] Train-rmse=8.96870685004475
+ ## [11] Train-rmse=8.93113287361574
+ ## [12] Train-rmse=8.89937257821847
+ ## [13] Train-rmse=8.87182096922953
+ ## [14] Train-rmse=8.84476075083586
+ ## [15] Train-rmse=8.81464673014974
+ ## [16] Train-rmse=8.78672567900196
+ ## [17] Train-rmse=8.76265872846474
+ ## [18] Train-rmse=8.73946101419974
+ ## [19] Train-rmse=8.71651926303267
+ ## [20] Train-rmse=8.69457600919277
+ ## [21] Train-rmse=8.67354928674563
+ ## [22] Train-rmse=8.65328755392436
+ ## [23] Train-rmse=8.63378039680078
+ ## [24] Train-rmse=8.61488162586984
+ ## [25] Train-rmse=8.5965105183022
+ ## [26] Train-rmse=8.57868133563275
+ ## [27] Train-rmse=8.56135851937663
+ ## [28] Train-rmse=8.5444819772098
+ ## [29] Train-rmse=8.52802114610432
+ ## [30] Train-rmse=8.5119504512622
+ ## [31] Train-rmse=8.49624261719241
+ ## [32] Train-rmse=8.48087453238701
+ ## [33] Train-rmse=8.46582689119887
+ ## [34] Train-rmse=8.45107881002491
+ ## [35] Train-rmse=8.43661331401712
+ ## [36] Train-rmse=8.42241575909639
+ ## [37] Train-rmse=8.40847217331365
+ ## [38] Train-rmse=8.39476931796395
+ ## [39] Train-rmse=8.38129658373974
+ ## [40] Train-rmse=8.36804269059018
+ ## [41] Train-rmse=8.35499817678397
+ ## [42] Train-rmse=8.34215505742154
+ ## [43] Train-rmse=8.32950441908131
+ ## [44] Train-rmse=8.31703985777311
+ ## [45] Train-rmse=8.30475363906755
+ ## [46] Train-rmse=8.29264031506106
+ ## [47] Train-rmse=8.28069372820073
+ ## [48] Train-rmse=8.26890902770415
+ ## [49] Train-rmse=8.25728089053853
+ ## [50] Train-rmse=8.24580511500735
+ ```
+
+It's also easy to make a prediction and evaluate it:
+
+
+ ```r
+ preds = predict(model, test.x)
+ ```
+
+ ```
+ ## Auto detect layout of input matrix, use rowmajor..
+ ```
+
+ ```r
+ sqrt(mean((preds-test.y)^2))
+ ```
+
+ ```
+ ## [1] 7.800502
+ ```
+
+Currently, we have four predefined metrics: "accuracy", "rmse", "mae", and "rmsle". MXNet provides the interface for defining your own metrics:
+
+
+ ```r
+ demo.metric.mae <- mx.metric.custom("mae", function(label, pred) {
+ pred <- mx.nd.reshape(pred, shape = 0)
+ res <- mx.nd.mean(mx.nd.abs(label-pred))
+ return(res)
+ })
+ ```
+
+This is an example of the mean absolute error metric. Simply plug it into the training function:
+
+
+ ```r
+ mx.set.seed(0)
+ model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
+ ctx=mx.cpu(), num.round=50, array.batch.size=20,
+ learning.rate=2e-6, momentum=0.9, eval.metric=demo.metric.mae)
+ ```
+
+ ```
+ ## Auto detect layout of input matrix, use rowmajor.
+ ## Start training with 1 devices
+ ## [1] Train-mae=14.953625731998
+ ## [2] Train-mae=11.4802955521478
+ ## [3] Train-mae=8.50700579749213
+ ## [4] Train-mae=7.30591265360514
+ ## [5] Train-mae=7.38049803839789
+ ## [6] Train-mae=7.36036252975464
+ ## [7] Train-mae=7.06519222259521
+ ## [8] Train-mae=6.9962231847975
+ ## [9] Train-mae=6.96296903822157
+ ## [10] Train-mae=6.9046172036065
+ ## [11] Train-mae=6.87867620256212
+ ## [12] Train-mae=6.85872554779053
+ ## [13] Train-mae=6.81936407089233
+ ## [14] Train-mae=6.79135354359945
+ ## [15] Train-mae=6.77438741260105
+ ## [16] Train-mae=6.75365140702989
+ ## [17] Train-mae=6.73369296391805
+ ## [18] Train-mae=6.71600982877943
+ ## [19] Train-mae=6.69932826360067
+ ## [20] Train-mae=6.6852519777086
+ ## [21] Train-mae=6.67343420452542
+ ## [22] Train-mae=6.66315894656711
+ ## [23] Train-mae=6.65314838621351
+ ## [24] Train-mae=6.64388704299927
+ ## [25] Train-mae=6.63480265935262
+ ## [26] Train-mae=6.62583245171441
+ ## [27] Train-mae=6.61697626113892
+ ## [28] Train-mae=6.60842116673787
+ ## [29] Train-mae=6.60040124257406
+ ## [30] Train-mae=6.59264140658908
+ ## [31] Train-mae=6.58551020092434
+ ## [32] Train-mae=6.57864215638902
+ ## [33] Train-mae=6.57178926467896
+ ## [34] Train-mae=6.56495311525133
+ ## [35] Train-mae=6.55813185373942
+ ## [36] Train-mae=6.5513252152337
+ ## [37] Train-mae=6.54453214009603
+ ## [38] Train-mae=6.53775374094645
+ ## [39] Train-mae=6.53098879920112
+ ## [40] Train-mae=6.52423816257053
+ ## [41] Train-mae=6.51764053768582
+ ## [42] Train-mae=6.51121346155802
+ ## [43] Train-mae=6.5047902001275
+ ## [44] Train-mae=6.49837123023139
+ ## [45] Train-mae=6.49216641320123
+ ## [46] Train-mae=6.48598252402412
+ ## [47] Train-mae=6.4798010720147
+ ## [48] Train-mae=6.47362396452162
+ ## [49] Train-mae=6.46745183732775
+ ## [50] Train-mae=6.46128723356459
+ ```
+
+Congratulations! You've learned the basics for using MXNet in R. To learn how to use MXNet's advanced features, see the other tutorials.
+
+
+## Next Steps
+* [Classify Real-World Images with Pre-trained Model](http://mxnet.io/tutorials/r/classifyRealImageWithPretrainedModel.html)
+* [Handwritten Digits Classification Competition](http://mxnet.io/tutorials/r/mnistCompetition.html)
+* [Character Language Model using RNN](http://mxnet.io/tutorials/r/charRnnModel.html)
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/index.md b/docs/static_site/src/pages/api/r/docs/tutorials/index.md
new file mode 100644
index 000000000000..0e8293dd4b97
--- /dev/null
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/index.md
@@ -0,0 +1,23 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_landing_tutorials
+title: R Tutorials
+action: Get Started
+tag: r
+permalink: /api/r/docs/tutorials
+---
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/mnist_competition.md b/docs/static_site/src/pages/api/r/docs/tutorials/mnist_competition.md
new file mode 100644
index 000000000000..d8aec3782524
--- /dev/null
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/mnist_competition.md
@@ -0,0 +1,363 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: MNIST Competition
+is_tutorial: true
+tag: r
+permalink: /api/r/docs/tutorials/mnist_competition
+---
+
+Handwritten Digits Classification Competition
+=============================================
+
+[MNIST](http://yann.lecun.com/exdb/mnist/) is a handwritten digits image data set created by Yann LeCun. Every digit is represented by a 28 x 28 pixel image. It's become a standard data set for testing classifiers on simple image input. A neural network is a strong model for image classification tasks. There's a [long-term hosted competition](https://www.kaggle.com/c/digit-recognizer) on Kaggle using this data set.
+This tutorial shows how to use [MXNet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge.
+
+## Loading the Data
+
+First, let's download the data from [Kaggle](https://www.kaggle.com/c/digit-recognizer/data) and put it in the `data/` folder in your working directory.
+
+Now we can read it in R and convert it to matrices:
+
+
+ ```r
+ require(mxnet)
+ ```
+
+ ```
+ ## Loading required package: mxnet
+ ## Loading required package: methods
+ ```
+
+ ```r
+ train <- read.csv('data/train.csv', header=TRUE)
+ test <- read.csv('data/test.csv', header=TRUE)
+ train <- data.matrix(train)
+ test <- data.matrix(test)
+
+ train.x <- train[,-1]
+ train.y <- train[,1]
+ ```
+
+Every image is represented as a single row in train/test. The greyscale of each image falls in the range [0, 255]. Linearly transform it into [0,1] by using the following command:
+
+
+ ```r
+ train.x <- t(train.x/255)
+ test <- t(test/255)
+ ```
+Transpose the input matrix to npixel x nexamples, which is the column-major format accepted by MXNet (and the convention of R).
+
+Looking at the labels, the digits are fairly evenly distributed:
+
+
+ ```r
+ table(train.y)
+ ```
+
+ ```
+ ## train.y
+ ## 0 1 2 3 4 5 6 7 8 9
+ ## 4132 4684 4177 4351 4072 3795 4137 4401 4063 4188
+ ```
+
+## Configuring the Network
+
+Now that we have the data, let's configure the structure of our network:
+
+
+ ```r
+ data <- mx.symbol.Variable("data")
+ fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)
+ act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")
+ fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=64)
+ act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu")
+ fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10)
+ softmax <- mx.symbol.SoftmaxOutput(fc3, name="sm")
+ ```
+
+1. In `mxnet`, we use the data type `symbol` to configure the network. `data <- mx.symbol.Variable("data")` uses `data` to represent the input data, i.e., the input layer.
+2. We set the first hidden layer with `fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)`. This layer takes `data` as its input, and we specify its name and the number of hidden neurons.
+3. Activation is set with `act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")`. The activation function takes the output from the first hidden layer, `fc1`.
+4. The second hidden layer takes the result from `act1` as input, with its name as "fc2" and the number of hidden neurons as 64.
+5. The second activation is almost the same as `act1`, except we have a different input source and name.
+6. This generates the output layer. Because there are only 10 digits, we set the number of neurons to 10.
+7. Finally, we set the activation to softmax to get a probabilistic prediction.
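+
+If you want to double-check the configuration, the `graph.viz` helper from the mxnet package can draw the network; a quick sketch:
+
+ ```r
+ # Draw the computation graph of the network defined above
+ graph.viz(softmax)
+ ```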
+
+## Training
+
+We are almost ready for the training process. Before we start the computation, let's decide which device to use:
+
+
+ ```r
+ devices <- mx.cpu()
+ ```
+
+We assign CPU to `mxnet`. Now, you can run the following command to train the neural network! Note that `mx.set.seed` is the function that controls the random process in `mxnet`:
+
+
+ ```r
+ mx.set.seed(0)
+ model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y,
+ ctx=devices, num.round=10, array.batch.size=100,
+ learning.rate=0.07, momentum=0.9, eval.metric=mx.metric.accuracy,
+ initializer=mx.init.uniform(0.07),
+ epoch.end.callback=mx.callback.log.train.metric(100))
+ ```
+
+ ```
+ ## Start training with 1 devices
+ ## Batch [100] Train-accuracy=0.6563
+ ## Batch [200] Train-accuracy=0.777999999999999
+ ## Batch [300] Train-accuracy=0.827466666666665
+ ## Batch [400] Train-accuracy=0.855499999999999
+ ## [1] Train-accuracy=0.859832935560859
+ ## Batch [100] Train-accuracy=0.9529
+ ## Batch [200] Train-accuracy=0.953049999999999
+ ## Batch [300] Train-accuracy=0.955866666666666
+ ## Batch [400] Train-accuracy=0.957525000000001
+ ## [2] Train-accuracy=0.958309523809525
+ ## Batch [100] Train-accuracy=0.968
+ ## Batch [200] Train-accuracy=0.9677
+ ## Batch [300] Train-accuracy=0.9696
+ ## Batch [400] Train-accuracy=0.970650000000002
+ ## [3] Train-accuracy=0.970809523809526
+ ## Batch [100] Train-accuracy=0.973
+ ## Batch [200] Train-accuracy=0.974249999999999
+ ## Batch [300] Train-accuracy=0.976
+ ## Batch [400] Train-accuracy=0.977100000000003
+ ## [4] Train-accuracy=0.977452380952384
+ ## Batch [100] Train-accuracy=0.9834
+ ## Batch [200] Train-accuracy=0.981949999999999
+ ## Batch [300] Train-accuracy=0.981900000000001
+ ## Batch [400] Train-accuracy=0.982600000000003
+ ## [5] Train-accuracy=0.983000000000003
+ ## Batch [100] Train-accuracy=0.983399999999999
+ ## Batch [200] Train-accuracy=0.98405
+ ## Batch [300] Train-accuracy=0.985000000000001
+ ## Batch [400] Train-accuracy=0.985725000000003
+ ## [6] Train-accuracy=0.985952380952384
+ ## Batch [100] Train-accuracy=0.988999999999999
+ ## Batch [200] Train-accuracy=0.9876
+ ## Batch [300] Train-accuracy=0.988100000000001
+ ## Batch [400] Train-accuracy=0.988750000000003
+ ## [7] Train-accuracy=0.988880952380955
+ ## Batch [100] Train-accuracy=0.991999999999999
+ ## Batch [200] Train-accuracy=0.9912
+ ## Batch [300] Train-accuracy=0.990066666666668
+ ## Batch [400] Train-accuracy=0.990275000000003
+ ## [8] Train-accuracy=0.990452380952384
+ ## Batch [100] Train-accuracy=0.9937
+ ## Batch [200] Train-accuracy=0.99235
+ ## Batch [300] Train-accuracy=0.991966666666668
+ ## Batch [400] Train-accuracy=0.991425000000003
+ ## [9] Train-accuracy=0.991500000000003
+ ## Batch [100] Train-accuracy=0.9942
+ ## Batch [200] Train-accuracy=0.99245
+ ## Batch [300] Train-accuracy=0.992433333333334
+ ## Batch [400] Train-accuracy=0.992275000000002
+ ## [10] Train-accuracy=0.992380952380955
+ ```
+
+## Making a Prediction and Submitting to the Competition
+
+To make a prediction, type:
+
+
+ ```r
+ preds <- predict(model, test)
+ dim(preds)
+ ```
+
+ ```
+ ## [1] 10 28000
+ ```
+
+It is a matrix with 10 rows and 28000 columns, containing the class probabilities from the output layer for each test image. To extract the most likely label for each column, use `max.col` on the transpose:
+
+
+ ```r
+ pred.label <- max.col(t(preds)) - 1
+ table(pred.label)
+ ```
+
+ ```
+ ## pred.label
+ ## 0 1 2 3 4 5 6 7 8 9
+ ## 2818 3195 2744 2767 2683 2596 2798 2790 2784 2825
+ ```
+
+With a little extra effort to modify the .csv format, our submission is ready for the competition!
+
+
+ ```r
+ submission <- data.frame(ImageId=1:ncol(test), Label=pred.label)
+ write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE)
+ ```
+
+## LeNet
+
+Now let's use a new network structure: [LeNet](http://yann.lecun.com/exdb/lenet/). It was proposed by Yann LeCun for recognizing handwritten digits. We'll demonstrate how to construct and train LeNet in `mxnet`.
+
+First, we construct the network:
+
+
+```r
+# input
+data <- mx.symbol.Variable('data')
+# first conv
+conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=20)
+tanh1 <- mx.symbol.Activation(data=conv1, act_type="tanh")
+pool1 <- mx.symbol.Pooling(data=tanh1, pool_type="max",
+ kernel=c(2,2), stride=c(2,2))
+# second conv
+conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=50)
+tanh2 <- mx.symbol.Activation(data=conv2, act_type="tanh")
+pool2 <- mx.symbol.Pooling(data=tanh2, pool_type="max",
+ kernel=c(2,2), stride=c(2,2))
+# first fullc
+flatten <- mx.symbol.Flatten(data=pool2)
+fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=500)
+tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh")
+# second fullc
+fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10)
+# loss
+lenet <- mx.symbol.SoftmaxOutput(data=fc2)
+```
+
+Then let's reshape the matrices into arrays:
+
+
+```r
+train.array <- train.x
+dim(train.array) <- c(28, 28, 1, ncol(train.x))
+test.array <- test
+dim(test.array) <- c(28, 28, 1, ncol(test))
+```
+
+We want to compare training speed on different devices, so define the devices:
+
+
+```r
+n.gpu <- 1
+device.cpu <- mx.cpu()
+device.gpu <- lapply(0:(n.gpu-1), function(i) {
+ mx.gpu(i)
+})
+```
+
+We can pass a list of devices to ask MXNet to train on multiple GPUs (you can do this for CPUs,
+but because internal computation of CPUs is already multi-threaded, there is less gain than with using GPUs).
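+
+For instance, a context list covering several GPUs can be built with the same `lapply` pattern used above and passed as `ctx`; a small sketch (it assumes the machine actually has four GPUs):
+
+```r
+# Build a context list for GPUs 0-3; pass it as `ctx` to mx.model.FeedForward.create
+devices <- lapply(0:3, function(i) mx.gpu(i))
+```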
+
+Start by training on the CPU first. Because this takes a bit of time, we run it for just one iteration.
+
+
+ ```r
+ mx.set.seed(0)
+ tic <- proc.time()
+ model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y,
+ ctx=device.cpu, num.round=1, array.batch.size=100,
+ learning.rate=0.05, momentum=0.9, wd=0.00001,
+ eval.metric=mx.metric.accuracy,
+ epoch.end.callback=mx.callback.log.train.metric(100))
+ ```
+
+ ```
+ ## Start training with 1 devices
+ ## Batch [100] Train-accuracy=0.1066
+ ## Batch [200] Train-accuracy=0.16495
+ ## Batch [300] Train-accuracy=0.401766666666667
+ ## Batch [400] Train-accuracy=0.537675
+ ## [1] Train-accuracy=0.557136038186157
+ ```
+
+ ```r
+ print(proc.time() - tic)
+ ```
+
+ ```
+ ## user system elapsed
+ ## 130.030 204.976 83.821
+ ```
+
+Train on a GPU:
+
+
+ ```r
+ mx.set.seed(0)
+ tic <- proc.time()
+ model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y,
+ ctx=device.gpu, num.round=5, array.batch.size=100,
+ learning.rate=0.05, momentum=0.9, wd=0.00001,
+ eval.metric=mx.metric.accuracy,
+ epoch.end.callback=mx.callback.log.train.metric(100))
+ ```
+
+ ```
+ ## Start training with 1 devices
+ ## Batch [100] Train-accuracy=0.1066
+ ## Batch [200] Train-accuracy=0.1596
+ ## Batch [300] Train-accuracy=0.3983
+ ## Batch [400] Train-accuracy=0.533975
+ ## [1] Train-accuracy=0.553532219570405
+ ## Batch [100] Train-accuracy=0.958
+ ## Batch [200] Train-accuracy=0.96155
+ ## Batch [300] Train-accuracy=0.966100000000001
+ ## Batch [400] Train-accuracy=0.968550000000003
+ ## [2] Train-accuracy=0.969071428571432
+ ## Batch [100] Train-accuracy=0.977
+ ## Batch [200] Train-accuracy=0.97715
+ ## Batch [300] Train-accuracy=0.979566666666668
+ ## Batch [400] Train-accuracy=0.980900000000003
+ ## [3] Train-accuracy=0.981309523809527
+ ## Batch [100] Train-accuracy=0.9853
+ ## Batch [200] Train-accuracy=0.985899999999999
+ ## Batch [300] Train-accuracy=0.986966666666668
+ ## Batch [400] Train-accuracy=0.988150000000002
+ ## [4] Train-accuracy=0.988452380952384
+ ## Batch [100] Train-accuracy=0.990199999999999
+ ## Batch [200] Train-accuracy=0.98995
+ ## Batch [300] Train-accuracy=0.990600000000001
+ ## Batch [400] Train-accuracy=0.991325000000002
+ ## [5] Train-accuracy=0.991523809523812
+ ```
+
+ ```r
+ print(proc.time() - tic)
+ ```
+
+ ```
+ ## user system elapsed
+ ## 9.288 1.680 6.889
+ ```
+
+By using a GPU processor, we significantly speed up training!
+Now, we can submit the result to Kaggle to see the improvement of our ranking!
+
+
+ ```r
+ preds <- predict(model, test.array)
+ pred.label <- max.col(t(preds)) - 1
+ submission <- data.frame(ImageId=1:ncol(test), Label=pred.label)
+ write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE)
+ ```
+
+![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/knitr/mnistCompetition-kaggle-submission.png)
+
+## Next Steps
+* [Character Language Model using RNN](http://mxnet.io/tutorials/r/charRnnModel.html)
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/multi_dim_lstm.md b/docs/static_site/src/pages/api/r/docs/tutorials/multi_dim_lstm.md
new file mode 100644
index 000000000000..46c6f22ba849
--- /dev/null
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/multi_dim_lstm.md
@@ -0,0 +1,327 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: LSTM Time Series
+is_tutorial: true
+tag: r
+permalink: /api/r/docs/tutorials/multi_dim_lstm
+---
+
+
+LSTM Time Series Example
+========================
+
+This tutorial shows how to use an LSTM model with multivariate data and generate predictions from it with MXNet-R. For demonstration purposes, we use the open-source [Beijing PM2.5 pollution dataset](https://archive.ics.uci.edu/ml/datasets/Beijing+PM2.5+Data), which contains five years of air-quality measurements recorded at the US embassy in Beijing, China. Our goal is to predict the pollution level (PM2.5 concentration) in the city.
+
+Dataset attribution: "PM2.5 data of US Embassy in Beijing".
+
+```
+Dataset description:
+No: row number
+year: year of data in this row
+month: month of data in this row
+day: day of data in this row
+hour: hour of data in this row
+pm2.5: PM2.5 concentration
+DEWP: Dew Point
+TEMP: Temperature
+PRES: Pressure
+cbwd: Combined wind direction
+Iws: Cumulated wind speed
+Is: Cumulated hours of snow
+Ir: Cumulated hours of rain
+```
+
+We use past PM2.5 concentration, dew point, temperature, pressure, wind speed, snow and rain to predict
+PM2.5 concentration levels.
+
+Load and pre-process the data
+---------
+The first step is to load the data and preprocess it. It is assumed that the data has been downloaded as a .csv file, data.csv, from the [pollution dataset](https://archive.ics.uci.edu/ml/datasets/Beijing+PM2.5+Data).
+
+ ```r
+## Loading required packages
+library("readr")
+library("dplyr")
+library("mxnet")
+library("abind")
+ ```
+
+
+
+ ```r
+## Preprocessing steps
+Data <- read.csv(file = "/Users/khedia/Downloads/data.csv",
+ header = TRUE,
+ sep = ",")
+
+## Extract the features used as time-series variables: we keep the
+## pollution, dew point, temperature, pressure, wind speed, snowfall and rainfall columns
+df <- data.frame(Data$pm2.5,
+ Data$DEWP,
+ Data$TEMP,
+ Data$PRES,
+ Data$Iws,
+ Data$Is,
+ Data$Ir)
+df[is.na(df)] <- 0
+
+## Now we normalise each feature to the range (0, 1)
+df <- matrix(as.matrix(df),
+ ncol = ncol(df),
+ dimnames = NULL)
+
+rangenorm <- function(x) {
+ (x - min(x))/(max(x) - min(x))
+}
+df <- apply(df, 2, rangenorm)
+df <- t(df)
+ ```
+To use multidimensional data with MXNet-R, we need to convert the training data to the form
+(n_dim x seq_len x num_samples). For the one-to-one RNN flavour, labels should be of the form (seq_len x num_samples), while for the many-to-one flavour, labels should be of the form (1 x num_samples). Please note that MXNet-R currently supports only these two flavours of RNN.
+We use n_dim = 7, seq_len = 100, and num_samples = 430: the dataset gives us 430 samples, each 100 time steps long, and there are seven input time series, so each input has dimension seven at every time step.
+
+
+```r
+n_dim <- 7
+seq_len <- 100
+num_samples <- 430
+
+## extract only required data from dataset
+trX <- df[1:n_dim, 25:(24 + (seq_len * num_samples))]
+
+## the label data(next PM2.5 concentration) should be one time step
+## ahead of the current PM2.5 concentration
+trY <- df[1, 26:(25 + (seq_len * num_samples))]
+
+## reshape the matrices in the format acceptable by MXNetR RNNs
+trainX <- trX
+dim(trainX) <- c(n_dim, seq_len, num_samples)
+trainY <- trY
+dim(trainY) <- c(seq_len, num_samples)
+```
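+
+A quick dimension check helps confirm the reshaping matches the one-to-one layout described above; a short sketch:
+
+```r
+dim(trainX)  # 7 100 430  (n_dim x seq_len x num_samples)
+dim(trainY)  # 100 430    (seq_len x num_samples)
+```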
+
+
+
+Defining and training the network
+---------
+
+```r
+batch.size <- 32
+
+# take first 300 samples for training - remaining 100 for evaluation
+train_ids <- 1:300
+eval_ids <- 301:400
+
+## The number of samples used for training and evaluation is arbitrary; a few
+## samples are kept aside for testing purposes. Create the data iterators:
+train.data <- mx.io.arrayiter(data = trainX[, , train_ids, drop = F],
+ label = trainY[, train_ids],
+ batch.size = batch.size, shuffle = TRUE)
+
+eval.data <- mx.io.arrayiter(data = trainX[, , eval_ids, drop = F],
+ label = trainY[, eval_ids],
+ batch.size = batch.size, shuffle = FALSE)
+
+## Create the symbol for RNN
+symbol <- rnn.graph(num_rnn_layer = 1,
+ num_hidden = 5,
+ input_size = NULL,
+ num_embed = NULL,
+ num_decode = 1,
+ masking = F,
+ loss_output = "linear",
+ dropout = 0.2,
+ ignore_label = -1,
+ cell_type = "lstm",
+ output_last_state = T,
+ config = "one-to-one")
+
+
+
+mx.metric.mse.seq <- mx.metric.custom("MSE", function(label, pred) {
+ label = mx.nd.reshape(label, shape = -1)
+ pred = mx.nd.reshape(pred, shape = -1)
+ res <- mx.nd.mean(mx.nd.square(label - pred))
+ return(as.array(res))
+})
+
+
+
+ctx <- mx.cpu()
+
+initializer <- mx.init.Xavier(rnd_type = "gaussian",
+ factor_type = "avg",
+ magnitude = 3)
+
+optimizer <- mx.opt.create("adadelta",
+ rho = 0.9,
+ eps = 1e-05,
+ wd = 1e-06,
+ clip_gradient = 1,
+ rescale.grad = 1/batch.size)
+
+logger <- mx.metric.logger()
+epoch.end.callback <- mx.callback.log.train.metric(period = 10,
+ logger = logger)
+
+## train the network
+system.time(model <- mx.model.buckets(symbol = symbol,
+ train.data = train.data,
+ eval.data = eval.data,
+ num.round = 100,
+ ctx = ctx,
+ verbose = TRUE,
+ metric = mx.metric.mse.seq,
+ initializer = initializer,
+ optimizer = optimizer,
+ batch.end.callback = NULL,
+ epoch.end.callback = epoch.end.callback))
+```
+Output:
+```
+Start training with 1 devices
+[1] Train-MSE=0.197570244409144
+[1] Validation-MSE=0.0153861071448773
+[2] Train-MSE=0.0152517843060195
+[2] Validation-MSE=0.0128299412317574
+[3] Train-MSE=0.0124418652616441
+[3] Validation-MSE=0.010827143676579
+[4] Train-MSE=0.0105128229130059
+[4] Validation-MSE=0.00940261723008007
+[5] Train-MSE=0.00914482437074184
+[5] Validation-MSE=0.00830172537826002
+[6] Train-MSE=0.00813581114634871
+[6] Validation-MSE=0.00747016374953091
+[7] Train-MSE=0.00735094994306564
+[7] Validation-MSE=0.00679832429159433
+[8] Train-MSE=0.00672049634158611
+[8] Validation-MSE=0.00623159145470709
+[9] Train-MSE=0.00620287149213254
+[9] Validation-MSE=0.00577476259786636
+[10] Train-MSE=0.00577280316501856
+[10] Validation-MSE=0.00539038667920977
+..........
+..........
+[91] Train-MSE=0.00177705133100972
+[91] Validation-MSE=0.00154715491225943
+[92] Train-MSE=0.00177639147732407
+[92] Validation-MSE=0.00154592350008897
+[93] Train-MSE=0.00177577760769054
+[93] Validation-MSE=0.00154474508599378
+[94] Train-MSE=0.0017752077546902
+[94] Validation-MSE=0.0015436161775142
+[95] Train-MSE=0.00177468206966296
+[95] Validation-MSE=0.00154253660002723
+[96] Train-MSE=0.00177419915562496
+[96] Validation-MSE=0.00154150440357625
+[97] Train-MSE=0.0017737578949891
+[97] Validation-MSE=0.00154051734716631
+[98] Train-MSE=0.00177335749613121
+[98] Validation-MSE=0.00153957353904843
+[99] Train-MSE=0.00177299699280411
+[99] Validation-MSE=0.00153867155313492
+[100] Train-MSE=0.00177267640829086
+[100] Validation-MSE=0.00153781197150238
+
+ user system elapsed
+ 21.937 1.914 13.402
+```
+We can see how mean squared error varies with epochs below.
+
+![png](https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/r/images/loss.png?raw=true)
+
+Inference on the network
+---------
+Now we have trained the network. Let's use it for inference.
+
+```r
+## We extract the state symbols for RNN
+internals <- model$symbol$get.internals()
+sym_state <- internals$get.output(which(internals$outputs %in% "RNN_state"))
+sym_state_cell <- internals$get.output(which(internals$outputs %in% "RNN_state_cell"))
+sym_output <- internals$get.output(which(internals$outputs %in% "loss_output"))
+symbol <- mx.symbol.Group(sym_output, sym_state, sym_state_cell)
+
+## We will predict 100 time stamps for the 401st sample (the first sample not used for training or evaluation)
+pred_length <- 100
+predicted <- numeric()
+
+## We pass the 400th sample through the network to obtain the RNN states and use them
+## for predicting the next 100 time stamps.
+data <- mx.nd.array(trainX[, , 400, drop = F])
+label <- mx.nd.array(trainY[, 400, drop = F])
+
+
+## We create a data iterator for the input. Note that a label is required to create the
+## iterator but is not used during inference, so dummy values work as well.
+infer.data <- mx.io.arrayiter(data = data,
+ label = label,
+ batch.size = 1,
+ shuffle = FALSE)
+
+infer <- mx.infer.rnn.one(infer.data = infer.data,
+ symbol = symbol,
+ arg.params = model$arg.params,
+ aux.params = model$aux.params,
+ input.params = NULL,
+ ctx = ctx)
+## Once we have the RNN states for the above time series, we predict the next 100 steps,
+## which correspond to our 401st sample.
+
+actual <- trainY[, 401]
+
+## Now we iterate one by one to generate each of the next timestamp pollution values
+
+for (i in 1:pred_length) {
+
+ data <- mx.nd.array(trainX[, i, 401, drop = F])
+ label <- mx.nd.array(trainY[i, 401, drop = F])
+ infer.data <- mx.io.arrayiter(data = data,
+ label = label,
+ batch.size = 1,
+ shuffle = FALSE)
+ ## note that we use rnn state values from previous iterations here
+ infer <- mx.infer.rnn.one(infer.data = infer.data,
+ symbol = symbol,
+ ctx = ctx,
+ arg.params = model$arg.params,
+ aux.params = model$aux.params,
+ input.params = list(rnn.state = infer[[2]],
+ rnn.state.cell = infer[[3]]))
+
+ pred <- infer[[1]]
+ predicted <- c(predicted, as.numeric(as.array(pred)))
+
+}
+
+```
+Now `predicted` contains the 100 predicted values. We use ggplot to plot the actual and predicted values, as shown below.
+
+![png](https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/r/images/sample_401.png?raw=true)
+
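+The plot above can be reproduced with a few lines of ggplot2; a minimal sketch (it assumes the `actual` and `predicted` vectors computed above and that ggplot2 is installed):
+
+```r
+library(ggplot2)
+
+plot_df <- data.frame(step = rep(1:pred_length, 2),
+                      value = c(actual, predicted),
+                      series = rep(c("actual", "predicted"), each = pred_length))
+ggplot(plot_df, aes(x = step, y = value, colour = series)) + geom_line()
+```
+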
+We also repeated the above experiment to generate the next 100 values for the 301st time series and obtained the following results.
+
+![png](https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/r/images/sample_301.png?raw=true)
+
+The above tutorial is just for demonstration purposes and has not been tuned extensively for accuracy.
+
+For more tutorials on MXNet-R, head over to the [MXNet-R tutorials](https://mxnet.incubator.apache.org/tutorials/r/index.html).
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/ndarray.md b/docs/static_site/src/pages/api/r/docs/tutorials/ndarray.md
new file mode 100644
index 000000000000..20098a04e341
--- /dev/null
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/ndarray.md
@@ -0,0 +1,231 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: NDArray
+is_tutorial: true
+tag: r
+permalink: /api/r/docs/tutorials/ndarray
+---
+
+
+# NDArray: Vectorized Tensor Computations on CPUs and GPUs
+
+`NDArray` is the basic vectorized operation unit in MXNet for matrix and tensor computations.
+Users can perform the usual calculations as on an R array, but with two additional features:
+
+
+
+- Multiple devices: All operations can be run on various devices including
+CPUs and GPUs.
+
+
+- Automatic parallelization: All operations are automatically executed in
+ parallel with each other.
+
+## Create and Initialize
+
+Let"s create `NDArray` on either a GPU or a CPU:
+
+
+```r
+require(mxnet)
+```
+
+```
+## Loading required package: mxnet
+## Loading required package: methods
+```
+
+```r
+a <- mx.nd.zeros(c(2, 3)) # create a 2-by-3 matrix on cpu
+b <- mx.nd.zeros(c(2, 3), mx.cpu()) # create a 2-by-3 matrix on cpu
+# c <- mx.nd.zeros(c(2, 3), mx.gpu(0)) # create a 2-by-3 matrix on gpu 0, if you have CUDA enabled.
+```
+
+Typically for CUDA-enabled devices, the device id of a GPU starts from 0.
+That's why we passed in 0 to the GPU id.
+
+We can initialize an `NDArray` object in various ways:
+
+
+```r
+a <- mx.nd.ones(c(4, 4))
+b <- mx.rnorm(c(4, 5))
+c <- mx.nd.array(1:5)
+```
+
+To check the numbers in an `NDArray`, we can simply run:
+
+
+```r
+a <- mx.nd.ones(c(2, 3))
+b <- as.array(a)
+class(b)
+```
+
+```
+## [1] "matrix"
+```
+
+```r
+b
+```
+
+```
+## [,1] [,2] [,3]
+## [1,] 1 1 1
+## [2,] 1 1 1
+```
+
+## Performing Basic Operations
+
+### Element-wise Operations
+
+You can perform element-wise operations on `NDArray` objects, as follows:
+
+
+```r
+a <- mx.nd.ones(c(2, 4)) * 2
+b <- mx.nd.ones(c(2, 4)) / 8
+as.array(a)
+```
+
+```
+## [,1] [,2] [,3] [,4]
+## [1,] 2 2 2 2
+## [2,] 2 2 2 2
+```
+
+```r
+as.array(b)
+```
+
+```
+## [,1] [,2] [,3] [,4]
+## [1,] 0.125 0.125 0.125 0.125
+## [2,] 0.125 0.125 0.125 0.125
+```
+
+```r
+c <- a + b
+as.array(c)
+```
+
+```
+## [,1] [,2] [,3] [,4]
+## [1,] 2.125 2.125 2.125 2.125
+## [2,] 2.125 2.125 2.125 2.125
+```
+
+```r
+d <- c / a - 5
+as.array(d)
+```
+
+```
+## [,1] [,2] [,3] [,4]
+## [1,] -3.9375 -3.9375 -3.9375 -3.9375
+## [2,] -3.9375 -3.9375 -3.9375 -3.9375
+```
+
+If two `NDArray`s are located on different devices, we need to explicitly move them to the same one. For instance:
+
+
+```r
+a <- mx.nd.ones(c(2, 3)) * 2
+b <- mx.nd.ones(c(2, 3), mx.gpu()) / 8
+c <- mx.nd.copyto(a, mx.gpu()) * b
+as.array(c)
+```
+
+### Loading and Saving
+
+You can save a list of `NDArray` object to your disk with `mx.nd.save`:
+
+
+```r
+a <- mx.nd.ones(c(2, 3))
+mx.nd.save(list(a), "temp.ndarray")
+```
+
+You can load it back easily:
+
+
+```r
+a <- mx.nd.load("temp.ndarray")
+as.array(a[[1]])
+```
+
+```
+## [,1] [,2] [,3]
+## [1,] 1 1 1
+## [2,] 1 1 1
+```
+
+We can directly save data to and load it from a distributed file system, such as Amazon S3 and HDFS:
+
+
+```r
+mx.nd.save(list(a), "s3://mybucket/mydata.bin")
+mx.nd.save(list(a), "hdfs///users/myname/mydata.bin")
+```
+
+## Automatic Parallelization
+
+`NDArray` can automatically execute operations in parallel. Automatic parallelization is useful when
+using multiple resources, such as CPUs, GPUs, and CPU-to-GPU memory bandwidth.
+
+For example, if we write `a <- a + 1` followed by `b <- b + 1`, and `a` is on a CPU while
+`b` is on a GPU, executing them in parallel improves
+efficiency. Furthermore, because copying data between CPUs and GPUs is also expensive, running such copies in parallel with other computations further increases efficiency.
+
+It's hard to spot by eye which parts of the code can be executed in parallel. In the
+following example, `a <- a + 1` and `c <- c * 3` can be executed in parallel, but `a <- a + 1` and
+`b <- b * 3` must be executed sequentially.
+
+
+```r
+a <- mx.nd.ones(c(2,3))
+b <- a
+c <- mx.nd.copyto(a, mx.cpu())
+a <- a + 1
+b <- b * 3
+c <- c * 3
+```
+
+Luckily, MXNet automatically resolves the dependencies and
+executes operations in parallel correctly. This allows us to write our program as if it were single-threaded; MXNet
+automatically dispatches the work to multiple devices.
+
+MXNet achieves this with lazy evaluation. Each operation is issued to an
+internal engine and then returns immediately. For example, if we run `a <- a + 1`, it
+returns right after pushing the plus operator to the engine. This
+asynchronous processing allows us to push more operators to the engine, which determines
+the read and write dependencies and the best way to execute the operations in
+parallel.
+
+The actual computations are finished only when we ask for the results somewhere else, for example by calling `as.array(a)` or `mx.nd.save(a, "temp.dat")`. Therefore, to write highly parallelized code, we only need to postpone the point at which we need
+the results.
+
+## Next Steps
+* [Symbol](http://mxnet.io/tutorials/r/symbol.html)
+* [Write and use callback functions](http://mxnet.io/tutorials/r/CallbackFunction.html)
+* [Neural Networks with MXNet in Five Minutes](http://mxnet.io/tutorials/r/fiveMinutesNeuralNetwork.html)
+* [Classify Real-World Images with Pre-trained Model](http://mxnet.io/tutorials/r/classifyRealImageWithPretrainedModel.html)
+* [Handwritten Digits Classification Competition](http://mxnet.io/tutorials/r/mnistCompetition.html)
+* [Character Language Model using RNN](http://mxnet.io/tutorials/r/charRnnModel.html)
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/symbol.md b/docs/static_site/src/pages/api/r/docs/tutorials/symbol.md
new file mode 100644
index 000000000000..c7c7b968b630
--- /dev/null
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/symbol.md
@@ -0,0 +1,154 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Symbol
+is_tutorial: true
+tag: r
+permalink: /api/r/docs/tutorials/symbol
+---
+
+# Symbol and Automatic Differentiation
+
+`NDArray` is the basic computation unit, but to construct neural networks we also need a way to compose such computations. MXNet provides a symbolic interface, named Symbol, to do this. Symbol combines both flexibility and efficiency.
+
+## Basic Composition of Symbols
+
+The following code creates a two-layer perceptron network:
+
+
+```r
+require(mxnet)
+net <- mx.symbol.Variable("data")
+net <- mx.symbol.FullyConnected(data=net, name="fc1", num_hidden=128)
+net <- mx.symbol.Activation(data=net, name="relu1", act_type="relu")
+net <- mx.symbol.FullyConnected(data=net, name="fc2", num_hidden=64)
+net <- mx.symbol.Softmax(data=net, name="out")
+class(net)
+```
+
+```
+## [1] "Rcpp_MXSymbol"
+## attr(,"package")
+## [1] "mxnet"
+```
+
+Each symbol takes a (unique) string name. *Variable* often defines the inputs,
+or free variables. Other symbols take a symbol as the input (*data*),
+and may accept other hyperparameters, such as the number of hidden neurons (*num_hidden*)
+or the activation type (*act_type*).
+
+A symbol can be viewed as a function that takes several arguments, whose
+names are automatically generated and can be retrieved with the following command:
+
+
+```r
+arguments(net)
+```
+
+```
+## [1] "data" "fc1_weight" "fc1_bias" "fc2_weight" "fc2_bias"
+## [6] "out_label"
+```
+
+The arguments are the parameters needed by each symbol:
+
+- *data*: Input data needed by the variable *data*
+- *fc1_weight* and *fc1_bias*: The weight and bias for the first fully connected layer, *fc1*
+- *fc2_weight* and *fc2_bias*: The weight and bias for the second fully connected layer, *fc2*
+- *out_label*: The label needed by the loss
+
+We can also specify the automatically generated names explicitly:
+
+
+```r
+data <- mx.symbol.Variable("data")
+w <- mx.symbol.Variable("myweight")
+net <- mx.symbol.FullyConnected(data=data, weight=w, name="fc1", num_hidden=128)
+arguments(net)
+```
+
+```
+## [1] "data" "myweight" "fc1_bias"
+```
+
+## More Complicated Composition of Symbols
+
+MXNet provides well-optimized symbols for
+commonly used layers in deep learning. You can also define new operators
+in Python. The following example first performs an element-wise add between two
+symbols, then feeds the result to the fully connected operator:
+
+
+```r
+lhs <- mx.symbol.Variable("data1")
+rhs <- mx.symbol.Variable("data2")
+net <- mx.symbol.FullyConnected(data=lhs + rhs, name="fc1", num_hidden=128)
+arguments(net)
+```
+
+```
+## [1] "data1" "data2" "fc1_weight" "fc1_bias"
+```
+
+We can construct a symbol more flexibly than by using the single
+forward composition, for example:
+
+
+```r
+net <- mx.symbol.Variable("data")
+net <- mx.symbol.FullyConnected(data=net, name="fc1", num_hidden=128)
+net2 <- mx.symbol.Variable("data2")
+net2 <- mx.symbol.FullyConnected(data=net2, name="net2", num_hidden=128)
+composed.net <- mx.apply(net, data=net2, name="compose")
+arguments(composed.net)
+```
+
+```
+## [1] "data2" "net2_weight" "net2_bias" "fc1_weight" "fc1_bias"
+```
+
+In this example, *net* is used as a function and applied to the existing symbol
+*net2*: in the resulting *composed.net*, the original argument *data* is replaced
+by *net2*.
+
+## Training a Neural Net
+
+The [model API](https://github.com/apache/incubator-mxnet/blob/master/R-package/R/model.R) is a thin wrapper around the symbolic executors to support neural net training.
+
+We encourage you to read [Symbolic Configuration and Execution in Pictures for the Python package](../../api/python/symbol_in_pictures/symbol_in_pictures.md) for a detailed, illustrated explanation of these concepts.
+
+## How Efficient Is the Symbolic API?
+
+The Symbolic API brings the efficient C++
+operations of powerful toolkits, such as CXXNet and Caffe, together with the
+flexible dynamic NDArray operations. All of the memory and computation resources are
+allocated statically during bind operations, to maximize runtime performance and memory
+utilization.
+
+The coarse-grained operators are equivalent to CXXNet layers, which are
+extremely efficient. We also provide fine-grained operators for more flexible
+composition. Because MXNet does more in-place memory allocation, it can
+be more memory efficient than CXXNet while achieving the same runtime performance with
+greater flexibility.
+
+## Next Steps
+* [Write and use callback functions](http://mxnet.io/tutorials/r/CallbackFunction.html)
+* [Neural Networks with MXNet in Five Minutes](http://mxnet.io/tutorials/r/fiveMinutesNeuralNetwork.html)
+* [Classify Real-World Images with Pre-trained Model](http://mxnet.io/tutorials/r/classifyRealImageWithPretrainedModel.html)
+* [Handwritten Digits Classification Competition](http://mxnet.io/tutorials/r/mnistCompetition.html)
+* [Character Language Model using RNN](http://mxnet.io/tutorials/r/charRnnModel.html)
diff --git a/docs/static_site/src/pages/api/r/index.md b/docs/static_site/src/pages/api/r/index.md
new file mode 100644
index 000000000000..8afdb29fde69
--- /dev/null
+++ b/docs/static_site/src/pages/api/r/index.md
@@ -0,0 +1,52 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: R Guide
+action: Get Started
+action_url: /get_started
+permalink: /api/r
+tag: r
+---
+
+
+# MXNet - R API
+
+See the [MXNet R Reference Manual](https://s3.amazonaws.com/mxnet-prod/docs/R/mxnet-r-reference-manual.pdf).
+
+MXNet supports the R programming language. The MXNet R package brings flexible and efficient GPU
+computing and state-of-the-art deep learning to R. It enables you to write seamless tensor/matrix computation with multiple GPUs in R. It also lets you construct and customize state-of-the-art deep learning models in R,
+ and apply them to tasks such as image classification and data science challenges.
+
+You can perform tensor or matrix computation in R:
+
+```r
+ > require(mxnet)
+ Loading required package: mxnet
+ > a <- mx.nd.ones(c(2,3))
+ > a
+ [,1] [,2] [,3]
+ [1,] 1 1 1
+ [2,] 1 1 1
+ > a + 1
+ [,1] [,2] [,3]
+ [1,] 2 2 2
+ [2,] 2 2 2
+```
+## Resources
+
+* [MXNet R Reference Manual](https://s3.amazonaws.com/mxnet-prod/docs/R/mxnet-r-reference-manual.pdf)
diff --git a/docs/static_site/src/pages/api/scala/docs/tutorials/char_lstm.md b/docs/static_site/src/pages/api/scala/docs/tutorials/char_lstm.md
new file mode 100644
index 000000000000..6125ad77f5b1
--- /dev/null
+++ b/docs/static_site/src/pages/api/scala/docs/tutorials/char_lstm.md
@@ -0,0 +1,533 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Char-LSTM
+is_tutorial: true
+tag: scala
+permalink: /api/scala/docs/tutorials/char_lstm
+---
+
+# Developing a Character-level Language model
+
+This tutorial shows how to train a character-level language model with a multilayer recurrent neural network (RNN) using Scala. This model takes one text file as input and trains an RNN that learns to predict the next character in the sequence. In this tutorial, you train a multilayer LSTM (Long Short-Term Memory) network that generates relevant text using Barack Obama's speech patterns.
+
+There are many documents that explain LSTM concepts. If you aren't familiar with LSTM, refer to the following before you proceed:
+- Christopher Olah's [Understanding LSTM blog post](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)
+- [Training a LSTM char-rnn in Julia to Generate Random Sentences](http://dmlc.ml/mxnet/2015/11/15/char-lstm-in-julia.html)
+- [Bucketing in MXNet in Python](https://github.com/dmlc/mxnet-notebooks/blob/master/python/tutorials/char_lstm.ipynb)
+- [Bucketing in MXNet](http://mxnet.io/faq/bucketing.html)
+
+## How to Use This Tutorial
+
+There are three ways to use this tutorial:
+
+1) Run it by copying the provided code snippets and pasting them into the Scala command line, making the appropriate changes to the input file path.
+
+2) Reuse the code by making changes to relevant parameters and running it from command line.
+
+3) [Run the source code directly](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn) by running the [provided scripts](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/scripts/rnn).
+
+To run the scripts:
+- Build and train the model with the [run_train_charrnn.sh script](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/scripts/rnn/run_train_charrnn.sh). Edit the script as follows:
+
+Edit the CLASS_PATH variable in the script to include your operating system-specific folder (e.g., linux-x86_64-cpu/linux-x86_64-gpu/osx-x86_64-cpu) in the path. Run the script with the following command:
+
+```bash
+
+ bash run_train_charrnn.sh
+
+ e.g.,
+ bash run_train_charrnn.sh -1 ./datas/obama.txt ./models/obama
+
+```
+
+- Run inference with the [run_test_charrnn.sh script](https://github.com/dmlc/mxnet/blob/master/scala-package/examples/scripts/rnn/run_test_charrnn.sh). Edit the script as follows:
+
+Edit the CLASS_PATH variable in the script to include your operating system-specific folder (e.g., linux-x86_64-cpu/linux-x86_64-gpu/osx-x86_64-cpu) in the path. Run the script with the following command:
+
+```bash
+
+ bash run_test_charrnn.sh
+
+ e.g.,
+ bash run_test_charrnn.sh ./datas/obama.txt ./models/obama
+```
+
+In this tutorial, you will accomplish the following:
+
+- Build an LSTM network that learns speech patterns from Barack Obama's speeches at the character level. At each time interval, the input is a character.
+- Clean up the dataset.
+- Train a model.
+- Fit the model.
+- Build the inference model.
+
+## Prerequisites
+
+To complete this tutorial, setup and run the scala interpreter by following the [instructions](https://mxnet.incubator.apache.org/install/scala_setup.html#interpreter).
+
+## Download the Data
+
+First, download the data, which contains Barack Obama's speeches. The data is stored in a file called obama.txt and is available on [mxnet.io](http://data.mxnet.io/data/char_lstm.zip)
+
+To download the data which contains Barack Obama's speeches:
+
+1) Download the dataset with the following command:
+
+ ```bash
+ wget http://data.mxnet.io/data/char_lstm.zip
+ ```
+
+2) Unzip the dataset with the following command:
+
+ ```bash
+ unzip char_lstm.zip -d char_lstm/
+ ```
+
+3) The downloaded data contains President Obama's speeches. You can have a sneak peek at the dataset with the following command:
+
+ ```bash
+ head -10 obama.txt
+ ```
+
+Output:
+```
+ Call to Renewal Keynote Address Call to Renewal Pt 1Call to Renewal Part 2 TOPIC: Our Past, Our Future & Vision for America June
+ 28, 2006 Call to Renewal' Keynote Address Complete Text Good morning. I appreciate the opportunity to speak here at the Call to R
+ enewal's Building a Covenant for a New America conference. I've had the opportunity to take a look at your Covenant for a New Ame
+ rica. It is filled with outstanding policies and prescriptions for much of what ails this country. So I'd like to congratulate yo
+ u all on the thoughtful presentations you've given so far about poverty and justice in America, and for putting fire under the fe
+ et of the political leadership here in Washington.But today I'd like to talk about the connection between religion and politics a
+ nd perhaps offer some thoughts about how we can sort through some of the often bitter arguments that we've been seeing over the l
+ ast several years.I do so because, as you all know, we can affirm the importance of poverty in the Bible; and we can raise up and
+ pass out this Covenant for a New America. We can talk to the press, and we can discuss the religious call to address poverty and
+ environmental stewardship all we want, but it won't have an impact unless we tackle head-on the mutual suspicion that sometimes
+```
+
+## Prepare the Data
+
+To preprocess the dataset, define the following utility functions:
+
+* `readContent` - Reads data from the data file.
+* `buildVocab` - Maps each character to a unique integer ID, i.e., builds a vocabulary.
+* `text2Id` - Encodes each sentence as a sequence of integer IDs.
+
+Then, use these utility functions to generate vocabulary from the input text file (obama.txt).
+
+To prepare the data:
+
+1) Read the dataset with the following function:
+
+ ```scala
+ scala> import scala.io.Source
+
+ import scala.io.Source
+
+ scala> // Read file
+ scala> def readContent(path: String): String = Source.fromFile(path).mkString
+
+ readContent: (path: String)String
+
+ ```
+
+2) Build a vocabulary with the following function:
+
+ ```scala
+ scala> // Build a vocabulary of what char we have in the content
+ scala> def buildVocab(path: String): Map[String, Int] = {
+ val content = readContent(path).split("\n")
+ var idx = 1 // 0 is left for zero padding
+ var theVocab = Map[String, Int]()
+ for (line <- content) {
+ for (char <- line) {
+ val key = s"$char"
+ if (!theVocab.contains(key)) {
+ theVocab = theVocab + (key -> idx)
+ idx += 1
+ }
+ }
+ }
+ theVocab
+ }
+
+ buildVocab: (path: String)Map[String,Int]
+ ```
+
+3) To assign each character a unique numerical ID, use the following function:
+
+ ```scala
+ scala> def text2Id(sentence: String, theVocab: Map[String, Int]): Array[Int] = {
+ val words = for (char <- sentence) yield theVocab(s"$char")
+ words.toArray
+ }
+
+ text2Id: (sentence: String, theVocab: Map[String,Int])Array[Int]
+ ```
+
+4) Now, build a character vocabulary from the dataset (obama.txt). Change the input filepath (dataPath) to reflect your settings.
+
+ ```scala
+ scala> // Give your system path to the "obama.txt" we have downloaded using previous steps.
+ scala> val dataPath = "obama.txt"
+ dataPath: String = obama.txt
+
+ scala> val vocab = buildVocab(dataPath)
+
+ scala> vocab.size
+ res23: Int = 82
+ ```
+
+
+## Build a Multi-layer LSTM model
+
+Now, create a multi-layer LSTM model.
+
+To create the model:
+
+1) Load the helper files (`Lstm.scala`, `BucketIo.scala` and `RnnModel.scala`).
+`Lstm.scala` contains the definition of the LSTM cell. `BucketIo.scala` creates a sentence iterator. `RnnModel.scala` is used for model inference. The helper files are available on the [MXNet site](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn).
+To load them, at the Scala command prompt type:
+
+ ```scala
+ scala> :load ../../../scala-package/examples/src/main/scala/org/apache/mxnet/examples/rnn/Lstm.scala
+ scala> :load ../../../scala-package/examples/src/main/scala/org/apache/mxnet/examples/rnn/BucketIo.scala
+ scala> :load ../../../scala-package/examples/src/main/scala/org/apache/mxnet/examples/rnn/RnnModel.scala
+ ```
+
+2) Set the LSTM hyperparameters as follows:
+
+ ```scala
+ scala> // We can support various input lengths.
+ scala> // For this problem, we cut each input sentence to a length of 129 characters.
+ scala> // So we only need a single fixed-length bucket.
+ scala> val buckets = Array(129)
+ buckets: Array[Int] = Array(129)
+
+ scala> // hidden unit in LSTM cell
+ scala> val numHidden = 512
+ numHidden: Int = 512
+
+ scala> // The embedding dimension, which maps a char to a 256 dim vector
+ scala> val numEmbed = 256
+ numEmbed: Int = 256
+
+ scala> // The number of lstm layers
+ scala> val numLstmLayer = 3
+ numLstmLayer: Int = 3
+
+ scala> // The batch size for training
+ scala> val batchSize = 32
+ batchSize: Int = 32
+ ```
+
+3) Now, construct the LSTM network as a symbolic computation graph. Type the following to create a graph in which the model is unrolled for a fixed length explicitly in time.
+
+ ```scala
+ scala> // generate symbol for a length
+ scala> def symGen(seqLen: Int): Symbol = {
+ Lstm.lstmUnroll(numLstmLayer, seqLen, vocab.size + 1,
+ numHidden = numHidden, numEmbed = numEmbed,
+ numLabel = vocab.size + 1, dropout = 0.2f)
+ }
+ symGen: (seqLen: Int)org.apache.mxnet.Symbol
+
+ scala> // create the network symbol
+ scala> val symbol = symGen(buckets(0))
+ symbol: org.apache.mxnet.Symbol = org.apache.mxnet.Symbol@3a589eed
+
+ ```
+
+4) To train the model, initialize states for the LSTM and create a data iterator, which groups the data into buckets.
+Note: The BucketSentenceIter data iterator supports variable-length examples; however, we use only the fixed-length version in this tutorial.
+
+ ```scala
+
+ scala> // initialize states for LSTM
+ scala> val initC = for (l <- 0 until numLstmLayer) yield (s"l${l}_init_c", (batchSize, numHidden))
+
+ initC: scala.collection.immutable.IndexedSeq[(String, (Int, Int))] = Vector((l0_init_c,(32,512)),
+ (l1_init_c,(32,512)), (l2_init_c,(32,512)))
+
+ scala> val initH = for (l <- 0 until numLstmLayer) yield (s"l${l}_init_h", (batchSize, numHidden))
+
+ initH: scala.collection.immutable.IndexedSeq[(String, (Int, Int))] = Vector((l0_init_h,(32,512)),
+ (l1_init_h,(32,512)), (l2_init_h,(32,512)))
+
+ scala> val initStates = initC ++ initH
+
+ initStates: scala.collection.immutable.IndexedSeq[(String, (Int, Int))] =
+ Vector((l0_init_c,(32,512)), (l1_init_c,(32,512)), (l2_init_c,(32,512)), (l0_init_h,(32,512)),
+ (l1_init_h,(32,512)), (l2_init_h,(32,512)))
+
+ scala> val dataTrain = new BucketIo.BucketSentenceIter(dataPath, vocab, buckets,
+ batchSize, initStates, seperateChar = "\n",
+ text2Id = text2Id, readContent = readContent)
+
+ dataTrain: BucketIo.BucketSentenceIter = non-empty iterator
+
+ ```
+
+5) You can set more than 100 epochs, but for this tutorial, specify 75 epochs. Each epoch can take as long as 4 minutes on a GPU. In this tutorial, you will use the [ADAM optimizer](http://mxnet.io/api/scala/docs/index.html#org.apache.mxnet.optimizer.Adam):
+
+ ```scala
+ scala> import org.apache.mxnet._
+ import org.apache.mxnet._
+
+ scala> import org.apache.mxnet.Callback.Speedometer
+ import org.apache.mxnet.Callback.Speedometer
+
+ scala> import org.apache.mxnet.optimizer.Adam
+ import org.apache.mxnet.optimizer.Adam
+
+ scala> // and we will see result by training 75 epochs
+ scala> val numEpoch = 75
+ numEpoch: Int = 75
+
+ scala> // learning rate
+ scala> val learningRate = 0.001f
+ learningRate: Float = 0.001
+
+ ```
+
+6) Define the perplexity utility function, which serves as the evaluation metric; it calculates the negative log-likelihood during training.
+
+ ```scala
+ scala> def perplexity(label: NDArray, pred: NDArray): Float = {
+ val shape = label.shape
+ val size = shape(0) * shape(1)
+ val labelT = {
+ val tmp = label.toArray.grouped(shape(1)).toArray
+ val result = Array.fill[Float](size)(0f)
+ var idx = 0
+ for (i <- 0 until shape(1)) {
+ for (j <- 0 until shape(0)) {
+ result(idx) = tmp(j)(i)
+ idx += 1
+ }
+ }
+ result
+ }
+ var loss = 0f
+ val predArray = pred.toArray.grouped(pred.shape(1)).toArray
+ for (i <- 0 until pred.shape(0)) {
+ loss += -Math.log(Math.max(1e-10, predArray(i)(labelT(i).toInt)).toFloat).toFloat
+ }
+ loss / size
+ }
+
+ perplexity: (label: org.apache.mxnet.NDArray, pred: org.apache.mxnet.NDArray)Float
+
+ scala> def doCheckpoint(prefix: String): EpochEndCallback = new EpochEndCallback {
+ override def invoke(epoch: Int, symbol: Symbol,
+ argParams: Map[String, NDArray],
+ auxStates: Map[String, NDArray]): Unit = {
+ Model.saveCheckpoint(prefix, epoch + 1, symbol, argParams, auxStates)
+ }
+ }
+
+ doCheckpoint: (prefix: String)org.apache.mxnet.EpochEndCallback
+
+ ```
+
+7) Define the initializer that is required for creating a model, as follows:
+
+ ```scala
+ scala> val initializer = new Xavier(factorType = "in", magnitude = 2.34f)
+
+ initializer: org.apache.mxnet.Xavier = org.apache.mxnet.Xavier@54e8f10a
+
+ ```
+
+8) Now, you have implemented all the supporting infrastructure for the char-lstm model. To train the model, use the standard [MXNet high-level API](http://mxnet.io/api/scala/docs/index.html#org.apache.mxnet.FeedForward). You can switch from training on multiple GPUs to a single GPU or CPU by changing `.setContext(Array(Context.gpu(0),Context.gpu(1),Context.gpu(2),Context.gpu(3)))` to, for example, `.setContext(Array(Context.gpu(0)))`:
+
+ ```scala
+ scala> val model = FeedForward.newBuilder(symbol)
+ .setContext(Array(Context.gpu(0),Context.gpu(1),Context.gpu(2),Context.gpu(3)))
+ .setNumEpoch(numEpoch)
+ .setOptimizer(new Adam(learningRate = learningRate, wd = 0.00001f))
+ .setInitializer(initializer)
+ .setTrainData(dataTrain)
+ .setEvalMetric(new CustomMetric(perplexity, name = "perplexity"))
+ .setBatchEndCallback(new Speedometer(batchSize, 20))
+ .setEpochEndCallback(doCheckpoint("obama"))
+ .build()
+
+ model: org.apache.mxnet.FeedForward = org.apache.mxnet.FeedForward@4926f6c7
+ ```
+
+Now, you have an LSTM model and you've trained it. Use this model to create the inference.
+
+## Build the Inference Model
+
+You can now sample sentences from the trained model. The sampler works as follows:
+- Takes some fixed starting text (e.g., "The United States") and feeds it into the LSTM as the initial input.
+- In the first time step, the LSTM produces an output distribution over the vocabulary together with a state; a character is sampled from that distribution and fixed as the next character.
+- In the next time step, the previously sampled character is fed in as input.
+- Continues until enough characters have been sampled. Note that we run mini-batches, so several sentences can be sampled simultaneously.
+
+To build the inference model, define the following utility functions that help MXNet make inferences:
+
+* `makeRevertVocab` - Reverses the vocabulary's key-value mapping for easy lookup of characters while predicting
+* `makeInput` - Converts a given character into the network input
+* `cdf`, `choice` - `cdf` is a helper function for the `choice` function, which is used to draw random samples
+* `makeOutput` - Produces the output character, either by random sampling or deterministically by choosing the character with the greatest probability
+
+ ```scala
+ scala> import scala.util.Random
+
+ scala> // helper structure for prediction
+ scala> def makeRevertVocab(vocab: Map[String, Int]): Map[Int, String] = {
+ var dic = Map[Int, String]()
+ vocab.foreach { case (k, v) =>
+ dic = dic + (v -> k)
+ }
+ dic
+ }
+
+ makeRevertVocab: (vocab: Map[String,Int])Map[Int,String]
+
+ scala> // make input from char
+ scala> def makeInput(char: Char, vocab: Map[String, Int], arr: NDArray): Unit = {
+ val idx = vocab(s"$char")
+ val tmp = NDArray.zeros(1)
+ tmp.set(idx)
+ arr.set(tmp)
+ }
+
+ makeInput: (char: Char, vocab: Map[String,Int], arr: org.apache.mxnet.NDArray)Unit
+
+ scala> // helper function for random sample
+ scala> def cdf(weights: Array[Float]): Array[Float] = {
+ val total = weights.sum
+ var result = Array[Float]()
+ var cumsum = 0f
+ for (w <- weights) {
+ cumsum += w
+ result = result :+ (cumsum / total)
+ }
+ result
+ }
+
+ cdf: (weights: Array[Float])Array[Float]
+
+ scala> def choice(population: Array[String], weights: Array[Float]): String = {
+ assert(population.length == weights.length)
+ val cdfVals = cdf(weights)
+ val x = Random.nextFloat()
+ var idx = 0
+ var found = false
+ for (i <- 0 until cdfVals.length) {
+ if (cdfVals(i) >= x && !found) {
+ idx = i
+ found = true
+ }
+ }
+ population(idx)
+ }
+
+ choice: (population: Array[String], weights: Array[Float])String
+
+ scala> // we can use random output or fixed output by choosing largest probability
+ scala> def makeOutput(prob: Array[Float], vocab: Map[Int, String],
+ sample: Boolean = false, temperature: Float = 1f): String = {
+ var idx = -1
+ val char = if (sample == false) {
+ idx = ((-1f, -1) /: prob.zipWithIndex) { (max, elem) =>
+ if (max._1 < elem._1) elem else max
+ }._2
+ if (vocab.contains(idx)) vocab(idx)
+ else ""
+ } else {
+ val fixDict = Array("") ++ (1 until vocab.size + 1).map(i => vocab(i))
+ var scaleProb = prob.map(x => if (x < 1e-6) 1e-6 else if (x > 1 - 1e-6) 1 - 1e-6 else x)
+ var rescale = scaleProb.map(x => Math.exp(Math.log(x) / temperature).toFloat)
+ val sum = rescale.sum.toFloat
+ rescale = rescale.map(_ / sum)
+ choice(fixDict, rescale)
+ }
+ char
+ }
+
+ makeOutput: (prob: Array[Float], vocab: Map[Int,String], sample: Boolean, temperature: Float)String
+
+ ```
+
+1) Build the inference model:
+
+ ```scala
+ scala> // load from check-point
+ scala> val (_, argParams, _) = Model.loadCheckpoint("obama", 75)
+
+ scala> // build an inference model
+ scala> val model = new RnnModel.LSTMInferenceModel(numLstmLayer, vocab.size + 1, \
+ numHidden = numHidden, numEmbed = numEmbed, \
+ numLabel = vocab.size + 1, argParams = argParams, \
+ ctx = Context.cpu(), dropout = 0.2f)
+
+ model: RnnModel.LSTMInferenceModel = RnnModel$LSTMInferenceModel@2f0c0319
+ ```
+
+2) Now you can generate a sequence of 1200 characters (you can select any number of characters you want) starting with "The United States" as follows:
+
+ ```scala
+
+ scala> val seqLength = 1200
+ seqLength: Int = 1200
+
+ scala> val inputNdarray = NDArray.zeros(1)
+ inputNdarray: org.apache.mxnet.NDArray = org.apache.mxnet.NDArray@9c231a24
+
+ scala> val revertVocab = makeRevertVocab(vocab)
+
+ scala> // Feel free to change the starter sentence
+
+ scala> var output = "The United States"
+ output: String = The United States
+
+ scala> val randomSample = true
+ randomSample: Boolean = true
+
+ scala> var newSentence = true
+ newSentence: Boolean = true
+
+ scala> val ignoreLength = output.length()
+ ignoreLength: Int = 17
+
+ scala> for (i <- 0 until seqLength) {
+ if (i <= ignoreLength - 1) makeInput(output(i), vocab, inputNdarray)
+ else makeInput(output.takeRight(1)(0), vocab, inputNdarray)
+ val prob = model.forward(inputNdarray, newSentence)
+ newSentence = false
+ val nextChar = makeOutput(prob, revertVocab, randomSample)
+ if (nextChar == "") newSentence = true
+ if (i >= ignoreLength) output = output ++ nextChar
+ }
+
+ scala> output
+
+ res7: String = The United States who have been blessed no companies would be proud that the challenges we face, it's not as directly untelle are in my daughters - you can afford -- life-saving march care and poor information and receiving battle against other speeces and lead its people. After champions of 2006, and because Africa in America, separate has been conferenced by children ation of discrimination, we remember all of this, succeeded in any other feelings of a palently better political process - at lliims being disability payment. All across all different mights of a more just a few global personal morality and industrialized ready to succeed.One can afford when the earliest days of a pension you can add to the system be confructive despair. They have starting in the demand for...
+
+ ```
+
+
+You can see the output generated from Obama's speeches. All of the line breaks, punctuation, and uppercase and lowercase letters were produced by the sampler (no post-processing was performed).
+
+
+## Next Steps
+* [Scala API](http://mxnet.io/api/scala/)
+* [More Scala Examples](https://github.com/dmlc/mxnet/tree/master/scala-package/examples/)
+* [MXNet tutorials index](http://mxnet.io/tutorials/index.html)
diff --git a/docs/static_site/src/pages/api/scala/docs/tutorials/index.md b/docs/static_site/src/pages/api/scala/docs/tutorials/index.md
new file mode 100644
index 000000000000..f9b6e56c9d4f
--- /dev/null
+++ b/docs/static_site/src/pages/api/scala/docs/tutorials/index.md
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_landing_tutorials
+title: Scala Tutorials
+permalink: /api/scala/docs/tutorials
+tag: scala
+---
diff --git a/docs/static_site/src/pages/api/scala/docs/tutorials/infer.md b/docs/static_site/src/pages/api/scala/docs/tutorials/infer.md
new file mode 100644
index 000000000000..daa996196393
--- /dev/null
+++ b/docs/static_site/src/pages/api/scala/docs/tutorials/infer.md
@@ -0,0 +1,66 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Infer API
+is_tutorial: true
+tag: scala
+permalink: /api/scala/docs/tutorials/infer
+---
+
+# Infer API
+The MXNet Scala Infer API provides you with model loading and inference functionality using the MXNet Scala package.
+
+
+## Prerequisites
+To use the Infer API you must first install the MXNet Scala package. Instructions for this are provided in the following variations:
+* [Tutorial for setting up a project in the IntelliJ IDE](../../tutorials/scala/mxnet_scala_on_intellij.html)
+* [Installing the MXNet Scala Package for macOS](../../install/ubuntu_setup.html#install-the-mxnet-package-for-scala)
+* [Installing the MXNet Scala for Linux](../../install/ubuntu_setup.html#install-the-mxnet-package-for-scala)
+
+
+## Inference
+The Scala Infer API includes both single image and batch modes. Here is an example of running inference on a single image by using the `ImageClassifier` class. A complete [image classification example](https://github.com/apache/incubator-mxnet/blob/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala) using ResNet-152 is provided in the [Scala package's example folder](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples). This example also demonstrates inference with batches of images.
+
+```scala
+def runInferenceOnSingleImage(modelPathPrefix: String, inputImagePath: String,
+ context: Array[Context]):
+IndexedSeq[IndexedSeq[(String, Float)]] = {
+ val dType = DType.Float32
+ val inputShape = Shape(1, 3, 224, 224)
+
+ val inputDescriptor = IndexedSeq(DataDesc("data", inputShape, dType, "NCHW"))
+
+ // Create object of ImageClassifier class
+ val imgClassifier: ImageClassifier = new
+ ImageClassifier(modelPathPrefix, inputDescriptor, context)
+
+ // Loading single image from file and getting BufferedImage
+ val img = ImageClassifier.loadImageFromFile(inputImagePath)
+
+ // Running inference on single image
+ val output = imgClassifier.classifyImage(img, Some(5))
+
+ output
+}
+```
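+
+For illustration, here is a hedged usage sketch of the helper defined above. The model prefix and image path are placeholder values, not files shipped with MXNet; any image-classification checkpoint saved as `<prefix>-symbol.json` plus `<prefix>-0000.params`, together with a test image, would work.
+
+```scala
+// Hypothetical paths -- point these at your own model files and test image.
+val modelPathPrefix = "/tmp/resnet152/resnet-152"
+val inputImagePath = "/tmp/images/kitten.jpg"
+
+// Run on CPU; swap in Context.gpu(0) if a GPU build is available.
+val output = runInferenceOnSingleImage(modelPathPrefix, inputImagePath, Array(Context.cpu()))
+
+// Print the top-5 (class, probability) pairs for the single input image.
+output(0).foreach { case (className, prob) => println(s"$className: $prob") }
+```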
+
+
+## Related Resources
+* [Infer API Scaladocs](docs/index.html#org.apache.mxnet.infer.package)
+* [Single Shot Detector Inference Example](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/objectdetector)
+* [Image Classification Example](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier)
diff --git a/docs/static_site/src/pages/api/scala/docs/tutorials/io.md b/docs/static_site/src/pages/api/scala/docs/tutorials/io.md
new file mode 100644
index 000000000000..f9e9b462c4a2
--- /dev/null
+++ b/docs/static_site/src/pages/api/scala/docs/tutorials/io.md
@@ -0,0 +1,189 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Data Loading API
+permalink: /api/scala/docs/tutorials/io
+is_tutorial: true
+tag: scala
+---
+
+# MXNet Scala Data Loading API
+This topic introduces the data input method for MXNet. MXNet uses an iterator to provide data to the neural network. Iterators do some preprocessing and generate batches for the neural network.
+
+MXNet provides basic iterators for MNIST and RecordIO images. To hide the cost of I/O, MXNet uses a prefetch strategy that enables parallelism for the learning process and data fetching. Data is automatically fetched by an independent thread.
+
+Topics:
+
+* [Data Iterator Parameters](#data-iterator-parameters) clarifies the different uses of data iterator parameters.
+* [Create a Data Iterator](#create-a-data-iterator) introduces how to create a data iterator in MXNet for Scala.
+* [How to Get Data](#how-to-get-data) introduces the data resource and data preparation tools.
+* [IO API Reference](http://mxnet.incubator.apache.org/api/scala/docs/index.html#org.apache.mxnet.IO$) explains the IO API.
+
+
+## Data Iterator Parameters
+
+To create a data iterator, you typically need to provide five parameters:
+
+* **Dataset Param** provides basic information about the dataset, e.g., file path, input shape.
+* **Batch Param** provides information required to form a batch, e.g., batch size.
+* **Augmentation Param** tells MXNet which augmentation operations (e.g., crop or mirror) to perform on an input image.
+* **Backend Param** controls the behavior of the back-end threads to hide the cost of data loading.
+* **Auxiliary Param** provides options for checking and debugging.
+
+You *must* provide the **Dataset Param** and **Batch Param**, otherwise MXNet can't create the data batch. Provide other parameters as required by your algorithm and performance needs. We provide a detailed explanation and examples of the options later.
+
+## Create a Data Iterator
+
+The IO API provides a simple way to create a data iterator in Scala.
+The following example code shows how to create a CIFAR data iterator.
+
+```scala
+ val dataiter = IO.ImageRecordIter(Map(
+ // Utility Parameter
+ // Optional
+ // Name of the data, should match the name of the data input of the network
+ // data_name='data',
+ // Utility Parameter
+ // Optional
+ // Name of the label, should match the name of the label parameter of the network
+ // Usually, if the loss layer is named 'foo', then the label input has the name
+ // 'foo_label', unless overwritten
+ // label_name='softmax_label',
+ // Dataset Parameter
+ // Required
+ // indicating the data file, please check the data is already there
+ "path_imgrec" -> "data/cifar/train.rec",
+ // Dataset Parameter
+ // Required
+ // indicating the image size after preprocessing
+ "data_shape" -> "(3,28,28)",
+ // Batch Parameter
+ // Required
+ // tells how many images in a batch
+ "batch_size" -> "100",
+ // Augmentation Parameter
+ // Optional
+ // if mean_img is provided, the mean value is subtracted from each image at each pixel
+ "mean_img" -> "data/cifar/cifar10_mean.bin",
+ // Augmentation Parameter
+ // Optional
+ // randomly crop a patch of the data_shape from the original image
+ "rand_crop" -> "True",
+ // Augmentation Parameter
+ // Optional
+ // randomly mirror the image horizontally
+ "rand_mirror" -> "True",
+ // Augmentation Parameter
+ // Optional
+ // randomly shuffle the data
+ "shuffle" -> "False",
+ // Backend Parameter
+ // Optional
+ // Preprocessing thread number
+ "preprocess_threads" -> "4",
+ // Backend Parameter
+ // Optional
+ // Prefetch buffer size
+ "prefetch_buffer" = "1"))
+```
+
+First, explicitly specify the kind of data (MNIST, ImageRecord, etc.) to fetch. Then, provide the options for the dataset, batching, image augmentation, multi-threaded processing, and prefetching operations. The code automatically validates the parameters. If a required parameter is missing, MXNet returns an error.
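+
+To see what the iterator actually hands to the network, here is a minimal sketch (the variable names are ours) that creates an iterator with only the required Dataset and Batch parameters, reusing the CIFAR record file from the example above and leaving every other option at its default, and then walks through the batches it produces:
+
+```scala
+// Minimal iterator: only the required Dataset Param (path_imgrec, data_shape)
+// and Batch Param (batch_size) are supplied.
+val trainIter = IO.ImageRecordIter(Map(
+  "path_imgrec" -> "data/cifar/train.rec",
+  "data_shape" -> "(3,28,28)",
+  "batch_size" -> "100"))
+
+// Walk through the batches the iterator produces.
+trainIter.reset()
+var numBatches = 0
+while (trainIter.hasNext) {
+  val batch = trainIter.next()
+  if (numBatches == 0) {
+    // batch.data(0) holds the images, batch.label(0) the labels for this batch
+    println(s"data shape: ${batch.data(0).shape}")   // (100,3,28,28)
+    println(s"label shape: ${batch.label(0).shape}") // (100)
+  }
+  numBatches += 1
+}
+println(s"number of batches: $numBatches")
+```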
+
+## How to Get Data
+
+
+We provide [scripts](https://github.com/apache/incubator-mxnet/tree/master/scala-package/core/scripts) to download MNIST data and CIFAR10 ImageRecord data. If you want to create your own dataset, we recommend using the Image RecordIO data format.
+
+## Create a Dataset Using RecordIO
+
+RecordIO implements a file format for a sequence of records. We recommend storing images as records and packing them together. The benefits include:
+
+* Storing images in a compact format (e.g., JPEG) in the records greatly reduces the size of the dataset on disk.
+* Packing data together allows continuous reading from the disk.
+* RecordIO is simple to partition, which simplifies distributed training. We provide an example later.
+
+We provide the [im2rec tool](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.cc) so you can create an Image RecordIO dataset by yourself. The following walkthrough shows you how.
+
+### Prerequisites
+Download the data. You don't need to resize the images manually; you can use `im2rec` to resize them automatically. For details, see "Extension: Multiple Labels for a Single Image," later in this topic.
+
+### Step 1. Make an Image List File
+After you download the data, you need to make an image list file. The format is:
+
+```
+ integer_image_index \t label_index \t path_to_image
+```
+Typically, you take the list of all image filenames, shuffle it, and then split it into a training filename list and a testing filename list, written in the format above (see the sketch after the example file below).
+
+This is an example file:
+
+```bash
+ 95099 464 n04467665_17283.JPEG
+ 10025081 412 ILSVRC2010_val_00025082.JPEG
+ 74181 789 n01915811_2739.JPEG
+ 10035553 859 ILSVRC2010_val_00035554.JPEG
+ 10048727 929 ILSVRC2010_val_00048728.JPEG
+ 94028 924 n01980166_4956.JPEG
+ 1080682 650 n11807979_571.JPEG
+ 972457 633 n07723039_1627.JPEG
+ 7534 11 n01630670_4486.JPEG
+ 1191261 249 n12407079_5106.JPEG
+
+```
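+
+If your images are stored on disk, the shuffled training and testing lists can be produced with a few lines of plain Scala. The sketch below is only an illustration: it assumes a hypothetical layout in which `image_root_dir` contains one sub-directory per class and uses a 90/10 train/test split; adapt the paths, file extensions, and split ratio to your dataset.
+
+```scala
+import java.io.{File, PrintWriter}
+import scala.util.Random
+
+// Hypothetical layout: image_root_dir/<class_name>/<image>.jpg
+val imageRoot = new File("image_root_dir")
+
+// Give every class directory a label index and collect (label, relative path) pairs.
+val classDirs = imageRoot.listFiles.filter(_.isDirectory).sortBy(_.getName)
+val entries = for {
+  (dir, label) <- classDirs.zipWithIndex
+  img <- dir.listFiles if img.getName.toLowerCase.endsWith(".jpeg") ||
+                          img.getName.toLowerCase.endsWith(".jpg")
+} yield (label, s"${dir.getName}/${img.getName}")
+
+// Shuffle, then split into a training list and a testing list (90% / 10% here).
+val shuffled = Random.shuffle(entries.toList)
+val (trainEntries, testEntries) = shuffled.splitAt((shuffled.size * 0.9).toInt)
+
+// Write lines in the im2rec format: integer_image_index \t label_index \t path_to_image
+def writeList(path: String, items: List[(Int, String)]): Unit = {
+  val out = new PrintWriter(new File(path))
+  items.zipWithIndex.foreach { case ((label, relPath), idx) =>
+    out.println(s"$idx\t$label\t$relPath")
+  }
+  out.close()
+}
+
+writeList("train.lst", trainEntries)
+writeList("test.lst", testEntries)
+```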
+
+### Step 2. Create the Binary File
+To generate the binary RecordIO file, use `im2rec` in the tools folder. `im2rec` takes the path of the _image list file_ you generated, the _root path_ of the images, and the _output file path_ as input. This process usually takes several hours, so be patient.
+
+A sample command:
+
+```bash
+ ./bin/im2rec image.lst image_root_dir output.bin resize=256
+```
+For more details, run `./bin/im2rec`.
+
+### Extension: Multiple Labels for a Single Image
+
+The `im2rec` tool and `IO.ImageRecordIter` have multi-label support for a single image.
+For example, if you have four labels for a single image, you can use the following procedure to use the RecordIO tools.
+
+1. Write the image list files as follows:
+
+ ```
+ integer_image_index \t label_1 \t label_2 \t label_3 \t label_4 \t path_to_image
+ ```
+
+2. Run `im2rec`, adding `label_width=4` to the command arguments, for example:
+
+ ```bash
+ ./bin/im2rec image.lst image_root_dir output.bin resize=256 label_width=4
+ ```
+
+3. In the iterator generation code, set `label_width=4` and `path_imglist=<>`, for example:
+
+ ```scala
+ val dataiter = IO.ImageRecordIter(Map(
+ "path_imgrec" -> "data/cifar/train.rec",
+ "data_shape" -> "(3,28,28)",
+ "path_imglist" -> "data/cifar/image.lst",
+ "label_width" -> "4"
+ ))
+ ```
+
+## Next Steps
+* [NDArray API](ndarray.md) for vector/matrix/tensor operations
+* [KVStore API](kvstore.md) for multi-GPU and multi-host distributed training
diff --git a/docs/static_site/src/pages/api/scala/docs/tutorials/kvstore.md b/docs/static_site/src/pages/api/scala/docs/tutorials/kvstore.md
new file mode 100644
index 000000000000..08cb4cb50869
--- /dev/null
+++ b/docs/static_site/src/pages/api/scala/docs/tutorials/kvstore.md
@@ -0,0 +1,129 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: KVStore API
+permalink: /api/scala/docs/tutorials/kvstore
+is_tutorial: true
+tag: scala
+---
+
+# KVStore API
+
+Topics:
+* [Basic Push and Pull](#basic-push-and-pull)
+* [List Key-Value Pairs](#list-key-value-pairs)
+* [API Reference](http://mxnet.incubator.apache.org/api/scala/docs/index.html#org.apache.mxnet.KVStore)
+
+
+## Basic Push and Pull
+
+KVStore provides basic push and pull operations over multiple devices (GPUs) on a single machine.
+
+### Initialization
+
+Let's consider a simple example. It initializes
+an (`Int`, `NDArray`) pair in the store, and then pulls the value out.
+
+```scala
+val kv = KVStore.create("local") // create a local kv store.
+val shape = Shape(2,3)
+kv.init(3, NDArray.ones(shape)*2)
+val a = NDArray.zeros(shape)
+kv.pull(3, out = a)
+a.toArray
+// Array[Float] = Array(2.0, 2.0, 2.0, 2.0, 2.0, 2.0)
+```
+
+### Push, Aggregation, and Updater
+
+For any key that's been initialized, you can push a new value with the same shape to the key, as follows:
+
+```scala
+kv.push(3, NDArray.ones(shape)*8)
+kv.pull(3, out = a) // pull out the value
+a.toArray
+// Array[Float] = Array(8.0, 8.0, 8.0, 8.0, 8.0, 8.0)
+```
+
+The data that you want to push can be stored on any device. Furthermore, you can push multiple
+values into the same key, where KVStore first sums all of these
+values, and then pushes the aggregated value, as follows:
+
+```scala
+val gpus = Array(Context.gpu(0), Context.gpu(1), Context.gpu(2), Context.gpu(3))
+val b = Array(NDArray.ones(shape, gpus(0)), NDArray.ones(shape, gpus(1)), \
+NDArray.ones(shape, gpus(2)), NDArray.ones(shape, gpus(3)))
+kv.push(3, b)
+kv.pull(3, out = a)
+a.toArray
+// Array[Float] = Array(4.0, 4.0, 4.0, 4.0, 4.0, 4.0)
+```
+
+For each push command, KVStore applies the pushed value to the value stored by an
+`updater`. The default updater is `ASSIGN`. You can replace the default to
+control how data is merged.
+
+```scala
+val updater = new MXKVStoreUpdater {
+ override def update(key: Int, input: NDArray, stored: NDArray): Unit = {
+ println(s"update on key $key")
+ stored += input * 2
+ }
+ override def dispose(): Unit = {}
+ }
+kv.setUpdater(updater)
+kv.pull(3, a)
+a.toArray
+// Array[Float] = Array(4.0, 4.0, 4.0, 4.0, 4.0, 4.0)
+kv.push(3, NDArray.ones(shape))
+// update on key 3
+kv.pull(3, a)
+a.toArray
+// Array[Float] = Array(6.0, 6.0, 6.0, 6.0, 6.0, 6.0)
+```
+
+### Pull
+
+You've already seen how to pull a single key-value pair. Similar to the way that you use the push command, you can
+pull the value into several devices with a single call.
+
+```scala
+val b = Array(NDArray.ones(shape, gpus(0)), NDArray.ones(shape, gpus(1)),\
+NDArray.ones(shape, gpus(2)), NDArray.ones(shape, gpus(3)))
+kv.pull(3, outs = b)
+b(1).toArray
+// Array[Float] = Array(6.0, 6.0, 6.0, 6.0, 6.0, 6.0)
+```
+
+## List Key-Value Pairs
+
+All of the operations that we've discussed so far are performed on a single key. KVStore also provides
+an interface for operating on a list of key-value pairs. For a single device, use the following:
+
+```scala
+val keys = Array(5, 7, 9)
+kv.init(keys, Array.fill(keys.length)(NDArray.ones(shape)))
+kv.push(keys, Array.fill(keys.length)(NDArray.ones(shape)))
+// update on key: 5
+// update on key: 7
+// update on key: 9
+val b = Array.fill(keys.length)(NDArray.zeros(shape))
+kv.pull(keys, outs = b)
+b(1).toArray
+// Array[Float] = Array(3.0, 3.0, 3.0, 3.0, 3.0, 3.0)
+```
diff --git a/docs/static_site/src/pages/api/scala/docs/tutorials/mnist.md b/docs/static_site/src/pages/api/scala/docs/tutorials/mnist.md
new file mode 100644
index 000000000000..fbc6786b8e10
--- /dev/null
+++ b/docs/static_site/src/pages/api/scala/docs/tutorials/mnist.md
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: MNIST Example
+is_tutorial: true
+tag: scala
+permalink: /api/scala/docs/tutorials/mnist
+---
+
+# Handwritten Digit Recognition
+
+This Scala tutorial guides you through a classic computer vision application: identifying handwritten digits.
+
+Let's train a 3-layer network (i.e., a multilayer perceptron) on the MNIST dataset to classify handwritten digits.
+
+## Prerequisites
+To complete this tutorial, we need:
+
+- to compile the latest MXNet version. See the MXNet installation instructions for your operating system in [Setup and Installation](http://mxnet.io/install/index.html).
+- to compile the Scala API. See Scala API build instructions in [Build](https://github.com/dmlc/mxnet/tree/master/scala-package).
+
+## Define the Network
+
+First, define the neural network's architecture using the Symbol API:
+
+```scala
+import org.apache.mxnet._
+import org.apache.mxnet.optimizer.SGD
+
+// model definition
+val data = Symbol.Variable("data")
+val fc1 = Symbol.api.FullyConnected(Some(data), num_hidden = 128, name = "fc1")
+val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1")
+val fc2 = Symbol.api.FullyConnected(Some(act1), num_hidden = 64, name = "fc2")
+val act2 = Symbol.api.Activation(Some(fc2), "relu", "relu2")
+val fc3 = Symbol.api.FullyConnected(Some(act2), num_hidden = 10, name = "fc3")
+val mlp = Symbol.api.SoftmaxOutput(Some(fc3), name = "sm")
+```
+
+## Load the Data
+
+Then, load the training and validation data using DataIterators.
+
+You can download the MNIST data using the [get_mnist_data script](https://github.com/dmlc/mxnet/blob/master/scala-package/core/scripts/get_mnist_data.sh). We've already written a DataIterator for the MNIST dataset:
+
+```scala
+// load MNIST dataset
+val trainDataIter = IO.MNISTIter(Map(
+ "image" -> "data/train-images-idx3-ubyte",
+ "label" -> "data/train-labels-idx1-ubyte",
+ "data_shape" -> "(1, 28, 28)",
+ "label_name" -> "sm_label",
+ "batch_size" -> "50",
+ "shuffle" -> "1",
+ "flat" -> "0",
+ "silent" -> "0",
+ "seed" -> "10"))
+
+val valDataIter = IO.MNISTIter(Map(
+ "image" -> "data/t10k-images-idx3-ubyte",
+ "label" -> "data/t10k-labels-idx1-ubyte",
+ "data_shape" -> "(1, 28, 28)",
+ "label_name" -> "sm_label",
+ "batch_size" -> "50",
+ "shuffle" -> "1",
+ "flat" -> "0", "silent" -> "0"))
+```
+
+## Train the model
+
+We can use the FeedForward builder to train our network:
+
+```scala
+// setup model and fit the training data
+val model = FeedForward.newBuilder(mlp)
+ .setContext(Context.cpu())
+ .setNumEpoch(10)
+ .setOptimizer(new SGD(learningRate = 0.1f, momentum = 0.9f, wd = 0.0001f))
+ .setTrainData(trainDataIter)
+ .setEvalData(valDataIter)
+ .build()
+```
+
+## Make predictions
+
+Finally, let's make predictions against the validation dataset and compare the predicted labels with the real labels.
+
+```scala
+val probArrays = model.predict(valDataIter)
+// in this case, we do not have multiple outputs
+require(probArrays.length == 1)
+val prob = probArrays(0)
+
+// get real labels
+import scala.collection.mutable.ListBuffer
+valDataIter.reset()
+val labels = ListBuffer.empty[NDArray]
+while (valDataIter.hasNext) {
+ val evalData = valDataIter.next()
+ labels += evalData.label(0).copy()
+}
+val y = NDArray.concatenate(labels)
+
+// get predicted labels
+val predictedY = NDArray.argmax_channel(prob)
+require(y.shape == predictedY.shape)
+
+// calculate accuracy
+var numCorrect = 0
+var numTotal = 0
+for ((labelElem, predElem) <- y.toArray zip predictedY.toArray) {
+ if (labelElem == predElem) {
+ numCorrect += 1
+ }
+ numTotal += 1
+}
+val acc = numCorrect.toFloat / numTotal
+println(s"Final accuracy = $acc")
+```
+
+Check out more MXNet Scala examples below.
+
+## Next Steps
+* [Scala API](http://mxnet.io/api/scala/)
+* [More Scala Examples](https://github.com/dmlc/mxnet/tree/master/scala-package/examples/)
+* [MXNet tutorials index](http://mxnet.io/tutorials/index.html)
diff --git a/docs/static_site/src/pages/api/scala/docs/tutorials/model.md b/docs/static_site/src/pages/api/scala/docs/tutorials/model.md
new file mode 100644
index 000000000000..80fd24f0bca5
--- /dev/null
+++ b/docs/static_site/src/pages/api/scala/docs/tutorials/model.md
@@ -0,0 +1,142 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+layout: page_api
+title: Model API *Deprecated*
+permalink: /api/scala/docs/tutorials/model
+is_tutorial: true
+tag: scala
+---
+
+# MXNet Scala Model API
+
+The model API provides a simplified way to train neural networks using common best practices.
+It's a thin wrapper built on top of the [ndarray](ndarray.md) and [symbolic](symbol.md)
+modules that makes neural network training easy.
+
+Topics:
+
+* [Train the Model](#train-the-model)
+* [Save the Model](#save-the-model)
+* [Periodic Checkpointing](#periodic-checkpointing)
+* [Multiple Devices](#use-multiple-devices)
+* [Model API Reference](http://mxnet.incubator.apache.org/api/scala/docs/index.html#org.apache.mxnet.Model)
+
+## Train the Model
+
+To train a model, perform two steps: configure the model using the symbol parameter,
+then construct and fit it with the `FeedForward` builder (or the `FeedForward` constructor followed by `fit`, as shown below).
+The following example creates a two-layer neural network.
+
+```scala
+ // configure a two-layer neural network
+ val data = Symbol.Variable("data")
+ val fc1 = Symbol.api.FullyConnected(Some(data), num_hidden = 128, name = "fc1")
+ val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1")
+ val fc2 = Symbol.api.FullyConnected(Some(act1), num_hidden = 64, name = "fc2")
+ val softmax = Symbol.api.SoftmaxOutput(Some(fc2), name = "sm")
+
+ // Construct the FeedForward model and fit on the input training data
+ val model = FeedForward.newBuilder(softmax)
+ .setContext(Context.cpu())
+ .setNumEpoch(num_epoch)
+ .setOptimizer(new SGD(learningRate = 0.01f, momentum = 0.9f, wd = 0.0001f))
+ .setTrainData(trainDataIter)
+ .setEvalData(valDataIter)
+ .build()
+```
+You can also use the scikit-learn-style constructor and `fit` function to create a model.
+
+```scala
+ // create a model using sklearn-style two-step way
+ val model = new FeedForward(softmax,
+ numEpoch = numEpochs,
+ argParams = argParams,
+ auxParams = auxParams,
+ beginEpoch = beginEpoch,
+ epochSize = epochSize)
+
+ model.fit(trainData = train)
+```
+For more information, see [API Reference](http://mxnet.incubator.apache.org/api/scala/docs/index.html).
+
+## Save the Model
+
+After the job is done, save your work.
+MXNet provides checkpoint save and load functions: use `Model.saveCheckpoint` to save a model and `FeedForward.load` to load a checkpoint back from file.
+
+```scala
+ // checkpoint the model data into files,
+ // e.g. save a model to modelPrefix-symbol.json and modelPrefix-0100.params
+ val modelPrefix: String = "checkpt"
+ val num_epoch = 100
+ // symbol, argParams and auxStates come from your trained model
+ Model.saveCheckpoint(modelPrefix, num_epoch, symbol, argParams, auxStates)
+
+ // load model back
+ val model_loaded = FeedForward.load(modelPrefix, num_epoch)
+```
+The advantage of these two `save` and `load` functions is that they are language agnostic.
+You should be able to save and load directly into cloud storage, such as Amazon S3 and HDFS.
+
+## Periodic Checkpointing
+
+We recommend checkpointing your model after each epoch.
+To do this, use ```EpochEndCallback``` to add a ```Model.saveCheckpoint(