diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index efc6c9a0..40ea9037 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -802,6 +802,7 @@
+
diff --git a/src/python/tools/temp_docs_updater.py b/src/python/tools/temp_docs_updater.py
new file mode 100644
index 00000000..3915d19e
--- /dev/null
+++ b/src/python/tools/temp_docs_updater.py
@@ -0,0 +1,383 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# -------------------------------------------------------------------------
+import os
+import re
+import stat
+import shutil
+import argparse
+import tempfile
+import subprocess
+from pathlib import Path
+from code_fixer import run_autopep
+
+
# Command-line help text: handed to argparse.ArgumentParser as the parser
# description in parse_command_line(). It lists the steps the tool performs.
description = """
This module helps with merging the changes from the master branch
in to the temp/docs branch. Here are the steps it takes:

1. Create a local clone of the users fork of NimbusML.

2. Create a new branch in the clone created in step (1)
 which tracks the temp/docs branch of the official
 NimbusML repository.

3. Remove all the tracked files from the local branch
 created in step (2).

4. Create a local clone of the master branch of the official
 NimbusML repository and checkout the specified commit
 (default is HEAD).

5. Copy all the tracked files from (4) in to (2).

6. Modify the files in (2) to be compatible with the
 documentation requirements.
"""
+
+
# URL of the official NimbusML repository. It is registered as the
# 'upstream' remote of the target clone (init_target_repo) and cloned
# directly to obtain the master sources (get_master_repo).
NIMBUSML_GIT_URL = 'https://github.com/microsoft/NimbusML.git'

# Directories, relative to the repository root, whose non-__init__ .py
# files are renamed with a leading underscore by rename_entrypoints().
# The paths are written with backslash separators.
# This list should not contain 'core/...' dirs.
# Subdirectories will not be automatically traversed
# and need to be explicitly added to this list.
ENTRYPOINT_DIRS = [
    r'src\python\nimbusml\cluster',
    r'src\python\nimbusml\decomposition',
    r'src\python\nimbusml\ensemble',
    r'src\python\nimbusml\ensemble\booster',
    r'src\python\nimbusml\ensemble\feature_selector',
    r'src\python\nimbusml\ensemble\output_combiner',
    r'src\python\nimbusml\ensemble\sub_model_selector',
    r'src\python\nimbusml\ensemble\sub_model_selector\diversity_measure',
    r'src\python\nimbusml\ensemble\subset_selector',
    r'src\python\nimbusml\feature_extraction',
    r'src\python\nimbusml\feature_extraction\categorical',
    r'src\python\nimbusml\feature_extraction\image',
    r'src\python\nimbusml\feature_extraction\text',
    r'src\python\nimbusml\feature_extraction\text\extractor',
    r'src\python\nimbusml\feature_extraction\text\stopwords',
    r'src\python\nimbusml\feature_selection',
    r'src\python\nimbusml\linear_model',
    r'src\python\nimbusml\model_selection',
    r'src\python\nimbusml\multiclass',
    r'src\python\nimbusml\naive_bayes',
    r'src\python\nimbusml\preprocessing',
    r'src\python\nimbusml\preprocessing\filter',
    r'src\python\nimbusml\preprocessing\missing_values',
    r'src\python\nimbusml\preprocessing\normalization',
    r'src\python\nimbusml\preprocessing\schema',
    r'src\python\nimbusml\preprocessing\text',
    r'src\python\nimbusml\timeseries',
]
+
+
def print_title(message):
    """Print ``message`` as a banner framed by two rules of 50 dashes.

    The banner is preceded by two newlines so that it stands apart from
    whatever was printed before it.
    """
    rule = '-' * 50
    banner_lines = ('\n', rule, message, rule)
    print(*banner_lines, sep='\n')
+
+
def get_dir_entries(directory, names_to_ignore=None, paths_to_ignore=None):
    """List the immediate files and sub-directories of ``directory``.

    The listing is not recursive.

    :param directory: directory to scan.
    :param names_to_ignore: entry names to skip (exact match on the name).
    :param paths_to_ignore: strings; an entry is skipped when any of them
        occurs as a substring of the entry's path.
    :return: a ``(files, sub_dirs)`` tuple of dicts mapping each entry name
        to its ``os.DirEntry``.
    """
    skipped_names = names_to_ignore or []
    skipped_fragments = paths_to_ignore or []

    files, sub_dirs = {}, {}

    with os.scandir(directory) as scanner:
        for entry in scanner:
            if entry.name in skipped_names:
                continue

            if any(fragment in entry.path for fragment in skipped_fragments):
                continue

            # Entries that are neither a file nor a directory are dropped.
            if entry.is_file():
                files[entry.name] = entry
            elif entry.is_dir():
                sub_dirs[entry.name] = entry

    return files, sub_dirs
+
+
def rmdir(path):
    """Recursively delete the directory tree at ``path``.

    When removing an entry fails, the entry is made writable and the
    failed operation is attempted once more.
    """

    def _make_writable_and_retry(failed_func, failed_path, _exc_info):
        # shutil.rmtree passes in the function that raised and the path
        # it was working on; clear the read-only bit and call it again.
        os.chmod(failed_path, stat.S_IWRITE)
        failed_func(failed_path)

    shutil.rmtree(path, onerror=_make_writable_and_retry)
+
+
def replace_file_contents(file_path, old, new, is_re=False):
    """Rewrite ``file_path`` with every occurrence of ``old`` replaced by ``new``.

    :param file_path: text file to modify in place.
    :param old: literal text to look for, or a regular expression pattern
        when ``is_re`` is true.
    :param new: replacement text (an ``re.sub`` replacement template when
        ``is_re`` is true, so group references such as ``\\g<1>`` work).
    :param is_re: treat ``old`` as a regular expression instead of a
        literal string.
    """
    with open(file_path, 'rt') as source:
        original = source.read()

    updated = re.sub(old, new, original) if is_re else original.replace(old, new)

    # The file is always written back, even when nothing matched.
    with open(file_path, 'wt') as target:
        target.write(updated)
+
+
def init_target_repo(repo_dir, fork_git_url, branch_name):
    """Create a fresh clone of the fork with a branch tracking upstream temp/docs.

    Any directory already present at ``repo_dir`` is deleted first. The
    repository at ``fork_git_url`` is then cloned into ``repo_dir``, the
    official NimbusML repository is added as the ``upstream`` remote, and
    ``branch_name`` is created and checked out tracking
    ``upstream/temp/docs``.

    :param repo_dir: directory in which the clone is created.
    :param fork_git_url: URL of the repository to clone.
    :param branch_name: name of the local branch to create.
    :raises subprocess.CalledProcessError: if any git command fails.
    """
    def git(*args):
        # cwd= runs git inside the clone without touching the process-wide
        # working directory, so a failure cannot leave the caller stranded
        # in repo_dir. check=True stops at the first failing command: the
        # later steps of this tool delete and rewrite files and must not
        # run against a half-initialised repository.
        subprocess.run(['git', *args], cwd=repo_dir, check=True)

    if os.path.isdir(repo_dir):
        print(f'Directory {repo_dir} already exists. Removing it...')
        rmdir(repo_dir)

    print_title(f'Cloning repository {fork_git_url} in to {repo_dir}...')
    os.mkdir(repo_dir)
    git('clone', fork_git_url, '.')
    git('remote', 'add', 'upstream', NIMBUSML_GIT_URL)

    print('\nAvailable remotes:')
    git('remote', '-v')

    print_title('Fetching upstream branches and creating local branch...')
    git('fetch', 'upstream')
    git('checkout', '-b', branch_name, '--track', 'upstream/temp/docs')

    print('\nBranches:')
    git('branch', '-vv')
+
+
def clear_repo(repo_dir):
    """Delete everything directly inside ``repo_dir`` except ``.git``.

    Top-level files are removed individually and top-level directories
    are removed recursively.
    """
    top_files, top_dirs = get_dir_entries(repo_dir, names_to_ignore=['.git'])

    for name in top_files:
        os.remove(top_files[name])

    for name in top_dirs:
        rmdir(top_dirs[name])
+
+
def git_add_all_modifications(repo_dir):
    """Stage every change in the repository at ``repo_dir`` (``git add -A``).

    :param repo_dir: working tree of the repository to stage changes in.
    :raises subprocess.CalledProcessError: if git exits with an error.
    """
    # cwd= avoids changing (and possibly failing to restore) the process
    # working directory; check=True surfaces a failed staging step instead
    # of silently finishing with nothing staged.
    subprocess.run(['git', 'add', '-A'], cwd=repo_dir, check=True)
+
+
def get_master_repo(commit=None):
    """Clone the official NimbusML repository into a new temporary directory.

    :param commit: optional commit to check out after cloning; when
        omitted the clone is left at whatever ``git clone`` checked out.
    :return: path of the temporary directory holding the clone. The
        caller is responsible for deleting it.
    :raises subprocess.CalledProcessError: if the clone or the checkout
        fails; the temporary directory is removed (best effort) first.
    """
    tmp_dir = tempfile.mkdtemp()

    commit_name = commit if commit else 'HEAD'
    print_title(f'Cloning master branch from {NIMBUSML_GIT_URL} in to {tmp_dir} at commit {commit_name}...')

    try:
        # cwd= keeps the process working directory untouched; check=True
        # prevents an empty or wrong-commit tree from being copied later.
        subprocess.run(['git', 'clone', NIMBUSML_GIT_URL, '.'],
                       cwd=tmp_dir, check=True)

        if commit:
            subprocess.run(['git', 'checkout', commit],
                           cwd=tmp_dir, check=True)
    except Exception:
        # The caller never receives tmp_dir on failure, so nobody else
        # could clean it up.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise

    return tmp_dir
+
+
def copy_to_dir(dst, src_files, src_dirs):
    """Copy the given files and directory trees into ``dst``.

    :param dst: existing destination directory.
    :param src_files: dict whose values are the files to copy (metadata
        is preserved via ``shutil.copy2``).
    :param src_dirs: dict whose values are directory entries; each tree
        is copied to ``dst/<entry.name>``.
    """
    for source_file in src_files.values():
        shutil.copy2(source_file, dst)

    for source_dir in src_dirs.values():
        target_dir = os.path.join(dst, source_dir.name)
        shutil.copytree(source_dir, target_dir)
+
+
def update_entrypoint_compiler(repo_dir):
    """Patch entrypoint_compiler.py so that ``class_file`` gets a leading underscore.

    :param repo_dir: root of the repository being modified.
    """
    print_title('Updating entrypoint_compiler...')

    tools_dir = os.path.join(repo_dir, 'src', 'python', 'tools')
    compiler_path = os.path.join(tools_dir, 'entrypoint_compiler.py')

    original_stmt = 'class_file = class_name.lower()'
    patched_stmt = "class_file = '_' + class_name.lower()"
    replace_file_contents(compiler_path, original_stmt, patched_stmt)

    print('entrypoint_compiler.py updated.')
+
+
def rename_data_dir(repo_dir):
    """Rename ``nimbusml/datasets/data`` to ``_data`` and update references to it.

    After the directory is renamed, the paths in ``nimbusml.pyproj`` and
    the ``DATA_DIR`` line(s) in ``datasets.py`` are rewritten to match.

    :param repo_dir: root of the repository being modified.
    """
    print_title('Renaming data directory...')

    python_root = os.path.join(repo_dir, 'src', 'python')
    datasets_dir = os.path.join(python_root, 'nimbusml', 'datasets')

    os.rename(os.path.join(datasets_dir, 'data'),
              os.path.join(datasets_dir, '_data'))

    pyproj_path = os.path.join(python_root, 'nimbusml.pyproj')
    replace_file_contents(pyproj_path,
                          'nimbusml\\datasets\\data\\',
                          'nimbusml\\datasets\\_data\\')

    # Update the dataset.py file to fix the data dir references
    datasets_module = os.path.join(datasets_dir, 'datasets.py')
    replace_file_contents(datasets_module,
                          r'([\r\n]DATA_DIR.+)data',
                          r'\g<1>_data',
                          is_re=True)

    print('Data directory renamed.')
+
+
def rename_entrypoint_file(dir_entry):
    """Make one public entry point module private by prefixing it with ``_``.

    Steps, in order:

    1. Rewrite the ``from ... .<module>`` import inside the public file
       so it refers to ``_<module>``.
    2. Rename the public file to ``_<name>`` and, except for ``_cv.py``,
       run autopep on it.
    3. Rewrite the matching ``from ... .<module>`` import in the
       package's ``__init__.py``.
    4. If the mirrored ``nimbusml/internal/core/...`` directory exists,
       rename the internal file of the same name as well.
    5. Update both paths in ``nimbusml.pyproj``.

    :param dir_entry: ``os.DirEntry`` of a ``.py`` file located somewhere
        below a directory named ``nimbusml``.
    """
    # splitext removes only the trailing extension, whereas
    # str.replace('.py', '') would strip every '.py' occurrence in the name.
    module_name = os.path.splitext(dir_entry.name)[0]
    print(f'Renaming module: {module_name}\n\t({dir_entry.path})\n')

    # The name is interpolated into regular expressions below; escape it
    # so it can only ever match itself literally.
    module_pattern = re.escape(module_name)
    private_name = '_' + dir_entry.name
    package_path = os.path.dirname(dir_entry)

    # Update the import statement in the public file
    replace_file_contents(dir_entry.path,
                          r'(?s)([\r\n]from\s+.*\.){0}'.format(module_pattern),
                          r'\g<1>_{0}'.format(module_name),
                          True)

    # Rename the public file to have an underscore as its first character
    new_path = os.path.join(package_path, private_name)
    os.rename(dir_entry.path, new_path)

    # Run autopep on the modified file since the modifications
    # might require new formatting which entrypoint_compiler is
    # expecting when run with the --check_manual_changes option.
    if not new_path.endswith('_cv.py'):
        run_autopep(new_path)

    # Update the import statement in __init__.py
    init_path = os.path.join(package_path, '__init__.py')
    replace_file_contents(init_path,
                          r'(^from\s+.*\.|[\r\n]from\s+.*\.){0}'.format(module_pattern),
                          r'\g<1>_{0}'.format(module_name),
                          True)

    # Split the path around the innermost 'nimbusml' component: what
    # precedes it is the directory holding nimbusml.pyproj, what follows
    # it is the sub-package path of this module.
    parts = Path(dir_entry).parts
    last_index = max(i for i, val in enumerate(parts) if val == 'nimbusml')

    base_dir = os.path.join(*parts[:last_index])
    sub_package = parts[last_index + 1:-1]
    internal_dir = os.path.join(*parts[:last_index + 1],
                                'internal', 'core', *sub_package)

    # Rename the internal file to have an underscore as its first character
    if os.path.exists(internal_dir):
        os.rename(os.path.join(internal_dir, dir_entry.name),
                  os.path.join(internal_dir, private_name))

    # Update nimbusml.pyproj with the public and internal name changes.
    # The project file stores paths with backslashes (the other pyproj
    # edits in this tool hard-code them too), so build the search strings
    # with '\\' instead of the platform separator, which would never
    # match on a non-Windows host.
    pyproj_path = os.path.join(base_dir, 'nimbusml.pyproj')
    package_rel = '\\'.join(parts[last_index:-1])
    internal_rel = '\\'.join(('nimbusml', 'internal', 'core') + sub_package)

    for rel_dir in (package_rel, internal_rel):
        replace_file_contents(pyproj_path,
                              rel_dir + '\\' + dir_entry.name,
                              rel_dir + '\\' + private_name)
+
+
def rename_entrypoints(repo_dir):
    """Rename every entry point module listed under ``ENTRYPOINT_DIRS``.

    Each ``.py`` file other than ``__init__.py`` found directly inside
    one of those directories is passed to ``rename_entrypoint_file``.

    :param repo_dir: root of the repository being modified.
    """
    print_title('Renaming entry point files...')

    for ep_dir in ENTRYPOINT_DIRS:
        # ENTRYPOINT_DIRS is written with backslashes. Joining the
        # individual components yields the same path on Windows and a
        # valid one on hosts whose separator is '/'.
        path = os.path.join(repo_dir, *ep_dir.split('\\'))
        files, _ = get_dir_entries(path)

        for dir_entry in files.values():
            if dir_entry.name.endswith('.py') and dir_entry.name != '__init__.py':
                rename_entrypoint_file(dir_entry)
+
+
def rename_pipeline(repo_dir):
    """Rename ``nimbusml/pipeline.py`` to ``_pipeline.py`` and fix its references.

    The import in ``__init__.py`` and ``__init__.py.in``, the path in
    ``nimbusml.pyproj`` and the import in
    ``tests/test_syntax_expected_failures.py`` are rewritten.

    :param repo_dir: root of the repository being modified.
    """
    python_root = os.path.join(repo_dir, 'src', 'python')
    package_root = os.path.join(python_root, 'nimbusml')

    os.rename(os.path.join(package_root, 'pipeline.py'),
              os.path.join(package_root, '_pipeline.py'))

    # Both the package initialiser and its template import Pipeline.
    for init_name in ('__init__.py', '__init__.py.in'):
        replace_file_contents(os.path.join(package_root, init_name),
                              'from .pipeline import Pipeline',
                              'from ._pipeline import Pipeline')

    pyproj_path = os.path.join(python_root, 'nimbusml.pyproj')
    replace_file_contents(pyproj_path,
                          r'nimbusml\pipeline.py',
                          r'nimbusml\_pipeline.py')

    failures_test = os.path.join(package_root, 'tests',
                                 'test_syntax_expected_failures.py')
    replace_file_contents(failures_test,
                          'from nimbusml.pipeline import TrainedWarning',
                          'from nimbusml._pipeline import TrainedWarning')
+
+
# TODO: the fixes in this method shouldn't be necessary.
def fix_files(repo_dir):
    """Append a newline after the ``__all__`` line of the two stop-words modules.

    :param repo_dir: root of the repository being modified.
    """
    stopwords_dir = os.path.join(repo_dir, 'src', 'python', 'nimbusml',
                                 'feature_extraction', 'text', 'stopwords')

    # Each module is named after its class: '_' + lower-cased class name.
    for class_name in ('CustomStopWordsRemover', 'PredefinedStopWordsRemover'):
        module_path = os.path.join(stopwords_dir,
                                   '_' + class_name.lower() + '.py')
        all_stmt = '__all__ = ["' + class_name + '"]'
        replace_file_contents(module_path, all_stmt, all_stmt + '\n')
+
+
def parse_command_line():
    """Parse the command line of this tool.

    :return: the ``argparse`` namespace with ``repo_dir``,
        ``fork_git_url``, ``branch_name`` and the optional ``commit``.
    """
    parser = argparse.ArgumentParser(description=description)

    # Required positional arguments, in the order they must be given.
    positional_arguments = (
        ('repo_dir',
         'The location on disk where to create the new local '
         'repo which will contain the updated temp/docs branch.'),
        ('fork_git_url',
         'The url to use for the local repository. This will usually '
         'be the users forked repository.'),
        ('branch_name',
         'The name of the new branch which will track temp/docs. '
         'This branch will be created in the locally cloned copy of the '
         'repo pointed to by fork_git_url.'),
    )
    for arg_name, help_text in positional_arguments:
        parser.add_argument(arg_name, help=help_text, type=str)

    parser.add_argument('-c', '--commit',
                        help='The latest commit to include in the changes '
                             'for the new local temp/docs branch.',
                        type=str)

    return parser.parse_args()
+
+
def main():
    """Build the updated temp/docs branch in a fresh local clone.

    Clones the fork, empties its working tree, copies in the files of the
    official master branch (optionally at a given commit), applies the
    documentation-specific renames and fixes, and stages the result.
    """
    args = parse_command_line()

    repo_dir = Path(args.repo_dir).resolve()

    init_target_repo(repo_dir,
                     args.fork_git_url,
                     args.branch_name)

    clear_repo(repo_dir)

    master_repo_dir = get_master_repo(args.commit)
    try:
        entries = get_dir_entries(master_repo_dir, names_to_ignore=['.git'])
        copy_to_dir(repo_dir, *entries)
    finally:
        # The master clone lives in a temporary directory; remove it even
        # when listing or copying fails so it is not left behind.
        rmdir(master_repo_dir)

    update_entrypoint_compiler(repo_dir)
    rename_data_dir(repo_dir)
    rename_entrypoints(repo_dir)
    rename_pipeline(repo_dir)

    fix_files(repo_dir)

    git_add_all_modifications(repo_dir)


if __name__ == '__main__':
    main()