From da5fa46ca3d114ca0a503d6a29dd530fe504c6f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sat, 13 Feb 2021 10:52:34 +0200 Subject: [PATCH 01/26] Drop using python multiprocessing pool. --- embuilder.py | 13 ++- tests/runner.py | 2 +- tools/building.py | 263 +----------------------------------------- tools/js_optimizer.py | 19 +-- tools/shared.py | 43 +++++++ tools/system_libs.py | 56 +++++---- 6 files changed, 90 insertions(+), 306 deletions(-) diff --git a/embuilder.py b/embuilder.py index 468ec2e72c7eb..0a4c2830da87e 100755 --- a/embuilder.py +++ b/embuilder.py @@ -15,6 +15,7 @@ import argparse import logging import sys +import time from tools import shared from tools import system_libs @@ -112,6 +113,9 @@ def build_port(port_name): def main(): global force + + all_build_start_time = time.time() + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, epilog=get_help()) @@ -166,6 +170,7 @@ def main(): print('Building targets: %s' % ' '.join(tasks)) for what in tasks: logger.info('building and verifying ' + what) + start_time = time.time() if what in SYSTEM_LIBRARIES: library = SYSTEM_LIBRARIES[what] if force: @@ -260,7 +265,13 @@ def main(): logger.error('unfamiliar build target: ' + what) return 1 - logger.info('...success') + time_taken = time.time() - start_time + logger.info('...success. Took %s(%.2fs)' % (('%02d:%02d mins ' % (time_taken // 60, time_taken % 60) if time_taken >= 60 else ''), time_taken)) + + if len(tasks) > 1: + all_build_time_taken = time.time() - all_build_start_time + logger.info('Built %d targets in %s(%.2fs)' % (len(tasks), ('%02d:%02d mins ' % (all_build_time_taken // 60, all_build_time_taken % 60) if all_build_time_taken >= 60 else ''), all_build_time_taken)) + return 0 diff --git a/tests/runner.py b/tests/runner.py index 871701598d787..0096597e1f8b7 100755 --- a/tests/runner.py +++ b/tests/runner.py @@ -791,7 +791,7 @@ def get_library(self, name, generated_libs, configure=['sh', './configure'], configure_args=[], make=['make'], make_args=None, env_init={}, cache_name_extra='', native=False): if make_args is None: - make_args = ['-j', str(building.get_num_cores())] + make_args = ['-j', str(shared.get_num_cores())] build_dir = self.get_build_dir() output_dir = self.get_dir() diff --git a/tools/building.py b/tools/building.py index d436174c78e57..ffdcd46931f21 100644 --- a/tools/building.py +++ b/tools/building.py @@ -6,7 +6,6 @@ import atexit import json import logging -import multiprocessing import os import re import shlex @@ -36,7 +35,6 @@ logger = logging.getLogger('building') # Building -multiprocessing_pool = None binaryen_checked = False EXPECTED_BINARYEN_VERSION = 100 @@ -119,15 +117,6 @@ def extract_archive_contents(archive_file): } -def g_multiprocessing_initializer(*args): - for item in args: - (key, value) = item.split('=', 1) - if key == 'EMCC_POOL_CWD': - os.chdir(value) - else: - os.environ[key] = value - - def unique_ordered(values): """return a list of unique values in an input list, without changing order (list(set(.)) would change order randomly). @@ -152,74 +141,6 @@ def clear(): _is_ar_cache.clear() -def get_num_cores(): - return int(os.environ.get('EMCC_CORES', multiprocessing.cpu_count())) - - -# Multiprocessing pools are very slow to build up and tear down, and having -# several pools throughout the application has a problem of overallocating -# child processes. Therefore maintain a single centralized pool that is shared -# between all pooled task invocations. 
-def get_multiprocessing_pool(): - global multiprocessing_pool - if not multiprocessing_pool: - cores = get_num_cores() - - # If running with one core only, create a mock instance of a pool that does not - # actually spawn any new subprocesses. Very useful for internal debugging. - if cores == 1: - class FakeMultiprocessor(object): - def map(self, func, tasks, *args, **kwargs): - results = [] - for t in tasks: - results += [func(t)] - return results - - def map_async(self, func, tasks, *args, **kwargs): - class Result: - def __init__(self, func, tasks): - self.func = func - self.tasks = tasks - - def get(self, timeout): - results = [] - for t in tasks: - results += [func(t)] - return results - - return Result(func, tasks) - - multiprocessing_pool = FakeMultiprocessor() - else: - child_env = [ - # Multiprocessing pool children must have their current working - # directory set to a safe path that is guaranteed not to die in - # between of executing commands, or otherwise the pool children will - # have trouble spawning subprocesses of their own. - 'EMCC_POOL_CWD=' + path_from_root(), - # Multiprocessing pool children can't spawn their own linear number of - # children, that could cause a quadratic amount of spawned processes. - 'EMCC_CORES=1' - ] - multiprocessing_pool = multiprocessing.Pool(processes=cores, initializer=g_multiprocessing_initializer, initargs=child_env) - - def close_multiprocessing_pool(): - global multiprocessing_pool - try: - # Shut down the pool explicitly, because leaving that for Python to do at process shutdown is buggy and can generate - # noisy "WindowsError: [Error 5] Access is denied" spam which is not fatal. - multiprocessing_pool.terminate() - multiprocessing_pool.join() - multiprocessing_pool = None - except OSError as e: - # Mute the "WindowsError: [Error 5] Access is denied" errors, raise all others through - if not (sys.platform.startswith('win') and isinstance(e, WindowsError) and e.winerror == 5): - raise - atexit.register(close_multiprocessing_pool) - - return multiprocessing_pool - - # .. but for Popen, we cannot have doublequotes, so provide functionality to # remove them when needed. def remove_quotes(arg): @@ -358,46 +279,6 @@ def llvm_nm(file): return llvm_nm_multiple([file])[0] -def read_link_inputs(files): - with ToolchainProfiler.profile_block('read_link_inputs'): - # Before performing the link, we need to look at each input file to determine which symbols - # each of them provides. Do this in multiple parallel processes. - archive_names = [] # .a files passed in to the command line to the link - object_names = [] # .o/.bc files passed in to the command line to the link - for f in files: - absolute_path_f = make_paths_absolute(f) - - if absolute_path_f not in ar_contents and is_ar(absolute_path_f): - archive_names.append(absolute_path_f) - elif absolute_path_f not in nm_cache and is_bitcode(absolute_path_f): - object_names.append(absolute_path_f) - - # Archives contain objects, so process all archives first in parallel to obtain the object files in them. 
- pool = get_multiprocessing_pool() - object_names_in_archives = pool.map(extract_archive_contents, archive_names) - - def clean_temporary_archive_contents_directory(directory): - def clean_at_exit(): - try_delete(directory) - if directory: - atexit.register(clean_at_exit) - - for n in range(len(archive_names)): - if object_names_in_archives[n]['returncode'] != 0: - raise Exception('llvm-ar failed on archive ' + archive_names[n] + '!') - ar_contents[archive_names[n]] = object_names_in_archives[n]['files'] - clean_temporary_archive_contents_directory(object_names_in_archives[n]['dir']) - - for o in object_names_in_archives: - for f in o['files']: - if f not in nm_cache: - object_names.append(f) - - # Next, extract symbols from all object files (either standalone or inside archives we just extracted) - # The results are not used here directly, but populated to llvm-nm cache structure. - llvm_nm_multiple(object_names) - - def llvm_backend_args(): # disable slow and relatively unimportant optimization passes args = ['-combiner-global-alias-analysis=false'] @@ -425,11 +306,7 @@ def llvm_backend_args(): def link_to_object(linker_inputs, target): - # link using lld unless LTO is requested (lld can't output LTO/bitcode object files). - if not Settings.LTO: - link_lld(linker_inputs + ['--relocatable'], target) - else: - link_bitcode(linker_inputs, target) + link_lld(linker_inputs + ['--relocatable'], target) def link_llvm(linker_inputs, target): @@ -557,144 +434,6 @@ def link_lld(args, target, external_symbol_list=None): check_call(cmd) -def link_bitcode(files, target, force_archive_contents=False): - # "Full-featured" linking: looks into archives (duplicates lld functionality) - actual_files = [] - # Tracking unresolveds is necessary for .a linking, see below. - # Specify all possible entry points to seed the linking process. - # For a simple application, this would just be "main". - unresolved_symbols = set([func[1:] for func in Settings.EXPORTED_FUNCTIONS]) - resolved_symbols = set() - # Paths of already included object files from archives. - added_contents = set() - has_ar = False - for f in files: - if not f.startswith('-'): - has_ar = has_ar or is_ar(make_paths_absolute(f)) - - # If we have only one archive or the force_archive_contents flag is set, - # then we will add every object file we see, regardless of whether it - # resolves any undefined symbols. - force_add_all = len(files) == 1 or force_archive_contents - - # Considers an object file for inclusion in the link. The object is included - # if force_add=True or if the object provides a currently undefined symbol. - # If the object is included, the symbol tables are updated and the function - # returns True. - def consider_object(f, force_add=False): - new_symbols = llvm_nm(f) - # Check if the object was valid according to llvm-nm. It also accepts - # native object files. - if not new_symbols.is_valid_for_nm(): - diagnostics.warning('emcc', 'object %s is not valid according to llvm-nm, cannot link', f) - return False - # Check the object is valid for us, and not a native object file. 
- if not is_bitcode(f): - exit_with_error('unknown file type: %s', f) - provided = new_symbols.defs.union(new_symbols.commons) - do_add = force_add or not unresolved_symbols.isdisjoint(provided) - if do_add: - logger.debug('adding object %s to link (forced: %d)' % (f, force_add)) - # Update resolved_symbols table with newly resolved symbols - resolved_symbols.update(provided) - # Update unresolved_symbols table by adding newly unresolved symbols and - # removing newly resolved symbols. - unresolved_symbols.update(new_symbols.undefs.difference(resolved_symbols)) - unresolved_symbols.difference_update(provided) - actual_files.append(f) - return do_add - - # Traverse a single archive. The object files are repeatedly scanned for - # newly satisfied symbols until no new symbols are found. Returns true if - # any object files were added to the link. - def consider_archive(f, force_add): - added_any_objects = False - loop_again = True - logger.debug('considering archive %s' % (f)) - contents = ar_contents[f] - while loop_again: # repeatedly traverse until we have everything we need - loop_again = False - for content in contents: - if content in added_contents: - continue - # Link in the .o if it provides symbols, *or* this is a singleton archive (which is - # apparently an exception in gcc ld) - if consider_object(content, force_add=force_add): - added_contents.add(content) - loop_again = True - added_any_objects = True - logger.debug('done running loop of archive %s' % (f)) - return added_any_objects - - read_link_inputs([x for x in files if not x.startswith('-')]) - - # Rescan a group of archives until we don't find any more objects to link. - def scan_archive_group(group): - loop_again = True - logger.debug('starting archive group loop') - while loop_again: - loop_again = False - for archive in group: - if consider_archive(archive, force_add=False): - loop_again = True - logger.debug('done with archive group loop') - - current_archive_group = None - in_whole_archive = False - for f in files: - absolute_path_f = make_paths_absolute(f) - if f.startswith('-'): - if f in ['--start-group', '-(']: - assert current_archive_group is None, 'Nested --start-group, missing --end-group?' - current_archive_group = [] - elif f in ['--end-group', '-)']: - assert current_archive_group is not None, '--end-group without --start-group' - scan_archive_group(current_archive_group) - current_archive_group = None - elif f in ['--whole-archive', '-whole-archive']: - in_whole_archive = True - elif f in ['--no-whole-archive', '-no-whole-archive']: - in_whole_archive = False - else: - # Command line flags should already be vetted by the time this method - # is called, so this is an internal error - assert False, 'unsupported link flag: ' + f - elif is_ar(absolute_path_f): - # Extract object files from ar archives, and link according to gnu ld semantics - # (link in an entire .o from the archive if it supplies symbols still unresolved) - consider_archive(absolute_path_f, in_whole_archive or force_add_all) - # If we're inside a --start-group/--end-group section, add to the list - # so we can loop back around later. - if current_archive_group is not None: - current_archive_group.append(absolute_path_f) - elif is_bitcode(absolute_path_f): - if has_ar: - consider_object(f, force_add=True) - else: - # If there are no archives then we can simply link all valid object - # files and skip the symbol table stuff. 
- actual_files.append(f) - else: - exit_with_error('unknown file type: %s', f) - - # We have to consider the possibility that --start-group was used without a matching - # --end-group; GNU ld permits this behavior and implicitly treats the end of the - # command line as having an --end-group. - if current_archive_group: - logger.debug('--start-group without matching --end-group, rescanning') - scan_archive_group(current_archive_group) - current_archive_group = None - - try_delete(target) - - # Finish link - # tolerate people trying to link a.so a.so etc. - actual_files = unique_ordered(actual_files) - - logger.debug('emcc: linking: %s to %s', actual_files, target) - link_llvm(actual_files, target) - - def get_command_with_possible_response_file(cmd): # 8k is a bit of an arbitrary limit, but a reasonable one # for max command line size before we use a response file diff --git a/tools/js_optimizer.py b/tools/js_optimizer.py index f932bb01224e7..e324277e936e6 100755 --- a/tools/js_optimizer.py +++ b/tools/js_optimizer.py @@ -295,7 +295,7 @@ def check_symbol_mapping(p): with ToolchainProfiler.profile_block('js_optimizer.split_to_chunks'): # if we are making source maps, we want our debug numbering to start from the # top of the file, so avoid breaking the JS into chunks - cores = building.get_num_cores() + cores = shared.get_num_cores() if not just_split: intended_num_chunks = int(round(cores * NUM_CHUNKS_PER_CORE)) @@ -330,22 +330,7 @@ def write_chunk(chunk, i): with ToolchainProfiler.profile_block('run_optimizer'): if len(filenames): commands = [config.NODE_JS + [ACORN_OPTIMIZER, f] + passes for f in filenames] - - cores = min(cores, len(filenames)) - if len(chunks) > 1 and cores >= 2: - # We can parallelize - if DEBUG: - print('splitting up js optimization into %d chunks, using %d cores (total: %.2f MB)' % (len(chunks), cores, total_size / (1024 * 1024.)), file=sys.stderr) - with ToolchainProfiler.profile_block('optimizer_pool'): - pool = building.get_multiprocessing_pool() - filenames = pool.map(run_on_chunk, commands, chunksize=1) - else: - # We can't parallize, but still break into chunks to avoid node memory issues - if len(chunks) > 1 and DEBUG: - print('splitting up js optimization into %d chunks' % (len(chunks)), file=sys.stderr) - filenames = [run_on_chunk(command) for command in commands] - else: - filenames = [] + filenames = shared.run_multiple_processes(commands, route_stdout_to_temp_files_suffix='js_opt.jo.js') for filename in filenames: temp_files.note(filename) diff --git a/tools/shared.py b/tools/shared.py index f32031d9caaa4..fa9e7eed98414 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -32,6 +32,14 @@ from . 
import filelock +import signal + +def signal_handler(sig, frame): + sys.exit(1) + +signal.signal(signal.SIGINT, signal_handler) + + DEBUG = int(os.environ.get('EMCC_DEBUG', '0')) DEBUG_SAVE = DEBUG or int(os.environ.get('EMCC_DEBUG_SAVE', '0')) EXPECTED_NODE_VERSION = (4, 1, 1) @@ -96,6 +104,41 @@ def run_process(cmd, check=True, input=None, *args, **kw): return ret +def get_num_cores(): + import multiprocessing + return int(os.environ.get('EMCC_CORES', multiprocessing.cpu_count())) + + +def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_suffix=None, pipe_stdout=False): + std_outs = [] + with ToolchainProfiler.profile_block('parallel_run_js_optimizers'): + processes = [] + start = 0 + end = 0 + num_parallel_processes = get_num_cores() + temp_files = configuration.get_temp_files() + while start < len(commands): + if start + num_parallel_processes > end and end < len(commands): # Spawn a new process? + std_out = temp_files.get(route_stdout_to_temp_files_suffix) if route_stdout_to_temp_files_suffix else (subprocess.PIPE if pipe_stdout else None) + if DEBUG: + logger.debug('Running subprocess %d/%d: %s' % (end + 1, len(commands), ' '.join(commands[end]))) + processes += [subprocess.Popen(commands[end], stdout=std_out, env=child_env if child_env else os.environ.copy())] + if route_stdout_to_temp_files_suffix: + std_outs += [std_out.name] + elif pipe_stdout: + std_outs += [std_out] + end += 1 + else: + # Too many commands running in parallel, wait for one to finish. + out, err = processes[start].communicate() + if processes[start].returncode != 0: + if out: logger.info(out.decode('UTF-8')) + if err: logger.error(err.decode('UTF-8')) + raise Exception('Subprocess %d/%d failed with return code %d!' % (start + 1, len(commands), processes[start].returncode)) + start += 1 + return std_outs + + def check_call(cmd, *args, **kw): """Like `run_process` above but treat failures as fatal and exit_with_error.""" print_compiler_stage(cmd) diff --git a/tools/system_libs.py b/tools/system_libs.py index bcb585bc7061e..74a088e9fb079 100644 --- a/tools/system_libs.py +++ b/tools/system_libs.py @@ -13,6 +13,7 @@ import sys from glob import iglob +from .toolchain_profiler import ToolchainProfiler from . import shared, building, ports, config, utils from . import deps_info, tempfiles from . import diagnostics @@ -78,35 +79,39 @@ def clean_env(): return safe_env -def run_one_command(cmd): - # Helper function used by run_build_commands. - if shared.EM_BUILD_VERBOSE: - print(shared.shlex_join(cmd)) - # TODO(sbc): Remove this one we remove the test_em_config_env_var test - cmd.append('-Wno-deprecated') - try: - shared.run_process(cmd, env=clean_env()) - except subprocess.CalledProcessError as e: - print("'%s' failed (%d)" % (shared.shlex_join(e.cmd), e.returncode)) - raise - - def run_build_commands(commands): # Before running a set of build commands make sure the common sysroot # headers are installed. This prevents each sub-process from attempting # to setup the sysroot itself. 
ensure_sysroot() - cores = min(len(commands), building.get_num_cores()) - if cores <= 1 or shared.DEBUG: - for command in commands: - run_one_command(command) - else: - pool = building.get_multiprocessing_pool() - # https://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool - # https://bugs.python.org/issue8296 - # 999999 seconds (about 11 days) is reasonably huge to not trigger actual timeout - # and is smaller than the maximum timeout value 4294967.0 for Python 3 on Windows (threading.TIMEOUT_MAX) - pool.map_async(run_one_command, commands, chunksize=1).get(999999) + + + + safe_env = clean_env() # We already did a sanity check launching the compiler once, no need to launch the compiler + # again on each child subprocess spawn. + safe_env['EMCC_SKIP_SANITY_CHECK'] = '1' + + # If we got spawned by ccache, then launch subprocesses in ccache as well. + if 'EMCC_CCACHE_' in safe_env: + safe_env['EMCC_CCACHE'] = '1' + + for i in range(len(commands)): + # TODO(sbc): Remove this one we remove the test_em_config_env_var test + commands[i].append('-Wno-deprecated') + + # For subprocess spawns, do not route via the OS batch script launcher, but directly + # spawn the python script. This saves ~2 seconds on libc build. + # However if we are using ccache, we must use the wrappers, since they dispatch + # execution to ccache executable. + if 'EMCC_CCACHE' not in safe_env: + if commands[i][0].endswith('emcc.bat'): + commands[i][0] = commands[i][0].replace('emcc.bat', 'emcc.py') + commands[i] = [sys.executable] + commands[i] + elif commands[i][0].endswith('emcc'): + commands[i][0] = commands[i][0].replace('emcc', 'emcc.py') + commands[i] = [sys.executable] + commands[i] + + shared.run_multiple_processes(commands) def create_lib(libname, inputs): @@ -1962,4 +1967,5 @@ def install_system_headers(stamp): def ensure_sysroot(): - shared.Cache.get('sysroot_install.stamp', install_system_headers, what='system headers') + with ToolchainProfiler.profile_block('ensure_sysroot'): + shared.Cache.get('sysroot_install.stamp', install_system_headers, what='system headers') From a5d8ed7711d7be82d85241e401f9435d7966250e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 14 Feb 2021 10:29:23 +0200 Subject: [PATCH 02/26] Change llvm_nm_multiple() to use run_multiple_processes() --- tools/building.py | 36 +++++++++++++++++++----------------- tools/shared.py | 8 ++++---- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/tools/building.py b/tools/building.py index ffdcd46931f21..87927f8a97e42 100644 --- a/tools/building.py +++ b/tools/building.py @@ -206,17 +206,26 @@ def llvm_nm_multiple(files): with ToolchainProfiler.profile_block('llvm_nm_multiple'): if len(files) == 0: return [] - # Run llvm-nm on files that we haven't cached yet + + # Run llvm-nm only files that we haven't cached yet llvm_nm_files = [f for f in files if f not in nm_cache] # We can issue multiple files in a single llvm-nm calls, but only if those # files are all .o or .bc files. Because of llvm-nm output format, we cannot # llvm-nm multiple .a files in one call, but those must be individually checked. 
- if len(llvm_nm_files) > 1: - llvm_nm_files = [f for f in files if f.endswith('.o') or f.endswith('.bc')] - if len(llvm_nm_files) > 0: - cmd = [LLVM_NM] + llvm_nm_files + o_files = [f for f in llvm_nm_files if os.path.splitext(f)[1].lower() in ['.o', '.obj', '.bc']] + a_files = [f for f in llvm_nm_files if f not in o_files] + + # Issue parallel calls for .a files + if len(a_files) > 0: + results = shared.run_multiple_processes([[LLVM_NM, a] for a in a_files], pipe_stdout=True, check=False) + for i in range(len(results)): + nm_cache[a_files[i]] = parse_symbols(results[i]) + + # Issue a single call for multiple .o files + if len(o_files) > 0: + cmd = [LLVM_NM] + o_files cmd = get_command_with_possible_response_file(cmd) results = run_process(cmd, stdout=PIPE, stderr=PIPE, check=False) @@ -240,11 +249,11 @@ def llvm_nm_multiple(files): # so loop over the report to extract the results # for each individual file. - filename = llvm_nm_files[0] + filename = o_files[0] # When we dispatched more than one file, we must manually parse # the file result delimiters (like shown structured above) - if len(llvm_nm_files) > 1: + if len(o_files) > 1: file_start = 0 i = 0 @@ -261,18 +270,11 @@ def llvm_nm_multiple(files): nm_cache[filename] = parse_symbols(results[file_start:]) else: - # We only dispatched a single file, we can just parse that directly - # to the output. + # We only dispatched a single file, so can parse all of the result directly + # to that file. nm_cache[filename] = parse_symbols(results) - # Any .a files that have multiple .o files will have hard time parsing. Scan those - # sequentially to confirm. TODO: Move this to use run_multiple_processes() - # when available. - for f in files: - if f not in nm_cache: - nm_cache[f] = llvm_nm(f) - - return [nm_cache[f] for f in files] + return [nm_cache[f] if f in nm_cache else ObjectFileInfo(1, '') for f in files] def llvm_nm(file): diff --git a/tools/shared.py b/tools/shared.py index fa9e7eed98414..fe53bb490f5f0 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -109,7 +109,7 @@ def get_num_cores(): return int(os.environ.get('EMCC_CORES', multiprocessing.cpu_count())) -def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_suffix=None, pipe_stdout=False): +def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_suffix=None, pipe_stdout=False, check=True): std_outs = [] with ToolchainProfiler.profile_block('parallel_run_js_optimizers'): processes = [] @@ -125,13 +125,13 @@ def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_ processes += [subprocess.Popen(commands[end], stdout=std_out, env=child_env if child_env else os.environ.copy())] if route_stdout_to_temp_files_suffix: std_outs += [std_out.name] - elif pipe_stdout: - std_outs += [std_out] end += 1 else: # Too many commands running in parallel, wait for one to finish. out, err = processes[start].communicate() - if processes[start].returncode != 0: + if pipe_stdout: + std_outs += out.decode('UTF-8') + if check and processes[start].returncode != 0: if out: logger.info(out.decode('UTF-8')) if err: logger.error(err.decode('UTF-8')) raise Exception('Subprocess %d/%d failed with return code %d!' 
% (start + 1, len(commands), processes[start].returncode)) From b9685b012c99add6f9f2e9eee803c6b562e35acf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 14 Feb 2021 10:34:55 +0200 Subject: [PATCH 03/26] flake --- tools/building.py | 1 - tools/shared.py | 22 ++++++++++++---------- tools/system_libs.py | 1 - 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/building.py b/tools/building.py index 87927f8a97e42..e3a05b0a34478 100644 --- a/tools/building.py +++ b/tools/building.py @@ -3,7 +3,6 @@ # University of Illinois/NCSA Open Source License. Both these licenses can be # found in the LICENSE file. -import atexit import json import logging import os diff --git a/tools/shared.py b/tools/shared.py index fe53bb490f5f0..86377779f9a8b 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -32,14 +32,6 @@ from . import filelock -import signal - -def signal_handler(sig, frame): - sys.exit(1) - -signal.signal(signal.SIGINT, signal_handler) - - DEBUG = int(os.environ.get('EMCC_DEBUG', '0')) DEBUG_SAVE = DEBUG or int(os.environ.get('EMCC_DEBUG_SAVE', '0')) EXPECTED_NODE_VERSION = (4, 1, 1) @@ -111,6 +103,14 @@ def get_num_cores(): def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_suffix=None, pipe_stdout=False, check=True): std_outs = [] + + # TODO: Experiment with registering a signal handler here to see if that helps with Ctrl-C locking up the command prompt + # when multiple child processes have been spawned. + #import signal + #def signal_handler(sig, frame): + # sys.exit(1) + #signal.signal(signal.SIGINT, signal_handler) + with ToolchainProfiler.profile_block('parallel_run_js_optimizers'): processes = [] start = 0 @@ -132,8 +132,10 @@ def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_ if pipe_stdout: std_outs += out.decode('UTF-8') if check and processes[start].returncode != 0: - if out: logger.info(out.decode('UTF-8')) - if err: logger.error(err.decode('UTF-8')) + if out: + logger.info(out.decode('UTF-8')) + if err: + logger.error(err.decode('UTF-8')) raise Exception('Subprocess %d/%d failed with return code %d!' % (start + 1, len(commands), processes[start].returncode)) start += 1 return std_outs diff --git a/tools/system_libs.py b/tools/system_libs.py index 74a088e9fb079..513560ce3a31a 100644 --- a/tools/system_libs.py +++ b/tools/system_libs.py @@ -9,7 +9,6 @@ import logging import os import shutil -import subprocess import sys from glob import iglob From 806ac77964072cf8d0333832e4a78c85f7f6a548 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 14 Feb 2021 10:37:14 +0200 Subject: [PATCH 04/26] fix stdout pipe --- tools/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/shared.py b/tools/shared.py index 86377779f9a8b..b6485d99f6579 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -130,7 +130,7 @@ def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_ # Too many commands running in parallel, wait for one to finish. 
out, err = processes[start].communicate() if pipe_stdout: - std_outs += out.decode('UTF-8') + std_outs += [out.decode('UTF-8')] if check and processes[start].returncode != 0: if out: logger.info(out.decode('UTF-8')) From 2ac197c1f6d1b58259514166941704a782560ea0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 14 Feb 2021 10:40:30 +0200 Subject: [PATCH 05/26] flake --- tools/shared.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/shared.py b/tools/shared.py index b6485d99f6579..ed67e6826a6b5 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -106,10 +106,10 @@ def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_ # TODO: Experiment with registering a signal handler here to see if that helps with Ctrl-C locking up the command prompt # when multiple child processes have been spawned. - #import signal - #def signal_handler(sig, frame): - # sys.exit(1) - #signal.signal(signal.SIGINT, signal_handler) + # import signal + # def signal_handler(sig, frame): + # sys.exit(1) + # signal.signal(signal.SIGINT, signal_handler) with ToolchainProfiler.profile_block('parallel_run_js_optimizers'): processes = [] From 76f7fca226b855d97b8a7f84a60a9d578a091730 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 14 Feb 2021 10:43:37 +0200 Subject: [PATCH 06/26] Profile block string run_multiple_processes --- tools/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/shared.py b/tools/shared.py index ed67e6826a6b5..27dde79f21a78 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -111,7 +111,7 @@ def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_ # sys.exit(1) # signal.signal(signal.SIGINT, signal_handler) - with ToolchainProfiler.profile_block('parallel_run_js_optimizers'): + with ToolchainProfiler.profile_block('run_multiple_processes'): processes = [] start = 0 end = 0 From ed17d8e1bebd3ce15b01e4ff9b55986e3a069ee6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 14 Feb 2021 10:51:54 +0200 Subject: [PATCH 07/26] Remove unnecessary EMCC_SKIP_SANITY_CHECK set --- tools/system_libs.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/system_libs.py b/tools/system_libs.py index 513560ce3a31a..ed261b906d1b1 100644 --- a/tools/system_libs.py +++ b/tools/system_libs.py @@ -86,9 +86,7 @@ def run_build_commands(commands): - safe_env = clean_env() # We already did a sanity check launching the compiler once, no need to launch the compiler - # again on each child subprocess spawn. - safe_env['EMCC_SKIP_SANITY_CHECK'] = '1' + safe_env = clean_env() # If we got spawned by ccache, then launch subprocesses in ccache as well. if 'EMCC_CCACHE_' in safe_env: From 78a40fcd4eec4c4b24ad88dd8f324317cfca24f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 14 Feb 2021 16:25:20 +0200 Subject: [PATCH 08/26] Remove EMCC_CCACHE related code. --- tools/system_libs.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/tools/system_libs.py b/tools/system_libs.py index ed261b906d1b1..193ab5521acea 100644 --- a/tools/system_libs.py +++ b/tools/system_libs.py @@ -88,25 +88,18 @@ def run_build_commands(commands): safe_env = clean_env() - # If we got spawned by ccache, then launch subprocesses in ccache as well. 
- if 'EMCC_CCACHE_' in safe_env: - safe_env['EMCC_CCACHE'] = '1' - for i in range(len(commands)): # TODO(sbc): Remove this one we remove the test_em_config_env_var test commands[i].append('-Wno-deprecated') # For subprocess spawns, do not route via the OS batch script launcher, but directly # spawn the python script. This saves ~2 seconds on libc build. - # However if we are using ccache, we must use the wrappers, since they dispatch - # execution to ccache executable. - if 'EMCC_CCACHE' not in safe_env: - if commands[i][0].endswith('emcc.bat'): - commands[i][0] = commands[i][0].replace('emcc.bat', 'emcc.py') - commands[i] = [sys.executable] + commands[i] - elif commands[i][0].endswith('emcc'): - commands[i][0] = commands[i][0].replace('emcc', 'emcc.py') - commands[i] = [sys.executable] + commands[i] + if commands[i][0].endswith('emcc.bat'): + commands[i][0] = commands[i][0].replace('emcc.bat', 'emcc.py') + commands[i] = [sys.executable] + commands[i] + elif commands[i][0].endswith('emcc'): + commands[i][0] = commands[i][0].replace('emcc', 'emcc.py') + commands[i] = [sys.executable] + commands[i] shared.run_multiple_processes(commands) From aa1588c9afd7e7d09f8c35ba2fac2d4357b8c2bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 28 Mar 2021 12:08:12 +0300 Subject: [PATCH 09/26] Restore bitcode linking. --- tools/building.py | 208 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 207 insertions(+), 1 deletion(-) diff --git a/tools/building.py b/tools/building.py index e3a05b0a34478..f8e970de618c8 100644 --- a/tools/building.py +++ b/tools/building.py @@ -140,6 +140,30 @@ def clear(): _is_ar_cache.clear() +def get_multiprocessing_pool(): + class FakeMultiprocessor(object): + def map(self, func, tasks, *args, **kwargs): + results = [] + for t in tasks: + results += [func(t)] + return results + + def map_async(self, func, tasks, *args, **kwargs): + class Result: + def __init__(self, func, tasks): + self.func = func + self.tasks = tasks + + def get(self, timeout): + results = [] + for t in tasks: + results += [func(t)] + return results + + return Result(func, tasks) + + return FakeMultiprocessor() + # .. but for Popen, we cannot have doublequotes, so provide functionality to # remove them when needed. def remove_quotes(arg): @@ -280,6 +304,46 @@ def llvm_nm(file): return llvm_nm_multiple([file])[0] +def read_link_inputs(files): + with ToolchainProfiler.profile_block('read_link_inputs'): + # Before performing the link, we need to look at each input file to determine which symbols + # each of them provides. Do this in multiple parallel processes. + archive_names = [] # .a files passed in to the command line to the link + object_names = [] # .o/.bc files passed in to the command line to the link + for f in files: + absolute_path_f = make_paths_absolute(f) + + if absolute_path_f not in ar_contents and is_ar(absolute_path_f): + archive_names.append(absolute_path_f) + elif absolute_path_f not in nm_cache and is_bitcode(absolute_path_f): + object_names.append(absolute_path_f) + + # Archives contain objects, so process all archives first in parallel to obtain the object files in them. 
+ pool = get_multiprocessing_pool() + object_names_in_archives = pool.map(extract_archive_contents, archive_names) + + def clean_temporary_archive_contents_directory(directory): + def clean_at_exit(): + try_delete(directory) + if directory: + atexit.register(clean_at_exit) + + for n in range(len(archive_names)): + if object_names_in_archives[n]['returncode'] != 0: + raise Exception('llvm-ar failed on archive ' + archive_names[n] + '!') + ar_contents[archive_names[n]] = object_names_in_archives[n]['files'] + clean_temporary_archive_contents_directory(object_names_in_archives[n]['dir']) + + for o in object_names_in_archives: + for f in o['files']: + if f not in nm_cache: + object_names.append(f) + + # Next, extract symbols from all object files (either standalone or inside archives we just extracted) + # The results are not used here directly, but populated to llvm-nm cache structure. + llvm_nm_multiple(object_names) + + def llvm_backend_args(): # disable slow and relatively unimportant optimization passes args = ['-combiner-global-alias-analysis=false'] @@ -307,7 +371,11 @@ def llvm_backend_args(): def link_to_object(linker_inputs, target): - link_lld(linker_inputs + ['--relocatable'], target) + # link using lld unless LTO is requested (lld can't output LTO/bitcode object files). + if not Settings.LTO: + link_lld(linker_inputs + ['--relocatable'], target) + else: + link_bitcode(linker_inputs, target) def link_llvm(linker_inputs, target): @@ -435,6 +503,144 @@ def link_lld(args, target, external_symbol_list=None): check_call(cmd) +def link_bitcode(files, target, force_archive_contents=False): + # "Full-featured" linking: looks into archives (duplicates lld functionality) + actual_files = [] + # Tracking unresolveds is necessary for .a linking, see below. + # Specify all possible entry points to seed the linking process. + # For a simple application, this would just be "main". + unresolved_symbols = set([func[1:] for func in Settings.EXPORTED_FUNCTIONS]) + resolved_symbols = set() + # Paths of already included object files from archives. + added_contents = set() + has_ar = False + for f in files: + if not f.startswith('-'): + has_ar = has_ar or is_ar(make_paths_absolute(f)) + + # If we have only one archive or the force_archive_contents flag is set, + # then we will add every object file we see, regardless of whether it + # resolves any undefined symbols. + force_add_all = len(files) == 1 or force_archive_contents + + # Considers an object file for inclusion in the link. The object is included + # if force_add=True or if the object provides a currently undefined symbol. + # If the object is included, the symbol tables are updated and the function + # returns True. + def consider_object(f, force_add=False): + new_symbols = llvm_nm(f) + # Check if the object was valid according to llvm-nm. It also accepts + # native object files. + if not new_symbols.is_valid_for_nm(): + diagnostics.warning('emcc', 'object %s is not valid according to llvm-nm, cannot link', f) + return False + # Check the object is valid for us, and not a native object file. 
+ if not is_bitcode(f): + exit_with_error('unknown file type: %s', f) + provided = new_symbols.defs.union(new_symbols.commons) + do_add = force_add or not unresolved_symbols.isdisjoint(provided) + if do_add: + logger.debug('adding object %s to link (forced: %d)' % (f, force_add)) + # Update resolved_symbols table with newly resolved symbols + resolved_symbols.update(provided) + # Update unresolved_symbols table by adding newly unresolved symbols and + # removing newly resolved symbols. + unresolved_symbols.update(new_symbols.undefs.difference(resolved_symbols)) + unresolved_symbols.difference_update(provided) + actual_files.append(f) + return do_add + + # Traverse a single archive. The object files are repeatedly scanned for + # newly satisfied symbols until no new symbols are found. Returns true if + # any object files were added to the link. + def consider_archive(f, force_add): + added_any_objects = False + loop_again = True + logger.debug('considering archive %s' % (f)) + contents = ar_contents[f] + while loop_again: # repeatedly traverse until we have everything we need + loop_again = False + for content in contents: + if content in added_contents: + continue + # Link in the .o if it provides symbols, *or* this is a singleton archive (which is + # apparently an exception in gcc ld) + if consider_object(content, force_add=force_add): + added_contents.add(content) + loop_again = True + added_any_objects = True + logger.debug('done running loop of archive %s' % (f)) + return added_any_objects + + read_link_inputs([x for x in files if not x.startswith('-')]) + + # Rescan a group of archives until we don't find any more objects to link. + def scan_archive_group(group): + loop_again = True + logger.debug('starting archive group loop') + while loop_again: + loop_again = False + for archive in group: + if consider_archive(archive, force_add=False): + loop_again = True + logger.debug('done with archive group loop') + + current_archive_group = None + in_whole_archive = False + for f in files: + absolute_path_f = make_paths_absolute(f) + if f.startswith('-'): + if f in ['--start-group', '-(']: + assert current_archive_group is None, 'Nested --start-group, missing --end-group?' + current_archive_group = [] + elif f in ['--end-group', '-)']: + assert current_archive_group is not None, '--end-group without --start-group' + scan_archive_group(current_archive_group) + current_archive_group = None + elif f in ['--whole-archive', '-whole-archive']: + in_whole_archive = True + elif f in ['--no-whole-archive', '-no-whole-archive']: + in_whole_archive = False + else: + # Command line flags should already be vetted by the time this method + # is called, so this is an internal error + assert False, 'unsupported link flag: ' + f + elif is_ar(absolute_path_f): + # Extract object files from ar archives, and link according to gnu ld semantics + # (link in an entire .o from the archive if it supplies symbols still unresolved) + consider_archive(absolute_path_f, in_whole_archive or force_add_all) + # If we're inside a --start-group/--end-group section, add to the list + # so we can loop back around later. + if current_archive_group is not None: + current_archive_group.append(absolute_path_f) + elif is_bitcode(absolute_path_f): + if has_ar: + consider_object(f, force_add=True) + else: + # If there are no archives then we can simply link all valid object + # files and skip the symbol table stuff. 
+ actual_files.append(f) + else: + exit_with_error('unknown file type: %s', f) + + # We have to consider the possibility that --start-group was used without a matching + # --end-group; GNU ld permits this behavior and implicitly treats the end of the + # command line as having an --end-group. + if current_archive_group: + logger.debug('--start-group without matching --end-group, rescanning') + scan_archive_group(current_archive_group) + current_archive_group = None + + try_delete(target) + + # Finish link + # tolerate people trying to link a.so a.so etc. + actual_files = unique_ordered(actual_files) + + logger.debug('emcc: linking: %s to %s', actual_files, target) + link_llvm(actual_files, target) + + def get_command_with_possible_response_file(cmd): # 8k is a bit of an arbitrary limit, but a reasonable one # for max command line size before we use a response file From 13b8912ce6430fa21975abc2e425f23dc44258b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 28 Mar 2021 12:14:06 +0300 Subject: [PATCH 10/26] cleanup --- tools/building.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/building.py b/tools/building.py index f8e970de618c8..78a366a75fdb2 100644 --- a/tools/building.py +++ b/tools/building.py @@ -229,7 +229,6 @@ def llvm_nm_multiple(files): with ToolchainProfiler.profile_block('llvm_nm_multiple'): if len(files) == 0: return [] - # Run llvm-nm only files that we haven't cached yet llvm_nm_files = [f for f in files if f not in nm_cache] @@ -246,7 +245,7 @@ def llvm_nm_multiple(files): for i in range(len(results)): nm_cache[a_files[i]] = parse_symbols(results[i]) - # Issue a single call for multiple .o files + # Issue a single batch call for multiple .o files if len(o_files) > 0: cmd = [LLVM_NM] + o_files cmd = get_command_with_possible_response_file(cmd) From 83687edbd5a9004d910a8b66fd3c6f1ff3db9782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 28 Mar 2021 13:57:50 +0300 Subject: [PATCH 11/26] Rewrite bitcode linking to avoid python multiprocessing pool. --- tools/building.py | 127 +++++++++++++++--------------------------- tools/js_optimizer.py | 38 +++---------- tools/shared.py | 10 +++- 3 files changed, 60 insertions(+), 115 deletions(-) diff --git a/tools/building.py b/tools/building.py index 78a366a75fdb2..4106d69bf1f5a 100644 --- a/tools/building.py +++ b/tools/building.py @@ -74,46 +74,45 @@ def warn_if_duplicate_entries(archive_contents, archive_filename): diagnostics.warning('emcc', msg) -# This function creates a temporary directory specified by the 'dir' field in -# the returned dictionary. Caller is responsible for cleaning up those files -# after done. -def extract_archive_contents(archive_file): - lines = run_process([LLVM_AR, 't', archive_file], stdout=PIPE).stdout.splitlines() - # ignore empty lines - contents = [l for l in lines if len(l)] - if len(contents) == 0: - logger.debug('Archive %s appears to be empty (recommendation: link an .so instead of .a)' % archive_file) - return { - 'returncode': 0, - 'dir': None, - 'files': [] - } - - # `ar` files can only contains filenames. 
Just to be sure, verify that each - # file has only as filename component and is not absolute - for f in contents: - assert not os.path.dirname(f) - assert not os.path.isabs(f) - - warn_if_duplicate_entries(contents, archive_file) - - # create temp dir - temp_dir = tempfile.mkdtemp('_archive_contents', 'emscripten_temp_') - - # extract file in temp dir - proc = run_process([LLVM_AR, 'xo', archive_file], stdout=PIPE, stderr=STDOUT, cwd=temp_dir) - abs_contents = [os.path.join(temp_dir, c) for c in contents] +# Extracts the given list of archive files and outputs their contents +def extract_archive_contents(archive_files): + archive_results = shared.run_multiple_processes([[LLVM_AR, 't', a] for a in archive_files], pipe_stdout=True) + + unpack_temp_dir = tempfile.mkdtemp('_archive_contents', 'emscripten_temp_') + def clean_at_exit(): + try_delete(unpack_temp_dir) + shared.atexit.register(clean_at_exit) + + archive_contents = [] + + for i in range(len(archive_results)): + a = archive_results[i] + contents = [l for l in a.splitlines() if len(l)] + if len(contents) == 0: + logger.debug('Archive %s appears to be empty (recommendation: link an .so instead of .a)' % a) + + # `ar` files can only contains filenames. Just to be sure, verify that each + # file has only as filename component and is not absolute + for f in contents: + assert not os.path.dirname(f) + assert not os.path.isabs(f) + + warn_if_duplicate_entries(contents, a) + + archive_contents += [{ + 'archive_name': archive_files[i], + 'o_files': [os.path.join(unpack_temp_dir, c) for c in contents] + }] + + shared.run_multiple_processes([[LLVM_AR, 'xo', a] for a in archive_files], cwd=unpack_temp_dir) # check that all files were created - missing_contents = [x for x in abs_contents if not os.path.exists(x)] - if missing_contents: - exit_with_error('llvm-ar failed to extract file(s) ' + str(missing_contents) + ' from archive file ' + f + '! Error:' + str(proc.stdout)) - - return { - 'returncode': proc.returncode, - 'dir': temp_dir, - 'files': abs_contents - } + for a in archive_contents: + missing_contents = [x for x in a['o_files'] if not os.path.exists(x)] + if missing_contents: + exit_with_error('llvm-ar failed to extract file(s) ' + str(missing_contents) + ' from archive file ' + f + '! Error:' + str(proc.stdout)) + + return archive_contents def unique_ordered(values): @@ -140,30 +139,6 @@ def clear(): _is_ar_cache.clear() -def get_multiprocessing_pool(): - class FakeMultiprocessor(object): - def map(self, func, tasks, *args, **kwargs): - results = [] - for t in tasks: - results += [func(t)] - return results - - def map_async(self, func, tasks, *args, **kwargs): - class Result: - def __init__(self, func, tasks): - self.func = func - self.tasks = tasks - - def get(self, timeout): - results = [] - for t in tasks: - results += [func(t)] - return results - - return Result(func, tasks) - - return FakeMultiprocessor() - # .. but for Popen, we cannot have doublequotes, so provide functionality to # remove them when needed. 
def remove_quotes(arg): @@ -229,7 +204,7 @@ def llvm_nm_multiple(files): with ToolchainProfiler.profile_block('llvm_nm_multiple'): if len(files) == 0: return [] - # Run llvm-nm only files that we haven't cached yet + # Run llvm-nm on files that we haven't cached yet llvm_nm_files = [f for f in files if f not in nm_cache] # We can issue multiple files in a single llvm-nm calls, but only if those @@ -318,25 +293,13 @@ def read_link_inputs(files): object_names.append(absolute_path_f) # Archives contain objects, so process all archives first in parallel to obtain the object files in them. - pool = get_multiprocessing_pool() - object_names_in_archives = pool.map(extract_archive_contents, archive_names) - - def clean_temporary_archive_contents_directory(directory): - def clean_at_exit(): - try_delete(directory) - if directory: - atexit.register(clean_at_exit) - - for n in range(len(archive_names)): - if object_names_in_archives[n]['returncode'] != 0: - raise Exception('llvm-ar failed on archive ' + archive_names[n] + '!') - ar_contents[archive_names[n]] = object_names_in_archives[n]['files'] - clean_temporary_archive_contents_directory(object_names_in_archives[n]['dir']) - - for o in object_names_in_archives: - for f in o['files']: - if f not in nm_cache: - object_names.append(f) + archive_contents = extract_archive_contents(archive_names) + + for a in archive_contents: + ar_contents[os.path.abspath(a['archive_name'])] = a['o_files'] + for o in a['o_files']: + if o not in nm_cache: + object_names.append(o) # Next, extract symbols from all object files (either standalone or inside archives we just extracted) # The results are not used here directly, but populated to llvm-nm cache structure. diff --git a/tools/js_optimizer.py b/tools/js_optimizer.py index e324277e936e6..349a9f10552d2 100755 --- a/tools/js_optimizer.py +++ b/tools/js_optimizer.py @@ -130,36 +130,6 @@ def serialize(self): end_asm_marker = '// EMSCRIPTEN_END_ASM\n' -def run_on_chunk(command): - try: - if ACORN_OPTIMIZER in command: # XXX hackish - index = command.index(ACORN_OPTIMIZER) - filename = command[index + 1] - else: - filename = command[1] - if os.environ.get('EMCC_SAVE_OPT_TEMP') and os.environ.get('EMCC_SAVE_OPT_TEMP') != '0': - saved = 'save_' + os.path.basename(filename) - while os.path.exists(saved): - saved = 'input' + str(int(saved.replace('input', '').replace('.txt', '')) + 1) + '.txt' - print('running js optimizer command', ' '.join([c if c != filename else saved for c in command]), file=sys.stderr) - shutil.copyfile(filename, os.path.join(shared.get_emscripten_temp_dir(), saved)) - if shared.EM_BUILD_VERBOSE >= 3: - print('run_on_chunk: ' + str(command), file=sys.stderr) - proc = shared.run_process(command, stdout=subprocess.PIPE) - output = proc.stdout - assert proc.returncode == 0, 'Error in optimizer (return code ' + str(proc.returncode) + '): ' + output - assert len(output) and not output.startswith('Assertion failed'), 'Error in optimizer: ' + output - filename = temp_files.get(os.path.basename(filename) + '.jo.js').name - with open(filename, 'w') as f: - f.write(output) - if DEBUG and not shared.WINDOWS: - print('.', file=sys.stderr) # Skip debug progress indicator on Windows, since it doesn't buffer well with multiple threads printing to console. 
- return filename - except KeyboardInterrupt: - # avoid throwing keyboard interrupts from a child process - raise Exception() - - # Given a set of functions of form (ident, text), and a preferred chunk size, # generates a set of chunks for parallel processing and caching. def chunkify(funcs, chunk_size): @@ -330,6 +300,14 @@ def write_chunk(chunk, i): with ToolchainProfiler.profile_block('run_optimizer'): if len(filenames): commands = [config.NODE_JS + [ACORN_OPTIMIZER, f] + passes for f in filenames] + + if os.environ.get('EMCC_SAVE_OPT_TEMP') and os.environ.get('EMCC_SAVE_OPT_TEMP') != '0': + for filename in filenames: + saved = 'save_' + os.path.basename(filename) + while os.path.exists(saved): + saved = 'input' + str(int(saved.replace('input', '').replace('.txt', '')) + 1) + '.txt' + shutil.copyfile(filename, os.path.join(shared.get_emscripten_temp_dir(), saved)) + filenames = shared.run_multiple_processes(commands, route_stdout_to_temp_files_suffix='js_opt.jo.js') for filename in filenames: diff --git a/tools/shared.py b/tools/shared.py index 27dde79f21a78..d93cc4b87b2f6 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -101,7 +101,11 @@ def get_num_cores(): return int(os.environ.get('EMCC_CORES', multiprocessing.cpu_count())) -def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_suffix=None, pipe_stdout=False, check=True): +# Runs multiple subprocess commands. +# bool 'check': If True (default), raises an exception if any of the subprocesses failed with a nonzero exit code. +# string 'route_stdout_to_temp_files_suffix': if not None, all stdouts are instead written to files, and an array of filenames is returned. +# bool 'pipe_stdout': If True, an array of stdouts is returned, for each subprocess. +def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_suffix=None, pipe_stdout=False, check=True, cwd=None): std_outs = [] # TODO: Experiment with registering a signal handler here to see if that helps with Ctrl-C locking up the command prompt @@ -122,7 +126,7 @@ def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_ std_out = temp_files.get(route_stdout_to_temp_files_suffix) if route_stdout_to_temp_files_suffix else (subprocess.PIPE if pipe_stdout else None) if DEBUG: logger.debug('Running subprocess %d/%d: %s' % (end + 1, len(commands), ' '.join(commands[end]))) - processes += [subprocess.Popen(commands[end], stdout=std_out, env=child_env if child_env else os.environ.copy())] + processes += [subprocess.Popen(commands[end], stdout=std_out, env=child_env if child_env else os.environ.copy(), cwd=cwd)] if route_stdout_to_temp_files_suffix: std_outs += [std_out.name] end += 1 @@ -136,7 +140,7 @@ def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_ logger.info(out.decode('UTF-8')) if err: logger.error(err.decode('UTF-8')) - raise Exception('Subprocess %d/%d failed with return code %d!' % (start + 1, len(commands), processes[start].returncode)) + raise Exception('Subprocess %d/%d failed with return code %d! 
(cmdline: %s)' % (start + 1, len(commands), processes[start].returncode, shlex_join(commands[start]))) start += 1 return std_outs From 8bcbef91a1f2b952aaaac607cef1d653ce9e6c2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 28 Mar 2021 14:01:55 +0300 Subject: [PATCH 12/26] Flake --- tools/building.py | 5 +++-- tools/shared.py | 4 ++-- tools/system_libs.py | 6 +----- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/tools/building.py b/tools/building.py index 4106d69bf1f5a..62a3c4d731898 100644 --- a/tools/building.py +++ b/tools/building.py @@ -12,7 +12,7 @@ import subprocess import sys import tempfile -from subprocess import STDOUT, PIPE +from subprocess import PIPE from . import diagnostics from . import response_file @@ -79,6 +79,7 @@ def extract_archive_contents(archive_files): archive_results = shared.run_multiple_processes([[LLVM_AR, 't', a] for a in archive_files], pipe_stdout=True) unpack_temp_dir = tempfile.mkdtemp('_archive_contents', 'emscripten_temp_') + def clean_at_exit(): try_delete(unpack_temp_dir) shared.atexit.register(clean_at_exit) @@ -110,7 +111,7 @@ def clean_at_exit(): for a in archive_contents: missing_contents = [x for x in a['o_files'] if not os.path.exists(x)] if missing_contents: - exit_with_error('llvm-ar failed to extract file(s) ' + str(missing_contents) + ' from archive file ' + f + '! Error:' + str(proc.stdout)) + exit_with_error('llvm-ar failed to extract file(s) ' + str(missing_contents) + ' from archive file ' + f + '!') return archive_contents diff --git a/tools/shared.py b/tools/shared.py index d93cc4b87b2f6..17e814732ca28 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -105,7 +105,7 @@ def get_num_cores(): # bool 'check': If True (default), raises an exception if any of the subprocesses failed with a nonzero exit code. # string 'route_stdout_to_temp_files_suffix': if not None, all stdouts are instead written to files, and an array of filenames is returned. # bool 'pipe_stdout': If True, an array of stdouts is returned, for each subprocess. -def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_suffix=None, pipe_stdout=False, check=True, cwd=None): +def run_multiple_processes(commands, env=os.environ.copy(), route_stdout_to_temp_files_suffix=None, pipe_stdout=False, check=True, cwd=None): std_outs = [] # TODO: Experiment with registering a signal handler here to see if that helps with Ctrl-C locking up the command prompt @@ -126,7 +126,7 @@ def run_multiple_processes(commands, child_env=None, route_stdout_to_temp_files_ std_out = temp_files.get(route_stdout_to_temp_files_suffix) if route_stdout_to_temp_files_suffix else (subprocess.PIPE if pipe_stdout else None) if DEBUG: logger.debug('Running subprocess %d/%d: %s' % (end + 1, len(commands), ' '.join(commands[end]))) - processes += [subprocess.Popen(commands[end], stdout=std_out, env=child_env if child_env else os.environ.copy(), cwd=cwd)] + processes += [subprocess.Popen(commands[end], stdout=std_out, env=env, cwd=cwd)] if route_stdout_to_temp_files_suffix: std_outs += [std_out.name] end += 1 diff --git a/tools/system_libs.py b/tools/system_libs.py index 193ab5521acea..d7b2cc29fb97b 100644 --- a/tools/system_libs.py +++ b/tools/system_libs.py @@ -84,10 +84,6 @@ def run_build_commands(commands): # to setup the sysroot itself. 
ensure_sysroot() - - - safe_env = clean_env() - for i in range(len(commands)): # TODO(sbc): Remove this one we remove the test_em_config_env_var test commands[i].append('-Wno-deprecated') @@ -101,7 +97,7 @@ def run_build_commands(commands): commands[i][0] = commands[i][0].replace('emcc', 'emcc.py') commands[i] = [sys.executable] + commands[i] - shared.run_multiple_processes(commands) + shared.run_multiple_processes(commands, env=clean_env()) def create_lib(libname, inputs): From 5358095c3c71f974e92c4def1e6897ac18af4164 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 28 Mar 2021 14:15:46 +0300 Subject: [PATCH 13/26] Remove direct .py spawn in system_libs.py which prevents ccache. --- tools/system_libs.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tools/system_libs.py b/tools/system_libs.py index d7b2cc29fb97b..15880b4e2db5e 100644 --- a/tools/system_libs.py +++ b/tools/system_libs.py @@ -88,15 +88,6 @@ def run_build_commands(commands): # TODO(sbc): Remove this one we remove the test_em_config_env_var test commands[i].append('-Wno-deprecated') - # For subprocess spawns, do not route via the OS batch script launcher, but directly - # spawn the python script. This saves ~2 seconds on libc build. - if commands[i][0].endswith('emcc.bat'): - commands[i][0] = commands[i][0].replace('emcc.bat', 'emcc.py') - commands[i] = [sys.executable] + commands[i] - elif commands[i][0].endswith('emcc'): - commands[i][0] = commands[i][0].replace('emcc', 'emcc.py') - commands[i] = [sys.executable] + commands[i] - shared.run_multiple_processes(commands, env=clean_env()) From b43bcdf9524ae55d5f5fea3161dcb0a265a90f91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Sun, 28 Mar 2021 14:20:56 +0300 Subject: [PATCH 14/26] Use os.cpu_count() instead of importing multiprocessing --- tools/shared.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/shared.py b/tools/shared.py index 17e814732ca28..2f220ce504ca4 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -97,8 +97,7 @@ def run_process(cmd, check=True, input=None, *args, **kw): def get_num_cores(): - import multiprocessing - return int(os.environ.get('EMCC_CORES', multiprocessing.cpu_count())) + return int(os.environ.get('EMCC_CORES', os.cpu_count())) # Runs multiple subprocess commands. From df359385a9dc1bb486795a9fe903f21ec58c1170 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Tue, 30 Mar 2021 00:01:30 +0300 Subject: [PATCH 15/26] Rewrite run_multiple_processes() to avoid start and end iterators. --- tools/shared.py | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/tools/shared.py b/tools/shared.py index 2f220ce504ca4..b2016d07da1cc 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -116,32 +116,48 @@ def run_multiple_processes(commands, env=os.environ.copy(), route_stdout_to_temp with ToolchainProfiler.profile_block('run_multiple_processes'): processes = [] - start = 0 - end = 0 num_parallel_processes = get_num_cores() temp_files = configuration.get_temp_files() - while start < len(commands): - if start + num_parallel_processes > end and end < len(commands): # Spawn a new process? + i = 0 + num_completed = 0 + while num_completed < len(commands): + if i < len(commands) and len(processes) < num_parallel_processes: + # Not enough parallel processes running, spawn a new one. 
std_out = temp_files.get(route_stdout_to_temp_files_suffix) if route_stdout_to_temp_files_suffix else (subprocess.PIPE if pipe_stdout else None) if DEBUG: - logger.debug('Running subprocess %d/%d: %s' % (end + 1, len(commands), ' '.join(commands[end]))) - processes += [subprocess.Popen(commands[end], stdout=std_out, env=env, cwd=cwd)] + logger.debug('Running subprocess %d/%d: %s' % (i + 1, len(commands), ' '.join(commands[i]))) + processes += [(i, subprocess.Popen(commands[i], stdout=std_out, env=env, cwd=cwd))] if route_stdout_to_temp_files_suffix: - std_outs += [std_out.name] - end += 1 + std_outs += [(i, std_out.name)] + i += 1 else: - # Too many commands running in parallel, wait for one to finish. - out, err = processes[start].communicate() + # Not spawning a new process (Too many commands running in parallel, or no commands left): find if a process has finished. + def get_finished_process(): + j = 0 + while j < len(processes): + if processes[j][1].poll() is not None: + return j + j += 1 + # All processes still running; take first (oldest) process to finish. + return 0 + + j = get_finished_process() + idx, finished_process = processes[j] + del processes[j] + out, err = finished_process.communicate() if pipe_stdout: - std_outs += [out.decode('UTF-8')] - if check and processes[start].returncode != 0: + std_outs += [(idx, out.decode('UTF-8'))] + if check and finished_process.returncode != 0: if out: logger.info(out.decode('UTF-8')) if err: logger.error(err.decode('UTF-8')) - raise Exception('Subprocess %d/%d failed with return code %d! (cmdline: %s)' % (start + 1, len(commands), processes[start].returncode, shlex_join(commands[start]))) - start += 1 - return std_outs + raise Exception('Subprocess %d/%d failed with return code %d! (cmdline: %s)' % (idx + 1, len(commands), finished_process.returncode, shlex_join(commands[idx]))) + num_completed += 1 + + # If processes finished out of order, sort the results to the order of the input. + std_outs.sort(key=lambda x: x[0]) + return [x[1] for x in std_outs] def check_call(cmd, *args, **kw): From c0b7a24849d06d47b3d362b03ec247fa6dd79636 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Wed, 31 Mar 2021 11:46:03 +0300 Subject: [PATCH 16/26] Use communicate() with timeout --- tools/shared.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tools/shared.py b/tools/shared.py index b2016d07da1cc..b3ce4d22b28d1 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -133,18 +133,24 @@ def run_multiple_processes(commands, env=os.environ.copy(), route_stdout_to_temp else: # Not spawning a new process (Too many commands running in parallel, or no commands left): find if a process has finished. def get_finished_process(): - j = 0 - while j < len(processes): - if processes[j][1].poll() is not None: - return j - j += 1 - # All processes still running; take first (oldest) process to finish. - return 0 - - j = get_finished_process() + while True: + j = 0 + while j < len(processes): + if processes[j][1].poll() is not None: + out, err = processes[j][1].communicate() + return (j, '', '') + j += 1 + # All processes still running; wait a short while for the first (oldest) process to finish, + # then look again if any process has completed. 
+ try: + out, err = processes[0][1].communicate(0.2) + return (0, out, err) + except TimeoutExpired: + pass + + j, out, err = get_finished_process() idx, finished_process = processes[j] del processes[j] - out, err = finished_process.communicate() if pipe_stdout: std_outs += [(idx, out.decode('UTF-8'))] if check and finished_process.returncode != 0: From 7d4c63efb76dadd5cf462b6ffe0fa420525c3720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Wed, 31 Mar 2021 13:17:50 +0300 Subject: [PATCH 17/26] Fix TimeoutExpired --- tools/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/shared.py b/tools/shared.py index b3ce4d22b28d1..c4d2cb97738d0 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -145,7 +145,7 @@ def get_finished_process(): try: out, err = processes[0][1].communicate(0.2) return (0, out, err) - except TimeoutExpired: + except subprocess.TimeoutExpired: pass j, out, err = get_finished_process() From edb014541a1e5226a836be46a0d6e3054d18d582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Wed, 31 Mar 2021 17:04:24 +0300 Subject: [PATCH 18/26] Fix Linux UTF-8 decoding --- tools/shared.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/shared.py b/tools/shared.py index c4d2cb97738d0..cad793c20bd4a 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -144,7 +144,7 @@ def get_finished_process(): # then look again if any process has completed. try: out, err = processes[0][1].communicate(0.2) - return (0, out, err) + return (0, out.decode('UTF-8') if out else '', err.decode('UTF-8') if err else '') except subprocess.TimeoutExpired: pass @@ -152,12 +152,12 @@ def get_finished_process(): idx, finished_process = processes[j] del processes[j] if pipe_stdout: - std_outs += [(idx, out.decode('UTF-8'))] + std_outs += [(idx, out)] if check and finished_process.returncode != 0: if out: - logger.info(out.decode('UTF-8')) + logger.info(out) if err: - logger.error(err.decode('UTF-8')) + logger.error(err) raise Exception('Subprocess %d/%d failed with return code %d! (cmdline: %s)' % (idx + 1, len(commands), finished_process.returncode, shlex_join(commands[idx]))) num_completed += 1 From 7c6c2cf51c0c53ed841452168f5ea107166b4923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Wed, 31 Mar 2021 17:12:34 +0300 Subject: [PATCH 19/26] Pipe stderr, add check that both file and string stdout piping is not simultaneously active --- tools/shared.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/shared.py b/tools/shared.py index cad793c20bd4a..d572a2efb90b6 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -107,6 +107,9 @@ def get_num_cores(): def run_multiple_processes(commands, env=os.environ.copy(), route_stdout_to_temp_files_suffix=None, pipe_stdout=False, check=True, cwd=None): std_outs = [] + if route_stdout_to_temp_files_suffix and pipe_stdout: + raise Exception('Cannot simultaneously pipe stdout to file and a string! Choose one or the other.') + # TODO: Experiment with registering a signal handler here to see if that helps with Ctrl-C locking up the command prompt # when multiple child processes have been spawned. 
# import signal @@ -126,7 +129,7 @@ def run_multiple_processes(commands, env=os.environ.copy(), route_stdout_to_temp std_out = temp_files.get(route_stdout_to_temp_files_suffix) if route_stdout_to_temp_files_suffix else (subprocess.PIPE if pipe_stdout else None) if DEBUG: logger.debug('Running subprocess %d/%d: %s' % (i + 1, len(commands), ' '.join(commands[i]))) - processes += [(i, subprocess.Popen(commands[i], stdout=std_out, env=env, cwd=cwd))] + processes += [(i, subprocess.Popen(commands[i], stdout=std_out, stderr=subprocess.PIPE if pipe_stdout else None, env=env, cwd=cwd))] if route_stdout_to_temp_files_suffix: std_outs += [(i, std_out.name)] i += 1 From f15bc9d19f7f22cdcac3de001b7da22748fc9e72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Wed, 31 Mar 2021 17:53:16 +0300 Subject: [PATCH 20/26] Improve subprocess spawn utilization --- tools/shared.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tools/shared.py b/tools/shared.py index d572a2efb90b6..89f2d4b7dd951 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -123,16 +123,21 @@ def run_multiple_processes(commands, env=os.environ.copy(), route_stdout_to_temp temp_files = configuration.get_temp_files() i = 0 num_completed = 0 + + def launch_new_process(): + nonlocal processes, std_outs, i + std_out = temp_files.get(route_stdout_to_temp_files_suffix) if route_stdout_to_temp_files_suffix else (subprocess.PIPE if pipe_stdout else None) + if DEBUG: + logger.debug('Running subprocess %d/%d: %s' % (i + 1, len(commands), ' '.join(commands[i]))) + processes += [(i, subprocess.Popen(commands[i], stdout=std_out, stderr=subprocess.PIPE if pipe_stdout else None, env=env, cwd=cwd))] + if route_stdout_to_temp_files_suffix: + std_outs += [(i, std_out.name)] + i += 1 + while num_completed < len(commands): if i < len(commands) and len(processes) < num_parallel_processes: # Not enough parallel processes running, spawn a new one. - std_out = temp_files.get(route_stdout_to_temp_files_suffix) if route_stdout_to_temp_files_suffix else (subprocess.PIPE if pipe_stdout else None) - if DEBUG: - logger.debug('Running subprocess %d/%d: %s' % (i + 1, len(commands), ' '.join(commands[i]))) - processes += [(i, subprocess.Popen(commands[i], stdout=std_out, stderr=subprocess.PIPE if pipe_stdout else None, env=env, cwd=cwd))] - if route_stdout_to_temp_files_suffix: - std_outs += [(i, std_out.name)] - i += 1 + launch_new_process() else: # Not spawning a new process (Too many commands running in parallel, or no commands left): find if a process has finished. 
def get_finished_process(): @@ -140,6 +145,9 @@ def get_finished_process(): j = 0 while j < len(processes): if processes[j][1].poll() is not None: + # Immediately launch the next process to maximize utilization + if i < len(commands): + launch_new_process() out, err = processes[j][1].communicate() return (j, '', '') j += 1 From 24d925545e3e4e1226111186c181b99f01b71179 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Wed, 31 Mar 2021 18:27:16 +0300 Subject: [PATCH 21/26] Test against multiprocessing --- tools/shared.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tools/shared.py b/tools/shared.py index 89f2d4b7dd951..199fdc7c2261b 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -100,11 +100,38 @@ def get_num_cores(): return int(os.environ.get('EMCC_CORES', os.cpu_count())) +multiprocessing_pool = None + +def get_multiprocessing_pool(): + import multiprocessing + global multiprocessing_pool + if multiprocessing_pool: + return multiprocessing_pool + multiprocessing_pool = multiprocessing.Pool(processes=get_num_cores()) + return multiprocessing_pool + + +def mp_run_process(command_tuple): + cmd, env, route_stdout_to_temp_files_suffix, pipe_stdout, check, cwd = command_tuple + std_out = temp_files.get(route_stdout_to_temp_files_suffix) if route_stdout_to_temp_files_suffix else (subprocess.PIPE if pipe_stdout else None) + ret = std_out.name if route_stdout_to_temp_files_suffix else None + proc = subprocess.Popen(cmd, stdout=std_out, stderr=subprocess.PIPE if pipe_stdout else None, env=env, cwd=cwd) + out, err = proc.communicate() + if pipe_stdout: + ret = out.decode('UTF-8') + return ret + + +def run_multiple_processes_multiprocessing(commands, env, route_stdout_to_temp_files_suffix, pipe_stdout, check, cwd): + return get_multiprocessing_pool().map(mp_run_process, [(cmd, env, route_stdout_to_temp_files_suffix, pipe_stdout, check, cwd) for cmd in commands], chunksize=1) + # Runs multiple subprocess commands. # bool 'check': If True (default), raises an exception if any of the subprocesses failed with a nonzero exit code. # string 'route_stdout_to_temp_files_suffix': if not None, all stdouts are instead written to files, and an array of filenames is returned. # bool 'pipe_stdout': If True, an array of stdouts is returned, for each subprocess. def run_multiple_processes(commands, env=os.environ.copy(), route_stdout_to_temp_files_suffix=None, pipe_stdout=False, check=True, cwd=None): + return run_multiple_processes_multiprocessing(commands, env, route_stdout_to_temp_files_suffix, pipe_stdout, check, cwd) + std_outs = [] if route_stdout_to_temp_files_suffix and pipe_stdout: From 0b24090a01766c939695244e8c8eec2482e41e4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Wed, 31 Mar 2021 18:27:42 +0300 Subject: [PATCH 22/26] Add missing stdout --- tools/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/shared.py b/tools/shared.py index 199fdc7c2261b..7a1ffbd4e6c88 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -176,7 +176,7 @@ def get_finished_process(): if i < len(commands): launch_new_process() out, err = processes[j][1].communicate() - return (j, '', '') + return (j, out.decode('UTF-8') if out else '', err.decode('UTF-8') if err else '') j += 1 # All processes still running; wait a short while for the first (oldest) process to finish, # then look again if any process has completed. 
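
Taken together, the patches above turn run_multiple_processes() into a small poll-and-wait scheduler. The standalone sketch below condenses that shape for readability; it is illustrative only (run_commands, max_parallel and the other names are placeholders, not code from the tree), and it omits the temp-file stdout routing, DEBUG logging and env/cwd plumbing that tools/shared.py also handles. Details keep shifting in the follow-up cleanups below, but the overall pattern stays the same: spawn up to N processes, poll for anything already finished, otherwise block briefly on the oldest process, and return outputs in submission order.

    import os
    import subprocess

    def run_commands(commands, max_parallel=None, check=True):
        # Run each command in `commands` (a list of argv lists), keeping at most
        # max_parallel subprocesses alive at once; return decoded stdouts in
        # submission order.
        max_parallel = max_parallel or os.cpu_count()
        running = []   # (index, Popen) of in-flight processes
        results = []   # (index, stdout text) of finished processes
        next_cmd = 0
        while len(results) < len(commands):
            if next_cmd < len(commands) and len(running) < max_parallel:
                # Free capacity: spawn the next command.
                proc = subprocess.Popen(commands[next_cmd],
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE)
                running.append((next_cmd, proc))
                next_cmd += 1
                continue
            # At capacity (or out of commands): reap a process that has already
            # exited, or block briefly on the oldest one and then re-poll.
            out = err = None
            finished = None
            for j, (_, proc) in enumerate(running):
                if proc.poll() is not None:
                    finished = j
                    break
            if finished is None:
                try:
                    out, err = running[0][1].communicate(timeout=0.2)
                    finished = 0
                except subprocess.TimeoutExpired:
                    continue
            idx, proc = running.pop(finished)
            if out is None:
                out, err = proc.communicate()  # already exited; drain its pipes
            if check and proc.returncode != 0:
                raise RuntimeError('command %d failed with return code %d:\n%s'
                                   % (idx, proc.returncode, (err or b'').decode('UTF-8')))
            results.append((idx, out.decode('UTF-8')))
        # Processes may finish out of order; restore submission order.
        results.sort(key=lambda r: r[0])
        return [r[1] for r in results]

The sort at the end is what lets callers rely on the returned stdouts lining up with the commands they passed in, regardless of completion order.
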
From 9a857ef80006329634afbbf6cdb53ec4860e02f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Wed, 31 Mar 2021 18:32:39 +0300 Subject: [PATCH 23/26] Remove launch_new_process() --- tools/shared.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/tools/shared.py b/tools/shared.py index 7a1ffbd4e6c88..e01321ada0455 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -151,20 +151,16 @@ def run_multiple_processes(commands, env=os.environ.copy(), route_stdout_to_temp i = 0 num_completed = 0 - def launch_new_process(): - nonlocal processes, std_outs, i - std_out = temp_files.get(route_stdout_to_temp_files_suffix) if route_stdout_to_temp_files_suffix else (subprocess.PIPE if pipe_stdout else None) - if DEBUG: - logger.debug('Running subprocess %d/%d: %s' % (i + 1, len(commands), ' '.join(commands[i]))) - processes += [(i, subprocess.Popen(commands[i], stdout=std_out, stderr=subprocess.PIPE if pipe_stdout else None, env=env, cwd=cwd))] - if route_stdout_to_temp_files_suffix: - std_outs += [(i, std_out.name)] - i += 1 - while num_completed < len(commands): if i < len(commands) and len(processes) < num_parallel_processes: # Not enough parallel processes running, spawn a new one. - launch_new_process() + std_out = temp_files.get(route_stdout_to_temp_files_suffix) if route_stdout_to_temp_files_suffix else (subprocess.PIPE if pipe_stdout else None) + if DEBUG: + logger.debug('Running subprocess %d/%d: %s' % (i + 1, len(commands), ' '.join(commands[i]))) + processes += [(i, subprocess.Popen(commands[i], stdout=std_out, stderr=subprocess.PIPE if pipe_stdout else None, env=env, cwd=cwd))] + if route_stdout_to_temp_files_suffix: + std_outs += [(i, std_out.name)] + i += 1 else: # Not spawning a new process (Too many commands running in parallel, or no commands left): find if a process has finished. def get_finished_process(): @@ -172,9 +168,6 @@ def get_finished_process(): j = 0 while j < len(processes): if processes[j][1].poll() is not None: - # Immediately launch the next process to maximize utilization - if i < len(commands): - launch_new_process() out, err = processes[j][1].communicate() return (j, out.decode('UTF-8') if out else '', err.decode('UTF-8') if err else '') j += 1 From 2402ef36f674b6ec38f33eea3a5b1456ebc1e76c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Wed, 31 Mar 2021 18:38:32 +0300 Subject: [PATCH 24/26] Cleanup code --- tools/shared.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/tools/shared.py b/tools/shared.py index e01321ada0455..b13233e0132c6 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -25,7 +25,7 @@ from .toolchain_profiler import ToolchainProfiler from .tempfiles import try_delete -from .utils import path_from_root, exit_with_error, safe_ensure_dirs, WINDOWS +from .utils import path_from_root, exit_with_error, safe_ensure_dirs, WINDOWS, LINUX from . import cache, tempfiles, colored_logger from . import diagnostics from . 
import config @@ -38,6 +38,9 @@ EXPECTED_LLVM_VERSION = "13.0" PYTHON = sys.executable +# Used only on Linux +multiprocessing_pool = None + # can add %(asctime)s to see timestamps logging.basicConfig(format='%(name)s:%(levelname)s: %(message)s', level=logging.DEBUG if DEBUG else logging.INFO) @@ -100,37 +103,29 @@ def get_num_cores(): return int(os.environ.get('EMCC_CORES', os.cpu_count())) -multiprocessing_pool = None - -def get_multiprocessing_pool(): - import multiprocessing - global multiprocessing_pool - if multiprocessing_pool: - return multiprocessing_pool - multiprocessing_pool = multiprocessing.Pool(processes=get_num_cores()) - return multiprocessing_pool - - def mp_run_process(command_tuple): cmd, env, route_stdout_to_temp_files_suffix, pipe_stdout, check, cwd = command_tuple std_out = temp_files.get(route_stdout_to_temp_files_suffix) if route_stdout_to_temp_files_suffix else (subprocess.PIPE if pipe_stdout else None) ret = std_out.name if route_stdout_to_temp_files_suffix else None proc = subprocess.Popen(cmd, stdout=std_out, stderr=subprocess.PIPE if pipe_stdout else None, env=env, cwd=cwd) - out, err = proc.communicate() + out, _ = proc.communicate() if pipe_stdout: ret = out.decode('UTF-8') return ret -def run_multiple_processes_multiprocessing(commands, env, route_stdout_to_temp_files_suffix, pipe_stdout, check, cwd): - return get_multiprocessing_pool().map(mp_run_process, [(cmd, env, route_stdout_to_temp_files_suffix, pipe_stdout, check, cwd) for cmd in commands], chunksize=1) - # Runs multiple subprocess commands. # bool 'check': If True (default), raises an exception if any of the subprocesses failed with a nonzero exit code. # string 'route_stdout_to_temp_files_suffix': if not None, all stdouts are instead written to files, and an array of filenames is returned. # bool 'pipe_stdout': If True, an array of stdouts is returned, for each subprocess. def run_multiple_processes(commands, env=os.environ.copy(), route_stdout_to_temp_files_suffix=None, pipe_stdout=False, check=True, cwd=None): - return run_multiple_processes_multiprocessing(commands, env, route_stdout_to_temp_files_suffix, pipe_stdout, check, cwd) + # Spawning multiple processes on Linux is slower without multiprocessing pool. On Windows and macOS, not using multiprocessing pool is faster. + if LINUX: + import multiprocessing + global multiprocessing_pool + if not multiprocessing_pool: + multiprocessing_pool = multiprocessing.Pool(processes=get_num_cores()) + return multiprocessing_pool.map(mp_run_process, [(cmd, env, route_stdout_to_temp_files_suffix, pipe_stdout, check, cwd) for cmd in commands], chunksize=1) std_outs = [] From ca61c04317a7432aa2828eec1520ae1e870b41ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Wed, 31 Mar 2021 22:58:42 +0300 Subject: [PATCH 25/26] Add EM_PYTHON_MULTIPROCESSING --- ChangeLog.md | 3 +++ tools/shared.py | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 98400f974877a..48882a9c1d8f3 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -20,6 +20,9 @@ See docs/process.md for more on how version tagging works. Current Trunk ------------- +- Removed use of Python multiprocessing library because of stability issues. Added + new environment variable EM_PYTHON_MULTIPROCESSING=1 that can be enabled + to revert back to using Python multiprocessing. (#13493) - Binaryen now always inlines single-use functions. This should reduce code size and improve performance (#13744). 
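
For readers who want the opt-in path in one place before the tools/shared.py diff that follows: setting EM_PYTHON_MULTIPROCESSING=1 routes the work through a lazily created, process-wide multiprocessing.Pool instead of the subprocess scheduler. The sketch below is a condensed reading aid with placeholder names (the tree's actual helpers are mp_run_process and run_multiple_processes), not a drop-in replacement.

    import multiprocessing
    import os
    import subprocess

    _pool = None  # created lazily, then reused for the rest of the process lifetime

    def _run_one(cmd):
        # Must be a module-level function so multiprocessing can pickle it.
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        out, _ = proc.communicate()
        return out.decode('UTF-8')

    def run_commands_with_pool(commands):
        # Returns None when the pool path is disabled; the caller then falls
        # back to the plain-subprocess scheduler.
        global _pool
        if not int(os.getenv('EM_PYTHON_MULTIPROCESSING', '0')):
            return None
        if _pool is None:
            _pool = multiprocessing.Pool(processes=os.cpu_count())
        return _pool.map(_run_one, commands, chunksize=1)

Exporting EM_PYTHON_MULTIPROCESSING=1 in the environment before invoking the tools is all that is needed to take this path.
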
diff --git a/tools/shared.py b/tools/shared.py index b13233e0132c6..e3a5f982ef833 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -104,6 +104,7 @@ def get_num_cores(): def mp_run_process(command_tuple): + temp_files = configuration.get_temp_files() cmd, env, route_stdout_to_temp_files_suffix, pipe_stdout, check, cwd = command_tuple std_out = temp_files.get(route_stdout_to_temp_files_suffix) if route_stdout_to_temp_files_suffix else (subprocess.PIPE if pipe_stdout else None) ret = std_out.name if route_stdout_to_temp_files_suffix else None @@ -119,8 +120,9 @@ def mp_run_process(command_tuple): # string 'route_stdout_to_temp_files_suffix': if not None, all stdouts are instead written to files, and an array of filenames is returned. # bool 'pipe_stdout': If True, an array of stdouts is returned, for each subprocess. def run_multiple_processes(commands, env=os.environ.copy(), route_stdout_to_temp_files_suffix=None, pipe_stdout=False, check=True, cwd=None): - # Spawning multiple processes on Linux is slower without multiprocessing pool. On Windows and macOS, not using multiprocessing pool is faster. - if LINUX: + # By default, avoid using Python multiprocessing library due to a large amount of bugs it has on Windows (#8013, #718, #13785, etc.) + # Use EM_PYTHON_MULTIPROCESSING=1 environment variable to enable it. It can be faster, but may not work on Windows. + if int(os.getenv('EM_PYTHON_MULTIPROCESSING', '0')): import multiprocessing global multiprocessing_pool if not multiprocessing_pool: From 29301864bd976132ec3f7dadaede38610529f75b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= Date: Wed, 31 Mar 2021 23:00:48 +0300 Subject: [PATCH 26/26] Flake --- tools/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/shared.py b/tools/shared.py index e3a5f982ef833..f867bcd254ff8 100644 --- a/tools/shared.py +++ b/tools/shared.py @@ -25,7 +25,7 @@ from .toolchain_profiler import ToolchainProfiler from .tempfiles import try_delete -from .utils import path_from_root, exit_with_error, safe_ensure_dirs, WINDOWS, LINUX +from .utils import path_from_root, exit_with_error, safe_ensure_dirs, WINDOWS from . import cache, tempfiles, colored_logger from . import diagnostics from . import config
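
To close, a caller-side view of run_multiple_processes() as it stands at the end of this series. The commands themselves are made up for illustration, but the keyword arguments and return behaviour follow the helper's documented contract: stdouts come back in submission order, check=True (the default) raises on a nonzero exit code, and route_stdout_to_temp_files_suffix and pipe_stdout cannot both be set.

    import os
    from tools import shared

    # Plain fire-and-forget: run everything, up to get_num_cores() processes at
    # a time, raising if any command fails.
    compile_cmds = [['clang', '-c', src, '-o', src + '.o'] for src in ('a.c', 'b.c')]
    shared.run_multiple_processes(compile_cmds, env=os.environ.copy())

    # pipe_stdout=True: capture each command's stdout and get the decoded
    # strings back, ordered like the input commands.
    listings = shared.run_multiple_processes([['llvm-ar', 't', a] for a in ('x.a', 'y.a')],
                                             pipe_stdout=True)

    # route_stdout_to_temp_files_suffix: write each stdout to a temp file and
    # get the filenames back instead (the mode the JS optimizer pipeline uses;
    # the node command here is a placeholder).
    outputs = shared.run_multiple_processes([['node', 'optimizer.js', f] for f in ('a.js', 'b.js')],
                                            route_stdout_to_temp_files_suffix='jo.js')
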