From 5baadfad5b25ce9e104cab4f24d4a5587cab0959 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Mon, 1 Dec 2025 11:34:55 -0500 Subject: [PATCH 01/14] add more loading debug prints --- src/staticdata.c | 132 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 112 insertions(+), 20 deletions(-) diff --git a/src/staticdata.c b/src/staticdata.c index ee8fa97b7be36..a641365545de5 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -91,6 +91,30 @@ External links: #include "valgrind.h" #include "julia_assert.h" +// Define JL_DEBUG_LOADING to enable detailed timing of pkgimage loading +// #define JL_DEBUG_LOADING + +#ifdef JL_DEBUG_LOADING +#define JL_LOAD_TIMING_PRINT(fmt, ...) \ + jl_printf(JL_STDERR, "[pkgload] " fmt, ##__VA_ARGS__) + +#define JL_LOAD_TIMING_START(varname) \ + uint64_t varname##_start = jl_hrtime() + +#define JL_LOAD_TIMING_END(varname, desc) \ + do { \ + uint64_t varname##_end = jl_hrtime(); \ + jl_printf(JL_STDERR, "[pkgload] %-40s %8.3f ms\n", desc, (varname##_end - varname##_start) / 1e6); \ + } while(0) + +#define JL_LOAD_TIMING_COUNTER(var) var +#else +#define JL_LOAD_TIMING_PRINT(fmt, ...) ((void)0) +#define JL_LOAD_TIMING_START(varname) ((void)0) +#define JL_LOAD_TIMING_END(varname, desc) ((void)0) +#define JL_LOAD_TIMING_COUNTER(var) ((void)0) +#endif + static const size_t WORLD_AGE_REVALIDATION_SENTINEL = 0x1; JL_DLLEXPORT size_t jl_require_world = ~(size_t)0; JL_DLLEXPORT _Atomic(size_t) jl_first_image_replacement_world = ~(size_t)0; @@ -3767,6 +3791,8 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl_array_t **new_ext_cis, jl_array_t **method_roots_list, pkgcachesizes *cachesizes) JL_GC_DISABLED { + JL_LOAD_TIMING_START(total); + JL_LOAD_TIMING_START(init); jl_task_t *ct = jl_current_task; int en = jl_gc_enable(0); ios_t sysimg, const_data, symbols, relocs, gvar_record, fptr_record; @@ -3796,6 +3822,8 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, } // step 1: read section map + JL_LOAD_TIMING_END(init, "initialization"); + JL_LOAD_TIMING_START(section_map); assert(ios_pos(f) == 0 && f->bm == bm_mem); size_t sizeof_sysdata = read_uint(f); ios_static_buffer(&sysimg, f->buf, sizeof_sysdata + sizeof(uintptr_t)); @@ -3833,6 +3861,8 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, ios_skip(f, sizeof_fptr_record); // step 2: get references to special values + JL_LOAD_TIMING_END(section_map, "read section map"); + JL_LOAD_TIMING_START(special_refs); ios_seek(f, LLT_ALIGN(ios_pos(f), 8)); assert(!ios_eof(f)); s.s = f; @@ -3909,10 +3939,14 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, s.s = NULL; // step 3: apply relocations + JL_LOAD_TIMING_END(special_refs, "read special refs & link ids"); + JL_LOAD_TIMING_START(symbols); assert(!ios_eof(f)); jl_read_symbols(&s); ios_close(&symbols); + JL_LOAD_TIMING_END(symbols, "read symbols"); + JL_LOAD_TIMING_START(relocations); char *image_base = (char*)&sysimg.buf[0]; reloc_t *relocs_base = (reloc_t*)&relocs.buf[0]; @@ -3937,6 +3971,10 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, arraylist_new(&s.fixup_types, 0); } jl_read_arraylist(s.relocs, &s.fixup_objs); + JL_LOAD_TIMING_END(relocations, "apply relocations"); + JL_LOAD_TIMING_PRINT(" uniquing_types: %zu, uniquing_objs: %zu, fixup_types: %zu, fixup_objs: %zu\n", + s.uniquing_types.len, s.uniquing_objs.len, s.fixup_types.len, s.fixup_objs.len); + // Perform the uniquing of objects that we don't 
"own" and consequently can't promise // weren't created by some other package before this one got loaded: // - iterate through all objects that need to be uniqued. The first encounter has to be the @@ -3951,6 +3989,10 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, arraylist_new(&cleanup_list, 0); arraylist_t delay_list; arraylist_new(&delay_list, 0); + JL_LOAD_TIMING_START(type_uniquing); +#ifdef JL_DEBUG_LOADING + size_t n_already_done = 0, n_lookup_found = 0, n_must_be_new = 0, n_lookup_miss = 0, n_singleton = 0; +#endif JL_LOCK(&typecache_lock); // Might GC--prevent other threads from changing any type caches while we inspect them all for (size_t i = 0; i < s.uniquing_types.len; i++) { uintptr_t item = (uintptr_t)s.uniquing_types.items[i]; @@ -4004,16 +4046,23 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl_datatype_t *dt = (jl_datatype_t*)obj[0], *newdt; if (jl_is_datatype(dt)) { newdt = dt; // already done + JL_LOAD_TIMING_COUNTER(n_already_done++); } else { dt = (jl_datatype_t*)obj; arraylist_push(&cleanup_list, (void*)obj); ptrhash_remove(&new_dt_objs, (void*)obj); // unmark obj as invalid before must_be_new_dt - if (must_be_new_dt((jl_value_t*)dt, &new_dt_objs, image_base, sizeof_sysimg)) + if (must_be_new_dt((jl_value_t*)dt, &new_dt_objs, image_base, sizeof_sysimg)) { newdt = NULL; - else + JL_LOAD_TIMING_COUNTER(n_must_be_new++); + } + else { newdt = jl_lookup_cache_type_(dt); + if (newdt != NULL) + JL_LOAD_TIMING_COUNTER(n_lookup_found++); + } if (newdt == NULL) { + JL_LOAD_TIMING_COUNTER(n_lookup_miss++); // make a non-owned copy of obj so we don't accidentally // assume this is the unique copy later newdt = jl_new_uninitialized_datatype(); @@ -4041,6 +4090,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, newobj = ((jl_datatype_t*)otyp)->instance; assert(newobj && newobj != jl_nothing); arraylist_push(&cleanup_list, (void*)obj); + JL_LOAD_TIMING_COUNTER(n_singleton++); } if (tag == 1) *pfld = (uintptr_t)newobj | GC_OLD | GC_IN_IMAGE; @@ -4095,7 +4145,15 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, } JL_UNLOCK(&typecache_lock); // Might GC jl_safepoint_resume_all_threads(ct); // TODO: move this later to also protect MethodInstance allocations, but we would need to acquire all jl_specializations_get_linfo and jl_module_globalref locks, which is hard + JL_LOAD_TIMING_PRINT(" type uniquing breakdown: already_done=%zu, lookup_found=%zu, must_be_new=%zu, lookup_miss=%zu, singleton=%zu\n", + n_already_done, n_lookup_found, n_must_be_new, n_lookup_miss, n_singleton); + JL_LOAD_TIMING_END(type_uniquing, "type uniquing & caching"); + // Perform fixups: things like updating world ages, inserting methods & specializations, etc. 
+ JL_LOAD_TIMING_START(obj_uniquing); +#ifdef JL_DEBUG_LOADING + size_t n_mi_done = 0, n_mi_lookup = 0, n_bind_done = 0, n_bind_lookup = 0; +#endif for (size_t i = 0; i < s.uniquing_objs.len; i++) { uintptr_t item = (uintptr_t)s.uniquing_objs.items[i]; // check whether this is a gvar index @@ -4122,6 +4180,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl_value_t *m = obj[0]; if (jl_is_method_instance(m)) { newobj = m; // already done + JL_LOAD_TIMING_COUNTER(n_mi_done++); } else { arraylist_push(&cleanup_list, (void*)obj); @@ -4129,18 +4188,21 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl_value_t *sparams = obj[2]; newobj = (jl_value_t*)jl_specializations_get_linfo((jl_method_t*)m, specTypes, (jl_svec_t*)sparams); obj[0] = newobj; + JL_LOAD_TIMING_COUNTER(n_mi_lookup++); } } else if (otyp == (uintptr_t)jl_binding_type) { jl_value_t *m = obj[0]; if (jl_is_binding(m)) { newobj = m; // already done + JL_LOAD_TIMING_COUNTER(n_bind_done++); } else { arraylist_push(&cleanup_list, (void*)obj); jl_value_t *name = obj[1]; newobj = (jl_value_t*)jl_get_module_binding((jl_module_t*)m, (jl_sym_t*)name, 1); obj[0] = newobj; + JL_LOAD_TIMING_COUNTER(n_bind_lookup++); } } else { @@ -4163,6 +4225,11 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, o->bits.in_image = 1; } arraylist_free(&cleanup_list); + JL_LOAD_TIMING_PRINT(" obj uniquing breakdown: mi_done=%zu, mi_lookup=%zu, bind_done=%zu, bind_lookup=%zu\n", + n_mi_done, n_mi_lookup, n_bind_done, n_bind_lookup); + JL_LOAD_TIMING_END(obj_uniquing, "object uniquing (MI, bindings)"); + + JL_LOAD_TIMING_START(fixup_objs); for (size_t i = 0; i < s.fixup_objs.len; i++) { uintptr_t item = (uintptr_t)s.fixup_objs.items[i]; jl_value_t *obj = (jl_value_t*)(image_base + item); @@ -4210,6 +4277,9 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, abort(); } } + JL_LOAD_TIMING_END(fixup_objs, "fixup objects (methods, modules)"); + + JL_LOAD_TIMING_START(binding_validation); if (s.incremental) { int no_replacement = jl_atomic_load_relaxed(&jl_first_image_replacement_world) == ~(size_t)0; for (size_t i = 0; i < s.fixup_objs.len; i++) { @@ -4232,6 +4302,9 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, } } } + JL_LOAD_TIMING_END(binding_validation, "binding validation"); + + JL_LOAD_TIMING_START(gvar_rooting); arraylist_free(&s.fixup_types); arraylist_free(&s.fixup_objs); @@ -4244,24 +4317,25 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, htable_free(&new_dt_objs); s.s = NULL; - - if (0) { - printf("sysimg size breakdown:\n" - " sys data: %8u\n" - " isbits data: %8u\n" - " symbols: %8u\n" - " tags list: %8u\n" - " reloc list: %8u\n" - " gvar list: %8u\n" - " fptr list: %8u\n", - (unsigned)sizeof_sysdata, - (unsigned)sizeof_constdata, - (unsigned)sizeof_symbols, - (unsigned)sizeof_tags, - (unsigned)(sizeof_relocations - sizeof_tags), - (unsigned)sizeof_gvar_record, - (unsigned)sizeof_fptr_record); - } + JL_LOAD_TIMING_END(gvar_rooting, "gvar rooting & cleanup"); + +#ifdef JL_DEBUG_LOADING + jl_printf(JL_STDERR, "[pkgload] sysimg size breakdown:\n" + "[pkgload] sys data: %8u\n" + "[pkgload] isbits data: %8u\n" + "[pkgload] symbols: %8u\n" + "[pkgload] tags list: %8u\n" + "[pkgload] reloc list: %8u\n" + "[pkgload] gvar list: %8u\n" + "[pkgload] fptr list: %8u\n", + (unsigned)sizeof_sysdata, + (unsigned)sizeof_constdata, + (unsigned)sizeof_symbols, + (unsigned)sizeof_tags, 
+ (unsigned)(sizeof_relocations - sizeof_tags), + (unsigned)sizeof_gvar_record, + (unsigned)sizeof_fptr_record); +#endif if (cachesizes) { cachesizes->sysdata = sizeof_sysdata; cachesizes->isbitsdata = sizeof_constdata; @@ -4272,10 +4346,13 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, cachesizes->fptrlist = sizeof_fptr_record; } + JL_LOAD_TIMING_START(fptr_relocs); s.s = &sysimg; jl_update_all_fptrs(&s, image); // fptr relocs and registration s.s = NULL; + JL_LOAD_TIMING_END(fptr_relocs, "fptr relocs & registration"); + JL_LOAD_TIMING_START(finalization); ios_close(&fptr_record); ios_close(&sysimg); @@ -4302,12 +4379,17 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, } jl_timing_counter_inc(JL_TIMING_COUNTER_ImageSize, sizeof_sysimg + sizeof(uintptr_t)); rebuild_image_blob_tree(); + JL_LOAD_TIMING_END(finalization, "finalization & blob tree rebuild"); // jl_printf(JL_STDOUT, "%ld blobs to link against\n", jl_linkage_blobs.len >> 1); jl_gc_enable(en); + JL_LOAD_TIMING_START(add_methods); if (s.incremental) jl_add_methods(*extext_methods); + JL_LOAD_TIMING_END(add_methods, "add methods"); + JL_LOAD_TIMING_END(total, "TOTAL restore time"); + JL_LOAD_TIMING_PRINT(" image size: %zu bytes\n", sizeof_sysimg); } static jl_value_t *jl_validate_cache_file(ios_t *f, jl_array_t *depmods, uint64_t *checksum, int64_t *dataendpos, int64_t *datastartpos) @@ -4343,10 +4425,13 @@ static jl_value_t *jl_restore_package_image_from_stream(ios_t *f, jl_image_t *im { JL_TIMING(LOAD_IMAGE, LOAD_Pkgimg); jl_timing_printf(JL_TIMING_DEFAULT_BLOCK, pkgname); + JL_LOAD_TIMING_PRINT("Loading package: %s\n", pkgname); + JL_LOAD_TIMING_START(validate); uint64_t checksum = 0; int64_t dataendpos = 0; int64_t datastartpos = 0; jl_value_t *verify_fail = jl_validate_cache_file(f, depmods, &checksum, &dataendpos, &datastartpos); + JL_LOAD_TIMING_END(validate, "validate cache file"); if (verify_fail) return verify_fail; @@ -4384,13 +4469,17 @@ static jl_value_t *jl_restore_package_image_from_stream(ios_t *f, jl_image_t *im jl_restore_system_image_from_stream_(f, image, depmods, checksum, (jl_array_t**)&restored, &init_order, &extext_methods, &internal_methods, &new_ext_cis, &method_roots_list, &cachesizes); JL_SIGATOMIC_END(); + JL_LOAD_TIMING_START(copy_roots); // Add roots to methods int failed = jl_copy_roots(method_roots_list, jl_worklist_key((jl_array_t*)restored)); if (failed != 0) { jl_printf(JL_STDERR, "Error copying roots to methods from Module: %s\n", pkgname); abort(); } + JL_LOAD_TIMING_END(copy_roots, "copy method roots"); + // Insert method extensions and handle edges + JL_LOAD_TIMING_START(activate_methods); int new_methods = jl_array_nrows(extext_methods) > 0; if (!new_methods) { size_t i, l = jl_array_nrows(internal_methods); @@ -4414,6 +4503,9 @@ static jl_value_t *jl_restore_package_image_from_stream(ios_t *f, jl_image_t *im jl_atomic_store_release(&jl_world_counter, world); // now permit more methods to be added again JL_UNLOCK(&world_counter_lock); + JL_LOAD_TIMING_END(activate_methods, "activate methods"); + JL_LOAD_TIMING_PRINT(" extext_methods: %zu, internal_methods: %zu, new_methods: %d\n", + jl_array_nrows(extext_methods), jl_array_nrows(internal_methods), new_methods); if (completeinfo) { cachesizes_sv = jl_alloc_svec(7); From 0ddec0602fa17ff0cb3bd1a6bc794f159cd2300c Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Mon, 1 Dec 2025 12:34:51 -0500 Subject: [PATCH 02/14] staticdata: sort uniquing_objs by method pointer during save 
Sort method instances by their method pointer when writing the uniquing_objs list during pkgimage serialization. This improves cache locality during load since consecutive MIs for the same method keep that method's specializations hash table hot in CPU cache. Initial testing showed ~24% reduction in object uniquing time (14.9ms to 11.3ms for Plots.jl), though more validation is needed. --- src/staticdata.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/src/staticdata.c b/src/staticdata.c index a641365545de5..7747c406c0e43 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -217,6 +217,22 @@ static int ptr_cmp(const void *l, const void *r) JL_NOTSAFEPOINT return (left > right) - (left < right); } +// Entry for sorting uniquing_objs by method pointer during serialization +typedef struct { + uintptr_t offset; // offset in serialized stream + uintptr_t method; // method pointer for sorting (MIs only) +} uniquing_obj_entry_t; + +static int uniquing_obj_entry_cmp(const void *a, const void *b) JL_NOTSAFEPOINT +{ + const uniquing_obj_entry_t *ea = (const uniquing_obj_entry_t*)a; + const uniquing_obj_entry_t *eb = (const uniquing_obj_entry_t*)b; + // Sort by method pointer, with bindings (method=0) at the end + if (ea->method < eb->method) return -1; + if (ea->method > eb->method) return 1; + return 0; +} + // Build an eytzinger tree from a sorted array static int eytzinger(uintptr_t *src, uintptr_t *dest, size_t i, size_t k, size_t n) JL_NOTSAFEPOINT { @@ -358,6 +374,7 @@ typedef struct { arraylist_t uniquing_types; // a list of locations that reference types that must be de-duplicated arraylist_t uniquing_super; // a list of datatypes, used in super fields, that need to be marked in uniquing_types once they are reached, for handling unique-ing of them on deserialization arraylist_t uniquing_objs; // a list of locations that reference non-types that must be de-duplicated + arraylist_t uniquing_objs_methods; // parallel list of method pointers for sorting uniquing_objs arraylist_t fixup_types; // a list of locations of types requiring (re)caching arraylist_t fixup_objs; // a list of locations of objects requiring (re)caching // mapping from a buildid_idx to a depmods_idx @@ -1185,8 +1202,17 @@ static void record_uniquing(jl_serializer_state *s, jl_value_t *fld, uintptr_t o if (s->incremental && jl_needs_serialization(s, fld) && needs_uniquing(fld, s->query_cache)) { if (jl_is_datatype(fld) || jl_is_datatype_singleton((jl_datatype_t*)jl_typeof(fld))) arraylist_push(&s->uniquing_types, (void*)(uintptr_t)offset); - else if (jl_is_method_instance(fld) || jl_is_binding(fld)) + else if (jl_is_method_instance(fld)) { arraylist_push(&s->uniquing_objs, (void*)(uintptr_t)offset); + // Store method pointer for sorting - improves cache locality during load + jl_method_instance_t *mi = (jl_method_instance_t*)fld; + arraylist_push(&s->uniquing_objs_methods, (void*)mi->def.method); + } + else if (jl_is_binding(fld)) { + arraylist_push(&s->uniquing_objs, (void*)(uintptr_t)offset); + // Bindings don't have methods, use NULL (will sort to end) + arraylist_push(&s->uniquing_objs_methods, NULL); + } else assert(0 && "unknown object type with needs_uniquing set"); } @@ -3026,6 +3052,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, arraylist_new(&s.uniquing_types, 0); arraylist_new(&s.uniquing_super, 0); arraylist_new(&s.uniquing_objs, 0); + arraylist_new(&s.uniquing_objs_methods, 0); arraylist_new(&s.fixup_types, 0); 
arraylist_new(&s.fixup_objs, 0); s.buildid_depmods_idxs = image_to_depmodidx(mod_array); @@ -3222,6 +3249,22 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, jl_write_offsetlist(s.relocs, sysimg_size, &s.memref_list); if (s.incremental) { jl_write_arraylist(s.relocs, &s.uniquing_types); + // Sort uniquing_objs by method pointer for cache locality during load + assert(s.uniquing_objs.len == s.uniquing_objs_methods.len); + size_t n = s.uniquing_objs.len; + if (n > 0) { + uniquing_obj_entry_t *entries = (uniquing_obj_entry_t*)malloc(n * sizeof(uniquing_obj_entry_t)); + for (size_t i = 0; i < n; i++) { + entries[i].offset = (uintptr_t)s.uniquing_objs.items[i]; + entries[i].method = (uintptr_t)s.uniquing_objs_methods.items[i]; + } + qsort(entries, n, sizeof(uniquing_obj_entry_t), uniquing_obj_entry_cmp); + // Write back sorted offsets + for (size_t i = 0; i < n; i++) { + s.uniquing_objs.items[i] = (void*)entries[i].offset; + } + free(entries); + } jl_write_arraylist(s.relocs, &s.uniquing_objs); jl_write_arraylist(s.relocs, &s.fixup_types); } @@ -3301,6 +3344,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, arraylist_free(&s.uniquing_types); arraylist_free(&s.uniquing_super); arraylist_free(&s.uniquing_objs); + arraylist_free(&s.uniquing_objs_methods); arraylist_free(&s.fixup_types); arraylist_free(&s.fixup_objs); arraylist_free(&s.memowner_list); From 8423236c2ae310f2b6adb62348a897a75d1e8ca5 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Mon, 1 Dec 2025 11:35:21 -0500 Subject: [PATCH 03/14] claude-made analysis summary doc --- .../devdocs/pkgimage-loading-performance.md | 214 ++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 doc/src/devdocs/pkgimage-loading-performance.md diff --git a/doc/src/devdocs/pkgimage-loading-performance.md b/doc/src/devdocs/pkgimage-loading-performance.md new file mode 100644 index 0000000000000..83a2dc1a9ac2f --- /dev/null +++ b/doc/src/devdocs/pkgimage-loading-performance.md @@ -0,0 +1,214 @@ +# Package Image Loading Performance Analysis + +This document summarizes the performance characteristics of Julia's package image (pkgimage) loading system, based on instrumented timing analysis of `src/staticdata.c`. + +## Overview + +Package image loading occurs in two main phases: + +1. **Restore** (`jl_restore_package_image_from_stream`) - Deserialize and reconstruct the image +2. **Activate** - Register methods and make the package available + +## Timing Breakdown by Package Size + +### System Image (~188MB) + +| Phase | Time (ms) | % of Total | +|-------|-----------|------------| +| apply relocations | 45.2 | 89% | +| read symbols | 4.6 | 9% | +| fptr relocs & registration | 0.6 | 1% | +| **TOTAL** | **50.8** | | + +The system image is dominated by relocation processing due to the large number of pointers (1.6M + 7.7M entries) that need adjustment. + +### Large Packages (e.g., Plots ~38MB) + +| Phase | Time (ms) | % of Total | +|-------|-----------|------------| +| object uniquing (MI, bindings) | 13.3 | 38% | +| apply relocations | 7.8 | 22% | +| type uniquing & caching | 5.8 | 17% | +| fixup objects | 4.2 | 12% | +| read symbols | 2.0 | 6% | +| other | 1.9 | 5% | +| **TOTAL restore** | **35.0** | | +| activate methods | 4.3 | (post-restore) | + +Large packages spend most time in uniquing operations - ensuring types and method instances match those already in the runtime. 
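+
+The phase timings in these tables come from the `JL_LOAD_TIMING_*` macros added to `src/staticdata.c` (see the Instrumentation section below). A minimal sketch of the usage pattern, with an illustrative phase name:
+
+```c
+JL_LOAD_TIMING_START(relocations);   // expands to: uint64_t relocations_start = jl_hrtime();
+/* ... decode and apply relocations ... */
+JL_LOAD_TIMING_END(relocations, "apply relocations");  // prints "[pkgload] apply relocations ... ms"
+
+// Counters expand to the bare expression when JL_DEBUG_LOADING is defined,
+// and to ((void)0) otherwise, so release builds pay nothing.
+JL_LOAD_TIMING_COUNTER(n_lookup_found++);
+```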
+ +### Medium Packages (e.g., PlotUtils ~7MB) + +| Phase | Time (ms) | % of Total | +|-------|-----------|------------| +| object uniquing | 2.4 | 37% | +| type uniquing | 1.6 | 25% | +| apply relocations | 1.4 | 22% | +| fixup objects | 0.5 | 8% | +| other | 0.5 | 8% | +| **TOTAL restore** | **6.5** | | + +### Small Packages (JLLs, ~15-50KB) + +JLL packages load in 0.05-0.15ms total, with time split between: + +- Type uniquing checks (~0.005-0.01ms) +- Symbol reads (~0.01-0.05ms) +- Relocations (~0.005-0.02ms) + +## Key Bottlenecks + +### 1. Relocations (System Image) + +The `jl_read_relocations` function processes relocation lists that patch pointers in the loaded image. For the system image with ~7.8MB of relocation data (~9.3M total entries), this takes 45ms. + +**Current implementation:** Sequential processing with `ios_read` calls and pointer arithmetic. + +**Parallel relocation attempt:** Tested a two-phase approach (decode all entries, then apply in parallel with pthreads). Result: **slower** (45ms → 48ms) due to: + +- Memory allocation overhead for 120MB+ entry buffer +- Thread creation/join overhead +- Cache effects from non-sequential access + +**Potential optimizations:** + +- Batch memory operations +- SIMD pointer arithmetic where applicable +- Streaming decode with immediate application (current approach is near-optimal) + +### 2. Object Uniquing (Largest Bottleneck for Packages) + +Method instances and bindings must be matched with existing runtime objects or created if new. For Plots with ~62K objects, this takes ~13-15ms. + +**Detailed breakdown for Plots:** + +| Category | Count | % | +|----------|-------|---| +| MI already done | 36,766 | 59% | +| MI requiring lookup/insert | 25,217 | 41% | +| Bindings done | 6 | ~0% | +| Bindings lookup | 3 | ~0% | + +**Per-lookup cost:** ~530ns per MethodInstance lookup/insertion via `jl_specializations_get_linfo` + +**Current implementation:** + +- Takes `m->writelock` for each insertion +- Lock-free lookup first, then locked retry + insert +- Hash lookup via `jl_smallintset_lookup` +- Allocates new MethodInstance via `jl_get_specialized` +- Inserts into method's specializations cache + +**Potential optimizations:** + +- **Save-side sorting by method pointer** (see below) +- Batch lock acquisition for MIs from the same method +- Pre-allocate MI objects in bulk before insertion + +### 3. Type Uniquing (Large Packages) + +The type uniquing phase ensures that types in the loaded image are unified with existing types in the runtime. For Plots with 49,413 types, this takes ~6ms. + +**Detailed breakdown for Plots:** + +| Category | Count | % | +|----------|-------|---| +| Already done (from deps) | 33,617 | 68% | +| Lookup found in cache | 558 | 1% | +| Must be new type | 9,951 | 20% | +| Lookup miss (created) | 14,865 | 30% | +| Singleton instances | 103 | 0.2% | + +**Current implementation:** + +- Holds `typecache_lock` for entire loop +- Calls `jl_lookup_cache_type_` which uses `typekey_hash` + linear probing +- Creates new datatype if not found via `jl_new_uninitialized_datatype` + +**Potential optimizations:** + +- Split into lock-free lookup phase + locked insertion phase +- 68% already-done rate suggests good dependency ordering + +### 4. Method Activation (Packages with Many External Methods) + +Packages that extend methods from other packages (external methods) have activation overhead. SparseArrays with 2,528 external methods takes 132ms to activate. 
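+That works out to roughly 52μs per external method (132ms / 2,528).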
+ +**Current implementation:** Sequential method table insertions with world age management. + +**Potential optimizations:** + +- Batch method table updates +- Deferred invalidation processing + +## Data Sizes + +Representative size breakdown for the system image: + +| Component | Size | +|-----------|------| +| sys data | 118 MB | +| isbits data | 70 MB | +| reloc list | 7.9 MB | +| tags list | 1.9 MB | +| symbols | 0.5 MB | +| fptr list | 0.3 MB | +| gvar list | 0.08 MB | + +## Instrumentation + +To enable detailed timing output, set the environment variable: + +```bash +JULIA_DEBUG_LOADING=1 julia -e "using SomePackage" +``` + +This will print timing for each phase of image loading, including detailed breakdowns of type and object uniquing operations. + +## Summary of Optimization Opportunities + +| Priority | Target | Expected Impact | Complexity | Status | +|----------|--------|-----------------|------------|--------| +| High | Object uniquing (MI sorting) | ~3-4ms for large pkgs | Low | Tested - promising | +| High | Type uniquing lock scope | 2-6ms for large pkgs | Medium | Analysis complete | +| Medium | Parallel relocations | Tested - not beneficial | Medium | ❌ Rejected | +| Medium | Method activation batching | 1-130ms for method-heavy pkgs | Medium | Not started | +| Low | Symbol reading optimization | 1-2ms for large pkgs | Low | Not started | + +### Tested Optimizations + +#### Save-side sorting of uniquing_objs by method pointer + +**Approach:** During pkgimage serialization, sort the `uniquing_objs` list by method pointer before writing. This requires: + +1. Add a parallel arraylist `uniquing_objs_methods` to store method pointers alongside offsets +2. In `record_uniquing`, store the method pointer for each MI (NULL for bindings) +3. Before writing `uniquing_objs`, build an array of (offset, method) pairs, sort by method, then write just the offsets + +**Implementation details:** + +- Add `uniquing_obj_entry_t` struct with offset and method pointer +- Add `uniquing_obj_entry_cmp` comparator function +- Modify `record_uniquing` to also push to `uniquing_objs_methods` +- Before `jl_write_arraylist` for uniquing_objs, sort and reorder + +**Results:** ~24% reduction in object uniquing time (14.9ms → 11.3ms for Plots). The improvement comes from better cache locality - consecutive MIs for the same method keep the method's specializations hash table hot in CPU cache. + +**Trade-offs:** + +- Adds ~10-20ms to pkgimage save time (one-time cost during precompilation) +- Increases serialization memory usage slightly + +### Performance Characteristics + +**Object uniquing rate:** ~4.1K objects/ms (~240ns per object) + +- 59% cache hit rate (already uniquified by earlier references) +- 41% require lookup/insertion via `jl_specializations_get_linfo` + +**Type uniquing rate:** ~8.2K types/ms (~122ns per type) + +- 68% cache hit rate (already uniquified by dependencies) +- 1% found via cache lookup +- 31% new types requiring creation + From 3763fd6c1655236134371145eca2bc4227bcda00 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Tue, 2 Dec 2025 12:10:28 -0500 Subject: [PATCH 04/14] Revert "staticdata: sort uniquing_objs by method pointer during save" This reverts commit 5e3bd3c440f8fdbd537167ebb06496050d9cb9d9. 
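
Benchmarking `using Plots` showed the sort made object uniquing ~15% slower
(12.1ms → 13.9ms) and total restore across all 123 loaded packages ~70%
slower (131.1ms → 222.9ms): the sort destroyed natural cache locality in the
serialization traversal order. Details in
doc/src/devdocs/pkgimage-loading-performance.md.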
--- src/staticdata.c | 46 +--------------------------------------------- 1 file changed, 1 insertion(+), 45 deletions(-) diff --git a/src/staticdata.c b/src/staticdata.c index 7747c406c0e43..a641365545de5 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -217,22 +217,6 @@ static int ptr_cmp(const void *l, const void *r) JL_NOTSAFEPOINT return (left > right) - (left < right); } -// Entry for sorting uniquing_objs by method pointer during serialization -typedef struct { - uintptr_t offset; // offset in serialized stream - uintptr_t method; // method pointer for sorting (MIs only) -} uniquing_obj_entry_t; - -static int uniquing_obj_entry_cmp(const void *a, const void *b) JL_NOTSAFEPOINT -{ - const uniquing_obj_entry_t *ea = (const uniquing_obj_entry_t*)a; - const uniquing_obj_entry_t *eb = (const uniquing_obj_entry_t*)b; - // Sort by method pointer, with bindings (method=0) at the end - if (ea->method < eb->method) return -1; - if (ea->method > eb->method) return 1; - return 0; -} - // Build an eytzinger tree from a sorted array static int eytzinger(uintptr_t *src, uintptr_t *dest, size_t i, size_t k, size_t n) JL_NOTSAFEPOINT { @@ -374,7 +358,6 @@ typedef struct { arraylist_t uniquing_types; // a list of locations that reference types that must be de-duplicated arraylist_t uniquing_super; // a list of datatypes, used in super fields, that need to be marked in uniquing_types once they are reached, for handling unique-ing of them on deserialization arraylist_t uniquing_objs; // a list of locations that reference non-types that must be de-duplicated - arraylist_t uniquing_objs_methods; // parallel list of method pointers for sorting uniquing_objs arraylist_t fixup_types; // a list of locations of types requiring (re)caching arraylist_t fixup_objs; // a list of locations of objects requiring (re)caching // mapping from a buildid_idx to a depmods_idx @@ -1202,17 +1185,8 @@ static void record_uniquing(jl_serializer_state *s, jl_value_t *fld, uintptr_t o if (s->incremental && jl_needs_serialization(s, fld) && needs_uniquing(fld, s->query_cache)) { if (jl_is_datatype(fld) || jl_is_datatype_singleton((jl_datatype_t*)jl_typeof(fld))) arraylist_push(&s->uniquing_types, (void*)(uintptr_t)offset); - else if (jl_is_method_instance(fld)) { + else if (jl_is_method_instance(fld) || jl_is_binding(fld)) arraylist_push(&s->uniquing_objs, (void*)(uintptr_t)offset); - // Store method pointer for sorting - improves cache locality during load - jl_method_instance_t *mi = (jl_method_instance_t*)fld; - arraylist_push(&s->uniquing_objs_methods, (void*)mi->def.method); - } - else if (jl_is_binding(fld)) { - arraylist_push(&s->uniquing_objs, (void*)(uintptr_t)offset); - // Bindings don't have methods, use NULL (will sort to end) - arraylist_push(&s->uniquing_objs_methods, NULL); - } else assert(0 && "unknown object type with needs_uniquing set"); } @@ -3052,7 +3026,6 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, arraylist_new(&s.uniquing_types, 0); arraylist_new(&s.uniquing_super, 0); arraylist_new(&s.uniquing_objs, 0); - arraylist_new(&s.uniquing_objs_methods, 0); arraylist_new(&s.fixup_types, 0); arraylist_new(&s.fixup_objs, 0); s.buildid_depmods_idxs = image_to_depmodidx(mod_array); @@ -3249,22 +3222,6 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, jl_write_offsetlist(s.relocs, sysimg_size, &s.memref_list); if (s.incremental) { jl_write_arraylist(s.relocs, &s.uniquing_types); - // Sort uniquing_objs by method pointer for cache locality during load 
- assert(s.uniquing_objs.len == s.uniquing_objs_methods.len); - size_t n = s.uniquing_objs.len; - if (n > 0) { - uniquing_obj_entry_t *entries = (uniquing_obj_entry_t*)malloc(n * sizeof(uniquing_obj_entry_t)); - for (size_t i = 0; i < n; i++) { - entries[i].offset = (uintptr_t)s.uniquing_objs.items[i]; - entries[i].method = (uintptr_t)s.uniquing_objs_methods.items[i]; - } - qsort(entries, n, sizeof(uniquing_obj_entry_t), uniquing_obj_entry_cmp); - // Write back sorted offsets - for (size_t i = 0; i < n; i++) { - s.uniquing_objs.items[i] = (void*)entries[i].offset; - } - free(entries); - } jl_write_arraylist(s.relocs, &s.uniquing_objs); jl_write_arraylist(s.relocs, &s.fixup_types); } @@ -3344,7 +3301,6 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, arraylist_free(&s.uniquing_types); arraylist_free(&s.uniquing_super); arraylist_free(&s.uniquing_objs); - arraylist_free(&s.uniquing_objs_methods); arraylist_free(&s.fixup_types); arraylist_free(&s.fixup_objs); arraylist_free(&s.memowner_list); From 0fa224ff30f77f0c65bba8cc0aac28c9051ac4f3 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Tue, 2 Dec 2025 12:39:57 -0500 Subject: [PATCH 05/14] Update pkgimage-loading-performance.md --- .../devdocs/pkgimage-loading-performance.md | 150 +++++++++++++----- 1 file changed, 112 insertions(+), 38 deletions(-) diff --git a/doc/src/devdocs/pkgimage-loading-performance.md b/doc/src/devdocs/pkgimage-loading-performance.md index 83a2dc1a9ac2f..baddf859c9eb4 100644 --- a/doc/src/devdocs/pkgimage-loading-performance.md +++ b/doc/src/devdocs/pkgimage-loading-performance.md @@ -26,14 +26,14 @@ The system image is dominated by relocation processing due to the large number o | Phase | Time (ms) | % of Total | |-------|-----------|------------| -| object uniquing (MI, bindings) | 13.3 | 38% | -| apply relocations | 7.8 | 22% | -| type uniquing & caching | 5.8 | 17% | -| fixup objects | 4.2 | 12% | +| object uniquing (MI, bindings) | 12.1 | 36% | +| apply relocations | 8.0 | 24% | +| type uniquing & caching | 5.7 | 17% | +| fixup objects | 3.7 | 11% | | read symbols | 2.0 | 6% | -| other | 1.9 | 5% | -| **TOTAL restore** | **35.0** | | -| activate methods | 4.3 | (post-restore) | +| other | 2.4 | 7% | +| **TOTAL restore** | **33.9** | | +| activate methods | 4.4 | (post-restore) | Large packages spend most time in uniquing operations - ensuring types and method instances match those already in the runtime. @@ -78,18 +78,18 @@ The `jl_read_relocations` function processes relocation lists that patch pointer ### 2. Object Uniquing (Largest Bottleneck for Packages) -Method instances and bindings must be matched with existing runtime objects or created if new. For Plots with ~62K objects, this takes ~13-15ms. +Method instances and bindings must be matched with existing runtime objects or created if new. For Plots with ~62K objects, this takes ~12ms. 
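+
+The load-side pattern, condensed from the uniquing loop in `jl_restore_system_image_from_stream_` (simplified; the analogous binding branch, GC bookkeeping, and error handling are omitted):
+
+```c
+jl_value_t *m = obj[0];
+if (jl_is_method_instance(m)) {
+    newobj = m;                       // already done via an earlier reference
+}
+else {
+    jl_value_t *specTypes = obj[1];
+    jl_value_t *sparams = obj[2];
+    // lock-free lookup first, then locked retry + insert (~480ns per call)
+    newobj = (jl_value_t*)jl_specializations_get_linfo(
+        (jl_method_t*)m, specTypes, (jl_svec_t*)sparams);
+    obj[0] = newobj;                  // memoize for later references
+}
+```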
**Detailed breakdown for Plots:** | Category | Count | % | |----------|-------|---| -| MI already done | 36,766 | 59% | +| MI already done | 36,767 | 59% | | MI requiring lookup/insert | 25,217 | 41% | | Bindings done | 6 | ~0% | | Bindings lookup | 3 | ~0% | -**Per-lookup cost:** ~530ns per MethodInstance lookup/insertion via `jl_specializations_get_linfo` +**Per-lookup cost:** ~480ns per MethodInstance lookup/insertion via `jl_specializations_get_linfo` **Current implementation:** @@ -157,56 +157,130 @@ Representative size breakdown for the system image: ## Instrumentation -To enable detailed timing output, set the environment variable: +To enable detailed timing output, uncomment `#define JL_DEBUG_LOADING` near the top of `src/staticdata.c` and rebuild Julia: + +```c +// Define JL_DEBUG_LOADING to enable detailed timing of pkgimage loading +#define JL_DEBUG_LOADING +``` + +Then rebuild with `make -C src` and run: ```bash -JULIA_DEBUG_LOADING=1 julia -e "using SomePackage" +julia -e "using SomePackage" ``` -This will print timing for each phase of image loading, including detailed breakdowns of type and object uniquing operations. +This will print timing for each phase of image loading, including detailed breakdowns of type and object uniquing operations. The instrumentation is compile-time guarded for zero overhead in release builds. ## Summary of Optimization Opportunities -| Priority | Target | Expected Impact | Complexity | Status | -|----------|--------|-----------------|------------|--------| -| High | Object uniquing (MI sorting) | ~3-4ms for large pkgs | Low | Tested - promising | -| High | Type uniquing lock scope | 2-6ms for large pkgs | Medium | Analysis complete | -| Medium | Parallel relocations | Tested - not beneficial | Medium | ❌ Rejected | -| Medium | Method activation batching | 1-130ms for method-heavy pkgs | Medium | Not started | -| Low | Symbol reading optimization | 1-2ms for large pkgs | Low | Not started | +### Highest Impact: Method Activation (63% of total time) + +The `activate methods` phase dominates package loading time, accounting for 206ms out of ~330ms total for `using Plots`. A single package (SparseArrays) takes 139ms - 67% of all method activation time. + +**Why SparseArrays is slow:** It extends Base methods (arithmetic operators, indexing, etc.) that have large method tables. Each external method requires: + +- `get_intersect_matches` - find all intersecting methods in Base's method tables +- `jl_type_morespecific` checks for each intersection +- `jl_type_intersection2` checks for each MethodInstance +- Potential invalidation of existing compiled code + +SparseArrays: 55μs per external method (2,528 methods) +Plots: 1.8μs per external method (2,388 methods) + +The 30x difference comes from which methods are being extended - Base's core arithmetic has much larger method tables than plotting-specific methods. + +**Potential optimizations:** + +| Approach | Expected Impact | Complexity | Status | +|----------|-----------------|------------|--------| +| Batch method activation by method table | 10-30% | Medium | ❌ Rejected - sorting overhead | +| Parallel type intersection checks | 20-40% | High | Not tested | +| Pre-compute intersection flags at precompile | 10-20% | Medium | Not tested | +| Lazy method activation (on-demand) | Up to 100% for unused methods | High | Not tested | + +### Medium Impact: Apply Relocations (16%) + +52ms for all packages. Currently sequential processing. 
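+
+Schematically, the hot loop interleaves decode and apply, so the image streams through the cache once (a sketch only; `read_reloc_offset` and `decode_target` are placeholders for the real encoding):
+
+```c
+for (size_t i = 0; i < n_relocs; i++) {
+    uintptr_t offset = read_reloc_offset(&relocs_stream);  // hypothetical decoder
+    uintptr_t *slot = (uintptr_t*)(image_base + offset);
+    *slot = decode_target(*slot, image_base);              // patch pointer in place
+}
+```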
+ +**Status:** Parallel attempt failed due to memory allocation overhead and cache effects. + +**Remaining ideas:** + +- SIMD pointer arithmetic +- Memory-mapped I/O for large images +- Streaming decode with prefetching + +### Lower Impact: Uniquing Operations (12%) + +Object uniquing: 24ms (7%) +Type uniquing: 15ms (5%) + +**Tested and rejected:** Save-side sorting by method pointer made things 15% slower by destroying natural serialization locality. + +**Remaining ideas:** + +| Approach | Expected Impact | Complexity | Status | +|----------|-----------------|------------|--------| +| Type uniquing lock scope reduction | 2-6ms | Medium | Not tested | +| Reduce uniquing_objs count at precompile | Variable | High | Not tested | +| Pre-warm method specialization tables | 1-3ms | Low | Not tested | +| Sort uniquing_objs by method pointer | ~3ms | Low | ❌ Rejected | + +### Lower Priority + +| Target | Time | Notes | +|--------|------|-------| +| read symbols | 19ms | Could investigate compression | +| fixup objects | 9ms | Sequential, may benefit from batching | +| add methods | 5ms | Already fast | + +## Rejected Optimizations + +### Parallel Relocations + +Attempted a two-phase approach: decode all relocation entries into a buffer, then apply in parallel using pthreads. + +**Result:** Slower (45ms → 48ms) due to memory allocation overhead, thread creation/join cost, and cache effects from non-sequential access. + +### Save-side Sorting of uniquing_objs + +**Hypothesis:** Sorting MIs by method pointer during serialization would improve cache locality during load, keeping each method's specializations hash table hot in CPU cache. + +**Implementation:** Added parallel arraylist to store method pointers, sorted (offset, method) pairs before writing. -### Tested Optimizations +**Results from `using Plots` benchmark:** -#### Save-side sorting of uniquing_objs by method pointer +| Metric | Master | PR (sorted) | Diff | +|--------|--------|-------------|------| +| Object uniquing (Plots) | 12.1 ms | 13.9 ms | **+15% slower** | +| Total restore (Plots) | 33.9 ms | 36.7 ms | +8% slower | +| Total restore (all 123 pkgs) | 131.1 ms | 222.9 ms | **+70% slower** | +| mi_done / mi_lookup | 36767 / 25217 | 36767 / 25217 | Same | -**Approach:** During pkgimage serialization, sort the `uniquing_objs` list by method pointer before writing. This requires: +**Why it failed:** The identical mi_done/mi_lookup ratios show that sorting doesn't change the "already done" cache hit rate. The sorting appears to have **destroyed** natural locality that existed in the original serialization order. Objects are serialized in traversal order, which likely groups related items together. Sorting by method pointer spreads out items that were naturally co-located. -1. Add a parallel arraylist `uniquing_objs_methods` to store method pointers alongside offsets -2. In `record_uniquing`, store the method pointer for each MI (NULL for bindings) -3. Before writing `uniquing_objs`, build an array of (offset, method) pairs, sort by method, then write just the offsets +**Lesson learned:** The serialization traversal order may already have good cache locality properties that shouldn't be disturbed. 
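+
+For reference, the core of the reverted sort, condensed from the reverted commit. (Note that, despite the in-code comment there claiming bindings sort "to the end", the ascending comparator actually places bindings, which carry `method = 0`, at the front of the order - harmless either way, since only a handful of bindings exist.)
+
+```c
+typedef struct {
+    uintptr_t offset;   // offset in the serialized stream
+    uintptr_t method;   // method pointer for sorting (0 for bindings)
+} uniquing_obj_entry_t;
+
+static int uniquing_obj_entry_cmp(const void *a, const void *b)
+{
+    const uniquing_obj_entry_t *ea = (const uniquing_obj_entry_t*)a;
+    const uniquing_obj_entry_t *eb = (const uniquing_obj_entry_t*)b;
+    return (ea->method > eb->method) - (ea->method < eb->method);
+}
+
+/* Before jl_write_arraylist(s.relocs, &s.uniquing_objs): build
+   (offset, method) pairs, qsort with the comparator above, then
+   write back just the sorted offsets. */
+```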
-**Implementation details:** +### Batch Method Activation by Method Table -- Add `uniquing_obj_entry_t` struct with offset and method pointer -- Add `uniquing_obj_entry_cmp` comparator function -- Modify `record_uniquing` to also push to `uniquing_objs_methods` -- Before `jl_write_arraylist` for uniquing_objs, sort and reorder +**Hypothesis:** Sorting external methods by their method table before activation would improve cache locality when accessing `mt->defs` for intersection checks. -**Results:** ~24% reduction in object uniquing time (14.9ms → 11.3ms for Plots). The improvement comes from better cache locality - consecutive MIs for the same method keep the method's specializations hash table hot in CPU cache. +**Implementation:** Added qsort to order entries by method table pointer before the activation loop. -**Trade-offs:** +**Result:** 10% slower than master. The sorting overhead outweighed any cache benefits, and likely destroyed beneficial natural ordering (methods are serialized in dependency order which may naturally group related activations). -- Adds ~10-20ms to pkgimage save time (one-time cost during precompilation) -- Increases serialization memory usage slightly +**Lesson learned:** Same as above - the natural serialization order has good properties. -### Performance Characteristics +## Performance Characteristics -**Object uniquing rate:** ~4.1K objects/ms (~240ns per object) +**Object uniquing rate:** ~5.2K objects/ms (~195ns per object average) - 59% cache hit rate (already uniquified by earlier references) - 41% require lookup/insertion via `jl_specializations_get_linfo` +- Per-lookup cost: ~480ns -**Type uniquing rate:** ~8.2K types/ms (~122ns per type) +**Type uniquing rate:** ~8.7K types/ms (~115ns per type average) - 68% cache hit rate (already uniquified by dependencies) - 1% found via cache lookup From 0a0c876894b13ff0a9bc497d1252a6855b7b723f Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Tue, 2 Dec 2025 14:17:59 -0500 Subject: [PATCH 06/14] skip invalidation during load in precompile workers --- src/gf.c | 107 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 60 insertions(+), 47 deletions(-) diff --git a/src/gf.c b/src/gf.c index 1d3a9636ddfa9..2234181551df7 100644 --- a/src/gf.c +++ b/src/gf.c @@ -2882,6 +2882,10 @@ void jl_method_table_activate(jl_typemap_entry_t *newentry) jl_atomic_store_relaxed(&newentry->min_world, world); jl_atomic_store_relaxed(&method->primary_world, world); size_t max_world = world - 1; + // During incremental precompilation, skip expensive invalidation since there's + // no user-compiled code to invalidate. Dispatch correctness is maintained by + // still computing dispatch_status and interferences. 
+ int skip_invalidation = jl_generating_output() && jl_options.incremental; jl_value_t *loctag = NULL; // debug info for invalidation jl_value_t *isect = NULL; jl_value_t *isect2 = NULL; @@ -2924,7 +2928,8 @@ void jl_method_table_activate(jl_typemap_entry_t *newentry) invalidated = 1; method_overwrite(newentry, m); // This is an optimized version of below, given we know the type-intersection is exact - jl_method_table_invalidate(m, max_world); + if (!skip_invalidation) + jl_method_table_invalidate(m, max_world); int m_dispatch = jl_atomic_load_relaxed(&m->dispatch_status); // Clear METHOD_SIG_LATEST_ONLY and METHOD_SIG_LATEST_WHICH bits jl_atomic_store_relaxed(&m->dispatch_status, 0); @@ -2958,22 +2963,24 @@ void jl_method_table_activate(jl_typemap_entry_t *newentry) jl_gc_wb(m2, m2_interferences); } } - loctag = jl_atomic_load_relaxed(&m->specializations); // use loctag for a gcroot - _Atomic(jl_method_instance_t*) *data; - size_t l; - if (jl_is_svec(loctag)) { - data = (_Atomic(jl_method_instance_t*)*)jl_svec_data(loctag); - l = jl_svec_len(loctag); - } - else { - data = (_Atomic(jl_method_instance_t*)*) &loctag; - l = 1; - } - for (size_t i = 0; i < l; i++) { - jl_method_instance_t *mi = jl_atomic_load_relaxed(&data[i]); - if ((jl_value_t*)mi == jl_nothing) - continue; - jl_array_ptr_1d_push(oldmi, (jl_value_t*)mi); + if (!skip_invalidation) { + loctag = jl_atomic_load_relaxed(&m->specializations); // use loctag for a gcroot + _Atomic(jl_method_instance_t*) *data; + size_t l; + if (jl_is_svec(loctag)) { + data = (_Atomic(jl_method_instance_t*)*)jl_svec_data(loctag); + l = jl_svec_len(loctag); + } + else { + data = (_Atomic(jl_method_instance_t*)*) &loctag; + l = 1; + } + for (size_t i = 0; i < l; i++) { + jl_method_instance_t *mi = jl_atomic_load_relaxed(&data[i]); + if ((jl_value_t*)mi == jl_nothing) + continue; + jl_array_ptr_1d_push(oldmi, (jl_value_t*)mi); + } } d = NULL; n = 0; @@ -3012,6 +3019,10 @@ void jl_method_table_activate(jl_typemap_entry_t *newentry) if (morespec[j]) continue; + // Skip invalidation during precompilation - no user code to invalidate + if (skip_invalidation) + continue; + // Now examine if this caused any invalidations. 
loctag = jl_atomic_load_relaxed(&m->specializations); // use loctag for a gcroot _Atomic(jl_method_instance_t*) *data; @@ -3075,39 +3086,41 @@ void jl_method_table_activate(jl_typemap_entry_t *newentry) jl_methcache_t *mc = jl_method_table->cache; JL_LOCK(&mc->writelock); - struct _typename_invalidate_backedge typename_env = {type, &isect, &isect2, d, n, max_world, invalidated}; - if (!jl_foreach_top_typename_for(_typename_invalidate_backedges, type, 1, &typename_env)) { - // if the new method cannot be split into exact backedges, scan the whole table for anything that might be affected - jl_genericmemory_t *allbackedges = jl_method_table->backedges; - for (size_t i = 0, n = allbackedges->length; i < n; i += 2) { - jl_value_t *tn = jl_genericmemory_ptr_ref(allbackedges, i); - jl_value_t *backedges = jl_genericmemory_ptr_ref(allbackedges, i+1); - if (tn && tn != jl_nothing && backedges) - _typename_invalidate_backedges((jl_typename_t*)tn, 0, &typename_env); - } - } - invalidated |= typename_env.invalidated; - if (oldmi && jl_array_nrows(oldmi)) { - // drop leafcache and search mc->cache and drop anything that might overlap with the new method - // this is very cheap, so we don't mind being very conservative at over-approximating this - struct invalidate_mt_env mt_cache_env; - mt_cache_env.max_world = max_world; - mt_cache_env.shadowed = oldmi; - mt_cache_env.newentry = newentry; - - jl_typemap_visitor(jl_atomic_load_relaxed(&mc->cache), invalidate_mt_cache, (void*)&mt_cache_env); - jl_genericmemory_t *leafcache = jl_atomic_load_relaxed(&mc->leafcache); - size_t i, l = leafcache->length; - for (i = 1; i < l; i += 2) { - jl_value_t *entry = jl_genericmemory_ptr_ref(leafcache, i); - if (entry) { - while (entry != jl_nothing) { - jl_atomic_store_relaxed(&((jl_typemap_entry_t*)entry)->max_world, max_world); - entry = (jl_value_t*)jl_atomic_load_relaxed(&((jl_typemap_entry_t*)entry)->next); + if (!skip_invalidation) { + struct _typename_invalidate_backedge typename_env = {type, &isect, &isect2, d, n, max_world, invalidated}; + if (!jl_foreach_top_typename_for(_typename_invalidate_backedges, type, 1, &typename_env)) { + // if the new method cannot be split into exact backedges, scan the whole table for anything that might be affected + jl_genericmemory_t *allbackedges = jl_method_table->backedges; + for (size_t i = 0, n = allbackedges->length; i < n; i += 2) { + jl_value_t *tn = jl_genericmemory_ptr_ref(allbackedges, i); + jl_value_t *backedges = jl_genericmemory_ptr_ref(allbackedges, i+1); + if (tn && tn != jl_nothing && backedges) + _typename_invalidate_backedges((jl_typename_t*)tn, 0, &typename_env); + } + } + invalidated |= typename_env.invalidated; + if (oldmi && jl_array_nrows(oldmi)) { + // drop leafcache and search mc->cache and drop anything that might overlap with the new method + // this is very cheap, so we don't mind being very conservative at over-approximating this + struct invalidate_mt_env mt_cache_env; + mt_cache_env.max_world = max_world; + mt_cache_env.shadowed = oldmi; + mt_cache_env.newentry = newentry; + + jl_typemap_visitor(jl_atomic_load_relaxed(&mc->cache), invalidate_mt_cache, (void*)&mt_cache_env); + jl_genericmemory_t *leafcache = jl_atomic_load_relaxed(&mc->leafcache); + size_t i, l = leafcache->length; + for (i = 1; i < l; i += 2) { + jl_value_t *entry = jl_genericmemory_ptr_ref(leafcache, i); + if (entry) { + while (entry != jl_nothing) { + jl_atomic_store_relaxed(&((jl_typemap_entry_t*)entry)->max_world, max_world); + entry = 
(jl_value_t*)jl_atomic_load_relaxed(&((jl_typemap_entry_t*)entry)->next); + } } } + jl_atomic_store_relaxed(&mc->leafcache, (jl_genericmemory_t*)jl_an_empty_memory_any); } - jl_atomic_store_relaxed(&mc->leafcache, (jl_genericmemory_t*)jl_an_empty_memory_any); } JL_UNLOCK(&mc->writelock); if (invalidated && _jl_debug_method_invalidation) { From 24094df498862c52b86e8768bd6669ade99154f8 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Tue, 2 Dec 2025 14:27:15 -0500 Subject: [PATCH 07/14] Update pkgimage-loading-performance.md --- .../devdocs/pkgimage-loading-performance.md | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/doc/src/devdocs/pkgimage-loading-performance.md b/doc/src/devdocs/pkgimage-loading-performance.md index baddf859c9eb4..04fbae3e2024e 100644 --- a/doc/src/devdocs/pkgimage-loading-performance.md +++ b/doc/src/devdocs/pkgimage-loading-performance.md @@ -198,6 +198,56 @@ The 30x difference comes from which methods are being extended - Base's core ari | Parallel type intersection checks | 20-40% | High | Not tested | | Pre-compute intersection flags at precompile | 10-20% | Medium | Not tested | | Lazy method activation (on-demand) | Up to 100% for unused methods | High | Not tested | +| Skip invalidation during precompilation | 3% | Low | ✅ Implemented | + +### Skip Invalidation During Precompilation (Implemented) + +During incremental precompilation (`jl_generating_output() && jl_options.incremental`), method activation can skip the expensive invalidation checks because: + +1. **No user-compiled code exists to invalidate** - all caches are being built fresh +2. **Dispatch correctness is maintained** - `dispatch_status` and `interferences` are still computed +3. **The final pkgimage captures correct state** - whatever we compute gets saved + +**Implementation:** Added `skip_invalidation` flag in `jl_method_table_activate` (`src/gf.c`) that skips: + +- Per-MethodInstance type intersection checks (`jl_type_intersection2`) +- Backedge invalidation (`_invalidate_dispatch_backedges`) +- Cache invalidation (`_typename_invalidate_backedges`, `invalidate_mt_cache`) +- Method table invalidation for replaced methods (`jl_method_table_invalidate`) + +**Results:** + +| Metric | Master | PR | Improvement | +|--------|--------|-----|-------------| +| GLMakie precompilation (247 deps) | 201s | 195s | **3% faster** | + +The improvement is modest because the type intersection and morespecific checks (which we keep for dispatch correctness) dominate the activation cost. The invalidation loops we skip are relatively cheap. + +### Further Opportunities (Not Yet Implemented) + +The remaining expensive operations in method activation are: + +1. **`get_intersect_matches`** - Finds all methods in Base that intersect with the new method's signature. Required to compute `dispatch_bits` and update other methods' `interferences`. + +2. **`jl_type_morespecific` loop** - Called for each intersecting method to determine ambiguity and update dispatch optimization flags. + +3. **Interference set updates** - Updates both the new method's `interferences` and existing methods' interference sets. + +**Why these can't be skipped during precompilation:** + +- **Interference sets are required for correct dispatch**, not just optimization. Skipping updates to other methods' interference sets causes `MethodError` during precompilation when dispatch relies on these sets to find the correct method. 
+- The `interferences` field is used in method sorting during dispatch to determine which methods should be considered. +- Even though Base methods aren't saved with the package, their interference sets must be correct during precompilation for any code that runs (e.g., `__init__`, type inference). + +**Tested and rejected:** Skipping interference set updates during precompilation caused `MethodError` failures - dispatch couldn't find methods that should have matched. + +**Potential future optimizations:** + +| Approach | Expected Impact | Complexity | Risk | +|----------|-----------------|------------|------| +| Lazy interference computation (on first dispatch) | 10-30% | Very High | Requires dispatch changes | +| Parallel type intersection checks | 20-40% | High | Thread safety concerns | +| Cache intersection results across pkgimage loads | 5-15% | Medium | Memory overhead | ### Medium Impact: Apply Relocations (16%) From fb88422be3b49d4d28d00ba514996891e761d57b Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Thu, 4 Dec 2025 23:39:09 -0500 Subject: [PATCH 08/14] add JL_DEBUG_SAVING --- base/loading.jl | 45 ++++++++++ .../devdocs/pkgimage-loading-performance.md | 16 +++- src/staticdata.c | 84 ++++++++++++++++++- 3 files changed, 140 insertions(+), 5 deletions(-) diff --git a/base/loading.jl b/base/loading.jl index 32617b77ad72b..0af1692a91ec7 100644 --- a/base/loading.jl +++ b/base/loading.jl @@ -3217,10 +3217,40 @@ end # Const global for GC root const newly_inferred = CodeInstance[] +# Debug timing support for precompilation (controlled by JL_DEBUG_SAVING in staticdata.c) +function _debug_saving_enabled() + ccall(:jl_is_debug_saving_enabled, Cint, ()) != 0 +end + +function _pkgsave_print(msg::AbstractString) + str = "[pkgsave] " * string(msg) * "\n" + ccall(:jl_safe_printf, Cvoid, (Cstring,), str) +end + +function _pkgsave_timing(desc::AbstractString, elapsed_ns::UInt64) + elapsed_s = Float64(elapsed_ns) / 1e9 + # Format with 3 decimal places - manually ensure trailing zeros + rounded = round(elapsed_s, digits=3) + int_part = trunc(Int, rounded) + frac_part = round(Int, (rounded - int_part) * 1000) + time_str = string(int_part) * "." 
* lpad(string(frac_part), 3, '0') + padded_desc = rpad(string(desc), 40) + padded_time = lpad(time_str, 9) # Match C's %9.3f format + str = "[pkgsave] " * padded_desc * " " * padded_time * " s\n" + ccall(:jl_safe_printf, Cvoid, (Cstring,), str) +end + # this is called in the external process that generates precompiled package files function include_package_for_output(pkg::PkgId, input::String, syntax_version::VersionNumber, depot_path::Vector{String}, dl_load_path::Vector{String}, load_path::Vector{String}, concrete_deps::typeof(_concrete_dependencies), source::Union{Nothing,String}) + debug_timing = _debug_saving_enabled() + total_start = time_ns() + + if debug_timing + _pkgsave_print("=== Precompiling $(pkg.name) ===") + end + setup_start = time_ns() @lock require_lock begin m = start_loading(pkg, UInt128(0), false) @assert m === nothing @@ -3246,6 +3276,12 @@ function include_package_for_output(pkg::PkgId, input::String, syntax_version::V __toplevel__._internal_julia_parse = Experimental.VersionedParse(syntax_version) # This one is the compatibility marker for cache loading __toplevel__._internal_syntax_version = cache_syntax_version(syntax_version) + + if debug_timing + _pkgsave_timing("setup & dependency loading", time_ns() - setup_start) + end + + include_start = time_ns() try Base.include(Base.__toplevel__, input) catch ex @@ -3255,6 +3291,11 @@ function include_package_for_output(pkg::PkgId, input::String, syntax_version::V finally ccall(:jl_set_newly_inferred, Cvoid, (Any,), nothing) end + + if debug_timing + _pkgsave_timing("include (parse, lower, inference)", time_ns() - include_start) + end + # check that the package defined the expected module so we can give a nice error message if not m = maybe_root_module(pkg) m isa Module || check_package_module_loaded_error(pkg) @@ -3266,6 +3307,10 @@ function include_package_for_output(pkg::PkgId, input::String, syntax_version::V @lock require_lock end_loading(pkg, m) # insert_extension_triggers(pkg) # run_package_callbacks(pkg) + + if debug_timing + _pkgsave_timing("TOTAL Julia-side precompilation", time_ns() - total_start) + end end function check_package_module_loaded_error(pkg) diff --git a/doc/src/devdocs/pkgimage-loading-performance.md b/doc/src/devdocs/pkgimage-loading-performance.md index 04fbae3e2024e..996fef872197f 100644 --- a/doc/src/devdocs/pkgimage-loading-performance.md +++ b/doc/src/devdocs/pkgimage-loading-performance.md @@ -157,20 +157,31 @@ Representative size breakdown for the system image: ## Instrumentation -To enable detailed timing output, uncomment `#define JL_DEBUG_LOADING` near the top of `src/staticdata.c` and rebuild Julia: +To enable detailed timing output for package **loading**, uncomment `#define JL_DEBUG_LOADING` near the top of `src/staticdata.c` and rebuild Julia: ```c // Define JL_DEBUG_LOADING to enable detailed timing of pkgimage loading #define JL_DEBUG_LOADING ``` +To enable detailed timing output for package cache **generation** (saving), uncomment `#define JL_DEBUG_SAVING`: + +```c +// Define JL_DEBUG_SAVING to enable detailed timing of pkgimage generation +#define JL_DEBUG_SAVING +``` + Then rebuild with `make -C src` and run: ```bash +# For loading timing julia -e "using SomePackage" + +# For saving timing (during precompilation) +julia -e "using Pkg; Pkg.precompile()" ``` -This will print timing for each phase of image loading, including detailed breakdowns of type and object uniquing operations. The instrumentation is compile-time guarded for zero overhead in release builds. 
+This will print timing for each phase of image loading/saving, including detailed breakdowns of type and object uniquing operations. The instrumentation is compile-time guarded for zero overhead in release builds. ## Summary of Optimization Opportunities @@ -335,4 +346,3 @@ Attempted a two-phase approach: decode all relocation entries into a buffer, the - 68% cache hit rate (already uniquified by dependencies) - 1% found via cache lookup - 31% new types requiring creation - diff --git a/src/staticdata.c b/src/staticdata.c index a641365545de5..57a5745ce9a56 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -104,7 +104,7 @@ External links: #define JL_LOAD_TIMING_END(varname, desc) \ do { \ uint64_t varname##_end = jl_hrtime(); \ - jl_printf(JL_STDERR, "[pkgload] %-40s %8.3f ms\n", desc, (varname##_end - varname##_start) / 1e6); \ + jl_printf(JL_STDERR, "[pkgload] %-40s %9.3f s\n", desc, (varname##_end - varname##_start) / 1e9); \ } while(0) #define JL_LOAD_TIMING_COUNTER(var) var @@ -115,6 +115,37 @@ External links: #define JL_LOAD_TIMING_COUNTER(var) ((void)0) #endif +// Define JL_DEBUG_SAVING to enable detailed timing of pkgimage generation +// #define JL_DEBUG_SAVING + +#ifdef JL_DEBUG_SAVING +#define JL_SAVE_TIMING_PRINT(fmt, ...) \ + jl_printf(JL_STDERR, "[pkgsave] " fmt, ##__VA_ARGS__) + +#define JL_SAVE_TIMING_START(varname) \ + uint64_t varname##_start = jl_hrtime() + +#define JL_SAVE_TIMING_END(varname, desc) \ + do { \ + uint64_t varname##_end = jl_hrtime(); \ + jl_printf(JL_STDERR, "[pkgsave] %-40s %9.3f s\n", desc, (varname##_end - varname##_start) / 1e9); \ + } while(0) + +#define JL_SAVE_TIMING_COUNTER(var) var +static const int jl_debug_saving_enabled = 1; +#else +#define JL_SAVE_TIMING_PRINT(fmt, ...) ((void)0) +#define JL_SAVE_TIMING_START(varname) ((void)0) +#define JL_SAVE_TIMING_END(varname, desc) ((void)0) +#define JL_SAVE_TIMING_COUNTER(var) ((void)0) +static const int jl_debug_saving_enabled = 0; +#endif + +JL_DLLEXPORT int jl_is_debug_saving_enabled(void) JL_NOTSAFEPOINT +{ + return jl_debug_saving_enabled; +} + static const size_t WORLD_AGE_REVALIDATION_SENTINEL = 0x1; JL_DLLEXPORT size_t jl_require_world = ~(size_t)0; JL_DLLEXPORT _Atomic(size_t) jl_first_image_replacement_world = ~(size_t)0; @@ -2914,6 +2945,9 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, jl_array_t *module_init_order, jl_array_t *worklist, jl_array_t *extext_methods, jl_array_t *new_ext_cis, jl_query_cache *query_cache) { + JL_SAVE_TIMING_START(total); + JL_SAVE_TIMING_PRINT("Serializing %s image...\n", worklist ? 
"incremental" : "system"); + JL_SAVE_TIMING_START(init); htable_new(&field_replace, 0); htable_new(&bits_replace, 0); // strip metadata and IR when requested @@ -3061,8 +3095,10 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, } jl_genericmemory_t *global_roots_list = NULL; jl_genericmemory_t *global_roots_keyset = NULL; + JL_SAVE_TIMING_END(init, "initialization & setup"); { // step 1: record values (recursively) that need to go in the image + JL_SAVE_TIMING_START(queue_roots); size_t i; if (worklist == NULL) { for (i = 0; tags[i] != NULL; i++) { @@ -3091,22 +3127,32 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, // Queue the new specializations jl_queue_for_serialization(&s, new_ext_cis); } + JL_SAVE_TIMING_END(queue_roots, "queue roots"); + JL_SAVE_TIMING_START(serialize_reachable1); jl_serialize_reachable(&s); + JL_SAVE_TIMING_END(serialize_reachable1, "serialize reachable (1)"); // step 1.2: ensure all gvars are part of the sysimage too + JL_SAVE_TIMING_START(record_gvars); record_gvars(&s, &gvars); record_external_fns(&s, &external_fns); if (jl_options.trim) record_gvars(&s, &MIs); + JL_SAVE_TIMING_END(record_gvars, "record gvars & external fns"); + JL_SAVE_TIMING_START(serialize_reachable2); jl_serialize_reachable(&s); + JL_SAVE_TIMING_END(serialize_reachable2, "serialize reachable (2)"); // Beyond this point, all content should already have been visited, so now we can prune // the rest and add some internal root arrays. // step 1.3: include some other special roots if (s.incremental) { + JL_SAVE_TIMING_START(method_roots); // Queue the new roots array jl_queue_for_serialization(&s, s.method_roots_list); jl_serialize_reachable(&s); + JL_SAVE_TIMING_END(method_roots, "queue method roots"); } // step 1.4: prune (garbage collect) special weak references from the jl_global_roots_list + JL_SAVE_TIMING_START(prune_roots); if (worklist == NULL) { global_roots_list = jl_alloc_memory_any(0); global_roots_keyset = jl_alloc_memory_any(0); @@ -3122,7 +3168,9 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, jl_queue_for_serialization(&s, global_roots_keyset); jl_serialize_reachable(&s); } + JL_SAVE_TIMING_END(prune_roots, "prune global roots"); // step 1.5: prune (garbage collect) some special weak references known caches + JL_SAVE_TIMING_START(prune_caches); for (i = 0; i < serialization_queue.len; i++) { jl_value_t *v = (jl_value_t*)serialization_queue.items[i]; if (jl_is_method(v)) { @@ -3156,13 +3204,18 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, jl_prune_mt_backedges((jl_genericmemory_t*)backedges); } } + JL_SAVE_TIMING_END(prune_caches, "prune caches & backedges"); } uint32_t external_fns_begin = 0; { // step 2: build all the sysimg sections + JL_SAVE_TIMING_START(write_values); write_padding(&sysimg, sizeof(uintptr_t)); jl_write_values(&s); + JL_SAVE_TIMING_END(write_values, "write values"); + JL_SAVE_TIMING_START(write_gvars); external_fns_begin = write_gvars(&s, &gvars, &external_fns); + JL_SAVE_TIMING_END(write_gvars, "write gvars"); } // This ensures that we can use the low bit of addresses for @@ -3190,6 +3243,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, } // step 3: combine all of the sections into one file + JL_SAVE_TIMING_START(combine_sections); assert(ios_pos(f) % JL_CACHE_BYTE_ALIGNMENT == 0); ssize_t sysimg_offset = ios_pos(f); write_uint(f, sysimg.size - sizeof(uintptr_t)); @@ -3213,6 +3267,7 @@ static void 
jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, ios_close(&symbols); // Prepare and write the relocations sections, now that the rest of the image is laid out + JL_SAVE_TIMING_START(prepare_relocs); char *base = &f->buf[0]; jl_finish_relocs(base + sysimg_offset, sysimg_size, &s.gctags_list); jl_finish_relocs(base + sysimg_offset, sysimg_size, &s.relocs_list); @@ -3226,6 +3281,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, jl_write_arraylist(s.relocs, &s.fixup_types); } jl_write_arraylist(s.relocs, &s.fixup_objs); + JL_SAVE_TIMING_END(prepare_relocs, "prepare relocations"); write_uint(f, relocs.size); write_padding(f, LLT_ALIGN(ios_pos(f), 8) - ios_pos(f)); ios_seek(&relocs, 0); @@ -3243,8 +3299,10 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, ios_seek(&fptr_record, 0); ios_copyall(f, &fptr_record); ios_close(&fptr_record); + JL_SAVE_TIMING_END(combine_sections, "combine sections"); { // step 4: record locations of special roots + JL_SAVE_TIMING_START(write_roots); write_padding(f, LLT_ALIGN(ios_pos(f), 8) - ios_pos(f)); s.s = f; if (worklist == NULL) { @@ -3292,8 +3350,10 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, write_uint32(f, jl_array_len(s.link_ids_external_fnvars)); ios_write(f, (char*)jl_array_data(s.link_ids_external_fnvars, uint32_t), jl_array_len(s.link_ids_external_fnvars) * sizeof(uint32_t)); write_uint32(f, external_fns_begin); + JL_SAVE_TIMING_END(write_roots, "write special roots & link ids"); } + JL_SAVE_TIMING_START(cleanup); assert(object_worklist.len == 0); arraylist_free(&object_worklist); arraylist_free(&serialization_queue); @@ -3318,6 +3378,9 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, htable_free(&fptr_to_id); htable_free(&new_methtables); nsym_tag = 0; + JL_SAVE_TIMING_END(cleanup, "cleanup"); + JL_SAVE_TIMING_END(total, "TOTAL save time"); + JL_SAVE_TIMING_PRINT(" image size: %zu bytes\n", (size_t)ios_pos(f)); jl_gc_enable(en); } @@ -3365,6 +3428,8 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli ios_t **s, ios_t **z, jl_array_t **udeps, int64_t *srctextpos, jl_array_t *module_init_order) { JL_TIMING(SYSIMG_DUMP, SYSIMG_DUMP); + JL_SAVE_TIMING_START(create_image_total); + JL_SAVE_TIMING_PRINT("=== Creating %s image ===\n", worklist ? "incremental" : "system"); // iff emit_split // write header and src_text to one file f/s @@ -3390,10 +3455,13 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli mod_array = jl_get_loaded_modules(); // __toplevel__ modules loaded in this session (from Base.loaded_modules_array) if (worklist) { if (_native_data != NULL) { + JL_SAVE_TIMING_START(create_native); if (suppress_precompile) newly_inferred = NULL; *_native_data = jl_create_native(NULL, 0, 1, jl_atomic_load_acquire(&jl_world_counter), NULL, suppress_precompile ? 
(jl_array_t*)jl_an_empty_vec_any : worklist, 0, module_init_order); + JL_SAVE_TIMING_END(create_native, "native code generation (LLVM)"); } + JL_SAVE_TIMING_START(write_header); jl_write_header_for_incremental(f, worklist, mod_array, udeps, srctextpos, &checksumpos); if (emit_split) { checksumpos_ff = write_header(ff, 1); @@ -3404,15 +3472,20 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli else { checksumpos_ff = checksumpos; } + JL_SAVE_TIMING_END(write_header, "write header"); } else if (_native_data != NULL) { + JL_SAVE_TIMING_START(create_native_sys); *_native_data = jl_create_native(NULL, jl_options.trim, 0, jl_atomic_load_acquire(&jl_world_counter), mod_array, NULL, jl_options.compile_enabled == JL_OPTIONS_COMPILE_ALL, module_init_order); + JL_SAVE_TIMING_END(create_native_sys, "native code generation (LLVM)"); } if (_native_data != NULL) native_functions = *_native_data; + JL_SAVE_TIMING_START(gc_collect); jl_gc_collect(JL_GC_FULL); jl_gc_collect(JL_GC_INCREMENTAL); // sweep finalizers + JL_SAVE_TIMING_END(gc_collect, "GC collection"); // Make sure we don't run any Julia code concurrently after this point // since it will invalidate our serialization preparations jl_gc_enable_finalizers(ct, 0); @@ -3422,6 +3495,7 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli // extext_methods: [method1, ...], worklist-owned "extending external" methods added to functions owned by modules outside the worklist // Save the inferred code from newly inferred, external methods + JL_SAVE_TIMING_START(collect_cis); if (native_functions) { arraylist_t CIs; arraylist_new(&CIs, 0); @@ -3443,10 +3517,13 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli else { new_ext_cis = jl_compute_new_ext_cis(); } + JL_SAVE_TIMING_END(collect_cis, "collect code instances"); // Collect method extensions + JL_SAVE_TIMING_START(collect_methods); extext_methods = jl_alloc_vec_any(0); jl_collect_extext_methods(extext_methods, mod_array); + JL_SAVE_TIMING_END(collect_methods, "collect external methods"); if (!emit_split) { write_int32(f, 0); // No clone_targets @@ -3470,6 +3547,7 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli if (worklist) { // Go back and update the checksum in the header + JL_SAVE_TIMING_START(checksum); int64_t dataendpos = ios_pos(ff); uint32_t checksum = jl_crc32c(0, &ff->buf[datastartpos], dataendpos - datastartpos); ios_seek(ff, checksumpos_ff); @@ -3486,6 +3564,7 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli ios_seek(f, cur); // Next we will write the clone_targets and afterwards the srctext } + JL_SAVE_TIMING_END(checksum, "compute checksum"); } destroy_query_cache(&query_cache); @@ -3494,6 +3573,7 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli *s = f; if (emit_split) *z = ff; + JL_SAVE_TIMING_END(create_image_total, "TOTAL image creation time"); return; } @@ -4389,7 +4469,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl_add_methods(*extext_methods); JL_LOAD_TIMING_END(add_methods, "add methods"); JL_LOAD_TIMING_END(total, "TOTAL restore time"); - JL_LOAD_TIMING_PRINT(" image size: %zu bytes\n", sizeof_sysimg); + JL_LOAD_TIMING_PRINT(" image size: %zu bytes\n", sizeof_sysimg); } static jl_value_t *jl_validate_cache_file(ios_t *f, jl_array_t *depmods, uint64_t *checksum, int64_t *dataendpos, int64_t *datastartpos) From 
56c93cc786fe65fb69e7655acbdb679a4e31dda8 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Sat, 6 Dec 2025 13:31:36 -0500 Subject: [PATCH 09/14] include image gen in timing --- src/aotcompile.cpp | 13 +++++++++++++ src/staticdata.c | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index fe8b97a052f03..25a66be11c9a3 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -58,6 +58,8 @@ STATISTIC(CreateNativeMethods, "Number of methods compiled for jl_create_native" STATISTIC(CreateNativeMax, "Max number of methods compiled at once for jl_create_native"); STATISTIC(CreateNativeGlobals, "Number of globals compiled for jl_create_native"); +extern "C" JL_DLLEXPORT int jl_is_debug_saving_enabled(void) JL_NOTSAFEPOINT; + static void addComdat(GlobalValue *G, Triple &T) { if (T.isOSBinFormatCOFF() && !G->isDeclaration()) { @@ -2100,6 +2102,9 @@ void jl_dump_native_impl(void *native_code, jl_emission_params_t *params) { JL_TIMING(NATIVE_AOT, NATIVE_Dump); + uint64_t dump_native_start = 0; + if (jl_is_debug_saving_enabled()) + dump_native_start = jl_hrtime(); jl_native_code_desc_t *data = (jl_native_code_desc_t*)native_code; if (!bc_fname && !unopt_bc_fname && !obj_fname && !asm_fname) { LLVM_DEBUG(dbgs() << "No output requested, skipping native code dump?\n"); @@ -2285,6 +2290,10 @@ void jl_dump_native_impl(void *native_code, ); threads = compute_image_thread_count(module_info); LLVM_DEBUG(dbgs() << "Using " << threads << " to emit aot image\n"); + if (jl_is_debug_saving_enabled()) { + jl_safe_printf("[pkgsave] %-40s %9u\n", "LLVM threads:", threads); + jl_safe_printf("[pkgsave] %-40s %9zu\n", "LLVM module functions:", module_info.funcs); + } nfvars = data->jl_sysimg_fvars.size(); ngvars = data->jl_sysimg_gvars.size(); emit_table(dataM, data->jl_sysimg_gvars, "jl_gvars", T_psize); @@ -2473,6 +2482,10 @@ void jl_dump_native_impl(void *native_code, WRITE_ARCHIVE(asm_fname, asm_, "", ".s"); #undef WRITE_ARCHIVE } + if (jl_is_debug_saving_enabled() && dump_native_start != 0) { + uint64_t dump_native_end = jl_hrtime(); + jl_safe_printf("[pkgsave] %-40s %9.3f s\n", "LLVM opt & emit (dump_native)", (dump_native_end - dump_native_start) / 1e9); + } } diff --git a/src/staticdata.c b/src/staticdata.c index 57a5745ce9a56..3799dca9c9fac 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -3459,7 +3459,7 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli if (suppress_precompile) newly_inferred = NULL; *_native_data = jl_create_native(NULL, 0, 1, jl_atomic_load_acquire(&jl_world_counter), NULL, suppress_precompile ? 
(jl_array_t*)jl_an_empty_vec_any : worklist, 0, module_init_order); - JL_SAVE_TIMING_END(create_native, "native code generation (LLVM)"); + JL_SAVE_TIMING_END(create_native, "codegen (method gen & LLVM IR)"); } JL_SAVE_TIMING_START(write_header); jl_write_header_for_incremental(f, worklist, mod_array, udeps, srctextpos, &checksumpos); @@ -3477,7 +3477,7 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli else if (_native_data != NULL) { JL_SAVE_TIMING_START(create_native_sys); *_native_data = jl_create_native(NULL, jl_options.trim, 0, jl_atomic_load_acquire(&jl_world_counter), mod_array, NULL, jl_options.compile_enabled == JL_OPTIONS_COMPILE_ALL, module_init_order); - JL_SAVE_TIMING_END(create_native_sys, "native code generation (LLVM)"); + JL_SAVE_TIMING_END(create_native_sys, "codegen (method gen & LLVM IR)"); } if (_native_data != NULL) native_functions = *_native_data; From 42016ebc23254e6641538ba40125bc52239ea6dd Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Sat, 6 Dec 2025 13:46:45 -0500 Subject: [PATCH 10/14] reorg --- src/staticdata.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/staticdata.c b/src/staticdata.c index 3799dca9c9fac..f82c629116995 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -91,33 +91,12 @@ External links: #include "valgrind.h" #include "julia_assert.h" -// Define JL_DEBUG_LOADING to enable detailed timing of pkgimage loading -// #define JL_DEBUG_LOADING - -#ifdef JL_DEBUG_LOADING -#define JL_LOAD_TIMING_PRINT(fmt, ...) \ - jl_printf(JL_STDERR, "[pkgload] " fmt, ##__VA_ARGS__) - -#define JL_LOAD_TIMING_START(varname) \ - uint64_t varname##_start = jl_hrtime() - -#define JL_LOAD_TIMING_END(varname, desc) \ - do { \ - uint64_t varname##_end = jl_hrtime(); \ - jl_printf(JL_STDERR, "[pkgload] %-40s %9.3f s\n", desc, (varname##_end - varname##_start) / 1e9); \ - } while(0) - -#define JL_LOAD_TIMING_COUNTER(var) var -#else -#define JL_LOAD_TIMING_PRINT(fmt, ...) ((void)0) -#define JL_LOAD_TIMING_START(varname) ((void)0) -#define JL_LOAD_TIMING_END(varname, desc) ((void)0) -#define JL_LOAD_TIMING_COUNTER(var) ((void)0) -#endif - // Define JL_DEBUG_SAVING to enable detailed timing of pkgimage generation // #define JL_DEBUG_SAVING +// Define JL_DEBUG_LOADING to enable detailed timing of pkgimage loading +// #define JL_DEBUG_LOADING + #ifdef JL_DEBUG_SAVING #define JL_SAVE_TIMING_PRINT(fmt, ...) \ jl_printf(JL_STDERR, "[pkgsave] " fmt, ##__VA_ARGS__) @@ -146,6 +125,27 @@ JL_DLLEXPORT int jl_is_debug_saving_enabled(void) JL_NOTSAFEPOINT return jl_debug_saving_enabled; } +#ifdef JL_DEBUG_LOADING +#define JL_LOAD_TIMING_PRINT(fmt, ...) \ + jl_printf(JL_STDERR, "[pkgload] " fmt, ##__VA_ARGS__) + +#define JL_LOAD_TIMING_START(varname) \ + uint64_t varname##_start = jl_hrtime() + +#define JL_LOAD_TIMING_END(varname, desc) \ + do { \ + uint64_t varname##_end = jl_hrtime(); \ + jl_printf(JL_STDERR, "[pkgload] %-40s %9.3f s\n", desc, (varname##_end - varname##_start) / 1e9); \ + } while(0) + +#define JL_LOAD_TIMING_COUNTER(var) var +#else +#define JL_LOAD_TIMING_PRINT(fmt, ...) 
((void)0) +#define JL_LOAD_TIMING_START(varname) ((void)0) +#define JL_LOAD_TIMING_END(varname, desc) ((void)0) +#define JL_LOAD_TIMING_COUNTER(var) ((void)0) +#endif + static const size_t WORLD_AGE_REVALIDATION_SENTINEL = 0x1; JL_DLLEXPORT size_t jl_require_world = ~(size_t)0; JL_DLLEXPORT _Atomic(size_t) jl_first_image_replacement_world = ~(size_t)0; From 7aebb655f35bd378f1e453750b4709c2c1681e3e Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Sat, 6 Dec 2025 14:20:50 -0500 Subject: [PATCH 11/14] formatting etc. --- base/loading.jl | 13 ++++++++----- src/aotcompile.cpp | 10 ++++++---- src/staticdata.c | 43 ++++++++++++++++++++++++++----------------- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/base/loading.jl b/base/loading.jl index 0af1692a91ec7..d130d67adc826 100644 --- a/base/loading.jl +++ b/base/loading.jl @@ -3227,16 +3227,19 @@ function _pkgsave_print(msg::AbstractString) ccall(:jl_safe_printf, Cvoid, (Cstring,), str) end -function _pkgsave_timing(desc::AbstractString, elapsed_ns::UInt64) +# level 1 = top-level phases (2 spaces), level 2 = sub-phases (4 spaces) +function _pkgsave_timing(desc::AbstractString, elapsed_ns::UInt64; level::Int=1) elapsed_s = Float64(elapsed_ns) / 1e9 # Format with 3 decimal places - manually ensure trailing zeros rounded = round(elapsed_s, digits=3) int_part = trunc(Int, rounded) frac_part = round(Int, (rounded - int_part) * 1000) time_str = string(int_part) * "." * lpad(string(frac_part), 3, '0') - padded_desc = rpad(string(desc), 40) - padded_time = lpad(time_str, 9) # Match C's %9.3f format - str = "[pkgsave] " * padded_desc * " " * padded_time * " s\n" + indent = level == 1 ? " " : " " + width = level == 1 ? 42 : 40 + padded_desc = rpad(string(desc), width) + padded_time = lpad(time_str, 9) + str = "[pkgsave]" * indent * padded_desc * " " * padded_time * " s\n" ccall(:jl_safe_printf, Cvoid, (Cstring,), str) end @@ -3309,7 +3312,7 @@ function include_package_for_output(pkg::PkgId, input::String, syntax_version::V # run_package_callbacks(pkg) if debug_timing - _pkgsave_timing("TOTAL Julia-side precompilation", time_ns() - total_start) + _pkgsave_timing("SUBTOTAL Julia-side precompilation", time_ns() - total_start) end end diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 25a66be11c9a3..cb3065fa5b69a 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -2103,8 +2103,10 @@ void jl_dump_native_impl(void *native_code, { JL_TIMING(NATIVE_AOT, NATIVE_Dump); uint64_t dump_native_start = 0; - if (jl_is_debug_saving_enabled()) + if (jl_is_debug_saving_enabled()) { + jl_safe_printf("[pkgsave] Writing native code...\n"); dump_native_start = jl_hrtime(); + } jl_native_code_desc_t *data = (jl_native_code_desc_t*)native_code; if (!bc_fname && !unopt_bc_fname && !obj_fname && !asm_fname) { LLVM_DEBUG(dbgs() << "No output requested, skipping native code dump?\n"); @@ -2291,8 +2293,8 @@ void jl_dump_native_impl(void *native_code, threads = compute_image_thread_count(module_info); LLVM_DEBUG(dbgs() << "Using " << threads << " to emit aot image\n"); if (jl_is_debug_saving_enabled()) { - jl_safe_printf("[pkgsave] %-40s %9u\n", "LLVM threads:", threads); - jl_safe_printf("[pkgsave] %-40s %9zu\n", "LLVM module functions:", module_info.funcs); + jl_safe_printf("[pkgsave] %-40s %9u\n", "threads:", threads); + jl_safe_printf("[pkgsave] %-40s %9zu\n", "functions:", module_info.funcs); } nfvars = data->jl_sysimg_fvars.size(); ngvars = data->jl_sysimg_gvars.size(); @@ -2484,7 +2486,7 @@ void jl_dump_native_impl(void *native_code, } if 
(jl_is_debug_saving_enabled() && dump_native_start != 0) { uint64_t dump_native_end = jl_hrtime(); - jl_safe_printf("[pkgsave] %-40s %9.3f s\n", "LLVM opt & emit (dump_native)", (dump_native_end - dump_native_start) / 1e9); + jl_safe_printf("[pkgsave] %-40s %9.3f s\n", "SUBTOTAL image generation", (dump_native_end - dump_native_start) / 1e9); } } diff --git a/src/staticdata.c b/src/staticdata.c index f82c629116995..31b85f4acc11c 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -104,7 +104,15 @@ External links: #define JL_SAVE_TIMING_START(varname) \ uint64_t varname##_start = jl_hrtime() +// Level 1 indent (top-level phases) #define JL_SAVE_TIMING_END(varname, desc) \ + do { \ + uint64_t varname##_end = jl_hrtime(); \ + jl_printf(JL_STDERR, "[pkgsave] %-42s %9.3f s\n", desc, (varname##_end - varname##_start) / 1e9); \ + } while(0) + +// Level 2 indent (sub-phases) +#define JL_SAVE_TIMING_END_L2(varname, desc) \ do { \ uint64_t varname##_end = jl_hrtime(); \ jl_printf(JL_STDERR, "[pkgsave] %-40s %9.3f s\n", desc, (varname##_end - varname##_start) / 1e9); \ @@ -116,6 +124,7 @@ static const int jl_debug_saving_enabled = 1; #define JL_SAVE_TIMING_PRINT(fmt, ...) ((void)0) #define JL_SAVE_TIMING_START(varname) ((void)0) #define JL_SAVE_TIMING_END(varname, desc) ((void)0) +#define JL_SAVE_TIMING_END_L2(varname, desc) ((void)0) #define JL_SAVE_TIMING_COUNTER(var) ((void)0) static const int jl_debug_saving_enabled = 0; #endif @@ -2946,7 +2955,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, jl_array_t *new_ext_cis, jl_query_cache *query_cache) { JL_SAVE_TIMING_START(total); - JL_SAVE_TIMING_PRINT("Serializing %s image...\n", worklist ? "incremental" : "system"); + JL_SAVE_TIMING_PRINT(" Serializing %s image...\n", worklist ? "incremental" : "system"); JL_SAVE_TIMING_START(init); htable_new(&field_replace, 0); htable_new(&bits_replace, 0); @@ -3095,7 +3104,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, } jl_genericmemory_t *global_roots_list = NULL; jl_genericmemory_t *global_roots_keyset = NULL; - JL_SAVE_TIMING_END(init, "initialization & setup"); + JL_SAVE_TIMING_END_L2(init, "initialization & setup"); { // step 1: record values (recursively) that need to go in the image JL_SAVE_TIMING_START(queue_roots); @@ -3127,20 +3136,20 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, // Queue the new specializations jl_queue_for_serialization(&s, new_ext_cis); } - JL_SAVE_TIMING_END(queue_roots, "queue roots"); + JL_SAVE_TIMING_END_L2(queue_roots, "queue roots"); JL_SAVE_TIMING_START(serialize_reachable1); jl_serialize_reachable(&s); - JL_SAVE_TIMING_END(serialize_reachable1, "serialize reachable (1)"); + JL_SAVE_TIMING_END_L2(serialize_reachable1, "serialize reachable (1)"); // step 1.2: ensure all gvars are part of the sysimage too JL_SAVE_TIMING_START(record_gvars); record_gvars(&s, &gvars); record_external_fns(&s, &external_fns); if (jl_options.trim) record_gvars(&s, &MIs); - JL_SAVE_TIMING_END(record_gvars, "record gvars & external fns"); + JL_SAVE_TIMING_END_L2(record_gvars, "record gvars & external fns"); JL_SAVE_TIMING_START(serialize_reachable2); jl_serialize_reachable(&s); - JL_SAVE_TIMING_END(serialize_reachable2, "serialize reachable (2)"); + JL_SAVE_TIMING_END_L2(serialize_reachable2, "serialize reachable (2)"); // Beyond this point, all content should already have been visited, so now we can prune // the rest and add some internal root arrays. 
// step 1.3: include some other special roots @@ -3149,7 +3158,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, // Queue the new roots array jl_queue_for_serialization(&s, s.method_roots_list); jl_serialize_reachable(&s); - JL_SAVE_TIMING_END(method_roots, "queue method roots"); + JL_SAVE_TIMING_END_L2(method_roots, "queue method roots"); } // step 1.4: prune (garbage collect) special weak references from the jl_global_roots_list JL_SAVE_TIMING_START(prune_roots); @@ -3168,7 +3177,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, jl_queue_for_serialization(&s, global_roots_keyset); jl_serialize_reachable(&s); } - JL_SAVE_TIMING_END(prune_roots, "prune global roots"); + JL_SAVE_TIMING_END_L2(prune_roots, "prune global roots"); // step 1.5: prune (garbage collect) some special weak references known caches JL_SAVE_TIMING_START(prune_caches); for (i = 0; i < serialization_queue.len; i++) { @@ -3204,7 +3213,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, jl_prune_mt_backedges((jl_genericmemory_t*)backedges); } } - JL_SAVE_TIMING_END(prune_caches, "prune caches & backedges"); + JL_SAVE_TIMING_END_L2(prune_caches, "prune caches & backedges"); } uint32_t external_fns_begin = 0; @@ -3212,10 +3221,10 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, JL_SAVE_TIMING_START(write_values); write_padding(&sysimg, sizeof(uintptr_t)); jl_write_values(&s); - JL_SAVE_TIMING_END(write_values, "write values"); + JL_SAVE_TIMING_END_L2(write_values, "write values"); JL_SAVE_TIMING_START(write_gvars); external_fns_begin = write_gvars(&s, &gvars, &external_fns); - JL_SAVE_TIMING_END(write_gvars, "write gvars"); + JL_SAVE_TIMING_END_L2(write_gvars, "write gvars"); } // This ensures that we can use the low bit of addresses for @@ -3281,7 +3290,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, jl_write_arraylist(s.relocs, &s.fixup_types); } jl_write_arraylist(s.relocs, &s.fixup_objs); - JL_SAVE_TIMING_END(prepare_relocs, "prepare relocations"); + JL_SAVE_TIMING_END_L2(prepare_relocs, "prepare relocations"); write_uint(f, relocs.size); write_padding(f, LLT_ALIGN(ios_pos(f), 8) - ios_pos(f)); ios_seek(&relocs, 0); @@ -3299,7 +3308,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, ios_seek(&fptr_record, 0); ios_copyall(f, &fptr_record); ios_close(&fptr_record); - JL_SAVE_TIMING_END(combine_sections, "combine sections"); + JL_SAVE_TIMING_END_L2(combine_sections, "combine sections"); { // step 4: record locations of special roots JL_SAVE_TIMING_START(write_roots); @@ -3350,7 +3359,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, write_uint32(f, jl_array_len(s.link_ids_external_fnvars)); ios_write(f, (char*)jl_array_data(s.link_ids_external_fnvars, uint32_t), jl_array_len(s.link_ids_external_fnvars) * sizeof(uint32_t)); write_uint32(f, external_fns_begin); - JL_SAVE_TIMING_END(write_roots, "write special roots & link ids"); + JL_SAVE_TIMING_END_L2(write_roots, "write special roots & link ids"); } JL_SAVE_TIMING_START(cleanup); @@ -3378,8 +3387,8 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, htable_free(&fptr_to_id); htable_free(&new_methtables); nsym_tag = 0; - JL_SAVE_TIMING_END(cleanup, "cleanup"); - JL_SAVE_TIMING_END(total, "TOTAL save time"); + JL_SAVE_TIMING_END_L2(cleanup, "cleanup"); + JL_SAVE_TIMING_END_L2(total, "SUBTOTAL serialize incremental image"); 
JL_SAVE_TIMING_PRINT(" image size: %zu bytes\n", (size_t)ios_pos(f)); jl_gc_enable(en); @@ -3573,7 +3582,7 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli *s = f; if (emit_split) *z = ff; - JL_SAVE_TIMING_END(create_image_total, "TOTAL image creation time"); + JL_SAVE_TIMING_END(create_image_total, "SUBTOTAL image creation time"); return; } From 6956d12f21ecc4c92cd65d3d28b41730fb0b38f5 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Sat, 6 Dec 2025 18:14:08 -0500 Subject: [PATCH 12/14] Update pkgimage-loading-performance.md --- .../devdocs/pkgimage-loading-performance.md | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/doc/src/devdocs/pkgimage-loading-performance.md b/doc/src/devdocs/pkgimage-loading-performance.md index 996fef872197f..3c5c01e1ff144 100644 --- a/doc/src/devdocs/pkgimage-loading-performance.md +++ b/doc/src/devdocs/pkgimage-loading-performance.md @@ -333,6 +333,104 @@ Attempted a two-phase approach: decode all relocation entries into a buffer, the **Lesson learned:** Same as above - the natural serialization order has good properties. +## Package Cache Generation (Precompilation) Analysis + +In addition to loading performance, we analyzed package cache **generation** (precompilation) to identify optimization opportunities in the saving phase. + +### Timing Breakdown for GLMakie Precompilation + +| Phase | Time (s) | % of Total | +|-------|----------|------------| +| include (parse, lower, inference) | 24.6 | 46% | +| codegen (method gen & LLVM IR) | 14.9 | 28% | +| native code generation (3 threads) | 12.6 | 24% | +| serialize incremental image | 0.4 | <1% | +| GC collection | 0.7 | 1% | +| other | 0.5 | <1% | +| **TOTAL** | **53.7** | | + +### Redundancy Analysis + +We instrumented the codegen phase to detect any redundant compilation work. + +Input to `jl_emit_native_impl`: + +- 10,347 CodeInstances submitted for compilation +- 104 duplicates skipped (1%) - from multi-world compilation, handled correctly +- 2,530 const_return methods (no codegen needed) +- 7,710 methods actually compiled + +Workqueue (call targets discovered during codegen): + +- 9,970 items in workqueue +- 100% were already compiled (correct behavior - workqueue is for prototype patching, not recompilation) +- 7,142 duplicate references (72%) - same CodeInstance called from multiple sites + +**Conclusion:** No significant redundant compilation work. The duplicates in the workqueue are expected - they represent multiple call sites that need their prototype declarations patched to point to the compiled function, not redundant compilation. + +### Parallel Codegen Exploration + +We explored parallelizing the codegen phase since it takes ~28% of precompilation time. + +#### Approach 1: Per-thread LLVM Contexts + +Each worker thread gets its own `LLVMContext` and `jl_codegen_params_t`: + +```text +Main thread: partition work → spawn workers → collect modules → merge +``` + +**Result:** SEGFAULT in `ConstantPointerNull::get` + +**Root cause:** Modules from different LLVM contexts cannot be merged or linked. The `Linker` expects all modules to share the same context. Cross-context Value references cause assertion failures. 
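+
+As an illustrative aside (a minimal sketch, not code from this patch): LLVM uniques types and constants per `LLVMContext`, which is why `Linker::linkInModule` requires the source module to live in the destination module's context.
+
+```cpp
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Linker/Linker.h>
+#include <memory>
+using namespace llvm;
+
+void link_example() {
+    LLVMContext Ctx;
+    // Destination and source must be created in the same LLVMContext:
+    auto Dst = std::make_unique<Module>("image", Ctx);
+    auto Src = std::make_unique<Module>("worker", Ctx);
+    Linker L(*Dst);
+    bool failed = L.linkInModule(std::move(Src)); // OK: shared context
+    (void)failed;
+    // A Module built in a *different* LLVMContext carries its own type and
+    // constant uniquing tables; handing it to the Linker trips assertions
+    // (as seen above) rather than returning a recoverable error.
+}
+```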
+ +#### Approach 2: Serialize/Deserialize Modules + +Worker threads compile to separate contexts, serialize modules to bitcode, main thread deserializes into shared context: + +```text +Worker: codegen → BitcodeWriter → serialize +Main: parseBitcodeFile → Linker::linkInModule +``` + +**Result:** Symbol not found errors on package load (`_jlplt_ijl_rethrow_38825_got`) + +**Root cause:** `jl_codegen_params_t` accumulates state during codegen: + +- `workqueue` - call targets for prototype patching +- `global_targets` - references to Julia values +- `external_fns` - external function references + +When using per-thread params, this state is lost after serialization. The workqueue contains Function pointers valid only in the source context. + +#### Approach 3: Shared Context with Mutex + +All threads share a single `jl_codegen_params_t` with mutex protection: + +**Result:** Deadlock + +**Root cause:** `jl_codegen_params_t` acquires `tsctx_lock` (the ThreadSafeContext lock) in its constructor and holds it for its entire lifetime. Worker threads cannot acquire the same lock to call `jl_emit_codeinst`. + +### Architectural Challenges for Parallel Codegen + +The current codegen architecture has deep assumptions about single-threaded execution: + +1. **LLVM Context Ownership:** `jl_codegen_params_t` owns a `ThreadSafeContext::Lock` that cannot be released and re-acquired +2. **Shared Mutable State:** Many maps and vectors (`workqueue`, `global_targets`, `external_fns`, `mergedConstants`) are updated during codegen +3. **Cross-Module References:** Generated functions reference globals and other functions that may be in different modules +4. **GC Integration:** `temporary_roots` must be managed carefully across threads + +Potential future approaches: + +| Approach | Expected Impact | Complexity | Challenges | +|----------|-----------------|------------|------------| +| Refactor `jl_codegen_params_t` for explicit lock management | Enable shared context | Very High | Requires careful analysis of all shared state | +| Pipeline parallelism (inference ‖ codegen ‖ LLVM opt) | Better utilization | High | Data dependencies, buffering | +| Batch codegen at module granularity | Coarser parallelism | Medium | Load balancing | +| Parallel LLVM optimization (already done) | ✅ Already parallelized | - | 3 threads used for native code gen | + +**Current status:** Parallel codegen disabled. The native code generation phase already uses 3 threads (visible in debug output), so some parallelism exists in the later stages. + ## Performance Characteristics **Object uniquing rate:** ~5.2K objects/ms (~195ns per object average) From 3bcd7a9aedf88575b77d889cf2cc5e6532e74061 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Sat, 6 Dec 2025 22:16:37 -0500 Subject: [PATCH 13/14] use a depot threadpool for LLVM imaging --- doc/src/manual/environment-variables.md | 15 ++ src/aotcompile.cpp | 329 +++++++++++++++++++++++- 2 files changed, 343 insertions(+), 1 deletion(-) diff --git a/doc/src/manual/environment-variables.md b/doc/src/manual/environment-variables.md index 3bf88afcc8992..ae2ae5c88c530 100644 --- a/doc/src/manual/environment-variables.md +++ b/doc/src/manual/environment-variables.md @@ -389,6 +389,21 @@ ignored if the module is a small module. If left unspecified, the smaller of the value of [`JULIA_CPU_THREADS`](@ref JULIA_CPU_THREADS) or half the number of logical CPU cores is used in its place. 
+### [`JULIA_IMAGE_THREAD_POOL`](@id JULIA_IMAGE_THREAD_POOL)
+
+Set to `0` to disable cross-process thread coordination during parallel
+precompilation. By default, Julia processes use file locking to acquire threads
+from a shared pool stored at `DEPOT_PATH[1]/compiled/threadpool`, preventing
+thread oversubscription when multiple packages are being precompiled simultaneously.
+Threads are acquired just before LLVM parallel optimization and released immediately after.
+
+See also [`JULIA_IMAGE_THREAD_POOL_SIZE`](@ref JULIA_IMAGE_THREAD_POOL_SIZE).
+
+### [`JULIA_IMAGE_THREAD_POOL_SIZE`](@id JULIA_IMAGE_THREAD_POOL_SIZE)
+
+An unsigned integer that sets the maximum number of threads available in the
+thread pool for parallel image compilation. Defaults to the number of logical CPU threads plus one, which leaves slight headroom for overlap during thread handoffs.
+
 ### [`JULIA_IMAGE_TIMINGS`](@id JULIA_IMAGE_TIMINGS)
 
 A boolean value that determines if detailed timing information is printed during
diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index cb3065fa5b69a..bd8e52c59c5af 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -51,6 +51,17 @@ using namespace llvm;
 #include "julia_assert.h"
 #include "processor.h"
 
+#include <sys/stat.h>
+#ifdef _OS_WINDOWS_
+#include <windows.h>
+#include <io.h>
+#include <fcntl.h>
+#else
+#include <fcntl.h>
+#include <sys/file.h>
+#include <unistd.h>
+#endif
+
 #define DEBUG_TYPE "julia_aotcompile"
 
 STATISTIC(CreateNativeCalls, "Number of jl_create_native calls made");
@@ -2032,6 +2043,300 @@ static SmallVector<AOTOutputs, 16> add_output(Module &M, TargetMachine &TM, Stri
+// Thread pool coordination for parallel native code generation across multiple
+// precompilation workers. Workers coordinate via file locking to avoid oversubscription.
+//
+// The pool file is stored at DEPOT_PATH[1]/compiled/threadpool, derived from the
+// output path (which is DEPOT_PATH[1]/compiled/vX.Y/PkgName/...).
+//
+// The pool file format is simple: a single integer representing threads currently in use.
+// Each worker acquires an exclusive lock, reads/modifies the count, then releases.
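+//
+// As an illustration (hypothetical counts, pool size 8):
+//   worker A: lock, read "0", write "3", unlock   -> 3 threads in use
+//   worker B: lock, read "3", write "6", unlock   -> 6 threads in use
+//   worker A: lock, read "6", write "3", unlock   -> A's 3 threads returned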
+
+// Get the thread pool path from the output path
+// outputji is like: /path/to/depot/compiled/v1.12/PkgName/xxx.ji
+// We want: /path/to/depot/compiled/threadpool
+static std::string get_thread_pool_path() {
+    const char *outputji = jl_options.outputji;
+    if (!outputji)
+        return "";
+
+    std::string path(outputji);
+
+    // Find "/compiled/" in the path
+    size_t pos = path.find("/compiled/");
+#ifdef _OS_WINDOWS_
+    if (pos == std::string::npos)
+        pos = path.find("\\compiled\\");
+#endif
+    if (pos == std::string::npos)
+        return "";
+
+    // Construct depot/compiled/threadpool
+    return path.substr(0, pos) + "/compiled/threadpool";
+}
+
+struct ThreadPoolGuard {
+    int fd;
+    unsigned threads_acquired;
+    bool active;
+    std::string pool_path;
+
+    ThreadPoolGuard() : fd(-1), threads_acquired(0), active(false) {}
+
+    // Acquire up to `requested` threads from the pool, waiting if necessary
+    // Returns the number of threads actually acquired (may be less than requested)
+    unsigned acquire(unsigned requested, unsigned pool_size) {
+        // Check if thread pool coordination is disabled
+        const char *use_pool = getenv("JULIA_IMAGE_THREAD_POOL");
+        if (use_pool && strcmp(use_pool, "0") == 0) {
+            // Thread pool explicitly disabled
+            threads_acquired = requested;
+            return requested;
+        }
+
+        pool_path = get_thread_pool_path();
+        if (pool_path.empty()) {
+            // Could not determine pool path
+            threads_acquired = requested;
+            return requested;
+        }
+
+#ifdef _OS_WINDOWS_
+        fd = _open(pool_path.c_str(), _O_RDWR | _O_CREAT, 0644);
+#else
+        fd = open(pool_path.c_str(), O_RDWR | O_CREAT, 0644);
+#endif
+        if (fd < 0) {
+            if (jl_is_debug_saving_enabled()) {
+                jl_safe_printf("[pkgsave] thread pool: cannot open '%s': %s\n",
+                               pool_path.c_str(), strerror(errno));
+            }
+            threads_acquired = requested;
+            return requested;
+        }
+
+        active = true;
+
+        // Acquire exclusive lock and read/modify thread count
+        unsigned acquired = 0;
+        while (acquired == 0) {
+#ifdef _OS_WINDOWS_
+            HANDLE hFile = (HANDLE)_get_osfhandle(fd);
+            OVERLAPPED ov = {0};
+            if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK, 0, MAXDWORD, MAXDWORD, &ov)) {
+                jl_safe_printf("WARNING: cannot lock thread pool file\n");
+                // Proceed without coordination; close the fd and deactivate so the
+                // destructor does not decrement a count we never incremented
+                _close(fd);
+                fd = -1;
+                active = false;
+                threads_acquired = requested;
+                return requested;
+            }
+#else
+            if (flock(fd, LOCK_EX) != 0) {
+                if (jl_is_debug_saving_enabled()) {
+                    jl_safe_printf("[pkgsave] thread pool: cannot lock: %s\n", strerror(errno));
+                }
+                close(fd);
+                fd = -1;
+                active = false;
+                threads_acquired = requested;
+                return requested;
+            }
+#endif
+
+            // Check if pool file is stale (older than 15 minutes) and reset if so
+            // This handles crashed processes that didn't release their threads
+            bool reset_pool = false;
+#ifdef _OS_WINDOWS_
+            struct _stat st;
+            if (_fstat(fd, &st) == 0) {
+                time_t now = time(NULL);
+                if (now - st.st_mtime > 15 * 60) {
+                    reset_pool = true;
+                }
+            }
+#else
+            struct stat st;
+            if (fstat(fd, &st) == 0) {
+                time_t now = time(NULL);
+                if (now - st.st_mtime > 15 * 60) {
+                    reset_pool = true;
+                }
+            }
+#endif
+            if (reset_pool) {
+                if (jl_is_debug_saving_enabled()) {
+                    jl_safe_printf("[pkgsave] thread pool: file is stale (>15 min), resetting\n");
+                }
+                // Reset by truncating and seeking to start
+#ifdef _OS_WINDOWS_
+                _chsize(fd, 0);
+                _lseek(fd, 0, SEEK_SET);
+#else
+                if (ftruncate(fd, 0) == 0) {
+                    lseek(fd, 0, SEEK_SET);
+                }
+#endif
+            }
+
+            // Read current count
+            char buf[32] = {0};
+#ifdef _OS_WINDOWS_
+            _lseek(fd, 0, SEEK_SET);
+            int n = _read(fd, buf, sizeof(buf) - 1);
+#else
+            lseek(fd, 0, SEEK_SET);
+            ssize_t n = read(fd, buf, sizeof(buf) - 1);
+#endif
+            unsigned in_use = 
0; + if (n > 0) { + in_use = (unsigned)strtoul(buf, nullptr, 10); + } + + // Allocation strategy: + // - Always get at least 2 threads (or requested if less), unless severely oversubscribed + // - If pool has availability, use what's available (up to requested), minimum 2 + // - If pool is >= 2x oversubscribed, wait for threads to free up + bool severely_oversubscribed = (in_use >= pool_size * 2); + + if (severely_oversubscribed) { + // Wait and retry + acquired = 0; + } else { + unsigned available = (in_use < pool_size) ? (pool_size - in_use) : 0; + unsigned minimum = std::min(requested, 2u); + acquired = std::max(minimum, std::min(requested, available)); + if (jl_is_debug_saving_enabled() && available < acquired) { + jl_safe_printf("[pkgsave] thread pool: low availability, using minimum %u threads (available=%u, in_use=%u, pool_size=%u)\n", + acquired, available, in_use, pool_size); + } + } + + if (acquired > 0) { + unsigned new_count = in_use + acquired; + + // Write new count + snprintf(buf, sizeof(buf), "%u", new_count); +#ifdef _OS_WINDOWS_ + _lseek(fd, 0, SEEK_SET); + _chsize(fd, 0); + _write(fd, buf, (unsigned)strlen(buf)); +#else + lseek(fd, 0, SEEK_SET); + if (ftruncate(fd, 0) != 0) { + // ignore error + } + ssize_t written = write(fd, buf, strlen(buf)); + (void)written; +#endif + + if (jl_is_debug_saving_enabled()) { + jl_safe_printf("[pkgsave] thread pool: acquired %u threads (%u -> %u in use, pool size %u)\n", + acquired, in_use, new_count, pool_size); + } + } + + // Release lock +#ifdef _OS_WINDOWS_ + UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &ov); +#else + flock(fd, LOCK_UN); +#endif + + if (acquired == 0) { + // Severely oversubscribed, wait and retry + // Use exponential backoff with jitter + static thread_local unsigned backoff_ms = 10; + unsigned sleep_ms = backoff_ms + (rand() % backoff_ms); +#ifdef _OS_WINDOWS_ + Sleep(sleep_ms); +#else + usleep(sleep_ms * 1000); +#endif + backoff_ms = std::min(backoff_ms * 2, 1000u); + if (jl_is_debug_saving_enabled()) { + jl_safe_printf("[pkgsave] thread pool: severely oversubscribed (in_use=%u >= 2x pool_size=%u), waiting %ums\n", + in_use, pool_size, sleep_ms); + } + } + } + + threads_acquired = acquired; + return acquired; + } + + // Release threads back to the pool + void release() { + if (!active || fd < 0 || threads_acquired == 0) + return; + +#ifdef _OS_WINDOWS_ + HANDLE hFile = (HANDLE)_get_osfhandle(fd); + OVERLAPPED ov = {0}; + if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK, 0, MAXDWORD, MAXDWORD, &ov)) { + jl_safe_printf("WARNING: cannot lock thread pool file for release\n"); + _close(fd); + return; + } +#else + if (flock(fd, LOCK_EX) != 0) { + jl_safe_printf("WARNING: cannot lock thread pool file for release: %s\n", strerror(errno)); + close(fd); + return; + } +#endif + + // Read current count + char buf[32] = {0}; +#ifdef _OS_WINDOWS_ + _lseek(fd, 0, SEEK_SET); + int n = _read(fd, buf, sizeof(buf) - 1); +#else + lseek(fd, 0, SEEK_SET); + ssize_t n = read(fd, buf, sizeof(buf) - 1); +#endif + unsigned in_use = 0; + if (n > 0) { + in_use = (unsigned)strtoul(buf, nullptr, 10); + } + + unsigned new_count = (in_use > threads_acquired) ? 
(in_use - threads_acquired) : 0; + + // Write new count + snprintf(buf, sizeof(buf), "%u", new_count); +#ifdef _OS_WINDOWS_ + _lseek(fd, 0, SEEK_SET); + _chsize(fd, 0); + _write(fd, buf, (unsigned)strlen(buf)); +#else + lseek(fd, 0, SEEK_SET); + if (ftruncate(fd, 0) != 0) { + // ignore error + } + ssize_t written = write(fd, buf, strlen(buf)); + (void)written; +#endif + + if (jl_is_debug_saving_enabled()) { + jl_safe_printf("[pkgsave] thread pool: released %u threads (%u -> %u in use)\n", + threads_acquired, in_use, new_count); + } + + // Release lock +#ifdef _OS_WINDOWS_ + UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &ov); + _close(fd); +#else + flock(fd, LOCK_UN); + close(fd); +#endif + + fd = -1; + threads_acquired = 0; + active = false; + } + + ~ThreadPoolGuard() { + release(); + } +}; + extern int jl_is_timing_passes; static unsigned compute_image_thread_count(const ModuleInfo &info) { // 32-bit systems are very memory-constrained @@ -2048,7 +2353,7 @@ static unsigned compute_image_thread_count(const ModuleInfo &info) { return 1; } - unsigned threads = std::max(jl_effective_threads() / 2, 1); + unsigned threads = std::max(jl_effective_threads(), 1); auto max_threads = info.globals / 100; if (max_threads < threads) { @@ -2333,6 +2638,25 @@ void jl_dump_native_impl(void *native_code, has_veccall = !!dataM.getModuleFlag("julia.mv.veccall"); }); + // Thread pool coordination: acquire threads just before parallel LLVM work + ThreadPoolGuard thread_pool; + if (threads > 1) { + // Get pool size from environment or default to CPU threads + 1 + // The +1 gives headroom for overlap during thread handoffs + unsigned pool_size = jl_effective_threads() + 1; + if (auto env = getenv("JULIA_IMAGE_THREAD_POOL_SIZE")) { + char *endptr; + unsigned long val = strtoul(env, &endptr, 10); + if (!*endptr && val > 0) { + pool_size = (unsigned)val; + } + } + threads = thread_pool.acquire(threads, pool_size); + if (jl_is_debug_saving_enabled() && thread_pool.active) { + jl_safe_printf("[pkgsave] %-40s %9u (from pool)\n", "threads:", threads); + } + } + { // Don't use withModuleDo here since we delete the TSM midway through auto TSCtx = data->M.getContext(); @@ -2349,6 +2673,9 @@ void jl_dump_native_impl(void *native_code, }); } + // Release threads back to pool (also done by destructor, but be explicit) + thread_pool.release(); + if (params->emit_metadata) { JL_TIMING(NATIVE_AOT, NATIVE_Metadata); LLVMContext Context; From e29f0129c152ecf4dfea4eee8d8747b8390c0750 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Sat, 6 Dec 2025 22:16:41 -0500 Subject: [PATCH 14/14] Update pkgimage-loading-performance.md --- .../devdocs/pkgimage-loading-performance.md | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/doc/src/devdocs/pkgimage-loading-performance.md b/doc/src/devdocs/pkgimage-loading-performance.md index 3c5c01e1ff144..3bf98356ad903 100644 --- a/doc/src/devdocs/pkgimage-loading-performance.md +++ b/doc/src/devdocs/pkgimage-loading-performance.md @@ -427,9 +427,49 @@ Potential future approaches: | Refactor `jl_codegen_params_t` for explicit lock management | Enable shared context | Very High | Requires careful analysis of all shared state | | Pipeline parallelism (inference ‖ codegen ‖ LLVM opt) | Better utilization | High | Data dependencies, buffering | | Batch codegen at module granularity | Coarser parallelism | Medium | Load balancing | -| Parallel LLVM optimization (already done) | ✅ Already parallelized | - | 3 threads used for native code gen | +| Parallel LLVM 
optimization (already done) | ✅ Already parallelized | - | Variable threads based on module size |
 
-**Current status:** Parallel codegen disabled. The native code generation phase already uses 3 threads (visible in debug output), so some parallelism exists in the later stages.
+**Current status:** Parallel codegen disabled. The native code generation phase uses parallel LLVM optimization (visible in debug output as "threads: N"), with thread count determined dynamically based on module complexity.
+
+### Thread Pool Coordination
+
+When multiple Julia processes run parallel precompilation (e.g., `Pkg.precompile()`), each process independently decides how many threads to use for native code generation. This can lead to thread oversubscription on the system.
+
+A thread pool mechanism allows coordination across processes:
+
+**Location:** The thread pool file is stored at `DEPOT_PATH[1]/compiled/threadpool` (derived automatically from the precompilation output path).
+
+**Environment Variables:**
+
+- `JULIA_IMAGE_THREAD_POOL`: Set to `0` to disable cross-process thread coordination. Enabled by default.
+- `JULIA_IMAGE_THREAD_POOL_SIZE`: Maximum threads in the pool (default: number of CPU threads plus one, leaving slight headroom during handoffs). This limits total threads used across all concurrent precompilation workers.
+
+**Usage Example:**
+
+```bash
+# Limit thread pool to 8 threads across all workers
+export JULIA_IMAGE_THREAD_POOL_SIZE=8
+julia -e "using Pkg; Pkg.precompile()"
+
+# Disable thread pool coordination (each worker uses all available threads)
+export JULIA_IMAGE_THREAD_POOL=0
+julia -e "using Pkg; Pkg.precompile()"
+```
+
+**How it works:**
+
+1. Thread acquisition happens just before LLVM parallel optimization begins (not at precompilation start)
+2. Each worker acquires up to its desired threads from the pool, waiting if necessary
+3. Threads are released immediately after LLVM optimization completes
+4. This just-in-time approach keeps pool occupancy brief, since native code generation is only ~24% of total precompilation time
+5. The pool file is automatically located at `~/.julia/compiled/threadpool` (or the equivalent path in the active depot)
+
+**Debug output:** When `JL_DEBUG_SAVING` is enabled, thread pool operations are logged:
+
+```text
+[pkgsave] thread pool: acquired 3 threads (2 -> 5 in use, pool size 8)
+[pkgsave] thread pool: released 3 threads (5 -> 2 in use)
+```
 
 ## Performance Characteristics