diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py
index b58332c5a..42bf6e0bb 100644
--- a/numba_cuda/numba/cuda/compiler.py
+++ b/numba_cuda/numba/cuda/compiler.py
@@ -1,4 +1,5 @@
 from llvmlite import ir
+from collections import namedtuple
 from numba.core import ir as numba_ir
 from numba.core import (
     cgutils,
@@ -14,7 +15,6 @@
     sanitize_compile_result_entries,
     CompilerBase,
     DefaultPassBuilder,
-    CompileResult,
 )
 from numba.core.compiler_lock import global_compiler_lock
 from numba.core.compiler_machinery import (
@@ -45,25 +45,129 @@
 # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
 # id. This is because the entry point is used as a key into a dict of
 # overloads by the base dispatcher. The id of the CCR is the only small and
-# unique property of a CompileResult in the CUDA target (cf. the CPU target,
+# unique property of a CUDACompileResult in the CUDA target (cf. the CPU target,
 # which uses its entry_point, which is a pointer value).
 #
 # This does feel a little hackish, and there are two ways in which this could
 # be improved:
 #
-# 1. We could change the core of Numba so that each CompileResult has its own
+# 1. We could change the CUDACompileResult so that each instance has its own
 #    unique ID that can be used as a key - e.g. a count, similar to the way in
 #    which types have unique counts.
 # 2. At some future time when kernel launch uses a compiled function, the entry
 #    point will no longer need to be a synthetic value, but will instead be a
 #    pointer to the compiled function as in the CPU target.
 
+CR_FIELDS = [
+    "typing_context",
+    "target_context",
+    "entry_point",
+    "typing_error",
+    "type_annotation",
+    "signature",
+    "objectmode",
+    "lifted",
+    "fndesc",
+    "library",
+    "call_helper",
+    "environment",
+    "metadata",
+    # List of functions to call to initialize on unserialization
+    # (i.e. cache load).
+    "reload_init",
+    "referenced_envs",
+]
+
+
+class CUDACompileResult(namedtuple("_CompileResult", CR_FIELDS)):
+    """
+    A structure holding results from the compilation of a function.
+    """
+
+    __slots__ = ()
 
-class CUDACompileResult(CompileResult):
     @property
     def entry_point(self):
         return id(self)
 
+    def _reduce(self):
+        """
+        Reduce a CompileResult to picklable components.
+        """
+        libdata = self.library.serialize_using_object_code()
+        # Make it (un)picklable efficiently
+        typeann = str(self.type_annotation)
+        fndesc = self.fndesc
+        # Those don't need to be pickled and may fail
+        fndesc.typemap = fndesc.calltypes = None
+        # The CUDA target does not reference environments
+        referenced_envs = tuple()
+        return (
+            libdata,
+            self.fndesc,
+            self.environment,
+            self.signature,
+            self.objectmode,
+            self.lifted,
+            typeann,
+            self.reload_init,
+            referenced_envs,
+        )
+
+    @classmethod
+    def _rebuild(
+        cls,
+        target_context,
+        libdata,
+        fndesc,
+        env,
+        signature,
+        objectmode,
+        lifted,
+        typeann,
+        reload_init,
+        referenced_envs,
+    ):
+        if reload_init:
+            # Re-run all
+            for fn in reload_init:
+                fn()
+
+        library = target_context.codegen().unserialize_library(libdata)
+        cfunc = target_context.get_executable(library, fndesc, env)
+        cr = cls(
+            target_context=target_context,
+            typing_context=target_context.typing_context,
+            library=library,
+            environment=env,
+            entry_point=cfunc,
+            fndesc=fndesc,
+            type_annotation=typeann,
+            signature=signature,
+            objectmode=objectmode,
+            lifted=lifted,
+            typing_error=None,
+            call_helper=None,
+            metadata=None,  # Do not store, arbitrary & potentially large!
+            reload_init=reload_init,
+            referenced_envs=referenced_envs,
+        )
+
+        # Load Environments
+        for env in referenced_envs:
+            library.codegen.set_env(env.env_name, env)
+
+        return cr
+
+    @property
+    def codegen(self):
+        return self.target_context.codegen()
+
+    def dump(self, tab=""):
+        print(f"{tab}DUMP {type(self).__name__} {self.entry_point}")
+        self.signature.dump(tab=tab + "  ")
+        print(f"{tab}END DUMP")
+
 
 def cuda_compile_result(**entries):
     entries = sanitize_compile_result_entries(entries)