Skip to content

Commit e3445bc

Browse files
committed
[MetaSchedule][Runtime] Enhance Runner RandomFill
1 parent 7e376e2 commit e3445bc

File tree

19 files changed

+378
-108
lines changed

19 files changed

+378
-108
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ tvm_option(USE_CUDNN "Build with cuDNN" OFF)
8989
tvm_option(USE_CUBLAS "Build with cuBLAS" OFF)
9090
tvm_option(USE_CUTLASS "Build with CUTLASS" OFF)
9191
tvm_option(USE_THRUST "Build with Thrust" OFF)
92+
tvm_option(USE_CURAND "Build with cuRAND" OFF)
9293
tvm_option(USE_MIOPEN "Build with ROCM:MIOpen" OFF)
9394
tvm_option(USE_ROCBLAS "Build with ROCM:RoCBLAS" OFF)
9495
tvm_option(USE_SORT "Build with sort support" ON)

cmake/config.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,9 @@ set(USE_VTA_FPGA OFF)
296296
# Whether use Thrust
297297
set(USE_THRUST OFF)
298298

299+
# Whether use cuRAND
300+
set(USE_CURAND OFF)
301+
299302
# Whether to build the TensorFlow TVMDSOOp module
300303
set(USE_TF_TVMDSOOP OFF)
301304

cmake/modules/CUDA.cmake

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,18 @@ if(USE_CUDA)
6969
list(APPEND RUNTIME_SRCS ${CONTRIB_THRUST_SRC})
7070
endif(USE_THRUST)
7171

72+
if(USE_CURAND)
73+
message(STATUS "Build with cuRAND support")
74+
message(STATUS "${CUDA_CURAND_LIBRARY}")
75+
cmake_minimum_required(VERSION 3.13) # to compile CUDA code
76+
enable_language(CUDA)
77+
tvm_file_glob(GLOB CONTRIB_CURAND_SRC_CC src/runtime/contrib/curand/*.cc)
78+
tvm_file_glob(GLOB CONTRIB_CURAND_SRC_CU src/runtime/contrib/curand/*.cu)
79+
list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CURAND_LIBRARY})
80+
list(APPEND RUNTIME_SRCS ${CONTRIB_CURAND_SRC_CC})
81+
list(APPEND RUNTIME_SRCS ${CONTRIB_CURAND_SRC_CU})
82+
endif(USE_CURAND)
83+
7284
if(USE_GRAPH_EXECUTOR_CUDA_GRAPH)
7385
if(NOT USE_GRAPH_EXECUTOR)
7486
message(FATAL_ERROR "CUDA Graph is only supported by graph executor, please set USE_GRAPH_EXECUTOR=ON")

cmake/modules/LibInfo.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ function(add_lib_info src_file)
111111
TVM_INFO_USE_TFLITE="${USE_TFLITE}"
112112
TVM_INFO_USE_THREADS="${USE_THREADS}"
113113
TVM_INFO_USE_THRUST="${USE_THRUST}"
114+
TVM_INFO_USE_CURAND="${USE_CURAND}"
114115
TVM_INFO_USE_VITIS_AI="${USE_VITIS_AI}"
115116
TVM_INFO_USE_VULKAN="${USE_VULKAN}"
116117
TVM_INFO_USE_CLML="${USE_CLML}"

cmake/utils/FindCUDA.cmake

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@ macro(find_cuda use_cuda use_cudnn)
8585
PATHS ${CUDA_TOOLKIT_ROOT_DIR}
8686
PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs lib64/stubs lib/x86_64-linux-gnu
8787
NO_DEFAULT_PATH)
88+
find_library(CUDA_CURAND_LIBRARY curand
89+
${CUDA_TOOLKIT_ROOT_DIR}/lib64
90+
${CUDA_TOOLKIT_ROOT_DIR}/lib
91+
NO_DEFAULT_PATH)
8892
find_library(CUDA_CUBLAS_LIBRARY cublas
8993
${CUDA_TOOLKIT_ROOT_DIR}/lib64
9094
${CUDA_TOOLKIT_ROOT_DIR}/lib
@@ -134,6 +138,7 @@ macro(find_cuda use_cuda use_cudnn)
134138
message(STATUS "Found CUDA_CUDNN_INCLUDE_DIRS=" ${CUDA_CUDNN_INCLUDE_DIRS})
135139
message(STATUS "Found CUDA_CUDNN_LIBRARY=" ${CUDA_CUDNN_LIBRARY})
136140
message(STATUS "Found CUDA_CUBLAS_LIBRARY=" ${CUDA_CUBLAS_LIBRARY})
141+
message(STATUS "Found CUDA_CURAND_LIBRARY=" ${CUDA_CURAND_LIBRARY})
137142
message(STATUS "Found CUDA_CUBLASLT_LIBRARY=" ${CUDA_CUBLASLT_LIBRARY})
138143
endif(CUDA_FOUND)
139144
endmacro(find_cuda)

python/tvm/auto_scheduler/testing/tune_onnx.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from tvm import meta_schedule as ms
2727
from tvm import relay
2828
from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
29+
from tvm.meta_schedule.utils import cpu_count
2930
from tvm.relay.frontend import from_onnx
3031

3132

@@ -72,11 +73,6 @@ def _parse_args():
7273
type=str,
7374
required=True,
7475
)
75-
args.add_argument(
76-
"--rpc-workers",
77-
type=int,
78-
required=True,
79-
)
8076
args.add_argument(
8177
"--work-dir",
8278
type=str,
@@ -99,7 +95,7 @@ def _parse_args():
9995
)
10096
args.add_argument(
10197
"--cpu-flush",
102-
type=bool,
98+
type=int,
10399
required=True,
104100
)
105101
parsed = args.parse_args()
@@ -124,7 +120,7 @@ def main():
124120
key=ARGS.rpc_key,
125121
host=ARGS.rpc_host,
126122
port=ARGS.rpc_port,
127-
n_parallel=ARGS.rpc_workers,
123+
n_parallel=cpu_count(logical=True),
128124
number=ARGS.number,
129125
repeat=ARGS.repeat,
130126
min_repeat_ms=ARGS.min_repeat_ms,

python/tvm/auto_scheduler/testing/tune_relay.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
from tvm import relay
2727
from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
2828
from tvm.meta_schedule.testing.relay_workload import get_network
29+
from tvm.meta_schedule.utils import cpu_count
30+
from tvm.support import describe
2931

3032

3133
def _parse_args():
@@ -65,11 +67,6 @@ def _parse_args():
6567
type=str,
6668
required=True,
6769
)
68-
args.add_argument(
69-
"--rpc-workers",
70-
type=int,
71-
required=True,
72-
)
7370
args.add_argument(
7471
"--work-dir",
7572
type=str,
@@ -97,7 +94,7 @@ def _parse_args():
9794
)
9895
args.add_argument(
9996
"--cpu-flush",
100-
type=bool,
97+
type=int,
10198
required=True,
10299
)
103100
parsed = args.parse_args()
@@ -122,7 +119,7 @@ def main():
122119
key=ARGS.rpc_key,
123120
host=ARGS.rpc_host,
124121
port=ARGS.rpc_port,
125-
n_parallel=ARGS.rpc_workers,
122+
n_parallel=cpu_count(logical=True),
126123
number=ARGS.number,
127124
repeat=ARGS.repeat,
128125
min_repeat_ms=ARGS.min_repeat_ms,

python/tvm/auto_scheduler/testing/tune_te.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import tvm
2222
from tvm import auto_scheduler
2323
from tvm.meta_schedule.testing.te_workload import CONFIGS
24+
from tvm.meta_schedule.utils import cpu_count
25+
from tvm.support import describe
2426

2527

2628
def _parse_args():
@@ -55,11 +57,6 @@ def _parse_args():
5557
type=str,
5658
required=True,
5759
)
58-
args.add_argument(
59-
"--rpc-workers",
60-
type=int,
61-
required=True,
62-
)
6360
args.add_argument(
6461
"--work-dir",
6562
type=str,
@@ -82,7 +79,7 @@ def _parse_args():
8279
)
8380
args.add_argument(
8481
"--cpu-flush",
85-
type=bool,
82+
type=int,
8683
required=True,
8784
)
8885
parsed = args.parse_args()
@@ -129,7 +126,7 @@ def main():
129126
key=ARGS.rpc_key,
130127
host=ARGS.rpc_host,
131128
port=ARGS.rpc_port,
132-
n_parallel=ARGS.rpc_workers,
129+
n_parallel=cpu_count(logical=True),
133130
number=ARGS.number,
134131
repeat=ARGS.repeat,
135132
min_repeat_ms=ARGS.min_repeat_ms,

python/tvm/meta_schedule/runner/local_runner.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,17 @@
2323

2424
from ...contrib.popen_pool import PopenPoolExecutor
2525
from ...runtime import Device, Module
26+
from ..profiler import Profiler
2627
from ..utils import derived_object, get_global_func_with_default_on_worker
2728
from .config import EvaluatorConfig
28-
from .runner import PyRunner, RunnerFuture, RunnerInput, RunnerResult, PyRunnerFuture
29+
from .runner import PyRunner, PyRunnerFuture, RunnerFuture, RunnerInput, RunnerResult
2930
from .utils import (
30-
T_ARGUMENT_LIST,
3131
T_ARG_INFO_JSON_OBJ_LIST,
32+
T_ARGUMENT_LIST,
3233
alloc_argument_common,
3334
run_evaluator_common,
3435
)
3536

36-
3737
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
3838

3939

@@ -137,26 +137,29 @@ def resource_handler():
137137
yield
138138
finally:
139139
# Final step. Always clean up
140-
f_cleanup()
140+
with Profiler.timeit("LocalRunner/cleanup"):
141+
f_cleanup()
141142

142143
with resource_handler():
143144
# Step 1: create the local runtime module
144-
rt_mod = tvm.runtime.load_module(artifact_path)
145-
# Step 2: create the local device
146-
device = tvm.runtime.device(dev_type=device_type, dev_id=0)
147-
# Step 3: Allocate input arguments
148-
repeated_args: List[T_ARGUMENT_LIST] = f_alloc_argument(
149-
device,
150-
args_info,
151-
alloc_repeat,
152-
)
153-
# Step 4: Run time_evaluator
154-
costs: List[float] = f_run_evaluator(
155-
rt_mod,
156-
device,
157-
evaluator_config,
158-
repeated_args,
159-
)
145+
with Profiler.timeit("LocalRunner/load_module"):
146+
rt_mod = tvm.runtime.load_module(artifact_path)
147+
# Step 2: Allocate input arguments
148+
with Profiler.timeit("LocalRunner/alloc_argument"):
149+
device = tvm.runtime.device(dev_type=device_type, dev_id=0)
150+
repeated_args: List[T_ARGUMENT_LIST] = f_alloc_argument(
151+
device,
152+
args_info,
153+
alloc_repeat,
154+
)
155+
# Step 3: Run time_evaluator
156+
with Profiler.timeit("LocalRunner/run_evaluator"):
157+
costs: List[float] = f_run_evaluator(
158+
rt_mod,
159+
device,
160+
evaluator_config,
161+
repeated_args,
162+
)
160163
return costs
161164

162165

@@ -313,9 +316,6 @@ def _check(
313316
get_global_func_with_default_on_worker(name=f_alloc_argument, default=None)
314317
get_global_func_with_default_on_worker(name=f_run_evaluator, default=None)
315318
get_global_func_with_default_on_worker(name=f_cleanup, default=None)
316-
get_global_func_with_default_on_worker(
317-
name="tvm.contrib.random.random_fill", default=None
318-
)
319319

320320
value = self.pool.submit(
321321
_check,
@@ -348,7 +348,7 @@ def default_alloc_argument(
348348
The allocation args
349349
"""
350350
f_random_fill = get_global_func_with_default_on_worker(
351-
name="tvm.contrib.random.random_fill", default=None
351+
name="tvm.contrib.random.random_fill_for_measure", default=None
352352
)
353353
return alloc_argument_common(f_random_fill, device, args_info, alloc_repeat)
354354

python/tvm/meta_schedule/runner/rpc_runner.py

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from tvm.rpc import RPCSession
2626
from tvm.runtime import Device, Module
2727

28+
from ..profiler import Profiler
2829
from ..utils import (
2930
cpu_count,
3031
derived_object,
@@ -243,7 +244,7 @@ def __init__(
243244
f_alloc_argument: Union[T_ALLOC_ARGUMENT, str, None] = None,
244245
f_run_evaluator: Union[T_RUN_EVALUATOR, str, None] = None,
245246
f_cleanup: Union[T_CLEANUP, str, None] = None,
246-
max_workers: Optional[int] = 1,
247+
max_workers: Optional[int] = None,
247248
initializer: Optional[Callable[[], None]] = None,
248249
) -> None:
249250
"""Constructor
@@ -284,7 +285,7 @@ def __init__(
284285
self.f_run_evaluator = f_run_evaluator
285286
self.f_cleanup = f_cleanup
286287
if max_workers is None:
287-
max_workers = cpu_count()
288+
max_workers = cpu_count(logical=True)
288289
logger.info("RPCRunner: max_workers = %d", max_workers)
289290
self.pool = PopenPoolExecutor(
290291
max_workers=max_workers,
@@ -378,31 +379,36 @@ def resource_handler():
378379
yield
379380
finally:
380381
# Final step. Always clean up
381-
f_cleanup(session, remote_path)
382+
with Profiler.timeit("RPCRunner/cleanup"):
383+
f_cleanup(session, remote_path)
382384

383385
with resource_handler():
384386
# Step 1. Create session
385-
session = f_create_session(rpc_config)
386-
device = session.device(dev_type=device_type, dev_id=0)
387+
with Profiler.timeit("RPCRunner/create_session"):
388+
session = f_create_session(rpc_config)
389+
device = session.device(dev_type=device_type, dev_id=0)
387390
# Step 2. Upload the module
388-
_, remote_path = osp.split(artifact_path)
389-
local_path: str = artifact_path
390-
rt_mod: Module = f_upload_module(session, local_path, remote_path)
391+
with Profiler.timeit("RPCRunner/upload_module"):
392+
_, remote_path = osp.split(artifact_path)
393+
local_path: str = artifact_path
394+
rt_mod: Module = f_upload_module(session, local_path, remote_path)
391395
# Step 3: Allocate input arguments
392-
repeated_args: List[T_ARGUMENT_LIST] = f_alloc_argument(
393-
session,
394-
device,
395-
args_info,
396-
alloc_repeat,
397-
)
396+
with Profiler.timeit("RPCRunner/alloc_argument"):
397+
repeated_args: List[T_ARGUMENT_LIST] = f_alloc_argument(
398+
session,
399+
device,
400+
args_info,
401+
alloc_repeat,
402+
)
398403
# Step 4: Run time_evaluator
399-
costs: List[float] = f_run_evaluator(
400-
session,
401-
rt_mod,
402-
device,
403-
evaluator_config,
404-
repeated_args,
405-
)
404+
with Profiler.timeit("RPCRunner/run_evaluator"):
405+
costs: List[float] = f_run_evaluator(
406+
session,
407+
rt_mod,
408+
device,
409+
evaluator_config,
410+
repeated_args,
411+
)
406412
return costs
407413

408414

@@ -474,7 +480,7 @@ def default_alloc_argument(
474480
"""
475481
f_random_fill = get_global_func_on_rpc_session(
476482
session,
477-
"tvm.contrib.random.random_fill",
483+
"tvm.contrib.random.random_fill_for_measure",
478484
"Please make sure 'USE_RANDOM' is turned ON in the config.cmake on the RPC server.",
479485
)
480486

0 commit comments

Comments
 (0)