Skip to content

Commit 4d7b681

Browse files
committed
refine
1 parent 5822152 commit 4d7b681

File tree

2 files changed

+128
-0
lines changed

2 files changed

+128
-0
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash

# Launch engineV2.py as a detached background job and print monitoring hints.
# Usage: ./run.sh

# Configuration parameters
# When NUM_GPUS != 0, engineV2 is not affected by an externally set "CUDA_VISIBLE_DEVICES"
FILE_INPUT="path_to_EBRL/EBRL_config.txt"
# FILE_PATTERN="report/big_tensor_gpu/error_config.txt"
LOG_DIR="tester/api_config/test_log_gpu_bigtensor_regr_EBRL"
NUM_GPUS=-1
NUM_WORKERS_PER_GPU=1
GPU_IDS="-1"
# REQUIRED_MEMORY=10

# Test-mode switches forwarded to engineV2.py (commented entries are disabled).
TEST_MODE_ARGS=(
    --accuracy=True
    # --paddle_only=True
    # --paddle_cinn=True
    # --test_amp=True
    # --test_cpu=True
    --use_cached_numpy=False
)

# Input config file and log output directory.
IN_OUT_ARGS=(
    --api_config_file="$FILE_INPUT"
    # --api_config_file_pattern="$FILE_PATTERN"
    --log_dir="$LOG_DIR"
)

# GPU / worker settings.
PARALLEL_ARGS=(
    --num_gpus="$NUM_GPUS"
    --num_workers_per_gpu="$NUM_WORKERS_PER_GPU"
    --gpu_ids="$GPU_IDS"
    # --required_memory="$REQUIRED_MEMORY"
)

# The log file is written under LOG_DIR, so it must exist before launching.
mkdir -p "$LOG_DIR" || {
    echo "错误:无法创建日志目录 '$LOG_DIR'"
    exit 1
}

# Run the program
# One timestamped log file per launch; stdout and stderr are both appended to it.
LOG_FILE="$LOG_DIR/log_$(date +%Y%m%d_%H%M%S).log"
nohup python engineV2.py \
    "${TEST_MODE_ARGS[@]}" \
    "${IN_OUT_ARGS[@]}" \
    "${PARALLEL_ARGS[@]}" \
    >> "$LOG_FILE" 2>&1 &

# PID of the background job started above.
PYTHON_PID=$!

# Wait one second, then report a failure if the process has already exited.
sleep 1
if ! ps -p "$PYTHON_PID" > /dev/null; then
    echo "错误:engineV2 启动失败,请检查 $LOG_FILE"
    exit 1
fi

echo -e "\n\033[32m执行中... 另开终端运行监控:\033[0m"
echo -e "1. GPU使用: watch -n 1 nvidia-smi"
echo -e "2. 日志目录: ls -lh $LOG_DIR"
echo -e "3. 详细日志: tail -f $LOG_FILE"
echo -e "4. 终止任务: kill $PYTHON_PID"
echo -e "\n进程已在后台运行,关闭终端不会影响进程执行"

exit 0

# watch -n 1 nvidia-smi --query-compute-apps=pid,process_name,used_memory,gpu_uuid --format=csv
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from config_analyzer import TensorConfig, APIConfig, analyse_configs
2+
from tqdm import tqdm
3+
import random
4+
5+
def is_0_size_tensor(tensor_config) -> bool:
    """Return True when any dimension in ``tensor_config.shape`` equals 0.

    An empty shape has no dimensions to inspect, so it yields False.
    """
    return any(dim == 0 for dim in tensor_config.shape)
10+
11+
def is_0D_tensor(tensor_config) -> bool:
    """Return True when ``tensor_config.shape`` has no dimensions at all."""
    return len(tensor_config.shape) == 0
13+
14+
def tensor_numel(tensor_config):
    """Return the product of all dimensions in ``tensor_config.shape``.

    An empty shape yields 1 (the empty product); any zero dimension makes
    the result 0.
    """
    numel = 1
    for dim in tensor_config.shape:
        numel *= dim
    return numel
19+
20+
def get_tensor_configs(api_config) -> list:
    """Collect the TensorConfig objects found in an API config's arguments.

    ``api_config.args`` is scanned first, then the values of
    ``api_config.kwargs`` (keys are never inspected). An argument contributes
    itself when it is a ``TensorConfig``; when it is a ``list`` or ``tuple``
    its direct ``TensorConfig`` elements are contributed. Containers nested
    deeper than one level are not searched.

    Returns:
        list: the matching TensorConfig objects, in encounter order.
    """
    tensor_configs = []

    def _collect(arg_config):
        # Shared by positional and keyword arguments so both follow one rule.
        if isinstance(arg_config, TensorConfig):
            tensor_configs.append(arg_config)
        elif isinstance(arg_config, (list, tuple)):
            tensor_configs.extend(
                item for item in arg_config if isinstance(item, TensorConfig)
            )

    for arg_config in api_config.args:
        _collect(arg_config)

    for arg_config in api_config.kwargs.values():
        _collect(arg_config)

    return tensor_configs
46+
47+
# Default config files to scan when no paths are given on the command line.
file_list = [
    "/host_home/wanghuan29/APItest/PaddleAPITest/EB45/EBRL.txt",
]

# Element-count threshold for reporting a config; the value equals 2**31 - 1.
# NOTE(review): presumably chosen as the signed 32-bit integer limit - confirm.
BIG_TENSOR_NUMEL = 2147483647

if __name__ == '__main__':
    import sys

    # Paths passed as command-line arguments override the built-in list, so
    # the script is usable outside the original author's machine.
    for file in sys.argv[1:] or file_list:
        api_configs = analyse_configs(file)
        for api_config in tqdm(api_configs):
            # Print each config at most once, as soon as one of its tensors
            # reaches the threshold.
            if any(
                tensor_numel(tensor_config) >= BIG_TENSOR_NUMEL
                for tensor_config in get_tensor_configs(api_config)
            ):
                print(api_config.config)
60+

0 commit comments

Comments
 (0)