Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AutoParallel] Support multi machine case for the visualize tool #59179

Merged
merged 63 commits into from
Nov 25, 2023
Merged
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
c514fbd
merge from openvino master
AndSonder Oct 18, 2023
0147f70
add InterpreterRunTime() to record interpreter's run time
AndSonder Oct 20, 2023
6d1dc3d
add profiler helper static to produce json file
AndSonder Oct 20, 2023
6f4f67c
add color map and support perfetto format
AndSonder Oct 23, 2023
14fd116
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
AndSonder Oct 23, 2023
4d51610
recover codes
AndSonder Oct 23, 2023
c70d9f9
control include env for gpu_timer.h
AndSonder Oct 23, 2023
ad0f17a
fix logic for profiler_helper_static.py
AndSonder Oct 23, 2023
e0442c6
fix build error
AndSonder Oct 23, 2023
a8a37bb
fix build error
AndSonder Oct 23, 2023
a20e6ce
recover thirdparty
AndSonder Oct 23, 2023
3e10a6d
add flag control: not support new ir now
AndSonder Oct 24, 2023
59b425e
set auto_parallel_profiler flag to false
AndSonder Oct 25, 2023
ddc5038
fix
AndSonder Oct 26, 2023
14f6228
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
AndSonder Oct 26, 2023
1dfc816
add auto_parallel_profiler as command parameter
AndSonder Oct 26, 2023
9f271ef
fix value name
AndSonder Oct 26, 2023
dabf964
support gettimeofday for win env
AndSonder Oct 27, 2023
6ad6f36
fix win build error
AndSonder Oct 27, 2023
d58cc94
fix win build error
AndSonder Oct 27, 2023
e9886ae
use job_type_to_id
AndSonder Oct 27, 2023
282285b
Fixed repeatedly timing the same stream
AndSonder Oct 27, 2023
3b0db0c
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
AndSonder Oct 31, 2023
fdc3f6d
add step line for timeline
AndSonder Nov 1, 2023
1ceadc5
add step timeline and fix logic when job overlap
AndSonder Nov 2, 2023
679cc39
update time record logic
AndSonder Nov 6, 2023
8953ae9
Merge branch 'develop' into add_profiler
AndSonder Nov 6, 2023
1a04fea
fix bug when start profile start from none zero step
AndSonder Nov 7, 2023
e1c619d
fix note
AndSonder Nov 7, 2023
58c9f65
Merge branch 'add_profiler' of https://github.com/AndSonder/Paddle in…
AndSonder Nov 7, 2023
9c8b740
remove FLAGS_auto_parallel_profiler
AndSonder Nov 7, 2023
24b7e79
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
AndSonder Nov 7, 2023
63de31b
use run config instead FLAGS_auto_parallelxx
AndSonder Nov 7, 2023
8218ecb
fix color map logic
AndSonder Nov 7, 2023
4b318fc
fix color map logic
AndSonder Nov 7, 2023
9f949f2
fix bug when log step does not start from 0
AndSonder Nov 8, 2023
ffc7b39
fix
AndSonder Nov 9, 2023
1925dd7
fix
AndSonder Nov 9, 2023
d299723
don't use set_enable_auto_parallel_profiler
AndSonder Nov 9, 2023
5297b7a
fix bug
AndSonder Nov 9, 2023
8bfb6c0
disable auto_parallel_profiler when not open flag by command line
AndSonder Nov 9, 2023
13b14d1
fix bug
AndSonder Nov 9, 2023
5bb55e1
remove resettime
AndSonder Nov 10, 2023
f422b33
fix build bug
AndSonder Nov 13, 2023
ed5f7fc
fix
AndSonder Nov 13, 2023
718cf17
remove set enable
AndSonder Nov 14, 2023
f36b57b
fix build error
AndSonder Nov 15, 2023
444b7a7
fix build error
AndSonder Nov 15, 2023
f494916
fix build error
AndSonder Nov 15, 2023
28f089f
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
AndSonder Nov 15, 2023
a2b5988
fix ci error
AndSonder Nov 15, 2023
fb748d9
fix
AndSonder Nov 15, 2023
aa5570d
fix run error
AndSonder Nov 15, 2023
6b18e10
fix
AndSonder Nov 15, 2023
f096253
fix
AndSonder Nov 16, 2023
560fb61
fix calculate_stream_timer logic
AndSonder Nov 16, 2023
bbb3071
remove fluid head
AndSonder Nov 17, 2023
e15c19e
fix build error
AndSonder Nov 17, 2023
989348c
set default value for enable_job_schedule_profiler
AndSonder Nov 17, 2023
10b84d8
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
AndSonder Nov 18, 2023
5cfd132
support multi machine
AndSonder Nov 20, 2023
0bff0af
fix load dir logic
AndSonder Nov 20, 2023
d32e4c5
Merge branch 'develop' into support_multimachine
AndSonder Nov 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,12 @@ def parse_args():
all_devices = ",".join([str(i) for i in range(device_count)])
parser.add_argument("--devices", type=str, default=all_devices)
parser.add_argument("--log_dir", type=str, required=True)
parser.add_argument("--multi_machine", action="store_true")
args = parser.parse_args()
return args


def process_job_log(log_data, device_id):
def process_job_log(log_data, device_id, multi_machine_idx=-1):
log_pattern = r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)'
matches = re.findall(log_pattern, log_data)
events = []
Expand All @@ -66,21 +67,30 @@ def process_job_log(log_data, device_id):
step_start_time = start_time
step_end_time = end_time

tid_name = (
"GPU" + str(device_id)
if multi_machine_idx == -1
else "GPU"
+ str(device_id)
+ "(machine:"
+ str(multi_machine_idx)
+ ")"
)
event_start = {
"name": job_type + "_" + str(job_id),
"cat": job_type,
"ph": "B",
"ts": start_time,
"pid": 0,
"tid": "GPU" + str(device_id),
"tid": tid_name,
}
event_end = {
"name": job_type + "_" + str(job_id),
"cat": job_type,
"ph": "E",
"pid": 0,
"ts": end_time,
"tid": "GPU" + str(device_id),
"tid": tid_name,
}
if job_type in color_map:
event_start["cname"] = color_map[job_type]
Expand All @@ -100,29 +110,48 @@ def main():
all_events = []
step_infos = []
start_step = 0

for device_id in args.devices.split(","):
_logger.info(f"Process device {device_id}")
device_id = int(device_id)
log_file = os.path.join(args.log_dir, "workerlog." + str(device_id))
with open(log_file, "r") as f:
log_data = f.read()

start_step_pattern = (
r'.*?Schedule Profiler start at step (\d+) and end at step.*'
)
start_step_match = re.findall(start_step_pattern, log_data)
start_step = (
int(start_step_match[0]) if len(start_step_match) > 0 else 0
)

events, step_times = process_job_log(log_data, device_id)
all_events.extend(events)
for i, info in enumerate(step_times):
if len(step_infos) <= i:
step_infos.append([float("inf"), float("-inf")])
step_infos[i][0] = min(step_infos[i][0], info[0])
step_infos[i][1] = max(step_infos[i][1], info[1])
machine_num = 1

def process_one_machine_log(log_dir, multi_machine_idx=-1):
    # Parse every per-device worker log under `log_dir` and accumulate the
    # results into the enclosing scope's `all_events` / `step_infos`.
    #
    # Args:
    #   log_dir: directory containing one "workerlog.<device_id>" file per
    #       device listed in args.devices.
    #   multi_machine_idx: index of the machine these logs belong to, or -1
    #       for the single-machine case; forwarded to process_job_log so the
    #       trace thread names can be tagged "(machine:<idx>)".
    #
    # NOTE(review): this function closes over `args`, `_logger`, `all_events`
    # and `step_infos` from the enclosing scope and mutates the latter two
    # in place; it returns nothing.
    for device_id in args.devices.split(","):
        _logger.info(f"Process device {device_id}")
        device_id = int(device_id)
        # One log file per device, named "workerlog.<id>".
        log_file = os.path.join(log_dir, "workerlog." + str(device_id))
        with open(log_file, "r") as f:
            log_data = f.read()

        # The profiler announces its active step range in the log; pull out
        # the starting step (defaults to 0 when the line is absent).
        start_step_pattern = (
            r'.*?Schedule Profiler start at step (\d+) and end at step.*'
        )
        start_step_match = re.findall(start_step_pattern, log_data)
        # NOTE(review): this assignment makes `start_step` a LOCAL of this
        # nested function, shadowing the `start_step = 0` initialized in
        # main(). The pre-refactor code assigned it directly in main()'s
        # loop, so if main() still reads `start_step` after calling this
        # helper it will always see 0 — a `nonlocal start_step` declaration
        # is likely needed here; verify against the rest of main().
        start_step = (
            int(start_step_match[0]) if len(start_step_match) > 0 else 0
        )

        events, step_times = process_job_log(
            log_data, device_id, multi_machine_idx
        )
        all_events.extend(events)
        # Merge this device's per-step (start, end) times into the global
        # per-step envelope: min of starts, max of ends across devices.
        for i, info in enumerate(step_times):
            if len(step_infos) <= i:
                # First device to report step i seeds a sentinel interval.
                step_infos.append([float("inf"), float("-inf")])
            step_infos[i][0] = min(step_infos[i][0], info[0])
            step_infos[i][1] = max(step_infos[i][1], info[1])

if args.multi_machine:
multi_machine_dirs = os.listdir(args.log_dir)
multi_machine_dirs = [
os.path.join(args.log_dir, d)
for d in multi_machine_dirs
if d.startswith("machine")
and os.path.isdir(os.path.join(args.log_dir, d))
]
machine_num = len(multi_machine_dirs)
for i, d in enumerate(multi_machine_dirs):
_logger.info(f"Process machine {i}")
process_one_machine_log(d, i)
else:
process_one_machine_log(args.log_dir)

for i, info in enumerate(step_infos):
start_time = info[0]
Expand Down Expand Up @@ -170,24 +199,41 @@ def main():
}
]
)
for i in range(len(args.devices.split(","))):
all_events.extend(
[
{
"args": {"name": f"GPU:{i}"},
"cat": "__metadata",
"name": "thread_name",
"ph": "M",
"pid": 0,
"tid": i + 2334,
"ts": 0,
}
]
)

for i in range(machine_num):
for j in range(len(args.devices.split(","))):
if machine_num > 1:
name = f"GPU:{j}(machine:{i})"
tid = i * len(args.devices.split(",")) + j + 2334
else:
name = f"GPU:{j}"
tid = j + 2334
all_events.extend(
[
{
"args": {"name": name},
"cat": "__metadata",
"name": "thread_name",
"ph": "M",
"pid": 0,
"tid": tid,
"ts": 0,
}
]
)

json_str = json.dumps({"traceEvents": all_events})
for i in range(len(args.devices.split(","))):
json_str = json_str.replace('"Step"', '2333')
json_str = json_str.replace(f'"GPU{i}"', f'{i + 2334}')
json_str = json_str.replace('"Step"', '2333')

for i in range(machine_num):
for j in range(len(args.devices.split(","))):
if machine_num > 1:
json_str = json_str.replace(
f'"GPU{j}(machine:{i})"',
f'{i * len(args.devices.split(",")) + j + 2334}',
)
else:
json_str = json_str.replace(f'"GPU{j}"', f'{j + 2334}')

with open(save_path, "w") as f:
f.write(json_str)
Expand Down