Skip to content

Commit

Permalink
[MLU] add mlu new profiler (PaddlePaddle#41138)
Browse files Browse the repository at this point in the history
* [MLU] add mlu new profiler

* fix format
  • Loading branch information
fwenguang authored and fuwenguang committed Apr 21, 2022
1 parent 58f6d45 commit 3b9dd0a
Show file tree
Hide file tree
Showing 14 changed files with 574 additions and 13 deletions.
4 changes: 3 additions & 1 deletion paddle/fluid/platform/device/mlu/mlu_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ limitations under the License. */

#ifdef PADDLE_WITH_MLU
#include <cn_api.h>
#include <cndrv_id.h>
#include <cnnl.h>
#include <cnpapi.h>
#include <cnrt.h>
#ifdef PADDLE_WITH_CNCL
#include <cncl.h>
Expand All @@ -33,7 +35,7 @@ using cnclStatus = cnclResult_t;
#endif
using mluStream = cnrtQueue_t;
using mluCnnlHandle = cnnlHandle_t;
using mluEventHandle = CNnotifier;
using mluEventHandle = cnrtNotifier_t;
using mluDeviceHandle = CNdev;

namespace platform {
Expand Down
3 changes: 2 additions & 1 deletion paddle/fluid/platform/profiler/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
cc_library(host_tracer SRCS host_tracer.cc DEPS enforce)
cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog)
add_subdirectory(mlu)
cc_library(event_node SRCS event_node.cc DEPS enforce)
cc_library(profiler_utils SRCS utils.cc DEPS enforce glog)
add_subdirectory(dump)
cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils)
cc_library(event_bind SRCS event_python.cc DEPS profiler_logger)
cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog)
cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind)
cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind mlu_tracer)
cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger)
cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils)
cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind)
Expand Down
24 changes: 16 additions & 8 deletions paddle/fluid/platform/profiler/chrometracing_logger.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,12 @@ static std::string DefaultFileName() {
}

const char* ChromeTracingLogger::categary_name_[] = {
"Operator", "Dataloader", "ProfileStep", "CudaRuntime",
"Kernel", "Memcpy", "Memset", "UserDefined",
"OperatorInner", "Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined"};
"Operator", "Dataloader", "ProfileStep",
"CudaRuntime", "Kernel", "Memcpy",
"Memset", "UserDefined", "OperatorInner",
"Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined",
"MluRuntime"};

void ChromeTracingLogger::OpenFile() {
output_file_stream_.open(filename_,
Expand Down Expand Up @@ -598,6 +600,12 @@ void ChromeTracingLogger::RefineDisplayName(
(*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1);
}

#ifdef PADDLE_WITH_MLU
static std::string device_type("MLU");
#else
static std::string device_type("GPU");
#endif

for (auto it = deviceid_streamid_set_.begin();
it != deviceid_streamid_set_.end(); ++it) {
output_file_stream_ << string_format(
Expand All @@ -607,7 +615,7 @@ void ChromeTracingLogger::RefineDisplayName(
"name": "process_name", "pid": %lld, "tid": %lld,
"ph": "M",
"args": {
"name": "Deivce %lld (GPU)"
"name": "Deivce %lld (%s)"
}
},
{
Expand All @@ -632,9 +640,9 @@ void ChromeTracingLogger::RefineDisplayName(
}
},
)JSON"),
(*it).first, (*it).second, (*it).first, (*it).first, (*it).second,
(*it).second, (*it).first, (*it).second, (*it).first + 0x10000000,
(*it).first, (*it).second, (*it).second);
(*it).first, (*it).second, (*it).first, device_type.c_str(),
(*it).first, (*it).second, (*it).second, (*it).first, (*it).second,
(*it).first + 0x10000000, (*it).first, (*it).second, (*it).second);
}
}

Expand Down
5 changes: 5 additions & 0 deletions paddle/fluid/platform/profiler/mlu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Link against mlu_info only for MLU builds; in other builds MLU_INFO is
# never set here, so ${MLU_INFO} below contributes no dependency.
if(WITH_MLU)
set(MLU_INFO mlu_info)
endif()

# The mlu_tracer target is declared regardless of WITH_MLU.
cc_library(mlu_tracer SRCS mlu_tracer.cc cnpapi_data_process.cc DEPS workqueue_utils enforce glog ${MLU_INFO})
263 changes: 263 additions & 0 deletions paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
#include <cstdio>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"

#ifdef PADDLE_WITH_MLU
namespace paddle {
namespace platform {

namespace {

// Returns the offset between the host clock (PosixInNsec) and the CNPAPI
// timestamp source. Both clocks are sampled once, on the first call; every
// later call returns the same cached difference.
//
// The subtraction is unsigned, so if the CNPAPI timestamp is the larger one
// the result wraps around; adding the gap back to a CNPAPI timestamp wraps
// the same way and still lands on the host time base.
inline uint64_t GetTimeGap() {
  static const uint64_t host_minus_device = []() -> uint64_t {
    const uint64_t host_now = PosixInNsec();
    const uint64_t device_now = cnpapiGetTimestamp();
    return host_now - device_now;
  }();
  return host_minus_device;
}

// Translates one CNPAPI kernel activity record into a DeviceTraceEvent and
// appends it to `collector`.
//
// kernel:    activity record to convert.
// start_ns:  records whose shifted start time is earlier than this value are
//            skipped.
// collector: receives the resulting device event.
void AddKernelRecord(const cnpapiActivityKernel* kernel, uint64_t start_ns,
                     TraceEventCollector* collector) {
  // GetTimeGap() caches its result, so a plain local sees the same value on
  // every call.
  const uint64_t gap = GetTimeGap();
  const auto begin_ns = kernel->start + gap;
  if (begin_ns < start_ns) {
    return;
  }

  DeviceTraceEvent event;
  event.type = TracerEventType::Kernel;
  event.name = demangle(kernel->name);
  event.start_ns = begin_ns;
  event.end_ns = kernel->end + gap;

  // Where the kernel ran and which API call it correlates with.
  event.device_id = kernel->device_id;
  event.context_id = kernel->context_id;
  event.stream_id = kernel->queue_id;
  event.correlation_id = kernel->correlation_id;

  // The record's dimx/dimy/dimz are stored in the block_* slots; grid_x is
  // reused to carry the CNPAPI kernel_type and the other grid slots are
  // zeroed.
  auto& info = event.kernel_info;
  info.block_x = kernel->dimx;
  info.block_y = kernel->dimy;
  info.block_z = kernel->dimz;
  info.grid_x = kernel->kernel_type;
  info.grid_y = 0;
  info.grid_z = 0;

  // Timestamps are copied as reported; `completed` is filled from the
  // record's `received` field.
  info.queued = kernel->queued;
  info.submitted = kernel->submitted;
  info.completed = kernel->received;

  collector->AddDeviceEvent(std::move(event));
}

// Maps a CNPAPI memcpy type to the label used for the trace event name and
// copy kind. Any type without a dedicated label yields the generic "MEMCPY".
const char* MemcpyKind(cnpapiActivityMemcpyType kind) {
  const char* label = "MEMCPY";
  switch (kind) {
    case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOD:
      label = "MEMCPY_HtoD";
      break;
    case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOH:
      label = "MEMCPY_DtoH";
      break;
    case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOD:
      label = "MEMCPY_DtoD";
      break;
    case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOH:
      label = "MEMCPY_HtoH";
      break;
    case CNPAPI_ACTIVITY_MEMCPY_TYPE_PTOP:
      label = "MEMCPY_PtoP";
      break;
    default:
      break;
  }
  return label;
}

// Translates one CNPAPI memcpy activity record into a DeviceTraceEvent of
// type Memcpy and appends it to `collector`.
//
// memcpy:    activity record to convert.
// start_ns:  records whose shifted start time is earlier than this value are
//            skipped.
// collector: receives the resulting device event.
void AddMemcpyRecord(const cnpapiActivityMemcpy* memcpy, uint64_t start_ns,
                     TraceEventCollector* collector) {
  // GetTimeGap() caches its result, so a plain local sees the same value on
  // every call.
  const uint64_t gap = GetTimeGap();
  const auto begin_ns = memcpy->start + gap;
  if (begin_ns < start_ns) {
    return;
  }

  // The same label serves as the event name and as the copy-kind string.
  const char* kind_label = MemcpyKind(memcpy->copy_type);

  DeviceTraceEvent event;
  event.type = TracerEventType::Memcpy;
  event.name = kind_label;
  event.start_ns = begin_ns;
  event.end_ns = memcpy->end + gap;
  event.device_id = memcpy->device_id;
  event.context_id = memcpy->context_id;
  event.stream_id = memcpy->queue_id;
  event.correlation_id = memcpy->correlation_id;
  event.memcpy_info.num_bytes = memcpy->bytes;
  // snprintf bounds the write to kMemKindMaxLen and always NUL-terminates.
  snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s", kind_label);

  collector->AddDeviceEvent(std::move(event));
}

// Translates one CNPAPI peer-to-peer memcpy activity record into a
// DeviceTraceEvent of type Memcpy and appends it to `collector`.
//
// memcpy2:   activity record to convert.
// start_ns:  records whose shifted start time is earlier than this value are
//            skipped.
// collector: receives the resulting device event.
void AddMemcpy2Record(const cnpapiActivityMemcpyPtoP* memcpy2,
                      uint64_t start_ns, TraceEventCollector* collector) {
  // GetTimeGap() caches its result, so a plain local sees the same value on
  // every call.
  const uint64_t gap = GetTimeGap();
  const auto begin_ns = memcpy2->start + gap;
  if (begin_ns < start_ns) {
    return;
  }

  // The same label serves as the event name and as the copy-kind string.
  const char* kind_label = MemcpyKind(memcpy2->copy_type);

  DeviceTraceEvent event;
  event.type = TracerEventType::Memcpy;
  event.name = kind_label;
  event.start_ns = begin_ns;
  event.end_ns = memcpy2->end + gap;
  // Only the record's single device_id/context_id/queue_id triple is stored.
  event.device_id = memcpy2->device_id;
  event.context_id = memcpy2->context_id;
  event.stream_id = memcpy2->queue_id;
  event.correlation_id = memcpy2->correlation_id;
  event.memcpy_info.num_bytes = memcpy2->bytes;
  // snprintf bounds the write to kMemKindMaxLen and always NUL-terminates.
  snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s", kind_label);

  collector->AddDeviceEvent(std::move(event));
}

// Translates one CNPAPI memset activity record into a DeviceTraceEvent of
// type Memset (always named "MEMSET") and appends it to `collector`.
//
// memset:    activity record to convert.
// start_ns:  records whose shifted start time is earlier than this value are
//            skipped.
// collector: receives the resulting device event.
void AddMemsetRecord(const cnpapiActivityMemset* memset, uint64_t start_ns,
                     TraceEventCollector* collector) {
  // GetTimeGap() caches its result, so a plain local sees the same value on
  // every call.
  const uint64_t gap = GetTimeGap();
  const auto begin_ns = memset->start + gap;
  if (begin_ns < start_ns) {
    return;
  }

  DeviceTraceEvent event;
  event.type = TracerEventType::Memset;
  event.name = "MEMSET";
  event.start_ns = begin_ns;
  event.end_ns = memset->end + gap;
  event.device_id = memset->device_id;
  event.context_id = memset->context_id;
  event.stream_id = memset->queue_id;
  event.correlation_id = memset->correlation_id;

  // Payload: how many bytes were written and with which value.
  event.memset_info.num_bytes = memset->bytes;
  event.memset_info.value = memset->value;

  collector->AddDeviceEvent(std::move(event));
}

// Lookup table from CNPAPI callback ids to printable API names. The table is
// filled by the private constructor and reached through GetInstance().
class CnpapiRuntimeCbidStr {
 public:
  // Returns the shared instance, constructed on first use.
  static const CnpapiRuntimeCbidStr& GetInstance() {
    static CnpapiRuntimeCbidStr instance;
    return instance;
  }

  // Returns the registered name for `cbid`. Ids that were never registered
  // get the fallback text "MLU Runtime API <numeric id>".
  std::string RuntimeKind(cnpapi_CallbackId cbid) const {
    const auto entry = cbid_str_.find(cbid);
    if (entry != cbid_str_.end()) {
      return entry->second;
    }
    return "MLU Runtime API " + std::to_string(cbid);
  }

 private:
  // Private so that the only instance is the one owned by GetInstance().
  CnpapiRuntimeCbidStr();

  // callback id -> API name.
  std::unordered_map<cnpapi_CallbackId, std::string> cbid_str_;
};

// Fills cbid_str_ with one entry per traced driver API: the key is the
// CNPAPI_CNDRV_TRACE_CBID_<api> constant and the value is the API name as a
// string. Entries are assigned in list order via operator[].
CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
// The API list, applied to a caller-supplied one-argument macro.
#define CNDRV_TRACED_API_LIST(APPLY) \
  APPLY(cnMalloc)                    \
  APPLY(cnMallocHost)                \
  APPLY(cnFree)                      \
  APPLY(cnFreeHost)                  \
  APPLY(cnMemcpy)                    \
  APPLY(cnMemcpyPeer)                \
  APPLY(cnMemcpyHtoD)                \
  APPLY(cnMemcpyDtoH)                \
  APPLY(cnMemcpyDtoD)                \
  APPLY(cnMemcpyAsync)               \
  APPLY(cnMemcpyHtoDAsync)           \
  APPLY(cnMemcpyDtoHAsync)           \
  APPLY(cnMemcpyDtoDAsync)           \
  APPLY(cnMemcpyDtoD2D)              \
  APPLY(cnMemcpyDtoD3D)              \
  APPLY(cnMemcpy2D)                  \
  APPLY(cnMemcpy3D)                  \
  APPLY(cnMemsetD8)                  \
  APPLY(cnMemsetD16)                 \
  APPLY(cnMemsetD32)                 \
  APPLY(cnMemsetD8Async)             \
  APPLY(cnMemsetD16Async)            \
  APPLY(cnMemsetD32Async)            \
  APPLY(cnInvokeKernel)              \
  APPLY(cnCreateQueue)               \
  APPLY(cnDestroyQueue)              \
  APPLY(cnQueueSync)                 \
  APPLY(cnQueueWaitNotifier)         \
  APPLY(cnWaitNotifier)              \
  APPLY(cnCreateNotifier)            \
  APPLY(cnDestroyNotifier)           \
  APPLY(cnPlaceNotifier)             \
  APPLY(cnCtxCreate)                 \
  APPLY(cnCtxDestroy)                \
  APPLY(cnCtxGetCurrent)             \
  APPLY(cnCtxSetCurrent)             \
  APPLY(cnCtxGetDevice)              \
  APPLY(cnCtxSync)

// `api` is only pasted (##) and stringized (#), so it is never
// macro-expanded on its own.
#define ADD_CBID_NAME(api) cbid_str_[CNPAPI_CNDRV_TRACE_CBID_##api] = #api;

  CNDRV_TRACED_API_LIST(ADD_CBID_NAME)

#undef ADD_CBID_NAME
#undef CNDRV_TRACED_API_LIST
}

// Translates one CNPAPI driver-API activity record into a RuntimeTraceEvent
// of type MluRuntime and appends it to `collector`.
//
// api:       activity record to convert.
// start_ns:  records whose shifted start time is earlier than this value are
//            skipped.
// collector: receives the resulting runtime event.
void AddApiRecord(const cnpapiActivityAPI* api, uint64_t start_ns,
                  TraceEventCollector* collector) {
  // GetTimeGap() caches its result, so a plain local sees the same value on
  // every call.
  const uint64_t gap = GetTimeGap();
  const auto begin_ns = api->start + gap;
  if (begin_ns < start_ns) {
    return;
  }

  RuntimeTraceEvent event;
  event.type = TracerEventType::MluRuntime;
  // Registered API name, or the "MLU Runtime API <id>" fallback for unknown
  // ids.
  event.name = CnpapiRuntimeCbidStr::GetInstance().RuntimeKind(api->cbid);
  event.callback_id = api->cbid;
  event.correlation_id = api->correlation_id;
  event.start_ns = begin_ns;
  event.end_ns = api->end + gap;
  event.process_id = api->process_id;
  event.thread_id = api->thread_id;

  collector->AddRuntimeEvent(std::move(event));
}

} // namespace

namespace details {

void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
uint64_t start_ns,
TraceEventCollector* collector) {
switch (record->type) {
case CNPAPI_ACTIVITY_TYPE_KERNEL:
AddKernelRecord(reinterpret_cast<const cnpapiActivityKernel*>(record),
start_ns, collector);
break;
case CNPAPI_ACTIVITY_TYPE_MEMCPY:
AddMemcpyRecord(reinterpret_cast<const cnpapiActivityMemcpy*>(record),
start_ns, collector);
break;
case CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP:
AddMemcpy2Record(
reinterpret_cast<const cnpapiActivityMemcpyPtoP*>(record), start_ns,
collector);
break;
case CNPAPI_ACTIVITY_TYPE_MEMSET:
AddMemsetRecord(reinterpret_cast<const cnpapiActivityMemset*>(record),
start_ns, collector);
break;
case CNPAPI_ACTIVITY_TYPE_CNDRV_API:
AddApiRecord(reinterpret_cast<const cnpapiActivityAPI*>(record), start_ns,
collector);
break;
default:
break;
}
}

} // namespace details
} // namespace platform
} // namespace paddle
#endif
35 changes: 35 additions & 0 deletions paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <unordered_map>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/profiler/trace_event_collector.h"

namespace paddle {
namespace platform {
namespace details {

#ifdef PADDLE_WITH_MLU
// Converts one CNPAPI activity record into a trace event and adds it to
// `collector`. Kernel, memcpy, peer-to-peer memcpy, memset and driver-API
// records are handled; any other record type is ignored. Records whose
// start time, after alignment to the host clock, is earlier than `start_ns`
// are dropped.
void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
uint64_t start_ns,
TraceEventCollector* collector);
#endif

} // namespace details
} // namespace platform
} // namespace paddle
Loading

0 comments on commit 3b9dd0a

Please sign in to comment.