Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions include/onnxruntime/core/common/profiler_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
#pragma once

#include "core/common/common.h"
#include "core/common/inlined_containers.h"

#include <string>
#include <unordered_map>

namespace onnxruntime {
namespace profiling {
Expand Down Expand Up @@ -35,7 +35,7 @@ struct EventRecord {
std::string&& event_name,
long long time_stamp,
long long duration,
std::unordered_map<std::string, std::string>&& event_args)
InlinedHashMap<std::string, std::string>&& event_args)
: cat(category),
pid(process_id),
tid(thread_id),
Expand All @@ -50,7 +50,7 @@ struct EventRecord {
const std::string& event_name,
long long time_stamp,
long long duration,
const std::unordered_map<std::string, std::string>& event_args)
const InlinedHashMap<std::string, std::string>& event_args)
: cat(category),
pid(process_id),
tid(thread_id),
Expand All @@ -70,7 +70,7 @@ struct EventRecord {
std::string name{};
long long ts = 0;
long long dur = 0;
std::unordered_map<std::string, std::string> args{};
InlinedHashMap<std::string, std::string> args{};
};

using Events = std::vector<EventRecord>;
Expand Down
8 changes: 8 additions & 0 deletions include/onnxruntime/core/framework/run_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ struct OrtRunOptions {
// So it is possible that only some of the nodes are executed.
bool only_execute_path_to_fetches = false;

// Set to 'true' to enable profiling for this run.
bool enable_profiling = false;

// File prefix for profiling result for this run.
// The actual filename will be: <profile_file_prefix>_<timestamp>.json
// Only used when enable_profiling is true.
std::basic_string<ORTCHAR_T> profile_file_prefix = ORT_TSTR("onnxruntime_run_profile");

#ifdef ENABLE_TRAINING
// Used by onnxruntime::training::TrainingSession. This class is now deprecated.
// Delete training_mode when TrainingSession is deleted.
Expand Down
22 changes: 22 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -7195,6 +7195,28 @@ struct OrtApi {
* \since 1.24
*/
ORT_API_T(void, RunOptionsSetSyncStream, _Inout_ OrtRunOptions* options, _In_ OrtSyncStream* sync_stream);

/** \brief Enable profiling for this run
*
* \param[in] options
* \param[in] profile_file_prefix The prefix for the profile file. The actual filename will be:
* <profile_file_prefix>_<timestamp>.json
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
* \since Version 1.25.
*/
ORT_API2_STATUS(RunOptionsEnableProfiling, _Inout_ OrtRunOptions* options, _In_ const ORTCHAR_T* profile_file_prefix);

/** \brief Disable profiling for this run
*
* \param[in] options
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
* \since Version 1.25.
*/
ORT_API2_STATUS(RunOptionsDisableProfiling, _Inout_ OrtRunOptions* options);
};

/*
Expand Down
13 changes: 13 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_cxx_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -1382,6 +1382,19 @@ struct RunOptions : detail::Base<OrtRunOptions> {
* \param stream The OrtSyncStream to associate with these run options. May be nullptr to clear.
*/
RunOptions& SetSyncStream(OrtSyncStream* stream);

/** \brief Enable profiling for this run
*
* Wraps OrtApi::RunOptionsEnableProfiling
* \param profile_file_prefix The prefix for the profile file
*/
RunOptions& EnableProfiling(const ORTCHAR_T* profile_file_prefix);

/** \brief Disable profiling for this run
*
* Wraps OrtApi::RunOptionsDisableProfiling
*/
RunOptions& DisableProfiling();
};

namespace detail {
Expand Down
10 changes: 10 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_cxx_inline.h
Original file line number Diff line number Diff line change
Expand Up @@ -1077,6 +1077,16 @@ inline RunOptions& RunOptions::SetSyncStream(OrtSyncStream* stream) {
return *this;
}

inline RunOptions& RunOptions::EnableProfiling(const ORTCHAR_T* profile_file_prefix) {
ThrowOnError(GetApi().RunOptionsEnableProfiling(p_, profile_file_prefix));
return *this;
}

inline RunOptions& RunOptions::DisableProfiling() {
ThrowOnError(GetApi().RunOptionsDisableProfiling(p_));
return *this;
}

inline ModelCompilationOptions::ModelCompilationOptions(const Env& env, const SessionOptions& session_options) {
ThrowOnError(GetCompileApi().CreateModelCompilationOptionsFromSessionOptions(env, session_options, &this->p_));
}
Expand Down
13 changes: 7 additions & 6 deletions onnxruntime/core/common/profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,17 @@ template void Profiler::StartProfiling<char>(const std::basic_string<char>& file
template void Profiler::StartProfiling<wchar_t>(const std::basic_string<wchar_t>& file_name);
#endif

void Profiler::EndTimeAndRecordEvent(EventCategory category,
const std::string& event_name,
const TimePoint& start_time,
const std::initializer_list<std::pair<std::string, std::string>>& event_args,
bool /*sync_gpu*/) {
void Profiler::EndTimeAndRecordEvent(
EventCategory category,
const std::string& event_name,
const TimePoint& start_time,
InlinedHashMap<std::string, std::string> event_args,
bool /*sync_gpu*/) {
long long dur = TimeDiffMicroSeconds(start_time);
long long ts = TimeDiffMicroSeconds(profiling_start_time_, start_time);

EventRecord event(category, logging::GetProcessId(),
logging::GetThreadId(), event_name, ts, dur, {event_args.begin(), event_args.end()});
logging::GetThreadId(), event_name, ts, dur, std::move(event_args));
if (profile_with_logger_) {
custom_logger_->SendProfileEvent(event);
} else {
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/common/profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class Profiler {
void EndTimeAndRecordEvent(EventCategory category,
const std::string& event_name,
const TimePoint& start_time,
const std::initializer_list<std::pair<std::string, std::string>>& event_args = {},
InlinedHashMap<std::string, std::string> event_args = {},
bool sync_gpu = false);

/*
Expand Down
13 changes: 13 additions & 0 deletions onnxruntime/core/framework/run_options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,16 @@ ORT_API_STATUS_IMPL(OrtApis::RunOptionsAddActiveLoraAdapter, _Inout_ OrtRunOptio
return nullptr;
API_IMPL_END
}

ORT_API_STATUS_IMPL(OrtApis::RunOptionsEnableProfiling, _Inout_ OrtRunOptions* options,
_In_ const ORTCHAR_T* profile_file_prefix) {
options->enable_profiling = true;
options->profile_file_prefix = profile_file_prefix;
return nullptr;
}

ORT_API_STATUS_IMPL(OrtApis::RunOptionsDisableProfiling, _Inout_ OrtRunOptions* options) {
options->enable_profiling = false;
options->profile_file_prefix.clear();
return nullptr;
}
112 changes: 76 additions & 36 deletions onnxruntime/core/framework/sequential_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <vector>
#include <sstream>
#include "core/common/common.h"
#include "core/common/inlined_containers.h"
#include "core/common/logging/logging.h"
#include "core/framework/allocation_planner.h"
#include "core/framework/execution_frame.h"
Expand Down Expand Up @@ -155,8 +156,8 @@ std::string ComposeSeriesName(const GraphViewer& graph_viewer) {
class SessionScope {
public:
friend class KernelScope;
SessionScope(const SessionState& session_state, const ExecutionFrame& frame)
: session_state_(session_state)
SessionScope(const SessionState& session_state, const ExecutionFrame& frame, profiling::Profiler* run_profiler)
: session_state_(session_state), run_profiler_(run_profiler)
#if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
,
frame_(frame)
Expand All @@ -173,13 +174,10 @@ class SessionScope {
#endif
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
,
dump_context_{
session_state_.GetGraphExecutionCounter(), 0}
dump_context_{session_state_.GetGraphExecutionCounter(), 0}
#endif
{
if (session_state_.Profiler().IsEnabled()) {
session_start_ = session_state.Profiler().Start();
}
session_start_ = StartProfilingIfEnabled();

auto& logger = session_state_.Logger();
VLOGS(logger, 0) << "Begin execution";
Expand Down Expand Up @@ -225,9 +223,8 @@ class SessionScope {
}
#endif

if (session_state_.Profiler().IsEnabled()) {
session_state_.Profiler().EndTimeAndRecordEvent(profiling::SESSION_EVENT, "SequentialExecutor::Execute", session_start_);
}
StopProfilingIfEnabled(profiling::SESSION_EVENT, "SequentialExecutor::Execute", session_start_);

#if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
auto& logger = session_state_.Logger();
for (auto i : frame_.GetStaticMemorySizeInfo()) {
Expand All @@ -252,8 +249,45 @@ class SessionScope {
}
#endif

bool IsRunProfilingEnabled() const {
return run_profiler_ && run_profiler_->IsEnabled();
}

void StopProfilingIfEnabled(profiling::EventCategory category,
const std::string& event_name,
const TimePoint& start_time,
InlinedHashMap<std::string, std::string> event_args = {}) {
const bool session_profiling_enabled = session_state_.Profiler().IsEnabled();
const bool run_profiling_enabled = IsRunProfilingEnabled();

if (session_profiling_enabled) {
session_state_.Profiler().EndTimeAndRecordEvent(category,
event_name,
start_time,
std::move(event_args));
} else if (run_profiling_enabled) {
run_profiler_->EndTimeAndRecordEvent(category,
event_name,
start_time,
std::move(event_args));
}
}

TimePoint StartProfilingIfEnabled() {
const bool session_profiling_enabled = session_state_.Profiler().IsEnabled();
const bool run_profiling_enabled = IsRunProfilingEnabled();

if (session_profiling_enabled) {
return session_state_.Profiler().Start();
} else if (run_profiling_enabled) {
return run_profiler_->Start();
}
return TimePoint{};
}

private:
const SessionState& session_state_;
profiling::Profiler* run_profiler_;
TimePoint session_start_;
#if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
const ExecutionFrame& frame_;
Expand Down Expand Up @@ -340,16 +374,17 @@ class KernelScope {
utils::DumpNodeInputs(dump_context_, kernel_context_, kernel_.Node(), session_state_, session_scope_.dump_analysis_);
#endif

#ifdef ENABLE_NVTX_PROFILE
node_compute_range_.Begin();
#endif
const bool session_profiling_enabled = session_state_.Profiler().IsEnabled();
const bool run_profiling_enabled = session_scope_.IsRunProfilingEnabled();

if (session_state_.Profiler().IsEnabled()) {
if (session_profiling_enabled || run_profiling_enabled) {
auto& node = kernel.Node();
node_name_ = node.Name().empty() ? MakeString(node.OpType(), "_", node.Index()) : node.Name();
concurrency::ThreadPool::StartProfiling(session_state_.GetThreadPool());
VLOGS(session_state_.Logger(), 1) << "Computing kernel: " << node_name_;
kernel_begin_time_ = session_state_.Profiler().Start();

kernel_begin_time_ = session_scope_.StartProfilingIfEnabled();

CalculateTotalInputSizes(&kernel_context, &kernel_,
input_activation_sizes_, input_parameter_sizes_,
node_name_, input_type_shape_);
Expand All @@ -363,26 +398,30 @@ class KernelScope {
node_compute_range_.End();
#endif

if (session_state_.Profiler().IsEnabled()) {
auto& profiler = session_state_.Profiler();
const bool session_profiling_enabled = session_state_.Profiler().IsEnabled();
const bool run_profiling_enabled = session_scope_.IsRunProfilingEnabled();

if (session_profiling_enabled || run_profiling_enabled) {
std::string output_type_shape_;
CalculateTotalOutputSizes(&kernel_context_, total_output_sizes_, node_name_, output_type_shape_);
profiler.EndTimeAndRecordEvent(profiling::NODE_EVENT,
node_name_ + "_kernel_time",
kernel_begin_time_,
// Log additional operation args / info.
{
{"op_name", kernel_.KernelDef().OpName()},
{"provider", kernel_.KernelDef().Provider()},
{"node_index", std::to_string(kernel_.Node().Index())},
{"activation_size", std::to_string(input_activation_sizes_)},
{"parameter_size", std::to_string(input_parameter_sizes_)},
{"output_size", std::to_string(total_output_sizes_)},
{"input_type_shape", input_type_shape_},
{"output_type_shape", output_type_shape_},
{"thread_scheduling_stats",
concurrency::ThreadPool::StopProfiling(session_state_.GetThreadPool())},
});

InlinedHashMap<std::string, std::string> event_args = {
{"op_name", kernel_.KernelDef().OpName()},
{"provider", kernel_.KernelDef().Provider()},
{"node_index", std::to_string(kernel_.Node().Index())},
{"activation_size", std::to_string(input_activation_sizes_)},
{"parameter_size", std::to_string(input_parameter_sizes_)},
{"output_size", std::to_string(total_output_sizes_)},
{"input_type_shape", input_type_shape_},
{"output_type_shape", output_type_shape_},
{"thread_scheduling_stats",
concurrency::ThreadPool::StopProfiling(session_state_.GetThreadPool())},
};

session_scope_.StopProfilingIfEnabled(profiling::NODE_EVENT,
node_name_ + "_kernel_time",
kernel_begin_time_,
std::move(event_args));
}

#ifdef ONNXRUNTIME_ENABLE_INSTRUMENT
Expand Down Expand Up @@ -588,7 +627,8 @@ onnxruntime::Status ExecuteThePlan(const SessionState& session_state, gsl::span<
#endif
const bool& terminate_flag,
const bool only_execute_path_to_fetches,
bool single_thread_mode) {
bool single_thread_mode,
profiling::Profiler* run_profiler) {
auto* execution_plan = session_state.GetExecutionPlan();
VLOGS(logger, 0) << "Number of streams: " << execution_plan->execution_plan.size();
int32_t valid_streams = 0;
Expand Down Expand Up @@ -631,7 +671,7 @@ onnxruntime::Status ExecuteThePlan(const SessionState& session_state, gsl::span<
ORT_UNUSED_PARAMETER(only_execute_path_to_fetches);
#endif

SessionScope session_scope(session_state, ctx.GetExecutionFrame());
SessionScope session_scope(session_state, ctx.GetExecutionFrame(), run_profiler);

auto* tp = single_thread_mode ? nullptr : session_state.GetInterOpThreadPool();

Expand Down Expand Up @@ -692,7 +732,7 @@ onnxruntime::Status PartialExecuteThePlan(const SessionState& session_state, gsl

ctx.SetCurrentRange(&state.GetProgramRegions(session_state));

SessionScope session_scope(session_state, ctx.GetExecutionFrame());
SessionScope session_scope(session_state, ctx.GetExecutionFrame(), nullptr);

#if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
// Only flush memory info for the 2nd partial graph execution (since ORTModule runs this function twice).
Expand Down
3 changes: 2 additions & 1 deletion onnxruntime/core/framework/sequential_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ onnxruntime::Status ExecuteThePlan(const SessionState& session_state, gsl::span<
#endif
const bool& terminate_flag,
const bool only_execute_path_to_fetches,
bool single_thread_mode);
bool single_thread_mode,
profiling::Profiler* run_profiler = nullptr);

#ifdef ENABLE_TRAINING
onnxruntime::Status PartialExecuteThePlan(const SessionState& session_state, gsl::span<const int> feed_mlvalue_idxs,
Expand Down
Loading
Loading