diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 07019e570e97..56cb7ba48185 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -25,6 +25,7 @@ env: TRITON_BUILD_WITH_CLANG_LLD: "TRUE" TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE" TRITON_DISABLE_LINE_INFO: 1 + PROTON_SKIP_PC_SAMPLING_TEST: 1 jobs: Runner-Preparation: runs-on: ubuntu-latest diff --git a/.github/workflows/integration-tests.yml.in b/.github/workflows/integration-tests.yml.in index 02fe9f289e52..5b64b8d6ac16 100644 --- a/.github/workflows/integration-tests.yml.in +++ b/.github/workflows/integration-tests.yml.in @@ -27,7 +27,7 @@ env: TRITON_BUILD_WITH_CLANG_LLD: "TRUE" TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE" TRITON_DISABLE_LINE_INFO: 1 - + PROTON_SKIP_PC_SAMPLING_TEST: 1 jobs: Runner-Preparation: diff --git a/third_party/proton/README.md b/third_party/proton/README.md index 8b94f180c3b7..ccd79e721200 100644 --- a/third_party/proton/README.md +++ b/third_party/proton/README.md @@ -119,7 +119,7 @@ flops64: float # The number of 64-bit floating-point operations bytes: int # The number of bytes expected to be transferred ``` -### Command Line +### Command line Proton can be used as a command-line tool to profile Python scripts and Pytest tests. The following examples demonstrate how to use Proton command-line. @@ -149,6 +149,22 @@ More options can be found by running the following command. proton-viewer -h ``` +### Instruction sampling (experimental) + +Proton supports instruction sampling on NVIDIA GPUs. +Please note that this is an experimental feature and may not work on all GPUs. +You may experience ~20x end-to-end overhead when using instruction sampling, although the overhead for each individual GPU kernel is negligible. +The overhead is mostly caused by data transfer and processing on the CPU. +Additionally, the proton-viewer options `-i -d -t ` can be helpful for filtering out GPU kernels that are not of interest. +The following example demonstrates how to use instruction sampling: + +```python +import triton.profiler as proton + + +proton.start(name="profile_name", context="shadow", backend="cupti_pcsampling") +``` + ## Proton *vs* nsys - Runtime overhead (up to 1.5x) @@ -173,11 +189,24 @@ Proton is designed to be portable and can be used on AMD GPUs. nsys only support Proton can register hooks to analyze the metadata of triton kernels, while nsys cannot. **Note** that the hooks do add additional overhead to proton. -## Known Issues +## Proton *vs* ncu + +Similar to the comparison between Proton and Nsight Systems (Nsys), Proton has a lower profiling overhead than Nsight Compute (NCU). We also plan to support instruction sampling on AMD GPUs. +However, Nsight Compute supports the collection of more detailed metrics than Proton, such as memory access patterns, memory transactions, and other instruction-level metrics. +In contrast, Proton only supports instruction sampling and is designed to be lightweight and portable. + +## Known issues -- CUDA Graph +- CUDA graph `hooks` cannot be used to accurately accumulate the number of FLOPs in CUDA graph mode profiling because kernels are captured and launched separately; metrics are not accumulated when kernels are launched in graph mode. This issue can be circumvented by using `scope` to supply FLOPs. If profiling is initiated after CUDA graph capturing, there may be minor memory leak issues. This is because the number of kernels in a graph instance (i.e., `cuGraphExec`) is unknown, preventing the deletion of mappings between the kernel ID and the graph ID. + +- Instruction sampling + +If you encounter permission related problems when using instruction sampling, you can lookup this [page](https://developer.nvidia.com/nvidia-development-tools-solutions-err_nvgpuctrperm-permission-issue-performance-counters) for help. + +The overhead of instruction sampling on NVIDIA GPUs is about 20x using Proton because we haven't enabled continuous sampling yet. +Continuous sampling can allow for more runtime optimizations, but it makes it more challenging to attribute performance data back to the GPU kernels because: (1) it enables profiling of concurrent kernels, (2) it doesn't allow profiling of time and instruction samples simultaneously, and (3) it works best if we have a separate thread dedicated to attributing instruction samples to the GPU kernels diff --git a/third_party/proton/csrc/include/Data/Metric.h b/third_party/proton/csrc/include/Data/Metric.h index 0e22f7a0504d..a75692877c5b 100644 --- a/third_party/proton/csrc/include/Data/Metric.h +++ b/third_party/proton/csrc/include/Data/Metric.h @@ -7,7 +7,7 @@ namespace proton { -enum class MetricKind { Flexible, Kernel, Count }; +enum class MetricKind { Flexible, Kernel, PCSampling, Count }; using MetricValueType = std::variant; @@ -143,8 +143,78 @@ class KernelMetric : public Metric { const static inline bool AGGREGABLE[kernelMetricKind::Count] = { false, false, true, true, false, false}; const static inline std::string VALUE_NAMES[kernelMetricKind::Count] = { - "StartTime (ns)", "EndTime (ns)", "Count", - "Time (ns)", "DeviceId", "DeviceType", + "start_time (ns)", "end_time (ns)", "count", + "time (ns)", "device_id", "device_type", + }; +}; + +class PCSamplingMetric : public Metric { +public: + enum PCSamplingMetricKind : int { + NumSamples, + NumStalledSamples, + StalledBranchResolving, + StalledNoInstruction, + StalledShortScoreboard, + StalledWait, + StalledLongScoreboard, + StalledTexThrottle, + StalledBarrier, + StalledMembar, + StalledIMCMiss, + StalledMIOThrottle, + StalledMathPipeThrottle, + StalledDrain, + StalledLGThrottle, + StalledNotSelected, + StalledMisc, + StalledDispatchStall, + StalledSleeping, + StalledSelected, + Count, + }; + + PCSamplingMetric() + : Metric(MetricKind::PCSampling, PCSamplingMetricKind::Count) {} + + PCSamplingMetric(PCSamplingMetricKind kind, uint64_t samples, + uint64_t stalledSamples) + : PCSamplingMetric() { + this->values[kind] = stalledSamples; + this->values[PCSamplingMetricKind::NumSamples] = samples; + this->values[PCSamplingMetricKind::NumStalledSamples] = stalledSamples; + } + + virtual const std::string getName() const { return "PCSamplingMetric"; } + + virtual const std::string getValueName(int valueId) const { + return VALUE_NAMES[valueId]; + } + + virtual bool isAggregable(int valueId) const { return true; } + +private: + const static inline std::string VALUE_NAMES[PCSamplingMetricKind::Count] = { + "num_samples", + "num_stalled_samples", + "stalled_branch_resolving", + "stalled_no_instruction", + "stalled_short_scoreboard", + "stalled_wait", + "stalled_long_scoreboard", + "stalled_tex_throttle", + "stalled_barrier", + "stalled_membar", + "stalled_imc_miss", + "stalled_mio_throttle", + "stalled_math_pipe_throttle", + "stalled_drain", + "stalled_lg_throttle", + "stalled_not_Selected", + "stalled_misc", + "stalled_dispatch_stall", + "stalled_sleeping", + "stalled_selected", }; }; diff --git a/third_party/proton/csrc/include/Driver/Dispatch.h b/third_party/proton/csrc/include/Driver/Dispatch.h index 6fe2d759428f..1d8ec017cdf7 100644 --- a/third_party/proton/csrc/include/Driver/Dispatch.h +++ b/third_party/proton/csrc/include/Driver/Dispatch.h @@ -63,17 +63,17 @@ template class Dispatch { *lib = dlopen(name, RTLD_NOLOAD); } if (*lib == nullptr) { - // If not found, try to load it from the default path + // If not found, try to load it from LD_LIBRARY_PATH + *lib = dlopen(name, RTLD_LOCAL | RTLD_LAZY); + } + if (*lib == nullptr) { + // If still not found, try to load it from the default path auto dir = std::string(ExternLib::defaultDir); if (dir.length() > 0) { auto fullPath = dir + "/" + name; *lib = dlopen(fullPath.c_str(), RTLD_LOCAL | RTLD_LAZY); } } - if (*lib == nullptr) { - // If still not found, try to load it from LD_LIBRARY_PATH - *lib = dlopen(name, RTLD_LOCAL | RTLD_LAZY); - } if (*lib == nullptr) { throw std::runtime_error("Could not find `" + std::string(name) + "`. Make sure it is in your " diff --git a/third_party/proton/csrc/include/Driver/GPU/CuptiApi.h b/third_party/proton/csrc/include/Driver/GPU/CuptiApi.h index 845b415bd52f..495964923ef4 100644 --- a/third_party/proton/csrc/include/Driver/GPU/CuptiApi.h +++ b/third_party/proton/csrc/include/Driver/GPU/CuptiApi.h @@ -2,11 +2,17 @@ #define PROTON_DRIVER_GPU_CUPTI_H_ #include "cupti.h" +#include "cupti_pcsampling.h" namespace proton { namespace cupti { +template CUptiResult getVersion(uint32_t *version); + +template +CUptiResult getContextId(CUcontext context, uint32_t *pCtxId); + template CUptiResult activityRegisterCallbacks( CUpti_BuffersCallbackRequestFunc funcBufferRequested, @@ -66,6 +72,40 @@ CUptiResult getGraphExecId(CUgraphExec graph, uint32_t *pId); template CUptiResult getGraphId(CUgraph graph, uint32_t *pId); +template +CUptiResult getCubinCrc(CUpti_GetCubinCrcParams *pParams); + +template +CUptiResult +getSassToSourceCorrelation(CUpti_GetSassToSourceCorrelationParams *pParams); + +template +CUptiResult +pcSamplingGetNumStallReasons(CUpti_PCSamplingGetNumStallReasonsParams *pParams); + +template +CUptiResult +pcSamplingGetStallReasons(CUpti_PCSamplingGetStallReasonsParams *pParams); + +template +CUptiResult pcSamplingSetConfigurationAttribute( + CUpti_PCSamplingConfigurationInfoParams *pParams); + +template +CUptiResult pcSamplingEnable(CUpti_PCSamplingEnableParams *pParams); + +template +CUptiResult pcSamplingDisable(CUpti_PCSamplingDisableParams *pParams); + +template +CUptiResult pcSamplingGetData(CUpti_PCSamplingGetDataParams *pParams); + +template +CUptiResult pcSamplingStart(CUpti_PCSamplingStartParams *pParams); + +template +CUptiResult pcSamplingStop(CUpti_PCSamplingStopParams *pParams); + } // namespace cupti } // namespace proton diff --git a/third_party/proton/csrc/include/Profiler/Cupti/CuptiPCSampling.h b/third_party/proton/csrc/include/Profiler/Cupti/CuptiPCSampling.h new file mode 100644 index 000000000000..58b6e2be8191 --- /dev/null +++ b/third_party/proton/csrc/include/Profiler/Cupti/CuptiPCSampling.h @@ -0,0 +1,141 @@ +#ifndef PROTON_PROFILER_CUPTI_PC_SAMPLING_H_ +#define PROTON_PROFILER_CUPTI_PC_SAMPLING_H_ + +#include "CuptiProfiler.h" +#include "Driver/GPU/CudaApi.h" +#include "Driver/GPU/CuptiApi.h" +#include "Utility/Map.h" +#include "Utility/Singleton.h" +#include +#include + +namespace proton { + +struct CubinData { + size_t cubinCrc; + const char *cubin; + size_t cubinSize; + + struct LineInfoKey { + uint32_t functionIndex; + uint64_t pcOffset; + + bool operator<(const LineInfoKey &other) const { + return functionIndex < other.functionIndex || + (functionIndex == other.functionIndex && + pcOffset < other.pcOffset); + } + }; + + struct LineInfoValue { + uint32_t lineNumber{}; + const std::string functionName{}; + const std::string dirName{}; + const std::string fileName{}; + + LineInfoValue() = default; + + LineInfoValue(uint32_t lineNumber, const std::string &functionName, + const std::string &dirName, const std::string &fileName) + : lineNumber(lineNumber), functionName(functionName), dirName(dirName), + fileName(fileName) {} + }; + + std::map lineInfo; +}; + +struct ConfigureData { + ConfigureData() = default; + + ~ConfigureData() { + if (stallReasonNames) { + for (size_t i = 0; i < numStallReasons; i++) { + if (stallReasonNames[i]) + std::free(stallReasonNames[i]); + } + std::free(stallReasonNames); + } + if (stallReasonIndices) + std::free(stallReasonIndices); + if (pcSamplingData.pPcData) { + for (size_t i = 0; i < numValidStallReasons; ++i) { + std::free(pcSamplingData.pPcData[i].stallReason); + } + std::free(pcSamplingData.pPcData); + } + } + + void initialize(CUcontext context); + + CUpti_PCSamplingConfigurationInfo configureStallReasons(); + CUpti_PCSamplingConfigurationInfo configureSamplingPeriod(); + CUpti_PCSamplingConfigurationInfo configureSamplingBuffer(); + CUpti_PCSamplingConfigurationInfo configureScratchBuffer(); + CUpti_PCSamplingConfigurationInfo configureHardwareBufferSize(); + CUpti_PCSamplingConfigurationInfo configureStartStopControl(); + CUpti_PCSamplingConfigurationInfo configureCollectionMode(); + + // The amount of data reserved on the GPU + static constexpr size_t HardwareBufferSize = 128 * 1024 * 1024; + // The amount of data copied from the hardware buffer each time + static constexpr size_t ScratchBufferSize = 16 * 1024 * 1024; + // The number of PCs copied from the scratch buffer each time + static constexpr size_t DataBufferPCCount = 1024; + // The sampling period in cycles = 2^frequency + static constexpr uint32_t DefaultFrequency = 10; + + CUcontext context{}; + uint32_t contextId; + uint32_t numStallReasons{}; + uint32_t numValidStallReasons{}; + char **stallReasonNames{}; + uint32_t *stallReasonIndices{}; + std::map stallReasonIndexToMetricIndex{}; + std::set notIssuedStallReasonIndices{}; + CUpti_PCSamplingData pcSamplingData{}; + // The memory storing configuration information has to be kept alive during + // the profiling session + std::vector configurationInfos; +}; + +class CuptiPCSampling : public Singleton { + +public: + CuptiPCSampling() = default; + virtual ~CuptiPCSampling() = default; + + void initialize(CUcontext context); + + void start(CUcontext context); + + void stop(CUcontext context, uint64_t externId, bool isAPI); + + void finalize(CUcontext context); + + void loadModule(const char *cubin, size_t cubinSize); + + void unloadModule(const char *cubin, size_t cubinSize); + +private: + ConfigureData *getConfigureData(uint32_t contextId); + + CubinData *getCubinData(uint64_t cubinCrc); + + void processPCSamplingData(ConfigureData *configureData, uint64_t externId, + bool isAPI); + + ThreadSafeMap contextIdToConfigureData; + // In case the same cubin is loaded multiple times, we need to keep track of + // all of them + ThreadSafeMap> + cubinCrcToCubinData; + ThreadSafeSet contextInitialized; + + std::atomic pcSamplingStarted{false}; + std::mutex pcSamplingMutex{}; + std::mutex contextMutex{}; +}; + +} // namespace proton + +#endif // PROTON_PROFILER_CUPTI_PC_SAMPLING_H_ diff --git a/third_party/proton/csrc/include/Profiler/CuptiProfiler.h b/third_party/proton/csrc/include/Profiler/Cupti/CuptiProfiler.h similarity index 90% rename from third_party/proton/csrc/include/Profiler/CuptiProfiler.h rename to third_party/proton/csrc/include/Profiler/Cupti/CuptiProfiler.h index 344d0fd4b9df..c443ec2e398f 100644 --- a/third_party/proton/csrc/include/Profiler/CuptiProfiler.h +++ b/third_party/proton/csrc/include/Profiler/Cupti/CuptiProfiler.h @@ -1,7 +1,7 @@ #ifndef PROTON_PROFILER_CUPTI_PROFILER_H_ #define PROTON_PROFILER_CUPTI_PROFILER_H_ -#include "GPUProfiler.h" +#include "Profiler/GPUProfiler.h" namespace proton { diff --git a/third_party/proton/csrc/include/Profiler/GPUProfiler.h b/third_party/proton/csrc/include/Profiler/GPUProfiler.h index 26c6d10b5d50..d5033b06aa63 100644 --- a/third_party/proton/csrc/include/Profiler/GPUProfiler.h +++ b/third_party/proton/csrc/include/Profiler/GPUProfiler.h @@ -31,6 +31,16 @@ class GPUProfiler : public Profiler, std::unordered_map>>; using ApiExternIdSet = ThreadSafeSet>; + ConcreteProfilerT &enablePCSampling() { + pcSamplingEnabled = true; + return dynamic_cast(*this); + } + ConcreteProfilerT &disablePCSampling() { + pcSamplingEnabled = false; + return dynamic_cast(*this); + } + bool isPCSamplingEnabled() const { return pcSamplingEnabled; } + protected: // OpInterface void startOp(const Scope &scope) override { @@ -140,6 +150,8 @@ class GPUProfiler : public Profiler, ConcreteProfilerT &profiler; }; std::unique_ptr pImpl; + + bool pcSamplingEnabled{false}; }; } // namespace proton diff --git a/third_party/proton/csrc/include/Profiler/RoctracerProfiler.h b/third_party/proton/csrc/include/Profiler/Roctracer/RoctracerProfiler.h similarity index 91% rename from third_party/proton/csrc/include/Profiler/RoctracerProfiler.h rename to third_party/proton/csrc/include/Profiler/Roctracer/RoctracerProfiler.h index 2f1791dcb506..b9bc08de8e83 100644 --- a/third_party/proton/csrc/include/Profiler/RoctracerProfiler.h +++ b/third_party/proton/csrc/include/Profiler/Roctracer/RoctracerProfiler.h @@ -1,7 +1,7 @@ #ifndef PROTON_PROFILER_ROCTRACER_PROFILER_H_ #define PROTON_PROFILER_ROCTRACER_PROFILER_H_ -#include "GPUProfiler.h" +#include "Profiler/GPUProfiler.h" namespace proton { diff --git a/third_party/proton/csrc/include/Utility/Atomic.h b/third_party/proton/csrc/include/Utility/Atomic.h index d7e40e73cd24..0f759e0d61b9 100644 --- a/third_party/proton/csrc/include/Utility/Atomic.h +++ b/third_party/proton/csrc/include/Utility/Atomic.h @@ -1,4 +1,8 @@ +#ifndef PROTON_UTILITY_ATOMIC_H_ +#define PROTON_UTILITY_ATOMIC_H_ + #include +#include namespace proton { @@ -16,4 +20,20 @@ template T atomicMin(std::atomic &target, T value) { return current; } +template +void doubleCheckedLock(Condition enterCondition, std::mutex &lock, + Function function) { + if (!enterCondition()) + return; + + std::unique_lock guard(lock); + + if (!enterCondition()) + return; + + function(); +} + } // namespace proton + +#endif // PROTON_UTILITY_ATOMIC_H_ diff --git a/third_party/proton/csrc/include/Utility/Errors.h b/third_party/proton/csrc/include/Utility/Errors.h index 62d4f3f6650b..094723d6f7e8 100644 --- a/third_party/proton/csrc/include/Utility/Errors.h +++ b/third_party/proton/csrc/include/Utility/Errors.h @@ -1,3 +1,6 @@ +#ifndef PROTON_UTILITY_ERRORS_H_ +#define PROTON_UTILITY_ERRORS_H_ + #include namespace proton { @@ -8,3 +11,5 @@ class NotImplemented : public std::logic_error { }; } // namespace proton + +#endif // PROTON_UTILITY_ERRORS_H_ diff --git a/third_party/proton/csrc/include/Utility/String.h b/third_party/proton/csrc/include/Utility/String.h index b7d45ae1f74f..b4a1d3ff9179 100644 --- a/third_party/proton/csrc/include/Utility/String.h +++ b/third_party/proton/csrc/include/Utility/String.h @@ -13,6 +13,18 @@ inline std::string toLower(const std::string &str) { return lower; } +inline std::string replace(const std::string &str, const std::string &src, + const std::string &dst) { + std::string replaced = str; + size_t pos = replaced.find(src, pos); + while (pos != std::string::npos) { + replaced.replace(pos, src.length(), dst); + pos += dst.length(); + pos = replaced.find(src, pos); + } + return replaced; +} + } // namespace proton #endif // PROTON_UTILITY_STRING_H_ diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index b12427f77774..ec6ea1c784e3 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -180,66 +180,76 @@ void TreeData::dumpHatchet(std::ostream &os) const { jsonNodes[Tree::TreeNode::RootId] = &(output.back()); std::set valueNames; std::map> deviceIds; - this->tree->template walk( - [&](Tree::TreeNode &treeNode) { - const auto contextName = treeNode.name; - auto contextId = treeNode.id; - json *jsonNode = jsonNodes[contextId]; - (*jsonNode)["frame"] = {{"name", contextName}, {"type", "function"}}; - (*jsonNode)["metrics"] = json::object(); - for (auto [metricKind, metric] : treeNode.metrics) { - if (metricKind == MetricKind::Kernel) { - auto kernelMetric = std::dynamic_pointer_cast(metric); - auto duration = std::get( - kernelMetric->getValue(KernelMetric::Duration)); - auto invocations = std::get( - kernelMetric->getValue(KernelMetric::Invocations)); - auto deviceId = std::get( - kernelMetric->getValue(KernelMetric::DeviceId)); - auto deviceType = std::get( - kernelMetric->getValue(KernelMetric::DeviceType)); - auto deviceTypeName = - getDeviceTypeString(static_cast(deviceType)); - (*jsonNode)["metrics"] - [kernelMetric->getValueName(KernelMetric::Duration)] = - duration; - (*jsonNode)["metrics"] - [kernelMetric->getValueName(KernelMetric::Invocations)] = - invocations; - (*jsonNode)["metrics"] - [kernelMetric->getValueName(KernelMetric::DeviceId)] = - std::to_string(deviceId); - (*jsonNode)["metrics"] - [kernelMetric->getValueName(KernelMetric::DeviceType)] = - deviceTypeName; - valueNames.insert( - kernelMetric->getValueName(KernelMetric::Duration)); - valueNames.insert( - kernelMetric->getValueName(KernelMetric::Invocations)); - deviceIds.insert({deviceType, {deviceId}}); - } else { - throw std::runtime_error("MetricKind not supported"); - } - } - for (auto [_, flexibleMetric] : treeNode.flexibleMetrics) { - auto valueName = flexibleMetric.getValueName(0); + this->tree->template walk([&](Tree::TreeNode + &treeNode) { + const auto contextName = treeNode.name; + auto contextId = treeNode.id; + json *jsonNode = jsonNodes[contextId]; + (*jsonNode)["frame"] = {{"name", contextName}, {"type", "function"}}; + (*jsonNode)["metrics"] = json::object(); + for (auto [metricKind, metric] : treeNode.metrics) { + if (metricKind == MetricKind::Kernel) { + std::shared_ptr kernelMetric = + std::dynamic_pointer_cast(metric); + uint64_t duration = + std::get(kernelMetric->getValue(KernelMetric::Duration)); + uint64_t invocations = std::get( + kernelMetric->getValue(KernelMetric::Invocations)); + uint64_t deviceId = + std::get(kernelMetric->getValue(KernelMetric::DeviceId)); + uint64_t deviceType = std::get( + kernelMetric->getValue(KernelMetric::DeviceType)); + std::string deviceTypeName = + getDeviceTypeString(static_cast(deviceType)); + (*jsonNode)["metrics"] + [kernelMetric->getValueName(KernelMetric::Duration)] = + duration; + (*jsonNode)["metrics"] + [kernelMetric->getValueName(KernelMetric::Invocations)] = + invocations; + (*jsonNode)["metrics"] + [kernelMetric->getValueName(KernelMetric::DeviceId)] = + std::to_string(deviceId); + (*jsonNode)["metrics"] + [kernelMetric->getValueName(KernelMetric::DeviceType)] = + deviceTypeName; + valueNames.insert(kernelMetric->getValueName(KernelMetric::Duration)); + valueNames.insert( + kernelMetric->getValueName(KernelMetric::Invocations)); + deviceIds.insert({deviceType, {deviceId}}); + } else if (metricKind == MetricKind::PCSampling) { + auto pcSamplingMetric = + std::dynamic_pointer_cast(metric); + for (size_t i = 0; i < PCSamplingMetric::Count; i++) { + auto valueName = pcSamplingMetric->getValueName(i); valueNames.insert(valueName); std::visit( [&](auto &&value) { (*jsonNode)["metrics"][valueName] = value; }, - flexibleMetric.getValues()[0]); - } - (*jsonNode)["children"] = json::array(); - auto children = treeNode.children; - for (auto _ : children) { - (*jsonNode)["children"].push_back(json::object()); + pcSamplingMetric->getValues()[i]); } - auto idx = 0; - for (auto child : children) { - auto [index, childId] = child; - jsonNodes[childId] = &(*jsonNode)["children"][idx]; - idx++; - } - }); + } else { + throw std::runtime_error("MetricKind not supported"); + } + } + for (auto [_, flexibleMetric] : treeNode.flexibleMetrics) { + auto valueName = flexibleMetric.getValueName(0); + valueNames.insert(valueName); + std::visit( + [&](auto &&value) { (*jsonNode)["metrics"][valueName] = value; }, + flexibleMetric.getValues()[0]); + } + (*jsonNode)["children"] = json::array(); + auto children = treeNode.children; + for (auto _ : children) { + (*jsonNode)["children"].push_back(json::object()); + } + auto idx = 0; + for (auto child : children) { + auto [index, childId] = child; + jsonNodes[childId] = &(*jsonNode)["children"][idx]; + idx++; + } + }); // Hints for all available metrics for (auto valueName : valueNames) { output[Tree::TreeNode::RootId]["metrics"][valueName] = 0; diff --git a/third_party/proton/csrc/lib/Driver/GPU/CuptiApi.cpp b/third_party/proton/csrc/lib/Driver/GPU/CuptiApi.cpp index 1d7e97314a30..2c399d31c78b 100644 --- a/third_party/proton/csrc/lib/Driver/GPU/CuptiApi.cpp +++ b/third_party/proton/csrc/lib/Driver/GPU/CuptiApi.cpp @@ -22,6 +22,11 @@ struct ExternLibCupti : public ExternLibBase { void *ExternLibCupti::lib = nullptr; +DEFINE_DISPATCH(ExternLibCupti, getVersion, cuptiGetVersion, uint32_t *); + +DEFINE_DISPATCH(ExternLibCupti, getContextId, cuptiGetContextId, CUcontext, + uint32_t *); + DEFINE_DISPATCH(ExternLibCupti, activityRegisterCallbacks, cuptiActivityRegisterCallbacks, CUpti_BuffersCallbackRequestFunc, @@ -77,6 +82,40 @@ DEFINE_DISPATCH(ExternLibCupti, getGraphExecId, cuptiGetGraphExecId, DEFINE_DISPATCH(ExternLibCupti, getGraphId, cuptiGetGraphId, CUgraph, uint32_t *); +DEFINE_DISPATCH(ExternLibCupti, getCubinCrc, cuptiGetCubinCrc, + CUpti_GetCubinCrcParams *); + +DEFINE_DISPATCH(ExternLibCupti, getSassToSourceCorrelation, + cuptiGetSassToSourceCorrelation, + CUpti_GetSassToSourceCorrelationParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingGetNumStallReasons, + cuptiPCSamplingGetNumStallReasons, + CUpti_PCSamplingGetNumStallReasonsParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingGetStallReasons, + cuptiPCSamplingGetStallReasons, + CUpti_PCSamplingGetStallReasonsParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingSetConfigurationAttribute, + cuptiPCSamplingSetConfigurationAttribute, + CUpti_PCSamplingConfigurationInfoParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingEnable, cuptiPCSamplingEnable, + CUpti_PCSamplingEnableParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingDisable, cuptiPCSamplingDisable, + CUpti_PCSamplingDisableParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingGetData, cuptiPCSamplingGetData, + CUpti_PCSamplingGetDataParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingStart, cuptiPCSamplingStart, + CUpti_PCSamplingStartParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingStop, cuptiPCSamplingStop, + CUpti_PCSamplingStopParams *); + } // namespace cupti } // namespace proton diff --git a/third_party/proton/csrc/lib/Profiler/Cupti/CuptiPCSampling.cpp b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiPCSampling.cpp new file mode 100644 index 000000000000..f8fb2537a03c --- /dev/null +++ b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiPCSampling.cpp @@ -0,0 +1,444 @@ +#include "Profiler/Cupti/CuptiPCSampling.h" +#include "Data/Metric.h" +#include "Driver/GPU/CudaApi.h" +#include "Driver/GPU/CuptiApi.h" +#include "Utility/Atomic.h" +#include "Utility/Map.h" +#include "Utility/String.h" +#include +#include + +namespace proton { + +namespace { + +uint64_t getCubinCrc(const char *cubin, size_t size) { + CUpti_GetCubinCrcParams cubinCrcParams = { + .size = CUpti_GetCubinCrcParamsSize, + .cubinSize = size, + .cubin = cubin, + .cubinCrc = 0, + }; + cupti::getCubinCrc(&cubinCrcParams); + return cubinCrcParams.cubinCrc; +} + +size_t getNumStallReasons(CUcontext context) { + size_t numStallReasons = 0; + CUpti_PCSamplingGetNumStallReasonsParams numStallReasonsParams = { + .size = CUpti_PCSamplingGetNumStallReasonsParamsSize, + .pPriv = NULL, + .ctx = context, + .numStallReasons = &numStallReasons}; + cupti::pcSamplingGetNumStallReasons(&numStallReasonsParams); + return numStallReasons; +} + +std::tuple +getSassToSourceCorrelation(const char *functionName, uint64_t pcOffset, + const char *cubin, size_t cubinSize) { + CUpti_GetSassToSourceCorrelationParams sassToSourceParams = { + .size = CUpti_GetSassToSourceCorrelationParamsSize, + .cubin = cubin, + .functionName = functionName, + .cubinSize = cubinSize, + .lineNumber = 0, + .pcOffset = pcOffset, + .fileName = NULL, + .dirName = NULL, + }; + // Get source can fail if the line mapping is not available in the cubin so we + // don't check the return value + cupti::getSassToSourceCorrelation(&sassToSourceParams); + auto fileNameStr = sassToSourceParams.fileName + ? std::string(sassToSourceParams.fileName) + : ""; + auto dirNameStr = + sassToSourceParams.dirName ? std::string(sassToSourceParams.dirName) : ""; + // It's user's responsibility to free the memory + if (sassToSourceParams.fileName) + std::free(sassToSourceParams.fileName); + if (sassToSourceParams.dirName) + std::free(sassToSourceParams.dirName); + return std::make_tuple(sassToSourceParams.lineNumber, fileNameStr, + dirNameStr); +} + +std::pair +getStallReasonNamesAndIndices(CUcontext context, size_t numStallReasons) { + char **stallReasonNames = + static_cast(std::calloc(numStallReasons, sizeof(char *))); + for (size_t i = 0; i < numStallReasons; i++) { + stallReasonNames[i] = static_cast( + std::calloc(CUPTI_STALL_REASON_STRING_SIZE, sizeof(char))); + } + uint32_t *stallReasonIndices = + static_cast(std::calloc(numStallReasons, sizeof(uint32_t))); + // Initialize the names with 128 characters to avoid buffer overflow + CUpti_PCSamplingGetStallReasonsParams stallReasonsParams = { + .size = CUpti_PCSamplingGetStallReasonsParamsSize, + .pPriv = NULL, + .ctx = context, + .numStallReasons = numStallReasons, + .stallReasonIndex = stallReasonIndices, + .stallReasons = stallReasonNames, + }; + cupti::pcSamplingGetStallReasons(&stallReasonsParams); + return std::make_pair(stallReasonNames, stallReasonIndices); +} + +size_t matchStallReasonsToIndices( + size_t numStallReasons, char **stallReasonNames, + uint32_t *stallReasonIndices, + std::map &stallReasonIndexToMetricIndex, + std::set ¬IssuedStallReasonIndices) { + // In case there's any invalid stall reasons, we only collect valid ones. + // Invalid ones are swapped to the end of the list + std::vector validIndex(numStallReasons, false); + size_t numValidStalls = 0; + for (size_t i = 0; i < numStallReasons; i++) { + bool notIssued = std::string(stallReasonNames[i]).find("not_issued") != + std::string::npos; + std::string cuptiStallName = std::string(stallReasonNames[i]); + for (size_t j = 0; j < PCSamplingMetric::PCSamplingMetricKind::Count; j++) { + auto metricName = PCSamplingMetric().getValueName(j); + if (cuptiStallName.find(metricName) != std::string::npos) { + if (notIssued) + notIssuedStallReasonIndices.insert(stallReasonIndices[i]); + stallReasonIndexToMetricIndex[stallReasonIndices[i]] = j; + validIndex[i] = true; + numValidStalls++; + break; + } + } + } + int invalidIndex = -1; + for (size_t i = 0; i < numStallReasons; i++) { + if (invalidIndex == -1 && !validIndex[i]) { + invalidIndex = i; + } else if (invalidIndex != -1 && validIndex[i]) { + std::swap(stallReasonIndices[invalidIndex], stallReasonIndices[i]); + std::swap(stallReasonNames[invalidIndex], stallReasonNames[i]); + validIndex[invalidIndex] = true; + invalidIndex++; + } + } + return numValidStalls; +} + +#define CUPTI_CUDA12_4_VERSION 22 +#define CUPTI_CUDA12_4_PC_DATA_PADDING_SIZE sizeof(uint32_t) + +CUpti_PCSamplingData allocPCSamplingData(size_t collectNumPCs, + size_t numValidStallReasons) { + uint32_t libVersion = 0; + cupti::getVersion(&libVersion); + size_t pcDataSize = sizeof(CUpti_PCSamplingPCData); + // Check cupti api version < 12.4 but cupti header version >= 12.4 + // If so, we subtract 4 bytes from the size of CUpti_PCSamplingPCData + // because it introduces a new field (i.e., correlationId) at the end of the + // struct, which is not compatible with the previous versions. + if (libVersion < CUPTI_CUDA12_4_VERSION && + CUPTI_API_VERSION >= CUPTI_CUDA12_4_VERSION) + pcDataSize -= CUPTI_CUDA12_4_PC_DATA_PADDING_SIZE; + CUpti_PCSamplingData pcSamplingData{ + .size = pcDataSize, + .collectNumPcs = collectNumPCs, + .pPcData = static_cast( + std::calloc(collectNumPCs, sizeof(CUpti_PCSamplingPCData)))}; + for (size_t i = 0; i < collectNumPCs; ++i) { + pcSamplingData.pPcData[i].stallReason = + static_cast(std::calloc( + numValidStallReasons, sizeof(CUpti_PCSamplingStallReason))); + } + return pcSamplingData; +} + +void enablePCSampling(CUcontext context) { + CUpti_PCSamplingEnableParams params = { + .size = CUpti_PCSamplingEnableParamsSize, + .pPriv = NULL, + .ctx = context, + }; + cupti::pcSamplingEnable(¶ms); +} + +void disablePCSampling(CUcontext context) { + CUpti_PCSamplingDisableParams params = { + .size = CUpti_PCSamplingDisableParamsSize, + .pPriv = NULL, + .ctx = context, + }; + cupti::pcSamplingDisable(¶ms); +} + +void startPCSampling(CUcontext context) { + CUpti_PCSamplingStartParams params = { + .size = CUpti_PCSamplingStartParamsSize, + .pPriv = NULL, + .ctx = context, + }; + cupti::pcSamplingStart(¶ms); +} + +void stopPCSampling(CUcontext context) { + CUpti_PCSamplingStopParams params = { + .size = CUpti_PCSamplingStopParamsSize, + .pPriv = NULL, + .ctx = context, + }; + cupti::pcSamplingStop(¶ms); +} + +void getPCSamplingData(CUcontext context, + CUpti_PCSamplingData *pcSamplingData) { + CUpti_PCSamplingGetDataParams params = { + .size = CUpti_PCSamplingGetDataParamsSize, + .pPriv = NULL, + .ctx = context, + .pcSamplingData = pcSamplingData, + }; + cupti::pcSamplingGetData(¶ms); +} + +void setConfigurationAttribute( + CUcontext context, + std::vector &configurationInfos) { + CUpti_PCSamplingConfigurationInfoParams infoParams = { + .size = CUpti_PCSamplingConfigurationInfoParamsSize, + .pPriv = NULL, + .ctx = context, + .numAttributes = configurationInfos.size(), + .pPCSamplingConfigurationInfo = configurationInfos.data(), + }; + cupti::pcSamplingSetConfigurationAttribute(&infoParams); +} + +} // namespace + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureStallReasons() { + numStallReasons = getNumStallReasons(context); + std::tie(this->stallReasonNames, this->stallReasonIndices) = + getStallReasonNamesAndIndices(context, numStallReasons); + numValidStallReasons = matchStallReasonsToIndices( + numStallReasons, stallReasonNames, stallReasonIndices, + stallReasonIndexToMetricIndex, notIssuedStallReasonIndices); + CUpti_PCSamplingConfigurationInfo stallReasonInfo{}; + stallReasonInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON; + stallReasonInfo.attributeData.stallReasonData.stallReasonCount = + numValidStallReasons; + stallReasonInfo.attributeData.stallReasonData.pStallReasonIndex = + stallReasonIndices; + return stallReasonInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingPeriod() { + CUpti_PCSamplingConfigurationInfo samplingPeriodInfo{}; + samplingPeriodInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD; + samplingPeriodInfo.attributeData.samplingPeriodData.samplingPeriod = + DefaultFrequency; + return samplingPeriodInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingBuffer() { + CUpti_PCSamplingConfigurationInfo samplingBufferInfo{}; + samplingBufferInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER; + this->pcSamplingData = + allocPCSamplingData(DataBufferPCCount, numValidStallReasons); + samplingBufferInfo.attributeData.samplingDataBufferData.samplingDataBuffer = + &this->pcSamplingData; + return samplingBufferInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureScratchBuffer() { + CUpti_PCSamplingConfigurationInfo scratchBufferInfo{}; + scratchBufferInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE; + scratchBufferInfo.attributeData.scratchBufferSizeData.scratchBufferSize = + ScratchBufferSize; + return scratchBufferInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureHardwareBufferSize() { + CUpti_PCSamplingConfigurationInfo hardwareBufferInfo{}; + hardwareBufferInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE; + hardwareBufferInfo.attributeData.hardwareBufferSizeData.hardwareBufferSize = + HardwareBufferSize; + return hardwareBufferInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureStartStopControl() { + CUpti_PCSamplingConfigurationInfo startStopControlInfo{}; + startStopControlInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL; + startStopControlInfo.attributeData.enableStartStopControlData + .enableStartStopControl = true; + return startStopControlInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureCollectionMode() { + CUpti_PCSamplingConfigurationInfo collectionModeInfo{}; + collectionModeInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE; + collectionModeInfo.attributeData.collectionModeData.collectionMode = + CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS; + return collectionModeInfo; +} + +void ConfigureData::initialize(CUcontext context) { + this->context = context; + cupti::getContextId(context, &contextId); + configurationInfos.emplace_back(configureStallReasons()); + configurationInfos.emplace_back(configureSamplingPeriod()); + configurationInfos.emplace_back(configureHardwareBufferSize()); + configurationInfos.emplace_back(configureScratchBuffer()); + configurationInfos.emplace_back(configureSamplingBuffer()); + configurationInfos.emplace_back(configureStartStopControl()); + configurationInfos.emplace_back(configureCollectionMode()); + setConfigurationAttribute(context, configurationInfos); +} + +ConfigureData *CuptiPCSampling::getConfigureData(uint32_t contextId) { + return &contextIdToConfigureData[contextId]; +} + +CubinData *CuptiPCSampling::getCubinData(uint64_t cubinCrc) { + return &(cubinCrcToCubinData[cubinCrc].first); +} + +void CuptiPCSampling::initialize(CUcontext context) { + uint32_t contextId = 0; + cupti::getContextId(context, &contextId); + doubleCheckedLock([&]() { return !contextInitialized.contain(contextId); }, + contextMutex, + [&]() { + enablePCSampling(context); + getConfigureData(contextId)->initialize(context); + contextInitialized.insert(contextId); + }); +} + +void CuptiPCSampling::start(CUcontext context) { + uint32_t contextId = 0; + cupti::getContextId(context, &contextId); + doubleCheckedLock([&]() -> bool { return !pcSamplingStarted; }, + pcSamplingMutex, + [&]() { + initialize(context); + // Ensure all previous operations are completed + cuda::ctxSynchronize(); + startPCSampling(context); + pcSamplingStarted = true; + }); +} + +void CuptiPCSampling::processPCSamplingData(ConfigureData *configureData, + uint64_t externId, bool isAPI) { + auto *pcSamplingData = &configureData->pcSamplingData; + auto &profiler = CuptiProfiler::instance(); + auto dataSet = profiler.getDataSet(); + // In the first round, we need to call getPCSamplingData to get the unsynced + // data from the hardware buffer + bool firstRound = true; + while (pcSamplingData->totalNumPcs > 0 || + pcSamplingData->remainingNumPcs > 0 || firstRound) { + // Handle data + for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) { + auto *pcData = pcSamplingData->pPcData + i; + auto *cubinData = getCubinData(pcData->cubinCrc); + auto key = + CubinData::LineInfoKey{pcData->functionIndex, pcData->pcOffset}; + if (cubinData->lineInfo.find(key) == cubinData->lineInfo.end()) { + auto [lineNumber, fileName, dirName] = + getSassToSourceCorrelation(pcData->functionName, pcData->pcOffset, + cubinData->cubin, cubinData->cubinSize); + cubinData->lineInfo.try_emplace(key, lineNumber, + std::string(pcData->functionName), + dirName, fileName); + } + auto &lineInfo = cubinData->lineInfo[key]; + for (size_t j = 0; j < pcData->stallReasonCount; ++j) { + auto *stallReason = &pcData->stallReason[j]; + if (!configureData->stallReasonIndexToMetricIndex.count( + stallReason->pcSamplingStallReasonIndex)) + throw std::runtime_error("Invalid stall reason index"); + for (auto *data : dataSet) { + auto scopeId = externId; + if (isAPI) + scopeId = data->addScope(externId, lineInfo.functionName); + if (lineInfo.fileName.size()) + scopeId = data->addScope( + scopeId, lineInfo.dirName + "/" + lineInfo.fileName + ":" + + lineInfo.functionName + "@" + + std::to_string(lineInfo.lineNumber)); + auto metricKind = static_cast( + configureData->stallReasonIndexToMetricIndex + [stallReason->pcSamplingStallReasonIndex]); + auto samples = stallReason->samples; + auto stalledSamples = + configureData->notIssuedStallReasonIndices.count( + stallReason->pcSamplingStallReasonIndex) + ? 0 + : samples; + auto metric = std::make_shared(metricKind, samples, + stalledSamples); + data->addMetric(scopeId, metric); + } + } + } + if (pcSamplingData->remainingNumPcs > 0 || firstRound) { + getPCSamplingData(configureData->context, pcSamplingData); + firstRound = false; + } else + break; + } +} + +void CuptiPCSampling::stop(CUcontext context, uint64_t externId, bool isAPI) { + uint32_t contextId = 0; + cupti::getContextId(context, &contextId); + doubleCheckedLock([&]() -> bool { return pcSamplingStarted; }, + pcSamplingMutex, + [&]() { + auto *configureData = getConfigureData(contextId); + stopPCSampling(context); + pcSamplingStarted = false; + processPCSamplingData(configureData, externId, isAPI); + }); +} + +void CuptiPCSampling::finalize(CUcontext context) { + uint32_t contextId = 0; + cupti::getContextId(context, &contextId); + if (!contextInitialized.contain(contextId)) + return; + auto *configureData = getConfigureData(contextId); + contextIdToConfigureData.erase(contextId); + contextInitialized.erase(contextId); + disablePCSampling(context); +} + +void CuptiPCSampling::loadModule(const char *cubin, size_t cubinSize) { + auto cubinCrc = getCubinCrc(cubin, cubinSize); + auto *cubinData = getCubinData(cubinCrc); + cubinData->cubinCrc = cubinCrc; + cubinData->cubinSize = cubinSize; + cubinData->cubin = cubin; +} + +void CuptiPCSampling::unloadModule(const char *cubin, size_t cubinSize) { + // XXX: Unload module is supposed to be called in a thread safe manner + // i.e., no two threads will be calling unload module the same time + auto cubinCrc = getCubinCrc(cubin, cubinSize); + auto count = cubinCrcToCubinData[cubinCrc].second; + if (count > 1) + cubinCrcToCubinData[cubinCrc].second = count - 1; + else + cubinCrcToCubinData.erase(cubinCrc); +} + +} // namespace proton diff --git a/third_party/proton/csrc/lib/Profiler/CuptiProfiler.cpp b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp similarity index 72% rename from third_party/proton/csrc/lib/Profiler/CuptiProfiler.cpp rename to third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp index 573840fc6c55..9ddbd7a71547 100644 --- a/third_party/proton/csrc/lib/Profiler/CuptiProfiler.cpp +++ b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp @@ -1,9 +1,10 @@ -#include "Profiler/CuptiProfiler.h" +#include "Profiler/Cupti/CuptiProfiler.h" #include "Context/Context.h" #include "Data/Metric.h" #include "Driver/Device.h" #include "Driver/GPU/CudaApi.h" #include "Driver/GPU/CuptiApi.h" +#include "Profiler/Cupti/CuptiPCSampling.h" #include "Utility/Map.h" #include @@ -162,6 +163,33 @@ void setGraphCallbacks(CUpti_SubscriberHandle subscriber, bool enable) { #undef CALLBACK_ENABLE } +void setResourceCallbacks(CUpti_SubscriberHandle subscriber, bool enable) { +#define CALLBACK_ENABLE(id) \ + cupti::enableCallback(static_cast(enable), subscriber, \ + CUPTI_CB_DOMAIN_RESOURCE, id) + + CALLBACK_ENABLE(CUPTI_CBID_RESOURCE_MODULE_LOADED); + CALLBACK_ENABLE(CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING); + CALLBACK_ENABLE(CUPTI_CBID_RESOURCE_CONTEXT_CREATED); + CALLBACK_ENABLE(CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING); +#undef CALLBACK_ENABLE +} + +bool isDriverAPILaunch(CUpti_CallbackId cbId) { + return cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunch || + cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid || + cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync || + cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel || + cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz || + cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx || + cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz || + cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel || + cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz || + cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice || + cbId == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch || + cbId == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz; +} + } // namespace struct CuptiProfiler::CuptiProfilerPimpl @@ -186,6 +214,7 @@ struct CuptiProfiler::CuptiProfilerPimpl static constexpr size_t AttributeSize = sizeof(size_t); CUpti_SubscriberHandle subscriber{}; + CuptiPCSampling pcSampling; ThreadSafeMap> graphIdToNumInstances; @@ -241,33 +270,58 @@ void CuptiProfiler::CuptiProfilerPimpl::callbackFn(void *userData, if (domain == CUPTI_CB_DOMAIN_RESOURCE) { auto *resourceData = static_cast(const_cast(cbData)); - auto *graphData = - static_cast(resourceData->resourceDescriptor); auto *pImpl = dynamic_cast(profiler.pImpl.get()); - uint32_t graphId = 0; - uint32_t graphExecId = 0; - if (graphData->graph) - cupti::getGraphId(graphData->graph, &graphId); - if (graphData->graphExec) - cupti::getGraphExecId(graphData->graphExec, &graphExecId); - if (cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_CREATED || - cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_CLONED) { - if (!pImpl->graphIdToNumInstances.contain(graphId)) - pImpl->graphIdToNumInstances[graphId] = 1; - else - pImpl->graphIdToNumInstances[graphId]++; - } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_DESTROY_STARTING) { - pImpl->graphIdToNumInstances[graphId]--; - } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHEXEC_CREATED) { - pImpl->graphExecIdToGraphId[graphExecId] = graphId; - } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHEXEC_DESTROY_STARTING) { - pImpl->graphExecIdToGraphId.erase(graphExecId); - } else if (cbId == CUPTI_CBID_RESOURCE_GRAPH_DESTROY_STARTING) { - pImpl->graphIdToNumInstances.erase(graphId); + if (cbId == CUPTI_CBID_RESOURCE_MODULE_LOADED) { + auto *moduleResource = static_cast( + resourceData->resourceDescriptor); + if (profiler.isPCSamplingEnabled()) { + pImpl->pcSampling.loadModule(moduleResource->pCubin, + moduleResource->cubinSize); + } + } else if (cbId == CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING) { + auto *moduleResource = static_cast( + resourceData->resourceDescriptor); + if (profiler.isPCSamplingEnabled()) { + pImpl->pcSampling.unloadModule(moduleResource->pCubin, + moduleResource->cubinSize); + } + } else if (cbId == CUPTI_CBID_RESOURCE_CONTEXT_CREATED) { + if (profiler.isPCSamplingEnabled()) { + pImpl->pcSampling.initialize(resourceData->context); + } + } else if (cbId == CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING) { + if (profiler.isPCSamplingEnabled()) { + pImpl->pcSampling.finalize(resourceData->context); + } + } else { + auto *graphData = + static_cast(resourceData->resourceDescriptor); + uint32_t graphId = 0; + uint32_t graphExecId = 0; + if (graphData->graph) + cupti::getGraphId(graphData->graph, &graphId); + if (graphData->graphExec) + cupti::getGraphExecId(graphData->graphExec, &graphExecId); + if (cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_CREATED || + cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_CLONED) { + if (!pImpl->graphIdToNumInstances.contain(graphId)) + pImpl->graphIdToNumInstances[graphId] = 1; + else + pImpl->graphIdToNumInstances[graphId]++; + } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_DESTROY_STARTING) { + pImpl->graphIdToNumInstances[graphId]--; + } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHEXEC_CREATED) { + pImpl->graphExecIdToGraphId[graphExecId] = graphId; + } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHEXEC_DESTROY_STARTING) { + pImpl->graphExecIdToGraphId.erase(graphExecId); + } else if (cbId == CUPTI_CBID_RESOURCE_GRAPH_DESTROY_STARTING) { + pImpl->graphIdToNumInstances.erase(graphId); + } } } else { const CUpti_CallbackData *callbackData = static_cast(cbData); + auto *pImpl = dynamic_cast(profiler.pImpl.get()); if (callbackData->callbackSite == CUPTI_API_ENTER) { auto scopeId = Scope::getNewScopeId(); threadState.record(scopeId); @@ -275,7 +329,6 @@ void CuptiProfiler::CuptiProfilerPimpl::callbackFn(void *userData, size_t numInstances = 1; if (cbId == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch || cbId == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz) { - auto *pImpl = dynamic_cast(profiler.pImpl.get()); auto graphExec = static_cast( callbackData->functionParams) ->hGraph; @@ -298,7 +351,17 @@ void CuptiProfiler::CuptiProfilerPimpl::callbackFn(void *userData, << std::endl; } profiler.correlation.correlate(callbackData->correlationId, numInstances); + if (profiler.isPCSamplingEnabled() && isDriverAPILaunch(cbId)) { + pImpl->pcSampling.start(callbackData->context); + } } else if (callbackData->callbackSite == CUPTI_API_EXIT) { + if (profiler.isPCSamplingEnabled() && isDriverAPILaunch(cbId)) { + // XXX: Conservatively stop every GPU kernel for now + auto scopeId = profiler.correlation.externIdQueue.back(); + pImpl->pcSampling.stop( + callbackData->context, scopeId, + profiler.correlation.apiExternIds.contain(scopeId)); + } threadState.exitOp(); profiler.correlation.submit(callbackData->correlationId); } @@ -306,10 +369,15 @@ void CuptiProfiler::CuptiProfilerPimpl::callbackFn(void *userData, } void CuptiProfiler::CuptiProfilerPimpl::doStart() { - cupti::activityRegisterCallbacks(allocBuffer, completeBuffer); - cupti::activityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); - // TODO: switch to directly subscribe the APIs and measure overhead cupti::subscribe(&subscriber, callbackFn, nullptr); + if (profiler.isPCSamplingEnabled()) { + setResourceCallbacks(subscriber, /*enable=*/true); + // Continuous PC sampling is not compatible with concurrent kernel profiling + cupti::activityEnable(CUPTI_ACTIVITY_KIND_KERNEL); + } else { + cupti::activityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + } + cupti::activityRegisterCallbacks(allocBuffer, completeBuffer); setGraphCallbacks(subscriber, /*enable=*/true); setRuntimeCallbacks(subscriber, /*enable=*/true); setDriverCallbacks(subscriber, /*enable=*/true); @@ -326,8 +394,12 @@ void CuptiProfiler::CuptiProfilerPimpl::doFlush() { // If the current context is not set, we don't do any synchronization. CUcontext cuContext = nullptr; cuda::ctxGetCurrent(&cuContext); - if (cuContext) + if (cuContext) { cuda::ctxSynchronize(); + } + if (profiler.isPCSamplingEnabled()) { + pcSampling.finalize(cuContext); + } profiler.correlation.flush( /*maxRetries=*/100, /*sleepMs=*/10, /*flush=*/[]() { @@ -341,7 +413,12 @@ void CuptiProfiler::CuptiProfilerPimpl::doFlush() { } void CuptiProfiler::CuptiProfilerPimpl::doStop() { - cupti::activityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + if (profiler.isPCSamplingEnabled()) { + setResourceCallbacks(subscriber, /*enable=*/false); + cupti::activityDisable(CUPTI_ACTIVITY_KIND_KERNEL); + } else { + cupti::activityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + } setGraphCallbacks(subscriber, /*enable=*/false); setRuntimeCallbacks(subscriber, /*enable=*/false); setDriverCallbacks(subscriber, /*enable=*/false); diff --git a/third_party/proton/csrc/lib/Profiler/RoctracerProfiler.cpp b/third_party/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp similarity index 99% rename from third_party/proton/csrc/lib/Profiler/RoctracerProfiler.cpp rename to third_party/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp index 55af9eb7140d..68f3f0beac9f 100644 --- a/third_party/proton/csrc/lib/Profiler/RoctracerProfiler.cpp +++ b/third_party/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp @@ -1,4 +1,4 @@ -#include "Profiler/RoctracerProfiler.h" +#include "Profiler/Roctracer/RoctracerProfiler.h" #include "Context/Context.h" #include "Data/Metric.h" #include "Driver/GPU/HipApi.h" diff --git a/third_party/proton/csrc/lib/Session/Session.cpp b/third_party/proton/csrc/lib/Session/Session.cpp index 1db512d07533..9b0ef10d37a0 100644 --- a/third_party/proton/csrc/lib/Session/Session.cpp +++ b/third_party/proton/csrc/lib/Session/Session.cpp @@ -2,8 +2,8 @@ #include "Context/Python.h" #include "Context/Shadow.h" #include "Data/TreeData.h" -#include "Profiler/CuptiProfiler.h" -#include "Profiler/RoctracerProfiler.h" +#include "Profiler/Cupti/CuptiProfiler.h" +#include "Profiler/Roctracer/RoctracerProfiler.h" #include "Utility/String.h" namespace proton { @@ -13,6 +13,9 @@ Profiler *getProfiler(const std::string &profilerName) { if (proton::toLower(profilerName) == "cupti") { return &CuptiProfiler::instance(); } + if (proton::toLower(profilerName) == "cupti_pcsampling") { + return &CuptiProfiler::instance().enablePCSampling(); + } if (proton::toLower(profilerName) == "roctracer") { return &RoctracerProfiler::instance(); } diff --git a/third_party/proton/proton/profile.py b/third_party/proton/proton/profile.py index 01d5a1947e8e..2dd7a6f53ed8 100644 --- a/third_party/proton/proton/profile.py +++ b/third_party/proton/proton/profile.py @@ -42,7 +42,7 @@ def start( name (str, optional): The name (with path) of the profiling session. If not provided, the default name is "~/proton.hatchet". backend (str, optional): The backend to use for profiling. - Available options are [None, "cupti", "roctracer"]. + Available options are [None, "cupti", "cupti_pcsampling", "roctracer"]. Defaults to None, which automatically selects the backend matching the current active runtime. context (str, optional): The context to use for profiling. Available options are ["shadow", "python"]. diff --git a/third_party/proton/proton/proton.py b/third_party/proton/proton/proton.py index 7ea6413ac55e..cbb7a0b6f90d 100644 --- a/third_party/proton/proton/proton.py +++ b/third_party/proton/proton/proton.py @@ -13,7 +13,8 @@ def parse_arguments(): python -m triton.profiler.proton [options] script.py [script_args] [script_options] """, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("-n", "--name", type=str, help="Name of the profiling session") - parser.add_argument("-b", "--backend", type=str, help="Profiling backend", default=None, choices=["cupti"]) + parser.add_argument("-b", "--backend", type=str, help="Profiling backend", default=None, + choices=["cupti", "cupti_pcsampling", "roctracer"]) parser.add_argument("-c", "--context", type=str, help="Profiling context", default="shadow", choices=["shadow", "python"]) parser.add_argument("-d", "--data", type=str, help="Profiling data", default="tree", choices=["tree"]) diff --git a/third_party/proton/proton/scope.py b/third_party/proton/proton/scope.py index 5695b8807500..26d946a8c155 100644 --- a/third_party/proton/proton/scope.py +++ b/third_party/proton/proton/scope.py @@ -5,7 +5,7 @@ from .flags import get_profiling_on from triton._C.libproton import proton as libproton -_local = threading.local() +thread_local_scopes = threading.local() MetricValueType = Union[float, int] PropertyValueType = Union[float, int, str] @@ -22,7 +22,7 @@ class scope: foo[1,](x, y) ``` - decoarator: + decorator: ```python @proton.scope("test0", {metric_name: metric_value}) def foo(x, y): @@ -36,25 +36,25 @@ def foo(x, y): def __init__(self, name: str, metrics: Optional[dict[str, MetricValueType]] = None, properties: Optional[dict[str, PropertyValueType]] = None) -> None: - self._name = name - self._metrics = metrics - self._properties = properties + self.name = name + self.metrics = metrics + self.properties = properties def __enter__(self): if not get_profiling_on(): return self - self._id = libproton.record_scope() - libproton.enter_scope(self._id, self._name) - if self._metrics: - libproton.add_metrics(self._id, self._metrics) - if self._properties: - libproton.set_properties(self._id, self._properties) + self.id = libproton.record_scope() + libproton.enter_scope(self.id, self.name) + if self.metrics: + libproton.add_metrics(self.id, self.metrics) + if self.properties: + libproton.set_properties(self.id, self.properties) return self def __exit__(self, exc_type, exc_value, traceback) -> None: if not get_profiling_on(): return - libproton.exit_scope(self._id, self._name) + libproton.exit_scope(self.id, self.name) def __call__(self, func): @@ -62,14 +62,14 @@ def __call__(self, func): def wrapper(*args, **kwargs): if get_profiling_on(): id = libproton.record_scope() - libproton.enter_scope(id, self._name) - if self._metrics: - libproton.add_metrics(id, self._metrics) - if self._properties: - libproton.set_properties(id, self._properties) + libproton.enter_scope(id, self.name) + if self.metrics: + libproton.add_metrics(id, self.metrics) + if self.properties: + libproton.set_properties(id, self.properties) ret = func(*args, **kwargs) if get_profiling_on(): - libproton.exit_scope(id, self._name) + libproton.exit_scope(id, self.name) return ret return wrapper @@ -80,9 +80,9 @@ def enter_scope(name: str, *, triton_op: bool = False, metrics: Optional[dict[st if not get_profiling_on(): return -1 id = libproton.record_scope() - if not hasattr(_local, "scopes"): - _local.scopes = [] - _local.scopes.append((id, name)) + if not hasattr(thread_local_scopes, "scopes"): + thread_local_scopes.scopes = [] + thread_local_scopes.scopes.append((id, name)) if triton_op: libproton.enter_op(id, name) else: @@ -97,7 +97,7 @@ def enter_scope(name: str, *, triton_op: bool = False, metrics: Optional[dict[st def exit_scope(triton_op: bool = False) -> int: if not get_profiling_on(): return -1 - id, name = _local.scopes.pop() + id, name = thread_local_scopes.scopes.pop() if triton_op: libproton.exit_op(id, name) else: diff --git a/third_party/proton/proton/viewer.py b/third_party/proton/proton/viewer.py index f77a65007fc7..456bd7a4b5f1 100644 --- a/third_party/proton/proton/viewer.py +++ b/third_party/proton/proton/viewer.py @@ -43,7 +43,7 @@ def get_min_time_flops(df, device_info): num_sms = device_info[device_type][device_index]["num_sms"] clock_rate = device_info[device_type][device_index]["clock_rate"] for width in TritonHook.flops_width: - idx = df["DeviceId"] == device_index + idx = df["device_id"] == device_index device_frames = df[idx] if f"flops{width}" not in device_frames.columns: continue @@ -72,7 +72,7 @@ def get_min_time_bytes(df, device_info): min_time_bytes = pd.DataFrame(0.0, index=df.index, columns=["min_time"]) for device_type in device_info: for device_index in device_info[device_type]: - idx = df["DeviceId"] == device_index + idx = df["device_id"] == device_index device_frames = df[idx] memory_clock_rate = device_info[device_type][device_index]["memory_clock_rate"] # in khz bus_width = device_info[device_type][device_index]["bus_width"] # in bits @@ -105,7 +105,7 @@ def get_min_time_bytes(df, device_info): def derive_metrics(gf, metrics, raw_metrics, device_info): derived_metrics = [] original_metrics = [] - internal_frame_indices = gf.dataframe["DeviceId"].isna() + internal_frame_indices = gf.dataframe["device_id"].isna() def get_time_seconds(df): time_metric_name = match_available_metrics([time_factor_dict.name], raw_metrics)[0] @@ -135,7 +135,7 @@ def get_time_seconds(df): derived_metrics.append(f"{metric} (inc)") elif metric in avg_time_factor_dict.factor: metric_time_unit = avg_time_factor_dict.name + "/" + metric.split("/")[1] - gf.dataframe[f"{metric} (inc)"] = (get_time_seconds(gf.dataframe) / gf.dataframe['Count'] / + gf.dataframe[f"{metric} (inc)"] = (get_time_seconds(gf.dataframe) / gf.dataframe['count'] / avg_time_factor_dict.factor[metric_time_unit]) gf.dataframe.loc[internal_frame_indices, f"{metric} (inc)"] = np.nan derived_metrics.append(f"{metric} (inc)") diff --git a/third_party/proton/test/example_cuda.json b/third_party/proton/test/example_cuda.json index 0db9ace447b8..445f0e224c65 100644 --- a/third_party/proton/test/example_cuda.json +++ b/third_party/proton/test/example_cuda.json @@ -8,10 +8,10 @@ "type": "function" }, "metrics": { - "Count": 10, - "DeviceId": "1", - "DeviceType": "CUDA", - "Time (ns)": 204800, + "count": 10, + "device_id": "1", + "device_type": "CUDA", + "time (ns)": 204800, "flops8": 1e11, "bytes": 1e8 } @@ -23,10 +23,10 @@ "type": "function" }, "metrics": { - "Count": 1, - "DeviceId": "0", - "DeviceType": "CUDA", - "Time (ns)": 204800, + "count": 1, + "device_id": "0", + "device_type": "CUDA", + "time (ns)": 204800, "flops8": 1e10, "bytes": 1e7 } @@ -37,8 +37,8 @@ "type": "function" }, "metrics": { - "Count": 0, - "Time (ns)": 0, + "count": 0, + "time (ns)": 0, "flops8": 0, "bytes": 0 } diff --git a/third_party/proton/test/example_frame.json b/third_party/proton/test/example_frame.json index 64789a3b743f..0069476fbc90 100644 --- a/third_party/proton/test/example_frame.json +++ b/third_party/proton/test/example_frame.json @@ -10,10 +10,10 @@ "type": "function" }, "metrics": { - "Count": 1, - "DeviceId": "0", - "DeviceType": "HIP", - "Time (ns)": 204800 + "count": 1, + "device_id": "0", + "device_type": "HIP", + "time (ns)": 204800 } } ], @@ -27,7 +27,12 @@ "frame": { "name": "test1" }, - "metrics": {} + "metrics": { + "count": 1, + "device_id": "0", + "device_type": "HIP", + "time (ns)": 204800 + } } ], "frame": { @@ -35,8 +40,8 @@ "type": "function" }, "metrics": { - "Count": 0, - "Time (ns)": 0 + "count": 0, + "time (ns)": 0 } }, { diff --git a/third_party/proton/test/example_hip.json b/third_party/proton/test/example_hip.json index 2fcfad3c5d05..68538706cfe9 100644 --- a/third_party/proton/test/example_hip.json +++ b/third_party/proton/test/example_hip.json @@ -8,10 +8,10 @@ "type": "function" }, "metrics": { - "Count": 1, - "DeviceId": "1", - "DeviceType": "HIP", - "Time (ns)": 204800, + "count": 1, + "device_id": "1", + "device_type": "HIP", + "time (ns)": 204800, "flops8": 1e11, "bytes": 1e8 } @@ -23,10 +23,10 @@ "type": "function" }, "metrics": { - "Count": 1, - "DeviceId": "0", - "DeviceType": "HIP", - "Time (ns)": 204800, + "count": 1, + "device_id": "0", + "device_type": "HIP", + "time (ns)": 204800, "flops8": 1e10, "bytes": 1e7 } @@ -37,8 +37,8 @@ "type": "function" }, "metrics": { - "Count": 0, - "Time (ns)": 0, + "count": 0, + "time (ns)": 0, "flops8": 0, "bytes": 0 } diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py index 1a69608a26e6..13cb9bd99cbe 100644 --- a/third_party/proton/test/test_profile.py +++ b/third_party/proton/test/test_profile.py @@ -25,7 +25,7 @@ def test_torch(context): if context == "shadow": assert len(data[0]["children"]) == 1 assert data[0]["children"][0]["frame"]["name"] == "test" - assert data[0]["children"][0]["children"][0]["metrics"]["Time (ns)"] > 0 + assert data[0]["children"][0]["children"][0]["metrics"]["time (ns)"] > 0 elif context == "python": assert len(data[0]["children"]) == 1 # The last frame is the torch kernel @@ -111,7 +111,7 @@ def fn(): assert len(test_frame["children"]) >= 2 else: assert len(test_frame["children"]) >= 3 - assert test_frame["children"][0]["metrics"]["Time (ns)"] > 0 + assert test_frame["children"][0]["metrics"]["time (ns)"] > 0 def test_metrics(): @@ -197,7 +197,41 @@ def foo(x, size: tl.constexpr, y): assert data[0]["children"][0]["frame"]["name"] == "test0" assert data[0]["children"][0]["children"][0]["frame"]["name"] == "foo_test_1ctas_1elems" assert data[0]["children"][0]["children"][0]["metrics"]["flops32"] == 1.0 - assert data[0]["children"][0]["children"][0]["metrics"]["Time (ns)"] > 0 + assert data[0]["children"][0]["children"][0]["metrics"]["time (ns)"] > 0 + + +def test_pcsampling(): + if is_hip(): + pytest.skip("HIP backend does not support pc sampling") + + import os + if os.environ.get("PROTON_SKIP_PC_SAMPLING_TEST", "0") == "1": + pytest.skip("PC sampling test is disabled") + + @triton.jit + def foo(x, y, size: tl.constexpr): + offs = tl.arange(0, size) + for _ in range(1000): + tl.store(y + offs, tl.load(x + offs)) + + with tempfile.NamedTemporaryFile(delete=True, suffix=".hatchet") as f: + proton.start(f.name.split(".")[0], hook="triton", backend="cupti_pcsampling") + with proton.scope("init"): + x = torch.ones((1024, ), device="cuda", dtype=torch.float32) + y = torch.zeros_like(x) + with proton.scope("test"): + foo[(1, )](x, y, x.size()[0], num_warps=4) + proton.finalize() + data = json.load(f) + init_frame = data[0]["children"][0] + test_frame = data[0]["children"][1] + # With line mapping + assert "foo" in test_frame["children"][0]["frame"]["name"] + assert test_frame["children"][0]["children"][0]["metrics"]["num_samples"] > 0 + assert "@" in test_frame["children"][0]["children"][0]["frame"]["name"] + # Without line mapping + assert "elementwise" in init_frame["children"][0]["frame"]["name"] + assert init_frame["children"][0]["metrics"]["num_samples"] > 0 def test_deactivate(): @@ -211,6 +245,6 @@ def test_deactivate(): proton.finalize() data = json.load(f) # Root shouldn't have device id - assert "DeviceId" not in data[0]["metrics"] + assert "device_id" not in data[0]["metrics"] assert len(data[0]["children"]) == 1 - assert "DeviceId" in data[0]["children"][0]["metrics"] + assert "device_id" in data[0]["children"][0]["metrics"] diff --git a/third_party/proton/test/test_viewer.py b/third_party/proton/test/test_viewer.py index c8343e12672e..998825bbc8b3 100644 --- a/third_party/proton/test/test_viewer.py +++ b/third_party/proton/test/test_viewer.py @@ -52,8 +52,8 @@ def test_min_time_flops(): with open(cuda_example_file, "r") as f: gf, _, device_info = get_raw_metrics(f) ret = get_min_time_flops(gf.dataframe, device_info) - device0_idx = gf.dataframe["DeviceId"] == "0" - device1_idx = gf.dataframe["DeviceId"] == "1" + device0_idx = gf.dataframe["device_id"] == "0" + device1_idx = gf.dataframe["device_id"] == "1" # sm89 np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[0.000025]], atol=1e-5) # sm90 @@ -61,8 +61,8 @@ def test_min_time_flops(): with open(hip_example_file, "r") as f: gf, _, device_info = get_raw_metrics(f) ret = get_min_time_flops(gf.dataframe, device_info) - device0_idx = gf.dataframe["DeviceId"] == "0" - device1_idx = gf.dataframe["DeviceId"] == "1" + device0_idx = gf.dataframe["device_id"] == "0" + device1_idx = gf.dataframe["device_id"] == "1" # MI200 np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[0.000026]], atol=1e-5) # MI300 @@ -73,8 +73,8 @@ def test_min_time_bytes(): with open(cuda_example_file, "r") as f: gf, _, device_info = get_raw_metrics(f) ret = get_min_time_bytes(gf.dataframe, device_info) - device0_idx = gf.dataframe["DeviceId"] == "0" - device1_idx = gf.dataframe["DeviceId"] == "1" + device0_idx = gf.dataframe["device_id"] == "0" + device1_idx = gf.dataframe["device_id"] == "1" # sm89 np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[9.91969e-06]], atol=1e-6) # sm90 @@ -82,8 +82,8 @@ def test_min_time_bytes(): with open(hip_example_file, "r") as f: gf, _, device_info = get_raw_metrics(f) ret = get_min_time_bytes(gf.dataframe, device_info) - device0_idx = gf.dataframe["DeviceId"] == "0" - device1_idx = gf.dataframe["DeviceId"] == "1" + device0_idx = gf.dataframe["device_id"] == "0" + device1_idx = gf.dataframe["device_id"] == "1" # MI200 np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[6.10351e-06]], atol=1e-6) # MI300 diff --git a/third_party/proton/tutorials/dynamic_net.py b/third_party/proton/tutorials/dynamic_net.py index a1a82b53e27e..5793bebd0939 100644 --- a/third_party/proton/tutorials/dynamic_net.py +++ b/third_party/proton/tutorials/dynamic_net.py @@ -85,13 +85,14 @@ def run(): argparser.add_argument("--profile", action="store_true") argparser.add_argument("--mode", default="torch", choices=["torch", "torchinductor"]) argparser.add_argument("--context", default="shadow", choices=["shadow", "python"]) +argparser.add_argument("--backend", default=None, choices=["cupti", "roctracer", "cupti_pcsampling"]) args = argparser.parse_args() mode = args.mode if args.profile: - func = proton.profile(run, name="dynamic_net", context=args.context) + func = proton.profile(run, name="dynamic_net", context=args.context, backend=args.backend) else: func = run