Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create NVRTCC, a utility for running tests under NVRTC #494

Merged
merged 31 commits into from
Oct 31, 2023
Merged
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
fc98833
Add documentation
wmaxey Sep 12, 2023
3197bee
Replace the NVRTC utils with NVRTCC
wmaxey Sep 26, 2023
a5c2e39
Hack nvrtcc into lit.
wmaxey Sep 29, 2023
3648c64
Make loading executables simpler in nvrtcc.
wmaxey Sep 29, 2023
f9a5101
Delete tests we don't support.
wmaxey Sep 29, 2023
7484c9d
Make tests that previously did not support nvrtc, support nvrtc.
wmaxey Sep 29, 2023
3f590ea
Grammar fixup to README
wmaxey Sep 29, 2023
f8a81ad
Grammar fixup to README
wmaxey Sep 29, 2023
6591fce
Clarity fixup to nvrtcc.cpp
wmaxey Sep 29, 2023
3e24c96
Make nvrtcc work on Linux too
wmaxey Sep 29, 2023
8814b1b
Fix greedy parsing of cuda thread count in nvrtc builds
wmaxey Sep 29, 2023
1fb765f
Plug NVRTCC into the nvrtc CI build
wmaxey Sep 29, 2023
349858e
Fixup for every other build config except NVRTC
wmaxey Sep 29, 2023
0c8a72d
Fix a .fail test that failed for the wrong reason.
wmaxey Sep 29, 2023
fef2f37
Fix dumpMacros and dumpVersion on MSVC in lit config
wmaxey Sep 29, 2023
05cd387
Mark more tests as supportable by NVRTC.
wmaxey Oct 20, 2023
e08150f
XFAIL mdspan tests that aren't triggering static_assertions.
wmaxey Oct 20, 2023
68e4032
Fix tests that are missing `__host__ __device__`
wmaxey Oct 20, 2023
63e57d9
xfail/unsupport several NVRTC tests
wmaxey Oct 20, 2023
759fac9
Support more .fail tests with NVRTC.
wmaxey Oct 20, 2023
33416aa
Make nvrtcc respect nul for output.
wmaxey Oct 20, 2023
6bb01fb
Replace <cassert> with <cuda/std/cassert>
wmaxey Oct 20, 2023
7450a64
Make more time tests pass on nvrtc
wmaxey Oct 20, 2023
dd417f5
Support a couple more tests for NVRTC.
wmaxey Oct 20, 2023
22a1ca3
Make an nvrtcc target for CI
wmaxey Oct 20, 2023
bb48b9f
Fix a few warnings and cleanup nvrtc.
wmaxey Oct 21, 2023
6f055fb
Fix mixup with cl and clang.
wmaxey Oct 23, 2023
709aeaa
Merge branch 'main' into fea/nvrtcc
wmaxey Oct 25, 2023
46e5604
Merge branch 'main' into fea/nvrtcc
wmaxey Oct 26, 2023
636d8b1
Merge branch 'main' into fea/nvrtcc
wmaxey Oct 29, 2023
265202e
Merge branch 'main' into fea/nvrtcc
wmaxey Oct 31, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Make loading executables simpler in nvrtcc.
This change adds an output file created during build.
It contains information required for launching on GPU.
wmaxey committed Oct 18, 2023

Verified

This commit was signed with the committer’s verified signature.
wmaxey Wesley Maxey
commit 3648c645985310664e14189185d55191775ca3a4
119 changes: 88 additions & 31 deletions libcudacxx/.upstream-tests/utils/nvidia/nvrtc/nvrtcc.cpp
Original file line number Diff line number Diff line change
@@ -11,6 +11,7 @@
#include <stdio.h>

#include <algorithm>
#include <cassert>
#include <deque>
#include <functional>
#include <regex>
@@ -32,11 +33,7 @@ std::string inputFile;
bool building = false;
bool execute = false;

// Arch config is a set of unique pairs of strings and bools
// e.x. { compute_arch, real_or_virtual }
// { "sm_80", true } { "compute_80", false }
using ArchList = std::set<ArchConfig>;
ArchList buildList;
ExecutionConfig executionConfig;

enum ArgProcessorState {
NORMAL,
@@ -101,14 +98,34 @@ ArgHandlerMap argHandlers {
return ABORT; // Unreachable
}
},
{
// Greed input file type flag
make_greedy_handler("^-x$")
},
{
// Matches for CUDA input type
std::regex("^-x ?cu$"),
[&](const std::smatch& match) {
ignoredArguments.emplace_back(match[0].str());
return NORMAL;
}
},
{
// Matches anything other than CUDA
std::regex("^-x ?(.*)$"),
[&](const std::smatch& match) {
// If we're building with something else just add the default arch
buildList.emplace(translate_gpu_arch(""));
return NORMAL;
}
},
{
// The include flag is improperly formatted, greed append
make_greedy_handler("^-I$")
},
{
std::regex("^-I ?(.+)$"),
[&](const std::smatch& match) {
printf("Adding argument: %s\r\n", match[0].str().data());
nvrtcArguments.emplace_back(match[0].str());
return NORMAL;
}
@@ -119,10 +136,9 @@ ArgHandlerMap argHandlers {
{
// Matches any force include or system include directories
// Might need to figure out if we need to force include a file manually
std::regex("^(?=-include|-isystem) ?(.+)$"),
std::regex("^-include ?(.+)$"),
[&](const std::smatch& match) {
printf("Argument ignored: %s\r\n", match[0].str().data());
ignoredArguments.emplace_back(match[0].str());
nvrtcArguments.emplace_back("--pre-include=" + match[1].str());
return NORMAL;
}
},
@@ -160,45 +176,68 @@ ArgHandlerMap argHandlers {
}
},
{
// Throw away remaining arguments
std::regex("^-.+$"),
// Capture an argument that is just '-'. If no input file is listed input is on stdin
std::regex("^-$"),
[&](const std::smatch& match) {
ignoredArguments.emplace_back(match[0].str());
inputFile = match[0].str();
return NORMAL;
}
},
{
// If an input lists a .gpu file, run that file instead
std::regex("^[^-].*.gpu$"),
std::regex("^([^-].*).gpu$"),
[&](const std::smatch& match) {
execute = true;
inputFile = match[0].str();
executionConfig = ExecutionConfig {
RunConfig{1, 0},
{match[0].str()}
};

return NORMAL;
}
},
{
// If an input is a .exe file, search for other builds and run those
std::regex("^([^-].*).exe$"),
[&](const std::smatch& match) {
execute = true;
executionConfig = load_execution_config_from_file(match[1].str() + ".build.yml");
assert(executionConfig.builds.size());
return NORMAL;
}
},
{
// Capture any argument not starting with '-' as the input file
std::regex("^[^-].*$"),
std::regex("^([^-].+)[\\\\/].+$"),
[&](const std::smatch& match) {
inputFile = match[0].str();
// Capture directory of input file as an include path
nvrtcArguments.emplace_back("-I " + match[1].str());
return NORMAL;
}
},
{
// Throw away remaining arguments
std::regex("^-.+$"),
[&](const std::smatch& match) {
ignoredArguments.emplace_back(match[0].str());
return NORMAL;
}
},
};

int main(int argc, char **argv) {
printf("First arg: %s\r\n", *argv);
// Greedily take off first arg
g_argc = argc-1;
g_argv = argv+1;

ArgList args;
ArgProcessorState ps = NORMAL;
ArgProcessorState argState = NORMAL;

// Start by parsing arguments and building the configuration
std::string c_arg{};
for (auto a = g_argv; a < g_argv+g_argc; a++) {
// If the argument was greedy, we'll retry with an appended argument
c_arg = (ps == GREEDY) ? c_arg + " " + *a : *a;
c_arg = (argState == GREEDY) ? c_arg + " " + *a : *a;

for (auto& h: argHandlers) {
auto& regex = h.first;
@@ -208,23 +247,23 @@ int main(int argc, char **argv) {
std::regex_match(c_arg, matches, regex);

if (matches.size()) {
ps = handler(matches);
argState = handler(matches);
break;
}
}
}

printf("NVRTCC Configuration:\r\n");
printf(" Output dir: %s\r\n", outputDir.c_str());
printf(" Input file: %s\r\n", inputFile.c_str());
printf(" Building: %s\r\n", building ? "true" : "false");
printf(" Running: %s\r\n", execute ? "true" : "false");
fprintf(stderr, "NVRTCC Configuration:\r\n");
fprintf(stderr, " Output dir: %s\r\n", outputDir.c_str());
fprintf(stderr, " Input file: %s\r\n", inputFile.c_str());
fprintf(stderr, " Building: %s\r\n", building ? "true" : "false");
fprintf(stderr, " Executing: %s\r\n", execute ? "true" : "false");

// Load the input file and execute
if (execute) {
printf("Loading %s for execution\r\n", inputFile.c_str());
load_and_run_gpu_code(inputFile);
printf("Execution Passed\r\n");
fprintf(stderr, "Executing %s with %i threads\r\n", executionConfig.builds[0].c_str(), executionConfig.rc.threadCount);
load_and_run_gpu_code(executionConfig.builds[0], executionConfig.rc);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: Instead of needing to pass the produced fatbin to nvrtcc in order to execute the program, couldn't nvrtcc just produce a binary that does this step for you?

Basically, it would store the fatbin in the binary, and in the main() it would do load_and_run_gpu_code(/*stored fatbin*/).

That way, nvrtcc would work even more like a regular compiler in that it outputs a normal binary that can be executed just like ./a.out.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is essentially what the old script did. It produced a binary with the raw test stored inside and compiled it during execution. Every test was recompiled into the same hosting application. It bought us the 'works like a binary thing' but failed for giving us negative testing with .fail.cpp tests.

It might be possible to do this. It would get rid of needing to use the PrefixExecutor() for tests. I'll make a PR after this one with the documentation in mind.

fprintf(stderr, "Execution Passed\r\n");
return 0;
}

@@ -234,12 +273,30 @@ int main(int argc, char **argv) {
}

// Rebuild the output file template based on the filename
std::string outputTemplate = outputDir+"/"+outputFile;
std::string outputTemplate;
if (outputDir.size() && outputFile.size())
outputTemplate = outputDir+"/"+outputFile;
else
outputTemplate = "temp";

// load input test file and prepend fakemain
std::string testCu = program + load_input_file(inputFile);

// Write any needed kernel launch data to file for later
RunConfig runConfig = parse_run_config(testCu);

std::ofstream ostr(outputTemplate + ".build.yml");
ostr << "cuda_thread_count: " << runConfig.threadCount << '\n';
ostr << "cuda_block_shmem_size: " << runConfig.shmemSize << '\n';

// Do a build for each arch
// Do a build for each arch and add it to the build list
ostr << "builds:\n";
for (const auto& build : buildList) {
std::string outputPath = nvrtc_build_prog(inputFile, outputTemplate, build, nvrtcArguments);
ostr << " - ";
ostr << '\'' << nvrtc_build_prog(testCu, outputTemplate, build, nvrtcArguments) << '\'';
ostr << '\n';
}
ostr.close();

return 0;
}
54 changes: 20 additions & 34 deletions libcudacxx/.upstream-tests/utils/nvidia/nvrtc/nvrtcc_build.h
Original file line number Diff line number Diff line change
@@ -16,37 +16,25 @@
#include <stdio.h>

#include <algorithm>
#include <fstream>
#include <memory>
#include <string>
#include <vector>

// Arch configs are strings and bools determining architecture and ptx/sass compilation
using ArchConfig = std::tuple<std::string, bool>;
constexpr auto archString = [](const ArchConfig& a) {return std::get<0>(a);};
constexpr auto isArchReal = [](const ArchConfig& a) {return std::get<1>(a);};
constexpr auto archString = [](const ArchConfig& a) -> const auto& {return std::get<0>(a);};
constexpr auto isArchReal = [](const ArchConfig& a) -> const auto& {return std::get<1>(a);};

using ArgList = std::vector<std::string>;

const char * program = R"program(
__host__ __device__ int fake_main(int argc, char ** argv);
#define main fake_main
// Arch list is a set of unique pairs of strings and bools
// e.x. { compute_arch, real_or_virtual }
// { "sm_80", true } { "compute_80", false }
using ArchList = std::set<ArchConfig>;
ArchList buildList;

// extern "C" to stop the name from being mangled
extern "C" __global__ void main_kernel() {
fake_main(0, NULL);
}
)program";
using ArgList = std::vector<std::string>;

// Takes arguments for building a file and returns the path to the output file
std::string nvrtc_build_prog(const std::string& input_file, const std::string& output_template, const ArchConfig& config, const ArgList& argList) {
std::ifstream istr(input_file);
std::string test_cu(
std::istreambuf_iterator<char>{istr},
std::istreambuf_iterator<char>{} );

// Prepend fakemain
test_cu = program + test_cu;

std::string nvrtc_build_prog(const std::string& testCu, const std::string& outputTemplate, const ArchConfig& config, const ArgList& argList) {
// Assemble arguments
std::vector<const char*> optList;

@@ -57,18 +45,18 @@ std::string nvrtc_build_prog(const std::string& input_file, const std::string& o

// Use the translated architecture
std::string gpu_arch("--gpu-architecture=" + archString(config));
optList.emplace_back(gpu_arch.c_str());
optList.emplace_back(gpu_arch.c_str());

printf("NVRTC opt list:\r\n");
fprintf(stderr, "NVRTC opt list:\r\n");
for (const auto& it: optList) {
printf(" %s\r\n", it);
fprintf(stderr, " %s\r\n", it);
}

printf ("Compiling program...\r\n");
fprintf(stderr, "Compiling program...\r\n");
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(
&prog,
test_cu.c_str(),
testCu.c_str(),
"test.cu",
0, NULL, NULL));

@@ -77,7 +65,7 @@ std::string nvrtc_build_prog(const std::string& input_file, const std::string& o
optList.size(),
optList.data());

printf ("Collecting logs...\r\n");
fprintf(stderr, "Collecting logs...\r\n");
size_t log_size;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &log_size));

@@ -92,8 +80,8 @@ std::string nvrtc_build_prog(const std::string& input_file, const std::string& o
}

size_t codeSize;

std::unique_ptr<char[]> code{nullptr};

if (isArchReal(config)) {
NVRTC_SAFE_CALL(nvrtcGetCUBINSize(prog, &codeSize));
code = std::unique_ptr<char[]>{new char[codeSize]};
@@ -106,12 +94,10 @@ std::string nvrtc_build_prog(const std::string& input_file, const std::string& o
}
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));

std::string output_file = output_template + "." + archString(config) + ".gpu";
printf("Writing output to: %s\r\n", output_file.c_str());
std::string output_file = outputTemplate + "." + archString(config) + ".gpu";
fprintf(stderr, "Writing output to: %s\r\n", output_file.c_str());

std::ofstream ostr(output_file, std::ios::binary);
ostr.write(code.get(), codeSize);
ostr.close();
write_output_file(code.get(), codeSize, output_file);

return output_file;
}
71 changes: 71 additions & 0 deletions libcudacxx/.upstream-tests/utils/nvidia/nvrtc/nvrtcc_common.h
Original file line number Diff line number Diff line change
@@ -10,6 +10,12 @@

#pragma once

#include <fstream>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

#define NVRTC_SAFE_CALL(x) \
do { \
nvrtcResult result = x; \
@@ -41,3 +47,68 @@
exit(1); \
} \
} while(0)


static void write_output_file(const char* data, size_t datasz, const std::string& file) {
std::ofstream ostr(file, std::ios::binary);
assert(!!ostr);

ostr.write(data, datasz);
ostr.close();
}

static std::string load_input_file(const std::string& file) {
if (file == "-") {
return std::string(
std::istream_iterator<char>{std::cin},
std::istream_iterator<char>{} );
} else {
std::ifstream istr(file);
assert(!!istr);
return std::string(
std::istreambuf_iterator<char>{istr},
std::istreambuf_iterator<char>{} );
}
}

static int parse_int_assignment(const std::string& input, std::string var, int def) {
auto lineBegin = input.find(var);
auto lineEnd = input.find('\n', lineBegin);

if (lineBegin == std::string::npos || lineEnd == std::string::npos)
return def;

std::string line(input.begin()+lineBegin, input.begin()+lineEnd);
std::regex varRegex("^" + var + ".*([0-9]+);?$");
std::smatch match;
std::regex_match(line, match, varRegex);

if (match.size()) {
return std::stoi(match[1].str(), nullptr);
}

return def;
}

struct RunConfig {
int threadCount = 1;
int shmemSize = 0;
};

static RunConfig parse_run_config(const std::string& input) {
return RunConfig {
parse_int_assignment(input, "cuda_thread_count", 1),
parse_int_assignment(input, "cuda_block_shmem_size", 0),
};
}

// Fake main for adapting kernels
static const char * program = R"program(
__host__ __device__ int fake_main(int argc, char ** argv);
#define main fake_main
// extern "C" to stop the name from being mangled
extern "C" __global__ void main_kernel() {
fake_main(0, NULL);
}
)program";
71 changes: 53 additions & 18 deletions libcudacxx/.upstream-tests/utils/nvidia/nvrtc/nvrtcc_run.h
Original file line number Diff line number Diff line change
@@ -14,42 +14,77 @@
#include <cuda.h>
#include <cuda_runtime.h>


struct ExecutionConfig {
RunConfig rc;
std::vector<std::string> builds;
};

static ExecutionConfig load_execution_config_from_file(const std::string& file) {
std::vector<std::string> builds;
auto config = load_input_file(file);
std::regex config_regex("^ *- *'(.*gpu)'$");

fprintf(stderr, "Builds found: \r\n");

size_t line_begin = 0;
size_t line_end = config.find('\n');
while (line_end != std::string::npos) {
// Match any line with a .gpu file
// std::regex cannot handle multiline, so we need to make sure that's not included
std::string line(config.begin() + line_begin, config.begin() + line_end);
std::smatch match;
std::regex_match(line, match, config_regex);

if (match.size()) {
builds.emplace_back(match[1].str());
}

line_begin = line_end+1;
line_end = config.find('\n', line_begin);
}

return {
parse_run_config(config),
builds
};
}

static void list_devices()
{
int device_count;
cudaGetDeviceCount(&device_count);
printf("CUDA devices found: %d\n", device_count);
int deviceCount;
cudaGetDeviceCount(&deviceCount);
fprintf(stderr, "CUDA devices found: %d\n", deviceCount);

int selected_device;
cudaGetDevice(&selected_device);
int selectedDevice;
cudaGetDevice(&selectedDevice);

for (int dev = 0; dev < device_count; ++dev)
for (int dev = 0; dev < deviceCount; ++dev)
{
cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, dev);

printf("Device %d: \"%s\", ", dev, device_prop.name);
if(dev == selected_device)
printf("Selected, ");
fprintf(stderr, "Device %d: \"%s\", ", dev, device_prop.name);
if(dev == selectedDevice)
fprintf(stderr, "Selected, ");
else
printf("Unused, ");
fprintf(stderr, "Unused, ");

printf("SM%d%d, %zu [bytes]\n",
fprintf(stderr, "SM%d%d, %zu [bytes]\n",
device_prop.major, device_prop.minor,
device_prop.totalGlobalMem);
}
}

static void load_and_run_gpu_code(const std::string input_file) {
std::ifstream istr(input_file, std::ios::binary);
static void load_and_run_gpu_code(const std::string inputFile, const RunConfig& rc) {
std::ifstream istr(inputFile, std::ios::binary);
assert(!!istr);

std::vector<char> code(
std::istreambuf_iterator<char>{istr},
std::istreambuf_iterator<char>{} );
istr.close();

unsigned int thread_count = 1;
unsigned int shmem_size = 1;

CUdevice cuDevice;
CUcontext context;
CUmodule module;
@@ -62,8 +97,8 @@ static void load_and_run_gpu_code(const std::string input_file) {
CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "main_kernel"));
CUDA_SAFE_CALL(cuLaunchKernel(kernel,
1, 1, 1,
thread_count, 1, 1,
shmem_size,
rc.threadCount, 1, 1,
rc.shmemSize,
NULL,
NULL, 0));