Make loading executables simpler in nvrtcc.

This change adds an output file created during build. It contains information required for launching on GPU.
NVIDIA · wmaxey · Oct 31, 2023 · Sep 12, 2023 · Sep 26, 2023 · Sep 29, 2023
commit 3648c645985310664e14189185d55191775ca3a4
@@ -11,6 +11,7 @@
 #include <stdio.h>
 
 #include <algorithm>
+#include <cassert>
 #include <deque>
 #include <functional>
 #include <regex>
@@ -32,11 +33,7 @@ std::string inputFile;
 bool building = false;
 bool execute = false;
 
-// Arch config is a set of unique pairs of strings and bools
-// e.x. { compute_arch, real_or_virtual }
-// { "sm_80", true } { "compute_80", false }
-using ArchList = std::set<ArchConfig>;
-ArchList buildList;
+ExecutionConfig executionConfig;
 
 enum ArgProcessorState {
     NORMAL,
@@ -101,14 +98,34 @@ ArgHandlerMap argHandlers {
                 return ABORT; // Unreachable
             }
     },
+    {
+        // Greed input file type flag
+        make_greedy_handler("^-x$")
+    },
+    {
+        // Matches for CUDA input type
+        std::regex("^-x ?cu$"),
+            [&](const std::smatch& match) {
+                ignoredArguments.emplace_back(match[0].str());
+                return NORMAL;
+            }
+    },
+    {
+        // Matches anything other than CUDA
+        std::regex("^-x ?(.*)$"),
+            [&](const std::smatch& match) {
+                // If we're building with something else just add the default arch
+                buildList.emplace(translate_gpu_arch(""));
+                return NORMAL;
+            }
+    },
     {
         // The include flag is improperly formatted, greed append
         make_greedy_handler("^-I$")
     },
     {
         std::regex("^-I ?(.+)$"),
             [&](const std::smatch& match) {
-                printf("Adding argument: %s\r\n", match[0].str().data());
                 nvrtcArguments.emplace_back(match[0].str());
                 return NORMAL;
             }
@@ -119,10 +136,9 @@ ArgHandlerMap argHandlers {
     {
         // Matches any force include or system include directories
         // Might need to figure out if we need to force include a file manually
-        std::regex("^(?=-include|-isystem) ?(.+)$"),
+        std::regex("^-include ?(.+)$"),
             [&](const std::smatch& match) {
-                printf("Argument ignored: %s\r\n", match[0].str().data());
-                ignoredArguments.emplace_back(match[0].str());
+                nvrtcArguments.emplace_back("--pre-include=" + match[1].str());
                 return NORMAL;
             }
     },
@@ -160,45 +176,68 @@ ArgHandlerMap argHandlers {
             }
     },
     {
-        // Throw away remaining arguments
-        std::regex("^-.+$"),
+        // Capture an argument that is just '-'. If no input file is listed input is on stdin
+        std::regex("^-$"),
             [&](const std::smatch& match) {
-                ignoredArguments.emplace_back(match[0].str());
+                inputFile = match[0].str();
                 return NORMAL;
             }
     },
     {
         // If an input lists a .gpu file, run that file instead
-        std::regex("^[^-].*.gpu$"),
+        std::regex("^([^-].*).gpu$"),
             [&](const std::smatch& match) {
                 execute = true;
-                inputFile = match[0].str();
+                executionConfig = ExecutionConfig {
+                    RunConfig{1, 0},
+                    {match[0].str()}
+                };
+
+                return NORMAL;
+            }
+    },
+    {
+        // If an input is a .exe file, search for other builds and run those
+        std::regex("^([^-].*).exe$"),
+            [&](const std::smatch& match) {
+                execute = true;
+                executionConfig = load_execution_config_from_file(match[1].str() + ".build.yml");
+                assert(executionConfig.builds.size());
                 return NORMAL;
             }
     },
     {
         // Capture any argument not starting with '-' as the input file
-        std::regex("^[^-].*$"),
+        std::regex("^([^-].+)[\\\\/].+$"),
             [&](const std::smatch& match) {
                 inputFile = match[0].str();
+                // Capture directory of input file as an include path
+                nvrtcArguments.emplace_back("-I " + match[1].str());
+                return NORMAL;
+            }
+    },
+    {
+        // Throw away remaining arguments
+        std::regex("^-.+$"),
+            [&](const std::smatch& match) {
+                ignoredArguments.emplace_back(match[0].str());
                 return NORMAL;
             }
     },
 };
 
 int main(int argc, char **argv) {
-    printf("First arg: %s\r\n", *argv);
     // Greedily take off first arg
     g_argc = argc-1;
     g_argv = argv+1;
 
-    ArgList args;
-    ArgProcessorState ps = NORMAL;
+    ArgProcessorState argState = NORMAL;
 
+    // Start by parsing arguments and building the configuration
     std::string c_arg{};
     for (auto a = g_argv; a < g_argv+g_argc; a++) {
         // If the argument was greedy, we'll retry with an appended argument
-        c_arg = (ps == GREEDY) ? c_arg + " " + *a : *a;
+        c_arg = (argState == GREEDY) ? c_arg + " " + *a : *a;
 
         for (auto& h: argHandlers) {
             auto& regex = h.first;
@@ -208,23 +247,23 @@ int main(int argc, char **argv) {
             std::regex_match(c_arg, matches, regex);
 
             if (matches.size()) {
-                ps = handler(matches);
+                argState = handler(matches);
                 break;
             }
         }
     }
 
-    printf("NVRTCC Configuration:\r\n");
-    printf("  Output dir: %s\r\n", outputDir.c_str());
-    printf("  Input file: %s\r\n", inputFile.c_str());
-    printf("  Building: %s\r\n", building ? "true" : "false");
-    printf("  Running: %s\r\n", execute ? "true" : "false");
+    fprintf(stderr, "NVRTCC Configuration:\r\n");
+    fprintf(stderr, "  Output dir: %s\r\n", outputDir.c_str());
+    fprintf(stderr, "  Input file: %s\r\n", inputFile.c_str());
+    fprintf(stderr, "  Building: %s\r\n", building ? "true" : "false");
+    fprintf(stderr, "  Executing: %s\r\n", execute ? "true" : "false");
 
     // Load the input file and execute
     if (execute) {
-        printf("Loading %s for execution\r\n", inputFile.c_str());
-        load_and_run_gpu_code(inputFile);
-        printf("Execution Passed\r\n");
+        fprintf(stderr, "Executing %s with %i threads\r\n", executionConfig.builds[0].c_str(), executionConfig.rc.threadCount);
+        load_and_run_gpu_code(executionConfig.builds[0], executionConfig.rc);
+        fprintf(stderr, "Execution Passed\r\n");
         return 0;
     }
 
@@ -234,12 +273,30 @@ int main(int argc, char **argv) {
     }
 
     // Rebuild the output file template based on the filename
-    std::string outputTemplate = outputDir+"/"+outputFile;
+    std::string outputTemplate;
+    if (outputDir.size() && outputFile.size())
+        outputTemplate = outputDir+"/"+outputFile;
+    else
+        outputTemplate = "temp";
+
+    // load input test file and prepend fakemain
+    std::string testCu = program + load_input_file(inputFile);
+
+    // Write any needed kernel launch data to file for later
+    RunConfig runConfig = parse_run_config(testCu);
+
+    std::ofstream ostr(outputTemplate + ".build.yml");
+    ostr << "cuda_thread_count: " << runConfig.threadCount << '\n';
+    ostr << "cuda_block_shmem_size: " << runConfig.shmemSize << '\n';
 
-    // Do a build for each arch
+    // Do a build for each arch and add it to the build list
+    ostr << "builds:\n";
     for (const auto& build : buildList) {
-        std::string outputPath = nvrtc_build_prog(inputFile, outputTemplate, build, nvrtcArguments);
+        ostr << "  - ";
+        ostr << '\'' << nvrtc_build_prog(testCu, outputTemplate, build, nvrtcArguments) << '\'';
+        ostr << '\n';
     }
+    ostr.close();
 
     return 0;
 }
@@ -16,37 +16,25 @@
 #include <stdio.h>
 
 #include <algorithm>
-#include <fstream>
 #include <memory>
 #include <string>
 #include <vector>
 
+// Arch configs are strings and bools determining architecture and ptx/sass compilation
 using ArchConfig = std::tuple<std::string, bool>;
-constexpr auto archString = [](const ArchConfig& a) {return std::get<0>(a);};
-constexpr auto isArchReal = [](const ArchConfig& a) {return std::get<1>(a);};
+constexpr auto archString = [](const ArchConfig& a) -> const auto& {return std::get<0>(a);};
+constexpr auto isArchReal = [](const ArchConfig& a) -> const auto& {return std::get<1>(a);};
 
-using ArgList = std::vector<std::string>;
-
-const char * program = R"program(
-__host__ __device__ int fake_main(int argc, char ** argv);
-#define main fake_main
+// Arch list is a set of unique pairs of strings and bools
+// e.x. { compute_arch, real_or_virtual }
+// { "sm_80", true } { "compute_80", false }
+using ArchList = std::set<ArchConfig>;
+ArchList buildList;
 
-// extern "C" to stop the name from being mangled
-extern "C" __global__ void main_kernel() {
-    fake_main(0, NULL);
-}
-)program";
+using ArgList = std::vector<std::string>;
 
 // Takes arguments for building a file and returns the path to the output file
-std::string nvrtc_build_prog(const std::string& input_file, const std::string& output_template, const ArchConfig& config, const ArgList& argList) {
-    std::ifstream istr(input_file);
-    std::string test_cu(
-        std::istreambuf_iterator<char>{istr},
-        std::istreambuf_iterator<char>{} );
-
-    // Prepend fakemain
-    test_cu = program + test_cu;
-
+std::string nvrtc_build_prog(const std::string& testCu, const std::string& outputTemplate, const ArchConfig& config, const ArgList& argList) {
     // Assemble arguments
     std::vector<const char*> optList;
 
@@ -57,18 +45,18 @@ std::string nvrtc_build_prog(const std::string& input_file, const std::string& o
 
     // Use the translated architecture
     std::string gpu_arch("--gpu-architecture=" + archString(config));
-    optList.emplace_back(gpu_arch.c_str());
+        optList.emplace_back(gpu_arch.c_str());
 
-    printf("NVRTC opt list:\r\n");
+    fprintf(stderr, "NVRTC opt list:\r\n");
     for (const auto& it: optList) {
-        printf("  %s\r\n", it);
+        fprintf(stderr, "  %s\r\n", it);
     }
 
-    printf ("Compiling program...\r\n");
+    fprintf(stderr, "Compiling program...\r\n");
     nvrtcProgram prog;
     NVRTC_SAFE_CALL(nvrtcCreateProgram(
         &prog,
-        test_cu.c_str(),
+        testCu.c_str(),
         "test.cu",
         0, NULL, NULL));
 
@@ -77,7 +65,7 @@ std::string nvrtc_build_prog(const std::string& input_file, const std::string& o
         optList.size(),
         optList.data());
 
-    printf ("Collecting logs...\r\n");
+    fprintf(stderr, "Collecting logs...\r\n");
     size_t log_size;
     NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &log_size));
 
@@ -92,8 +80,8 @@ std::string nvrtc_build_prog(const std::string& input_file, const std::string& o
     }
 
     size_t codeSize;
-
     std::unique_ptr<char[]> code{nullptr};
+
     if (isArchReal(config)) {
         NVRTC_SAFE_CALL(nvrtcGetCUBINSize(prog, &codeSize));
         code = std::unique_ptr<char[]>{new char[codeSize]};
@@ -106,12 +94,10 @@ std::string nvrtc_build_prog(const std::string& input_file, const std::string& o
     }
     NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
 
-    std::string output_file = output_template + "." + archString(config) + ".gpu";
-    printf("Writing output to: %s\r\n", output_file.c_str());
+    std::string output_file = outputTemplate + "." + archString(config) + ".gpu";
+    fprintf(stderr, "Writing output to: %s\r\n", output_file.c_str());
 
-    std::ofstream ostr(output_file, std::ios::binary);
-    ostr.write(code.get(), codeSize);
-    ostr.close();
+    write_output_file(code.get(), codeSize, output_file);
 
     return output_file;
 }
@@ -10,6 +10,12 @@
 
 #pragma once
 
+#include <fstream>
+#include <iostream>
+#include <regex>
+#include <string>
+#include <vector>
+
 #define NVRTC_SAFE_CALL(x)                                        \
   do {                                                            \
     nvrtcResult result = x;                                       \
@@ -41,3 +47,68 @@
       exit(1);                                                    \
     }                                                             \
   } while(0)
+
+
+static void write_output_file(const char* data, size_t datasz, const std::string& file) {
+    std::ofstream ostr(file, std::ios::binary);
+    assert(!!ostr);
+
+    ostr.write(data, datasz);
+    ostr.close();
+}
+
+static std::string load_input_file(const std::string& file) {
+    if (file == "-") {
+        return std::string(
+            std::istream_iterator<char>{std::cin},
+            std::istream_iterator<char>{} );
+    } else {
+        std::ifstream istr(file);
+        assert(!!istr);
+        return std::string(
+                std::istreambuf_iterator<char>{istr},
+                std::istreambuf_iterator<char>{} );
+    }
+}
+
+static int parse_int_assignment(const std::string& input, std::string var, int def) {
+    auto lineBegin = input.find(var);
+    auto lineEnd = input.find('\n', lineBegin);
+
+    if (lineBegin == std::string::npos || lineEnd == std::string::npos)
+        return def;
+
+    std::string line(input.begin()+lineBegin, input.begin()+lineEnd);
+    std::regex varRegex("^" + var + ".*([0-9]+);?$");
+    std::smatch match;
+    std::regex_match(line, match, varRegex);
+
+    if (match.size()) {
+        return std::stoi(match[1].str(), nullptr);
+    }
+
+    return def;
+}
+
+struct RunConfig {
+  int threadCount = 1;
+  int shmemSize = 0;
+};
+
+static RunConfig parse_run_config(const std::string& input) {
+    return RunConfig {
+        parse_int_assignment(input, "cuda_thread_count", 1),
+        parse_int_assignment(input, "cuda_block_shmem_size", 0),
+    };
+}
+
+// Fake main for adapting kernels
+static const char * program = R"program(
+__host__ __device__ int fake_main(int argc, char ** argv);
+#define main fake_main
+
+// extern "C" to stop the name from being mangled
+extern "C" __global__ void main_kernel() {
+    fake_main(0, NULL);
+}
+)program";
@@ -14,42 +14,77 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
+
+struct ExecutionConfig {
+  RunConfig rc;
+  std::vector<std::string> builds;
+};
+
+static ExecutionConfig load_execution_config_from_file(const std::string& file) {
+    std::vector<std::string> builds;
+    auto config = load_input_file(file);
+    std::regex config_regex("^ *- *'(.*gpu)'$");
+
+    fprintf(stderr, "Builds found: \r\n");
+
+    size_t line_begin = 0;
+    size_t line_end = config.find('\n');
+    while (line_end != std::string::npos) {
+        // Match any line with a .gpu file
+        // std::regex cannot handle multiline, so we need to make sure that's not included
+        std::string line(config.begin() + line_begin, config.begin() + line_end);
+        std::smatch match;
+        std::regex_match(line, match, config_regex);
+
+        if (match.size()) {
+            builds.emplace_back(match[1].str());
+        }
+
+        line_begin = line_end+1;
+        line_end = config.find('\n', line_begin);
+    }
+
+    return {
+      parse_run_config(config),
+      builds
+    };
+}
+
 static void list_devices()
 {
-    int device_count;
-    cudaGetDeviceCount(&device_count);
-    printf("CUDA devices found: %d\n", device_count);
+    int deviceCount;
+    cudaGetDeviceCount(&deviceCount);
+    fprintf(stderr, "CUDA devices found: %d\n", deviceCount);
 
-    int selected_device;
-    cudaGetDevice(&selected_device);
+    int selectedDevice;
+    cudaGetDevice(&selectedDevice);
 
-    for (int dev = 0; dev < device_count; ++dev)
+    for (int dev = 0; dev < deviceCount; ++dev)
     {
         cudaDeviceProp device_prop;
         cudaGetDeviceProperties(&device_prop, dev);
 
-        printf("Device %d: \"%s\", ", dev, device_prop.name);
-        if(dev == selected_device)
-            printf("Selected, ");
+        fprintf(stderr, "Device %d: \"%s\", ", dev, device_prop.name);
+        if(dev == selectedDevice)
+            fprintf(stderr, "Selected, ");
         else
-            printf("Unused, ");
+            fprintf(stderr, "Unused, ");
 
-        printf("SM%d%d, %zu [bytes]\n",
+        fprintf(stderr, "SM%d%d, %zu [bytes]\n",
             device_prop.major, device_prop.minor,
             device_prop.totalGlobalMem);
     }
 }
 
-static void load_and_run_gpu_code(const std::string input_file) {
-    std::ifstream istr(input_file, std::ios::binary);
+static void load_and_run_gpu_code(const std::string inputFile, const RunConfig& rc) {
+    std::ifstream istr(inputFile, std::ios::binary);
+    assert(!!istr);
+
     std::vector<char> code(
         std::istreambuf_iterator<char>{istr},
         std::istreambuf_iterator<char>{} );
     istr.close();
 
-    unsigned int thread_count = 1;
-    unsigned int shmem_size = 1;
-
     CUdevice cuDevice;
     CUcontext context;
     CUmodule module;
@@ -62,8 +97,8 @@ static void load_and_run_gpu_code(const std::string input_file) {
     CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "main_kernel"));
     CUDA_SAFE_CALL(cuLaunchKernel(kernel,
         1, 1, 1,
-        thread_count, 1, 1,
-        shmem_size,
+        rc.threadCount, 1, 1,
+        rc.shmemSize,
         NULL,
         NULL, 0));