experimental: add test-to-harness conversion logic (#495)

Adds a fuzz harness heuristic that relies on converting existing tests. At this stage, it's done without relying on FI; we simply (1) find test files in the target project; (2) read them; (3) for each test file we use a simple prompt to convert it into a harness. At this stage, it already out-performs on some existing projects, e.g.: https://github.com/jkuhlmann/cgltf/blob/master/test/main.c In this case, we have a harness generated that looks quite nice: ```c // Heuristic: TestConverterPrompt :: Target: #include <stdlib.h> #include <stdint.h> #include <stdio.h> #include <string.h> #define CGLTF_IMPLEMENTATION #include "cgltf.h" extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { if (size < 1) { return 0; } cgltf_options options; memset(&options, 0, sizeof(cgltf_options)); cgltf_data* parsed_data = NULL; cgltf_result result; // Parse input data result = cgltf_parse(&options, data, size, &parsed_data); if (result == cgltf_result_success) { result = cgltf_validate(parsed_data); } if (result == cgltf_result_success) { // Use the parsed data in some way // For example, print file type and mesh count printf("Type: %u\n", parsed_data->file_type); printf("Meshes: %u\n", (unsigned)parsed_data->meshes_count); } cgltf_free(parsed_data); return 0; } ``` Ref: #494 --------- Signed-off-by: David Korczynski <[email protected]>
google · Jul 18, 2024 · 4309def · 4309def
1 parent 4e3798e
commit 4309def
Showing 1 changed file with 155 additions and 5 deletions.
diff --git a/experimental/c-cpp/manager.py b/experimental/c-cpp/manager.py
@@ -52,6 +52,14 @@ def setup_model(model: str):
   LLM_MODEL = model
 
 
+class Test:
+  """A test file found in the target repository (path plus contents)."""
+
+  def __init__(self, test_path, test_content):
+    # Both values are stored exactly as given by the caller.
+    self.test_path, self.test_content = test_path, test_content
+
+
 class AutogeneratedHarness:
   """Represents a generated harness and holds corresponding artifacts."""
 
@@ -79,6 +87,7 @@ def dump_build_and_harness(self, fuzzer_gen_dir: str) -> None:
     # Write so they can be build using `compile`
     with open(self.harness_path, 'w') as f:
       f.write(self.source_code)
+      f.write("\n//david")
     with open('/src/build.sh', 'w') as f:
       f.write(self.build_script)
 
@@ -169,9 +178,15 @@ def __init__(self, introspector_report: Dict[str, Any],
     self.introspector_report = introspector_report
     self.github_url = ''
 
-  @abstractmethod
-  def get_fuzzer_intrinsics(self, func) -> Dict[str, Any]:
-    """generates fuzzer source code, build and include directives."""
+  def get_fuzzer_intrinsics(self, func) -> Dict[str, Any]:  # pylint: disable=unused-argument
+    """Returns harness intrinsics for `func`; this default returns none."""
+    # Empty dict by default: heuristics that target functions override this.
+    return {}
+
+  def get_fuzzer_test_intrinsics(self, test_case: Test) -> Dict[str, Any]:  # pylint: disable=unused-argument
+    """Returns harness intrinsics derived from `test_case`; none by default."""
+    # Empty dict by default: test-converting heuristics override this.
+    return {}
 
   @abstractmethod
   def get_fuzzing_targets(self) -> List[Any]:
@@ -398,6 +413,80 @@ def get_fuzzer_intrinsics(self, func: Dict[str, Any]) -> Dict[str, Any]:
     return fuzzer_intrinsics
 
 
+class FuzzerGenHeuristicTestConverter(FuzzHeuristicGeneratorBase):
+  """Heuristic that prompts an LLM to turn a unit test into a harness."""
+  language = 'c'
+  name = 'TestConverterPrompt'
+
+  def __init__(self, introspector_report: Dict[str, Any],
+               all_header_files: List[str], test_dir: str):
+    super().__init__(introspector_report, all_header_files, test_dir)
+    self.introspector_report = introspector_report
+    self.all_header_files = all_header_files
+    self.github_url = ''
+
+  def get_fuzzing_targets(self) -> List:
+    """Returns no targets: this heuristic works from tests, not functions."""
+    return []
+
+  def get_fuzzer_test_intrinsics(self, test_case: Test) -> Dict[str, Any]:
+    """Returns the fuzzer intrinsics based on test conversion.
+
+    The returned dict holds 'full-source-code', 'build-command-includes',
+    'autogen-id' and 'prompt'.
+    """
+    (headers_to_include, _,
+     build_command_includes) = self.get_header_intrinsics()
+
+    # Include any weird macros defined that does not have any values. This
+    # was found empirically to be valuable.
+    macros_defined_in_test = []
+    for line in test_case.test_content.split('\n'):
+      if '#define' in line and len(line.split(' ')) == 2:
+        macros_defined_in_test.append(line)
+
+    logger.info('Converting test into harness: %s', test_case.test_path)
+    prompt = f'''I'm a security engineer looking to convert unit tests into
+fuzzing harnesses.
+
+The goal is to convert the following unit test into a fuzzing harness:
+
+```c
+TEST_SOURCE_CODE
+```
+
+The target library is {self.github_url}.
+
+Please write a fuzzing harness that is inspired by this unittest. You should write the fuzzing harness in
+a libFuzzer-style structure. This means the harness should use `int LLVMFuzzerTestOneInput`.
+
+Any macros defined in the test should also be included in the fuzz harness.
+
+There is one rule that your harness must satisfy: all of the header files in this library is {str(headers_to_include)}. Make sure to not include any header files not in this list.
+
+In your response, include *only* the code for the harness, nothing more. You should wrap the code in <code></code> tags.
+'''
+    prompt = prompt.replace('TEST_SOURCE_CODE', test_case.test_content)
+    self.log_prompt(prompt)
+
+    fuzzer_source = self.run_prompt_and_get_fuzzer_source(prompt)
+    total_fuzzer_source = f'// Heuristic: {self.name} :: Target: \n'
+    # Add any macros not already in the harness
+    for macro in macros_defined_in_test:
+      macro_name = macro.split(' ')[1]
+      if macro_name not in fuzzer_source:
+        total_fuzzer_source += macro + '\n'
+    total_fuzzer_source += FUZZER_PRE_HEADERS
+    total_fuzzer_source += fuzzer_source
+
+    return {
+        'full-source-code': total_fuzzer_source,
+        'build-command-includes': build_command_includes,
+        'autogen-id': f'{self.name}-{test_case.test_path}',
+        'prompt': prompt
+    }
+
+
 class FuzzerGenHeuristic5(FuzzHeuristicGeneratorBase):
   """Heuristic that provides context around target function."""
   language = 'c'
@@ -923,6 +1012,37 @@ def log_fuzzer_source(full_fuzzer_source: str):
   logger.info(harness_source_out)
 
 
+def get_tests_converted_to_harnesses(build_results, language, test_dir,
+                                     fuzzer_build_cmd, all_header_files,
+                                     all_test_scripts, github_url):
+  """Produces one AutogeneratedHarness per test file via test conversion."""
+  # Of the language defaults, only the harness target file is used here.
+  _, _, harness_file, _ = get_language_defaults(language)
+  # Output path handed to `-o` for every generated fuzzer.
+  binary_path = '/src/generated-fuzzer'
+
+  harnesses = []
+  for test_case in all_test_scripts:
+    # A fresh converter per test, created with an empty introspector report.
+    converter = FuzzerGenHeuristicTestConverter({}, all_header_files, test_dir)
+    converter.github_url = github_url
+    intrinsics = converter.get_fuzzer_test_intrinsics(test_case)
+
+    # Build script: the project's build steps, then the fuzzer compile line.
+    build_script = build_results[test_dir].build_script + '\n'
+    compile_cmd = ' '.join(fuzzer_build_cmd)
+    include_flags = intrinsics['build-command-includes']
+    build_script += f'{compile_cmd} {include_flags} -o {binary_path}'
+
+    harness = AutogeneratedHarness(build_script,
+                                   intrinsics['full-source-code'],
+                                   harness_file, binary_path, intrinsics,
+                                   language)
+    harnesses.append(harness)
+
+  return harnesses
+
+
 def generate_harness_intrinsics(
     heuristic: FuzzHeuristicGeneratorBase,
     results,
@@ -1256,6 +1376,26 @@ def get_heuristics_to_use() -> List[Type[FuzzHeuristicGeneratorBase]]:
   return heuristics_to_apply
 
 
+def get_all_test_scripts(target_source_path) -> List[Test]:
+  """Returns a Test for each C/C++ source file with 'test' in its path."""
+  # Every suffix carries its dot: a bare 'c' would also match e.g. 'foo.doc'.
+  test_extensions = ('.cc', '.cpp', '.cxx', '.c++', '.c')
+  all_tests = []
+  for file in get_all_files_in_path(target_source_path):
+    # Heuristic: some path component (directory or file name) has 'test'.
+    if not any('test' in part for part in file.split('/')):
+      continue
+    if not file.endswith(test_extensions):
+      continue
+    # Let's say this is a test
+    logger.info('Found test: %s', file)
+    # Best-effort read: undecodable bytes must not abort the whole scan.
+    with open(file, 'r', encoding='utf-8', errors='replace') as f:
+      file_content = f.read()
+    all_tests.append(Test(file, file_content))
+  return all_tests
+
+
 def auto_generate(github_url,
                   disable_testing_build_scripts=False,
                   disable_fuzzgen=False,
@@ -1324,6 +1464,9 @@ def auto_generate(github_url,
   folders_with_results = set()
   logger.info('Going through %d build results to generate fuzzers',
               len(build_results))
+
+  all_test_scripts = get_all_test_scripts(target_source_path)
+
   for test_dir, build_worker in build_results.items():
     logger.info('Checking build heuristic: %s',
                 build_worker.build_suggestion.heuristic_id)
@@ -1385,8 +1528,15 @@ def auto_generate(github_url,
       logger.info('Applying %s', heuristic.name)
 
       heuristic.github_url = github_url
-      harness_builds_to_validate = generate_harness_intrinsics(
-          heuristic, build_results, language, test_dir, fuzzer_build_cmd)
+      harness_builds_to_validate = []
+      harness_builds_to_validate.extend(
+          generate_harness_intrinsics(heuristic, build_results, language,
+                                      test_dir, fuzzer_build_cmd))
+
+      harness_builds_to_validate.extend(
+          get_tests_converted_to_harnesses(build_results, language, test_dir,
+                                           fuzzer_build_cmd, all_header_files,
+                                           all_test_scripts, github_url))
 
       # Build the fuzzer for each project
       logger.info('Fuzzer harnesses to evaluate: %d',