Skip to content

Commit

Permalink
experimental: add test-to-harness conversion logic (#495)
Browse files Browse the repository at this point in the history
Adds a fuzz harness heuristic that relies on converting existing tests.
At this stage, it's done without relying on FI; we simply (1) find test
files in the target project; (2) read them; and (3) for each test file
use a simple prompt to convert it into a harness.

At this stage, it already outperforms on some existing projects, e.g.:
https://github.com/jkuhlmann/cgltf/blob/master/test/main.c

In this case, we have a harness generated that looks quite nice:

```c
// Heuristic: TestConverterPrompt :: Target: 
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CGLTF_IMPLEMENTATION
#include "cgltf.h"

extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
    if (size < 1) {
        return 0;
    }

    cgltf_options options;
	memset(&options, 0, sizeof(cgltf_options));
	cgltf_data* parsed_data = NULL;
	cgltf_result result;

    // Parse input data
    result = cgltf_parse(&options, data, size, &parsed_data);

    if (result == cgltf_result_success) {
        result = cgltf_validate(parsed_data);
    }

    if (result == cgltf_result_success) {
        // Use the parsed data in some way
        // For example, print file type and mesh count
		printf("Type: %u\n", parsed_data->file_type);
		printf("Meshes: %u\n", (unsigned)parsed_data->meshes_count);
    }

    cgltf_free(parsed_data);

    return 0;
}
```

Ref: #494

---------

Signed-off-by: David Korczynski <[email protected]>
  • Loading branch information
DavidKorczynski authored Jul 18, 2024
1 parent 4e3798e commit 4309def
Showing 1 changed file with 155 additions and 5 deletions.
160 changes: 155 additions & 5 deletions experimental/c-cpp/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ def setup_model(model: str):
LLM_MODEL = model


class Test:
    """Pairs the path of a test file in a repository with its contents."""

    def __init__(self, test_path, test_content):
        # Both values are stored verbatim; no validation or normalisation.
        self.test_path, self.test_content = test_path, test_content


class AutogeneratedHarness:
"""Represents a generated harness and holds corresponding artifacts."""

Expand Down Expand Up @@ -79,6 +87,7 @@ def dump_build_and_harness(self, fuzzer_gen_dir: str) -> None:
# Write so they can be build using `compile`
with open(self.harness_path, 'w') as f:
f.write(self.source_code)
f.write("\n//david")
with open('/src/build.sh', 'w') as f:
f.write(self.build_script)

Expand Down Expand Up @@ -169,9 +178,15 @@ def __init__(self, introspector_report: Dict[str, Any],
self.introspector_report = introspector_report
self.github_url = ''

@abstractmethod
def get_fuzzer_intrinsics(self, func) -> Dict[str, Any]:
"""generates fuzzer source code, build and include directives."""
def get_fuzzer_intrinsics(self, func) -> Dict[str, Any]:  # pylint: disable=unused-argument
    """Returns fuzzer source code, build and include directives for `func`.

    This default implementation ignores `func` and yields an empty
    dictionary on every call.
    """
    no_intrinsics: Dict[str, Any] = {}
    return no_intrinsics

def get_fuzzer_test_intrinsics(self, test_case: Test) -> Dict[str, Any]:  # pylint: disable=unused-argument
    """Returns fuzzer intrinsics derived from an existing test case.

    This default implementation ignores `test_case` and yields an empty
    dictionary on every call.
    """
    no_intrinsics: Dict[str, Any] = {}
    return no_intrinsics

@abstractmethod
def get_fuzzing_targets(self) -> List[Any]:
Expand Down Expand Up @@ -398,6 +413,80 @@ def get_fuzzer_intrinsics(self, func: Dict[str, Any]) -> Dict[str, Any]:
return fuzzer_intrinsics


class FuzzerGenHeuristicTestConverter(FuzzHeuristicGeneratorBase):
    """Heuristic that converts an existing test file into a fuzzing harness.

    This heuristic exposes no fuzzing targets of its own
    (`get_fuzzing_targets` returns an empty list). Harnesses are produced one
    test file at a time through `get_fuzzer_test_intrinsics`.
    """
    language = 'c'
    name = 'TestConverterPrompt'

    def __init__(self, introspector_report: Dict[str, Any],
                 all_header_files: List[str], test_dir: str):
        super().__init__(introspector_report, all_header_files, test_dir)
        self.introspector_report = introspector_report
        self.all_header_files = all_header_files
        # Expected to be overwritten by the caller before prompting; it is
        # interpolated into the prompt as the target library.
        self.github_url = ''

    def get_fuzzing_targets(self) -> List:
        """Returns no targets: this heuristic is driven by test files."""
        return []

    @staticmethod
    def _valueless_macros(test_content: str) -> List[str]:
        """Returns `#define NAME` lines (macros without a value) in a test.

        Lines are tokenised on any whitespace, so indentation, tabs and CRLF
        line endings do not hide a macro or leak into its name. The first
        token must be exactly `#define`, which skips commented-out defines
        such as `//#define FOO`. Duplicates are dropped, keeping first-seen
        order.
        """
        macros: List[str] = []
        for line in test_content.splitlines():
            tokens = line.split()
            if len(tokens) != 2 or tokens[0] != '#define':
                continue
            normalised = f'#define {tokens[1]}'
            if normalised not in macros:
                macros.append(normalised)
        return macros

    def get_fuzzer_test_intrinsics(self, test_case: Test) -> Dict[str, Any]:
        """Returns the fuzzer intrinsics based on test conversion.

        Args:
            test_case: The test file (path and content) to convert.

        Returns:
            A dictionary with the keys `full-source-code` (harness source),
            `build-command-includes`, `autogen-id` and `prompt`.
        """
        (headers_to_include, _,
         build_command_includes) = self.get_header_intrinsics()

        # Include any macros defined without a value in the test. This was
        # found empirically to be valuable.
        macros_defined_in_test = self._valueless_macros(test_case.test_content)

        logger.info('Converting test to harness: %s', test_case.test_path)
        # The test source is substituted *after* the f-string is evaluated so
        # that braces in the C/C++ code are never treated as f-string fields.
        prompt = f'''I'm a security engineer looking to convert unit tests into
fuzzing harnesses.
The goal is to convert the following unit test into a fuzzing harness:
```c
TEST_SOURCE_CODE
```
The target library is {self.github_url}.
Please write a fuzzing harness that is inspired by this unittest. You should write the fuzzing harness in
a libFuzzer-style structure. This means the harness should use `int LLVMFuzzerTestOneInput`.
Any macros defined in the test should also be included in the fuzz harness.
There is one rule that your harness must satisfy: all of the header files in this library is {str(headers_to_include)}. Make sure to not include any header files not in this list.
In your response, include *only* the code for the harness, nothing more. You should wrap the code in <code></code> tags.
'''
        prompt = prompt.replace('TEST_SOURCE_CODE', test_case.test_content)
        self.log_prompt(prompt)

        fuzzer_source = self.run_prompt_and_get_fuzzer_source(prompt)
        comment_on_target = f'// Heuristic: {self.name} :: Target: \n'

        total_fuzzer_source = comment_on_target
        # Add any macros not already mentioned in the generated harness.
        for macro in macros_defined_in_test:
            macro_name = macro.split(' ')[1]
            if macro_name not in fuzzer_source:
                total_fuzzer_source += macro + '\n'

        total_fuzzer_source += FUZZER_PRE_HEADERS
        total_fuzzer_source += fuzzer_source

        fuzzer_intrinsics = {
            'full-source-code': total_fuzzer_source,
            'build-command-includes': build_command_includes,
            'autogen-id': f'{self.name}-{test_case.test_path}',
            'prompt': prompt
        }

        return fuzzer_intrinsics


class FuzzerGenHeuristic5(FuzzHeuristicGeneratorBase):
"""Heuristic that provides context around target function."""
language = 'c'
Expand Down Expand Up @@ -923,6 +1012,37 @@ def log_fuzzer_source(full_fuzzer_source: str):
logger.info(harness_source_out)


def get_tests_converted_to_harnesses(build_results, language, test_dir,
                                     fuzzer_build_cmd, all_header_files,
                                     all_test_scripts, github_url):
    """Builds one `AutogeneratedHarness` per entry of `all_test_scripts`.

    Each test is passed to a fresh `FuzzerGenHeuristicTestConverter`, and the
    resulting harness is paired with a build script made of the build script
    stored for `test_dir` followed by the fuzzer compile command.
    """
    _, _, harness_target_file, _ = get_language_defaults(language)

    def _convert(test_script):
        # One converter per test, created with an empty introspector report.
        converter = FuzzerGenHeuristicTestConverter({}, all_header_files,
                                                    test_dir)
        converter.github_url = github_url
        intrinsics = converter.get_fuzzer_test_intrinsics(test_script)

        # Generate a build script for compiling the fuzzer with ASAN.
        project_build = build_results[test_dir].build_script + '\n'
        output_binary = '/src/generated-fuzzer'
        compile_cmd = ' '.join(fuzzer_build_cmd)
        include_flags = intrinsics['build-command-includes']
        asan_build_script = (
            project_build +
            f'{compile_cmd} {include_flags} -o {output_binary}')

        return AutogeneratedHarness(asan_build_script,
                                    intrinsics['full-source-code'],
                                    harness_target_file, output_binary,
                                    intrinsics, language)

    return [_convert(test_script) for test_script in all_test_scripts]


def generate_harness_intrinsics(
heuristic: FuzzHeuristicGeneratorBase,
results,
Expand Down Expand Up @@ -1256,6 +1376,26 @@ def get_heuristics_to_use() -> List[Type[FuzzHeuristicGeneratorBase]]:
return heuristics_to_apply


def get_all_test_scripts(target_source_path) -> List[Test]:
    """Returns a list of the test files in the target source path.

    A file counts as a test when its path contains the substring `test` and
    its name ends with a C/C++ source extension. Files that cannot be read or
    decoded are logged and skipped rather than aborting the scan.

    Args:
        target_source_path: Path handed to `get_all_files_in_path`.

    Returns:
        One `Test` (path and text content) per matching, readable file.
    """
    # Every extension carries its leading dot. A bare 'c' would match any
    # file name that merely ends in the letter "c" (e.g. `.pyc`, `.inc`).
    test_extensions = ('.cc', '.cpp', '.cxx', '.c++', '.c')

    all_tests = []
    for file in get_all_files_in_path(target_source_path):
        # NOTE(review): this inspects every component of the path as
        # returned, so a checkout directory whose own name contains "test"
        # marks all of its files as tests -- confirm whether that is intended.
        if not any('test' in part for part in file.split('/')):
            continue
        if not file.endswith(test_extensions):
            continue
        # Let's say this is a test
        logger.info('Found test: %s', file)
        try:
            with open(file, 'r') as f:
                file_content = f.read()
        except (UnicodeDecodeError, OSError) as err:
            logger.warning('Skipping unreadable test %s: %s', file, err)
            continue
        all_tests.append(Test(file, file_content))
    return all_tests


def auto_generate(github_url,
disable_testing_build_scripts=False,
disable_fuzzgen=False,
Expand Down Expand Up @@ -1324,6 +1464,9 @@ def auto_generate(github_url,
folders_with_results = set()
logger.info('Going through %d build results to generate fuzzers',
len(build_results))

all_test_scripts = get_all_test_scripts(target_source_path)

for test_dir, build_worker in build_results.items():
logger.info('Checking build heuristic: %s',
build_worker.build_suggestion.heuristic_id)
Expand Down Expand Up @@ -1385,8 +1528,15 @@ def auto_generate(github_url,
logger.info('Applying %s', heuristic.name)

heuristic.github_url = github_url
harness_builds_to_validate = generate_harness_intrinsics(
heuristic, build_results, language, test_dir, fuzzer_build_cmd)
harness_builds_to_validate = []
harness_builds_to_validate.extend(
generate_harness_intrinsics(heuristic, build_results, language,
test_dir, fuzzer_build_cmd))

harness_builds_to_validate.extend(
get_tests_converted_to_harnesses(build_results, language, test_dir,
fuzzer_build_cmd, all_header_files,
all_test_scripts, github_url))

# Build the fuzzer for each project
logger.info('Fuzzer harnesses to evaluate: %d',
Expand Down

0 comments on commit 4309def

Please sign in to comment.