test-to-harness: initial setup (#511)

Ref: #494
Some more comments on this PR are in #511 (comment).
---------
Signed-off-by: David Korczynski <[email protected]>
google · Aug 2, 2024 · 5b5ee46 · 5b5ee46
1 parent 5a0781d
commit 5b5ee46
Show file tree

Hide file tree

Showing 11 changed files with 379 additions and 68 deletions.
diff --git a/benchmark-sets/from-test-small/krb5.yaml b/benchmark-sets/from-test-small/krb5.yaml
@@ -0,0 +1,26 @@
+"is_test_benchmark": true
+"language": "c"
+"project": "krb5"
+"target_name": "fuzz_gss"
+"target_path": "/src/krb5/src/tests/fuzzing/fuzz_gss.c"
+"test_files":
+- "test_file_path": "//src/krb5/src/tests/gssapi/t_namingexts.c"
+- "test_file_path": "//src/krb5/src/tests/icinterleave.c"
+- "test_file_path": "//src/krb5/src/tests/gssapi/t_gssexts.c"
+- "test_file_path": "//src/krb5/src/tests/s4u2self.c"
+- "test_file_path": "//src/krb5/src/tests/localauth.c"
+- "test_file_path": "//src/krb5/src/tests/misc/test_cxx_k5int.cpp"
+- "test_file_path": "//src/krb5/src/tests/gssapi/t_inq_cred.c"
+- "test_file_path": "//src/krb5/src/tests/t_inetd.c"
+- "test_file_path": "//src/krb5/src/tests/gssapi/t_inq_ctx.c"
+- "test_file_path": "//src/krb5/src/tests/rdreq.c"
+- "test_file_path": "//src/krb5/src/tests/misc/test_cxx_kadm5.cpp"
+- "test_file_path": "//src/krb5/src/tests/asn.1/krb5_encode_test.c"
+- "test_file_path": "//src/krb5/src/tests/misc/test_getpw.c"
+- "test_file_path": "//src/krb5/src/tests/gssapi/t_saslname.c"
+- "test_file_path": "//src/krb5/src/tests/asn.1/t_trval.c"
+- "test_file_path": "//src/krb5/src/tests/unlockiter.c"
+- "test_file_path": "//src/krb5/src/tests/hooks.c"
+- "test_file_path": "//src/krb5/src/tests/gssapi/t_spnego.c"
+- "test_file_path": "//src/krb5/src/tests/misc/test_nfold.c"
+- "test_file_path": "//src/krb5/src/tests/gssapi/t_ccselect.c"
diff --git a/benchmark-sets/from-test-small/liblouis.yaml b/benchmark-sets/from-test-small/liblouis.yaml
@@ -0,0 +1,21 @@
+"is_test_benchmark": true
+"language": "c"
+"project": "liblouis"
+"target_name": "fuzz_translate_generic"
+"target_path": "/src/liblouis/tests/fuzzing/fuzz_translate_generic.c"
+"test_files":
+- "test_file_path": "//src/liblouis/tests/hyphenate_xxx.c"
+- "test_file_path": "//src/liblouis/tests/checkTable.c"
+- "test_file_path": "//src/liblouis/tests/typeform.c"
+- "test_file_path": "//src/liblouis/tests/check_metadata.c"
+- "test_file_path": "//src/liblouis/tests/charToFallbackDots.c"
+- "test_file_path": "//src/liblouis/tests/findTable.c"
+- "test_file_path": "//src/liblouis/tests/typeform_for_emphclass.c"
+- "test_file_path": "//src/liblouis/tests/resolve_table.c"
+- "test_file_path": "//src/liblouis/tests/suggestChunks.c"
+- "test_file_path": "//src/liblouis/tests/hash_collision.c"
+- "test_file_path": "//src/liblouis/tests/attributeNames.c"
+- "test_file_path": "//src/liblouis/tests/logging.c"
+- "test_file_path": "//src/liblouis/tests/getTable.c"
+- "test_file_path": "//src/liblouis/tests/check_ueb_test_data.c"
+- "test_file_path": "//src/liblouis/tests/emphclass.c"
diff --git a/benchmark-sets/from-test-small/libraw.yaml b/benchmark-sets/from-test-small/libraw.yaml
@@ -0,0 +1,19 @@
+"is_test_benchmark": true
+"language": "c++"
+"project": "libraw"
+"target_name": "libraw_fuzzer"
+"target_path": "/src/libraw_fuzzer.cc"
+"test_files":
+- "test_file_path": "//src/libraw/samples/dcraw_half.c"
+- "test_file_path": "//src/libraw/samples/half_mt_win32.c"
+- "test_file_path": "//src/libraw/samples/simple_dcraw.cpp"
+- "test_file_path": "//src/libraw/samples/raw-identify.cpp"
+- "test_file_path": "//src/libraw/samples/unprocessed_raw.cpp"
+- "test_file_path": "//src/libraw/samples/mem_image_sample.cpp"
+- "test_file_path": "//src/libraw/samples/postprocessing_benchmark.cpp"
+- "test_file_path": "//src/libraw/samples/openbayer_sample.cpp"
+- "test_file_path": "//src/libraw/samples/half_mt.c"
+- "test_file_path": "//src/libraw/samples/multirender_test.cpp"
+- "test_file_path": "//src/libraw/samples/rawtextdump.cpp"
+- "test_file_path": "//src/libraw/samples/dcraw_emu.cpp"
+- "test_file_path": "//src/libraw/samples/4channels.cpp"
diff --git a/benchmark-sets/from-test-small/libsodium.yaml b/benchmark-sets/from-test-small/libsodium.yaml
@@ -0,0 +1,26 @@
+"is_test_benchmark": true
+"language": "c++"
+"project": "libsodium"
+"target_name": "secretbox_easy_fuzzer"
+"target_path": "/src/secretbox_easy_fuzzer.cc"
+"test_files":
+- "test_file_path": "//src/libsodium/test/default/box2.c"
+- "test_file_path": "//src/libsodium/test/default/sodium_core.c"
+- "test_file_path": "//src/libsodium/test/default/stream2.c"
+- "test_file_path": "//src/libsodium/test/default/scalarmult_ristretto255.c"
+- "test_file_path": "//src/libsodium/test/default/onetimeauth2.c"
+- "test_file_path": "//src/libsodium/test/default/auth6.c"
+- "test_file_path": "//src/libsodium/test/default/hash3.c"
+- "test_file_path": "//src/libsodium/test/default/secretbox_easy2.c"
+- "test_file_path": "//src/libsodium/test/default/chacha20.c"
+- "test_file_path": "//src/libsodium/test/default/secretbox.c"
+- "test_file_path": "//src/libsodium/test/default/box_seal.c"
+- "test_file_path": "//src/libsodium/test/default/keygen.c"
+- "test_file_path": "//src/libsodium/test/default/core3.c"
+- "test_file_path": "//src/libsodium/test/default/pwhash_scrypt_ll.c"
+- "test_file_path": "//src/libsodium/test/default/verify1.c"
+- "test_file_path": "//src/libsodium/test/default/auth2.c"
+- "test_file_path": "//src/libsodium/test/default/core1.c"
+- "test_file_path": "//src/libsodium/test/default/aead_xchacha20poly1305.c"
+- "test_file_path": "//src/libsodium/test/default/secretbox2.c"
+- "test_file_path": "//src/libsodium/test/default/box_easy.c"
diff --git a/data_prep/introspector.py b/data_prep/introspector.py
@@ -57,6 +57,7 @@
 INTROSPECTOR_ORACLE_EASY_PARAMS = ''
 INTROSPECTOR_ORACLE_ALL_JVM_PUBLIC_CANDIDATES = ''
 INTROSPECTOR_ORACLE_OPTIMAL = ''
+INTROSPECTOR_ORACLE_ALL_TESTS = ''
 INTROSPECTOR_FUNCTION_SOURCE = ''
 INTROSPECTOR_PROJECT_SOURCE = ''
 INTROSPECTOR_XREF = ''
@@ -81,6 +82,7 @@ def get_oracle_dict() -> Dict[str, Any]:
       'easy-params-far-reach': query_introspector_for_easy_param_targets,
       'jvm-public-candidates': query_introspector_jvm_all_public_candidates,
       'optimal-targets': query_introspector_for_optimal_targets,
+      'test-migration': query_introspector_for_tests,
   }
   return oracle_dict
 
@@ -96,7 +98,8 @@ def set_introspector_endpoints(endpoint):
       INTROSPECTOR_ORACLE_ALL_JVM_PUBLIC_CANDIDATES, \
       INTROSPECTOR_ALL_JVM_SOURCE_PATH, INTROSPECTOR_ORACLE_OPTIMAL, \
       INTROSPECTOR_HEADERS_FOR_FUNC, \
-      INTROSPECTOR_FUNCTION_WITH_MATCHING_RETURN_TYPE
+      INTROSPECTOR_FUNCTION_WITH_MATCHING_RETURN_TYPE, \
+      INTROSPECTOR_ORACLE_ALL_TESTS
 
   INTROSPECTOR_ENDPOINT = endpoint
 
@@ -127,6 +130,7 @@ def set_introspector_endpoints(endpoint):
       f'{INTROSPECTOR_ENDPOINT}/all-project-source-files')
   INTROSPECTOR_FUNCTION_WITH_MATCHING_RETURN_TYPE = (
       f'{INTROSPECTOR_ENDPOINT}/function-with-matching-return-type')
+  INTROSPECTOR_ORACLE_ALL_TESTS = f'{INTROSPECTOR_ENDPOINT}/project-tests'
 
 
 def _construct_url(api: str, params: dict) -> str:
@@ -201,6 +205,14 @@ def _get_data(resp: Optional[requests.Response], key: str,
   return default_value
 
 
+def query_introspector_for_tests(project: str) -> list[str]:
+  """Queries the introspector for the project's test files ([] by default)."""
+  resp = _query_introspector(INTROSPECTOR_ORACLE_ALL_TESTS, {
+      'project': project,
+  })
+  return _get_data(resp, 'test-file-list', [])
+
+
 def query_introspector_oracle(project: str, oracle_api: str) -> list[dict]:
   """Queries a fuzz target oracle API from Fuzz Introspector."""
   resp = _query_introspector(
@@ -678,10 +690,49 @@ def _select_functions_from_oracles(project: str, limit: int,
   return [all_functions[func] for func in selected_singatures]
 
 
+def populate_benchmarks_using_test_migration(
+    project: str, language: str, limit: int) -> list[benchmarklib.Benchmark]:
+  """Returns up to `limit` test-file benchmarks that share one fuzz harness."""
+  harnesses, _ = project_src.search_source(project, [], language)
+  harness = pick_one(harnesses)
+  if not harness:
+    logger.error('No fuzz target found in project %s.', project)
+    return []
+  logger.info('Using harness path %s', harness)
+  potential_benchmarks = []
+  test_files = query_introspector_for_tests(project)
+  for test_file in test_files:
+    potential_benchmarks.append(
+        benchmarklib.Benchmark(benchmark_id='cli',  # same id for all files
+                               project=project,
+                               language=language,
+                               function_signature='test-file',
+                               function_name='test-file',
+                               return_type='test',
+                               params=[],
+                               exceptions=[],
+                               is_jvm_static=False,
+                               target_path=harness,
+                               preferred_target_name='',
+                               is_test_benchmark=True,
+                               test_file_path=test_file))
+  return potential_benchmarks[:limit]
+
+
 def populate_benchmarks_using_introspector(project: str, language: str,
                                            limit: int,
                                            target_oracles: List[str]):
   """Populates benchmark YAML files from the data from FuzzIntrospector."""
+
+  # If any target oracle names test-migration, do only that oracle selection,
+  # because its benchmarks have a different .yaml structure.
+  # TODO(David): clean up benchmark code to make it more flexible for varying
+  # forms of target selectors, and potentially mixing both types of target
+  # selectors.
+  for target_oracle in target_oracles:
+    if 'test-migration' in target_oracle:
+      return populate_benchmarks_using_test_migration(project, language, limit)
+
   if language == 'jvm':
     functions = _select_functions_from_jvm_oracles(project, limit,
                                                    target_oracles)
@@ -703,11 +754,8 @@ def populate_benchmarks_using_introspector(project: str, language: str,
         for function in functions
     ]
 
-  result = project_src.search_source(project, filenames, language)
-  if not result:
-    return []
-
-  harnesses, interesting = result
+  harnesses, interesting = project_src.search_source(project, filenames,
+                                                     language)
   harness = pick_one(harnesses)
   if not harness:
     logger.error('No fuzz target found in project %s.', project)

diff --git a/experiment/benchmark.py b/experiment/benchmark.py
@@ -48,23 +48,26 @@ def to_yaml(cls, benchmarks: list[Benchmark], outdir: str = './'):
     # Register the custom representer
     yaml.add_representer(str, quoted_string_presenter)
     result = {
-        'project':
-            benchmarks[0].project,
-        'language':
-            benchmarks[0].language,
-        'target_path':
-            benchmarks[0].target_path,
-        'target_name':
-            benchmarks[0].target_name,
-        'functions': [{
-            'signature': b.function_signature,
-            'name': b.function_name,
-            'return_type': b.return_type,
-            'params': b.params,
-            'exceptions': b.exceptions,
-            'is_jvm_static': b.is_jvm_static,
-        } for b in benchmarks],
+        'project': benchmarks[0].project,
+        'language': benchmarks[0].language,
+        'target_path': benchmarks[0].target_path,
+        'target_name': benchmarks[0].target_name,
+        'is_test_benchmark': benchmarks[0].is_test_benchmark,
     }
+    if benchmarks[0].is_test_benchmark:
+      result['test_files'] = [{
+          'test_file_path': b.test_file_path
+      } for b in benchmarks]
+    else:
+      result['functions'] = [{
+          'signature': b.function_signature,
+          'name': b.function_name,
+          'return_type': b.return_type,
+          'params': b.params,
+          'exceptions': b.exceptions,
+          'is_jvm_static': b.is_jvm_static,
+      } for b in benchmarks]
+
     with open(os.path.join(outdir, f'{benchmarks[0].project}.yaml'),
               'w') as file:
       yaml.dump(result, file, default_flow_style=False, width=sys.maxsize)
@@ -83,32 +86,61 @@ def from_yaml(cls, benchmark_path: str) -> List:
     cppify_headers = data.get('cppify_headers', False)
     commit = data.get('commit')
     functions = data.get('functions', [])
-    for function in functions:
-      # Long raw_function_names (particularly for c++ projects) may exceed
-      # filesystem limits on file path/name length when creating WorkDir.
-      max_len = os.pathconf('/', 'PC_NAME_MAX') - len('output-')
-      # Docker tag name cannot exceed 127 characters, and will be suffixed by
-      # '<sample-id>-experiment'.
-      docker_name_len = 127 - len('-03-experiment')
-      max_len = min(max_len, docker_name_len)
-      truncated_id = f'{project_name}-{function.get("name")}'[:max_len]
-      benchmarks.append(
-          cls(truncated_id.lower(),
-              data['project'],
-              data['language'],
-              function.get('signature'),
-              function.get('name'),
-              function.get('return_type'),
-              function.get('params'),
-              function.get('exceptions', []),
-              function.get('is_jvm_static', False),
-              data['target_path'],
-              data.get('target_name'),
-              use_project_examples=use_project_examples,
-              cppify_headers=cppify_headers,
-              commit=commit,
-              use_context=use_context,
-              function_dict=function))
+
+    is_test_benchmark = data.get('is_test_benchmark', False)
+    test_files = data.get('test_files', [])
+    if is_test_benchmark:
+      for test_file in test_files:
+        max_len = os.pathconf('/', 'PC_NAME_MAX') - len('output-')
+        test_file_path = test_file.get('test_file_path')
+        normalized_test_path = test_file_path.replace("/",
+                                                      "_").replace(".", "_")
+        truncated_id = f'{project_name}-{normalized_test_path}'[:max_len]
+
+        benchmarks.append(
+            cls(
+                truncated_id.lower(),
+                data['project'],
+                data['language'],
+                '',
+                '',
+                '',
+                [],
+                [],
+                False,
+                data['target_path'],
+                data.get('target_name', ''),
+                is_test_benchmark=True,
+                test_file_path=test_file_path,
+            ))
+    else:
+      # function type benchmark
+      for function in functions:
+        # Long raw_function_names (particularly for c++ projects) may exceed
+        # filesystem limits on file path/name length when creating WorkDir.
+        max_len = os.pathconf('/', 'PC_NAME_MAX') - len('output-')
+        # Docker tag name cannot exceed 127 characters, and will be suffixed by
+        # '<sample-id>-experiment'.
+        docker_name_len = 127 - len('-03-experiment')
+        max_len = min(max_len, docker_name_len)
+        truncated_id = f'{project_name}-{function.get("name")}'[:max_len]
+        benchmarks.append(
+            cls(truncated_id.lower(),
+                data['project'],
+                data['language'],
+                function.get('signature'),
+                function.get('name'),
+                function.get('return_type'),
+                function.get('params'),
+                function.get('exceptions', []),
+                function.get('is_jvm_static', False),
+                data['target_path'],
+                data.get('target_name'),
+                use_project_examples=use_project_examples,
+                cppify_headers=cppify_headers,
+                commit=commit,
+                use_context=use_context,
+                function_dict=function))
 
     return benchmarks
 
@@ -128,7 +160,9 @@ def __init__(self,
                cppify_headers=False,
                use_context=False,
                commit=None,
-               function_dict: Optional[dict] = None):
+               function_dict: Optional[dict] = None,
+               is_test_benchmark: bool = False,
+               test_file_path: str = ''):
     self.id = benchmark_id
     self.project = project
     self.language = language
@@ -145,6 +179,8 @@ def __init__(self,
     self.use_context = use_context
     self.cppify_headers = cppify_headers
     self.commit = commit
+    self.test_file_path = test_file_path
+    self.is_test_benchmark = is_test_benchmark
 
     if self.language == 'jvm':
       # For java projects, in order to differentiate between overloaded methods

diff --git a/experiment/evaluator.py b/experiment/evaluator.py
@@ -207,6 +207,7 @@ def run_log_path(self, generated_target_name: str):
 
   def create_ossfuzz_project(self, name: str, target_file: str) -> str:
     """Creates an OSS-Fuzz project with the generated target."""
+    logger.info(f'target file: {target_file}')
     generated_project_path = os.path.join(oss_fuzz_checkout.OSS_FUZZ_DIR,
                                           'projects', name)
     if os.path.exists(generated_project_path):