diff --git a/examples/manipulation-demo.launch.py b/examples/manipulation-demo.launch.py
index 35720a6af..ee28c9b44 100644
--- a/examples/manipulation-demo.launch.py
+++ b/examples/manipulation-demo.launch.py
@@ -51,7 +51,6 @@ def generate_launch_description():
     launch_robotic_manipulation = Node(
         package="robotic_manipulation",
         executable="robotic_manipulation",
-        # name="robotic_manipulation_node",
         output="screen",
         parameters=[
             {"use_sim_time": True},
diff --git a/poetry.lock b/poetry.lock
index 9eb39f9fa..335612128 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -5703,7 +5703,7 @@ url = "src/rai_asr"
 [[package]]
 name = "rai-bench"
 version = "0.1.0"
-description = ""
+description = "Package for running and creating benchmarks."
 optional = false
 python-versions = "^3.10"
 files = []
@@ -8331,4 +8331,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10, <3.13"
-content-hash = "c5469635a5db79c258554ad9f4e49331515940e406fbf912822651a0e0c33dda"
+content-hash = "d943b786f2bb8dddc9249475409a4d7c9c4b0a77041611c039431df55ad94000"
diff --git a/pyproject.toml b/pyproject.toml
index a3edd3730..92bcfa7c7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,7 @@ requests = "^2.32.2"
 pre-commit = "^3.7.0"
 openai = "^1.23.3"
 coloredlogs = "^15.0.1"
+opencv-python = "^4.9.0.80"
 markdown = "^3.6"
 boto3 = "^1.34.98"
 tqdm = "^4.66.4"
@@ -62,7 +63,6 @@ pytest-timeout = "^2.3.1"
 tomli-w = "^1.1.0"
 faster-whisper = "^1.1.1"
 pydub = "^0.25.1"
-opencv-python = "^4.11.0.86"

 [tool.poetry.group.dev.dependencies]
 ipykernel = "^6.29.4"
diff --git a/src/rai_bench/README.md b/src/rai_bench/README.md
index e2abd3d4f..3db5d04ba 100644
--- a/src/rai_bench/README.md
+++ b/src/rai_bench/README.md
@@ -8,33 +8,29 @@ The RAI Bench is a package including benchmarks and providing frame for creating
 Frame components can be found in `src/rai_bench/rai_bench/benchmark_model.py`

-- `Task` - abstract class for creating specific task. It introduces helper funtions that make it easier to calculate metrics/scores. Your custom tasks must implement a prompt got agent to do, a way to calculate a result and a validation if given scene config suits the task.
--
-- `Scenario` - class defined by a Scene and Task. Can be created manually like:
+- `Task`
+- `Scenario`
+- `Benchmark`

-    ```python
-
-    ```
-
-- `Benchmark` - class responsible for running and logging scenarios.
+For more information about these classes, see `src/rai_bench/rai_bench/benchmark_model.py`.

 ### O3DE TEST BENCHMARK

-O3DE Test Benchmark (src/rai_bench/rai_bench/o3de_test_bench/), contains 2 Tasks(tasks/) - GrabCarrotTask and PlaceCubesTask (these tasks implement calculating scores) and 4 scene_configs(configs/) for O3DE robotic arm simulation.
+O3DE Test Benchmark (`src/rai_bench/rai_bench/o3de_test_bench/`) contains 2 Tasks (`tasks/`) - GrabCarrotTask and PlaceCubesTask, which implement the score calculation - and 4 scene configs (`configs/`) for the O3DE robotic arm simulation.

 Both tasks calculate score, taking into consideration 4 values:

-- initially_misplaced_now_correct - when the object which was in the incorrect place at the start, is in a correct place at the end
-- initially_misplaced_still_incorrect - when the object which was in the incorrect place at the start, is in a incorrect place at the end
-- initially_correct_still_correct - when the object which was in the correct place at the start, is in a correct place at the end
-- initially_correct_now_incorrect - when the object which was in the correct place at the start, is in a incorrect place at the end
+- initially_misplaced_now_correct
+- initially_misplaced_still_incorrect
+- initially_correct_still_correct
+- initially_correct_now_incorrect

 The result is a value between 0 and 1, calculated like (initially_misplaced_now_correct + initially_correct_still_correct) / number_of_initial_objects.
 This score is calculated at the beggining and at the end of each scenario.

 ### Example usage

-Example of how to load scenes, define scenarios and run benchmark can be found in `src/rai_bench/rai_bench/benchmark_main.py`
+An example of how to load scenes, define scenarios and run the benchmark can be found in `src/rai_bench/rai_bench/examples/o3de_test_benchmark.py`.

 Scenarios can be loaded manually like:
@@ -56,5 +52,3 @@ scenarios = Benchmark.create_scenarios(
 ```

 which will result in list of scenarios with combination of every possible task and scene(task decides if scene config is suitable for it).
-
-Both approaches can be found in `main.py`
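Reviewer note: to make the scoring rule from the README above concrete, here is a minimal sketch of the formula it describes (the function name and the example counts are illustrative, not part of rai_bench):

```python
def task_score(
    initially_misplaced_now_correct: int,
    initially_correct_still_correct: int,
    number_of_initial_objects: int,
) -> float:
    """Fraction of objects that end up in a correct place: a value in [0, 1]."""
    return (
        initially_misplaced_now_correct + initially_correct_still_correct
    ) / number_of_initial_objects


# e.g. 4 objects: 2 misplaced ones were fixed, 1 stayed correct, 1 was knocked loose
print(task_score(2, 1, 4))  # 0.75
```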
+ """ def __init__( self, @@ -154,7 +160,9 @@ def __init__( class Benchmark: """ - Defined by a set of scenarios to be done + Benchmark represents a set of Scenarios to be executed and evaluated. + It manages the execution, logs results, and provides functionality + for tracking and exporting performance metrics. """ def __init__( @@ -162,16 +170,20 @@ def __init__( simulation_bridge: SimulationBridge[SimulationConfigT], scenarios: List[Scenario[SimulationConfigT]], logger: loggers_type | None = None, + results_filename: str = "benchmark_results.csv", ) -> None: self.simulation_bridge = simulation_bridge self.num_of_scenarios = len(scenarios) self.scenarios = enumerate(iter(scenarios)) self.results: List[Dict[str, Any]] = [] + self.results_filename = results_filename if logger: self._logger = logger else: self._logger = logging.getLogger(__name__) + self._initialize_results_file() + @classmethod def create_scenarios( cls, @@ -198,6 +210,23 @@ def create_scenarios( ) return scenarios + def _initialize_results_file(self): + """Initialize the CSV file with headers.""" + fieldnames = [ + "task", + "simulation_config", + "initial_score", + "final_score", + "total_time", + "number_of_tool_calls", + ] + + with open( + self.results_filename, mode="w", newline="", encoding="utf-8" + ) as file: + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + def run_next(self, agent) -> None: """ Runs the next scenario @@ -251,40 +280,36 @@ def run_next(self, agent) -> None: f"TASK SCORE: {result}, TOTAL TIME: {total_time:.3f}, NUM_OF_TOOL_CALLS: {tool_calls_num}" ) - self.results.append( - { - "task": scenario.task.get_prompt(), - "simulation_config": scenario.simulation_config_path, - "initial_score": initial_result, - "final_score": result, - "total_time": f"{total_time:.3f}", - "number_of_tool_calls": tool_calls_num, - } - ) + scenario_result: Dict[str, Any] = { + "task": scenario.task.get_prompt(), + "simulation_config": scenario.simulation_config_path, + "initial_score": initial_result, + "final_score": result, + "total_time": f"{total_time:.3f}", + "number_of_tool_calls": tool_calls_num, + } + self.results.append(scenario_result) + self._save_scenario_result_to_csv(scenario_result) except StopIteration: print("No more scenarios left to run.") - def get_results(self) -> List[Dict[str, Any]]: - return self.results - - def dump_results_to_csv(self, filename: str) -> None: - if not self.results: - self._logger.warning("No results to save.") # type: ignore - return - + def _save_scenario_result_to_csv(self, result: Dict[str, Any]) -> None: + """Save a single scenario result to the CSV file.""" fieldnames = [ "task", - "initial_score", "simulation_config", + "initial_score", "final_score", "total_time", "number_of_tool_calls", ] - with open(filename, mode="w", newline="", encoding="utf-8") as file: + with open( + self.results_filename, mode="a", newline="", encoding="utf-8" + ) as file: writer = csv.DictWriter(file, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.results) + writer.writerow(result) - self._logger.info(f"Results saved to {filename}") # type: ignore + def get_results(self) -> List[Dict[str, Any]]: + return self.results diff --git a/src/rai_bench/rai_bench/main.py b/src/rai_bench/rai_bench/examples/o3de_test_benchmark.py similarity index 98% rename from src/rai_bench/rai_bench/main.py rename to src/rai_bench/rai_bench/examples/o3de_test_benchmark.py index 7875c92c4..bfa81d0bc 100644 --- a/src/rai_bench/rai_bench/main.py +++ 
diff --git a/src/rai_bench/rai_bench/main.py b/src/rai_bench/rai_bench/examples/o3de_test_benchmark.py
similarity index 98%
rename from src/rai_bench/rai_bench/main.py
rename to src/rai_bench/rai_bench/examples/o3de_test_benchmark.py
index 7875c92c4..bfa81d0bc 100644
--- a/src/rai_bench/rai_bench/main.py
+++ b/src/rai_bench/rai_bench/examples/o3de_test_benchmark.py
@@ -163,6 +163,7 @@
     simulation_bridge=o3de,
     scenarios=scenarios,
     logger=bench_logger,
+    results_filename="src/rai_bench/rai_bench/results.csv",
 )
 for i, s in enumerate(scenarios):
     agent = create_conversational_agent(
@@ -180,7 +181,6 @@
 bench_logger.info("===============================================================")
 bench_logger.info("ALL SCENARIOS DONE. BENCHMARK COMPLETED!")
 bench_logger.info("===============================================================")
-benchmark.dump_results_to_csv(filename="src/rai_bench/rai_bench/results.csv")

 connector.shutdown()
 o3de.shutdown()
diff --git a/src/rai_bench/rai_bench/o3de_test_bench/tasks/grab_carrot_task.py b/src/rai_bench/rai_bench/o3de_test_bench/tasks/grab_carrot_task.py
index ca040fd62..56203f194 100644
--- a/src/rai_bench/rai_bench/o3de_test_bench/tasks/grab_carrot_task.py
+++ b/src/rai_bench/rai_bench/o3de_test_bench/tasks/grab_carrot_task.py
@@ -59,10 +59,10 @@ def calculate_result(
         else:
             self.logger.debug(f"initial positions: {initial_carrots}")  # type: ignore
             self.logger.debug(f"current positions: {final_carrots}")  # type: ignore
-            for ini_carrot in initial_carrots:
+            for initial_carrot in initial_carrots:
                 for final_carrot in final_carrots:
-                    if ini_carrot.name == final_carrot.name:
-                        initial_y = ini_carrot.pose.translation.y
+                    if initial_carrot.name == final_carrot.name:
+                        initial_y = initial_carrot.pose.translation.y
                         final_y = final_carrot.pose.translation.y
                         # NOTE the specific coords that refer to for example
                         # middle of the table can differ across simulations,
@@ -90,7 +90,7 @@
                         break
                 else:
                     raise EntitiesMismatchException(
-                        f"Entity with name: {ini_carrot.name} which was present in initial scene, not found in final scene."
+                        f"Entity with name: {initial_carrot.name}, which was present in the initial scene, was not found in the final scene."
                     )

         self.logger.info(  # type: ignore
diff --git a/src/rai_bench/rai_bench/o3de_test_bench/tasks/place_cubes_task.py b/src/rai_bench/rai_bench/o3de_test_bench/tasks/place_cubes_task.py
index 26bdd590e..9ad03c9b2 100644
--- a/src/rai_bench/rai_bench/o3de_test_bench/tasks/place_cubes_task.py
+++ b/src/rai_bench/rai_bench/o3de_test_bench/tasks/place_cubes_task.py
@@ -61,19 +61,19 @@ def calculate_result(
             )
         else:
-            ini_poses = [cube.pose for cube in initial_cubes]
+            initial_poses = [cube.pose for cube in initial_cubes]
             final_poses = [cube.pose for cube in final_cubes]
             # NOTE the specific coords that refer to for example
             # middle of the table can differ across simulations,
             # take that into consideration
             self.logger.debug(f"initial positions: {initial_cubes}")
             self.logger.debug(f"current positions: {final_cubes}")
-            for i, ini_cube in enumerate(initial_cubes):
-                for j, final_cube in enumerate(final_cubes):
-                    if ini_cube.name == final_cube.name:
+            for initial_cube in initial_cubes:
+                for final_cube in final_cubes:
+                    if initial_cube.name == final_cube.name:
                         was_adjacent_initially = self.is_adjacent_to_any(
-                            ini_cube.pose,
-                            [p for p in ini_poses if p != ini_cube.pose],
+                            initial_cube.pose,
+                            [p for p in initial_poses if p != initial_cube.pose],
                             0.15,
                         )
                         is_adjacent_finally = self.is_adjacent_to_any(
@@ -93,7 +93,7 @@
                         break
                 else:
                     raise EntitiesMismatchException(
-                        f"Entity with name: {ini_cube.name} which was present in initial scene, not found in final scene."
+                        f"Entity with name: {initial_cube.name}, which was present in the initial scene, was not found in the final scene."
                     )

         self.logger.info(
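Reviewer note: with `results_filename` moved into the constructor, the CSV header is written up front and each `run_next` call appends one row, so partial results survive an interrupted run and the final `dump_results_to_csv` call is no longer needed. A rough usage sketch consistent with the example script above (bridge, scenario, and agent setup elided; names mirror the diff):

```python
benchmark = Benchmark(
    simulation_bridge=o3de,
    scenarios=scenarios,
    logger=bench_logger,
    results_filename="src/rai_bench/rai_bench/results.csv",  # header written immediately
)
for _ in range(benchmark.num_of_scenarios):
    benchmark.run_next(agent)  # appends one CSV row per completed scenario
```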
diff --git a/src/rai_core/rai/agents/tool_runner.py b/src/rai_core/rai/agents/tool_runner.py
index 5c35ac9a8..12e0889d3 100644
--- a/src/rai_core/rai/agents/tool_runner.py
+++ b/src/rai_core/rai/agents/tool_runner.py
@@ -69,13 +69,8 @@ def run_one(call: ToolCall):
             ts = time.perf_counter()
             output = self.tools_by_name[call["name"]].invoke(call, config)  # type: ignore
             te = time.perf_counter() - ts
-            tool_output_log = (
-                str(output.content)[:1000] + "..."
-                if len(str(output.content)) > 1000
-                else ""
-            )
             self.logger.info(
-                f"Tool {call['name']} completed in {te:.2f} seconds. Tool output: {tool_output_log}"
+                f"Tool {call['name']} completed in {te:.2f} seconds. Tool output: {str(output.content)[:100]}{'...' if len(str(output.content)) > 100 else ''}"
             )
             self.logger.debug(
                 f"Tool {call['name']} output: \n\n{str(output.content)}"
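Reviewer note: the removed `tool_output_log` helper also had a subtle bug: for outputs of 1000 characters or fewer, its `else ""` branch logged an empty string instead of the output itself. The new inline expression truncates at 100 characters and appends an ellipsis only when something was actually cut. The same pattern as a standalone snippet (the `truncate` helper is ours, purely for illustration):

```python
def truncate(text: str, limit: int = 100) -> str:
    """Cut text to `limit` characters, appending '...' only if something was dropped."""
    return text[:limit] + ("..." if len(text) > limit else "")


assert truncate("short") == "short"
assert truncate("x" * 150) == "x" * 100 + "..."
```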