1 change: 0 additions & 1 deletion examples/manipulation-demo.launch.py
@@ -51,7 +51,6 @@ def generate_launch_description():
launch_robotic_manipulation = Node(
package="robotic_manipulation",
executable="robotic_manipulation",
# name="robotic_manipulation_node",
output="screen",
parameters=[
{"use_sim_time": True},
4 changes: 2 additions & 2 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -31,6 +31,7 @@ requests = "^2.32.2"
pre-commit = "^3.7.0"
openai = "^1.23.3"
coloredlogs = "^15.0.1"
opencv-python = "^4.9.0.80"
markdown = "^3.6"
boto3 = "^1.34.98"
tqdm = "^4.66.4"
@@ -62,7 +63,6 @@ pytest-timeout = "^2.3.1"
tomli-w = "^1.1.0"
faster-whisper = "^1.1.1"
pydub = "^0.25.1"
opencv-python = "^4.11.0.86"
[tool.poetry.group.dev.dependencies]
ipykernel = "^6.29.4"

26 changes: 10 additions & 16 deletions src/rai_bench/README.md
@@ -8,33 +8,29 @@ The RAI Bench is a package including benchmarks and providing frame for creating

Frame components can be found in `src/rai_bench/rai_bench/benchmark_model.py`

- `Task` - abstract class for creating a specific task. It introduces helper functions that make it easier to calculate metrics/scores. Your custom tasks must implement a prompt for the agent, a way to calculate a result, and a validation that a given scene config suits the task.
-
- `Scenario` - class defined by a Scene and Task. Can be created manually like:
- `Task`
- `Scenario`
- `Benchmark`

```python

```

- `Benchmark` - class responsible for running and logging scenarios.
For more information about these classes, see `src/rai_bench/rai_bench/benchmark_model.py`.
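
As a rough illustration, a custom task might look like the sketch below. The method names (`get_prompt`, `validate_config`, `calculate_result`) come from the `Task` docstring in `benchmark_model.py`; the constructor and the exact signatures are not shown there, so treat this as a hypothetical outline rather than working package code.

```python
from rai_bench.benchmark_model import Task


class MoveBallTask(Task):  # hypothetical task, for illustration only
    def get_prompt(self) -> str:
        # The instruction that will be passed to the agent.
        return "Move the ball to the left side of the table."

    def validate_config(self, simulation_config) -> bool:
        # Accept only scene configs that this task can actually score,
        # e.g. configs that contain a ball entity.
        return True

    def calculate_result(self, simulation_bridge) -> float:
        # Inspect the final scene via the simulation bridge and return
        # a score between 0.0 and 1.0.
        return 0.0
```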

### O3DE TEST BENCHMARK

O3DE Test Benchmark (src/rai_bench/rai_bench/o3de_test_bench/), contains 2 Tasks(tasks/) - GrabCarrotTask and PlaceCubesTask (these tasks implement calculating scores) and 4 scene_configs(configs/) for O3DE robotic arm simulation.
O3DE Test Benchmark (`src/rai_bench/rai_bench/o3de_test_bench/`) contains 2 Tasks (`tasks/`) - GrabCarrotTask and PlaceCubesTask (these tasks implement the score calculation) - and 4 scene configs (`configs/`) for the O3DE robotic arm simulation.

Both tasks calculate a score, taking into consideration 4 values:

- initially_misplaced_now_correct - when the object which was in an incorrect place at the start is in a correct place at the end
- initially_misplaced_still_incorrect - when the object which was in an incorrect place at the start is in an incorrect place at the end
- initially_correct_still_correct - when the object which was in a correct place at the start is in a correct place at the end
- initially_correct_now_incorrect - when the object which was in a correct place at the start is in an incorrect place at the end
- initially_misplaced_now_correct
- initially_misplaced_still_incorrect
- initially_correct_still_correct
- initially_correct_now_incorrect

The result is a value between 0 and 1, calculated as (initially_misplaced_now_correct + initially_correct_still_correct) / number_of_initial_objects.
This score is calculated at the beginning and at the end of each scenario.
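
As a minimal illustration of this formula (not code from the package):

```python
def score(initially_misplaced_now_correct: int,
          initially_correct_still_correct: int,
          number_of_initial_objects: int) -> float:
    # Fraction of objects that end up in a correct place, counting both
    # objects that were fixed during the run and objects that stayed correct.
    return (
        initially_misplaced_now_correct + initially_correct_still_correct
    ) / number_of_initial_objects
```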

### Example usage

Example of how to load scenes, define scenarios and run benchmark can be found in `src/rai_bench/rai_bench/benchmark_main.py`
An example of how to load scenes, define scenarios and run the benchmark can be found in `src/rai_bench/rai_bench/examples/o3de_test_benchmark.py`

Scenarios can be loaded manually like:

@@ -56,5 +52,3 @@ scenarios = Benchmark.create_scenarios(
```

which will result in a list of scenarios with every possible combination of task and scene (each task decides whether a given scene config is suitable for it).
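
A rough sketch of wiring these pieces together (the `Benchmark` constructor and `run_next` appear in `benchmark_model.py`; the bridge, scenario list, logger and agent below are only placeholders and their setup may differ):

```python
from rai_bench.benchmark_model import Benchmark

# Placeholders - in the example script these come from the O3DE bridge setup,
# Benchmark.create_scenarios(...) and a configured logger.
o3de = ...
scenarios = []
bench_logger = None

benchmark = Benchmark(
    simulation_bridge=o3de,
    scenarios=scenarios,
    logger=bench_logger,
    results_filename="results.csv",  # each finished scenario is appended to this CSV
)

for _ in scenarios:
    agent = ...  # e.g. a fresh conversational agent per scenario
    benchmark.run_next(agent)
```

Results are written to the CSV after each scenario finishes, so partial results survive an interrupted run.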

Both approaches can be found in `main.py`
2 changes: 1 addition & 1 deletion src/rai_bench/pyproject.toml
@@ -1,7 +1,7 @@
[tool.poetry]
name = "rai-bench"
version = "0.1.0"
description = ""
description = "Package for running and creating benchmarks."
authors = ["jmatejcz <[email protected]>"]
readme = "README.md"

89 changes: 57 additions & 32 deletions src/rai_bench/rai_bench/benchmark_model.py
@@ -34,16 +34,17 @@


class EntitiesMismatchException(Exception):
def __init__(self, message: str) -> None:
super().__init__(message)
pass


class Task(ABC):
"""
Task to perform.
Specific implementation should implement a way to calculate results.
Abstract provides utility functions for common calculations, that can be useful when
creating metrics
Abstract of a Task. Provides utility functions for common calculations
that can be helpful when creating metrics.
Specific child classes should implement:
- get_prompt method
- validate_config
- calculate_result
"""

def __init__(
@@ -57,6 +58,7 @@ def __init__(

@abstractmethod
def get_prompt(self) -> str:
"""Returns the task instruction - the prompt that will be passed to agent"""
pass

@abstractmethod
@@ -75,7 +77,8 @@ def calculate_result(
self, simulation_bridge: SimulationBridge[SimulationConfigT]
) -> float:
"""
Calculate result of the task
Calculates the result of the task, based on info retrieved from the simulation.
Should return a score between 0.0 and 1.0.
"""
pass

@@ -135,7 +138,10 @@ def count_adjacent(


class Scenario(Generic[SimulationConfigT]):
"""Single instances are run separatly by benchmark"""
"""
A Scenario is defined by a pair of a Task and a Simulation Config.
Each Scenario is executed separately by a Benchmark.
"""

def __init__(
self,
@@ -154,24 +160,30 @@ def __init__(

class Benchmark:
"""
Defined by a set of scenarios to be done
Benchmark represents a set of Scenarios to be executed and evaluated.
It manages the execution, logs results, and provides functionality
for tracking and exporting performance metrics.
"""

def __init__(
self,
simulation_bridge: SimulationBridge[SimulationConfigT],
scenarios: List[Scenario[SimulationConfigT]],
logger: loggers_type | None = None,
results_filename: str = "benchmark_results.csv",
) -> None:
self.simulation_bridge = simulation_bridge
self.num_of_scenarios = len(scenarios)
self.scenarios = enumerate(iter(scenarios))
self.results: List[Dict[str, Any]] = []
self.results_filename = results_filename
if logger:
self._logger = logger
else:
self._logger = logging.getLogger(__name__)

self._initialize_results_file()

@classmethod
def create_scenarios(
cls,
@@ -198,6 +210,23 @@ def create_scenarios(
)
return scenarios

def _initialize_results_file(self):
"""Initialize the CSV file with headers."""
fieldnames = [
"task",
"simulation_config",
"initial_score",
"final_score",
"total_time",
"number_of_tool_calls",
]

with open(
self.results_filename, mode="w", newline="", encoding="utf-8"
) as file:
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()

def run_next(self, agent) -> None:
"""
Runs the next scenario
@@ -251,40 +280,36 @@ def run_next(self, agent) -> None:
f"TASK SCORE: {result}, TOTAL TIME: {total_time:.3f}, NUM_OF_TOOL_CALLS: {tool_calls_num}"
)

self.results.append(
{
"task": scenario.task.get_prompt(),
"simulation_config": scenario.simulation_config_path,
"initial_score": initial_result,
"final_score": result,
"total_time": f"{total_time:.3f}",
"number_of_tool_calls": tool_calls_num,
}
)
scenario_result: Dict[str, Any] = {
"task": scenario.task.get_prompt(),
"simulation_config": scenario.simulation_config_path,
"initial_score": initial_result,
"final_score": result,
"total_time": f"{total_time:.3f}",
"number_of_tool_calls": tool_calls_num,
}
self.results.append(scenario_result)
self._save_scenario_result_to_csv(scenario_result)

except StopIteration:
print("No more scenarios left to run.")

def get_results(self) -> List[Dict[str, Any]]:
return self.results

def dump_results_to_csv(self, filename: str) -> None:
if not self.results:
self._logger.warning("No results to save.") # type: ignore
return

def _save_scenario_result_to_csv(self, result: Dict[str, Any]) -> None:
"""Save a single scenario result to the CSV file."""
fieldnames = [
"task",
"initial_score",
"simulation_config",
"initial_score",
"final_score",
"total_time",
"number_of_tool_calls",
]

with open(filename, mode="w", newline="", encoding="utf-8") as file:
with open(
self.results_filename, mode="a", newline="", encoding="utf-8"
) as file:
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(self.results)
writer.writerow(result)

self._logger.info(f"Results saved to {filename}") # type: ignore
def get_results(self) -> List[Dict[str, Any]]:
return self.results
@@ -163,6 +163,7 @@
simulation_bridge=o3de,
scenarios=scenarios,
logger=bench_logger,
results_filename="src/rai_bench/rai_bench/results.csv",
)
for i, s in enumerate(scenarios):
agent = create_conversational_agent(
Expand All @@ -180,7 +181,6 @@
bench_logger.info("===============================================================")
bench_logger.info("ALL SCENARIOS DONE. BENCHMARK COMPLETED!")
bench_logger.info("===============================================================")
benchmark.dump_results_to_csv(filename="src/rai_bench/rai_bench/results.csv")

connector.shutdown()
o3de.shutdown()
@@ -59,10 +59,10 @@ def calculate_result(
else:
self.logger.debug(f"initial positions: {initial_carrots}") # type: ignore
self.logger.debug(f"current positions: {final_carrots}") # type: ignore
for ini_carrot in initial_carrots:
for initial_carrot in initial_carrots:
for final_carrot in final_carrots:
if ini_carrot.name == final_carrot.name:
initial_y = ini_carrot.pose.translation.y
if initial_carrot.name == final_carrot.name:
initial_y = initial_carrot.pose.translation.y
final_y = final_carrot.pose.translation.y
# NOTE the specific coords that refer to for example
# middle of the table can differ across simulations,
@@ -90,7 +90,7 @@ def calculate_result(
break
else:
raise EntitiesMismatchException(
f"Entity with name: {ini_carrot.name} which was present in initial scene, not found in final scene."
f"Entity with name: {initial_carrot.name} which was present in initial scene, not found in final scene."
)

self.logger.info( # type: ignore
@@ -61,19 +61,19 @@ def calculate_result(
)

else:
ini_poses = [cube.pose for cube in initial_cubes]
initial_poses = [cube.pose for cube in initial_cubes]
final_poses = [cube.pose for cube in final_cubes]
# NOTE the specific coords that refer to for example
# middle of the table can differ across simulations,
# take that into consideration
self.logger.debug(f"initial positions: {initial_cubes}")
self.logger.debug(f"current positions: {final_cubes}")
for i, ini_cube in enumerate(initial_cubes):
for j, final_cube in enumerate(final_cubes):
if ini_cube.name == final_cube.name:
for initial_cube in initial_cubes:
for final_cube in final_cubes:
if initial_cube.name == final_cube.name:
was_adjacent_initially = self.is_adjacent_to_any(
ini_cube.pose,
[p for p in ini_poses if p != ini_cube.pose],
initial_cube.pose,
[p for p in initial_poses if p != initial_cube.pose],
0.15,
)
is_adjacent_finally = self.is_adjacent_to_any(
@@ -93,7 +93,7 @@
break
else:
raise EntitiesMismatchException(
f"Entity with name: {ini_cube.name} which was present in initial scene, not found in final scene."
f"Entity with name: {initial_cube.name} which was present in initial scene, not found in final scene."
)

self.logger.info(
7 changes: 1 addition & 6 deletions src/rai_core/rai/agents/tool_runner.py
@@ -69,13 +69,8 @@ def run_one(call: ToolCall):
ts = time.perf_counter()
output = self.tools_by_name[call["name"]].invoke(call, config) # type: ignore
te = time.perf_counter() - ts
tool_output_log = (
str(output.content)[:1000] + "..."
if len(str(output.content)) > 1000
else ""
)
self.logger.info(
f"Tool {call['name']} completed in {te:.2f} seconds. Tool output: {tool_output_log}"
f"Tool {call['name']} completed in {te:.2f} seconds. Tool output: {str(output.content)[:100]}{'...' if len(str(output.content)) > 100 else ''}"
)
self.logger.debug(
f"Tool {call['name']} output: \n\n{str(output.content)}"