diff --git a/examples/manipulation-demo.launch.py b/examples/manipulation-demo.launch.py
index 35720a6af..ee28c9b44 100644
--- a/examples/manipulation-demo.launch.py
+++ b/examples/manipulation-demo.launch.py
@@ -51,7 +51,6 @@ def generate_launch_description():
     launch_robotic_manipulation = Node(
         package="robotic_manipulation",
         executable="robotic_manipulation",
-        # name="robotic_manipulation_node",
         output="screen",
         parameters=[
             {"use_sim_time": True},
diff --git a/poetry.lock b/poetry.lock
index 9eb39f9fa..335612128 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -5703,7 +5703,7 @@ url = "src/rai_asr"
 [[package]]
 name = "rai-bench"
 version = "0.1.0"
-description = ""
+description = "Package for running and creating benchmarks."
 optional = false
 python-versions = "^3.10"
 files = []
@@ -8331,4 +8331,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10, <3.13"
-content-hash = "c5469635a5db79c258554ad9f4e49331515940e406fbf912822651a0e0c33dda"
+content-hash = "d943b786f2bb8dddc9249475409a4d7c9c4b0a77041611c039431df55ad94000"
diff --git a/pyproject.toml b/pyproject.toml
index a3edd3730..92bcfa7c7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,7 @@ requests = "^2.32.2"
 pre-commit = "^3.7.0"
 openai = "^1.23.3"
 coloredlogs = "^15.0.1"
+opencv-python = "^4.9.0.80"
 markdown = "^3.6"
 boto3 = "^1.34.98"
 tqdm = "^4.66.4"
@@ -62,7 +63,6 @@ pytest-timeout = "^2.3.1"
 tomli-w = "^1.1.0"
 faster-whisper = "^1.1.1"
 pydub = "^0.25.1"
-opencv-python = "^4.11.0.86"

 [tool.poetry.group.dev.dependencies]
 ipykernel = "^6.29.4"
diff --git a/src/rai_bench/README.md b/src/rai_bench/README.md
index e2abd3d4f..3db5d04ba 100644
--- a/src/rai_bench/README.md
+++ b/src/rai_bench/README.md
@@ -8,33 +8,29 @@ The RAI Bench is a package including benchmarks and providing frame for creating
 Frame components can be found in `src/rai_bench/rai_bench/benchmark_model.py`

-- `Task` - abstract class for creating specific task. It introduces helper funtions that make it easier to calculate metrics/scores. Your custom tasks must implement a prompt got agent to do, a way to calculate a result and a validation if given scene config suits the task.
--
-- `Scenario` - class defined by a Scene and Task. Can be created manually like:
+- `Task`
+- `Scenario`
+- `Benchmark`

-    ```python
-
-    ```
-
-- `Benchmark` - class responsible for running and logging scenarios.
+For more information about these classes, see `src/rai_bench/rai_bench/benchmark_model.py`.

 ### O3DE TEST BENCHMARK

-O3DE Test Benchmark (src/rai_bench/rai_bench/o3de_test_bench/), contains 2 Tasks(tasks/) - GrabCarrotTask and PlaceCubesTask (these tasks implement calculating scores) and 4 scene_configs(configs/) for O3DE robotic arm simulation.
+O3DE Test Benchmark (`src/rai_bench/rai_bench/o3de_test_bench/`) contains 2 Tasks (`tasks/`) - GrabCarrotTask and PlaceCubesTask, which implement the score calculation - and 4 scene configs (`configs/`) for the O3DE robotic arm simulation.

 Both tasks calculate score, taking into consideration 4 values:

-- initially_misplaced_now_correct - when the object which was in the incorrect place at the start, is in a correct place at the end
-- initially_misplaced_still_incorrect - when the object which was in the incorrect place at the start, is in a incorrect place at the end
-- initially_correct_still_correct - when the object which was in the correct place at the start, is in a correct place at the end
-- initially_correct_now_incorrect - when the object which was in the correct place at the start, is in a incorrect place at the end
+- initially_misplaced_now_correct
+- initially_misplaced_still_incorrect
+- initially_correct_still_correct
+- initially_correct_now_incorrect

 The result is a value between 0 and 1, calculated like (initially_misplaced_now_correct + initially_correct_still_correct) / number_of_initial_objects.
 This score is calculated at the beggining and at the end of each scenario.

 ### Example usage

-Example of how to load scenes, define scenarios and run benchmark can be found in `src/rai_bench/rai_bench/benchmark_main.py`
+An example of how to load scenes, define scenarios and run the benchmark can be found in `src/rai_bench/rai_bench/examples/o3de_test_benchmark.py`.

 Scenarios can be loaded manually like:
@@ -56,5 +52,3 @@ scenarios = Benchmark.create_scenarios(
 ```

 which will result in list of scenarios with combination of every possible task and scene(task decides if scene config is suitable for it).
-
-Both approaches can be found in `main.py`
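Reviewer note: to make the scoring rule from the README above concrete, here is a minimal sketch of the formula it describes (the function name and the example counts are illustrative, not part of rai_bench):

```python
def task_score(
    initially_misplaced_now_correct: int,
    initially_correct_still_correct: int,
    number_of_initial_objects: int,
) -> float:
    """Fraction of objects that end up in a correct place: a value in [0, 1]."""
    return (
        initially_misplaced_now_correct + initially_correct_still_correct
    ) / number_of_initial_objects


# e.g. 4 objects: 2 misplaced ones were fixed, 1 stayed correct, 1 was knocked loose
print(task_score(2, 1, 4))  # 0.75
```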
+ """ def __init__( self, @@ -154,7 +160,9 @@ def __init__( class Benchmark: """ - Defined by a set of scenarios to be done + Benchmark represents a set of Scenarios to be executed and evaluated. + It manages the execution, logs results, and provides functionality + for tracking and exporting performance metrics. """ def __init__( @@ -162,16 +170,20 @@ def __init__( simulation_bridge: SimulationBridge[SimulationConfigT], scenarios: List[Scenario[SimulationConfigT]], logger: loggers_type | None = None, + results_filename: str = "benchmark_results.csv", ) -> None: self.simulation_bridge = simulation_bridge self.num_of_scenarios = len(scenarios) self.scenarios = enumerate(iter(scenarios)) self.results: List[Dict[str, Any]] = [] + self.results_filename = results_filename if logger: self._logger = logger else: self._logger = logging.getLogger(__name__) + self._initialize_results_file() + @classmethod def create_scenarios( cls, @@ -198,6 +210,23 @@ def create_scenarios( ) return scenarios + def _initialize_results_file(self): + """Initialize the CSV file with headers.""" + fieldnames = [ + "task", + "simulation_config", + "initial_score", + "final_score", + "total_time", + "number_of_tool_calls", + ] + + with open( + self.results_filename, mode="w", newline="", encoding="utf-8" + ) as file: + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + def run_next(self, agent) -> None: """ Runs the next scenario @@ -251,40 +280,36 @@ def run_next(self, agent) -> None: f"TASK SCORE: {result}, TOTAL TIME: {total_time:.3f}, NUM_OF_TOOL_CALLS: {tool_calls_num}" ) - self.results.append( - { - "task": scenario.task.get_prompt(), - "simulation_config": scenario.simulation_config_path, - "initial_score": initial_result, - "final_score": result, - "total_time": f"{total_time:.3f}", - "number_of_tool_calls": tool_calls_num, - } - ) + scenario_result: Dict[str, Any] = { + "task": scenario.task.get_prompt(), + "simulation_config": scenario.simulation_config_path, + "initial_score": initial_result, + "final_score": result, + "total_time": f"{total_time:.3f}", + "number_of_tool_calls": tool_calls_num, + } + self.results.append(scenario_result) + self._save_scenario_result_to_csv(scenario_result) except StopIteration: print("No more scenarios left to run.") - def get_results(self) -> List[Dict[str, Any]]: - return self.results - - def dump_results_to_csv(self, filename: str) -> None: - if not self.results: - self._logger.warning("No results to save.") # type: ignore - return - + def _save_scenario_result_to_csv(self, result: Dict[str, Any]) -> None: + """Save a single scenario result to the CSV file.""" fieldnames = [ "task", - "initial_score", "simulation_config", + "initial_score", "final_score", "total_time", "number_of_tool_calls", ] - with open(filename, mode="w", newline="", encoding="utf-8") as file: + with open( + self.results_filename, mode="a", newline="", encoding="utf-8" + ) as file: writer = csv.DictWriter(file, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.results) + writer.writerow(result) - self._logger.info(f"Results saved to {filename}") # type: ignore + def get_results(self) -> List[Dict[str, Any]]: + return self.results diff --git a/src/rai_bench/rai_bench/main.py b/src/rai_bench/rai_bench/examples/o3de_test_benchmark.py similarity index 98% rename from src/rai_bench/rai_bench/main.py rename to src/rai_bench/rai_bench/examples/o3de_test_benchmark.py index 7875c92c4..bfa81d0bc 100644 --- a/src/rai_bench/rai_bench/main.py +++ 
diff --git a/src/rai_bench/rai_bench/main.py b/src/rai_bench/rai_bench/examples/o3de_test_benchmark.py
similarity index 98%
rename from src/rai_bench/rai_bench/main.py
rename to src/rai_bench/rai_bench/examples/o3de_test_benchmark.py
index 7875c92c4..bfa81d0bc 100644
--- a/src/rai_bench/rai_bench/main.py
+++ b/src/rai_bench/rai_bench/examples/o3de_test_benchmark.py
@@ -163,6 +163,7 @@
     simulation_bridge=o3de,
     scenarios=scenarios,
     logger=bench_logger,
+    results_filename="src/rai_bench/rai_bench/results.csv",
 )
 for i, s in enumerate(scenarios):
     agent = create_conversational_agent(
@@ -180,7 +181,6 @@
 bench_logger.info("===============================================================")
 bench_logger.info("ALL SCENARIOS DONE. BENCHMARK COMPLETED!")
 bench_logger.info("===============================================================")
-benchmark.dump_results_to_csv(filename="src/rai_bench/rai_bench/results.csv")

 connector.shutdown()
 o3de.shutdown()
diff --git a/src/rai_bench/rai_bench/o3de_test_bench/tasks/grab_carrot_task.py b/src/rai_bench/rai_bench/o3de_test_bench/tasks/grab_carrot_task.py
index ca040fd62..56203f194 100644
--- a/src/rai_bench/rai_bench/o3de_test_bench/tasks/grab_carrot_task.py
+++ b/src/rai_bench/rai_bench/o3de_test_bench/tasks/grab_carrot_task.py
@@ -59,10 +59,10 @@ def calculate_result(
         else:
             self.logger.debug(f"initial positions: {initial_carrots}")  # type: ignore
             self.logger.debug(f"current positions: {final_carrots}")  # type: ignore
-            for ini_carrot in initial_carrots:
+            for initial_carrot in initial_carrots:
                 for final_carrot in final_carrots:
-                    if ini_carrot.name == final_carrot.name:
-                        initial_y = ini_carrot.pose.translation.y
+                    if initial_carrot.name == final_carrot.name:
+                        initial_y = initial_carrot.pose.translation.y
                         final_y = final_carrot.pose.translation.y
                         # NOTE the specific coords that refer to for example
                         # middle of the table can differ across simulations,
@@ -90,7 +90,7 @@
                         break
                 else:
                     raise EntitiesMismatchException(
-                        f"Entity with name: {ini_carrot.name} which was present in initial scene, not found in final scene."
+                        f"Entity with name: {initial_carrot.name}, which was present in the initial scene, was not found in the final scene."
                     )

         self.logger.info(  # type: ignore
diff --git a/src/rai_bench/rai_bench/o3de_test_bench/tasks/place_cubes_task.py b/src/rai_bench/rai_bench/o3de_test_bench/tasks/place_cubes_task.py
index 26bdd590e..9ad03c9b2 100644
--- a/src/rai_bench/rai_bench/o3de_test_bench/tasks/place_cubes_task.py
+++ b/src/rai_bench/rai_bench/o3de_test_bench/tasks/place_cubes_task.py
@@ -61,19 +61,19 @@ def calculate_result(
             )
         else:
-            ini_poses = [cube.pose for cube in initial_cubes]
+            initial_poses = [cube.pose for cube in initial_cubes]
             final_poses = [cube.pose for cube in final_cubes]
             # NOTE the specific coords that refer to for example
             # middle of the table can differ across simulations,
             # take that into consideration
             self.logger.debug(f"initial positions: {initial_cubes}")
             self.logger.debug(f"current positions: {final_cubes}")
-            for i, ini_cube in enumerate(initial_cubes):
-                for j, final_cube in enumerate(final_cubes):
-                    if ini_cube.name == final_cube.name:
+            for initial_cube in initial_cubes:
+                for final_cube in final_cubes:
+                    if initial_cube.name == final_cube.name:
                         was_adjacent_initially = self.is_adjacent_to_any(
-                            ini_cube.pose,
-                            [p for p in ini_poses if p != ini_cube.pose],
+                            initial_cube.pose,
+                            [p for p in initial_poses if p != initial_cube.pose],
                             0.15,
                         )
                         is_adjacent_finally = self.is_adjacent_to_any(
@@ -93,7 +93,7 @@
                         break
                 else:
                     raise EntitiesMismatchException(
-                        f"Entity with name: {ini_cube.name} which was present in initial scene, not found in final scene."
+                        f"Entity with name: {initial_cube.name}, which was present in the initial scene, was not found in the final scene."
                     )

         self.logger.info(
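Reviewer note: with `results_filename` moved into the constructor, the CSV header is written up front and each `run_next` call appends one row, so partial results survive an interrupted run and the final `dump_results_to_csv` call is no longer needed. A rough usage sketch consistent with the example script above (bridge, scenario, and agent setup elided; names mirror the diff):

```python
benchmark = Benchmark(
    simulation_bridge=o3de,
    scenarios=scenarios,
    logger=bench_logger,
    results_filename="src/rai_bench/rai_bench/results.csv",  # header written immediately
)
for _ in range(benchmark.num_of_scenarios):
    benchmark.run_next(agent)  # appends one CSV row per completed scenario
```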
diff --git a/src/rai_core/rai/agents/tool_runner.py b/src/rai_core/rai/agents/tool_runner.py
index 5c35ac9a8..12e0889d3 100644
--- a/src/rai_core/rai/agents/tool_runner.py
+++ b/src/rai_core/rai/agents/tool_runner.py
@@ -69,13 +69,8 @@ def run_one(call: ToolCall):
             ts = time.perf_counter()
             output = self.tools_by_name[call["name"]].invoke(call, config)  # type: ignore
             te = time.perf_counter() - ts
-            tool_output_log = (
-                str(output.content)[:1000] + "..."
-                if len(str(output.content)) > 1000
-                else ""
-            )
             self.logger.info(
-                f"Tool {call['name']} completed in {te:.2f} seconds. Tool output: {tool_output_log}"
+                f"Tool {call['name']} completed in {te:.2f} seconds. Tool output: {str(output.content)[:100]}{'...' if len(str(output.content)) > 100 else ''}"
             )
             self.logger.debug(
                 f"Tool {call['name']} output: \n\n{str(output.content)}"
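Reviewer note: the removed `tool_output_log` helper also had a subtle bug: for outputs of 1000 characters or fewer, its `else ""` branch logged an empty string instead of the output itself. The new inline expression truncates at 100 characters and appends an ellipsis only when something was actually cut. The same pattern as a standalone snippet (the `truncate` helper is ours, purely for illustration):

```python
def truncate(text: str, limit: int = 100) -> str:
    """Cut text to `limit` characters, appending '...' only if something was dropped."""
    return text[:limit] + ("..." if len(text) > limit else "")


assert truncate("short") == "short"
assert truncate("x" * 150) == "x" * 100 + "..."
```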