From e93fe7def79e6fdf303c1bd9193a88d5ff61553c Mon Sep 17 00:00:00 2001
From: adaickalavan <adaickalavan@gmail.com>
Date: Sat, 3 Jun 2023 12:25:01 -0400
Subject: [PATCH] Cap the steps and weight the scores.

---
 CHANGELOG.md                                  |   2 +
 .../driving_smarts/v2023/config_1.yaml        |   2 -
 .../driving_smarts/v2023/config_2.yaml        |   2 -
 .../driving_smarts/v2023/config_3.yaml        |   2 -
 .../v2023/metric_formula_drive.py             | 138 ++++++-----
 .../v2023/metric_formula_platoon.py           | 142 ++++++------
 .../entrypoints/benchmark_runner_v0.py        |  48 ++--
 .../env/gymnasium/driving_smarts_2023_env.py  |   1 +
 smarts/env/gymnasium/platoon_env.py           |   1 +
 smarts/env/gymnasium/wrappers/metric/costs.py |   8 +-
 .../env/gymnasium/wrappers/metric/formula.py  | 214 +++++++++++-------
 .../env/gymnasium/wrappers/metric/metrics.py  |  30 ++-
 smarts/env/gymnasium/wrappers/metric/types.py |  14 +-
 smarts/env/tests/test_metrics.py              |   4 +-
 smarts/sstudio/types/scenario.py              |   8 +-
 15 files changed, 366 insertions(+), 250 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52b02584b2..46e3611340 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,6 +33,8 @@ Copy and pasting the git commit messages is __NOT__ enough.
 - Changed instances of `hiway-v0` and `gym` to use `hiway-v1` and `gymnasium`, respectively.
 - `RoadMap.Route` now optionally stores the start and end lanes of the route.
 - `DistToDestination` metric is now computed by summing the (i) off-route distance driven by the vehicle from its last on-route position, and (ii) the distance to goal from the vehicle's last on-route position. 
+- `Steps` metric is now capped by the scenario duration set in the scenario metadata.
+- Overall metric score is now weighted by each agent's task difficulty.
 ### Deprecated
 - `visdom` is set to be removed from the SMARTS object parameters.
 - Deprecated `start_time` on missions.
diff --git a/smarts/benchmark/driving_smarts/v2023/config_1.yaml b/smarts/benchmark/driving_smarts/v2023/config_1.yaml
index 5f480d19d1..6b3a57f675 100644
--- a/smarts/benchmark/driving_smarts/v2023/config_1.yaml
+++ b/smarts/benchmark/driving_smarts/v2023/config_1.yaml
@@ -2,8 +2,6 @@
 benchmark:
   name: "Driving SMARTS 2023.1"
   message: |
-    This is the Driving SMARTS 2023.1 benchmark.
-
     For context see: 
         - https://smarts-project.github.io/competition/2023_driving_smarts/
         - https://codalab.lisn.upsaclay.fr/competitions/
diff --git a/smarts/benchmark/driving_smarts/v2023/config_2.yaml b/smarts/benchmark/driving_smarts/v2023/config_2.yaml
index de913eaa87..e7131ef475 100644
--- a/smarts/benchmark/driving_smarts/v2023/config_2.yaml
+++ b/smarts/benchmark/driving_smarts/v2023/config_2.yaml
@@ -2,8 +2,6 @@
 benchmark:
   name: "Driving SMARTS 2023.2"
   message: |
-    This is the Driving SMARTS 2023.2 benchmark.
-
     For context see: 
         - https://smarts-project.github.io/competition/2023_driving_smarts/
         - https://codalab.lisn.upsaclay.fr/competitions/
diff --git a/smarts/benchmark/driving_smarts/v2023/config_3.yaml b/smarts/benchmark/driving_smarts/v2023/config_3.yaml
index fd11954a73..d2e89b43f4 100644
--- a/smarts/benchmark/driving_smarts/v2023/config_3.yaml
+++ b/smarts/benchmark/driving_smarts/v2023/config_3.yaml
@@ -2,8 +2,6 @@
 benchmark:
   name: "Driving SMARTS 2023.3"
   message: |
-    This is the Driving SMARTS 2023.3 benchmark.
-
     For context see: 
         - https://smarts-project.github.io/competition/2023_driving_smarts/
         - https://codalab.lisn.upsaclay.fr/competitions/
diff --git a/smarts/benchmark/driving_smarts/v2023/metric_formula_drive.py b/smarts/benchmark/driving_smarts/v2023/metric_formula_drive.py
index 8c06a81b01..925d9dec00 100644
--- a/smarts/benchmark/driving_smarts/v2023/metric_formula_drive.py
+++ b/smarts/benchmark/driving_smarts/v2023/metric_formula_drive.py
@@ -25,13 +25,27 @@
 import numpy as np
 
 from smarts.env.gymnasium.wrappers.metric.costs import Costs
-from smarts.env.gymnasium.wrappers.metric.formula import FormulaBase, Score, avg_costs
+from smarts.env.gymnasium.wrappers.metric.formula import (
+    FormulaBase,
+    Score,
+    agent_scores,
+    agent_weights,
+    score_rule_violation,
+    weighted_score,
+)
 from smarts.env.gymnasium.wrappers.metric.params import (
+    Collisions,
     Comfort,
     DistToDestination,
     DistToObstacles,
     JerkLinear,
+    LaneCenterOffset,
+    OffRoad,
     Params,
+    SpeedLimit,
+    Steps,
+    VehicleGap,
+    WrongWay,
 )
 from smarts.env.gymnasium.wrappers.metric.types import Record
 
@@ -51,75 +65,83 @@ def params(self) -> Params:
             Params: Cost function parameters.
         """
         params = Params(
-            comfort=Comfort(
-                active=True,
-            ),
-            dist_to_destination=DistToDestination(
-                active=True,
-            ),
-            dist_to_obstacles=DistToObstacles(
-                active=False,
-            ),
+            collisions=Collisions(active=False),
+            comfort=Comfort(active=True),
+            dist_to_destination=DistToDestination(active=True),
+            dist_to_obstacles=DistToObstacles(active=False),
             jerk_linear=JerkLinear(active=False),
+            lane_center_offset=LaneCenterOffset(active=True),
+            off_road=OffRoad(active=False),
+            speed_limit=SpeedLimit(active=True),
+            steps=Steps(active=True),
+            vehicle_gap=VehicleGap(active=False),
+            wrong_way=WrongWay(active=True),
         )
         return params
 
-    def score(self, records_sum: Dict[str, Dict[str, Record]]) -> Score:
+    def score(self, records: Dict[str, Dict[str, Record]]) -> Score:
         """
         Computes several sub-component scores and one total combined score named
         "Overall" on the wrapped environment.
 
-        +-------------------+--------+-----------------------------------------------------------+
-        |                   | Range  | Remarks                                                   |
-        +===================+========+===========================================================+
-        | Overall           | [0, 1] | Total score. The higher, the better.                      |
-        +-------------------+--------+-----------------------------------------------------------+
-        | DistToDestination | [0, 1] | Remaining distance to destination. The lower, the better. |
-        +-------------------+--------+-----------------------------------------------------------+
-        | Time              | [0, 1] | Time taken to complete scenario. The lower, the better.   |
-        +-------------------+--------+-----------------------------------------------------------+
-        | HumannessError    | [0, 1] | Humanness indicator. The lower, the better.               |
-        +-------------------+--------+-----------------------------------------------------------+
-        | RuleViolation     | [0, 1] | Traffic rules compliance. The lower, the better.          |
-        +-------------------+--------+-----------------------------------------------------------+
+        Args:
+            records (Dict[str, Dict[str, Record]]): Records.
 
         Returns:
-            Score: Contains "Overall", "DistToDestination", "VehicleGap",
-            "HumannessError", and "RuleViolation" scores.
+            Score: "Overall" score and other sub-component scores.
         """
 
-        costs_final = avg_costs(records_sum=records_sum)
-
-        # Compute sub-components of score.
-        dist_to_destination = costs_final.dist_to_destination
-        humanness_error = _humanness_error(costs=costs_final)
-        rule_violation = _rule_violation(costs=costs_final)
-        time = costs_final.steps
-        overall = (
-            0.25 * (1 - dist_to_destination)
-            + 0.25 * (1 - time)
-            + 0.25 * (1 - humanness_error)
-            + 0.25 * (1 - rule_violation)
-        )
-
-        return Score(
-            {
-                "overall": overall,
-                "dist_to_destination": dist_to_destination,
-                "time": time,
-                "humanness_error": humanness_error,
-                "rule_violation": rule_violation,
-            }
-        )
-
-
-def _humanness_error(costs: Costs) -> float:
+        agent_weight = agent_weights(records=records)
+        agent_score = agent_scores(records=records, func=costs_to_score)
+        return weighted_score(scores=agent_score, weights=agent_weight)
+
+
+def costs_to_score(costs: Costs) -> Score:
+    """Compute score from costs.
+
+    +-------------------+--------+-----------------------------------------------------------+
+    |                   | Range  | Remarks                                                   |
+    +===================+========+===========================================================+
+    | Overall           | [0, 1] | Total score. The higher, the better.                      |
+    +-------------------+--------+-----------------------------------------------------------+
+    | DistToDestination | [0, 1] | Remaining distance to destination. The lower, the better. |
+    +-------------------+--------+-----------------------------------------------------------+
+    | Time              | [0, 1] | Time taken to complete scenario. The lower, the better.   |
+    +-------------------+--------+-----------------------------------------------------------+
+    | HumannessError    | [0, 1] | Humanness indicator. The lower, the better.               |
+    +-------------------+--------+-----------------------------------------------------------+
+    | RuleViolation     | [0, 1] | Traffic rules compliance. The lower, the better.          |
+    +-------------------+--------+-----------------------------------------------------------+
+
+    Args:
+        costs (Costs): Costs.
+
+    Returns:
+        Score: Score.
+    """
+    dist_to_destination = costs.dist_to_destination
+    humanness_error = _score_humanness_error(costs=costs)
+    rule_violation = score_rule_violation(costs=costs)
+    time = costs.steps
+    overall = (
+        0.25 * (1 - dist_to_destination)
+        + 0.25 * (1 - time)
+        + 0.25 * (1 - humanness_error)
+        + 0.25 * (1 - rule_violation)
+    )
+
+    return Score(
+        {
+            "overall": overall,
+            "dist_to_destination": dist_to_destination,
+            "time": time,
+            "humanness_error": humanness_error,
+            "rule_violation": rule_violation,
+        }
+    )
+
+
+def _score_humanness_error(costs: Costs) -> float:
     humanness_error = np.array([costs.comfort, costs.lane_center_offset])
     humanness_error = np.mean(humanness_error, dtype=float)
     return humanness_error
-
-
-def _rule_violation(costs: Costs) -> float:
-    rule_violation = np.array([costs.speed_limit, costs.wrong_way])
-    rule_violation = np.mean(rule_violation, dtype=float)
-    return rule_violation
diff --git a/smarts/benchmark/driving_smarts/v2023/metric_formula_platoon.py b/smarts/benchmark/driving_smarts/v2023/metric_formula_platoon.py
index 2b10d76fa7..a3391407cd 100644
--- a/smarts/benchmark/driving_smarts/v2023/metric_formula_platoon.py
+++ b/smarts/benchmark/driving_smarts/v2023/metric_formula_platoon.py
@@ -25,15 +25,27 @@
 import numpy as np
 
 from smarts.env.gymnasium.wrappers.metric.costs import Costs
-from smarts.env.gymnasium.wrappers.metric.formula import FormulaBase, Score, avg_costs
+from smarts.env.gymnasium.wrappers.metric.formula import (
+    FormulaBase,
+    Score,
+    agent_scores,
+    agent_weights,
+    score_rule_violation,
+    weighted_score,
+)
 from smarts.env.gymnasium.wrappers.metric.params import (
+    Collisions,
     Comfort,
     DistToDestination,
     DistToObstacles,
     JerkLinear,
+    LaneCenterOffset,
+    OffRoad,
     Params,
+    SpeedLimit,
     Steps,
     VehicleGap,
+    WrongWay,
 )
 from smarts.env.gymnasium.wrappers.metric.types import Record
 
@@ -53,81 +65,83 @@ def params(self) -> Params:
             Params: Cost function parameters.
         """
         params = Params(
-            comfort=Comfort(
-                active=True,
-            ),
-            dist_to_destination=DistToDestination(
-                active=True,
-            ),
-            dist_to_obstacles=DistToObstacles(
-                active=False,
-            ),
+            collisions=Collisions(active=False),
+            comfort=Comfort(active=True),
+            dist_to_destination=DistToDestination(active=True),
+            dist_to_obstacles=DistToObstacles(active=False),
             jerk_linear=JerkLinear(active=False),
-            vehicle_gap=VehicleGap(
-                active=True,
-            ),
-            steps=Steps(
-                active=False,
-            ),
+            lane_center_offset=LaneCenterOffset(active=True),
+            off_road=OffRoad(active=False),
+            speed_limit=SpeedLimit(active=True),
+            steps=Steps(active=False),
+            vehicle_gap=VehicleGap(active=True),
+            wrong_way=WrongWay(active=True),
         )
         return params
 
-    def score(self, records_sum: Dict[str, Dict[str, Record]]) -> Score:
+    def score(self, records: Dict[str, Dict[str, Record]]) -> Score:
         """
         Computes several sub-component scores and one total combined score named
         "Overall" on the wrapped environment.
 
-        +-------------------+--------+-----------------------------------------------------------+
-        |                   | Range  | Remarks                                                   |
-        +===================+========+===========================================================+
-        | Overall           | [0, 1] | Total score. The higher, the better.                      |
-        +-------------------+--------+-----------------------------------------------------------+
-        | DistToDestination | [0, 1] | Remaining distance to destination. The lower, the better. |
-        +-------------------+--------+-----------------------------------------------------------+
-        | VehicleGap        | [0, 1] | Gap between vehicles in a convoy. The lower, the better.  |
-        +-------------------+--------+-----------------------------------------------------------+
-        | HumannessError    | [0, 1] | Humanness indicator. The lower, the better.               |
-        +-------------------+--------+-----------------------------------------------------------+
-        | RuleViolation     | [0, 1] | Traffic rules compliance. The lower, the better.          |
-        +-------------------+--------+-----------------------------------------------------------+
+        Args:
+            records (Dict[str, Dict[str, Record]]): Records.
 
         Returns:
-            Score: Contains "Overall", "DistToDestination", "VehicleGap",
-            "HumannessError", and "RuleViolation" scores.
+            Score: "Overall" score and other sub-component scores.
         """
 
-        costs_final = avg_costs(records_sum=records_sum)
-
-        # Compute sub-components of score.
-        dist_to_destination = costs_final.dist_to_destination
-        humanness_error = _humanness_error(costs=costs_final)
-        rule_violation = _rule_violation(costs=costs_final)
-        vehicle_gap = costs_final.vehicle_gap
-        overall = (
-            0.25 * (1 - dist_to_destination)
-            + 0.25 * (1 - vehicle_gap)
-            + 0.25 * (1 - humanness_error)
-            + 0.25 * (1 - rule_violation)
-        )
-
-        return Score(
-            {
-                "overall": overall,
-                "dist_to_destination": dist_to_destination,
-                "vehicle_gap": vehicle_gap,
-                "humanness_error": humanness_error,
-                "rule_violation": rule_violation,
-            }
-        )
-
-
-def _humanness_error(costs: Costs) -> float:
+        agent_weight = agent_weights(records=records)
+        agent_score = agent_scores(records=records, func=costs_to_score)
+        return weighted_score(scores=agent_score, weights=agent_weight)
+
+
+def costs_to_score(costs: Costs) -> Score:
+    """Compute score from costs.
+
+    +-------------------+--------+-----------------------------------------------------------+
+    |                   | Range  | Remarks                                                   |
+    +===================+========+===========================================================+
+    | Overall           | [0, 1] | Total score. The higher, the better.                      |
+    +-------------------+--------+-----------------------------------------------------------+
+    | DistToDestination | [0, 1] | Remaining distance to destination. The lower, the better. |
+    +-------------------+--------+-----------------------------------------------------------+
+    | VehicleGap        | [0, 1] | Gap between vehicles in a convoy. The lower, the better.  |
+    +-------------------+--------+-----------------------------------------------------------+
+    | HumannessError    | [0, 1] | Humanness indicator. The lower, the better.               |
+    +-------------------+--------+-----------------------------------------------------------+
+    | RuleViolation     | [0, 1] | Traffic rules compliance. The lower, the better.          |
+    +-------------------+--------+-----------------------------------------------------------+
+
+    Args:
+        costs (Costs): Costs.
+
+    Returns:
+        Score: Score.
+    """
+    dist_to_destination = costs.dist_to_destination
+    humanness_error = _score_humanness_error(costs=costs)
+    rule_violation = score_rule_violation(costs=costs)
+    vehicle_gap = costs.vehicle_gap
+    overall = (
+        0.25 * (1 - dist_to_destination)
+        + 0.25 * (1 - vehicle_gap)
+        + 0.25 * (1 - humanness_error)
+        + 0.25 * (1 - rule_violation)
+    )
+
+    return Score(
+        {
+            "overall": overall,
+            "dist_to_destination": dist_to_destination,
+            "vehicle_gap": vehicle_gap,
+            "humanness_error": humanness_error,
+            "rule_violation": rule_violation,
+        }
+    )
+
+
+def _score_humanness_error(costs: Costs) -> float:
     humanness_error = np.array([costs.comfort, costs.lane_center_offset])
     humanness_error = np.mean(humanness_error, dtype=float)
     return humanness_error
-
-
-def _rule_violation(costs: Costs) -> float:
-    rule_violation = np.array([costs.speed_limit, costs.wrong_way])
-    rule_violation = np.mean(rule_violation, dtype=float)
-    return rule_violation
diff --git a/smarts/benchmark/entrypoints/benchmark_runner_v0.py b/smarts/benchmark/entrypoints/benchmark_runner_v0.py
index bedd9e2d53..294e79c70a 100644
--- a/smarts/benchmark/entrypoints/benchmark_runner_v0.py
+++ b/smarts/benchmark/entrypoints/benchmark_runner_v0.py
@@ -19,9 +19,9 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
+import json
 import logging
 import os
-import pprint
 from pathlib import Path
 from typing import Dict
 
@@ -35,7 +35,6 @@
 from smarts.env.gymnasium.wrappers.metric.formula import FormulaBase, Score
 from smarts.env.gymnasium.wrappers.metric.metrics import Metrics
 from smarts.env.gymnasium.wrappers.metric.types import Record
-from smarts.env.gymnasium.wrappers.metric.utils import multiply, op_dataclass
 from smarts.zoo import registry as agent_registry
 
 LOG_WORKERS = False
@@ -171,40 +170,49 @@ def benchmark(benchmark_args, agent_locator, log_workers=False):
             )
 
         records_cumulative: Dict[str, Dict[str, Record]] = {}
-        for name, records in iterator(
+        for _, records in iterator(
             env_args=env_args,
             benchmark_args=benchmark_args,
             agent_locator=agent_locator,
             log_workers=log_workers,
         ):
             records_cumulative.update(records)
-            print(f"\nScoring {name} ...")
 
-        score = _get_score(records=records_cumulative, metric_formula=metric_formula)
-        print("\nSCORE")
-        pprint.pprint(score)
+        score = _get_weighted_score(
+            records=records_cumulative, metric_formula=metric_formula
+        )
+        print("\n\nOverall Weighted Score:\n")
+        print(json.dumps(score, indent=2))
+        score = _get_agent_score(
+            records=records_cumulative, metric_formula=metric_formula
+        )
+        print("\n\nIndividual Agent Score:\n")
+        print(json.dumps(score, indent=2))
 
     print("\n<-- Evaluation complete -->\n")
 
 
-def _get_score(records: Dict[str, Dict[str, Record]], metric_formula: Path) -> Score:
-    # Convert averaged records into sum of records.
-    records_sum = {}
-    for scen, agents in records.items():
-        records_sum[scen] = {}
-        for agent, data in agents.items():
-            records_sum[scen][agent] = Record(
-                costs=op_dataclass(data.costs, data.counts.episodes, multiply),
-                counts=data.counts,
-            )
-
-    # Import scoring formula
+def _get_weighted_score(
+    records: Dict[str, Dict[str, Record]], metric_formula: Path
+) -> Score:
     import_module_from_file("custom_formula", metric_formula)
     from custom_formula import Formula
 
     formula: FormulaBase = Formula()
 
-    score = formula.score(records_sum=records_sum)
+    score = formula.score(records=records)
+    return score
+
+
+def _get_agent_score(
+    records: Dict[str, Dict[str, Record]], metric_formula: Path
+) -> Dict[str, Dict[str, Score]]:
+    import_module_from_file("custom_formula", metric_formula)
+    from custom_formula import costs_to_score
+
+    from smarts.env.gymnasium.wrappers.metric.formula import agent_scores
+
+    score = agent_scores(records=records, func=costs_to_score)
     return score
 
 
diff --git a/smarts/env/gymnasium/driving_smarts_2023_env.py b/smarts/env/gymnasium/driving_smarts_2023_env.py
index 86cb84edbd..25127dddfd 100644
--- a/smarts/env/gymnasium/driving_smarts_2023_env.py
+++ b/smarts/env/gymnasium/driving_smarts_2023_env.py
@@ -165,6 +165,7 @@ def resolve_agent_interface(agent_interface: AgentInterface):
         wrong_way=False,
         not_moving=False,
         agents_alive=None,
+        interest=None,
     )
     max_episode_steps = 1000
     waypoints_lookahead = 80
diff --git a/smarts/env/gymnasium/platoon_env.py b/smarts/env/gymnasium/platoon_env.py
index ff8e6662f5..1df1e23e98 100644
--- a/smarts/env/gymnasium/platoon_env.py
+++ b/smarts/env/gymnasium/platoon_env.py
@@ -169,6 +169,7 @@ def resolve_agent_interface(agent_interface: AgentInterface):
         on_shoulder=False,
         wrong_way=False,
         not_moving=False,
+        agents_alive=None,
         interest=InterestDoneCriteria(
             include_scenario_marked=True,
             strict=True,
diff --git a/smarts/env/gymnasium/wrappers/metric/costs.py b/smarts/env/gymnasium/wrappers/metric/costs.py
index 9c908275f0..f95fd1b37b 100644
--- a/smarts/env/gymnasium/wrappers/metric/costs.py
+++ b/smarts/env/gymnasium/wrappers/metric/costs.py
@@ -94,7 +94,7 @@ def func(
         T_u += u_t
 
         if not done:
-            return Costs(comfort=-np.inf)
+            return Costs(comfort=np.nan)
         else:
             T_trv = step
             for _ in range(T_p):
@@ -148,7 +148,7 @@ def func(
                 prev_route_displacement = cur_route_displacement
                 prev_dist_travelled = tot_dist_travelled
 
-            return Costs(dist_to_destination=-np.inf)
+            return Costs(dist_to_destination=np.nan)
         elif obs.events.reached_goal:
             return Costs(dist_to_destination=0)
         else:
@@ -422,10 +422,10 @@ def func(
         step = step + 1
 
         if not done:
-            return Costs(steps=-np.inf)
+            return Costs(steps=np.nan)
 
         if obs.events.reached_goal or obs.events.interest_done:
-            return Costs(steps=step / max_episode_steps)
+            return Costs(steps=min(step, max_episode_steps) / max_episode_steps)
         elif (
             len(obs.events.collisions) > 0
             or obs.events.off_road
diff --git a/smarts/env/gymnasium/wrappers/metric/formula.py b/smarts/env/gymnasium/wrappers/metric/formula.py
index 0e947c8740..ed653d661d 100644
--- a/smarts/env/gymnasium/wrappers/metric/formula.py
+++ b/smarts/env/gymnasium/wrappers/metric/formula.py
@@ -21,18 +21,12 @@
 # THE SOFTWARE.
 from __future__ import annotations
 
-import functools
-from typing import Dict, NewType
+from typing import Callable, Dict, NewType
 
 import numpy as np
 
 from smarts.env.gymnasium.wrappers.metric.params import Params
 from smarts.env.gymnasium.wrappers.metric.types import Costs, Record
-from smarts.env.gymnasium.wrappers.metric.utils import (
-    add_dataclass,
-    divide,
-    op_dataclass,
-)
 
 Score = NewType("Score", Dict[str, float])
 
@@ -53,10 +47,13 @@ def params(self) -> Params:
         """
         raise NotImplementedError
 
-    def score(self, records_sum: Dict[str, Dict[str, Record]]) -> Score:
+    def score(self, records: Dict[str, Dict[str, Record]]) -> Score:
         """Computes sub-component scores and one total combined score named
         "Overall" on the wrapped environment.
 
+        Args:
+            records (Dict[str, Dict[str, Record]]): Records.
+
         Returns:
             "Overall" score and other sub-component scores.
         """
@@ -79,57 +76,138 @@ def params(self) -> Params:
         """
         return Params()
 
-    def score(self, records_sum: Dict[str, Dict[str, Record]]) -> Score:
-        """
-        Computes four sub-component scores, namely, "DistanceToDestination",
-        "Time", "HumannessError", "RuleViolation", and one total combined score named
+    def score(self, records: Dict[str, Dict[str, Record]]) -> Score:
+        """Computes sub-component scores and one total combined score named
         "Overall" on the wrapped environment.
 
-        +-------------------+--------+-----------------------------------------------------------+
-        |                   | Range  | Remarks                                                   |
-        +===================+========+===========================================================+
-        | Overall           | [0, 1] | Total score. The higher, the better.                      |
-        +-------------------+--------+-----------------------------------------------------------+
-        | DistToDestination | [0, 1] | Remaining distance to destination. The lower, the better. |
-        +-------------------+--------+-----------------------------------------------------------+
-        | Time              | [0, 1] | Time taken to complete scenario. The lower, the better.   |
-        +-------------------+--------+-----------------------------------------------------------+
-        | HumannessError    | [0, 1] | Humanness indicator. The lower, the better.               |
-        +-------------------+--------+-----------------------------------------------------------+
-        | RuleViolation     | [0, 1] | Traffic rules compliance. The lower, the better.          |
-        +-------------------+--------+-----------------------------------------------------------+
+        Args:
+            records (Dict[str, Dict[str, Record]]): Records.
 
         Returns:
-            "Overall", "DistToDestination", "Time",
-            "HumannessError", and "RuleViolation" scores.
+            Score: "Overall" score and other sub-component scores.
         """
 
-        costs_final = avg_costs(records_sum=records_sum)
-
-        # Compute sub-components of score.
-        dist_to_destination = costs_final.dist_to_destination
-        humanness_error = _humanness_error(costs=costs_final)
-        rule_violation = _rule_violation(costs=costs_final)
-        time = costs_final.steps
-        overall = (
-            0.25 * (1 - dist_to_destination)
-            + 0.25 * (1 - time)
-            + 0.25 * (1 - humanness_error)
-            + 0.25 * (1 - rule_violation)
+        agent_weight = agent_weights(records=records)
+        agent_score = agent_scores(records=records, func=costs_to_score)
+        return weighted_score(scores=agent_score, weights=agent_weight)
+
+
+def agent_weights(records: Dict[str, Dict[str, Record]]) -> Dict[str, Dict[str, float]]:
+    """Retrieves weight for each agent in every scenario.
+
+    Args:
+        records (Dict[str, Dict[str, Record]]): Records.
+
+    Returns:
+        Dict[str,Dict[str,float]]: Weight for each agent in every scenario.
+    """
+
+    weights = {}
+    for scen, agents in records.items():
+        weights[scen] = dict(
+            map(lambda i: (i[0], i[1].metadata.difficulty), agents.items())
         )
 
-        return Score(
-            {
-                "overall": overall,
-                "dist_to_destination": dist_to_destination,
-                "time": time,
-                "humanness_error": humanness_error,
-                "rule_violation": rule_violation,
+    return weights
+
+
+def agent_scores(
+    records: Dict[str, Dict[str, Record]], func: Callable[[Costs], Score]
+) -> Dict[str, Dict[str, Score]]:
+    """Computes score for each agent in every scenario.
+
+    Args:
+        records (Dict[str, Dict[str, Record]]): Records.
+        func (Callable[[Costs],Score]): Function which computes Score given Costs.
+
+    Returns:
+        Dict[str,Dict[str,Score]]: Score for each agent in every scenario.
+    """
+
+    scores = {}
+    for scen, agents in records.items():
+        scores[scen] = dict(map(lambda i: (i[0], func(i[1].costs)), agents.items()))
+
+    return scores
+
+
+def weighted_score(
+    scores: Dict[str, Dict[str, Score]], weights: Dict[str, Dict[str, float]]
+) -> Score:
+    """Computes single overall weighted score using `weights`.
+
+    Args:
+        scores (Dict[str,Dict[str,Score]]): Score for each agent in every scenario.
+        weights (Dict[str,Dict[str,float]]): Weight for each agent in every scenario.
+
+    Returns:
+        Score: Weighted score.
+    """
+    cumulative_score = {}
+    total_weight = 0
+    for scen, agent in scores.items():
+        for agent_name, agent_score in agent.items():
+            current_score = dict(
+                map(
+                    lambda i: (i[0], i[1] * weights[scen][agent_name]),
+                    agent_score.items(),
+                )
+            )
+            cumulative_score = {
+                score_name: score_val + cumulative_score.get(score_name, 0)
+                for score_name, score_val in current_score.items()
             }
-        )
+            total_weight += weights[scen][agent_name]
+
+    return Score({key: val / total_weight for key, val in cumulative_score.items()})
+
+
+def costs_to_score(costs: Costs) -> Score:
+    """Compute score from costs.
+
+    +-------------------+--------+-----------------------------------------------------------+
+    |                   | Range  | Remarks                                                   |
+    +===================+========+===========================================================+
+    | Overall           | [0, 1] | Total score. The higher, the better.                      |
+    +-------------------+--------+-----------------------------------------------------------+
+    | DistToDestination | [0, 1] | Remaining distance to destination. The lower, the better. |
+    +-------------------+--------+-----------------------------------------------------------+
+    | Time              | [0, 1] | Time taken to complete scenario. The lower, the better.   |
+    +-------------------+--------+-----------------------------------------------------------+
+    | HumannessError    | [0, 1] | Humanness indicator. The lower, the better.               |
+    +-------------------+--------+-----------------------------------------------------------+
+    | RuleViolation     | [0, 1] | Traffic rules compliance. The lower, the better.          |
+    +-------------------+--------+-----------------------------------------------------------+
+
+    Args:
+        costs (Costs): Costs.
+
+    Returns:
+        Score: Score.
+    """
+    dist_to_destination = costs.dist_to_destination
+    humanness_error = _score_humanness_error(costs=costs)
+    rule_violation = score_rule_violation(costs=costs)
+    time = costs.steps
+    overall = (
+        0.25 * (1 - dist_to_destination)
+        + 0.25 * (1 - time)
+        + 0.25 * (1 - humanness_error)
+        + 0.25 * (1 - rule_violation)
+    )
+
+    return Score(
+        {
+            "overall": overall,
+            "dist_to_destination": dist_to_destination,
+            "time": time,
+            "humanness_error": humanness_error,
+            "rule_violation": rule_violation,
+        }
+    )
 
 
-def _humanness_error(costs: Costs) -> float:
+def _score_humanness_error(costs: Costs) -> float:
     humanness_error = np.array(
         [costs.dist_to_obstacles, costs.jerk_linear, costs.lane_center_offset]
     )
@@ -137,41 +215,15 @@ def _humanness_error(costs: Costs) -> float:
     return humanness_error
 
 
-def _rule_violation(costs: Costs) -> float:
-    rule_violation = np.array([costs.speed_limit, costs.wrong_way])
-    rule_violation = np.mean(rule_violation, dtype=float)
-    return rule_violation
-
-
-def avg_costs(records_sum: Dict[str, Dict[str, Record]]) -> Costs:
-    """Averages costs over number of agents and number of episodes.
+def score_rule_violation(costs: Costs) -> float:
+    """Default rule violation scoring formula.
 
     Args:
-        records_sum (Dict[str, Dict[str, Record]]): Raw costs.
+        costs (Costs): Costs providing the `speed_limit` and `wrong_way` values.
 
     Returns:
-        Costs: Averaged costs.
+        float: Mean of the `speed_limit` and `wrong_way` costs.
     """
-    costs_total = Costs()
-    episodes = 0
-    for scen, val in records_sum.items():
-        # Number of agents in scenario.
-        agents_in_scenario = len(val.keys())
-        costs_list, counts_list = zip(
-            *[(record.costs, record.counts) for agent, record in val.items()]
-        )
-        # Sum costs over all agents in scenario.
-        costs_sum_agent: Costs = functools.reduce(
-            lambda a, b: add_dataclass(a, b), costs_list
-        )
-        # Average costs over number of agents in scenario.
-        costs_mean_agent = op_dataclass(costs_sum_agent, agents_in_scenario, divide)
-        # Sum costs over all scenarios.
-        costs_total = add_dataclass(costs_total, costs_mean_agent)
-        # Increment total number of episodes.
-        episodes += counts_list[0].episodes
-
-    # Average costs over total number of episodes.
-    costs_final = op_dataclass(costs_total, episodes, divide)
-
-    return costs_final
+    rule_violation = np.array([costs.speed_limit, costs.wrong_way])
+    rule_violation = np.mean(rule_violation, dtype=float)
+    return rule_violation
diff --git a/smarts/env/gymnasium/wrappers/metric/metrics.py b/smarts/env/gymnasium/wrappers/metric/metrics.py
index 14fb0c0b6a..87b03d756c 100644
--- a/smarts/env/gymnasium/wrappers/metric/metrics.py
+++ b/smarts/env/gymnasium/wrappers/metric/metrics.py
@@ -46,7 +46,7 @@
 )
 from smarts.env.gymnasium.wrappers.metric.formula import FormulaBase, Score
 from smarts.env.gymnasium.wrappers.metric.params import Params
-from smarts.env.gymnasium.wrappers.metric.types import Costs, Counts, Record
+from smarts.env.gymnasium.wrappers.metric.types import Costs, Counts, Metadata, Record
 from smarts.env.gymnasium.wrappers.metric.utils import (
     add_dataclass,
     divide,
@@ -250,12 +250,14 @@ def reset(self, **kwargs):
                     }
                 })
 
+            max_episode_steps = self._scen.metadata.get("scenario_duration",0) / self.env.smarts.fixed_timestep_sec
+            max_episode_steps = max_episode_steps or self.env.agent_interfaces[agent_name].max_episode_steps
             cost_funcs_kwargs.update({
                 "dist_to_obstacles": {
                     "ignore": self._params.dist_to_obstacles.ignore
                 },
                 "steps": {
-                    "max_episode_steps": self.env.agent_interfaces[agent_name].max_episode_steps
+                    "max_episode_steps": max_episode_steps
                 },
             })
             self._cost_funcs[agent_name] = make_cost_funcs(
@@ -268,6 +270,7 @@ def reset(self, **kwargs):
                 agent_name: Record(
                     costs=Costs(),
                     counts=Counts(),
+                    metadata=Metadata(difficulty=self._scen.metadata.get("scenario_difficulty",1)),
                 )
                 for agent_name in self._cur_agents
             }
@@ -284,11 +287,11 @@ def records(self) -> Dict[str, Dict[str, Record]]:
             $ env.records()
             $ {
                   scen1: {
-                      agent1: Record(costs, counts),
-                      agent2: Record(costs, counts),
+                      agent1: Record(costs, counts, metadata),
+                      agent2: Record(costs, counts, metadata),
                   },
                   scen2: {
-                      agent1: Record(costs, counts),
+                      agent1: Record(costs, counts, metadata),
                   },
               }
 
@@ -307,6 +310,7 @@ def records(self) -> Dict[str, Dict[str, Record]]:
                         data_copy.costs, data_copy.counts.episodes, divide
                     ),
                     counts=data_copy.counts,
+                    metadata=data_copy.metadata,
                 )
 
         return records
@@ -320,8 +324,7 @@ def score(self) -> Score:
             Dict[str, float]: Contains key-value pairs denoting score
             components.
         """
-        records_sum_copy = copy.deepcopy(self._records_sum)
-        return self._formula.score(records_sum=records_sum_copy)
+        return self._formula.score(records=self.records())
 
 
 def _get_end_and_dist(
@@ -527,8 +530,17 @@ def _check_scen(scenario: Scenario, agent_interfaces: Dict[str, AgentInterface])
         agent_interfaces (Dict[str,AgentInterface]): Agent interfaces.
 
     Raises:
-        AttributeError: If any agent's mission is not of type PositionGoal.
+        MetricsError: If (i) scenario difficulty is not properly normalized,
+            or (ii) any agent's goal is improperly configured.
     """
+
+    difficulty = scenario.metadata.get("scenario_difficulty", None)
+    if not ((difficulty is None) or (0 < difficulty <= 1)):
+        raise MetricsError(
+            "Expected scenario difficulty to be normalized within (0,1], but "
+            f"got difficulty={difficulty}."
+        )
+
     goal_types = {
         agent_name: type(agent_mission.goal)
         for agent_name, agent_mission in scenario.missions.items()
@@ -545,7 +557,7 @@ def _check_scen(scenario: Scenario, agent_interfaces: Dict[str, AgentInterface])
                 and aoi != None
             )
         ):
-            raise AttributeError(
+            raise MetricsError(
                 "{0} has an unsupported goal type {1} and interest done criteria {2} "
                 "combination.".format(
                     agent_name, goal_types[agent_name], interest_criteria
diff --git a/smarts/env/gymnasium/wrappers/metric/types.py b/smarts/env/gymnasium/wrappers/metric/types.py
index 6ad22c17cd..06e97a7a0f 100644
--- a/smarts/env/gymnasium/wrappers/metric/types.py
+++ b/smarts/env/gymnasium/wrappers/metric/types.py
@@ -55,10 +55,20 @@ class Counts:
     """
 
 
+@dataclass(frozen=True)
+class Metadata:
+    """Immutable metadata stored alongside an agent's costs and counts."""
+
+    difficulty: float = 1
+    """Task difficulty, normalized to (0, 1]. Weights the agent's score.
+    """
+
+
 @dataclass
 class Record:
-    """Stores an agent's scenario-completion, performance-count, and
-    performance-cost values."""
+    """Stores an agent's performance-cost, performance-count, and
+    performance-metadata values."""
 
     costs: Costs
     counts: Counts
+    metadata: Metadata  # Record metadata, e.g., task difficulty.
diff --git a/smarts/env/tests/test_metrics.py b/smarts/env/tests/test_metrics.py
index 7893573d3f..a9d4394367 100644
--- a/smarts/env/tests/test_metrics.py
+++ b/smarts/env/tests/test_metrics.py
@@ -31,7 +31,7 @@
 from smarts.core.controllers import ActionSpaceType
 from smarts.core.coordinates import Heading, Point
 from smarts.core.plan import EndlessGoal, Goal, Mission, PositionalGoal, Start
-from smarts.env.gymnasium.wrappers.metric.metrics import Metrics
+from smarts.env.gymnasium.wrappers.metric.metrics import Metrics, MetricsError
 from smarts.zoo.agent_spec import AgentSpec
 
 
@@ -163,7 +163,7 @@ def test_reset(make_env):
             goal=EndlessGoal(),
         ),
     ):
-        with pytest.raises(AttributeError):
+        with pytest.raises(MetricsError):
             env = Metrics(env=make_env)
             env.reset()
         return
diff --git a/smarts/sstudio/types/scenario.py b/smarts/sstudio/types/scenario.py
index e0021190c0..eb0b0d5557 100644
--- a/smarts/sstudio/types/scenario.py
+++ b/smarts/sstudio/types/scenario.py
@@ -45,9 +45,9 @@ class ScenarioMetadataFields(IntEnum):
     actor_of_interest_re_filter = enum.auto()
     """Actors with names that match this pattern are actors of interest."""
     scenario_difficulty = enum.auto()
-    """Custom difficulty marking values."""
+    """Custom difficulty marking values, normalized to (0,1]."""
     scenario_duration = enum.auto()
-    """The expected scenario time length."""
+    """The expected scenario time length in seconds."""
 
 
 class ScenarioMetadata(StandardMetadata):
@@ -58,8 +58,8 @@ def __init__(
         metadata: Optional[Dict[Union[str, ScenarioMetadataFields], Any]] = None,
         *,
         actor_of_interest_re_filter: Optional[str] = None,
-        actor_of_interest_color: Optional[Colors] = Colors.Blue,
-        scenario_difficulty: Optional[int] = None,
+        actor_of_interest_color: Optional[Colors] = None,
+        scenario_difficulty: Optional[float] = None,
         scenario_duration: Optional[float] = None,
     ) -> None:
         if metadata is None: