diff --git a/docs/content/docs/harbor/index.mdx b/docs/content/docs/harbor/index.mdx
index 9a3eaab4e4..a3e55d524c 100644
--- a/docs/content/docs/harbor/index.mdx
+++ b/docs/content/docs/harbor/index.mdx
@@ -212,7 +212,7 @@ agent:
   override_timeout_sec: 1200 # Time (seconds) given for a single Trial to run
   kwargs:
     max_turns: 32 # Max agent iterations per trial
-    store_all_messages: true # Required for SkyRL to extract training data
+    collect_rollout_details: true # Required for SkyRL to extract training data
     temperature: 1.0 # Sampling temperature (higher = more exploration)
     enable_summarize: false # Context summarization when nearing token limits
     model_info:
@@ -221,7 +221,7 @@ agent:
 ```
 
-`store_all_messages: true` is **required** for training. Without it, SkyRL cannot extract the chat history needed to compute loss masks and train the model.
+`collect_rollout_details: true` is **required** for training. It makes Harbor return the per-turn `prompt_token_ids`, `completion_token_ids`, and `logprobs` that SkyRL needs to build training data.
 
 ### Key Knobs for RL Training
 
diff --git a/examples/train_integrations/harbor/entrypoints/main_harbor_fully_async.py b/examples/train_integrations/harbor/entrypoints/main_harbor_fully_async.py
new file mode 100644
index 0000000000..76150d98ed
--- /dev/null
+++ b/examples/train_integrations/harbor/entrypoints/main_harbor_fully_async.py
@@ -0,0 +1,74 @@
+"""
+Fully-async entrypoint for training on Harbor tasks.
+
+Reuses HarborExp's generator/dataset overrides and swaps in
+``FullyAsyncRayPPOTrainer``. This is the moral equivalent of
+``examples/train/fully_async/main_fully_async.py`` for Harbor.
+"""
+
+import asyncio
+import sys
+
+import ray
+import yaml
+
+from skyrl.train.fully_async_trainer import FullyAsyncRayPPOTrainer
+from skyrl.train.utils import validate_cfg
+from skyrl.train.utils.utils import initialize_ray
+
+from .main_harbor import HARBOR_DEFAULT_CONFIG, HarborExp, HarborSkyRLConfig, _deep_merge
+
+
+class HarborFullyAsyncExp(HarborExp):
+    def get_trainer(
+        self,
+        cfg,
+        tracker,
+        tokenizer,
+        train_dataset,
+        eval_dataset,
+        inference_engine_client,
+        generator,
+        colocate_pg,
+    ):
+        return FullyAsyncRayPPOTrainer(
+            cfg=cfg,
+            tracker=tracker,
+            tokenizer=tokenizer,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            inference_engine_client=inference_engine_client,
+            generator=generator,
+            colocate_pg=colocate_pg,
+        )
+
+    def run(self):
+        trainer = self._setup_trainer()
+        asyncio.run(trainer.train())
+
+
+@ray.remote(num_cpus=1)
+def skyrl_entrypoint(cfg):
+    exp = HarborFullyAsyncExp(cfg)
+    exp.run()
+
+
+def main() -> None:
+    cfg = HarborSkyRLConfig.from_cli_overrides(sys.argv[1:])
+
+    with open(HARBOR_DEFAULT_CONFIG) as f:
+        defaults = yaml.safe_load(f)
+    cfg.harbor_trial_config = _deep_merge(defaults, cfg.harbor_trial_config)
+
+    validate_cfg(cfg)
+    if cfg.trainer.algorithm.max_seq_len is None:
+        raise ValueError(
+            "trainer.algorithm.max_seq_len must be explicitly set for Harbor training; "
+            "it is required to truncate responses to the maximum allowed length."
+ ) + initialize_ray(cfg) + ray.get(skyrl_entrypoint.remote(cfg)) + + +if __name__ == "__main__": + main() diff --git a/examples/train_integrations/harbor/harbor_generator.py b/examples/train_integrations/harbor/harbor_generator.py index 2232f11817..4841846bde 100644 --- a/examples/train_integrations/harbor/harbor_generator.py +++ b/examples/train_integrations/harbor/harbor_generator.py @@ -5,14 +5,15 @@ from loguru import logger from uuid import uuid4 from skyrl.train.generators.base import GeneratorInterface, GeneratorInput, GeneratorOutput, TrajectoryID -from skyrl.train.generators.utils import get_rollout_metrics, get_response_ids_and_loss_mask_from_messages +from skyrl.train.generators.utils import get_rollout_metrics from skyrl.backends.skyrl_train.inference_engines.inference_engine_client import InferenceEngineClient from skyrl.backends.skyrl_train.inference_engines.base import ConversationType from skyrl.train.utils.rate_limiter import create_rate_limiter from tqdm import tqdm -from omegaconf import DictConfig, OmegaConf +from omegaconf import DictConfig from harbor.trial.trial import Trial from harbor.models.trial.config import TrialConfig +from harbor.models.agent.rollout_detail import RolloutDetail # Suppress LiteLLM verbose logging @@ -28,141 +29,159 @@ MAX_NUM_RETRIES_PER_TRIAL = 2 -class ChatHistoryExtractor: - """Extracts a (chat_history, summarization_count, num_turns) tuple from Harbor trial results. +@dataclass +class HarborTrajectoryOutput: + """One trajectory's raw output from Harbor. - Supports two extraction strategies, tried in order: - 1. all_messages agents (terminus-2, terminus-1, terminus): metadata["all_messages"] - 2. Trajectory-based agents (mini-swe-agent, swe-agent, openhands): - trajectory.json converted to user/assistant messages + Holds the entire ``rollout_details`` from ``agent_result``. Per-step interpretation + (loss-mask / reward broadcast / overlong filtering) is done downstream in + `build_step_wise_generator_output`. """ - # Agents that write trajectory.json (ATIF format) instead of metadata["all_messages"]. - # OpenHands uses condensation (off-policy) - use reject_summarization=false to allow. 
- TRAJECTORY_BASED_AGENTS = frozenset( - {"mini-swe-agent", "swe-agent", "openhands", "openhands-host"}) - - @classmethod - def extract(cls, results) -> Optional[tuple]: - """Return (chat_history, summarization_count, num_turns) or None on failure.""" - agent_result = results.agent_result - if agent_result is None: - return None - - metadata = agent_result.metadata or {} - chat_history = metadata.get("all_messages") - if chat_history is not None: - return chat_history, metadata.get("summarization_count", 0), metadata.get("n_episodes", 0) - - # Fallback: load from trajectory.json or completions for trajectory-based agents - agent_name = (getattr(results.config.agent, - "name", None) or "").lower() - if agent_name not in cls.TRAJECTORY_BASED_AGENTS: - return None - - trial_path = cls._trial_path_from_uri( - getattr(results, "trial_uri", None) or "") - if trial_path is None: - return None - - trajectory_path = trial_path / "agent" / "trajectory.json" - chat_history = cls._from_atif_trajectory(trajectory_path) - if chat_history is None: - return None - - # Trajectory-based agents don't track summarization; use 0 for strictly appending - return chat_history, 0, cls._count_turns(chat_history) - - # ------------------------------------------------------------------ - # Private helpers - # ------------------------------------------------------------------ - - @staticmethod - def _count_turns(messages: List[dict]) -> int: - return sum(1 for m in messages if m["role"] == "assistant") - - @staticmethod - def _trial_path_from_uri(trial_uri: str) -> Optional[Path]: - """Extract local filesystem path from trial_uri (e.g. file:///path/to/trial).""" - if not trial_uri: - return None - try: - parsed = urlparse(trial_uri) - if parsed.scheme == "file" and parsed.path: - return Path(parsed.path) - except Exception: - pass - return None - - @classmethod - def _from_atif_trajectory(cls, trajectory_path: Path) -> Optional[List[dict]]: - """Convert ATIF trajectory JSON to user/assistant chat messages for SkyRL training. - - Handles system steps (prepended to first user message), agent observations - (converted to user messages for alternating user/assistant pattern), and - tool_calls (serialized into assistant content). - """ - if not trajectory_path.exists(): - return None - try: - with open(trajectory_path) as f: - data = json.load(f) - except Exception as e: - logger.warning( - f"Failed to load trajectory from {trajectory_path}: {e}") - return None - - messages: List[dict] = [] - pending_system: List[str] = [] - - for step in data.get("steps", []): - source = step.get("source", "") - message = step.get("message", "") - observation = step.get("observation") - - if source == "system": - if message: - pending_system.append(message) - continue - - if source == "user": - content = message or "" - if pending_system: - content = "\n\n".join(pending_system) + "\n\n" + content - pending_system = [] - messages.append({"role": "user", "content": content}) - - elif source == "agent": - content = message or "" - if step.get("tool_calls"): - content = content + "\n" + \ - json.dumps({"tool_calls": step["tool_calls"]}) - if not content: - continue - messages.append({"role": "assistant", "content": content}) - - # Observations represent environment feedback; emit as user message - # to maintain the alternating user/assistant pattern required for RL. 
- if observation and observation.get("results"): - obs_parts = [r.get("content", "") - for r in observation["results"] if r.get("content")] - if obs_parts: - messages.append( - {"role": "user", "content": "\n".join(obs_parts)}) - - return messages if messages else None - - -@dataclass -class HarborAgentOutput: - response_ids: List[int] - reward: float - stop_reason: str - loss_mask: List[int] - prompt_ids: List[int] trajectory_id: TrajectoryID - summarization_count: Optional[int] = None - num_turns: Optional[int] = None + # Entire rollout_details list as returned by harbor's agent_result. None for failed trajectories + # (agent_timeout / error) that we will mask in `build_step_wise_generator_output`. + rollout_details: Optional[List[RolloutDetail]] = None + reward: float = 0.0 + num_turns: int = 0 + # One of: "complete", "context_length", "agent_timeout", "error". Used by + # `build_step_wise_generator_output` to decide whether to skip the entire prompt group. + stop_reason: str = "complete" + + +def build_step_wise_generator_output( + trajectory_outputs: List[HarborTrajectoryOutput], overlong_filtering: bool +) -> GeneratorOutput: + """Flatten per-trajectory rollout details into one entry per LLM turn. + + Steps for one trajectory are emitted contiguously and the last step has + ``is_last_step=True``. Failures (timeout / unknown error / empty rollout + details) are batched per ``instance_id``: if any rollout for prompt P + failed, all rollouts for P are replaced with single zeroed-out + placeholder steps. + """ + # 1. Identify failed instances. If any rollout for prompt P failed, mask all rollouts for P (conservative). + timeout_instance_ids = set() + error_instance_ids = set() + all_instance_ids = set() + num_timeout_trajectories = 0 + num_error_trajectories = 0 + for traj in trajectory_outputs: + instance_id = traj.trajectory_id.instance_id + all_instance_ids.add(instance_id) + if traj.stop_reason == "agent_timeout": + num_timeout_trajectories += 1 + timeout_instance_ids.add(instance_id) + elif traj.stop_reason == "error" or traj.rollout_details is None: + num_error_trajectories += 1 + error_instance_ids.add(instance_id) + masked_instance_ids = timeout_instance_ids | error_instance_ids + + # 2. Walk trajectories and emit one entry of GeneratorOutput per step. + prompt_token_ids: List[List[int]] = [] + response_ids: List[List[int]] = [] + rewards: List[float] = [] + loss_masks: List[List[int]] = [] + stop_reasons: List[str] = [] + is_last_step_list: List[bool] = [] + out_trajectory_ids: List[TrajectoryID] = [] + rollout_logprobs_list: List[List[float]] = [] + + successful_trajectories: List[HarborTrajectoryOutput] = [] + response_ids_for_metrics: List[List[int]] = [] + rewards_for_metrics: List[float] = [] + for traj in trajectory_outputs: + tid = traj.trajectory_id + + # 2.1. For failed trajectories, set loss mask to [0] and stop reason to "error". + if tid.instance_id in masked_instance_ids: + prompt_token_ids.append([0]) + response_ids.append([0]) + rewards.append(0.0) + loss_masks.append([0]) + stop_reasons.append("error") + is_last_step_list.append(True) + out_trajectory_ids.append(tid) + rollout_logprobs_list.append([0.0]) + continue + + # 2.2. For successful trajectories, emit one entry per step. + successful_trajectories.append(traj) + + # 2.3. Check rollout_details expected format. + # Expect no summarization; rollout_details is a single linear chat segment from the main agent. + # TODO(Charlie): Support summarization. 
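+        # Illustrative shape sketch (an assumption inferred from the accesses below, not
+        # Harbor's documented schema): each RolloutDetail maps a field name to a per-turn list,
+        # indexed by the turn number t of the trajectory, e.g.
+        #   rollout_detail["prompt_token_ids"][t]     -> List[int], turn t's full prompt
+        #   rollout_detail["completion_token_ids"][t] -> List[int], turn t's completion
+        #   rollout_detail["logprobs"][t]             -> List[float], one logprob per completion token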
+        assert len(traj.rollout_details) == 1, f"Expected exactly one rollout segment, got {len(traj.rollout_details)}."
+        rollout_detail = traj.rollout_details[0]
+        prompt_token_ids_per_turn = rollout_detail["prompt_token_ids"]
+        completion_token_ids_per_turn = rollout_detail["completion_token_ids"]
+        logprobs_per_turn = rollout_detail["logprobs"]
+        n_turns = len(completion_token_ids_per_turn)
+        assert len(prompt_token_ids_per_turn) == n_turns and len(logprobs_per_turn) == n_turns, (
+            f"Malformed rollout_details (prompts={len(prompt_token_ids_per_turn)}, completions={n_turns}, "
+            f"logprobs={len(logprobs_per_turn)})."
+        )
+
+        # 2.4. Emit one entry per step, following SkyRL's step-wise convention.
+        for t in range(n_turns):
+            comp_ids = completion_token_ids_per_turn[t]
+            p_ids = prompt_token_ids_per_turn[t]
+            lp = logprobs_per_turn[t]
+            assert len(lp) == len(comp_ids), "logprobs and completion token ids must have the same length."
+
+            # Record the actual reward on the last turn, and zero on all other turns.
+            is_last = t == n_turns - 1
+            reward = traj.reward if is_last else 0.0
+
+            # Loss mask.
+            step_loss_mask = [1] * len(comp_ids)
+            step_stop_reason = "complete"
+            if traj.stop_reason == "context_length":
+                step_stop_reason = "context_length"
+                if overlong_filtering:
+                    step_loss_mask = [0] * len(comp_ids)
+
+            prompt_token_ids.append(p_ids)
+            response_ids.append(comp_ids)
+            rewards.append(reward)
+            loss_masks.append(step_loss_mask)
+            stop_reasons.append(step_stop_reason)
+            is_last_step_list.append(is_last)
+            out_trajectory_ids.append(tid)
+            rollout_logprobs_list.append(lp)
+
+        # 2.5. For trajectory-level metrics, record the last turn's prompt IDs and response IDs,
+        # which together contain the entire trajectory.
+        response_ids_for_metrics.append(prompt_token_ids_per_turn[-1] + completion_token_ids_per_turn[-1])
+        rewards_for_metrics.append(traj.reward)
+
+    # 3. Aggregate trajectory-level metrics for logging.
+    if successful_trajectories:
+        rollout_metrics = get_rollout_metrics(response_ids_for_metrics, rewards_for_metrics)
+        rollout_metrics["generate/trajectories_context_length_exceeded"] = sum(
+            1 for t in successful_trajectories if t.stop_reason == "context_length"
+        )
+        rollout_metrics["generate/avg_num_turns"] = sum(t.num_turns for t in successful_trajectories) / len(
+            successful_trajectories
+        )
+    else:
+        rollout_metrics = {}
+
+    rollout_metrics["generate/num_timeout_trajectories"] = num_timeout_trajectories
+    rollout_metrics["generate/num_error_trajectories"] = num_error_trajectories
+    rollout_metrics["generate/num_masked_instances"] = len(masked_instance_ids)
+
+    return GeneratorOutput(
+        prompt_token_ids=prompt_token_ids,
+        response_ids=response_ids,
+        rewards=rewards,
+        loss_masks=loss_masks,
+        stop_reasons=stop_reasons,
+        rollout_metrics=rollout_metrics,
+        rollout_logprobs=rollout_logprobs_list,
+        trajectory_ids=out_trajectory_ids,
+        is_last_step=is_last_step_list,
+    )
 
 
 class HarborGenerator(GeneratorInterface):
@@ -188,8 +207,16 @@ def __init__(
         self.tokenizer = tokenizer
         self.max_seq_len = max_seq_len
 
-        # Harbor config template - users can specify any Harbor TrialConfig options in YAML or command line.
-        # SkyRL injects: model_name and api_base (once at init), task.path and session_id (per trial)
+        if not getattr(generator_cfg, "step_wise_trajectories", False):
+            raise ValueError(
+                "HarborGenerator only supports step-wise training. " "Set generator.step_wise_trajectories=true."
+ ) + if not getattr(generator_cfg, "merge_stepwise_output", False): + logger.warning( + "merge_stepwise_output=true is not set; will not merge step-wise outputs. This " + "may result in much slower training." + ) + self._harbor_trial_config_template = deepcopy(harbor_cfg) # Set model_name and api_base once (constant across all trials) @@ -202,26 +229,29 @@ def __init__( ] = f"hosted_vllm/{ie_cfg.served_model_name}" self._harbor_trial_config_template["agent"].setdefault("kwargs", {})["api_base"] = f"{self.base_url}/v1" + # Step-wise needs per-turn token IDs and logprobs from vLLM via Harbor. + agent_kwargs = self._harbor_trial_config_template["agent"]["kwargs"] + if not agent_kwargs.get("collect_rollout_details", False): + logger.warning("step_wise_trajectories=true requires collect_rollout_details=true; enabling automatically.") + agent_kwargs["collect_rollout_details"] = True + + # Can support summarization in future. + if agent_kwargs.get("enable_summarize", False): + raise ValueError( + "step_wise_trajectories=true is incompatible with enable_summarize=true. " + "Set harbor_trial_config.agent.kwargs.enable_summarize=false." + ) + logger.info( f"HarborGenerator initialized with Harbor config. " f"Agent: {self._harbor_trial_config_template.get('agent', {}).get('name')}, " f"Trials dir: {self._harbor_trial_config_template.get('trials_dir', 'trials')}" ) - # Read custom chat template - custom_chat_template_path = ie_cfg.engine_init_kwargs.get("chat_template", None) - if custom_chat_template_path: - with open(custom_chat_template_path, "r") as f: - self.custom_chat_template_content = f.read() - logger.info(f"HarborGenerator initialized with custom chat template read from: {custom_chat_template_path}") - else: - self.custom_chat_template_content = None - - # Initialize rate limiter from generator config (not part of Harbor TrialConfig) rate_limit_config = getattr(generator_cfg, "rate_limit", None) self._rate_limiter = create_rate_limiter(rate_limit_config) - async def generate(self, input_batch: GeneratorInput) -> GeneratorOutput: + async def generate(self, input_batch: GeneratorInput, disable_tqdm: bool = False) -> GeneratorOutput: prompts = input_batch["prompts"] trajectory_ids = input_batch["trajectory_ids"] @@ -229,11 +259,12 @@ async def generate(self, input_batch: GeneratorInput) -> GeneratorOutput: raise ValueError("`trajectory_ids` is required in the input batch") if len(prompts) != len(trajectory_ids): raise ValueError( - f"Prompt count ({len(prompts)}) doesn't match " f"trajectory_ids count ({len(trajectory_ids)})" + f"Prompt count ({len(prompts)}) doesn't match trajectory_ids count ({len(trajectory_ids)})" ) - all_outputs: List[HarborAgentOutput] = [None] * len(prompts) # type: ignore[list-item] + all_outputs: List[HarborTrajectoryOutput] = [None] * len(prompts) # type: ignore[list-item] progress = tqdm( + disable=disable_tqdm, # disable for fully async training total=len(prompts), desc="Generating Trajectories", miniters=max(1, len(prompts) // 10), @@ -241,7 +272,7 @@ async def generate(self, input_batch: GeneratorInput) -> GeneratorOutput: ) async def _worker(idx, prompt, trajectory_id): - result = await self.harbor_agent_loop(prompt=prompt, trajectory_id=trajectory_id) + result = await self._harbor_agent_loop(prompt=prompt, trajectory_id=trajectory_id) all_outputs[idx] = result progress.update(1) @@ -251,110 +282,27 @@ async def _worker(idx, prompt, trajectory_id): tg.create_task(_worker(idx, prompt, trajectory_id)) finally: progress.close() - all_outputs, rollout_metrics = 
self._mask_failed_instances_and_compute_metrics(all_outputs) - - generator_output: GeneratorOutput = { - "prompt_token_ids": [output.prompt_ids for output in all_outputs], - "response_ids": [output.response_ids for output in all_outputs], - "rewards": [output.reward for output in all_outputs], - "loss_masks": [output.loss_mask for output in all_outputs], - "stop_reasons": [output.stop_reason for output in all_outputs], - "rollout_metrics": rollout_metrics, - "rollout_logprobs": None, - } - - return generator_output - - @staticmethod - def _mask_failed_instances_and_compute_metrics( - all_outputs: List[HarborAgentOutput], - ) -> tuple[List[HarborAgentOutput], dict]: - """Mutates all_outputs in-place: zeros out every output belonging to a failed instance. - - For a group of trajectories (n_samples_per_prompt for the same prompt), - if one trajectory fails we skip training the entire group. - - Returns: - all_outputs: The same list, with failed-instance outputs zeroed out. - rollout_metrics: Dict of rollout metrics for logging. - """ - # Count failures by type before grouping overwrites stop_reason. - num_timeout_trajectories = 0 - num_error_trajectories = 0 - timeout_instance_ids = set() - error_instance_ids = set() - all_instance_ids = set() - for output in all_outputs: - cur_instance_id = output.trajectory_id.instance_id - all_instance_ids.add(cur_instance_id) - if output.stop_reason == "agent_timeout": - num_timeout_trajectories += 1 - timeout_instance_ids.add(cur_instance_id) - elif output.stop_reason == "error": - num_error_trajectories += 1 - error_instance_ids.add(cur_instance_id) - - masked_instance_ids = timeout_instance_ids | error_instance_ids - - # Zero-out all outputs belonging to any timeout or error instance so we skip training on them. - successful_outputs: List[HarborAgentOutput] = [] - for output in all_outputs: - if output.trajectory_id.instance_id in masked_instance_ids: - output.response_ids = [0] - output.stop_reason = "error" - output.loss_mask = [0] - output.prompt_ids = [0] - output.reward = 0 - else: - successful_outputs.append(output) - - # Rollout metrics for successful outputs. - if len(successful_outputs) > 0: - rollout_metrics = get_rollout_metrics( - [output.response_ids for output in successful_outputs], - [output.reward for output in successful_outputs], - ) - rollout_metrics["generate/trajectories_summarized"] = sum( - 1 for output in successful_outputs if output.summarization_count > 0 - ) - rollout_metrics["generate/trajectories_context_length_exceeded"] = sum( - 1 for output in successful_outputs if output.stop_reason == "context_length" - ) - rollout_metrics["generate/avg_num_turns"] = sum(output.num_turns for output in successful_outputs) / len( - successful_outputs - ) - else: - rollout_metrics = {} - - # Failure metrics: timeout vs unknown error trajectories, and masked instances. 
- rollout_metrics["generate/num_timeout_trajectories"] = num_timeout_trajectories - rollout_metrics["generate/num_error_trajectories"] = num_error_trajectories - rollout_metrics["generate/num_masked_instances"] = len(masked_instance_ids) - logger.info( - f"\n# of masked instances: {len(masked_instance_ids)} / {len(all_instance_ids)}\n" - f"# of timeout trajectories: {num_timeout_trajectories}\n" - f"# of error trajectories: {num_error_trajectories}" + return build_step_wise_generator_output( + all_outputs, overlong_filtering=self.generator_cfg.apply_overlong_filtering ) - return all_outputs, rollout_metrics - - async def harbor_agent_loop( + async def _harbor_agent_loop( self, prompt: ConversationType, trajectory_id: TrajectoryID, - ) -> HarborAgentOutput: + ) -> HarborTrajectoryOutput: + """Run a single Harbor trial and return the rollout details plus a trajectory-level reward. + Retries on unknown errors; context length errors train with reward=0; agent timeouts mask the trajectory. """ - Run a single harbor agent. - """ - # Run the trial to get `reward`, `chat_history`, `summarization_count`, and `num_turns` reward = None - chat_history = None - summarization_count = None + results = None + rollout_details = None num_turns = None successful = False is_context_length_error = False is_agent_timeout_error = False + for i in range(MAX_NUM_RETRIES_PER_TRIAL): prefix = f"Trajectory {trajectory_id} attempt {i+1}/{MAX_NUM_RETRIES_PER_TRIAL}" results = None @@ -374,101 +322,52 @@ async def harbor_agent_loop( is_context_length_error = exc_type == "ContextLengthExceededError" is_agent_timeout_error = exc_type == "AgentTimeoutError" - # --- Determine reward --- + # Determine reward. if is_agent_timeout_error: # AgentTimeoutError: not successful, no retry, loss-masked logger.debug(f"{prefix} hit AgentTimeoutError (no retry). Results: {results}") break elif is_context_length_error: # ContextLengthExceededError: always train with reward=0. - logger.debug( - f"{prefix} hit ContextLengthExceededError, will train with reward=0. Results: {results}" - ) - reward = 0 + logger.debug(f"{prefix} hit ContextLengthExceededError, setting reward=0. Results: {results}") + reward = 0.0 elif not results.verifier_result: # Does not have a verifier result, so it's not successful, will retry logger.warning(f"{prefix} failed: Exception info: {results.exception_info}. Results: {results}") continue else: - reward = results.verifier_result.rewards["reward"] + reward = float(results.verifier_result.rewards["reward"]) - # --- Extract chat history and check for success --- - chat_history = results.agent_result.metadata["all_messages"] - summarization_count = results.agent_result.metadata["summarization_count"] + # Extract rollout details and check for success + rollout_details = results.agent_result.rollout_details num_turns = results.agent_result.metadata["n_episodes"] - if len(chat_history) > 1 and chat_history[0]["role"] == "user": + + if ( + rollout_details + and len(rollout_details) >= 1 + and len(rollout_details[0].get("completion_token_ids", [])) > 0 + ): successful = True - logger.debug(f"{prefix} successful: reward={reward}. Results: {results}") + logger.debug(f"{prefix} successful: reward={reward}.") break else: - logger.warning( - f"{prefix} failed: Did not return a chat history with a user message. chat_history: {chat_history}\nResults: {results}" - ) + logger.warning(f"{prefix} failed: empty/missing rollout_details. 
Results: {results}") except Exception as e: logger.warning(f"{prefix} failed: Error running trial: {e}. Results: {results}") continue if not successful: - # We make loss mask 0 so it does not contribute to model updates stop_reason = "agent_timeout" if is_agent_timeout_error else "error" error_message = f"Trajectory {trajectory_id} failed (stop_reason={stop_reason}), will set loss mask to [0]." if stop_reason == "error": error_message += f" Results: {results}" logger.warning(error_message) - return HarborAgentOutput( - response_ids=[0], - reward=0, - stop_reason=stop_reason, - loss_mask=[0], - prompt_ids=[0], + return HarborTrajectoryOutput(trajectory_id=trajectory_id, rollout_details=None, stop_reason=stop_reason) + else: + return HarborTrajectoryOutput( trajectory_id=trajectory_id, + rollout_details=rollout_details, + reward=reward, + num_turns=num_turns, + stop_reason="context_length" if is_context_length_error else "complete", ) - - # Use the first message as the prompt. We assume to be no systems messages. - assert chat_history[0]["role"] == "user", "The first message should be a user message" - prompt = [chat_history[0]] - prompt_ids = self.tokenizer.apply_chat_template( - prompt, - add_generation_prompt=False, # the message below will add it themselves - return_dict=False, - tokenize=True, - chat_template=self.custom_chat_template_content, - ) - initial_prompt_length = len(prompt_ids) - - # Process response messages (everything after the first message) - response_messages = chat_history[1:] - assistant_logprobs = getattr(results.agent_result, "output_logprobs", None) - response_ids, loss_mask, rollout_logprobs = get_response_ids_and_loss_mask_from_messages( - response_messages, self.tokenizer, assistant_logprobs, chat_template=self.custom_chat_template_content - ) - - # Determine stop reason - max_response_tokens = max(0, self.max_seq_len - initial_prompt_length) - if is_context_length_error or len(response_ids) > max_response_tokens: - stop_reason = "context_length" - else: - stop_reason = "complete" - - # Apply overlong filtering. - # TODO(Charlie): should this also apply when the end reason is max_turns in Harbor? - # Revisit. We would like to reuse `utils.py`'s implementation for overlong filtering. - if self.generator_cfg.apply_overlong_filtering and stop_reason == "context_length": - loss_mask = [0] * len(loss_mask) - - # Truncate to maximum allowed length. - # NOTE(Charlie): though it shouldn't happen since it'd reach `ContextLengthExceededError` - # from Harbor first. We do it anyway to be safe. 
- response_ids = response_ids[:max_response_tokens] - loss_mask = loss_mask[:max_response_tokens] - - return HarborAgentOutput( - response_ids=response_ids, - reward=reward, - stop_reason=stop_reason, - loss_mask=loss_mask, - prompt_ids=prompt_ids, - trajectory_id=trajectory_id, - summarization_count=summarization_count, - num_turns=num_turns, - ) diff --git a/examples/train_integrations/harbor/harbor_trial_config/default.yaml b/examples/train_integrations/harbor/harbor_trial_config/default.yaml index e38d069294..9d2a920a0c 100644 --- a/examples/train_integrations/harbor/harbor_trial_config/default.yaml +++ b/examples/train_integrations/harbor/harbor_trial_config/default.yaml @@ -44,8 +44,8 @@ agent: # Whether to enable context summarization when approaching token limits enable_summarize: false - # Store all messages in the trial output (required for SkyRL training) - store_all_messages: true + # Collect per-turn rollout details (required for step-wise training) + collect_rollout_details: true # The only sampling param that directly gets passed to Terminus temperature: 1.0 diff --git a/examples/train_integrations/harbor/harbor_trial_config/openhands.yaml b/examples/train_integrations/harbor/harbor_trial_config/openhands.yaml deleted file mode 100644 index c84e6b73e9..0000000000 --- a/examples/train_integrations/harbor/harbor_trial_config/openhands.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# @package harbor_trial_config -# -# OpenHands agent configuration for SkyRL RL training - -reject_summarization: true - -# Harbor TrialConfig fields below -# -------------------------------- - -trials_dir: ~/trials -timeout_multiplier: 1.0 - -agent: - name: openhands - override_timeout_sec: 1800 - - kwargs: - max_turns: 32 - suppress_max_turns_warning: true - enable_plan_mode: false - - # Text-based tool invocation: model generates , etc. in raw text. - # Required for RL training (preserves raw LLM output for proper tokenization). - disable_tool_calls: false - - # Preserve raw LLM responses in trajectory for accurate RL training. - trajectory_config: - raw_content: false - - # Disable reasoning effort to avoid thinking tokens. - #reasoning_effort: null - - temperature: 1.0 - - # Model info for token budgeting. - # NOTE: max_input_tokens should match +generator.engine_init_kwargs.max_model_len - # NOTE: max_output_tokens must be < max_input_tokens to leave room for the prompt, - # otherwise every LLM call triggers ContextWindowExceededError. - model_info: - max_input_tokens: 32768 - max_output_tokens: 4096 - input_cost_per_token: 0.0 - output_cost_per_token: 0.0 - -environment: - type: docker - - # OpenHands needs more resources than terminus-2 (runs its own venv, tools, etc.) 
-  override_cpus: 2
-  override_memory_mb: 4096
-  suppress_override_warnings: true
-
-verifier:
-  disable: false
diff --git a/examples/train_integrations/harbor/run_codecontest.sh b/examples/train_integrations/harbor/run_codecontest.sh
index eac1fc77ce..38ab9db573 100644
--- a/examples/train_integrations/harbor/run_codecontest.sh
+++ b/examples/train_integrations/harbor/run_codecontest.sh
@@ -22,30 +22,41 @@ EVAL_DATA="['$DATA_DIR/OpenThoughts-TB-dev']"
 #-----------------------
 # Directory setup
 #-----------------------
 RUN_NAME="codecontest"
-TRIALS_DIR="$HOME/$RUN_NAME/trials_run"
-CKPTS_DIR="$HOME/$RUN_NAME/ckpts"
-EXPORTS_DIR="$HOME/$RUN_NAME/exports"
-LOG_DIR="/tmp/skyrl-logs/$RUN_NAME"
+STORAGE_ROOT="/mnt/local_storage/$RUN_NAME"
+TRIALS_DIR="$STORAGE_ROOT/trials_run"
+CKPTS_DIR="$STORAGE_ROOT/ckpts"
+EXPORTS_DIR="$STORAGE_ROOT/exports"
+LOG_DIR="$STORAGE_ROOT/logs"
 
 #-----------------------
 # Training setup
 #-----------------------
+N_SAMPLES_PER_PROMPT=8
 MINI_BATCH_SIZE=32
 MAX_MODEL_LEN=32768
-APPLY_OVERLONG_FILTERING=true
 
-# Dr. GRPO parameters
-LOSS_REDUCTION="seq_mean_token_sum_norm"
+# Algorithmic parameters
+LOSS_REDUCTION="token_mean" # with step-wise training, we have to use token_mean to be prefix-merge-invariant
 GRPO_NORM_BY_STD=false
 USE_KL_LOSS=false
+APPLY_OVERLONG_FILTERING=true
 
-# Essentially achieves interleaved thinking and hence on-policy training without step-wise training.
+# Essentially achieves interleaved thinking (does not strip thinking tokens), which lets step-wise
+# training merge more step-wise outputs and hence speeds up training.
+# If you train a different model, switch to a matching chat template and adjust the settings
+# as needed.
 CHAT_TEMPLATE_PATH="$(dirname "$0")/../../../skyrl/train/utils/templates/qwen3_acc_thinking.jinja2"
 
+# TIS corrections
+TIS_TYPE=token
+TIS_IMP_RATIO_CAP=2.0
+
 #----------------
 # Infrastructure setup
 #----------------
-NUM_GPUS=8
+NUM_POLICY_GPUS=8
+NUM_INFERENCE_ENGINES=4
+TP_SIZE=2
 ENABLE_RATE_LIMITING=true # Enable rate/concurrency limiting for trajectory submissions
 TRAJECTORIES_PER_SECOND=5 # Maximum trajectories per second (must be >= 1.0, fractional values like 1.5 are supported). null or omit to disable rate limiting
 MAX_CONCURRENCY=512 # Maximum concurrent trial.run() calls allowed (must be >= 1). 
null or omit to disable concurrency limiting @@ -64,14 +75,16 @@ uv run --isolated --extra fsdp --extra harbor -m examples.train_integrations.har trainer.algorithm.loss_reduction=$LOSS_REDUCTION \ trainer.algorithm.grpo_norm_by_std=$GRPO_NORM_BY_STD \ trainer.algorithm.use_kl_loss=$USE_KL_LOSS \ + trainer.algorithm.off_policy_correction.tis_ratio_type=$TIS_TYPE \ + trainer.algorithm.off_policy_correction.token_tis_ratio_clip_high=$TIS_IMP_RATIO_CAP \ trainer.placement.colocate_all=true \ trainer.strategy=fsdp2 \ trainer.placement.policy_num_nodes=1 \ trainer.placement.ref_num_nodes=1 \ - trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \ - trainer.placement.ref_num_gpus_per_node=$NUM_GPUS \ - generator.inference_engine.num_engines=$NUM_GPUS \ - generator.inference_engine.tensor_parallel_size=1 \ + trainer.placement.policy_num_gpus_per_node=$NUM_POLICY_GPUS \ + trainer.placement.ref_num_gpus_per_node=$NUM_POLICY_GPUS \ + generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \ + generator.inference_engine.tensor_parallel_size=$TP_SIZE \ generator.inference_engine.engine_init_kwargs.chat_template=$CHAT_TEMPLATE_PATH \ generator.inference_engine.engine_init_kwargs.max_model_len=$MAX_MODEL_LEN \ generator.inference_engine.engine_init_kwargs.enable_log_requests=false \ @@ -85,11 +98,14 @@ uv run --isolated --extra fsdp --extra harbor -m examples.train_integrations.har trainer.micro_forward_batch_size_per_gpu=1 \ trainer.micro_train_batch_size_per_gpu=1 \ trainer.ckpt_interval=5 \ + trainer.max_ckpts_to_keep=5 \ trainer.hf_save_interval=5 \ trainer.algorithm.max_seq_len=$MAX_MODEL_LEN \ trainer.policy.optimizer_config.lr=1.0e-6 \ - generator.n_samples_per_prompt=8 \ - generator.eval_n_samples_per_prompt=4 \ + generator.step_wise_trajectories=true \ + generator.merge_stepwise_output=true \ + generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \ + generator.eval_n_samples_per_prompt=2 \ generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \ generator.inference_engine.gpu_memory_utilization=0.8 \ trainer.logger=wandb \ diff --git a/examples/train_integrations/harbor/run_codecontest_fully_async.sh b/examples/train_integrations/harbor/run_codecontest_fully_async.sh new file mode 100644 index 0000000000..1c7a564613 --- /dev/null +++ b/examples/train_integrations/harbor/run_codecontest_fully_async.sh @@ -0,0 +1,139 @@ +set -ex + +# wandb api key. +# export WANDB_API_KEY=YOUR_KEY_HERE + +# Pick the sandbox provider and provide the credentials. 
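+# (Only one provider is needed: set credentials for whichever sandbox your Harbor
+# trial config uses; the Daytona and Modal variables below are examples.)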
+# export DAYTONA_API_KEY=YOUR_KEY_HERE
+# export MODAL_TOKEN_ID=YOUR_KEY_HERE
+# export MODAL_TOKEN_SECRET=YOUR_KEY_HERE
+
+#-----------------------
+# Dataset setup
+#-----------------------
+# Prepare datasets first (downloads from HuggingFace and extracts tasks):
+# uv run examples/train_integrations/harbor/prepare_harbor_dataset.py --dataset open-thoughts/CodeContests
+# uv run examples/train_integrations/harbor/prepare_harbor_dataset.py --dataset open-thoughts/OpenThoughts-TB-dev
+DATA_DIR="$HOME/data/harbor"
+TRAIN_DATA="['$DATA_DIR/CodeContests']"
+EVAL_DATA="['$DATA_DIR/OpenThoughts-TB-dev']"
+
+#-----------------------
+# Directory setup
+#-----------------------
+RUN_NAME="codecontest-fullyasync"
+STORAGE_ROOT="/mnt/local_storage/$RUN_NAME"
+TRIALS_DIR="$STORAGE_ROOT/trials_run"
+CKPTS_DIR="$STORAGE_ROOT/ckpts"
+EXPORTS_DIR="$STORAGE_ROOT/exports"
+LOG_DIR="$STORAGE_ROOT/logs"
+
+#-----------------------
+# Training setup
+#-----------------------
+N_SAMPLES_PER_PROMPT=8
+MINI_BATCH_SIZE=16
+MAX_MODEL_LEN=32768
+
+# Algorithmic parameters
+LOSS_REDUCTION="token_mean" # with step-wise training, we have to use token_mean to be prefix-merge-invariant
+GRPO_NORM_BY_STD=false
+USE_KL_LOSS=false
+APPLY_OVERLONG_FILTERING=true
+
+# Essentially achieves interleaved thinking (does not strip thinking tokens), which lets step-wise
+# training merge more step-wise outputs and hence speeds up training.
+# If you train a different model, switch to a matching chat template and adjust the settings
+# as needed.
+CHAT_TEMPLATE_PATH="$(dirname "$0")/../../../skyrl/train/utils/templates/qwen3_acc_thinking.jinja2"
+
+# TIS corrections
+TIS_TYPE=token
+TIS_IMP_RATIO_CAP=2.0
+
+# -------------------------
+# Fully-async knobs.
+# All knobs are tuned for one 8xH100 node running Qwen3-8B; adjust for your setup.
+# Constraint: mini_batch_size <= num_parallel_generation_workers <= mini_batch_size * (max_staleness_steps + 1)
+# You can increase num_parallel_generation_workers based on your hardware resources (e.g. KV cache size).
+# -------------------------
+MAX_STALENESS_STEPS=4
+NUM_PARALLEL_GENERATION_WORKERS=$(( MINI_BATCH_SIZE * 2 ))
+
+#----------------
+# Infrastructure setup.
+# All knobs are tuned for one 8xH100 node running Qwen3-8B; adjust for your setup.
+#----------------
+NUM_INFERENCE_ENGINES=2
+TP_SIZE=2
+NUM_POLICY_GPUS=4
+ENABLE_RATE_LIMITING=true # Enable rate/concurrency limiting for trajectory submissions
+TRAJECTORIES_PER_SECOND=5 # Maximum trajectories per second (must be >= 1.0, fractional values like 1.5 are supported). null or omit to disable rate limiting
+MAX_CONCURRENCY=128 # Maximum concurrent trial.run() calls allowed (must be >= 1). 
null or omit to disable concurrency limiting + +# Run SkyRL command +uv run --isolated --extra fsdp --extra harbor -m examples.train_integrations.harbor.entrypoints.main_harbor_fully_async \ + data.train_data=$TRAIN_DATA \ + data.val_data=$EVAL_DATA \ + trainer.policy.model.path=Qwen/Qwen3-8B \ + generator.inference_engine.served_model_name=Qwen3-8B \ + harbor_trial_config.trials_dir=$TRIALS_DIR \ + trainer.export_path=$EXPORTS_DIR \ + trainer.ckpt_path=$CKPTS_DIR \ + trainer.log_path=$LOG_DIR \ + trainer.fully_async.max_staleness_steps=$MAX_STALENESS_STEPS \ + trainer.fully_async.num_parallel_generation_workers=$NUM_PARALLEL_GENERATION_WORKERS \ + trainer.algorithm.advantage_estimator=grpo \ + trainer.algorithm.loss_reduction=$LOSS_REDUCTION \ + trainer.algorithm.grpo_norm_by_std=$GRPO_NORM_BY_STD \ + trainer.algorithm.use_kl_loss=$USE_KL_LOSS \ + trainer.algorithm.off_policy_correction.tis_ratio_type=$TIS_TYPE \ + trainer.algorithm.off_policy_correction.token_tis_ratio_clip_high=$TIS_IMP_RATIO_CAP \ + trainer.placement.colocate_all=false \ + trainer.strategy=fsdp2 \ + trainer.placement.policy_num_nodes=1 \ + trainer.placement.ref_num_nodes=1 \ + trainer.placement.policy_num_gpus_per_node=$NUM_POLICY_GPUS \ + trainer.placement.ref_num_gpus_per_node=$NUM_POLICY_GPUS \ + generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \ + generator.inference_engine.tensor_parallel_size=$TP_SIZE \ + generator.inference_engine.engine_init_kwargs.chat_template=$CHAT_TEMPLATE_PATH \ + generator.inference_engine.engine_init_kwargs.max_model_len=$MAX_MODEL_LEN \ + generator.inference_engine.engine_init_kwargs.enable_log_requests=false \ + trainer.epochs=3 \ + trainer.eval_batch_size=128 \ + trainer.eval_before_train=false \ + trainer.eval_interval=100 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=$MINI_BATCH_SIZE \ + trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \ + trainer.micro_forward_batch_size_per_gpu=1 \ + trainer.micro_train_batch_size_per_gpu=1 \ + trainer.ckpt_interval=5 \ + trainer.max_ckpts_to_keep=5 \ + trainer.hf_save_interval=5 \ + trainer.algorithm.max_seq_len=$MAX_MODEL_LEN \ + trainer.policy.optimizer_config.lr=1.0e-6 \ + generator.step_wise_trajectories=true \ + generator.merge_stepwise_output=true \ + generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \ + generator.eval_n_samples_per_prompt=2 \ + generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \ + generator.inference_engine.gpu_memory_utilization=0.9 \ + trainer.logger=wandb \ + trainer.project_name=harbor \ + trainer.run_name=$RUN_NAME \ + trainer.resume_mode=latest \ + generator.inference_engine.backend=vllm \ + generator.inference_engine.run_engines_locally=true \ + generator.inference_engine.weight_sync_backend=nccl \ + generator.inference_engine.async_engine=true \ + generator.batched=false \ + generator.inference_engine.enforce_eager=false \ + generator.inference_engine.enable_http_endpoint=true \ + generator.inference_engine.http_endpoint_host=127.0.0.1 \ + generator.inference_engine.http_endpoint_port=8000 \ + generator.rate_limit.enabled=$ENABLE_RATE_LIMITING \ + generator.rate_limit.trajectories_per_second=$TRAJECTORIES_PER_SECOND \ + generator.rate_limit.max_concurrency=$MAX_CONCURRENCY \ + "$@" diff --git a/examples/train_integrations/harbor/run_codecontest_openhands.sh b/examples/train_integrations/harbor/run_codecontest_openhands.sh deleted file mode 100644 index bed0f51d27..0000000000 --- a/examples/train_integrations/harbor/run_codecontest_openhands.sh +++ /dev/null @@ -1,135 
+0,0 @@ -set -ex - -# wandb api key. -# export WANDB_API_KEY=YOUR_KEY_HERE - -# Pick the sandbox provider and provide the credentials. -# export DAYTONA_API_KEY=YOUR_KEY_HERE -# export MODAL_TOKEN_ID=YOUR_KEY_HERE -# export MODAL_TOKEN_SECRET=YOUR_KEY_HERE - -# ---- OpenHands-specific env vars ---- -# Disable condensation to ensure strictly-appending chat history for RL. -# The Harbor OpenHands agent forwards OPENHANDS_* env vars (stripping prefix). -export OPENHANDS_ENABLE_DEFAULT_CONDENSER=false -# Disable history truncation to prevent infinite condensation loops when context -# is exceeded. With this off, ContextWindowExceededError is raised cleanly instead -# of looping through condenser requests that can never reduce essential events. -export OPENHANDS_AGENT_ENABLE_HISTORY_TRUNCATION=false - -#----------------------- -# vLLM endpoint for Docker containers -#----------------------- -# OpenHands runs inside Docker containers (not on the host). The containers reach -# the host's vLLM server via the Docker bridge gateway (172.17.0.1 on Linux). -# Override VLLM_API_BASE if your Docker bridge uses a different gateway IP. -VLLM_PORT=8000 -VLLM_API_BASE="${VLLM_API_BASE:-http://172.17.0.1:${VLLM_PORT}/v1}" -echo "vLLM API base for Docker containers: $VLLM_API_BASE" - -#----------------------- -# Dataset setup -#----------------------- -# Prepare datasets first (downloads from HuggingFace and extracts tasks): -# uv run examples/train_integrations/harbor/prepare_harbor_dataset.py --dataset open-thoughts/CodeContests -# uv run examples/train_integrations/harbor/prepare_harbor_dataset.py --dataset open-thoughts/OpenThoughts-TB-dev -DATA_DIR="$HOME/data/harbor" -TRAIN_DATA="['$DATA_DIR/CodeContests']" -EVAL_DATA="['$DATA_DIR/OpenThoughts-TB-dev']" - -#----------------------- -# Directory setup -#----------------------- -RUN_NAME="codecontest-openhands" -TRIALS_DIR="$HOME/$RUN_NAME/trials_run" -CKPTS_DIR="$HOME/$RUN_NAME/ckpts" -EXPORTS_DIR="$HOME/$RUN_NAME/exports" -# Logs (trainer + tee) go under my_logs/ in the repo root when run from SkyRL-main. -LOG_DIR="my_logs/$RUN_NAME" -mkdir -p "$LOG_DIR" -# To save the full run log when you interrupt: ... 2>&1 | stdbuf -oL tee "$LOG_DIR/training.log" - -#----------------------- -# Training setup -#----------------------- -MINI_BATCH_SIZE=2 -MAX_MODEL_LEN=16384 -APPLY_OVERLONG_FILTERING=true - -# Dr. GRPO parameters -LOSS_REDUCTION="seq_mean_token_sum_norm" -GRPO_NORM_BY_STD=false -USE_KL_LOSS=false - -CHAT_TEMPLATE_PATH="$(dirname "$0")/../../../skyrl/train/utils/templates/qwen3_acc_thinking.jinja2" - -#---------------- -# Infrastructure setup -#---------------- -NUM_GPUS=1 -ENABLE_RATE_LIMITING=true -# OpenHands trials are heavier than terminus-2 but Docker runs locally. 
-TRAJECTORIES_PER_SECOND=2 -MAX_CONCURRENCY=4 - -# Run SkyRL command with OpenHands agent -uv run --isolated --extra fsdp --extra harbor -m examples.train_integrations.harbor.entrypoints.main_harbor \ - data.train_data=$TRAIN_DATA \ - data.val_data=$EVAL_DATA \ - trainer.policy.model.path=Qwen/Qwen3-1.7B \ - generator.served_model_name=Qwen3-1.7B \ - hydra.searchpath=['file://examples/train_integrations/harbor'] \ - +harbor_trial_config=openhands \ - ++harbor_trial_config.trials_dir=$TRIALS_DIR \ - trainer.export_path=$EXPORTS_DIR \ - trainer.ckpt_path=$CKPTS_DIR \ - trainer.log_path=$LOG_DIR \ - trainer.algorithm.advantage_estimator=grpo \ - trainer.algorithm.loss_reduction=$LOSS_REDUCTION \ - trainer.algorithm.grpo_norm_by_std=$GRPO_NORM_BY_STD \ - trainer.algorithm.use_kl_loss=$USE_KL_LOSS \ - trainer.placement.colocate_all=true \ - trainer.strategy=fsdp2 \ - trainer.placement.policy_num_nodes=1 \ - trainer.placement.ref_num_nodes=1 \ - trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \ - trainer.placement.ref_num_gpus_per_node=$NUM_GPUS \ - generator.num_inference_engines=$NUM_GPUS \ - generator.inference_engine_tensor_parallel_size=1 \ - +generator.engine_init_kwargs.chat_template=$CHAT_TEMPLATE_PATH \ - +generator.engine_init_kwargs.max_model_len=$MAX_MODEL_LEN \ - +generator.engine_init_kwargs.enable_log_requests=false \ - trainer.epochs=1 \ - trainer.eval_batch_size=128 \ - trainer.eval_before_train=false \ - trainer.eval_interval=20 \ - trainer.update_epochs_per_batch=1 \ - trainer.train_batch_size=$MINI_BATCH_SIZE \ - trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \ - trainer.micro_forward_batch_size_per_gpu=1 \ - trainer.micro_train_batch_size_per_gpu=1 \ - trainer.ckpt_interval=5 \ - trainer.hf_save_interval=5 \ - trainer.algorithm.max_seq_len=$MAX_MODEL_LEN \ - trainer.policy.optimizer_config.lr=1.0e-6 \ - generator.n_samples_per_prompt=8 \ - generator.eval_n_samples_per_prompt=4 \ - generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \ - generator.gpu_memory_utilization=0.5 \ - trainer.logger=wandb \ - trainer.project_name=harbor \ - trainer.run_name=$RUN_NAME \ - trainer.resume_mode=latest \ - generator.backend=vllm \ - generator.run_engines_locally=true \ - generator.weight_sync_backend=nccl \ - generator.async_engine=true \ - generator.batched=false \ - generator.enforce_eager=false \ - generator.enable_http_endpoint=true \ - generator.http_endpoint_host=0.0.0.0 \ - generator.http_endpoint_port=8000 \ - ++harbor_trial_config.agent.kwargs.api_base="${VLLM_API_BASE}" \ - +generator.rate_limit.enabled=$ENABLE_RATE_LIMITING \ - +generator.rate_limit.trajectories_per_second=$TRAJECTORIES_PER_SECOND \ - +generator.rate_limit.max_concurrency=$MAX_CONCURRENCY diff --git a/skyrl/train/fully_async_trainer.py b/skyrl/train/fully_async_trainer.py index 63284da242..a720565290 100644 --- a/skyrl/train/fully_async_trainer.py +++ b/skyrl/train/fully_async_trainer.py @@ -649,6 +649,7 @@ def convert_generation_group_mini_batch_to_training_input( ) assert generator_output["rollout_metrics"] is not None, "Rollout metrics should be non-null." 
self.all_metrics.update(generator_output["rollout_metrics"]) + generator_output.pop("rollout_metrics", None) # Log staleness statistics for this step self.all_metrics.update( diff --git a/skyrl/train/generators/utils.py b/skyrl/train/generators/utils.py index ea908cbe9a..9bd9f4bf8b 100644 --- a/skyrl/train/generators/utils.py +++ b/skyrl/train/generators/utils.py @@ -274,6 +274,25 @@ def concatenate_generator_outputs(generator_outputs: List[GeneratorOutput], step # Re-aggregate rollout metrics rollout_metrics = get_rollout_metrics(result["response_ids"], result["rewards"]) + + # Preserve generator-specific metrics from per-group rollout_metrics. get_rollout_metrics only + # computes basic stats (response length, reward); generators may add custom keys, which we + # aggregate by inferring from the key name. TODO(Charlie): hacky, to be removed soon. + extra_keys: dict = {} + for go in generator_outputs: + per_group = go.get("rollout_metrics") or {} + for k, v in per_group.items(): + if k not in rollout_metrics and isinstance(v, (int, float)): + extra_keys.setdefault(k, []).append(v) + for k, values in extra_keys.items(): + if "avg" in k or "mean" in k: + rollout_metrics[k] = sum(values) / len(values) + elif "min" in k: + rollout_metrics[k] = min(values) + elif "max" in k: + rollout_metrics[k] = max(values) + else: + rollout_metrics[k] = sum(values) result["rollout_metrics"] = rollout_metrics # Validate the generator output using the number of prompts @@ -607,7 +626,8 @@ def _slice_generator_output(generator_output: GeneratorOutput, indices: List[int sliced: GeneratorOutput = {} for key, value in generator_output.items(): if key == "rollout_metrics": - sliced[key] = value + # Skip since metrics are already recorded before calling `merge_stepwise_output()`. + continue elif value is None: sliced[key] = None else: @@ -716,7 +736,6 @@ def flush(): "rewards": out_rewards, "loss_masks": out_loss_masks, "stop_reasons": out_stop_reasons, - "rollout_metrics": gen_out.get("rollout_metrics", None), "rollout_logprobs": out_logprobs, "trajectory_ids": out_trajectory_ids, "rollout_expert_indices": None, @@ -740,6 +759,9 @@ def merge_stepwise_output(generator_output: GeneratorOutput) -> GeneratorOutput: When the prefix condition fails between two consecutive turns, the current merge group is flushed and a new group starts (greedy merging). + The returned GeneratorOutput's rollout_metrics should be ignored. We already recorded it before + calling this function. + Args: generator_output: Step-wise GeneratorOutput with trajectory_ids and is_last_step. @@ -765,5 +787,5 @@ def merge_stepwise_output(generator_output: GeneratorOutput) -> GeneratorOutput: start = i + 1 merged_slices = [_merge_single_trajectory(s) for s in trajectory_slices] - # concatenate_generator_outputs re-aggregates rollout_metrics and validates + return concatenate_generator_outputs(merged_slices, step_wise=True) diff --git a/skyrl/train/trainer.py b/skyrl/train/trainer.py index af1b65ed11..727e078cce 100644 --- a/skyrl/train/trainer.py +++ b/skyrl/train/trainer.py @@ -743,6 +743,7 @@ async def generate( # add rollout metrics to self.all_metrics if generator_output["rollout_metrics"] is not None: self.all_metrics.update(generator_output["rollout_metrics"]) + generator_output.pop("rollout_metrics", None) validate_generator_output( len(input_batch["prompts"]),