Skip to content

Commit 58ee44b

Browse files
committed
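Address review feedback about the RNG seed: seed the sequence-distribution RNG once at construction instead of reseeding on every sample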
1 parent e057365 commit 58ee44b

File tree

1 file changed

+8
-3
lines changed

1 file changed

+8
-3
lines changed

aiperf/dataset/composer/base.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,8 @@
44
import random
55
from abc import ABC, abstractmethod
66

7+
import numpy as np
8+
79
from aiperf.common.config import UserConfig
810
from aiperf.common.enums import ModelSelectionStrategy
911
from aiperf.common.mixins import AIPerfLoggerMixin
@@ -29,6 +31,10 @@ def __init__(self, config: UserConfig, tokenizer: Tokenizer, **kwargs):
2931
# Initialize sequence distribution
3032
self._seq_distribution = config.input.prompt.get_sequence_distribution()
3133

34+
# Initialize RNG for sequence distribution sampling (avoid reseeding on each sample)
35+
seed = getattr(self.config.input, "random_seed", None)
36+
self._seq_rng = np.random.default_rng(seed) if seed is not None else None
37+
3238
@abstractmethod
3339
def create_dataset(self) -> list[Conversation]:
3440
"""
@@ -73,9 +79,8 @@ def _sample_sequence_lengths(self) -> tuple[int, int]:
7379
or max(128, self.config.input.prompt.input_tokens.mean // 2),
7480
)
7581

76-
# Use random seed from config if available for reproducible results
77-
random_seed = getattr(self.config.input, "random_seed", None)
78-
return self._seq_distribution.sample(random_state=random_seed)
82+
# Use pre-seeded RNG to avoid reseeding on each sample
83+
return self._seq_distribution.sample(random_state=self._seq_rng)
7984

8085
def _set_max_tokens(self, turn: Turn) -> None:
8186
"""Set max_tokens for the turn based on the sequence distribution or output configuration.

0 commit comments

Comments
 (0)