Skip to content

Commit cdeb6c2

Browse files
NathanHB and hynky1999 authored
fix nanotron (#283)
* fix nanotron
* fix nanotron
* fix nanotron
* Update examples/nanotron/lighteval_config_override_template.yaml

Co-authored-by: Hynek Kydlíček <[email protected]>

---------

Co-authored-by: Hynek Kydlíček <[email protected]>
1 parent 24adaa2 commit cdeb6c2

File tree

3 files changed

+36
-39
lines changed

3 files changed

+36
-39
lines changed
Lines changed: 25 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,25 @@
1-
lighteval:
2-
batch_size: 16
3-
checkpoints_path: null
4-
generation: null
5-
logging:
6-
hub_repo_details: null
7-
hub_repo_results: null
8-
hub_repo_tensorboard: HuggingFaceBR4/thomwolf-webdata-std-two
9-
local_output_path: /scratch/thomwolf/lighteval/webdata-std-two-1p82G-wet_files_1-seed-5-698496
10-
push_details_to_hub: false
11-
push_results_to_hub: false
12-
push_results_to_tensorboard: true
13-
tensorboard_metric_prefix: e
14-
parallelism:
15-
dp: 1
16-
pp: 1
17-
pp_engine: 1f1b
18-
recompute_granularity: null
19-
tp: 1
20-
tp_linear_async_communication: false
21-
tp_mode: ALL_REDUCE
22-
tasks:
23-
custom_tasks: /fsx/thomwolf/github/lighteval/tasks_examples/custom_tasks/custom_evaluation_tasks.py
24-
dataset_loading_processes: 8
25-
max_samples: 10
26-
multichoice_continuations_start_space: null
27-
num_fewshot_seeds: null
28-
tasks: early-signal
29-
# tasks: custom|hellaswag|0
1+
batch_size: 16
2+
checkpoints_path: null
3+
generation: null
4+
logging:
5+
hub_repo_details: null
6+
hub_repo_results: null
7+
hub_repo_tensorboard: null
8+
local_output_path: ./output_dir
9+
push_details_to_hub: false
10+
push_results_to_hub: false
11+
push_results_to_tensorboard: true
12+
tensorboard_metric_prefix: e
13+
parallelism:
14+
dp: 1
15+
pp: 1
16+
pp_engine: 1f1b
17+
tp: 1
18+
tp_linear_async_communication: false
19+
tp_mode: ALL_REDUCE
20+
tasks:
21+
dataset_loading_processes: 8
22+
max_samples: 10
23+
multichoice_continuations_start_space: null
24+
num_fewshot_seeds: null
25+
tasks: lighteval|gsm8k|5|1

src/lighteval/models/nanotron_model.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ def device(self) -> Union[int, str, torch.device]:
310310
return "cuda"
311311

312312
def _get_batch_size(self, max_input_length: int, override_bs: int = 0, starting_batch_size: int = 512) -> int:
313-
if override_bs > 0:
313+
if override_bs:
314314
return override_bs
315315
logger.warning("Detecting largest batch size")
316316

@@ -1155,7 +1155,9 @@ def greedy_until(
11551155
max_input_length = min(len(context_enc) + max_gen, self.max_length)
11561156

11571157
batch_size = self._get_batch_size(
1158-
override_bs=override_bs, max_input_length=max_input_length, starting_batch_size=starting_batch_size
1158+
override_bs=self._batch_size,
1159+
max_input_length=max_input_length,
1160+
starting_batch_size=starting_batch_size,
11591161
)
11601162
# For next iteration, since the batch will be smaller, we'll test a bigger batch size
11611163
starting_batch_size = batch_size * 2

src/lighteval/pipeline.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,8 @@ def __init__(
103103
tasks: str,
104104
pipeline_parameters: PipelineParameters,
105105
evaluation_tracker: EvaluationTracker,
106-
model=None,
107106
model_config=None,
107+
model=None,
108108
):
109109
if not (model or model_config):
110110
raise ValueError("Must provide either a model or model config when creating a pipeline.")
@@ -116,10 +116,9 @@ def __init__(
116116
"WARNING: --max_samples WAS SET. THESE NUMBERS ARE ONLY PARTIAL AND SHOULD NOT BE USED FOR COMPARISON UNLESS YOU KNOW WHAT YOU ARE DOING."
117117
)
118118

119-
self.accelerator, self.parallel_context = self._init_parallelism_manager()
120-
121-
self.evaluation_tracker = evaluation_tracker
122119
self.model_config = model_config
120+
self.evaluation_tracker = evaluation_tracker
121+
self.accelerator, self.parallel_context = self._init_parallelism_manager()
123122
self.model = self._init_model(model_config, model)
124123

125124
self.evaluation_tracker.general_config_logger.log_model_info(self.model.model_info)
@@ -141,9 +140,9 @@ def _init_parallelism_manager(self):
141140
raise ValueError("You are trying to launch a nanotron model, but nanotron is not installed")
142141
dist.initialize_torch_distributed()
143142
parallel_context = ParallelContext(
144-
tensor_parallel_size=self.model_config.parallelism.tp,
145-
pipeline_parallel_size=self.model_config.parallelism.pp,
146-
data_parallel_size=self.model_config.parallelism.dp,
143+
tensor_parallel_size=self.model_config.lighteval.parallelism.tp,
144+
pipeline_parallel_size=self.model_config.lighteval.parallelism.pp,
145+
data_parallel_size=self.model_config.lighteval.parallelism.dp,
147146
)
148147
test_all_gather(parallel_context=parallel_context)
149148

@@ -156,7 +155,7 @@ def _init_model(self, model_config, model):
156155
return NanotronLightevalModel(
157156
checkpoint_path=os.path.dirname(self.pipeline_parameters.nanotron_checkpoint_path),
158157
nanotron_config=self.model_config,
159-
parallel_context=self.accelerator,
158+
parallel_context=self.parallel_context,
160159
debug_one_layer_model=False,
161160
model_class=None,
162161
env_config=self.pipeline_parameters.env_config,

0 commit comments

Comments (0)