Skip to content

Commit cdeb6c2

Browse files
NathanHB and hynky1999 authored
fix nanotron (#283)
* fix nanotron
* fix nanotron
* fix nanotron
* Update examples/nanotron/lighteval_config_override_template.yaml

Co-authored-by: Hynek Kydlíček <[email protected]>

---------

Co-authored-by: Hynek Kydlíček <[email protected]>
1 parent 24adaa2 commit cdeb6c2

File tree

3 files changed

+36
-39
lines changed

3 files changed

+36
-39
lines changed
Lines changed: 25 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,25 @@
1-
lighteval:
2-
batch_size: 16
3-
checkpoints_path: null
4-
generation: null
5-
logging:
6-
hub_repo_details: null
7-
hub_repo_results: null
8-
hub_repo_tensorboard: HuggingFaceBR4/thomwolf-webdata-std-two
9-
local_output_path: /scratch/thomwolf/lighteval/webdata-std-two-1p82G-wet_files_1-seed-5-698496
10-
push_details_to_hub: false
11-
push_results_to_hub: false
12-
push_results_to_tensorboard: true
13-
tensorboard_metric_prefix: e
14-
parallelism:
15-
dp: 1
16-
pp: 1
17-
pp_engine: 1f1b
18-
recompute_granularity: null
19-
tp: 1
20-
tp_linear_async_communication: false
21-
tp_mode: ALL_REDUCE
22-
tasks:
23-
custom_tasks: /fsx/thomwolf/github/lighteval/tasks_examples/custom_tasks/custom_evaluation_tasks.py
24-
dataset_loading_processes: 8
25-
max_samples: 10
26-
multichoice_continuations_start_space: null
27-
num_fewshot_seeds: null
28-
tasks: early-signal
29-
# tasks: custom|hellaswag|0
1+
batch_size: 16
2+
checkpoints_path: null
3+
generation: null
4+
logging:
5+
hub_repo_details: null
6+
hub_repo_results: null
7+
hub_repo_tensorboard: null
8+
local_output_path: ./output_dir
9+
push_details_to_hub: false
10+
push_results_to_hub: false
11+
push_results_to_tensorboard: true
12+
tensorboard_metric_prefix: e
13+
parallelism:
14+
dp: 1
15+
pp: 1
16+
pp_engine: 1f1b
17+
tp: 1
18+
tp_linear_async_communication: false
19+
tp_mode: ALL_REDUCE
20+
tasks:
21+
dataset_loading_processes: 8
22+
max_samples: 10
23+
multichoice_continuations_start_space: null
24+
num_fewshot_seeds: null
25+
tasks: lighteval|gsm8k|5|1

src/lighteval/models/nanotron_model.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ def device(self) -> Union[int, str, torch.device]:
310310
return "cuda"
311311

312312
def _get_batch_size(self, max_input_length: int, override_bs: int = 0, starting_batch_size: int = 512) -> int:
313-
if override_bs > 0:
313+
if override_bs:
314314
return override_bs
315315
logger.warning("Detecting largest batch size")
316316

@@ -1155,7 +1155,9 @@ def greedy_until(
11551155
max_input_length = min(len(context_enc) + max_gen, self.max_length)
11561156

11571157
batch_size = self._get_batch_size(
1158-
override_bs=override_bs, max_input_length=max_input_length, starting_batch_size=starting_batch_size
1158+
override_bs=self._batch_size,
1159+
max_input_length=max_input_length,
1160+
starting_batch_size=starting_batch_size,
11591161
)
11601162
# For next iteration, since the batch will be smaller, we'll test a bigger batch size
11611163
starting_batch_size = batch_size * 2

src/lighteval/pipeline.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,8 @@ def __init__(
103103
tasks: str,
104104
pipeline_parameters: PipelineParameters,
105105
evaluation_tracker: EvaluationTracker,
106-
model=None,
107106
model_config=None,
107+
model=None,
108108
):
109109
if not (model or model_config):
110110
raise ValueError("Must provide either a model or model config when creating a pipeline.")
@@ -116,10 +116,9 @@ def __init__(
116116
"WARNING: --max_samples WAS SET. THESE NUMBERS ARE ONLY PARTIAL AND SHOULD NOT BE USED FOR COMPARISON UNLESS YOU KNOW WHAT YOU ARE DOING."
117117
)
118118

119-
self.accelerator, self.parallel_context = self._init_parallelism_manager()
120-
121-
self.evaluation_tracker = evaluation_tracker
122119
self.model_config = model_config
120+
self.evaluation_tracker = evaluation_tracker
121+
self.accelerator, self.parallel_context = self._init_parallelism_manager()
123122
self.model = self._init_model(model_config, model)
124123

125124
self.evaluation_tracker.general_config_logger.log_model_info(self.model.model_info)
@@ -141,9 +140,9 @@ def _init_parallelism_manager(self):
141140
raise ValueError("You are trying to launch a nanotron model, but nanotron is not installed")
142141
dist.initialize_torch_distributed()
143142
parallel_context = ParallelContext(
144-
tensor_parallel_size=self.model_config.parallelism.tp,
145-
pipeline_parallel_size=self.model_config.parallelism.pp,
146-
data_parallel_size=self.model_config.parallelism.dp,
143+
tensor_parallel_size=self.model_config.lighteval.parallelism.tp,
144+
pipeline_parallel_size=self.model_config.lighteval.parallelism.pp,
145+
data_parallel_size=self.model_config.lighteval.parallelism.dp,
147146
)
148147
test_all_gather(parallel_context=parallel_context)
149148

@@ -156,7 +155,7 @@ def _init_model(self, model_config, model):
156155
return NanotronLightevalModel(
157156
checkpoint_path=os.path.dirname(self.pipeline_parameters.nanotron_checkpoint_path),
158157
nanotron_config=self.model_config,
159-
parallel_context=self.accelerator,
158+
parallel_context=self.parallel_context,
160159
debug_one_layer_model=False,
161160
model_class=None,
162161
env_config=self.pipeline_parameters.env_config,

0 commit comments

Comments (0)