diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx
index 0123c7ead..9d4bb8632 100644
--- a/docs/source/use-vllm-as-backend.mdx
+++ b/docs/source/use-vllm-as-backend.mdx
@@ -58,3 +58,32 @@ model: # Model specific parameters
 > [!WARNING]
 > In the case of OOM issues, you might need to reduce the context size of the
 > model as well as reduce the `gpu_memory_utilization` parameter.
+
+
+## Dynamically changing the metric configuration
+
+For special kinds of metrics like `Pass@K` or LiveCodeBench's `codegen` metric, you may need to pass specific values such as the number of
+generations. This can be done in the `yaml` file in the following way:
+
+```yaml
+model: # Model specific parameters
+  base_params:
+    model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B,revision=main,dtype=bfloat16" # Model args that you would pass in the command line
+  generation: # Generation specific parameters
+    temperature: 0.3
+    repetition_penalty: 1.0
+    frequency_penalty: 0.0
+    presence_penalty: 0.0
+    seed: 42
+    top_k: 0
+    min_p: 0.0
+    top_p: 0.9
+  metric_options: # Optional metric arguments
+    codegen_pass@1:16:
+      num_samples: 16
+```
+
+The optional `metric_options` key takes the name of one or more metrics,
+as defined by `Metric.metric_name`.
+In this example, the `codegen_pass@1:16` metric defined in our tasks will have its `num_samples` set to 16,
+regardless of the number defined by default.
diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py
index 6d66faac8..8750cd476 100644
--- a/src/lighteval/main_vllm.py
+++ b/src/lighteval/main_vllm.py
@@ -134,9 +134,11 @@ def vllm(
         with open(model_args, "r") as f:
             config = yaml.safe_load(f)["model"]
         model_args = config["base_params"]["model_args"]
+        metric_options = config.get("metric_options", {})
         generation_parameters = GenerationParameters.from_dict(config)
     else:
-        generation_parameters = GenerationParameters()
+        generation_parameters = GenerationParameters.from_model_args(model_args)
+        metric_options = {}
 
     model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}
     model_config = VLLMModelConfig(**model_args_dict, generation_parameters=generation_parameters)
@@ -146,6 +148,7 @@ def vllm(
         pipeline_parameters=pipeline_params,
         evaluation_tracker=evaluation_tracker,
         model_config=model_config,
+        metric_options=metric_options,
     )
 
     pipeline.evaluate()
diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py
index b6859232e..2c4e9ab65 100644
--- a/src/lighteval/models/model_input.py
+++ b/src/lighteval/models/model_input.py
@@ -59,6 +59,34 @@ def from_dict(cls, config_dict: dict):
         """
         return GenerationParameters(**config_dict.get("generation", {}))
 
+    @classmethod
+    def from_model_args(cls, model_args: str):
+        """Creates a GenerationParameters object from a model_args string.
+
+        It's used when the model args are passed as a string on the command line.
+        The generation parameters must use the following format (anywhere in the string):
+        "generation_parameters={key1:value1,key2:value2}"
+
+        Args:
+            model_args (str): A string like the following:
+                "pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=float16,max_model_length=32768,generation_parameters={temperature:0.7,top_p:0.95}"
+        """
+
+        def parse_model_args(model_args):
+            import json
+            import re
+
+            pattern = re.compile(r"(\w+)=(\{.*\}|[^,]+)")
+            matches = pattern.findall(model_args)
+            for key, value in matches:
+                key = key.strip()
+                if key == "generation_parameters":
+                    gen_params = re.sub(r"(\w+):", r'"\1":', value)
+                    return json.loads(gen_params)
+
+        params: dict = parse_model_args(model_args) or {}
+        return GenerationParameters(**params)
+
     def to_litellm_dict(self) -> dict:
         """Selects relevant generation and sampling parameters for litellm models.
         Doc: https://docs.litellm.ai/docs/completion/input#input-params-1
diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py
index 0e6282ef5..04f14edd1 100644
--- a/src/lighteval/pipeline.py
+++ b/src/lighteval/pipeline.py
@@ -132,6 +132,7 @@ def __init__(
         evaluation_tracker: EvaluationTracker,
         model_config=None,
         model=None,
+        metric_options=None,
     ):
         if not (model or model_config):
             raise ValueError("Must provide either a model or model config when creating a pipeline.")
@@ -145,6 +146,7 @@ def __init__(
 
         self.model_config = model_config
         self.evaluation_tracker = evaluation_tracker
+        self._metric_options = metric_options or {}
         self.accelerator, self.parallel_context = self._init_parallelism_manager()
         self.model = self._init_model(model_config, model)
@@ -209,6 +211,10 @@ def _init_tasks_and_requests(self, tasks: str):
         )
         task_names_list, fewshots_dict = taskinfo_selector(tasks, registry)
         task_dict = registry.get_task_dict(task_names_list)
+        # If metric_options were defined in the yaml file,
+        # check whether any task metric needs to be updated.
+        if self._metric_options:
+            self._update_num_samples(task_dict)
         LightevalTask.load_datasets(list(task_dict.values()), self.pipeline_parameters.dataset_loading_processes)
         self.evaluation_tracker.task_config_logger.log(task_dict)
@@ -230,6 +236,19 @@ def _init_tasks_and_requests(self, tasks: str):
         self.requests = requests
         self.docs = docs

+    def _update_num_samples(self, task_dict: dict[str, LightevalTask]):
+        """Helper to update a metric's `num_samples` from the yaml file.
+        The number of samples is defined at the metric level, so the update is done per metric:
+        for every metric listed under `metric_options` that defines `num_samples`, the task's
+        `num_samples` is overridden with that value. Since the maximum over `num_samples` is
+        taken later when constructing the requests, this override is safe.
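+
+        Example:
+            With `metric_options = {"codegen_pass@1:16": {"num_samples": 16}}` (the values from the docs yaml example),
+            every task exposing the `codegen_pass@1:16` metric gets its `num_samples` set to `[16]`.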
+ """ + for _, task in task_dict.items(): + for metric in task.metrics: + if metric_data := self._metric_options.get(metric.metric_name, None): + num_samples = metric_data.get("num_samples", None) + if num_samples: + task.num_samples = [num_samples] + def _init_random_seeds(self): logger.info("--- INIT SEEDS ---") random.seed(1234) diff --git a/src/lighteval/tasks/extended/__init__.py b/src/lighteval/tasks/extended/__init__.py index 3e9135938..0e132f017 100644 --- a/src/lighteval/tasks/extended/__init__.py +++ b/src/lighteval/tasks/extended/__init__.py @@ -25,12 +25,13 @@ if can_load_extended_tasks(): import lighteval.tasks.extended.ifeval.main as ifeval + import lighteval.tasks.extended.lcb.main as lcb import lighteval.tasks.extended.mix_eval.main as mix_eval import lighteval.tasks.extended.mt_bench.main as mt_bench import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks - AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench] + AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, lcb] else: AVAILABLE_EXTENDED_TASKS_MODULES = [] diff --git a/src/lighteval/tasks/extended/lcb/codegen_metrics.py b/src/lighteval/tasks/extended/lcb/codegen_metrics.py new file mode 100644 index 000000000..61cfb9c40 --- /dev/null +++ b/src/lighteval/tasks/extended/lcb/codegen_metrics.py @@ -0,0 +1,673 @@ +# MIT License + +# Copyright (c) 2025 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+"""This module contains helper functions copied and modified from +https://github.com/LiveCodeBench/LiveCodeBench +and +https://github.com/QwenLM/Qwen2.5-Coder/tree/main/qwencoder-eval/instruct/livecode_bench +""" + +import ast +import base64 +import faulthandler +import json +import multiprocessing +import os +import pickle +import platform + +# to run the solution files we're using a timing based approach +import signal +import sys +import time +import zlib +from concurrent.futures import ProcessPoolExecutor, as_completed +from decimal import Decimal +from enum import Enum +from io import StringIO +from types import ModuleType +from typing import Callable, Optional + +# used for testing the code that reads from input +from unittest.mock import mock_open, patch + +import numpy as np +from tqdm import tqdm + + +sys.set_int_max_str_digits(50000) + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +import_string = "from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(50000)\n" + + +class CODE_TYPE(Enum): + call_based = 0 + standard_input = 1 + + +# stuff for setting up signal timer +class TimeoutException(Exception): + pass + + +def timeout_handler(signum, frame): + print("timeout occured: alarm went off") + raise TimeoutException + + +# used to capture stdout as a list +# from https://stackoverflow.com/a/16571630/6416660 +# alternative use redirect_stdout() from contextlib +class Capturing(list): + def __enter__(self): + self._stdout = sys.stdout + sys.stdout = self._stringio = StringIO() + # Make closing the StringIO a no-op + self._stringio.close = lambda x: 1 + return self + + def __exit__(self, *args): + self.append(self._stringio.getvalue()) + del self._stringio # free up some memory + sys.stdout = self._stdout + + +def clean_if_name(code: str) -> str: + try: + astree = ast.parse(code) + last_block = astree.body[-1] + if isinstance(last_block, ast.If): + condition = last_block.test + if ast.unparse(condition).strip() == "__name__ == '__main__'": + code = ( + ast.unparse(astree.body[:-1]) + "\n" + ast.unparse(last_block.body) # type: ignore + ) + except Exception: + pass + + return code + + +def make_function(code: str) -> str: + try: + import_stmts = [] + all_other_stmts = [] + astree = ast.parse(code) + for stmt in astree.body: + if isinstance(stmt, (ast.Import, ast.ImportFrom)): + import_stmts.append(stmt) + else: + all_other_stmts.append(stmt) + + function_ast = ast.FunctionDef( + name="wrapped_function", + args=ast.arguments(posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]), + body=all_other_stmts, + decorator_list=[], + lineno=-1, + ) + main_code = ( + import_string + + "\n" + + ast.unparse(import_stmts) # type: ignore + + "\n" + + ast.unparse(function_ast) # type: ignore + ) + return main_code + except Exception: + return code + + +def call_method(method: Callable, inputs: list | str): + if isinstance(inputs, list): + inputs = "\n".join(inputs) + + 
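+    # The generated program expects to read from stdin: the patches below feed it `inputs`
+    # through mocked `open`, `sys.stdin`, `readline`, `readlines`, and `read`.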
inputs_line_iterator = iter(inputs.split("\n")) + + @patch("builtins.open", mock_open(read_data=inputs)) + @patch("sys.stdin", StringIO(inputs)) + @patch("sys.stdin.readline", lambda *args: next(inputs_line_iterator)) + @patch("sys.stdin.readlines", lambda *args: inputs.split("\n")) + @patch("sys.stdin.read", lambda *args: inputs) + def _inner_call_method(_method): + try: + return _method() + except SystemExit: + pass + finally: + pass + + return _inner_call_method(method) + + +def get_function(compiled_sol, fn_name: str) -> str: # type: ignore + try: + assert hasattr(compiled_sol, fn_name) + return getattr(compiled_sol, fn_name) + except Exception: + return + + +def compile_code(code: str, timeout: int) -> ModuleType: + signal.alarm(timeout) + try: + tmp_sol = ModuleType("tmp_sol", "") + exec(code, tmp_sol.__dict__) + if "class Solution" in code: + # leetcode wraps solutions in `Solution` + # this is a hack to check if it is leetcode solution or not + # currently livecodebench only supports LeetCode but + # else condition allows future extensibility to other platforms + compiled_sol = tmp_sol.Solution() + else: + # do nothing in the other case since function is accesible + compiled_sol = tmp_sol + + assert compiled_sol is not None + finally: + signal.alarm(0) + + return compiled_sol + + +def convert_line_to_decimals(line: str) -> tuple[bool, list[Decimal]]: + try: + decimal_line = [Decimal(elem) for elem in line.split()] + except Exception: + return False, [] + return True, decimal_line + + +def get_stripped_lines(val: str) -> list[str]: + # you don't want empty lines to add empty list after splitlines! + val = val.strip() + + return [val_line.strip() for val_line in val.split("\n")] + + +def grade_call_based(code: str, all_inputs: list, all_outputs: list, fn_name: str, timeout: int) -> list[int | bool]: + # call-based clean up logic + # need to wrap in try-catch logic after to catch the correct errors, but for now this is fine. + code = import_string + "\n\n" + code + compiled_sol = compile_code(code, timeout) + if compiled_sol is None: + # The code could not be compiled as the text was maybe malformed, return [-4] error code. + return [-4] + + method = get_function(compiled_sol, fn_name) + if method is None: + # The function to evaluate could not be extracted from the code, return [-4] error code. 
+ return [-4] + + all_inputs = [[json.loads(line) for line in inputs.split("\n")] for inputs in all_inputs] + + all_outputs = [json.loads(output) for output in all_outputs] + + total_execution = 0 + all_results = [] + for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)): + signal.alarm(timeout) + faulthandler.enable() + try: + # can lock here so time is useful + start = time.time() + prediction = method(*gt_inp) + total_execution += time.time() - start + signal.alarm(0) + + # don't penalize model if it produces tuples instead of lists + # ground truth sequences are not tuples + if isinstance(prediction, tuple): + prediction = list(prediction) + + tmp_result = prediction == gt_out + + all_results.append(tmp_result) + + if not tmp_result: + return all_results + + except Exception as e: + signal.alarm(0) + if "timeoutexception" in repr(e).lower(): + all_results.append(-3) + return all_results + + else: + all_results.append(-4) + return all_results + + finally: + signal.alarm(0) + faulthandler.disable() + + return all_results + + +def grade_stdio( # noqa: C901 + code: str, + all_inputs: list, + all_outputs: list, + timeout: int, +) -> list[int | bool]: + # runtime doesn't interact well with __name__ == '__main__' + code = clean_if_name(code) + + # we wrap the given code inside another function + code = make_function(code) + + compiled_sol = compile_code(code, timeout) + if compiled_sol is None: + # The code could not be compiled as the text was maybe malformed, return [-4] error code. + return [-4] + + method = get_function(compiled_sol, "wrapped_function") + + if method is None: + # The function to evaluate could not be extracted from the code, return [-4] error code. + return [-4] + + all_results = [] + total_execution_time = 0 + for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)): + signal.alarm(timeout) + faulthandler.enable() + + signal.alarm(timeout) + with Capturing() as captured_output: + try: + start = time.time() + call_method(method, gt_inp) + total_execution_time += time.time() - start + # reset the alarm + signal.alarm(0) + except Exception as e: + signal.alarm(0) + if "timeoutexception" in repr(e).lower(): + all_results.append(-3) + return all_results + + else: + all_results.append(-4) + return all_results + + finally: + signal.alarm(0) + faulthandler.disable() + + prediction = captured_output[0] + + stripped_prediction_lines = get_stripped_lines(prediction) + stripped_gt_out_lines = get_stripped_lines(gt_out) + + if len(stripped_prediction_lines) != len(stripped_gt_out_lines): + all_results.append(-2) + return all_results + + for output_line_idx, ( + stripped_prediction_line, + stripped_gt_out_line, + ) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)): + # CASE 1: exact match + if stripped_prediction_line == stripped_gt_out_line: + continue + + # CASE 2: element-wise comparision + # if there are floating elements + # use `decimal` library for good floating point comparision + # otherwise gotcha: np.isclose(50000000000000000, 50000000000000001) = True + # note that we should always be able to convert to decimals + + success, decimal_prediction_line = convert_line_to_decimals(stripped_prediction_line) + if not success: + all_results.append(-2) + return all_results + + success, decimal_gtout_line = convert_line_to_decimals(stripped_gt_out_line) + if not success: + all_results.append(-2) + return all_results + + if decimal_prediction_line == decimal_gtout_line: + continue + + all_results.append(-2) + return all_results + 
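+        # Every output line matched (exactly or after Decimal conversion): this test case passes.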
all_results.append(True) + + return all_results + + +def run_test(sample: dict[str, str], test=None, timeout: int = 6) -> list[int | bool]: + """If test(generated_code) is not None it'll try to run the code. + otherwise it'll just return an input and output pair. + The return codes are all for errors. A True value means the function worked, and a False + value means the function did not work. + """ + signal.signal(signal.SIGALRM, timeout_handler) + + # Disable functionalities that can make destructive changes to the test. + # max memory is set to 4GB + reliability_guard() + + try: + in_outs = json.loads(sample["input_output"]) + except ValueError: + in_outs = None + + if in_outs: + if in_outs.get("fn_name") is None: + which_type = CODE_TYPE.standard_input # Standard input + method_name = None + + else: + which_type = CODE_TYPE.call_based # Call-based + method_name = in_outs["fn_name"] + + if test is None: + # assert False, "should not happen: test code is none" + return in_outs + + if which_type == CODE_TYPE.call_based: + signal.alarm(timeout) + try: + return grade_call_based( + code=test, + all_inputs=in_outs["inputs"], + all_outputs=in_outs["outputs"], + fn_name=method_name, + timeout=timeout, + ) + + except Exception: + return [-4] + + finally: + signal.alarm(0) + + elif which_type == CODE_TYPE.standard_input: + signal.alarm(timeout) + try: + return grade_stdio( + code=test, + all_inputs=in_outs["inputs"], + all_outputs=in_outs["outputs"], + timeout=timeout, + ) + + except Exception: + return [-4] + + finally: + signal.alarm(0) + + return [-4] + + +def reliability_guard(maximum_memory_bytes: Optional[int] = None): + """ + This disables various destructive functions and prevents the generated code + from interfering with the test (e.g. fork bomb, killing other processes, + removing filesystem files, etc.) + WARNING + This function is NOT a security sandbox. Untrusted code, including, model- + generated code, should not be blindly executed outside of one. See the + Codex paper for more information about OpenAI's code sandbox, and proceed + with caution. 
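+    Note that the memory limits below are applied only when `maximum_memory_bytes` is given;
+    by default this function only disables destructive os/shutil/subprocess entry points and
+    blocks a handful of module imports.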
+ """ + + if maximum_memory_bytes is not None: + import resource + + resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)) + resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)) + if not platform.uname().system == "Darwin": + resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)) + + faulthandler.disable() + + import builtins + + # builtins.exit = None + builtins.quit = None + + import os + + os.environ["OMP_NUM_THREADS"] = "1" + + os.kill = None + os.system = None + os.putenv = None + os.remove = None + os.removedirs = None + os.rmdir = None + os.fchdir = None + os.setuid = None + os.fork = None + os.forkpty = None + os.killpg = None + os.rename = None + os.renames = None + os.truncate = None + os.replace = None + os.unlink = None + os.fchmod = None + os.fchown = None + os.chmod = None + os.chown = None + os.chroot = None + os.fchdir = None + os.lchflags = None + os.lchmod = None + os.lchown = None + os.getcwd = None + os.chdir = None + + import shutil + + shutil.rmtree = None + shutil.move = None + shutil.chown = None + + import subprocess + + subprocess.Popen = None # type: ignore + + __builtins__["help"] = None + + import sys + + sys.modules["ipdb"] = None + sys.modules["joblib"] = None + sys.modules["resource"] = None + sys.modules["psutil"] = None + sys.modules["tkinter"] = None + + +def check_correctness(sample, generation, timeout: int) -> list[int | bool]: + """Check correctness of code generation with a global timeout. + The global timeout is to catch some extreme/rare cases not handled by the timeouts + inside `run_test`. + """ + + def _temp_run(sample, generation, result): + result.append(run_test(sample, test=generation, timeout=timeout)) + + manager = multiprocessing.Manager() + result = manager.list() + p = multiprocessing.Process(target=_temp_run, args=(sample, generation, result)) + p.start() + p.join(timeout=(timeout + 1) * len(json.loads(sample["input_output"])["inputs"]) + 5) + if p.is_alive(): + p.kill() + if not result: + in_outs = json.loads(sample["input_output"]) + # consider that all tests failed + result = [[-1 for i in range(len(in_outs["inputs"]))]] + + return result[0] + + +def evaluate_generations_by_problem(args: list[list]) -> list: + problem_generations: list[str] = args[0] + sample = args[1] + timeout: int = args[2] + + res = [] + for o_idx, o in enumerate(problem_generations): + curr_res = [-2] + try: + curr_res = check_correctness(sample, o, timeout=timeout) + fixed = [] + for e in curr_res: + if isinstance(e, np.ndarray): + e = e.item(0) + if isinstance(e, np.bool_): + e = bool(e) + fixed.append(e) + curr_res = fixed + + except Exception as e: + # The compilation failed, keep going + pass + + finally: + assert isinstance(curr_res, list), curr_res + res.append(curr_res) + + return res + + +def evaluate_generations( + samples_list: list, + generations_list: list[list[str]], + num_process_evaluate: int = 16, + timeout=6, +) -> dict[int, list[int | bool]]: + """We take the list of code generations and try to compile them + and the run their corresponding unit tests which are retrieved from the APPS dataset. 
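+    (In this integration the unit tests come from each sample's `input_output` field, built in
+    `codegen_metric`, rather than from APPS.)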
+ + Args: + generations: list of code generations (same order as samples in APPS dataset) + level: difficulty level used in the generation, can be "all", "introductory", "interview" or "competition" + + Returns: + results: dictionary of results, key is the problem index, value is a list of results for each generation + [-2] = compile error, [-1] = runtime error [False] = failed test case [True] = passed test case + """ + + # generations are code generations in the same order of the dataset + + inputs = [ + [(generations_list[index], samples_list[index], timeout), index] for index in range(len(generations_list)) + ] + + with tqdm(total=len(inputs)) as pbar: + with ProcessPoolExecutor(max_workers=num_process_evaluate) as executor: + futures = {executor.submit(evaluate_generations_by_problem, arg): index for arg, index in inputs} + + results = {} + for future in as_completed(futures): + index = futures[future] + results[index] = future.result() + pbar.update(1) + + assert len(results) == len(inputs), f"results = {len(results)} inputs = {len(inputs)} {results=}" + + return results + + +def codegen_metrics( + samples, + generations, + k_list=[1, 5], + num_process_evaluate=16, + timeout=6, +): + results = evaluate_generations( + samples, + generations, + num_process_evaluate=num_process_evaluate, + timeout=timeout, + ) + metrics = compute_metrics_from_results(results, k_list=k_list) + + return metrics, results + + +def estimate_pass_at_k(num_samples: np.ndarray, num_correct: np.ndarray, k: int) -> np.ndarray: + """Estimates pass@k of each problem and returns them in an array.""" + + def estimator(n: int, c: int, k: int) -> float: + """Calculates 1 - comb(n - c, k) / comb(n, k).""" + if n - c < k: + return 1.0 + return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) + + import itertools + + if isinstance(num_samples, int): + num_samples_it = itertools.repeat(num_samples, len(num_correct)) + else: + assert len(num_samples) == len(num_correct) + num_samples_it = iter(num_samples) + + return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]) + + +def compute_metrics_from_results(results, k_list: list[int] = [1, 5]) -> dict[str, np.ndarray]: + total = [] + correct = [] + task_ids = [] + for task_id, res in results.items(): + all_correct = [] + for generation in res: + gen = np.array(generation) + all_correct.append(np.all(gen > 0)) + task_ids.append(task_id) + total.append(len(all_correct)) + correct.append(sum(all_correct)) + total = np.array(total) + correct = np.array(correct) + detail_pass_at_k = { + f"pass@{k}": estimate_pass_at_k(total, correct, k).tolist() for k in k_list if (total >= k).all() + } + pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() for k in k_list if (total >= k).all()} + detail_metrics = {k: dict(zip(task_ids, v)) for k, v in detail_pass_at_k.items()} + pass_at_k["detail"] = detail_metrics + return pass_at_k + + +def translate_private_test_cases(encoded_data: str) -> dict[str, str]: + decoded_data = base64.b64decode(encoded_data) + decompressed_data = zlib.decompress(decoded_data) + original_data = pickle.loads(decompressed_data) + return json.loads(original_data) + + +def extract_code(model_output: str) -> str: + outputlines = model_output.split("\n") + indexlines = [i for i, line in enumerate(outputlines) if "```" in line] + if len(indexlines) < 2: + return "" + return "\n".join(outputlines[indexlines[-2] + 1 : indexlines[-1]]) diff --git a/src/lighteval/tasks/extended/lcb/main.py 
b/src/lighteval/tasks/extended/lcb/main.py new file mode 100644 index 000000000..b3bcd3c93 --- /dev/null +++ b/src/lighteval/tasks/extended/lcb/main.py @@ -0,0 +1,131 @@ +# MIT License + +# Copyright (c) 2025 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +"""Usage: +lighteval vllm \ + "pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=float16,tensor_parallel_size=4,max_model_length=32768,gpu_memory_utilisation=0.8" \ + "extended|lcb:codegeneration|0|0" \ + --custom-tasks src/lighteval/tasks/extended/lcb/main.py +""" + +import json +from typing import Any + +import numpy as np +from aenum import extend_enum + +from lighteval.metrics.metrics import MetricCategory, Metrics, MetricUseCase, SampleLevelMetric +from lighteval.tasks.extended.lcb.codegen_metrics import ( + codegen_metrics, + extract_code, + translate_private_test_cases, +) +from lighteval.tasks.lighteval_task import Doc, LightevalTaskConfig + + +def prepare_prompt(line: dict[str, Any]) -> str: + query = "You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests.\n\n" + query += f"Question: {line['question_content']}\n\n" + if starter_code := line.get("starter_code", None): + query += "You will use the following starter code to write the solution to the problem and enclose your code within delimiters." + query += f"```python\n{starter_code}\n```\n\n" + else: + query += "Read the inputs from stdin solve the problem and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows." 
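+        # The model is asked to answer inside a ```python fence; `extract_code` later pulls the
+        # last such fenced block out of the generation.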
+ query += "```python\n# YOUR CODE HERE\n```\n\n" + return query + + +def lcb_codegeneration_prompt_fn(line, task_name: str = "lcb:codegeneration") -> Doc: + # For the prompt we need a more general function that can be used tweaked like in: + # https://github.com/LiveCodeBench/LiveCodeBench/blob/main/lcb_runner/prompts/code_generation.py + query = prepare_prompt(line) + # List of dicts of the form: [{"input": "6\nabc\nacb\nbac\nbca\ncab\ncba\n", "output": "YES\nYES\nYES\nNO\nNO\nYES\n", "testtype": "stdin"}] + public_test_cases = json.loads(line["public_test_cases"]) + private_test_cases = translate_private_test_cases(line["private_test_cases"]) + inputs = [test["input"] for test in public_test_cases + private_test_cases] + outputs = [test["output"] for test in public_test_cases + private_test_cases] + return Doc( + task_name=task_name, + query=query, + choices=[""], + gold_index=0, + specific={ + "inputs": inputs, + "outputs": outputs, + "fn_name": json.loads(line["metadata"]).get("func_name", None), + }, + ) + + +def codegen_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> float: + """Estimates the Pass@1 metric for the code generation task. + Extract the code from each prediction, Runs it for each sample and generations, + and computes the Pass@1 over the outputs. + """ + # Extract generated code snippets + generated_code_snippets = [[extract_code(pred) for pred in predictions]] # noqa: F841 + evaluation_sample = { # noqa: F841 + "inputs": formatted_doc.specific["inputs"], + "outputs": formatted_doc.specific["outputs"], + "fn_name": formatted_doc.specific["fn_name"], + } + # This is a list of lists because + evaluation_sample = [{"input_output": json.dumps(evaluation_sample)}] + + metrics, _ = codegen_metrics( + evaluation_sample, + generated_code_snippets, + k_list=[1], # Only run for Pass@1 + num_process_evaluate=8, + ) + return metrics["pass@1"] + + +lcb_codegen_metric = SampleLevelMetric( + metric_name="codegen_pass@1:16", # This is the way of informing the number of generations currently + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.REASONING, + higher_is_better=True, + sample_level_fn=codegen_metric, + corpus_level_fn=np.mean, +) + + +extend_enum(Metrics, "lcb_codegen_metric", lcb_codegen_metric) + + +task = LightevalTaskConfig( + name="lcb:codegeneration", + suite=["extended"], + prompt_function=lcb_codegeneration_prompt_fn, + hf_repo="livecodebench/code_generation_lite", + hf_subset="v4_v5", # https://github.com/LiveCodeBench/LiveCodeBench/tree/main?tab=readme-ov-file#dataset-versions + hf_avail_splits=["test"], + evaluation_splits=["test"], + generation_size=32768, + metric=[Metrics.lcb_codegen_metric], + stop_sequence=[], # no stop sequence, will use EOS token + trust_dataset=True, + version=0, +) + + +TASKS_TABLE = [task] diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index c187f653f..3420480b2 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -214,9 +214,8 @@ def __init__( # noqa: C901 metric_names = as_list(metric.metric_name) for metric_name in metric_names: - # If we do maj_at_ metrics, we need to use the correct number of samples - if "maj@" in metric_name: - self.num_samples.append(int(metric_name.replace("maj@", "").split("_")[0])) + # Update the number of samples to generate using the information in the metric name + self.num_samples.append(extract_num_samples(metric_name)) self.formatter = cfg.prompt_function @@ -657,3 +656,24 @@ def 
create_requests_from_tasks( # noqa: C901 requests[req_type].extend(reqs) return requests, docs + + +def extract_num_samples(metric_name: str) -> int: + """Gets the number of samples to generate from the metric name. + Assumes that any metric with @ in it's name depends on the number of samples. + + Args: + metric_name (str): The metric name in the task. + + Returns: + int: The number of samples to generate. + """ + if "@" in metric_name: + metric_name = metric_name.split("@")[-1] + if "_" in metric_name: + metric_name = metric_name.split("_")[0] + if ":" in metric_name: + return int(metric_name.split(":")[-1]) + else: + return int(metric_name) + return 1 diff --git a/tests/models/test_model_input.py b/tests/models/test_model_input.py new file mode 100644 index 000000000..7c06df445 --- /dev/null +++ b/tests/models/test_model_input.py @@ -0,0 +1,49 @@ +# MIT License + +# Copyright (c) 2025 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import pytest + +from lighteval.models.model_input import GenerationParameters + + +class TestGenerationParameters: + @pytest.mark.parametrize( + "model_args, expected", + [ + ( + "generation_parameters={temperature: 0.7,top_p: 0.95},pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=float16,data_parallel_size=4,max_model_length=32768,gpu_memory_utilisation=0.8", + {"temperature": 0.7, "top_p": 0.95}, + ), + ( + "pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=float16,data_parallel_size=4,generation_parameters={temperature: 0.7,top_p: 0.95},max_model_length=32768,gpu_memory_utilisation=0.8", + {"temperature": 0.7, "top_p": 0.95}, + ), + ( + "pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=float16,data_parallel_size=4,max_model_length=32768,gpu_memory_utilisation=0.8,generation_parameters={temperature: 0.7,top_p: 0.95}", + {"temperature": 0.7, "top_p": 0.95}, + ), + ], + ) + def test_extract_num_samples(self, model_args: str, expected): + gen = GenerationParameters.from_model_args(model_args) + for k, v in expected.items(): + assert getattr(gen, k) == v diff --git a/tests/tasks/test_lighteval_task.py b/tests/tasks/test_lighteval_task.py index 0edbb5761..8c1bcafe7 100644 --- a/tests/tasks/test_lighteval_task.py +++ b/tests/tasks/test_lighteval_task.py @@ -20,7 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig +import pytest + +from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig, extract_num_samples def dummy_prompt_function(item, task_name): @@ -59,3 +61,18 @@ def test_dataset_filter(): filtered_docs = task.eval_docs() assert len(filtered_docs) == 1 assert filtered_docs[0] == "hi" + + +@pytest.mark.parametrize( + "metric_name, expected", + [ + ("maj@1", 1), + ("pass@1:32_samples", 32), + ("pass@10:64_samples", 64), + ("codegen_pass@1:16", 16), + ("other_name@2", 2), + ("other_name", 1), + ], +) +def test_extract_num_samples(metric_name, expected): + assert extract_num_samples(metric_name) == expected