diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md index 59e7054af3..b7719b9053 100644 --- a/docs/evaluation/code.md +++ b/docs/evaluation/code.md @@ -178,6 +178,88 @@ all you need to do is replace `openhands` with `swe_agent` in the command above. - Benchmark is defined in [`nemo_skills/dataset/livecodebench/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/livecodebench/__init__.py) - Original benchmark source is [here](https://github.com/LiveCodeBench/LiveCodeBench). +#### Data Preparation + +First, prepare the dataset by running the `ns prepare_data` command. The arguments below will generate `test_v6_2408_2505.jsonl`. + +``` +ns prepare_data livecodebench --release_version v6 --start_date 2024-08 --end_date 2025-05 +``` + +##### For Pypy3 Evaluation: +If you plan to evaluate using the Pypy3 interpreter, you must add the `--keep_all_columns` flag during data preparation. This will download a larger dataset (~1.9GB) containing the necessary test cases. So, we recommend downloading the dataset into a Slurm cluster location. + +``` +ns prepare_data livecodebench --release_version v6 --start_date 2024-08 --end_date 2025-05 --keep_all_columns --cluster= --data_dir= +``` + +#### Running the Evaluation + +Once the data is prepared, you can run the evaluation. Replace `<...>` placeholders with your cluster and directory paths. + +##### Standard Python Evaluation + +This command runs an evaluation of [OpenReasoning-Nemotron-32B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-32B) on a Slurm cluster. 
+ +``` +ns eval \ + --cluster= \ + --model=nvidia/OpenReasoning-Nemotron-32B \ + --server_type=vllm \ + --server_args="--async-scheduling" \ + --server_nodes=1 \ + --server_gpus=8 \ + --benchmarks=livecodebench \ + --split=test_v6_2408_2505 \ + --data_dir= \ + --output_dir= \ + --extra_eval_args="++eval_config.interpreter=python" \ + --with_sandbox \ + ++inference.temperature=0.6 \ + ++inference.top_p=0.95 \ + ++inference.tokens_to_generate=65536 +``` + +##### Pypy3 Evaluation + +To run with the Pypy3 interpreter, modify the `--extra_eval_args` flag as shown below. +``` +--extra_eval_args="++eval_config.interpreter=pypy3 ++eval_config.test_file=/livecodebench/test_v6_2408_2505.jsonl" +``` + +##### Verifying Results + +After all jobs are complete, you can check the results in `/eval-results/livecodebench/metrics.json`. You can also take a look at `/eval-results/livecodebench/summarized-results/main_*` They should look something like this: + +``` +-------------------------- livecodebench -------------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy +pass@1 | 454 | 15995 | 2188 | 71.15% + + +------------------------ livecodebench-easy ----------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy +pass@1 | 110 | 5338 | 1806 | 99.09% + + +------------------------ livecodebench-hard ----------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy +pass@1 | 203 | 23031 | 2188 | 46.31% + + +----------------------- livecodebench-medium ---------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy +pass@1 | 141 | 14178 | 1889 | 85.11% +``` + +##### Advanced: Averaging Multiple Runs + +Due to variance between runs, you can automatically repeat the evaluation and average the results. 
To run the evaluation 3 times, for example, set the `--benchmarks` flag as follows: + +``` +--benchmarks=livecodebench:3 +``` + ### livecodebench-pro - Benchmark is defined in [`nemo_skills/dataset/livecodebench-pro/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/livecodebench-pro/__init__.py) diff --git a/nemo_skills/dataset/livecodebench/prepare.py b/nemo_skills/dataset/livecodebench/prepare.py index 65c1463ee0..40c3210cd9 100644 --- a/nemo_skills/dataset/livecodebench/prepare.py +++ b/nemo_skills/dataset/livecodebench/prepare.py @@ -18,7 +18,7 @@ from datetime import datetime from pathlib import Path -from datasets import load_dataset +from datasets import Value, load_dataset from dateutil.relativedelta import relativedelta @@ -66,7 +66,7 @@ def parse_month_range(start_date, end_date): raise ValueError(str(e)) -def clean_data(dataset): +def clean_data(dataset, keep_all_columns=False): def map_fn(data): question = data["question_content"] + "\n\n" if data["starter_code"]: @@ -80,22 +80,26 @@ def map_fn(data): data["question"] = question.replace(" ", "\t") return data - remove_columns = [ - "question_title", - "contest_id", - "public_test_cases", - "private_test_cases", - "metadata", - "question_content", - "platform", - "question_id", - "starter_code", - ] + remove_columns = [] + if not keep_all_columns: + remove_columns = [ + "question_title", + "contest_id", + "metadata", + "question_content", + "platform", + "question_id", + "starter_code", + "public_test_cases", + "private_test_cases", + ] + dataset = dataset.cast_column("public_test_cases", Value("large_string")) + dataset = dataset.cast_column("private_test_cases", Value("large_string")) dataset = dataset.map(map_fn, remove_columns=remove_columns) return dataset -def prepare(start_date, end_date, release_version, output_dir): +def prepare(start_date, end_date, release_version, output_dir, keep_all_columns=False): start_date, end_date = parse_month_range(start_date, 
end_date) start_yymm = start_date.strftime("%y%m") end_yymm = end_date.strftime("%y%m") @@ -104,7 +108,7 @@ def prepare(start_date, end_date, release_version, output_dir): assert release_version in ["v1", "v2", "v3", "v4", "v5", "v6"] data = parse_data(release_version=f"release_{release_version}") - data = clean_data(data) + data = clean_data(data, keep_all_columns) print("Len of data: ", len(data)) print("Writing to file...") @@ -115,16 +119,10 @@ def prepare(start_date, end_date, release_version, output_dir): for problem in data: input_date = datetime.strptime(problem["contest_date"], "%Y-%m-%dT%H:%M:%S").date() if start_date <= input_date <= end_date: - json.dump( - { - "task_id": problem["task_id"], - "question": problem["question"], - "difficulty": problem["difficulty"], - "subset_for_metrics": problem["difficulty"], - "release_version": release_version, - }, - f, - ) + output_record = {**problem} + output_record["subset_for_metrics"] = problem["difficulty"] + output_record["release_version"] = release_version + json.dump(output_record, f) f.write("\n") @@ -135,7 +133,6 @@ def prepare(start_date, end_date, release_version, output_dir): ("v6", "2024-08", "2025-05"), # current default in lb ] - if __name__ == "__main__": # Write an argparse to a json file, read it in and parse it parser = argparse.ArgumentParser() @@ -143,6 +140,7 @@ def prepare(start_date, end_date, release_version, output_dir): parser.add_argument("--release_version", type=str, default="all") parser.add_argument("--start_date", type=str, default="all", help="End date in YYYY-MM format") parser.add_argument("--end_date", type=str, default="all", help="End date in YYYY-MM format") + parser.add_argument("--keep_all_columns", action="store_true", help="keep all columns in the output jsonl file") args = parser.parse_args() @@ -150,14 +148,14 @@ def prepare(start_date, end_date, release_version, output_dir): # Prepare all splits for release_version, start_date, end_date in DEFAULT_SPLITS: 
print(f"Processing data for {release_version} from {start_date} to {end_date}") - prepare(start_date, end_date, release_version, args.output_dir) + prepare(start_date, end_date, release_version, args.output_dir, args.keep_all_columns) else: if args.release_version == "all" or args.start_date == "all" or args.end_date == "all": raise ValueError( "If preparing a custom split, you must specify all " "--release_version, --start_date, and --end_date arguments." ) - prepare(args.start_date, args.end_date, args.release_version, args.output_dir) + prepare(args.start_date, args.end_date, args.release_version, args.output_dir, args.keep_all_columns) # test_v5_2408_2502.jsonl: 279 samples # test_v5_2410_2502.jsonl: 166 samples diff --git a/nemo_skills/evaluation/evaluator/__init__.py b/nemo_skills/evaluation/evaluator/__init__.py index 7bdbfda6ea..70b79fc924 100644 --- a/nemo_skills/evaluation/evaluator/__init__.py +++ b/nemo_skills/evaluation/evaluator/__init__.py @@ -21,12 +21,12 @@ eval_bigcodebench, eval_evalplus, eval_livebench_coding, - eval_livecodebench, eval_livecodebench_pro, ) from nemo_skills.evaluation.evaluator.ifbench import eval_ifbench from nemo_skills.evaluation.evaluator.ifeval import eval_if from nemo_skills.evaluation.evaluator.ioi import eval_ioi +from nemo_skills.evaluation.evaluator.livecodebench import eval_livecodebench from nemo_skills.evaluation.evaluator.math import ( Lean4ProofEvaluator, Lean4StatementEvaluator, diff --git a/nemo_skills/evaluation/evaluator/code.py b/nemo_skills/evaluation/evaluator/code.py index 0c748cf56f..c9aca7bb98 100644 --- a/nemo_skills/evaluation/evaluator/code.py +++ b/nemo_skills/evaluation/evaluator/code.py @@ -23,7 +23,8 @@ from omegaconf import OmegaConf -from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files +from nemo_skills.file_utils import unroll_files +from nemo_skills.utils import get_logger_name LOG = logging.getLogger(get_logger_name(__file__)) @@ -99,86 +100,6 @@ def 
install_from_git(git_url): print(f"Error during installation: {e}") -# TODO: use sandbox -@nested_dataclass(kw_only=True) -class LiveCodeBenchEvaluatorConfig: - language: str = "python" # "cpp" is another option now - test_file: str = None - - -def eval_livecodebench(cfg): - try: - from livecodebench.evaluate import evaluate - except ImportError: - LOG.info("Package 'livecodebench' not found. Attempting to install...") - # install_from_git("git+https://github.com/wasiahmad/livecodebench.git") - install_from_git("git+https://github.com/wasiahmad/livecodebench.git@f285640c20aaf18df1ee5917621a596af4630b5e") - try: - from livecodebench.evaluate import evaluate - except ImportError: - LOG.info("Failed to install 'livecodebench'. Please install it manually.") - raise - - eval_config = LiveCodeBenchEvaluatorConfig(_init_nested=True, **cfg.eval_config) - assert eval_config.language in ["python", "cpp"] - if eval_config.language == "cpp": - assert eval_config.test_file is not None - - release_version = None - for jsonl_file in unroll_files(cfg.input_files): - with open(jsonl_file) as f: - samples = [preprocess_code(json.loads(line), eval_config.language) for line in f] - for sample in samples: - sample["question_id"] = sample["task_id"] - sample["code_list"] = [sample["completion"]] - if release_version is None: - release_version = sample["release_version"] - if release_version != sample["release_version"]: - raise ValueError( - f"All samples should have the same release version, " - f"but got {release_version} and {sample['release_version']}" - ) - - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - - # https://github.com/wasiahmad/livecodebench/blob/main/livecodebench/evaluate.py#L10 - evaluate( - custom_output_file=jsonl_file, - release_version=f"release_{release_version}", - k_list=[1], - language=eval_config.language, - test_file=None if eval_config.language == "python" else eval_config.test_file, - 
def eval_livecodebench_pro(cfg):
    """Convert generation outputs into the LiveCodeBench-Pro grader format.

    For every input ``.jsonl`` file, rewrites each sample in place so that:
      - ``task_id`` is renamed to ``problem_id``,
      - ``completion`` (the code extracted by ``preprocess_code``) is renamed
        to ``text_response``,
      - ``response_meta`` is added as ``None`` (field expected by the grader).

    Args:
        cfg: evaluation config; only ``cfg.input_files`` (glob patterns of
            generation output files) is used.
    """
    for jsonl_file in unroll_files(cfg.input_files):
        # read with explicit utf-8 to match the encoding used when writing back;
        # relying on the platform default encoding can corrupt non-ASCII content
        with open(jsonl_file, "rt", encoding="utf-8") as f:
            samples = [preprocess_code(json.loads(line), "python") for line in f]
        for sample in samples:
            sample["problem_id"] = sample.pop("task_id")
            sample["text_response"] = sample.pop("completion")
            sample["response_meta"] = None

        # overwrite the input file with the converted records
        with open(jsonl_file, "wt", encoding="utf-8") as f:
            for sample in samples:
                f.write(json.dumps(sample) + "\n")
import asyncio
import json
import logging
import shlex
import shutil
import textwrap
from contextlib import asynccontextmanager
from dataclasses import field

from nemo_skills.code_execution.sandbox import get_sandbox
from nemo_skills.evaluation.evaluator.code import preprocess_code
from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files

LOG = logging.getLogger(get_logger_name(__file__))

# Pinned forks of the livecodebench evaluation harness. The "livecodebench"
# branch is used with CPython; the default branch is used when running the
# generated solutions under pypy3.
LIVECODEBENCH_PYTHON_GIT_URL = "git+https://github.com/wasiahmad/livecodebench.git@livecodebench"
LIVECODEBENCH_PYPY3_GIT_URL = "git+https://github.com/wasiahmad/livecodebench.git"


@nested_dataclass(kw_only=True)
class LiveCodeBenchEvaluatorConfig:
    # sandbox used to install and run the livecodebench harness
    sandbox: dict = field(default_factory=lambda: {"sandbox_type": "local"})
    language: str = "python"  # "cpp" is another option now
    # required for cpp; for pypy3 runs this points at the jsonl with test cases
    test_file: str = None
    interpreter: str = "python"  # use either "python" or "pypy3"
    timeout: int = 6  # per-sample execution timeout (seconds)
    num_processes: int = 12  # parallelism inside the harness


@asynccontextmanager
async def sandbox_context(config: dict):
    """Async context manager yielding a sandbox that is always closed on exit."""
    sandbox = get_sandbox(**config)
    try:
        yield sandbox
    finally:
        LOG.info("Closing sandbox...")
        await sandbox.close()


async def install_packages(eval_config: LiveCodeBenchEvaluatorConfig) -> bool:
    """Install the livecodebench harness in a temporary sandbox.

    Picks the pip command and git URL matching the configured interpreter.
    Returns True on success, False on failure.
    """
    async with sandbox_context(eval_config.sandbox) as sandbox:
        LOG.info("Installing livecodebench with %s...", eval_config.interpreter)
        if eval_config.interpreter == "python":
            pip_cmd, git_url = "pip", LIVECODEBENCH_PYTHON_GIT_URL
        else:
            pip_cmd, git_url = "pypy3 -m pip", LIVECODEBENCH_PYPY3_GIT_URL
        cmd = f"{pip_cmd} install {git_url}"

        result, _ = await sandbox.execute_code(cmd, language="shell", timeout=300)
        if result.get("process_status") != "completed":
            LOG.warning("Failed to install livecodebench: %s", result.get("stderr", "Unknown error"))
            return False

        LOG.info("Successfully installed livecodebench.")
        return True


async def eval_livecodebench_async(cfg):
    """Run LiveCodeBench grading for every generation file in ``cfg.input_files``.

    For each ``.jsonl`` file: wraps the extracted code into the harness format
    (``code_list``), invokes ``livecodebench.evaluate`` inside the sandbox with
    the configured interpreter, merges the per-task ``graded_list`` back into
    the file, and renames the harness results file so metrics are recomputed
    on reruns.

    Raises:
        ValueError: on an invalid language/interpreter combination, or when
            samples in one file mix release versions.
        RuntimeError: when the harness cannot be installed in the sandbox.
    """
    eval_config = LiveCodeBenchEvaluatorConfig(_init_nested=True, **cfg.eval_config)

    # validate configuration up front so a typo fails loudly, not mid-run
    if eval_config.language not in ("python", "cpp"):
        raise ValueError("language must be 'python' or 'cpp'.")
    if eval_config.language == "python" and eval_config.interpreter not in ("python", "pypy3"):
        raise ValueError("Python interpreter must be 'python' or 'pypy3'.")
    if eval_config.language == "cpp" and eval_config.test_file is None:
        raise ValueError("C++ evaluation requires a test_file.")

    if not await install_packages(eval_config):
        # fail loudly: silently skipping evaluation would surface later only as
        # missing metrics with no indication of the root cause
        raise RuntimeError("livecodebench installation failed, see logs above.")

    async with sandbox_context(eval_config.sandbox) as sandbox:
        for jsonl_file in unroll_files(cfg.input_files):
            LOG.info("Processing file: %s", jsonl_file)

            with open(jsonl_file, encoding="utf-8") as f_in:
                samples = [preprocess_code(json.loads(line), eval_config.language) for line in f_in]

            # the harness is invoked once per file with a single release
            # version, so mixed versions in one file cannot be graded correctly
            versions = {s["release_version"] for s in samples}
            if len(versions) > 1:
                raise ValueError(f"All samples should have the same release version. Found: {versions}")
            release_version = versions.pop()

            for s in samples:
                s["code_list"] = [s["completion"]]

            with open(jsonl_file, "w", encoding="utf-8") as f_out:
                f_out.writelines(json.dumps(sample) + "\n" for sample in samples)

            # !r produces valid Python literals even when paths contain quotes
            # or backslashes; bare '{...}' interpolation would generate broken
            # code for such paths. None renders as None, so test_file needs no
            # special-casing.
            eval_code = textwrap.dedent(f"""
                from livecodebench.evaluate import evaluate
                evaluate(
                    custom_output_file={jsonl_file!r},
                    release_version={f"release_{release_version}"!r},
                    test_file={eval_config.test_file!r},
                    k_list=[1],
                    language={eval_config.language!r},
                    num_process_evaluate={eval_config.num_processes},
                    timeout={eval_config.timeout},
                )
            """)

            cmd = f"{eval_config.interpreter} -c {shlex.quote(eval_code)}"
            output, _ = await sandbox.execute_code(
                cmd,
                language="shell",
                # generous budget: per-sample timeout plus startup slack
                timeout=eval_config.timeout * len(samples) + 60,
                max_output_characters=100_000,
            )

            if output.get("process_status") != "completed":
                LOG.error("Evaluation failed for %s. Stderr: %s", jsonl_file, output.get("stderr"))
                continue

            results_file = jsonl_file[:-6] + "_eval_results.json"
            with open(results_file, "rt", encoding="utf-8") as fin:
                eval_grades = json.load(fin)

            with open(jsonl_file, "wt", encoding="utf-8") as f_out:
                for s in samples:
                    s["graded_list"] = eval_grades["eval"][s["task_id"]]["graded_list"]
                    f_out.write(json.dumps(s) + "\n")

            # move the harness results aside so a rerun recomputes metrics
            shutil.move(results_file, jsonl_file[:-6] + "_eval_results-saved.json")
            LOG.info("Finished processing %s, results saved.", jsonl_file)


def eval_livecodebench(cfg):
    """Synchronous wrapper to run the async evaluation."""
    asyncio.run(eval_livecodebench_async(cfg))