
Commit 6203618: transfer the codebase to group organization


56 files changed, +6189 -0 lines changed

.gitignore

+1
```
**/__pycache__
```

README.md

+90
# ESAE

## Installation

Please ensure that you have [Conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) installed on your system, as it is required for managing dependencies in this project.

To create and activate the Conda environment from the provided `environment.yml` file, please run the following commands:

```sh
conda env create -f environment.yml
conda activate esae
```

Before proceeding, please update the system paths specified in `source/__init__.py` to match your configuration. These paths are used for storing the datasets, model checkpoints, and other artifacts.

```python
from pathlib import Path

workspace = Path("/data/user_data/haok/esae")
workspace.mkdir(mode=0o770, parents=True, exist_ok=True)

import os

os.environ["HF_HOME"] = "/data/user_data/haok/huggingface"
```

## Overview

To train a Sparse Autoencoder (SAE), the first step is to download the dataset and compute the embeddings that will later be reconstructed. Initialize all datasets in this repository using the following command:

```sh
python3 -m source.dataset.msMarco
```
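
Once the datasets are initialized, document IDs can be consumed through the dataset classes exported from `source/dataset`. The sketch below is illustrative only: it assumes `MsMarcoDataset` implements the `Dataset` interface described later in this README and can be constructed without arguments, which may differ from the actual code.

```python
# Illustrative sketch, not code from the repository. Assumes MsMarcoDataset
# (exported by source/dataset/__init__.py) implements didIter and takes no
# constructor arguments; adjust to the real signature if it differs.
from source.dataset import MsMarcoDataset

dataset = MsMarcoDataset()
for didBatch in dataset.didIter(batchSize=256):  # 256 is an arbitrary batch size
    print(len(didBatch), didBatch[:3])  # batch size and a few document IDs
    break  # inspect only the first batch
```
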

### Running Experiments

To experiment with different hyperparameters or model configurations, refer to the experiments in the `source/model/archive` directory. Each file in this directory contains a specific model setup.

To create a new experiment, add a new file with your desired hyperparameters and configuration. Once it is ready, run the following command, replacing `{version}` with your file name:

```sh
python3 -m source.model.{version}
```

For example, if your new experiment file is `source/model/240825A.py`, you would run:

```sh
python3 -m source.model.240825A
```

Model checkpoints are automatically saved under `{workspace}/model/{version}/state/`, where `workspace` is the path specified in `source/__init__.py`. This makes it easy to manage and retrieve your experiment results.
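
For instance, the checkpoint directory for the example version `240825A` above can be assembled from the `workspace` path. This is a small sketch; the file names stored inside `state/` depend on the training code and are not listed here.

```python
# Sketch: build the checkpoint directory for one experiment version using the
# {workspace}/model/{version}/state/ layout described above.
from pathlib import Path
from source import workspace

version = "240825A"
stateDir = Path(workspace, "model", version, "state")
print(sorted(stateDir.glob("*")))  # list whatever checkpoint files exist
```
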

### Evaluating Performance

## Quality Assurance

### Standardized Interface

To ensure a clean and reusable codebase, this repository follows best practices by defining its interfaces in `source/interface.py`. All core components implement standardized interfaces that promote consistency and modularity. For instance, the `Dataset` class defines a blueprint that any dataset must follow by implementing the `didIter` method. This method enables clients to iterate over all document IDs in batches.

Here's an example:

```python
from abc import ABC, abstractmethod
from typing import Iterator, List

class Dataset(ABC):
    name: DatasetName  # identifier for the concrete dataset

    @abstractmethod
    def didIter(self, batchSize: int) -> Iterator[List[int]]:
        """
        Iterate over the document IDs.

        :param batchSize: The batch size for each iteration.
        :return: The iterator over the document IDs.
        """
        raise NotImplementedError
```
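
As an illustration only (not code from the repository), a new dataset would subclass `Dataset` and yield its document IDs in batches. In the self-contained sketch below, `DatasetName` is stood in by a plain string and `ToyDataset` uses hard-coded IDs; a real implementation would read its document IDs from disk or an index.

```python
# Hypothetical, self-contained sketch of implementing and consuming the
# interface above. DatasetName is stood in by str here; in the repository it
# is whatever type source/interface.py defines.
from abc import ABC, abstractmethod
from typing import Iterator, List

DatasetName = str  # stand-in for the repository's DatasetName type


class Dataset(ABC):
    name: DatasetName

    @abstractmethod
    def didIter(self, batchSize: int) -> Iterator[List[int]]:
        raise NotImplementedError


class ToyDataset(Dataset):
    name = "toy"  # placeholder identifier

    def didIter(self, batchSize: int) -> Iterator[List[int]]:
        docIds = list(range(10))  # stand-in for real document IDs
        for i in range(0, len(docIds), batchSize):
            yield docIds[i : i + batchSize]


for batch in ToyDataset().didIter(batchSize=4):
    print(batch)  # [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]
```
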

### Testing Locally

To ensure maintainability, this codebase is fully type-checked with mypy and thoroughly tested with pytest. As new components are integrated into the interface, please ensure that corresponding test cases are added. Place your test cases under the relevant directories to keep the test suite comprehensive and organized.

You can run the following commands to perform these checks:

```sh
mypy source
pytest source
```
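
For example, a test for the batch-iteration contract might look like the sketch below. The file placement and the use of `MsMarcoDataset` are assumptions; the constructor arguments and the datasets you test depend on the actual code.

```python
# Hypothetical test, e.g. placed next to the dataset it covers.
# Assumes MsMarcoDataset can be constructed without arguments and that its
# data has already been initialized (see the Overview section).
from source.dataset import MsMarcoDataset


def test_didIter_respects_batch_size() -> None:
    dataset = MsMarcoDataset()
    batch = next(dataset.didIter(batchSize=8))
    assert isinstance(batch, list)
    assert 0 < len(batch) <= 8
```
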

acquire.sh

+8
```sh
#!/usr/bin/bash

# Acquire a compute node with 32 CPUs, 96GB memory, and 1 A6000 GPU.
# Please run this script with tmux to avoid losing the session.
srun \
    --partition=long --time=07-00:00:00 \
    --cpus-per-task=32 --mem=96GB --gres=gpu:A6000:1 \
    --pty bash
```

environment.yml

+27
```yaml
name: esae

channels:
  - pytorch
  - nvidia
  - conda-forge

dependencies:
  - python=3.12
  - pytorch=2.4.0
  - pytorch-cuda=12.4
  - numpy=1.26.4
  - rich=13.7.1
  - pyarrow==16.1.0
  - mypy==1.10.0
  - aiofiles==22.1.0
  - aiohttp==3.9.5
  - transformers==4.44.2
  - pytest==7.4.4
  - pytest-asyncio==0.20.3
  - types-aiofiles==24.1.0.20240626
  - attrs==23.1.0
  - wandb==0.16.6
  - faiss-gpu==1.8.0
  - blobfile==3.0.0
  - treevizer=0.2.4
  - elasticsearch
```

mypy.ini

+11
```ini
[mypy]
exclude = source/model/archive/

[mypy-transformers.*]
ignore_missing_imports = True

[mypy-pyarrow.*]
ignore_missing_imports = True

[mypy-faiss.*]
ignore_missing_imports = True
```

pytest.ini

+3
```ini
[pytest]
filterwarnings =
    ignore::UserWarning
```

reval.sh

+12
```sh
echo "Reconstruct: 240919A"
python3 -m source.interpret.retrieval.reconstruct 240919A
echo "Reconstruct: 240919B"
python3 -m source.interpret.retrieval.reconstruct 240919B
echo "Reconstruct: 240919C"
python3 -m source.interpret.retrieval.reconstruct 240919C
echo "Reconstruct: 240919D"
python3 -m source.interpret.retrieval.reconstruct 240919D
echo "Reconstruct: 240919E"
python3 -m source.interpret.retrieval.reconstruct 240919E
echo "Reconstruct: 240919F"
python3 -m source.interpret.retrieval.reconstruct 240919F
```

source/__init__.py

+18
```python
from pathlib import Path

# Shared workspace for datasets, model checkpoints, and other artifacts.
workspace = Path("/data/group_data/cx_group/esae")
workspace.mkdir(mode=0o770, parents=True, exist_ok=True)

import os

# Keep the Hugging Face cache inside the workspace.
os.environ["HF_HOME"] = Path(workspace, "huggingface").as_posix()

from rich.console import Console

# Project-wide console; hide file paths and always show timestamps in logs.
console = Console(width=80)
console._log_render.show_path = False
console._log_render.omit_repeated_times = False

import warnings

warnings.filterwarnings("ignore")
```

source/dataset/__init__.py

+7
```python
from pathlib import Path
from source import workspace

workspace = Path(workspace, "dataset")
workspace.mkdir(mode=0o770, parents=True, exist_ok=True)

from source.dataset.msMarco import MsMarcoDataset
```
