Skip to content

Commit

Permalink
[pre-commit.ci] auto fixes from pre-commit.com hooks
Browse files: browse the repository at this point in the history
for more information, see https://pre-commit.ci
  • Loading branch information
pre-commit-ci[bot] committed Aug 19, 2024
1 parent d14ac1e commit e61ec95
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 13 deletions.
3 changes: 1 addition & 2 deletions data/tabular/mona/example_processing_and_templates.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"from tqdm import tqdm\n",
"\n",
"# import datasets\n",
"import rdkit\n",
"import rdkit.Chem as Chem\n",
"import rdkit.RDLogger as RDLogger"
]
Expand Down Expand Up @@ -1444,7 +1443,7 @@
" k = md[\"name\"]\n",
" v = md.get(\"value\", np.nan)\n",
" df_row[\"md_\" + transform_key(k)] = v\n",
" if not (v is np.nan):\n",
" if v is not np.nan:\n",
" md_keys.append(k)\n",
" md_key_counter.update(md_keys)\n",
" compounds = entry.get(\"compound\", [])\n",
Expand Down
5 changes: 0 additions & 5 deletions data/tabular/orbnet_denali/develop_transform.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,7 @@
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"from rdkit import Chem\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import os\n",
"import pandas as pd\n",
"from glob import glob"
]
Expand Down Expand Up @@ -474,7 +470,6 @@
"metadata": {},
"outputs": [],
"source": [
"from rdkit.Chem import rdDetermineBonds\n",
"from chemnlp.utils import xyz_to_mol"
]
},
Expand Down
25 changes: 20 additions & 5 deletions experiments/ablations/continued_pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,13 @@ def load_model(


def train(
model, tokenizer, dataset, run_name: str, batch_size: int = 64, max_seq_length=2048, eval_dataset=None
model,
tokenizer,
dataset,
run_name: str,
batch_size: int = 64,
max_seq_length=2048,
eval_dataset=None,
):
wandb.init(project="chemnlp-ablations", name=run_name)
trainer = UnslothTrainer(
Expand All @@ -83,8 +89,8 @@ def train(
lr_scheduler_type="linear",
seed=3407,
output_dir=f"outputs_{run_name}",
eval_strategy = 'steps' if eval_dataset is not None else 'no',
eval_steps = 10_000 if eval_dataset is not None else None
eval_strategy="steps" if eval_dataset is not None else "no",
eval_steps=10_000 if eval_dataset is not None else None,
),
)

Expand Down Expand Up @@ -138,9 +144,18 @@ def run(
)

dataset = create_dataset(tokenizer, data_files)
eval_dataset = create_dataset(tokenizer, eval_data_files) if eval_data_files else None
eval_dataset = (
create_dataset(tokenizer, eval_data_files) if eval_data_files else None
)

train(model, tokenizer, dataset, run_name, batch_size=batch_size, eval_dataset=eval_dataset)
train(
model,
tokenizer,
dataset,
run_name,
batch_size=batch_size,
eval_dataset=eval_dataset,
)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion experiments/configs/data_configs/hf_data.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
model_name: "EleutherAI/pythia-1b"
context_length: 2048
dataset_name: "EleutherAI/pile"
dataset_args: {"name": "pubmed", "split": "train"}
dataset_args: { "name": "pubmed", "split": "train" }
batch_size: 1
string_key: "text"
save_path: "/fsx/proj-chemnlp/data/example_tokenised"

0 comments on commit e61ec95

Please sign in to comment.