diff --git a/data/tabular/mona/example_processing_and_templates.ipynb b/data/tabular/mona/example_processing_and_templates.ipynb
index 5f12a6f7f..786a90365 100644
--- a/data/tabular/mona/example_processing_and_templates.ipynb
+++ b/data/tabular/mona/example_processing_and_templates.ipynb
@@ -20,7 +20,6 @@
     "from tqdm import tqdm\n",
     "\n",
     "# import datasets\n",
-    "import rdkit\n",
     "import rdkit.Chem as Chem\n",
     "import rdkit.RDLogger as RDLogger"
    ]
@@ -1444,7 +1443,7 @@
     "        k = md[\"name\"]\n",
     "        v = md.get(\"value\", np.nan)\n",
     "        df_row[\"md_\" + transform_key(k)] = v\n",
-    "        if not (v is np.nan):\n",
+    "        if v is not np.nan:\n",
     "            md_keys.append(k)\n",
     "    md_key_counter.update(md_keys)\n",
     "    compounds = entry.get(\"compound\", [])\n",
diff --git a/data/tabular/orbnet_denali/develop_transform.ipynb b/data/tabular/orbnet_denali/develop_transform.ipynb
index 039c60f89..5e7f1dab6 100644
--- a/data/tabular/orbnet_denali/develop_transform.ipynb
+++ b/data/tabular/orbnet_denali/develop_transform.ipynb
@@ -25,11 +25,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from pathlib import Path\n",
     "from rdkit import Chem\n",
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "import os\n",
     "import pandas as pd\n",
     "from glob import glob"
    ]
@@ -474,7 +470,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from rdkit.Chem import rdDetermineBonds\n",
     "from chemnlp.utils import xyz_to_mol"
    ]
   },
diff --git a/experiments/ablations/continued_pretrain.py b/experiments/ablations/continued_pretrain.py
index 730453b95..0d5fd0577 100644
--- a/experiments/ablations/continued_pretrain.py
+++ b/experiments/ablations/continued_pretrain.py
@@ -57,7 +57,13 @@ def load_model(


 def train(
-    model, tokenizer, dataset, run_name: str, batch_size: int = 64, max_seq_length=2048, eval_dataset=None
+    model,
+    tokenizer,
+    dataset,
+    run_name: str,
+    batch_size: int = 64,
+    max_seq_length=2048,
+    eval_dataset=None,
 ):
     wandb.init(project="chemnlp-ablations", name=run_name)
     trainer = UnslothTrainer(
@@ -83,8 +89,8 @@
         lr_scheduler_type="linear",
         seed=3407,
         output_dir=f"outputs_{run_name}",
-        eval_strategy = 'steps' if eval_dataset is not None else 'no',
-        eval_steps = 10_000 if eval_dataset is not None else None
+        eval_strategy="steps" if eval_dataset is not None else "no",
+        eval_steps=10_000 if eval_dataset is not None else None,
     ),
 )
@@ -138,9 +144,18 @@ def run(
     )
     dataset = create_dataset(tokenizer, data_files)
-    eval_dataset = create_dataset(tokenizer, eval_data_files) if eval_data_files else None
+    eval_dataset = (
+        create_dataset(tokenizer, eval_data_files) if eval_data_files else None
+    )

-    train(model, tokenizer, dataset, run_name, batch_size=batch_size, eval_dataset=eval_dataset)
+    train(
+        model,
+        tokenizer,
+        dataset,
+        run_name,
+        batch_size=batch_size,
+        eval_dataset=eval_dataset,
+    )


 if __name__ == "__main__":
diff --git a/experiments/configs/data_configs/hf_data.yml b/experiments/configs/data_configs/hf_data.yml
index c3fec721e..64d71ab4e 100644
--- a/experiments/configs/data_configs/hf_data.yml
+++ b/experiments/configs/data_configs/hf_data.yml
@@ -1,7 +1,7 @@
 model_name: "EleutherAI/pythia-1b"
 context_length: 2048
 dataset_name: "EleutherAI/pile"
-dataset_args: {"name": "pubmed", "split": "train"}
+dataset_args: { "name": "pubmed", "split": "train" }
 batch_size: 1
 string_key: "text"
 save_path: "/fsx/proj-chemnlp/data/example_tokenised"