From eb8b8b78b10a769051c2bdab53f0cc196d54d43d Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Mon, 1 Jul 2024 16:29:50 -0700 Subject: [PATCH 01/19] Stricter check for query planning. (#107) * Stricter query planning checks with newer versions of dask Signed-off-by: Ayush Dattagupta * Add checks to tests/__init__ Signed-off-by: Ayush Dattagupta * Check sys.modules to ensure dask-expr is not enabled Signed-off-by: Ayush Dattagupta * Search for "dask_expr" in sys modules Co-authored-by: Richard (Rick) Zamora Signed-off-by: Ayush Dattagupta * use dask_expr instead of dask-expr Signed-off-by: Ayush Dattagupta --------- Signed-off-by: Ayush Dattagupta Co-authored-by: Richard (Rick) Zamora --- examples/fuzzy_deduplication.py | 3 ++- examples/slurm/start-slurm.sh | 1 + nemo_curator/__init__.py | 15 +++++++++++---- tests/__init__.py | 17 ++++++++++++----- 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/examples/fuzzy_deduplication.py b/examples/fuzzy_deduplication.py index 93dc869da..88fbb0200 100644 --- a/examples/fuzzy_deduplication.py +++ b/examples/fuzzy_deduplication.py @@ -16,7 +16,6 @@ import time import dask -from dask import dataframe as dd from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig from nemo_curator.datasets import DocumentDataset @@ -49,6 +48,8 @@ def main(args): t0 = time.time() if filetype == "parquet": + from dask import dataframe as dd + input_dataset = DocumentDataset( dd.read_parquet( dataset_dir, diff --git a/examples/slurm/start-slurm.sh b/examples/slurm/start-slurm.sh index 02c211f6a..ab4074657 100644 --- a/examples/slurm/start-slurm.sh +++ b/examples/slurm/start-slurm.sh @@ -67,6 +67,7 @@ export CUDF_SPILL="1" export RMM_SCHEDULER_POOL_SIZE="1GB" export RMM_WORKER_POOL_SIZE="72GiB" export LIBCUDF_CUFILE_POLICY=OFF +export DASK_DATAFRAME__QUERY_PLANNING=False # ================================================================= diff --git a/nemo_curator/__init__.py b/nemo_curator/__init__.py index c9e79ff74..b440156e1 100644 --- a/nemo_curator/__init__.py +++ b/nemo_curator/__init__.py @@ -12,15 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys + import dask # Disable query planning if possible # https://github.com/NVIDIA/NeMo-Curator/issues/73 -if dask.config.get("dataframe.query-planning") is True: +if dask.config.get("dataframe.query-planning") is True or "dask_expr" in sys.modules: raise NotImplementedError( - "NeMo Curator does not support query planning yet. " - "Please disable query planning before importing " - "`nemo_curator`, `dask.dataframe` or `dask_cudf`." + """ + NeMo Curator does not support query planning yet. + Please disable query planning before importing + `dask.dataframe` or `dask_cudf`. This can be done via: + `export DASK_DATAFRAME__QUERY_PLANNING=False`, or + importing `dask.dataframe/dask_cudf` after importing + `nemo_curator`. + """ ) else: dask.config.set({"dataframe.query-planning": False}) diff --git a/tests/__init__.py b/tests/__init__.py index 1950868ef..ec57fdca9 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -12,15 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import sys + import dask -# Disable query planning before any tests are loaded +# Disable query planning if possible # https://github.com/NVIDIA/NeMo-Curator/issues/73 -if dask.config.get("dataframe.query-planning") is True: +if dask.config.get("dataframe.query-planning") is True or "dask_expr" in sys.modules: raise NotImplementedError( - "NeMo Curator does not support query planning yet. " - "Please disable query planning before importing " - "`nemo_curator`, `dask.dataframe` or `dask_cudf`." + """ + NeMo Curator does not support query planning yet. + Please disable query planning before importing + `dask.dataframe` or `dask_cudf`. This can be done via: + `export DASK_DATAFRAME__QUERY_PLANNING=False`, or + importing `dask.dataframe/dask_cudf` after importing + `nemo_curator`. + """ ) else: dask.config.set({"dataframe.query-planning": False}) From 53626fb00fa0102d93f6935c8ff256c2467b02b2 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Tue, 2 Jul 2024 14:01:50 -0700 Subject: [PATCH 02/19] remove hardcoded labels Signed-off-by: Sarah Yurick --- .../distributeddataclassification.rst | 11 ---- examples/domain_classifier_example.py | 30 --------- examples/quality_classifier_example.py | 2 - .../modules/distributed_data_classifier.py | 62 ++++++++++++++----- .../scripts/domain_classifier_inference.py | 33 +--------- .../scripts/quality_classifier_inference.py | 23 +------ nemo_curator/utils/script_utils.py | 4 +- 7 files changed, 51 insertions(+), 114 deletions(-) diff --git a/docs/user-guide/distributeddataclassification.rst b/docs/user-guide/distributeddataclassification.rst index e35117e13..440812c82 100644 --- a/docs/user-guide/distributeddataclassification.rst +++ b/docs/user-guide/distributeddataclassification.rst @@ -39,16 +39,6 @@ Let's see how ``DomainClassifier`` works in a small excerpt taken from ``example .. 
code-block:: python - labels = [ - "Adult", - "Arts_and_Entertainment", - "Autos_and_Vehicles", - ..., - "Shopping", - "Sports", - "Travel_and_Transportation", - ] - model_path = "pytorch_model_file.pth" files = get_all_files_paths_under("books_dataset/") @@ -56,7 +46,6 @@ Let's see how ``DomainClassifier`` works in a small excerpt taken from ``example domain_classifier = DomainClassifier( model_path=model_path, - labels=labels, filter_by=["Games", "Sports"], ) result_dataset = domain_classifier(dataset=input_dataset) diff --git a/examples/domain_classifier_example.py b/examples/domain_classifier_example.py index d4709822d..fe559a31d 100644 --- a/examples/domain_classifier_example.py +++ b/examples/domain_classifier_example.py @@ -25,35 +25,6 @@ def main(args): global_st = time.time() - labels = [ - "Adult", - "Arts_and_Entertainment", - "Autos_and_Vehicles", - "Beauty_and_Fitness", - "Books_and_Literature", - "Business_and_Industrial", - "Computers_and_Electronics", - "Finance", - "Food_and_Drink", - "Games", - "Health", - "Hobbies_and_Leisure", - "Home_and_Garden", - "Internet_and_Telecom", - "Jobs_and_Education", - "Law_and_Government", - "News", - "Online_Communities", - "People_and_Society", - "Pets_and_Animals", - "Real_Estate", - "Science", - "Sensitive_Subjects", - "Shopping", - "Sports", - "Travel_and_Transportation", - ] - model_path = "/path/to/pytorch_model_file.pth" # Input can be a string or list @@ -68,7 +39,6 @@ def main(args): domain_classifier = DomainClassifier( model_path=model_path, - labels=labels, filter_by=["Games", "Sports"], ) result_dataset = domain_classifier(dataset=input_dataset) diff --git a/examples/quality_classifier_example.py b/examples/quality_classifier_example.py index 277200c05..c7f03b112 100644 --- a/examples/quality_classifier_example.py +++ b/examples/quality_classifier_example.py @@ -25,7 +25,6 @@ def main(args): global_st = time.time() - labels = ["High", "Medium", "Low"] model_path = "/path/to/pytorch_model_file.pth" # Input can be a string or list @@ -40,7 +39,6 @@ def main(args): quality_classifier = QualityClassifier( model_path=model_path, - labels=labels, filter_by=["High", "Medium"], ) result_dataset = quality_classifier(dataset=input_dataset) diff --git a/nemo_curator/modules/distributed_data_classifier.py b/nemo_curator/modules/distributed_data_classifier.py index 6bb975eec..9dc127b92 100644 --- a/nemo_curator/modules/distributed_data_classifier.py +++ b/nemo_curator/modules/distributed_data_classifier.py @@ -149,6 +149,9 @@ def _filter_documents( raise TypeError("filter_by must be a string or list type") + def get_labels(self): + return self.labels + def _run_classifier_helper( df: "dask_cudf.DataFrame", @@ -271,34 +274,59 @@ class DomainClassifier(DistributedDataClassifier): def __init__( self, model_path, - labels, filter_by=None, batch_size=256, - out_dim=None, pred_column="domain_pred", prob_column=None, max_chars=2000, device_type="cuda", autocast=True, ): - if out_dim is None: - out_dim = len(labels) + self.labels = [ + "Adult", + "Arts_and_Entertainment", + "Autos_and_Vehicles", + "Beauty_and_Fitness", + "Books_and_Literature", + "Business_and_Industrial", + "Computers_and_Electronics", + "Finance", + "Food_and_Drink", + "Games", + "Health", + "Hobbies_and_Leisure", + "Home_and_Garden", + "Internet_and_Telecom", + "Jobs_and_Education", + "Law_and_Government", + "News", + "Online_Communities", + "People_and_Society", + "Pets_and_Animals", + "Real_Estate", + "Science", + "Sensitive_Subjects", + "Shopping", + "Sports", + 
"Travel_and_Transportation", + ] + self.out_dim = len(self.labels) self.prob_column = prob_column model = DomainModel( config=DomainModelConfig, - out_dim=out_dim, + out_dim=self.out_dim, model_path=model_path, autocast=autocast, ) super().__init__( model=model, - labels=labels, + labels=self.labels, filter_by=filter_by, batch_size=batch_size, - out_dim=out_dim, + out_dim=self.out_dim, pred_column=pred_column, max_chars=max_chars, device_type=device_type, @@ -324,37 +352,39 @@ class QualityClassifier(DistributedDataClassifier): def __init__( self, model_path, - labels, + num_labels=3, filter_by=None, batch_size=256, - out_dim=None, pred_column="quality_pred", prob_column="quality_prob", max_chars=6000, device_type="cuda", autocast=True, ): - if len(labels) == 2: - out_dim = 1 # Binary classification + if num_labels == 3: + self.labels = ["High", "Medium", "Low"] + self.out_dim = num_labels # Multiclass classification + elif num_labels == 2: + self.labels = ["Medium_High", "Low"] + self.out_dim = 1 # Binary classification else: - if out_dim is None: - out_dim = len(labels) # Multiclass classification + raise ValueError("num_labels must be 2 or 3") self.prob_column = prob_column model = QualityModel( config=QualityModelConfig, - out_dim=out_dim, + out_dim=self.out_dim, model_path=model_path, autocast=autocast, ) super().__init__( model=model, - labels=labels, + labels=self.labels, filter_by=filter_by, batch_size=batch_size, - out_dim=out_dim, + out_dim=self.out_dim, pred_column=pred_column, max_chars=max_chars, device_type=device_type, diff --git a/nemo_curator/scripts/domain_classifier_inference.py b/nemo_curator/scripts/domain_classifier_inference.py index b3a31f1d4..f58723864 100644 --- a/nemo_curator/scripts/domain_classifier_inference.py +++ b/nemo_curator/scripts/domain_classifier_inference.py @@ -29,35 +29,6 @@ def main(): - labels = [ - "Adult", - "Arts_and_Entertainment", - "Autos_and_Vehicles", - "Beauty_and_Fitness", - "Books_and_Literature", - "Business_and_Industrial", - "Computers_and_Electronics", - "Finance", - "Food_and_Drink", - "Games", - "Health", - "Hobbies_and_Leisure", - "Home_and_Garden", - "Internet_and_Telecom", - "Jobs_and_Education", - "Law_and_Government", - "News", - "Online_Communities", - "People_and_Society", - "Pets_and_Animals", - "Real_Estate", - "Science", - "Sensitive_Subjects", - "Shopping", - "Sports", - "Travel_and_Transportation", - ] - args = ArgumentHelper.parse_distributed_classifier_args().parse_args() print(f"Arguments parsed = {args}", flush=True) max_chars = 2000 @@ -83,11 +54,9 @@ def main(): add_filename = True domain_classifier = DomainClassifier( - model_path=args.model_path, - labels=labels, + model_path=args.pretrained_model_name_or_path, max_chars=max_chars, batch_size=args.batch_size, - out_dim=len(labels), autocast=args.autocast, ) diff --git a/nemo_curator/scripts/quality_classifier_inference.py b/nemo_curator/scripts/quality_classifier_inference.py index 7ac0b6994..173a8f924 100644 --- a/nemo_curator/scripts/quality_classifier_inference.py +++ b/nemo_curator/scripts/quality_classifier_inference.py @@ -26,28 +26,10 @@ warnings.filterwarnings("ignore") -def get_labels(num_labels): - """ - This function returns a list of quality labels, depending on how many labels the user expects. - - Args: - num_labels: An integer representing the number of possible classification labels. - Returns: - A list of label names. 
- - """ - if num_labels == 3: - labels = ["High", "Medium", "Low"] - elif num_labels == 2: - labels = ["Medium_High", "Low"] - return labels - - def main(): parser = ArgumentHelper.parse_distributed_classifier_args() parser.add_argument("--num-labels", type=int, default=3) args = parser.parse_args() - labels = get_labels(args.num_labels) print(f"Arguments parsed = {args}", flush=True) max_chars = 6000 @@ -72,12 +54,11 @@ def main(): add_filename = True classifier = QualityClassifier( - model_path=args.model_path, + model_path=args.pretrained_model_name_or_path, + num_labels=args.num_labels, max_chars=max_chars, - labels=labels, batch_size=args.batch_size, autocast=args.autocast, - out_dim=len(labels), ) for file_batch_id, i in enumerate(range(0, len(input_files), files_per_run)): diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py index 32582dafc..2f645401a 100644 --- a/nemo_curator/utils/script_utils.py +++ b/nemo_curator/utils/script_utils.py @@ -395,9 +395,9 @@ def parse_distributed_classifier_args( required=True, ) parser.add_argument( - "--model-path", + "--pretrained-model-name-or-path", type=str, - help="The path to the model file", + help="HuggingFace name of model, or local path to downloaded model file", required=True, ) parser.add_argument( From 8d7aa18b13bf405f0d0134f7e2bb4061cc3740b4 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Tue, 2 Jul 2024 14:35:30 -0700 Subject: [PATCH 03/19] update doc Signed-off-by: Sarah Yurick --- docs/user-guide/distributeddataclassification.rst | 12 ++++++------ examples/quality_classifier_example.py | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/user-guide/distributeddataclassification.rst b/docs/user-guide/distributeddataclassification.rst index 440812c82..39da06091 100644 --- a/docs/user-guide/distributeddataclassification.rst +++ b/docs/user-guide/distributeddataclassification.rst @@ -32,26 +32,26 @@ classification helps mitigate biases and inaccuracies that may arise from poorly Usage ----------------------------------------- -NeMo Curator provides a base class ``DistributedDataClassifier`` that can be extended to fit your specfic model. +NeMo Curator provides a base class ``DistributedDataClassifier`` that can be extended to fit your specific model. The only requirement is that the model can fit on a single GPU. We have also provided two subclasses that focus on domain and quality classification. Let's see how ``DomainClassifier`` works in a small excerpt taken from ``examples/domain_classifier_example.py``: .. code-block:: python - model_path = "pytorch_model_file.pth" + model = "nvidia/domain-classifier" files = get_all_files_paths_under("books_dataset/") input_dataset = DocumentDataset.read_json(files, backend="cudf", add_filename=True) - domain_classifier = DomainClassifier( - model_path=model_path, - filter_by=["Games", "Sports"], - ) + domain_classifier = DomainClassifier(model, filter_by=["Games", "Sports"]) result_dataset = domain_classifier(dataset=input_dataset) result_dataset.to_json("games_and_sports/", write_to_filename=True) +In the above excerpt, the domain classifier is obtained directly from [HuggingFace](https://huggingface.co/nvidia/domain-classifier). +Alternatively, the user may download the model and set `model = "/path/to/model.pth"`. + This module functions very similarly to the ``ScoreFilter`` module. The key differences is that it operates on the GPU instead of the CPU. Therefore, the Dask cluster must be started as a GPU one. 
diff --git a/examples/quality_classifier_example.py b/examples/quality_classifier_example.py index c7f03b112..6d13f9df9 100644 --- a/examples/quality_classifier_example.py +++ b/examples/quality_classifier_example.py @@ -13,7 +13,6 @@ # limitations under the License. import argparse -import os import time from nemo_curator import QualityClassifier From 28df12028458f02529c9ddcc6b8d7f223e7ed803 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 19 Jul 2024 13:52:07 -0700 Subject: [PATCH 04/19] domain huggingface Signed-off-by: Sarah Yurick --- .../distributeddataclassification.rst | 7 +- examples/domain_classifier_example.py | 8 +- .../modules/distributed_data_classifier.py | 116 ++++---- .../scripts/domain_classifier_inference.py | 1 - nemo_curator/utils/script_utils.py | 4 +- .../distributed_data_classification.ipynb | 247 +++++++----------- 6 files changed, 147 insertions(+), 236 deletions(-) diff --git a/docs/user-guide/distributeddataclassification.rst b/docs/user-guide/distributeddataclassification.rst index 39da06091..8c9f6c006 100644 --- a/docs/user-guide/distributeddataclassification.rst +++ b/docs/user-guide/distributeddataclassification.rst @@ -39,18 +39,15 @@ Let's see how ``DomainClassifier`` works in a small excerpt taken from ``example .. code-block:: python - model = "nvidia/domain-classifier" - files = get_all_files_paths_under("books_dataset/") input_dataset = DocumentDataset.read_json(files, backend="cudf", add_filename=True) - domain_classifier = DomainClassifier(model, filter_by=["Games", "Sports"]) + domain_classifier = DomainClassifier(filter_by=["Games", "Sports"]) result_dataset = domain_classifier(dataset=input_dataset) result_dataset.to_json("games_and_sports/", write_to_filename=True) -In the above excerpt, the domain classifier is obtained directly from [HuggingFace](https://huggingface.co/nvidia/domain-classifier). -Alternatively, the user may download the model and set `model = "/path/to/model.pth"`. +In the above excerpt, the domain classifier is obtained directly from `HuggingFace `_. This module functions very similarly to the ``ScoreFilter`` module. The key differences is that it operates on the GPU instead of the CPU. diff --git a/examples/domain_classifier_example.py b/examples/domain_classifier_example.py index fe559a31d..78d4612c8 100644 --- a/examples/domain_classifier_example.py +++ b/examples/domain_classifier_example.py @@ -13,7 +13,6 @@ # limitations under the License. 
import argparse -import os import time from nemo_curator import DomainClassifier @@ -25,8 +24,6 @@ def main(args): global_st = time.time() - model_path = "/path/to/pytorch_model_file.pth" - # Input can be a string or list input_file_path = "/path/to/data" output_file_path = "./" @@ -37,10 +34,7 @@ def main(args): input_file_path, backend="cudf", add_filename=True ) - domain_classifier = DomainClassifier( - model_path=model_path, - filter_by=["Games", "Sports"], - ) + domain_classifier = DomainClassifier(filter_by=["Games", "Sports"]) result_dataset = domain_classifier(dataset=input_dataset) result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=True) diff --git a/nemo_curator/modules/distributed_data_classifier.py b/nemo_curator/modules/distributed_data_classifier.py index 9dc127b92..1fd336118 100644 --- a/nemo_curator/modules/distributed_data_classifier.py +++ b/nemo_curator/modules/distributed_data_classifier.py @@ -22,14 +22,16 @@ import torch.nn as nn from crossfit import op from crossfit.backend.torch.hf.model import HFModel -from packaging import version -from transformers import AutoConfig, AutoModel -from transformers import __version__ as TRANSFORMERS_VERSION +from huggingface_hub import PyTorchModelHubMixin +from transformers import AutoConfig, AutoTokenizer, AutoModel from transformers.models.deberta_v2 import DebertaV2TokenizerFast from nemo_curator.datasets import DocumentDataset +DOMAIN_IDENTIFIER = "nvidia/domain-classifier" + + @dataclass class DomainModelConfig: model = "microsoft/deberta-v3-base" @@ -44,22 +46,23 @@ class QualityModelConfig: max_len = 512 -class CustomModel(nn.Module): +# TODO: Remove this class after Quality Model is uploaded to HuggingFace +class NCCustomModel(nn.Module): def __init__( self, config, out_dim, config_path=None, pretrained=False, autocast=False ): super().__init__() self.config = config if config_path is None: - self.config = AutoConfig.from_pretrained( - config.model, output_hidden_states=True - ) + self.config = AutoConfig.from_pretrained(config.model, output_hidden_states=True) else: self.config = torch.load(config_path) + if pretrained: self.model = AutoModel.from_pretrained(config.model, config=self.config) else: self.model = AutoModel(self.config) + self.fc_dropout = nn.Dropout(config.fc_dropout) self.fc = nn.Linear(self.config.hidden_size, out_dim) self._init_weights(self.fc) @@ -97,6 +100,30 @@ def forward(self, batch): return self._forward(batch) +class HFCustomModel(nn.Module, PyTorchModelHubMixin): + def __init__(self, config): + super(HFCustomModel, self).__init__() + self.model = AutoModel.from_pretrained(config["base_model"]) + self.dropout = nn.Dropout(config["fc_dropout"]) + self.fc = nn.Linear(self.model.config.hidden_size, len(config["id2label"])) + + def _forward(self, batch): + features = self.model(batch["input_ids"], batch["attention_mask"]).last_hidden_state + dropped = self.dropout(features) + outputs = self.fc(dropped) + return torch.softmax(outputs[:, 0, :], dim=1) + + def forward(self, batch): + if self.autocast: + with torch.autocast(device_type="cuda"): + return self._forward(batch) + else: + return self._forward(batch) + + def set_autocast(self, autocast): + self.autocast = autocast + + class DistributedDataClassifier(ABC): """Abstract class for running multi-node multi-GPU data classification""" @@ -183,6 +210,7 @@ def _run_classifier_helper( keep_cols=columns_to_keep_list, ) df = classifier_pipe(df) + # TODO: Make crossfit handle this cleanly # to prevent the labeler from dropping the 
prob_internal_col # and combine it into a single step @@ -191,6 +219,7 @@ def _run_classifier_helper( keep_cols=columns_to_keep_list + [prob_internal_col], ) df = labeling_pipe(df) + if keep_prob: df = df.rename( columns={prob_internal_col: prob_col, pred_internal_col: label_col}, @@ -198,41 +227,27 @@ def _run_classifier_helper( else: df = df.rename(columns={pred_internal_col: label_col}) df = df.drop(columns=[prob_internal_col]) + return df class DomainModel(HFModel): - def __init__(self, config, out_dim=None, model_path=None, autocast=False): + def __init__(self, config, autocast=False): self.config = config - self.out_dim = out_dim - self.model_path = model_path self.autocast = autocast super().__init__(self.config.model) def load_model(self, device="cuda"): - model = CustomModel( - self.config, - out_dim=self.out_dim, - config_path=None, - pretrained=True, - autocast=self.autocast, - ) + model = HFCustomModel.from_pretrained(DOMAIN_IDENTIFIER) + model.set_autocast(self.autocast) model = model.to(device) - if os.path.exists(self.model_path): - sd = torch.load(os.path.join(self.model_path), map_location="cpu") - sd = {k[7:] if k.startswith("module.") else k: sd[k] for k in sd.keys()} - if version.parse(TRANSFORMERS_VERSION) >= version.parse("4.31.0"): - sd.pop("model.embeddings.position_ids", None) - model.load_state_dict(sd, strict=True) - else: - raise ValueError(f"Model path {self.model_path} does not exist") return model.eval() def load_tokenizer(self): - return DebertaV2TokenizerFast.from_pretrained(self.config.model) + return AutoTokenizer.from_pretrained(DOMAIN_IDENTIFIER) def load_config(self): - return AutoConfig.from_pretrained(self.path_or_name) + return AutoConfig.from_pretrained(DOMAIN_IDENTIFIER) class QualityModel(HFModel): @@ -244,7 +259,7 @@ def __init__(self, config, out_dim=None, model_path=None, autocast=False): super().__init__(self.config.model) def load_model(self, device="cuda"): - model = CustomModel( + model = NCCustomModel( self.config, out_dim=self.out_dim, config_path=None, @@ -252,6 +267,7 @@ def load_model(self, device="cuda"): autocast=self.autocast, ) model = model.to(device) + if os.path.exists(self.model_path): sd = torch.load(self.model_path, map_location="cpu") if "model_state_dict" in sd: @@ -260,8 +276,8 @@ def load_model(self, device="cuda"): model.load_state_dict(sd, strict=True) else: raise ValueError(f"Model path {self.model_path} does not exist") - model.eval() - return model + + return model.eval() def load_tokenizer(self): return DebertaV2TokenizerFast.from_pretrained(self.config.model) @@ -273,7 +289,6 @@ def load_config(self): class DomainClassifier(DistributedDataClassifier): def __init__( self, - model_path, filter_by=None, batch_size=256, pred_column="domain_pred", @@ -282,44 +297,13 @@ def __init__( device_type="cuda", autocast=True, ): - self.labels = [ - "Adult", - "Arts_and_Entertainment", - "Autos_and_Vehicles", - "Beauty_and_Fitness", - "Books_and_Literature", - "Business_and_Industrial", - "Computers_and_Electronics", - "Finance", - "Food_and_Drink", - "Games", - "Health", - "Hobbies_and_Leisure", - "Home_and_Garden", - "Internet_and_Telecom", - "Jobs_and_Education", - "Law_and_Government", - "News", - "Online_Communities", - "People_and_Society", - "Pets_and_Animals", - "Real_Estate", - "Science", - "Sensitive_Subjects", - "Shopping", - "Sports", - "Travel_and_Transportation", - ] - self.out_dim = len(self.labels) + config = AutoConfig.from_pretrained(DOMAIN_IDENTIFIER) self.prob_column = prob_column + self.labels = 
list(config.label2id.keys()) + self.out_dim = len(self.labels) - model = DomainModel( - config=DomainModelConfig, - out_dim=self.out_dim, - model_path=model_path, - autocast=autocast, - ) + model = DomainModel(config=DomainModelConfig, autocast=autocast) super().__init__( model=model, diff --git a/nemo_curator/scripts/domain_classifier_inference.py b/nemo_curator/scripts/domain_classifier_inference.py index f58723864..837435043 100644 --- a/nemo_curator/scripts/domain_classifier_inference.py +++ b/nemo_curator/scripts/domain_classifier_inference.py @@ -54,7 +54,6 @@ def main(): add_filename = True domain_classifier = DomainClassifier( - model_path=args.pretrained_model_name_or_path, max_chars=max_chars, batch_size=args.batch_size, autocast=args.autocast, diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py index 2f645401a..3304043e6 100644 --- a/nemo_curator/utils/script_utils.py +++ b/nemo_curator/utils/script_utils.py @@ -397,8 +397,8 @@ def parse_distributed_classifier_args( parser.add_argument( "--pretrained-model-name-or-path", type=str, - help="HuggingFace name of model, or local path to downloaded model file", - required=True, + help="The path to the model file", + required=False, ) parser.add_argument( "--input-file-type", diff --git a/tutorials/distributed_data_classification/distributed_data_classification.ipynb b/tutorials/distributed_data_classification/distributed_data_classification.ipynb index b0fec862c..e5ee10ca5 100644 --- a/tutorials/distributed_data_classification/distributed_data_classification.ipynb +++ b/tutorials/distributed_data_classification/distributed_data_classification.ipynb @@ -4,11 +4,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Distributed Data Classification with Quality and Domain Classifiers\n", + "# Distributed Data Classification with Domain and Quality Classifiers\n", "\n", - "The notebook demonstrates the use of two classifiers for distributed data classification, including quality and domain classifiers. The quality classifier is used to classify the quality of the data, while the domain classifier is used to classify the domain of the data. These classifers help with annotation which helps data blending for foundation model training. \n", + "The notebook demonstrates the use of two classifiers for distributed data classification, including domain and quality classifiers. The domain classifier is used to classify the domain of the data, while the quality classifier is used to classify the quality of the data. These classifers help with annotation which helps data blending for foundation model training.\n", "\n", - "The classifiers are accelerated using CrossFit,(https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + "The classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." 
] }, { @@ -25,7 +25,7 @@ } ], "source": [ - "#### Silence Warnings (HuggingFace internal warnings)\n", + "# Silence Warnings (HuggingFace internal warnings)\n", "\n", "%env PYTHONWARNINGS=ignore\n", "import warnings\n", @@ -38,10 +38,10 @@ "metadata": {}, "outputs": [], "source": [ - "from dask_cuda import LocalCUDACluster\n", - "from dask.distributed import Client\n", - "from nemo_curator import DomainClassifier, QualityClassifier\n", - "from nemo_curator.datasets import DocumentDataset" + "from nemo_curator import DomainClassifier, QualityClassifier, get_client\n", + "from nemo_curator.datasets import DocumentDataset\n", + "import cudf\n", + "import dask_cudf" ] }, { @@ -50,26 +50,23 @@ "metadata": {}, "outputs": [], "source": [ - "cluster = LocalCUDACluster(rmm_async=True, rmm_pool_size=\"1GB\")\n", - "client = Client(cluster)" + "client = get_client()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Define the data file paths " + "# Set File Paths " ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "input_file_path=\"/input_data_dir/\"\n", "output_file_path = \"output_data_dir/\"\n", - "domain_model_path = \"domain_model.pth\"\n", "quality_model_path = \"quality_model.pth\"" ] }, @@ -86,79 +83,55 @@ "metadata": {}, "outputs": [], "source": [ - "classifier_type=\"DomainClassifier\" # or \"QualityClassifier\"" + "classifier_type = \"DomainClassifier\" # or \"QualityClassifier\"" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading 16 files\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 10.5 s, sys: 5.33 s, total: 15.8 s\n", - "Wall time: 11.4 s\n" - ] - } - ], + "outputs": [], "source": [ - "%%time\n", - "\n", - "input_dataset = DocumentDataset.read_json(\n", - " input_file_path, backend=\"cudf\", add_filename=True\n", - ")\n", + "# Create sample DataFrame\n", + "text = [\n", + " \"Quantum computing is set to revolutionize the field of cryptography.\",\n", + " \"Investing in index funds is a popular strategy for long-term financial growth.\",\n", + " \"Recent advancements in gene therapy offer new hope for treating genetic disorders.\",\n", + " \"Online learning platforms have transformed the way students access educational resources.\",\n", + " \"Traveling to Europe during the off-season can be a more budget-friendly option.\",\n", + " \"Training regimens for athletes have become more sophisticated with the use of data analytics.\",\n", + " \"Streaming services are changing the way people consume television and film content.\",\n", + " \"Vegan recipes have gained popularity as more people adopt plant-based diets.\",\n", + " \"Climate change research is critical for developing sustainable environmental policies.\",\n", + " \"Telemedicine has become increasingly popular due to its convenience and accessibility.\",\n", + "]\n", + "df = cudf.DataFrame({\"text\": text})\n", + "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", + "write_to_filename = False\n", "\n", + "# Alternatively, read existing directory of JSONL files\n", + "# input_file_path=\"/input_data_dir/\"\n", + "# input_dataset = DocumentDataset.read_json(\n", + "# input_file_path, backend=\"cudf\", add_filename=True\n", + "# )\n", + "# write_to_filename = True" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ 
"if classifier_type == \"DomainClassifier\":\n", - " domain_labels = [\n", - " \"Adult\",\n", - " \"Arts_and_Entertainment\",\n", - " \"Autos_and_Vehicles\",\n", - " \"Beauty_and_Fitness\",\n", - " \"Books_and_Literature\",\n", - " \"Business_and_Industrial\",\n", - " \"Computers_and_Electronics\",\n", - " \"Finance\",\n", - " \"Food_and_Drink\",\n", - " \"Games\",\n", - " \"Health\",\n", - " \"Hobbies_and_Leisure\",\n", - " \"Home_and_Garden\",\n", - " \"Internet_and_Telecom\",\n", - " \"Jobs_and_Education\",\n", - " \"Law_and_Government\",\n", - " \"News\",\n", - " \"Online_Communities\",\n", - " \"People_and_Society\",\n", - " \"Pets_and_Animals\",\n", - " \"Real_Estate\",\n", - " \"Science\",\n", - " \"Sensitive_Subjects\",\n", - " \"Shopping\",\n", - " \"Sports\",\n", - " \"Travel_and_Transportation\",\n", - " ]\n", - " classifier = DomainClassifier(\n", - " model_path=domain_model_path,\n", - " labels=domain_labels,\n", - " batch_size=1024,\n", - " )\n", + " classifier = DomainClassifier(batch_size=1024)\n", + "\n", "elif classifier_type == \"QualityClassifier\":\n", - " quality_labels = [\"High\", \"Medium\", \"Low\"]\n", - " model_file_name = \"quality_classifier.pth\"\n", " classifier = QualityClassifier(\n", " model_path=quality_model_path,\n", - " labels=quality_labels,\n", " batch_size=1024,\n", " )\n", + "\n", "else:\n", " raise ValueError(\"Invalid classifier type\")" ] @@ -188,31 +161,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "GPU: 0, Part: 1: 100%|██████████| 938/938 [00:09<00:00, 101.99it/s] \n", - "GPU: 0, Part: 3: 100%|██████████| 938/938 [00:10<00:00, 92.36it/s] ]\n", - "GPU: 0, Part: 0: 100%|██████████| 938/938 [00:10<00:00, 91.25it/s] ]\n", - "GPU: 0, Part: 5: 100%|██████████| 938/938 [00:10<00:00, 88.82it/s] \n", - "GPU: 0, Part: 14: 100%|██████████| 937/937 [00:10<00:00, 88.11it/s] \n", - "GPU: 0, Part: 8: 100%|██████████| 937/937 [00:10<00:00, 85.46it/s] ]\n", - "GPU: 0, Part: 9: 100%|██████████| 937/937 [00:10<00:00, 86.16it/s] \n", - "GPU: 0, Part: 4: 100%|██████████| 938/938 [00:10<00:00, 85.65it/s]]\n", - "GPU: 0, Part: 11: 100%|██████████| 937/937 [00:11<00:00, 83.73it/s] \n", - "GPU: 0, Part: 6: 100%|██████████| 938/938 [00:11<00:00, 83.62it/s]\n", - "GPU: 0, Part: 10: 100%|██████████| 937/937 [00:11<00:00, 81.27it/s] \n", - "GPU: 0, Part: 2: 100%|██████████| 938/938 [00:12<00:00, 72.59it/s]]\n", - "GPU: 0, Part: 7: 100%|██████████| 937/937 [00:13<00:00, 71.75it/s]\n", - "GPU: 0, Part: 12: 100%|██████████| 937/937 [00:13<00:00, 69.12it/s]\n", - "GPU: 0, Part: 15: 100%|██████████| 937/937 [00:13<00:00, 68.47it/s]\n", - "GPU: 0, Part: 13: 100%|██████████| 937/937 [00:14<00:00, 66.29it/s]\n" + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.23it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Writing to disk complete for 16 partitions\n", - "CPU times: user 2.34 s, sys: 2.24 s, total: 4.58 s\n", - "Wall time: 17.2 s\n" + "Writing to disk complete for 1 partitions\n", + "CPU times: user 4.69 s, sys: 5.13 s, total: 9.82 s\n", + "Wall time: 12.7 s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.07it/s]\n" ] } ], @@ -220,14 +185,14 @@ "%%time\n", "\n", "result_dataset = classifier(dataset=input_dataset)\n", - "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=True)" + "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ - "#### Inspect the Output" + "# Inspect the Output" ] }, { @@ -239,7 +204,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Reading 16 files\n" + "Reading 1 files\n" ] }, { @@ -263,66 +228,54 @@ " \n", " \n", " \n", - " adlr_id\n", " domain_pred\n", - " filename\n", - " id\n", - " pred\n", - " source_id\n", - " split_id\n", " text\n", - " url\n", " \n", " \n", " \n", " \n", " 0\n", - " cc-2022-40-0431053204\n", - " Online_Communities\n", - " 00.jsonl\n", - " a8083fe4-525d-4888-8513-b91f43bd8ee1\n", - " Online_Communities\n", - " crawl-data-CC-MAIN-2022-40-segments-1664030336...\n", - " lambada-0003225258-0000\n", - " Having been a community leader—and member—for ...\n", - " https://lisalarter.com/7-tips-for-building-ste...\n", + " Computers_and_Electronics\n", + " Quantum computing is set to revolutionize the ...\n", " \n", " \n", " 1\n", - " cc-2022-40-0510168267\n", - " Finance\n", - " 00.jsonl\n", - " 559febdc-cb7f-4217-897a-c8dac325123b\n", " Finance\n", - " crawl-data-CC-MAIN-2022-40-segments-1664030337...\n", - " lambada-0003918122-0000\n", - " Zelle is a way of sending money to almost anyo...\n", - " https://oregonmassageandwellnessclinic.com/app...\n", + " Investing in index funds is a popular strategy...\n", + " \n", + " \n", + " 2\n", + " Health\n", + " Recent advancements in gene therapy offer new ...\n", + " \n", + " \n", + " 3\n", + " Jobs_and_Education\n", + " Online learning platforms have transformed the...\n", + " \n", + " \n", + " 4\n", + " Travel_and_Transportation\n", + " Traveling to Europe during the off-season can ...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " adlr_id domain_pred filename \\\n", - "0 cc-2022-40-0431053204 Online_Communities 00.jsonl \n", - "1 cc-2022-40-0510168267 Finance 00.jsonl \n", - "\n", - " id pred \\\n", - "0 a8083fe4-525d-4888-8513-b91f43bd8ee1 Online_Communities \n", - "1 559febdc-cb7f-4217-897a-c8dac325123b Finance \n", - "\n", - " source_id split_id \\\n", - "0 crawl-data-CC-MAIN-2022-40-segments-1664030336... lambada-0003225258-0000 \n", - "1 crawl-data-CC-MAIN-2022-40-segments-1664030337... lambada-0003918122-0000 \n", + " domain_pred \\\n", + "0 Computers_and_Electronics \n", + "1 Finance \n", + "2 Health \n", + "3 Jobs_and_Education \n", + "4 Travel_and_Transportation \n", "\n", - " text \\\n", - "0 Having been a community leader—and member—for ... \n", - "1 Zelle is a way of sending money to almost anyo... \n", - "\n", - " url \n", - "0 https://lisalarter.com/7-tips-for-building-ste... \n", - "1 https://oregonmassageandwellnessclinic.com/app... " + " text \n", + "0 Quantum computing is set to revolutionize the ... \n", + "1 Investing in index funds is a popular strategy... \n", + "2 Recent advancements in gene therapy offer new ... \n", + "3 Online learning platforms have transformed the... \n", + "4 Traveling to Europe during the off-season can ... 
" ] }, "execution_count": 9, @@ -331,24 +284,8 @@ } ], "source": [ - "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=True)\n", - "output_dataset.df.head(2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Cleanup the output file" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "!rm -rf $output_file_path" + "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "output_dataset.df.head()" ] } ], From 7bd69cb76928f876f516c8a12d9c834a9ae53b63 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 19 Jul 2024 14:03:44 -0700 Subject: [PATCH 05/19] lint Signed-off-by: Sarah Yurick --- nemo_curator/modules/distributed_data_classifier.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/nemo_curator/modules/distributed_data_classifier.py b/nemo_curator/modules/distributed_data_classifier.py index 1fd336118..4b8c6a887 100644 --- a/nemo_curator/modules/distributed_data_classifier.py +++ b/nemo_curator/modules/distributed_data_classifier.py @@ -23,12 +23,11 @@ from crossfit import op from crossfit.backend.torch.hf.model import HFModel from huggingface_hub import PyTorchModelHubMixin -from transformers import AutoConfig, AutoTokenizer, AutoModel +from transformers import AutoConfig, AutoModel, AutoTokenizer from transformers.models.deberta_v2 import DebertaV2TokenizerFast from nemo_curator.datasets import DocumentDataset - DOMAIN_IDENTIFIER = "nvidia/domain-classifier" @@ -54,7 +53,9 @@ def __init__( super().__init__() self.config = config if config_path is None: - self.config = AutoConfig.from_pretrained(config.model, output_hidden_states=True) + self.config = AutoConfig.from_pretrained( + config.model, output_hidden_states=True + ) else: self.config = torch.load(config_path) @@ -108,7 +109,9 @@ def __init__(self, config): self.fc = nn.Linear(self.model.config.hidden_size, len(config["id2label"])) def _forward(self, batch): - features = self.model(batch["input_ids"], batch["attention_mask"]).last_hidden_state + features = self.model( + batch["input_ids"], batch["attention_mask"] + ).last_hidden_state dropped = self.dropout(features) outputs = self.fc(dropped) return torch.softmax(outputs[:, 0, :], dim=1) @@ -119,7 +122,7 @@ def forward(self, batch): return self._forward(batch) else: return self._forward(batch) - + def set_autocast(self, autocast): self.autocast = autocast From 4fec0f633fa2881f11afb666078a142061421e4c Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Tue, 2 Jul 2024 13:59:37 -0700 Subject: [PATCH 06/19] add dataframe example (#137) Signed-off-by: Sarah Yurick --- .../distributed_data_classification.ipynb | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/tutorials/distributed_data_classification/distributed_data_classification.ipynb b/tutorials/distributed_data_classification/distributed_data_classification.ipynb index e5ee10ca5..eea0276de 100644 --- a/tutorials/distributed_data_classification/distributed_data_classification.ipynb +++ b/tutorials/distributed_data_classification/distributed_data_classification.ipynb @@ -38,7 +38,13 @@ "metadata": {}, "outputs": [], "source": [ +<<<<<<< HEAD "from nemo_curator import DomainClassifier, QualityClassifier, get_client\n", +======= + "from dask_cuda import LocalCUDACluster\n", + "from dask.distributed import Client\n", + "from nemo_curator 
import DomainClassifier, QualityClassifier\n", +>>>>>>> 19692e0 (add dataframe example (#137)) "from nemo_curator.datasets import DocumentDataset\n", "import cudf\n", "import dask_cudf" @@ -124,9 +130,50 @@ "outputs": [], "source": [ "if classifier_type == \"DomainClassifier\":\n", +<<<<<<< HEAD " classifier = DomainClassifier(batch_size=1024)\n", "\n", "elif classifier_type == \"QualityClassifier\":\n", +======= + " domain_labels = [\n", + " \"Adult\",\n", + " \"Arts_and_Entertainment\",\n", + " \"Autos_and_Vehicles\",\n", + " \"Beauty_and_Fitness\",\n", + " \"Books_and_Literature\",\n", + " \"Business_and_Industrial\",\n", + " \"Computers_and_Electronics\",\n", + " \"Finance\",\n", + " \"Food_and_Drink\",\n", + " \"Games\",\n", + " \"Health\",\n", + " \"Hobbies_and_Leisure\",\n", + " \"Home_and_Garden\",\n", + " \"Internet_and_Telecom\",\n", + " \"Jobs_and_Education\",\n", + " \"Law_and_Government\",\n", + " \"News\",\n", + " \"Online_Communities\",\n", + " \"People_and_Society\",\n", + " \"Pets_and_Animals\",\n", + " \"Real_Estate\",\n", + " \"Science\",\n", + " \"Sensitive_Subjects\",\n", + " \"Shopping\",\n", + " \"Sports\",\n", + " \"Travel_and_Transportation\",\n", + " ]\n", + "\n", + " classifier = DomainClassifier(\n", + " model_path=domain_model_path,\n", + " labels=domain_labels,\n", + " batch_size=1024,\n", + " )\n", + "\n", + "elif classifier_type == \"QualityClassifier\":\n", + " quality_labels = [\"High\", \"Medium\", \"Low\"]\n", + "\n", +>>>>>>> 19692e0 (add dataframe example (#137)) " classifier = QualityClassifier(\n", " model_path=quality_model_path,\n", " batch_size=1024,\n", @@ -161,7 +208,11 @@ "name": "stderr", "output_type": "stream", "text": [ +<<<<<<< HEAD "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.23it/s]" +======= + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:02<00:00, 3.62it/s]" +>>>>>>> 19692e0 (add dataframe example (#137)) ] }, { @@ -169,15 +220,24 @@ "output_type": "stream", "text": [ "Writing to disk complete for 1 partitions\n", +<<<<<<< HEAD "CPU times: user 4.69 s, sys: 5.13 s, total: 9.82 s\n", "Wall time: 12.7 s\n" +======= + "CPU times: user 578 ms, sys: 429 ms, total: 1.01 s\n", + "Wall time: 9.91 s\n" +>>>>>>> 19692e0 (add dataframe example (#137)) ] }, { "name": "stderr", "output_type": "stream", "text": [ +<<<<<<< HEAD "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.07it/s]\n" +======= + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:03<00:00, 3.30it/s]\n" +>>>>>>> 19692e0 (add dataframe example (#137)) ] } ], @@ -286,6 +346,25 @@ "source": [ "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", "output_dataset.df.head()" +<<<<<<< HEAD +======= + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Remove the Output File(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf $output_file_path" +>>>>>>> 19692e0 (add dataframe example (#137)) ] } ], From 6d7367cc8af8c8875fd496b39e81c665662338a8 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Fri, 5 Jul 2024 16:47:22 -0700 Subject: [PATCH 07/19] Enable Sem-dedup (#130) * Applying SEO Best Pratices (#104) * Rename CPUvsGPU.rst to cpuvsgpu.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename DataCuration.rsts to datacuration.rsts Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename DistributedDataClassification.rst to 
distributeddataclassification.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename DocumentDataset.rst to documentdataset.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename Download.rst to download.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename GpuDeduplication.rst to gpudeduplication.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename KubernetesCurator.rst to kubernetescurator.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename LanguageIdentificationUnicodeFormatting.rst to languageidentificationunicodeformatting.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename PersonalIdentifiableInformationIdentificationAndRemoval.rst to personalidentifiableinformationidentificationandremoval.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename QualityFiltering.rst to qualityfiltering.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename TaskDecontamination.rst to taskdecontamination.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Update index.rst Setting all RST files to lowercase names. Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Ignore docs for EOF fixer hook Signed-off-by: Ayush Dattagupta --------- Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> Signed-off-by: Ayush Dattagupta Co-authored-by: Ayush Dattagupta Signed-off-by: Vibhu Jawa * Shuffle CC result on group before writing out (#110) Signed-off-by: Ayush Dattagupta Signed-off-by: Vibhu Jawa * Update index.rst (#113) Added links to tutorials Signed-off-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Vibhu Jawa * first commit Signed-off-by: avinashvem Signed-off-by: Vibhu Jawa * mv under modules dir Signed-off-by: avinashvem Signed-off-by: Vibhu Jawa * first commit Signed-off-by: avinashvem Signed-off-by: Vibhu Jawa * mv under modules dir Signed-off-by: avinashvem Signed-off-by: Vibhu Jawa * first commit Signed-off-by: Vibhu Jawa * mv under modules dir Signed-off-by: Vibhu Jawa * embed by cluster saved Signed-off-by: Vibhu Jawa * id map script Signed-off-by: Vibhu Jawa * test commit Signed-off-by: Vibhu Jawa * add id map script Signed-off-by: Vibhu Jawa * Cleanup compute_embeddings_crossfit.py Signed-off-by: Vibhu Jawa * Cleanup compute_embeddings_crossfit.py Signed-off-by: Vibhu Jawa * Pre-commit style fixes Signed-off-by: Vibhu Jawa * clustering_dask_crossfit.py Signed-off-by: Vibhu Jawa * Minor clean up to sort_clusters_crossfit.py Signed-off-by: Vibhu Jawa * cleanup semdedup_crossfit Signed-off-by: Vibhu Jawa * Remove undo changes Signed-off-by: Vibhu Jawa * Remove rename changes Signed-off-by: Vibhu Jawa * Fix rename Signed-off-by: Vibhu Jawa * Readme formatting Signed-off-by: Vibhu Jawa * add dask to semdedup_crossfit.py Signed-off-by: Vibhu Jawa * README.md updates Signed-off-by: Vibhu Jawa * README.md updates Signed-off-by: Vibhu Jawa * README.md updates Signed-off-by: Vibhu Jawa * README.md updates Signed-off-by: Vibhu Jawa * README.md updates Signed-off-by: Vibhu Jawa * configure max memory using a cli Signed-off-by: Vibhu Jawa * Dumb id results to parquet Signed-off-by: Vibhu Jawa * Embedding fixes Signed-off-by: Vibhu Jawa * README.md updates Signed-off-by: 
Vibhu Jawa * Working end to end Signed-off-by: Vibhu Jawa * Minor yaml fixes Signed-off-by: Vibhu Jawa * Undo changes to index.rst Signed-off-by: Vibhu Jawa * Update .pre-commit-config.yaml Signed-off-by: Vibhu Jawa * Update index.rst Signed-off-by: Vibhu Jawa * Update index.rst Signed-off-by: Vibhu Jawa * Undo changes to docs/personalidentifiableinformationidentificationandremoval.rst Signed-off-by: Vibhu Jawa * Update fuzzy_dedup.py Signed-off-by: Vibhu Jawa * Undo changes to docs/personalidentifiableinformationidentificationandremoval.rst Signed-off-by: Vibhu Jawa * Update index.rst Signed-off-by: Vibhu Jawa * Add end to end script in readme.md Signed-off-by: Vibhu Jawa * Add type hints Signed-off-by: Vibhu Jawa * Use dask for sort_clusters Signed-off-by: Vibhu Jawa * Make sort_clusters work on MNMG scales Signed-off-by: Vibhu Jawa * Cleaned up dask shutdown Signed-off-by: Vibhu Jawa * Decrease noise in E2E scripts Signed-off-by: Vibhu Jawa * Clean up scripts Signed-off-by: Vibhu Jawa * Fix scripts/end_to_end_script.sh Signed-off-by: Vibhu Jawa * Some more cleanup Signed-off-by: Vibhu Jawa * Add copyright Signed-off-by: Vibhu Jawa * Fix README.md Signed-off-by: Vibhu Jawa * Address reviews Signed-off-by: Vibhu Jawa * Make work with a SemDedupConfig Signed-off-by: Vibhu Jawa * Make work with SemDedupConfig Signed-off-by: Vibhu Jawa * Move to nemo-curator's logger Signed-off-by: Vibhu Jawa * Semdedup-extract_dedup_data.py Signed-off-by: Vibhu Jawa * Update index.rst Signed-off-by: Vibhu Jawa * Applying SEO Best Pratices (#104) * Rename CPUvsGPU.rst to cpuvsgpu.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename DataCuration.rsts to datacuration.rsts Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename DistributedDataClassification.rst to distributeddataclassification.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename DocumentDataset.rst to documentdataset.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename Download.rst to download.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename GpuDeduplication.rst to gpudeduplication.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename KubernetesCurator.rst to kubernetescurator.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename LanguageIdentificationUnicodeFormatting.rst to languageidentificationunicodeformatting.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename PersonalIdentifiableInformationIdentificationAndRemoval.rst to personalidentifiableinformationidentificationandremoval.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename QualityFiltering.rst to qualityfiltering.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Rename TaskDecontamination.rst to taskdecontamination.rst Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Update index.rst Setting all RST files to lowercase names. 
Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Ignore docs for EOF fixer hook Signed-off-by: Ayush Dattagupta --------- Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> Signed-off-by: Ayush Dattagupta Co-authored-by: Ayush Dattagupta * Update index.rst Signed-off-by: Vibhu Jawa * Fix bad merge Signed-off-by: Vibhu Jawa * Update index.rst Signed-off-by: Vibhu Jawa * Update index.rst Signed-off-by: Vibhu Jawa * Update index.rst Signed-off-by: Vibhu Jawa * Update index.rst Signed-off-by: Vibhu Jawa * Add Module for embedding+clustering Signed-off-by: Vibhu Jawa * Add sorting to clustering Signed-off-by: Vibhu Jawa * Refactor Semdup modules Signed-off-by: Vibhu Jawa * Refactor Semdup modules Signed-off-by: Vibhu Jawa * Refactor Semdup modules Signed-off-by: Vibhu Jawa * Fix Readme.md Signed-off-by: Vibhu Jawa * Add a environment variable to silence HF warnings Signed-off-by: Vibhu Jawa * dask-cudf fix Signed-off-by: Vibhu Jawa * dask-cudf fix Signed-off-by: Vibhu Jawa * dask-cudf fix Signed-off-by: Vibhu Jawa * Make config a flat file based on reviews Signed-off-by: Vibhu Jawa * Add docstrings Signed-off-by: Vibhu Jawa * Fix argparse and seed function Signed-off-by: Vibhu Jawa * Use argparse to read config Signed-off-by: Vibhu Jawa * Move around config files Signed-off-by: Vibhu Jawa * Move around config files Signed-off-by: Vibhu Jawa * Move around config files Signed-off-by: Vibhu Jawa * Remove end_to_end_script.sh Signed-off-by: Vibhu Jawa * Append Readme Signed-off-by: Vibhu Jawa * Address Reviews Signed-off-by: Vibhu Jawa * Change config Signed-off-by: Vibhu Jawa * Make embedding creation optionally lazy Signed-off-by: Vibhu Jawa * fix docstring Signed-off-by: Vibhu Jawa * Address Reviews and docstrings Signed-off-by: Vibhu Jawa * Address Reviews and make eps_thresholds a list of values Signed-off-by: Vibhu Jawa * Minor import fix Signed-off-by: Vibhu Jawa * Empty Commit Signed-off-by: Vibhu Jawa * Add modules to __init__ and README.md Signed-off-by: Vibhu Jawa * Fix init Signed-off-by: Vibhu Jawa * Move comment Signed-off-by: Vibhu Jawa * Empty commit to restart CI (which failed due to a download issue) Signed-off-by: Vibhu Jawa * Empty commit to restart CI (which failed due to a download issue) Signed-off-by: Vibhu Jawa --------- Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> Signed-off-by: Ayush Dattagupta Signed-off-by: Vibhu Jawa Signed-off-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: avinashvem Co-authored-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> Co-authored-by: Ayush Dattagupta Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Co-authored-by: avinashvem --- README.md | 3 +- config/sem_dedup_config.yaml | 32 + docs/user-guide/index.rst | 1 - examples/semdedup_example.py | 84 +++ nemo_curator/log.py | 8 +- nemo_curator/modules/__init__.py | 18 +- nemo_curator/modules/config.py | 70 ++- nemo_curator/modules/semantic_dedup.py | 573 ++++++++++++++++++ nemo_curator/scripts/semdedup/README.md | 40 ++ nemo_curator/scripts/semdedup/__init__.py | 0 nemo_curator/scripts/semdedup/clustering.py | 108 ++++ .../scripts/semdedup/compute_embeddings.py | 118 ++++ .../scripts/semdedup/extract_dedup_data.py | 88 +++ nemo_curator/utils/distributed_utils.py | 34 +- nemo_curator/utils/file_utils.py | 17 +- nemo_curator/utils/script_utils.py | 50 ++ nemo_curator/utils/semdedup_utils.py | 445 ++++++++++++++ setup.py | 6 +- 18 files 
changed, 1683 insertions(+), 12 deletions(-) create mode 100644 config/sem_dedup_config.yaml create mode 100644 examples/semdedup_example.py create mode 100644 nemo_curator/modules/semantic_dedup.py create mode 100644 nemo_curator/scripts/semdedup/README.md create mode 100644 nemo_curator/scripts/semdedup/__init__.py create mode 100644 nemo_curator/scripts/semdedup/clustering.py create mode 100644 nemo_curator/scripts/semdedup/compute_embeddings.py create mode 100755 nemo_curator/scripts/semdedup/extract_dedup_data.py create mode 100644 nemo_curator/utils/semdedup_utils.py diff --git a/README.md b/README.md index ead0348fb..a6e7e0f83 100644 --- a/README.md +++ b/README.md @@ -39,8 +39,9 @@ NeMo Curator provides a collection of scalable data-mining modules. Some of the - [Document-level deduplication](docs/user-guide/gpudeduplication.rst) - - Both exact and fuzzy (near-identical) deduplication are accelerated using cuDF and Dask + - exact and fuzzy (near-identical) deduplication are accelerated using cuDF and Dask - For fuzzy deduplication, our implementation follows the method described in [Microsoft Turing NLG 530B](https://arxiv.org/abs/2201.11990) + - For semantic deduplication, our implementation follows the method described in [SemDeDup] (https://arxiv.org/pdf/2303.09540) by Meta AI (FAIR) (https://github.com/facebookresearch/SemDeDup) - [Multilingual downstream-task decontamination](docs/user-guide/taskdecontamination.rst) following the approach of [OpenAI GPT3](https://arxiv.org/pdf/2005.14165.pdf) and [Microsoft Turing NLG 530B](https://arxiv.org/abs/2201.11990) diff --git a/config/sem_dedup_config.yaml b/config/sem_dedup_config.yaml new file mode 100644 index 000000000..ec847e4b9 --- /dev/null +++ b/config/sem_dedup_config.yaml @@ -0,0 +1,32 @@ +# Configuration file for semdantic dedup +cache_dir: "semdedup_cache" +num_files: 16 +id_col_name: "id" +id_col_type: "int" +input_column: "text" + +# Embeddings configuration +embeddings_save_loc: "embeddings" +embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2" +embedding_batch_size: 128 +embedding_max_mem_gb: 25 + +# Clustering configuration +clustering_save_loc: "clustering_results" +n_clusters: 1000 +seed: 1234 +max_iter: 100 +kmeans_with_cos_dist: false + +# Semdedup configuration +which_to_keep: "hard" +largest_cluster_size_to_process: 100000 +sim_metric: "cosine" + +# Extract dedup configuration +eps_thresholds: + - 0.01 + - 0.001 + +# Which threshold to use for extracting deduped data +eps_to_extract: 0.01 diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index 74c219c28..31f29069c 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -46,4 +46,3 @@ personalidentifiableinformationidentificationandremoval.rst distributeddataclassification.rst kubernetescurator.rst - diff --git a/examples/semdedup_example.py b/examples/semdedup_example.py new file mode 100644 index 000000000..a1ed163be --- /dev/null +++ b/examples/semdedup_example.py @@ -0,0 +1,84 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import time + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.log import create_logger +from nemo_curator.modules.config import SemDedupConfig +from nemo_curator.modules.semantic_dedup import SemDedup +from nemo_curator.utils.distributed_utils import get_client, read_data +from nemo_curator.utils.file_utils import ( + expand_outdir_and_mkdir, + get_all_files_paths_under, +) +from nemo_curator.utils.script_utils import ArgumentHelper + + +def silence_hf_warnings(): + from transformers.utils import logging + + logging.set_verbosity_error() + + +def main(args): + semdedup_config = SemDedupConfig.from_yaml(args.config_file) + client = get_client(**ArgumentHelper.parse_client_args(args)) + + silence_hf_warnings() + client.run(silence_hf_warnings) + + expand_outdir_and_mkdir(semdedup_config.cache_dir) + logger = create_logger( + rank=0, + name="logger-end-to_end-semdup", + log_file=os.path.join(semdedup_config.cache_dir, "compute_embeddings.log"), + log_level=logging.INFO, + stdout=True, + ) + st = time.time() + input_files = get_all_files_paths_under( + root=args.input_data_dir, + ) + if semdedup_config.num_files > 0: + input_files = input_files[: semdedup_config.num_files] + logger.info(f"Processing {len(input_files)} files") + ddf = read_data( + input_files=input_files, + file_type=args.input_file_type, + add_filename=False, + backend="cudf", + ) + dataset = DocumentDataset(ddf) + semdup = SemDedup(semdedup_config, logger=logger) + dedup_ids = semdup(dataset) + print(dedup_ids.df.head()) + logger.info(f"Time taken: {time.time() - st}") + client.cancel(client.futures, force=True) + client.close() + + +def attach_args(): + parser = ArgumentHelper.parse_semdedup_args(add_input_args=True) + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/log.py b/nemo_curator/log.py index e69afc1f6..92cf37a37 100644 --- a/nemo_curator/log.py +++ b/nemo_curator/log.py @@ -19,7 +19,7 @@ from nemo_curator.utils.file_utils import expand_outdir_and_mkdir -def create_logger(rank, log_file, name="logger", log_level=logging.INFO): +def create_logger(rank, log_file, name="logger", log_level=logging.INFO, stdout=False): # Create the logger logger = logging.getLogger(name) logger.setLevel(log_level) @@ -36,8 +36,12 @@ def create_logger(rank, log_file, name="logger", log_level=logging.INFO): file_handler.setFormatter(formatter) logger.addHandler(file_handler) - logger = logging.LoggerAdapter(logger, extra) + if stdout: + stdout_handler = logging.StreamHandler() + stdout_handler.setFormatter(formatter) + logger.addHandler(stdout_handler) + logger = logging.LoggerAdapter(logger, extra) return logger diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index 8b9613261..db6aca7df 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -22,7 +22,7 @@ from nemo_curator.utils.import_utils import gpu_only_import_from from .add_id import AddId -from .config import FuzzyDuplicatesConfig +from .config import FuzzyDuplicatesConfig, SemDedupConfig from .dataset_ops import blend_datasets, Shuffle from .exact_dedup import ExactDuplicates from .filter import Filter, Score, ScoreFilter @@ -36,10 +36,19 @@ FuzzyDuplicates = gpu_only_import_from( "nemo_curator.modules.fuzzy_dedup", "FuzzyDuplicates" ) - # 
Pytorch related imports must come after all imports that require cugraph, # because of context cleanup issues b/w pytorch and cugraph # See this issue: https://github.com/rapidsai/cugraph/issues/2718 +SemDedup = gpu_only_import_from("nemo_curator.modules.semantic_dedup", "SemDedup") +EmbeddingCreator = gpu_only_import_from( + "nemo_curator.modules.semantic_dedup", "EmbeddingCreator" +) +ClusteringModel = gpu_only_import_from( + "nemo_curator.modules.semantic_dedup", "ClusteringModel" +) +SemanticClusterLevelDedup = gpu_only_import_from( + "nemo_curator.modules.semantic_dedup", "SemanticClusterLevelDedup" +) from .distributed_data_classifier import DomainClassifier, QualityClassifier __all__ = [ @@ -59,4 +68,9 @@ "AddId", "blend_datasets", "Shuffle", + "SemDedup", + "SemDedupConfig", + "EmbeddingCreator", + "ClusteringModel", + "SemanticClusterLevelDedup", ] diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py index 45ea527f2..eec5b42ed 100644 --- a/nemo_curator/modules/config.py +++ b/nemo_curator/modules/config.py @@ -13,7 +13,8 @@ # limitations under the License. import warnings -from dataclasses import dataclass +from dataclasses import dataclass, field +from typing import List import yaml @@ -98,3 +99,70 @@ def __post_init__(self): raise ValueError("Jaccard Threshold must be between [0,1]") if self.buckets_per_shuffle <= 0: raise ValueError("Buckets per shuffle must be greater than 0") + + +@dataclass +class SemDedupConfig(BaseConfig): + """ + Configuration for Semantic Deduplication. + + Attributes: + cache_dir (str): Directory to store cache. + num_files (int): Number of files. Default is -1, meaning all files. + id_col_name (str): Column name for ID. + id_col_type (str): Column type for ID. + input_column (str): Input column for embeddings. + embeddings_save_loc (str): Location to save embeddings. + embedding_model_name_or_path (str): Model name or path for embeddings. + embedding_batch_size (int): Inital Batch size for processing embeddings. + embedding_max_mem_gb (int): Maximum memory in GB for embeddings. + clustering_save_loc (str): Location to save clustering results. + n_clusters (int): Number of clusters. + seed (int): Seed for clustering. + max_iter (int): Maximum iterations for clustering. + kmeans_with_cos_dist (bool): Use KMeans with cosine distance. + which_to_keep (str): Which duplicates to keep. + largest_cluster_size_to_process (int): Largest cluster size to process. + sim_metric (str): Similarity metric for deduplication. + eps_thresholds (List[float]): Epsilon thresholds to calculate if semantically similar or not. + eps_to_extract (float): Epsilon value to extract deduplicated data. 
+ """ + + cache_dir: str + num_files: int = -1 + id_col_name: str = "id" + id_col_type: str = "str" + input_column: str = "text" + + # Embeddings + embeddings_save_loc: str = "embeddings" + embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2" + embedding_batch_size: int = 128 + embedding_max_mem_gb: int = 25 + + # Clustering config + clustering_save_loc: str = "clustering_results" + n_clusters: int = 1000 + seed: int = 1234 + max_iter: int = 100 + kmeans_with_cos_dist: bool = False + + # Semdedup config + which_to_keep: str = "hard" + largest_cluster_size_to_process: int = 100000 + sim_metric: str = "cosine" + + # Extract dedup config + eps_thresholds: List[float] = field(default_factory=lambda: [0.01, 0.001]) + eps_to_extract: float = 0.01 + + def __post_init__(self): + if self.cache_dir is None: + raise ValueError( + "Finding sem-dedup requires a cache directory accessible via all workers to store intermediates" + ) + + if self.eps_to_extract not in self.eps_thresholds: + raise ValueError( + f"Epsilon to extract {self.eps_to_extract} must be in eps_thresholds {self.eps_thresholds}" + ) diff --git a/nemo_curator/modules/semantic_dedup.py b/nemo_curator/modules/semantic_dedup.py new file mode 100644 index 000000000..5b95692f1 --- /dev/null +++ b/nemo_curator/modules/semantic_dedup.py @@ -0,0 +1,573 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import logging +import os +import shutil +from dataclasses import dataclass +from typing import List, Optional, Union + +import cudf +import cupy as cp +import dask.bag as db +import dask.dataframe as dd +import dask_cudf +import numpy as np +import torch +import torch.nn as nn +from crossfit import op +from crossfit.backend.torch.hf.model import HFModel +from cuml.dask.cluster import KMeans +from torch.nn import functional as F +from transformers import AutoConfig, AutoModel, AutoTokenizer + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.log import create_logger +from nemo_curator.modules.config import SemDedupConfig +from nemo_curator.utils.distributed_utils import write_to_disk +from nemo_curator.utils.file_utils import expand_outdir_and_mkdir +from nemo_curator.utils.semdedup_utils import ( + _assign_and_sort_clusters, + extract_dedup_data, + get_semantic_matches_per_cluster, +) + + +# Embedding Creation Module +@dataclass +class EmbeddingConfig: + model_name_or_path: str + max_mem_gb: int + max_seq_length: int = None + + def __post_init__(self): + self.max_seq_length = AutoTokenizer.from_pretrained( + self.model_name_or_path + ).model_max_length + # Gaurd against the HF bug + # which sets max_seq_length to max(int) for some models + if self.max_seq_length > 1e5: + self.max_seq_length = AutoConfig.from_pretrained( + self.model_name_or_path + ).max_position_embeddings + + +class EmbeddingPytorchModel(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.model = AutoModel.from_pretrained( + config.model_name_or_path, config=self.config, force_download=False + ) + + def feature(self, input_ids, attention_mask): + with torch.autocast(device_type=input_ids.device.type): + embeddings = self.model(input_ids=input_ids, attention_mask=attention_mask) + return embeddings + + @torch.no_grad() + def forward(self, batch): + feature = self.feature(batch["input_ids"], batch["attention_mask"]) + return self._mean_pooling(feature, batch["attention_mask"]) + + def _mean_pooling(self, model_output, attention_mask): + token_embeddings = model_output[0] + input_mask_expanded = ( + attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + ) + sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, dim=1) + sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9) + return F.normalize(sum_embeddings / sum_mask, dim=1) + + +class EmbeddingCrossFitModel(HFModel): + def __init__(self, config: EmbeddingConfig): + self.config = config + super().__init__( + self.config.model_name_or_path, max_mem_gb=self.config.max_mem_gb + ) + + def load_model(self, device="cuda"): + model = EmbeddingPytorchModel(self.config) + model = model.to(device) + model.eval() + return model + + def max_seq_length(self): + return self.config.max_seq_length + + def load_config(self): + return AutoConfig.from_pretrained(self.config.model_name_or_path) + + def load_tokenizer(self): + return AutoTokenizer.from_pretrained(self.config.model_name_or_path) + + +class EmbeddingCreator: + def __init__( + self, + embedding_model_name_or_path: str, + embedding_max_mem_gb: str, + embedding_batch_size: int, + embedding_output_dir: str, + input_column: str = "text", + write_embeddings_to_disk: bool = True, + write_to_filename: bool = False, + logger: Union[logging.Logger, str] = "./", + ): + """ + Initializes an EmbeddingCreator for generating embeddings using the specified model configurations. 
+ + Args: + embedding_model_name_or_path (str): The path or identifier for the model used to generate embeddings. + embedding_max_mem_gb (str): Maximum memory usage for the embedding process. + embedding_batch_size (int): Number of samples to process in each batch. + embedding_output_dir (str): Directory path where embeddings will be saved. + input_column (str): Column name from the data to be used for embedding generation, defaults to "text". + write_embeddings_to_disk (bool, optional): If True, saves the embeddings to disk, defaults to True. + We recommend setting this to False when you have a delayed pipeline. + Setting it to False can lead to more memory overhead. + write_to_filename (bool): If True, saves the embeddings to the same filename as input files, defaults to False. + logger (Union[logging.Logger, str]): Logger object or path to store logs, defaults to "./". + + Attributes: + embeddings_config (EmbeddingConfig): Configuration for embeddings. + batch_size (int): Batch size for embedding generation. + logger (logging.Logger): Logger instance for the class. + embedding_output_dir (str): Output directory for embeddings. + input_column (str): Input column for data processing. + model (EmbeddingCrossFitModel): Model instance for embedding generation. + write_to_filename (bool): If True, saves the embeddings to the same filename as input files, defaults to False. + """ + + self.embeddings_config = EmbeddingConfig( + model_name_or_path=embedding_model_name_or_path, + max_mem_gb=embedding_max_mem_gb, + ) + self.batch_size = embedding_batch_size + self.logger = self._setup_logger(logger) + self.embedding_output_dir = embedding_output_dir + self.input_column = input_column + self.model = EmbeddingCrossFitModel(self.embeddings_config) + self.write_embeddings_to_disk = write_embeddings_to_disk + self.write_to_filename = write_to_filename + + def _setup_logger(self, logger): + if isinstance(logger, str): + return create_logger( + rank=0, + name="compute-embeddings", + log_file=os.path.join(logger, "compute_embeddings.log"), + log_level=logging.INFO, + stdout=True, + ) + else: + return logger + + def create_embeddings( + self, ddf: dask_cudf.DataFrame, input_column="text" + ) -> dask_cudf.DataFrame: + pipe = op.Sequential( + op.Tokenizer( + self.model, + cols=[input_column], + tokenizer_type="sentencepiece", + max_length=self.embeddings_config.max_seq_length, + ), + op.Predictor( + self.model, + sorted_data_loader=True, + batch_size=self.batch_size, + pred_output_col="embeddings", + ), + keep_cols=ddf.columns.tolist(), + ) + return pipe(ddf) + + def __call__(self, dataset: DocumentDataset) -> DocumentDataset: + embedding_ddf = self.create_embeddings(dataset.df, self.input_column) + if self.write_embeddings_to_disk: + write_to_disk( + embedding_ddf, + self.embedding_output_dir, + write_to_filename=self.write_to_filename, + output_type="parquet", + ) + return DocumentDataset( + dask_cudf.read_parquet( + self.embedding_output_dir, blocksize="2GB", aggregate_files=True + ) + ) + else: + return DocumentDataset(embedding_ddf) + + +### Clustering Module +def get_embedding_ar(df: "cudf.DataFrame") -> cp.ndarray: + return df["embeddings"].list.leaves.values.reshape(len(df), -1) + + +def add_dist_to_cents(df: "cudf.DataFrame", centroids: cp.ndarray) -> "cudf.DataFrame": + embed_array = get_embedding_ar(df) + centroids_ar = centroids[df["nearest_cent"].values] + dist_to_cents = cp.sqrt(np.sum((embed_array - centroids_ar) ** 2, axis=1)) + df["dist_to_cent"] = dist_to_cents + return df + + +class 
ClusteringModel: + def __init__( + self, + id_col: str, + max_iter: int, + n_clusters: int, + clustering_output_dir: str, + sim_metric: str = "cosine", + which_to_keep: str = "hard", + sort_clusters: bool = True, + kmeans_with_cos_dist: bool = False, + partition_size: str = "2gb", + logger: Union[logging.Logger, str] = "./", + ): + """ + Initializes the ClusteringModel with the provided settings for semantic clustering to help semantic deduplication. + + Args: + id_col (str): Column name used as the identifier in the dataset. + max_iter (int): Maximum number of iterations for the clustering algorithm. + n_clusters (int): The number of clusters to form. + clustering_output_dir (str): Directory path where clustering results will be saved. + sim_metric (str): Similarity metric to use for clustering, default is "cosine". + which_to_keep (str): Strategy to decide which duplicates to keep; default is "hard". + sort_clusters (bool): Whether to sort clusters, default is True. + kmeans_with_cos_dist (bool): Whether to use KMeans with cosine distance, default is False. + partition_size (str): The size of data partition to run kmeans with, default is "2gb". + logger (Union[logging.Logger, str]): Logger object or directory path to save logs; default is "./". + + This constructor sets up the parameters required for clustering operations. + """ + self.id_col = id_col + self.max_iter = max_iter + self.n_clusters = n_clusters + self.clustering_output_dir = clustering_output_dir + self.sim_metric = sim_metric + self.keep_hard = which_to_keep == "hard" + self.kmeans_with_cos_dist = kmeans_with_cos_dist + self.partition_size = partition_size + self.sort_clusters = sort_clusters + self.logger = self._setup_logger(logger) + + if not os.path.exists(self.clustering_output_dir): + expand_outdir_and_mkdir(self.clustering_output_dir) + else: + self.logger.warning( + f"Clustering output directory {self.clustering_output_dir} already exists and will be overwritten" + ) + + def _setup_logger(self, logger): + if isinstance(logger, str): + return create_logger( + rank=0, + name="SemanticClusterLevelDedup", + log_file=os.path.join(logger, "SemanticClusterLevelDedup.log"), + log_level=logging.INFO, + stdout=True, + ) + else: + return logger + + def __call__(self, embeddings_dataset: DocumentDataset): + embeddings_df = embeddings_dataset.df + + assert "embeddings" in embeddings_df.columns + embeddings_df = embeddings_df[[self.id_col, "embeddings"]] + + embeddings_df = embeddings_df.to_backend("pandas").persist() + embeddings_df = embeddings_df.repartition(partition_size=self.partition_size) + embeddings_df = embeddings_df.to_backend("cudf") + + cupy_darr = embeddings_df.map_partitions( + get_embedding_ar, meta=cp.ndarray([1, 1]) + ) + cupy_darr.compute_chunk_sizes() + + kmeans = KMeans(n_clusters=self.n_clusters, max_iter=self.max_iter) + self.logger.info("KMeans starting fit") + kmeans.fit(cupy_darr) + self.logger.info("KMeans fit complete") + + self.logger.info( + "Computing nearest centroids + distance to centers using kmeans.predict" + ) + nearest_cents = kmeans.predict(cupy_darr) + embeddings_df["nearest_cent"] = nearest_cents.astype(np.int32) + del nearest_cents + meta_df = embeddings_df._meta.copy() + meta_df["dist_to_cent"] = cp.zeros(1) + embeddings_df = embeddings_df.map_partitions( + add_dist_to_cents, centroids=kmeans.cluster_centers_, meta=meta_df + ) + centroids = kmeans.cluster_centers_ + embeddings_df = embeddings_df.reset_index(drop=True) + kmeans_centroids_file = os.path.join( + 
self.clustering_output_dir, "kmeans_centroids.npy" + ) + np.save(kmeans_centroids_file, centroids) + self.logger.info("Saving centroids complete") + del kmeans, cupy_darr, centroids + + clustering_output_dir = os.path.join( + self.clustering_output_dir, "embs_by_nearest_center" + ) + if os.path.exists(clustering_output_dir): + self.logger.warning( + f"Output directory {clustering_output_dir} already exists and will be overwritten" + ) + shutil.rmtree(clustering_output_dir) + + embeddings_df.to_parquet( + clustering_output_dir, + index=False, + partition_on="nearest_cent", + ) + self.logger.info( + f"Saved embeddings by nearest center to {clustering_output_dir}" + ) + del embeddings_df + + if self.sort_clusters: + _assign_and_sort_clusters( + id_col=self.id_col, + kmeans_centroids_file=kmeans_centroids_file, + nearest_cent_dir=clustering_output_dir, + output_sorted_clusters_dir=os.path.join( + self.clustering_output_dir, "sorted" + ), + sim_metric=self.sim_metric, + keep_hard=self.keep_hard, + kmeans_with_cos_dist=self.kmeans_with_cos_dist, + cluster_ids=range(self.n_clusters), + logger=self.logger, + ) + + fps = [ + os.path.join(clustering_output_dir, file_name) + for file_name in os.listdir(clustering_output_dir) + ] + embeddings_df = dd.from_map(cudf.read_parquet, fps) + return DocumentDataset(embeddings_df) + + +class SemanticClusterLevelDedup: + def __init__( + self, + n_clusters: int, + emb_by_clust_dir: str, + sorted_clusters_dir: str, + id_col: str, + id_col_type: str, + which_to_keep: str, + output_dir: str, + logger: Union[logging.Logger, str] = "./", + ) -> None: + """ + Initialize the SemanticClusterLevelDedup class. + + Args: + n_clusters (int): Number of clusters. + emb_by_clust_dir (str): Directory containing embeddings by cluster. + sorted_clusters_dir (str): Directory containing sorted clusters. + id_col (str): Column name for IDs. + id_col_type (str): Data type of the ID column. + which_to_keep (str): Strategy for which duplicate to keep. + output_dir (str): Directory to save output files. + logger (Union[logging.Logger, str]): Logger instance or path to the log file directory. + """ + self.n_clusters = n_clusters + self.emb_by_clust_dir = emb_by_clust_dir + self.sorted_clusters_dir = sorted_clusters_dir + self.id_col = id_col + self.id_col_type = id_col_type + self.which_to_keep = which_to_keep + self.output_dir = output_dir + self.semdedup_pruning_tables_dir = os.path.join( + output_dir, "semdedup_pruning_tables" + ) + self.computed_semantic_match_dfs = False + self.logger = self._setup_logger(logger) + + def _setup_logger(self, logger: Union[logging.Logger, str]) -> logging.Logger: + """ + Set up the logger. + + Args: + logger (Union[logging.Logger, str]): Logger instance or path to the log file directory. + + Returns: + logging.Logger: Configured logger. + """ + if isinstance(logger, str): + return create_logger( + rank=0, + name="SemanticClusterLevelDedup", + log_file=os.path.join(logger, "SemanticClusterLevelDedup.log"), + log_level=logging.INFO, + stdout=True, + ) + else: + return logger + + def compute_semantic_match_dfs( + self, eps_list: Optional[List[float]] = None + ) -> None: + """ + Compute semantic match dataframes for clusters. + + Args: + eps_list (Optional[List[float]]): List of epsilon values for clustering. 
+ """ + if eps_list is None: + eps_list1 = [1.0e-2, 1.0e-3, 1.0e-4, 1.0e-5, 1.0e-6] + eps_list2 = [0.1 + x * 0.005 for x in range(34)] + eps_list = eps_list1 + eps_list2 + + if os.path.exists(self.semdedup_pruning_tables_dir): + self.logger.info( + f"Removing existing directory {self.semdedup_pruning_tables_dir}" + ) + shutil.rmtree(self.semdedup_pruning_tables_dir) + expand_outdir_and_mkdir(self.semdedup_pruning_tables_dir) + + tasks = db.from_sequence( + list(range(self.n_clusters)), npartitions=self.n_clusters + ).map( + lambda cluster_id: get_semantic_matches_per_cluster( + cluster_id=cluster_id, + emb_by_clust_dir=self.emb_by_clust_dir, + sorted_clusters_dir=self.sorted_clusters_dir, + id_col=self.id_col, + id_col_type=self.id_col_type, + eps_list=eps_list, + output_dir=self.semdedup_pruning_tables_dir, + which_to_keep=self.which_to_keep, + ) + ) + tasks.compute() + self.computed_semantic_match_dfs = True + + def extract_dedup_data(self, eps_to_extract: float) -> DocumentDataset: + """ + Extract deduplicated data based on epsilon value. + + Args: + eps_to_extract (float): Epsilon threshold for extracting deduplicated data. + + Returns: + DocumentDataset: Dataset containing deduplicated documents. + """ + if not self.computed_semantic_match_dfs: + raise ValueError( + "Run compute_semantic_match_dfs before calling extract_dedup_data" + ) + + output_summary_file = os.path.join( + self.output_dir, f"dedup_summary_{eps_to_extract}.csv" + ) + output_parquet_path = os.path.join( + self.output_dir, f"unique_ids_{eps_to_extract}.parquet" + ) + extract_dedup_data( + eps=eps_to_extract, + n_clusters=self.n_clusters, + id_col=self.id_col, + id_col_type=self.id_col_type, + sorted_clusters_dir=self.sorted_clusters_dir, + semdedup_pruning_tables_dir=self.semdedup_pruning_tables_dir, + output_summary_file=output_summary_file, + output_parquet_path=output_parquet_path, + logger=self.logger, + ) + + fps = [ + os.path.join(output_parquet_path, file_name) + for file_name in os.listdir(output_parquet_path) + ] + return DocumentDataset.read_parquet(fps, backend="cudf") + + +class SemDedup: + def __init__( + self, + config: SemDedupConfig, + logger: Union[logging.Logger, str] = "./", + ) -> None: + """ + Initialize the SemDedup class. + + Args: + config (SemDedupConfig): Configuration for SemDedup. + logger (Union[logging.Logger, str]): Logger instance or path to the log file directory. 
+ """ + self.config = config + self.logger = logger + cache_dir = config.cache_dir + self.embedding_creator = EmbeddingCreator( + embedding_model_name_or_path=config.embedding_model_name_or_path, + max_memory=config.embedding_max_mem_gb, + batch_size=config.embedding_batch_size, + input_column=config.input_column, + embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc), + logger=logger, + ) + self.clustering_model = ClusteringModel( + id_col=config.id_col_name, + max_iter=config.max_iter, + n_clusters=config.n_clusters, + clustering_output_dir=os.path.join(cache_dir, config.clustering_save_loc), + logger=logger, + ) + self.semantic_cluster_dedup = SemanticClusterLevelDedup( + n_clusters=config.n_clusters, + emb_by_clust_dir=os.path.join( + cache_dir, config.clustering_save_loc, "embs_by_nearest_center" + ), + sorted_clusters_dir=os.path.join( + cache_dir, config.clustering_save_loc, "sorted" + ), + id_col=config.id_col_name, + id_col_type=config.id_col_type, + which_to_keep=config.which_to_keep, + output_dir=os.path.join(cache_dir, config.clustering_save_loc), + logger=logger, + ) + self.eps_thresholds = config.eps_thresholds + self.eps_to_extract = config.eps_to_extract + + def __call__(self, dataset: DocumentDataset) -> DocumentDataset: + """ + Execute the SemDedup process. + + Args: + dataset (DocumentDataset): Input dataset for deduplication. + + Returns: + DocumentDataset: Deduplicated dataset. + """ + embeddings_dataset = self.embedding_creator(dataset) + self.clustering_model(embeddings_dataset) + self.semantic_cluster_dedup.compute_semantic_match_dfs(self.eps_thresholds) + return self.semantic_cluster_dedup.extract_dedup_data( + eps_to_extract=self.eps_to_extract + ) diff --git a/nemo_curator/scripts/semdedup/README.md b/nemo_curator/scripts/semdedup/README.md new file mode 100644 index 000000000..68bc16802 --- /dev/null +++ b/nemo_curator/scripts/semdedup/README.md @@ -0,0 +1,40 @@ +# SemDeDup Pipeline + +This pipeline is used to cluster and deduplicate data points based on their embeddings. +Please edit "semdedup_config.yaml" to configure the pipeline and run it using the following commands. 
+
+
+## Pipeline Steps
+
+1) Modify `config/sem_dedup_config.yaml` (or your own copy of it) to configure the pipeline.
+
+2) Compute embeddings:
+   ```sh
+   python compute_embeddings.py --input-data-dir "$INPUT_DATA_DIR" --input-file-type "jsonl" --input-file-extension "json" --config-file "$CONFIG_FILE"
+   ```
+   **Input:** `.jsonl` files under `--input-data-dir`, using the configuration from step (1)
+   **Output:** Embedding Parquet files under `{config.cache_dir}/{config.embeddings_save_loc}`
+
+3) Cluster the embeddings:
+   ```sh
+   python clustering.py --config-file "$CONFIG_FILE"
+   ```
+   **Input:** Output from step (2)
+
+   **Output:** Under the `{config.cache_dir}/{config.clustering_save_loc}` directory, including:
+
+   - `kmeans_centroids.npy`
+   - `embs_by_nearest_center` directory, containing `nearest_cent={x}` where `x` ranges from 0 to `n_clusters - 1`
+   - Parquet files within `embs_by_nearest_center/nearest_cent={x}` containing the data points in each cluster
+
+4) Extract deduplicated data:
+   ```sh
+   python extract_dedup_data.py --config-file "$CONFIG_FILE"
+   ```
+   **Input:** Output from step (3)
+   **Output:** `{config.cache_dir}/{config.clustering_save_loc}/unique_ids_{eps_to_extract}.parquet`
+
+## End to End Script
+
+The full pipeline can also be run in a single step with the example script in the `examples` directory:
+
+```sh
+python examples/semdedup_example.py --input-data-dir "$INPUT_DATA_DIR" --input-file-type "jsonl" --config-file "$CONFIG_FILE"
+```
diff --git a/nemo_curator/scripts/semdedup/__init__.py b/nemo_curator/scripts/semdedup/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/nemo_curator/scripts/semdedup/clustering.py b/nemo_curator/scripts/semdedup/clustering.py
new file mode 100644
index 000000000..82b83c54b
--- /dev/null
+++ b/nemo_curator/scripts/semdedup/clustering.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import logging +import os +from datetime import datetime + +os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" +import dask_cudf + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.log import create_logger +from nemo_curator.modules.config import SemDedupConfig +from nemo_curator.modules.semantic_dedup import ClusteringModel +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.file_utils import expand_outdir_and_mkdir +from nemo_curator.utils.script_utils import ArgumentHelper + + +def main(args): + semdedup_config = SemDedupConfig.from_yaml(args.config_file) + client = get_client(**ArgumentHelper.parse_client_args(args)) + save_folder = os.path.join( + semdedup_config.cache_dir, semdedup_config.clustering_save_loc + ) + expand_outdir_and_mkdir(save_folder) + # Initialize logger + log_file = os.path.join(save_folder, "compute_centroids.log") + + logger = create_logger( + rank=0, + log_file=log_file, + log_level=logging.INFO, + name="logger-compute-centroids", + stdout=True, + ) + + client = get_client(**ArgumentHelper.parse_client_args(args)) + dt1 = datetime.now() + print("Start time:", dt1) + + embedding_fp = os.path.join( + semdedup_config.cache_dir, semdedup_config.embeddings_save_loc + ) + clustering_output_dir = os.path.join( + semdedup_config.cache_dir, semdedup_config.clustering_save_loc + ) + # Switch to https://github.com/NVIDIA/NeMo-Curator/issues/50 + # When we fix that + embedding_df = dask_cudf.read_parquet(embedding_fp, blocksize="2GB") + embedding_dataset = DocumentDataset(embedding_df) + + clustering_model = ClusteringModel( + id_col=semdedup_config.id_col_name, + max_iter=semdedup_config.max_iter, + n_clusters=semdedup_config.n_clusters, + clustering_output_dir=clustering_output_dir, + logger=logger, + ) + clustered_embeddings = clustering_model(embedding_dataset) + clustered_embeddings.df.head(10) + dt2 = datetime.now() + elapse = dt2 - dt1 + print("End time:", dt2) + print("elapse:", elapse) + + client.cancel(client.futures, force=True) + client.close() + + +def attach_args(): + parser = ArgumentHelper.parse_semdedup_args( + description=( + "Performs clustering on the computed embeddings of a collection of documents. " + "This script requires that the embeddings have been created beforehand using: " + "semdedup_extract_embeddings" + "Input arguments include: " + "--config-file for the path to the semdedup config file. " + "Important configuration parameters include: " + " cache_dir for the directory to store cache," + " clustering_save_loc for the location to save clustering results," + " n_clusters for the number of clusters," + " seed for the seed for clustering," + " max_iter for the maximum iterations for clustering," + " kmeans_with_cos_dist for using KMeans with cosine distance," + ), + add_input_args=False, + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/semdedup/compute_embeddings.py b/nemo_curator/scripts/semdedup/compute_embeddings.py new file mode 100644 index 000000000..b96c8d38f --- /dev/null +++ b/nemo_curator/scripts/semdedup/compute_embeddings.py @@ -0,0 +1,118 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import time + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.log import create_logger +from nemo_curator.modules.config import SemDedupConfig +from nemo_curator.modules.semantic_dedup import EmbeddingCreator +from nemo_curator.utils.distributed_utils import get_client, read_data +from nemo_curator.utils.file_utils import expand_outdir_and_mkdir, get_remaining_files +from nemo_curator.utils.script_utils import ArgumentHelper + + +def main(args): + semdedup_config = SemDedupConfig.from_yaml(args.config_file) + client = get_client(**ArgumentHelper.parse_client_args(args)) + expand_outdir_and_mkdir(semdedup_config.cache_dir) + logger = create_logger( + rank=0, + name="logger-compute-embeddings", + log_file=os.path.join(semdedup_config.cache_dir, "compute_embeddings.log"), + log_level=logging.INFO, + stdout=True, + ) + + output_data_dir = os.path.join( + semdedup_config.cache_dir, semdedup_config.embeddings_save_loc + ) + # Some time jsonl files are stored as .json + # So to handle that case we can pass the input_file_extension + if args.input_file_extension is not None: + input_file_extension = args.input_file_extension + else: + input_file_extension = args.input_file_type + print("input_file_extension", input_file_extension) + st = time.time() + input_files = get_remaining_files( + input_file_path=args.input_data_dir, + output_file_path=output_data_dir, + input_file_type=input_file_extension, + num_files=semdedup_config.num_files, + ) + logger.info(f"Processing {len(input_files)} files") + if len(input_files) == 0: + logger.info("No files to process") + return + + ddf = read_data( + input_files=input_files, file_type=args.input_file_type, add_filename=False + ) + ddf = ddf.reset_index(drop=True) + dataset = DocumentDataset(ddf) + # Can repartition here if needed + # ddf = ddf.repartition(partition_size="64MB") + embedding_creator = EmbeddingCreator( + embedding_model_name_or_path=semdedup_config.embedding_model_name_or_path, + embedding_max_mem_gb=semdedup_config.embedding_max_mem_gb, + embedding_batch_size=semdedup_config.embedding_batch_size, + embedding_output_dir=os.path.join( + semdedup_config.cache_dir, semdedup_config.embeddings_save_loc + ), + input_column=semdedup_config.input_column, + logger=logger, + write_to_filename=False, + ) + embedding_dataset = embedding_creator(dataset=dataset) + print(embedding_dataset.df.head()) + logger.info(f"Time taken: {time.time() - st}") + client.cancel(client.futures, force=True) + client.close() + + +def attach_args(): + parser = ArgumentHelper.parse_semdedup_args( + description=( + "Computes the embeddings of a collection of documents using the specified model. " + "The model is specified in the config file using embedding_model_name_or_path (e.g. 'sentence-transformers/paraphrase-MiniLM-L6-v2'). " + "The embeddings are saved in the specified cache directory under the embeddings_save_loc directory. 
" + "Input arguments include: " + "--input_data_dir for the directory containing input data files, " + "--input_file_extension for specifying the file extension of input files (e.g., .jsonl), " + "--input_file_type for the type of input files (e.g., json, csv), " + "--input_text_field for the field in the input files containing the text data to be embedded. " + "Additional configuration can be provided via the --config-file argument. " + "Important configuration parameters include: " + " cache_dir for the directory to store cache" + " num_files for the number of files to process (default is -1, meaning all files)," + " input_column for specifying the input column for embeddings," + " embeddings_save_loc for the location to save embeddings," + " embedding_model_name_or_path for the model name or path for embeddings," + " embedding_batch_size for the batch size for processing embeddings," + " embedding_max_mem_gb for the maximum memory in GB for embeddings" + ), + add_input_args=True, + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/semdedup/extract_dedup_data.py b/nemo_curator/scripts/semdedup/extract_dedup_data.py new file mode 100755 index 000000000..ca5016b98 --- /dev/null +++ b/nemo_curator/scripts/semdedup/extract_dedup_data.py @@ -0,0 +1,88 @@ +import logging +import os +from datetime import datetime + +from nemo_curator.log import create_logger +from nemo_curator.modules.config import SemDedupConfig +from nemo_curator.modules.semantic_dedup import SemanticClusterLevelDedup +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import ArgumentHelper + + +def main(args): + semdedup_config = SemDedupConfig.from_yaml(args.config_file) + client = get_client(**ArgumentHelper.parse_client_args(args)) + + root = semdedup_config.cache_dir + save_loc = semdedup_config.clustering_save_loc + client = get_client(**ArgumentHelper.parse_client_args(args)) + + logger = create_logger( + rank=0, + log_file=os.path.join(root, save_loc, "extract_dedup_data.log"), + name="logger-extract-dedup-data", + log_level=logging.INFO, + stdout=True, + ) + + dt1 = datetime.now() + logger.info(f"Start: {dt1}") + cache_dir = semdedup_config.cache_dir + semantic_dedup = SemanticClusterLevelDedup( + n_clusters=semdedup_config.n_clusters, + emb_by_clust_dir=os.path.join( + cache_dir, semdedup_config.clustering_save_loc, "embs_by_nearest_center" + ), + sorted_clusters_dir=os.path.join( + cache_dir, semdedup_config.clustering_save_loc, "sorted" + ), + id_col=semdedup_config.id_col_name, + id_col_type=semdedup_config.id_col_type, + which_to_keep=semdedup_config.which_to_keep, + output_dir=os.path.join( + semdedup_config.cache_dir, semdedup_config.clustering_save_loc + ), + logger=logger, + ) + + semantic_dedup.compute_semantic_match_dfs() + for eps in semdedup_config.eps_thresholds: + dedup_id_dataset = semantic_dedup.extract_dedup_data(eps_to_extract=eps) + print(dedup_id_dataset.df.head(10)) + + dt2 = datetime.now() + logger.info(f"End: {dt2}") + elapse = (dt2 - dt1).total_seconds() / 60 + logger.info(f"elapse: {elapse}") + + client.cancel(client.futures, force=True) + client.close() + + +def attach_args(): + parser = ArgumentHelper.parse_semdedup_args( + description=( + "Extracts deduplicated data from the clustered embeddings of a collection of documents. 
" + "This script requires that embeddings and clustering have been performed beforehand using the specified configurations. " + "earlier using semdedup_extract_embeddings and semdedup_cluster_embeddings." + "Input arguments include: " + "--config-file for the path to the semdedup config file. " + "Important configuration parameters include:" + "- cache_dir for the directory to store cache" + "which_to_keep for specifying which duplicates to keep," + "largest_cluster_size_to_process for the largest cluster size to process," + "sim_metric for the similarity metric for deduplication," + "eps_thresholds for epsilon thresholds to calculate if semantically similar or not" + "and eps_to_extract for the epsilon value to extract deduplicated data." + ), + add_input_args=False, + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/utils/distributed_utils.py b/nemo_curator/utils/distributed_utils.py index c7769c4d9..629cc387e 100644 --- a/nemo_curator/utils/distributed_utils.py +++ b/nemo_curator/utils/distributed_utils.py @@ -17,12 +17,14 @@ import os os.environ["RAPIDS_NO_INITIALIZE"] = "1" +import random import warnings from contextlib import nullcontext from pathlib import Path from typing import Union import dask.dataframe as dd +import numpy as np import pandas as pd from dask.distributed import Client, LocalCluster, get_worker, performance_report @@ -216,8 +218,8 @@ def read_single_partition( " file formats.." ) - if filetype == "jsonl": - read_kwargs = {"lines": True} + if filetype in ["jsonl", "json"]: + read_kwargs = {"lines": filetype == "jsonl"} if backend == "cudf": read_f = cudf.read_json else: @@ -315,7 +317,7 @@ def read_data( if backend == "cudf": df = df.to_backend("cudf") - elif file_type in ["jsonl", "parquet"]: + elif file_type in ["json", "jsonl", "parquet"]: print(f"Reading {len(input_files)} files", flush=True) input_files = sorted(input_files) if files_per_partition > 1: @@ -583,3 +585,29 @@ def performance_report_if(path=None, report_name="dask-profile.html"): return performance_report(os.path.join(path, report_name)) else: return nullcontext() + + +def seed_all(seed: int = 42): + """ + Function to set seed for random number generators for reproducibility. + + Args: + seed: The seed value to use for random number generators. Default is 42. 
+ + Returns: + None + """ + ## Imporing torch to help with context issues + import torch + + # Set seed values for various random number generators + random.seed(seed) + os.environ["PYTHONHASHSEED"] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + # Ensure deterministic behavior for CUDA algorithms + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False diff --git a/nemo_curator/utils/file_utils.py b/nemo_curator/utils/file_utils.py index 3ec466b4c..de5c78af5 100644 --- a/nemo_curator/utils/file_utils.py +++ b/nemo_curator/utils/file_utils.py @@ -63,7 +63,9 @@ def get_all_files_paths_under(root, recurse_subdirectories=True, followlinks=Fal # can lead to problems when there is an error while # writing a file we can use the offset counter approach # in jaccard shuffle as a more robust way to restart jobs -def get_remaining_files(input_file_path, output_file_path, input_file_type): +def get_remaining_files( + input_file_path, output_file_path, input_file_type, num_files=-1 +): """ This function returns a list of the files that still remain to be read. @@ -71,12 +73,16 @@ def get_remaining_files(input_file_path, output_file_path, input_file_type): input_file_path: The path of the input files. output_file_path: The path of the output files. input_file_type: The type of the input files. + num_files: The max number of files to be returned. If -1, all files are returned. Returns: A list of files that still remain to be read. """ if input_file_type == "pickle": return [input_file_path] + + if not os.path.exists(output_file_path): + expand_outdir_and_mkdir(output_file_path) completed_files = [ os.path.basename(entry.path) for entry in os.scandir(output_file_path) ] @@ -86,7 +92,16 @@ def get_remaining_files(input_file_path, output_file_path, input_file_type): for entry in os.scandir(input_file_path) if os.path.basename(entry.path) not in completed_files ] + # Gaurd against non extension files if present in the input directory + input_files = [f for f in input_files if f.endswith(input_file_type)] input_files.sort() + + len_written_files = len(completed_files) + if num_files > 0: + left_to_sample = max(num_files - len_written_files, 0) + else: + left_to_sample = len(input_files) + input_files = input_files[:left_to_sample] return input_files diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py index 3304043e6..107ccd5e2 100644 --- a/nemo_curator/utils/script_utils.py +++ b/nemo_curator/utils/script_utils.py @@ -89,6 +89,7 @@ def add_arg_log_dir(self, default: str): def add_arg_input_data_dir( self, + required=False, help: str = "Input directory consisting of .jsonl files that are accessible " "to all nodes. Use this for a distributed file system", ): @@ -96,12 +97,14 @@ def add_arg_input_data_dir( "--input-data-dir", type=str, default=None, + required=required, help=help, ) def add_arg_input_file_type( self, choices=None, + required=False, help="File type of the dataset to be read in. Supported file formats " "include 'jsonl' (default), 'pickle', or 'parquet'.", ): @@ -109,10 +112,22 @@ def add_arg_input_file_type( "--input-file-type", type=str, default="jsonl", + required=required, choices=choices, help=help, ) + def add_arg_input_file_extension( + self, + help: str = "The file extension of the input files. 
If not provided, the input file type will be used.", + ): + self.parser.add_argument( + "--input-file-extension", + type=str, + default=None, + help=help, + ) + def add_arg_input_local_data_dir(self): self.parser.add_argument( "--input-local-data-dir", @@ -496,3 +511,38 @@ def parse_gpu_dedup_args(description: str) -> argparse.ArgumentParser: ) return argumentHelper.parser + + @staticmethod + def parse_semdedup_args( + add_input_args=False, + description="Default argument parser for semantic deduplication", + ) -> argparse.ArgumentParser: + """ + Adds default set of arguments that are common to multiple stages of the semantic deduplication pipeline + of the pipeline + """ + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=description, + ) + argumentHelper = ArgumentHelper(parser) + argumentHelper.add_distributed_args() + if add_input_args: + argumentHelper.add_arg_input_data_dir(required=True) + argumentHelper.add_arg_input_file_extension() + argumentHelper.add_arg_input_file_type() + argumentHelper.add_arg_input_text_field() + + argumentHelper.parser.add_argument( + "--config-file", + type=str, + help="Path to the semdedup config file", + required=True, + ) + # Set low default RMM pool size for classifier + # to allow pytorch to grow its memory usage + # by default + parser.set_defaults(rmm_pool_size="512MB") + parser.set_defaults(device="gpu") + parser.set_defaults(set_torch_to_use_rmm=False) + return parser diff --git a/nemo_curator/utils/semdedup_utils.py b/nemo_curator/utils/semdedup_utils.py new file mode 100644 index 000000000..be7b6e5aa --- /dev/null +++ b/nemo_curator/utils/semdedup_utils.py @@ -0,0 +1,445 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +import os +import random +import shutil +import time +from typing import List, Tuple + +import cudf +import dask.bag as db +import dask.dataframe as dd +import numpy as np +import pandas as pd +import torch +from dask.distributed import progress + +from nemo_curator.utils.file_utils import expand_outdir_and_mkdir + + +def _assign_and_sort_clusters( + id_col: str, + kmeans_centroids_file: str, + nearest_cent_dir: str, + output_sorted_clusters_dir: str, + cluster_ids=List[int], + sim_metric: str = "cosine", + keep_hard: bool = True, + kmeans_with_cos_dist: bool = True, + logger: logging.Logger = None, +): + """ + Args: + id_col (str): The column name representing the unique identifier for each data point. + centroids_path (str): The location of the K-means centroids file. + nearest_cent_dir (str): The location of the nearest center files. + output_sorted_clusters_dir (str): The location to save the sorted clusters. + sim_metric (str): The similarity metric to use for clustering. Defaults to "cosine". + keep_hard (bool): When True, sorts cluster items in descending order by similarity to the cluster centroid. Defaults to True. 
+ kmeans_with_cos_dist (bool): Whether to use cosine distance for K-means clustering. Defaults to True. + sorted_clusters_file_loc (str): The location to save the sorted clusters file. Defaults to an empty string. + cluster_ids (list): The range of cluster IDs to sort. + logger (logging.Logger): A logger object to log messages. Defaults to None. + + Returns: + None + """ + # Step 3: Sort each class/cluster + logger.info("Ranking...") + if os.path.exists(output_sorted_clusters_dir): + logger.info( + f"Removing existing sorted cluster directory: {output_sorted_clusters_dir}" + ) + shutil.rmtree(output_sorted_clusters_dir) + + expand_outdir_and_mkdir(output_sorted_clusters_dir) + + kmeans_centroids = np.load(kmeans_centroids_file) + start_time = time.time() + + cluster_ids_bag = db.from_sequence(cluster_ids, npartitions=len(cluster_ids)) + completed_count = cluster_ids_bag.map( + lambda cluster_c: rank_within_cluster( + id_col=id_col, + nearest_cent_dir=nearest_cent_dir, + output_sorted_clusters_dir=output_sorted_clusters_dir, + centroids=kmeans_centroids, + sim_metric=sim_metric, + keep_hard=keep_hard, + kmeans_with_cos_dist=kmeans_with_cos_dist, + cluster_ids=[cluster_c], + ) + ).compute() + + missing = len(cluster_ids) - sum(completed_count) + logger.info( + f"Completed {sum(completed_count)} clusters. Missing {missing} clusters." + ) + logger.info(f"Time for ranking: {(time.time() - start_time) / 60:.2f} mins") + logger.info("DONE!") + + +def rank_within_cluster( + id_col: str, + nearest_cent_dir: str, + output_sorted_clusters_dir: str, + centroids: np.ndarray, + sim_metric: str = "cosine", + keep_hard: bool = True, + kmeans_with_cos_dist: bool = False, + cluster_ids: List[int] = range(50000), +): + """ + Sorts each cluster's items by their distance to the cluster centroid. + + Args: + id_col (str): The column name representing the unique identifier for each data point. + nearest_cent_dir (str): The location of the nearest center files. + output_sorted_clusters_dir (str): The location to save the sorted clusters. + centroids (np.ndarray): The centroids for each cluster. + sim_metric (str): The similarity metric used to compute distances. Should be one of ["cosine"]. Defaults to "cosine". + keep_hard (bool): When True, sorts cluster items in descending order by similarity to the cluster centroid. Defaults to True. + kmeans_with_cos_dist (bool): Whether to use cosine distance for K-means clustering. Defaults to False. + cluster_ids (List[int]): The list of cluster IDs to process. Defaults to range(50000). 
+ + Returns: + None + """ + assert sim_metric in [ + "cosine", + ], "sim_metric should be in ['cosine']" + + missing_files = 0 + for cluster_c in cluster_ids: + cluster_c_path = os.path.join(nearest_cent_dir, f"nearest_cent={cluster_c}") + if not os.path.exists(cluster_c_path): + missing_files += 1 + continue + + cluster_df = cudf.read_parquet( + cluster_c_path, columns=[id_col, "dist_to_cent", "embeddings"] + ) + embeds = torch.as_tensor( + cluster_df["embeddings"].list.leaves.values.reshape( + cluster_df.shape[0], -1 + ), + device="cuda", + ) + cluster_df = cluster_df.to_pandas() + + assert kmeans_with_cos_dist is False + + if sim_metric == "cosine": + cluster_c_centroid = torch.as_tensor(centroids[cluster_c], device="cuda") + sim_to_cent = torch.nn.CosineSimilarity(dim=1)(embeds, cluster_c_centroid) + sim_to_cent = sim_to_cent.cpu().numpy() + cluster_dists_to_cent = (1 - sim_to_cent).tolist() + elif sim_metric == "l2": + # Used when kmeans_with_cos_dist is True + cluster_dists_to_cent = list(cluster_df["dist_to_cent"]) + + cluster_label = np.full((len(cluster_df)), cluster_c).tolist() + example_id = list(cluster_df[id_col]) + sort_descending = keep_hard + cluster_sorted = sorted( + zip(example_id, cluster_dists_to_cent, cluster_label), + key=lambda x: x[2], + reverse=sort_descending, + ) # -- sort_descending = True for descending sort + + sorted_cluster_file_path = os.path.join( + output_sorted_clusters_dir, f"cluster_{cluster_c}.npy" + ) + np.save(sorted_cluster_file_path, cluster_sorted) + + return len(cluster_ids) - missing_files + + +def _semdedup( + cluster_reps: torch.Tensor, device: str +) -> Tuple[torch.Tensor, List[int]]: + # compute pairwise cos sim between cluster items, + # then replace to diagonal with zeros to ignore self similarity + cluster_reps.to(device) + pair_w_sim_matrix = cluster_reps @ (cluster_reps.T) + del cluster_reps + pair_w_sim_matrix.fill_diagonal_(0.0) + assert pair_w_sim_matrix.shape[0] == pair_w_sim_matrix.shape[1] + + triu_sim_mat = torch.triu(pair_w_sim_matrix, diagonal=1) + + M = torch.max(triu_sim_mat, dim=0)[0].cpu() + M1 = torch.max(triu_sim_mat, dim=0)[1].cpu().numpy().tolist() + return M, M1 + + +def get_cluster_reps( + cluster_id: int, emb_by_clust_dir: str, id_col: str, sorted_ids: np.ndarray +) -> torch.Tensor: + cluster_i_path = os.path.join(emb_by_clust_dir, f"nearest_cent={cluster_id}") + cluster_reps = cudf.read_parquet( + cluster_i_path, columns=["embeddings", id_col] + ).sort_values(by=id_col) + num = cluster_reps.shape[0] + + df_ = pd.DataFrame( + {"sorted_ids": sorted_ids, "inverse_sort": list(range(num))} + ).sort_values(by="sorted_ids") + cluster_reps["inverse_sort_id"] = df_["inverse_sort"].values + cluster_reps = cluster_reps.sort_values(by="inverse_sort_id") + + cluster_reps = torch.as_tensor( + cluster_reps["embeddings"].list.leaves.values.reshape(len(cluster_reps), -1), + device="cuda", + ) + return cluster_reps + + +def get_semantic_matches_per_cluster( + cluster_id: int, + emb_by_clust_dir: str, + sorted_clusters_dir: str, + id_col: str, + id_col_type: str, + eps_list: List[float], + output_dir: str, + which_to_keep: str, +) -> None: + + output_df_file_path = os.path.join(output_dir, f"cluster_{cluster_id}.parquet") + + sorted_file = os.path.join(sorted_clusters_dir, f"cluster_{cluster_id}.npy") + if not os.path.exists(sorted_file): + logging.info(f"{sorted_file} does not exist. 
Continue") + return + + cluster_i = np.load(sorted_file) + cluster_size = cluster_i.shape[0] + logging.info(f"{cluster_id}: cluster_size: {cluster_size}") + + if cluster_size == 1: + points_to_remove_df = pd.DataFrame() + points_to_remove_df["indices"] = [0] + for eps in eps_list: + points_to_remove_df[f"eps={eps}"] = [False] + points_to_remove_df.to_parquet(output_df_file_path) + return + + clutser_items_indices = list(range(cluster_size)) + + which_to_keep = which_to_keep.lower() + if which_to_keep == "random": + random.shuffle(clutser_items_indices) + cluster_i = cluster_i[clutser_items_indices] + elif which_to_keep == "easy": + clutser_items_indices = clutser_items_indices[::-1] + cluster_i = cluster_i[clutser_items_indices] + + text_ids = cluster_i[:, 0].astype(id_col_type) + + cluster_reps = get_cluster_reps(cluster_id, emb_by_clust_dir, id_col, text_ids) + M, M1 = _semdedup(cluster_reps, "cuda") + assert cluster_reps.shape[0] == len(text_ids) + + M1_id = [text_ids[m] for m in M1] + + points_to_remove_df = cudf.DataFrame() + points_to_remove_df["indices"] = clutser_items_indices + points_to_remove_df["id"] = text_ids + points_to_remove_df["max_id"] = M1_id + points_to_remove_df["cosine_sim_score"] = M.numpy().tolist() + + for eps in eps_list: + eps_points_to_remove = M > 1 - eps + points_to_remove_df[f"eps={eps}"] = eps_points_to_remove + + points_to_remove_df.to_parquet(output_df_file_path) + + +def get_num_records(file_path): + if not os.path.exists(file_path): + return 0 + with open(file_path, "rb") as f: + # Read the header of the npy file + version = np.lib.format.read_magic(f) + shape, _, _ = np.lib.format._read_array_header(f, version) + return shape[0] + + +def _get_empty_results_df(id_col, id_col_type): + meta_df = pd.DataFrame( + { + id_col: np.empty(0, dtype="int64"), + "dist": np.empty(0, dtype="float32"), + "cluster": np.empty(0, dtype="int32"), + } + ) + meta_df[id_col] = meta_df[id_col].astype(id_col_type) + return meta_df + + +def prune_single_cluster( + cluster_id: int, + id_col: str, + id_col_type: str, + sorted_clusters_dir: str, + semdedup_pruning_tables_dir: str, + eps: float, +) -> cudf.DataFrame: + """ + Processes data for a single cluster, applying pruning based on specified epsilon. + + Args: + cluster_id (int): The specific cluster ID to process. + id_col (str): The name of the ID column. + id_col_type (str): The data type of the ID column. + sorted_clusters_dir (str): Path to the sorted clusters directory. + semdedup_pruning_tables_dir (str): Path to the pruning tables directory. + eps (float): Epsilon value for pruning. 
+ + Returns: + cudf.DataFrame: A DataFrame of the pruned cluster data + """ + sorted_fname = os.path.join(sorted_clusters_dir, f"cluster_{cluster_id}.npy") + if not os.path.exists(sorted_fname): + return _get_empty_results_df(id_col, id_col_type) + + cluster_data = np.load(sorted_fname) + df_cluster = cudf.DataFrame( + { + id_col: cluster_data[:, 0], + "dist": cluster_data[:, 1], + "cluster": cluster_data[:, 2], + } + ) + + df_cluster[id_col] = df_cluster[id_col].astype(id_col_type) + df_cluster["dist"] = df_cluster["dist"].astype("float32") + df_cluster["cluster"] = df_cluster["cluster"].astype("int32") + + cluster_df_fname = os.path.join( + semdedup_pruning_tables_dir, f"cluster_{cluster_id}.parquet" + ) + pruning_table = cudf.read_parquet(cluster_df_fname) + if pruning_table.shape[0] == 1: + return df_cluster + + # TODO: Fix this without going to host + items_to_keep = ( + pruning_table[pruning_table[f"eps={eps}"] == False]["id"].to_arrow().to_pylist() + ) + pruned_cluster = df_cluster[df_cluster[id_col].isin(items_to_keep)] + pruned_cluster[id_col] = pruned_cluster[id_col].astype(id_col_type) + return pruned_cluster + + +def extract_pruned_data( + id_col: str, + id_col_type: str, + sorted_clusters_dir: str, + semdedup_pruning_tables_dir: str, + eps: float, + n_clusters: int, + output_parquet_path: str, +) -> Tuple[int, int, int]: + """ + Extracts pruned data from sorted clusters and saves it to a CSV file. + + Args: + id_col (str): The name of the ID column. + id_col_type (str): The data type of the ID column. + sorted_clusters_dir (str): Path to the sorted clusters directory. + semdedup_pruning_tables_dir (str): Path to the pruning tables directory. + eps (float): Epsilon value for pruning. + n_clusters (int): Number of clusters. + output_csv_path (str): Path to save the output CSV file. + + Returns: + Tuple[int, int, int]: Number of kept records, removed records, and total records. + """ + + results_df = dd.from_map( + prune_single_cluster, + range(n_clusters), + id_col=id_col, + id_col_type=id_col_type, + sorted_clusters_dir=sorted_clusters_dir, + semdedup_pruning_tables_dir=semdedup_pruning_tables_dir, + eps=eps, + ) + results_df[id_col] = results_df[id_col].astype(id_col_type) + results_df = results_df.persist() + progress(results_df) + + results_df.to_parquet(output_parquet_path) + total_kept = len(results_df) + + np_files = [ + os.path.join(sorted_clusters_dir, f"cluster_{i}.npy") for i in range(n_clusters) + ] + total_records = sum(get_num_records(file_path) for file_path in np_files) + # Aggregate results + total_removed = total_records - total_kept + return total_kept, total_removed, total_records + + +def extract_dedup_data( + eps, + n_clusters, + id_col, + id_col_type, + sorted_clusters_dir, + semdedup_pruning_tables_dir, + output_summary_file, + output_parquet_path, + logger: logging.Logger, +) -> None: + """ + Extracts deduplicated data based on provided parameters and logs the process. + + Args: + + """ + + kept, removed, total = extract_pruned_data( + id_col=id_col, + id_col_type=id_col_type, + sorted_clusters_dir=sorted_clusters_dir, + semdedup_pruning_tables_dir=semdedup_pruning_tables_dir, + eps=eps, + n_clusters=n_clusters, + output_parquet_path=output_parquet_path, + ) + + logger.info( + f"DONE saving {kept} out of {total}. Removed: {removed}. 
Epsilon: {eps:.4f}" + ) + result_dict = { + "eps": [eps], + "kept": [kept], + "removed": [removed], + "total": [total], + } + df = pd.DataFrame(result_dict) + df.to_csv(output_summary_file, index=False) + + fps = [ + os.path.join(output_parquet_path, file_name) + for file_name in os.listdir(output_parquet_path) + ] + ids_to_keep_df = dd.from_map(cudf.read_parquet, fps) + return ids_to_keep_df diff --git a/setup.py b/setup.py index 933f4c2d8..a68840612 100644 --- a/setup.py +++ b/setup.py @@ -61,7 +61,8 @@ "presidio-anonymizer==2.2.351", "usaddress==0.5.10", "nemo_toolkit[nlp]>=1.23.0", - "crossfit @ git+https://github.com/rapidsai/crossfit.git@1ee3de4", + "Cython", + "crossfit @ git+https://github.com/rapidsai/crossfit.git@0.0.2", # justext installation breaks without lxml[html_clean] # due to this: https://github.com/miso-belica/jusText/issues/47 "lxml[html_clean]", @@ -107,6 +108,9 @@ "quality_classifier_inference=nemo_curator.scripts.quality_classifier_inference:console_script", "verify_classification_results=nemo_curator.scripts.verify_classification_results:console_script", "blend_datasets=nemo_curator.scripts.blend_datasets:console_script", + "semdedup_extract_embeddings=nemo_curator.scripts.semdedup.compute_embeddings:console_script", + "semdedup_clustering=nemo_curator.scripts.semdedup.clustering:console_script", + "semdedup_extract_dedup_ids=nemo_curator.scripts.semdedup.extract_dedup_data:console_script", ], }, ) From 896f746f7fd5f44753f79d0520ae2502775b1dea Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 8 Jul 2024 14:58:14 -0700 Subject: [PATCH 08/19] Remove lxml installation (#140) Signed-off-by: Ryan Wolf --- setup.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/setup.py b/setup.py index a68840612..205087046 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ "awscli>=1.22.55", "fasttext==0.9.2", "pycld2==0.41", - "justext==3.0.0", + "justext==3.0.1", "resiliparse", "ftfy==6.1.1", "warcio==1.7.4", @@ -63,9 +63,6 @@ "nemo_toolkit[nlp]>=1.23.0", "Cython", "crossfit @ git+https://github.com/rapidsai/crossfit.git@0.0.2", - # justext installation breaks without lxml[html_clean] - # due to this: https://github.com/miso-belica/jusText/issues/47 - "lxml[html_clean]", # Numpy 2.0 breaks with spacy https://github.com/explosion/spaCy/issues/13528 # TODO: Remove when issue is fixed "numpy<2", From c87233da93fadf1488494bade1831d44ed9d31f3 Mon Sep 17 00:00:00 2001 From: Chris Alexiuk <161380339+chrisalexiuk-nvidia@users.noreply.github.com> Date: Mon, 8 Jul 2024 18:20:28 -0400 Subject: [PATCH 09/19] Nemotron-340B SDG Tutorial (#144) Signed-off-by: Chris Alexiuk --- .../nemotron_340B_synthetic_datagen/README.md | 9 + ...ence_data_generation_nemotron_4_340B.ipynb | 868 ++++++++++++++++++ 2 files changed, 877 insertions(+) create mode 100644 tutorials/nemotron_340B_synthetic_datagen/README.md create mode 100644 tutorials/nemotron_340B_synthetic_datagen/synthetic_preference_data_generation_nemotron_4_340B.ipynb diff --git a/tutorials/nemotron_340B_synthetic_datagen/README.md b/tutorials/nemotron_340B_synthetic_datagen/README.md new file mode 100644 index 000000000..308084fb1 --- /dev/null +++ b/tutorials/nemotron_340B_synthetic_datagen/README.md @@ -0,0 +1,9 @@ +# Synthetic Preference Data Generation Using Nemotron-4 340B + +The provided notebook will demonstrate how to leverage [Nemotron-4 340B Instruct](https://build.nvidia.com/nvidia/nemotron-4-340b-instruct), and [Nemotron-4 340B Reward](https://build.nvidia.com/nvidia/nemotron-4-340b-reward) through 
[build.nvidia.com](https://build.nvidia.com/explore/discover).
+
+The build will be a demonstration of the following pipeline, as discussed in the [release blog](https://blogs.nvidia.com/blog/nemotron-4-synthetic-data-generation-llm-training/), and [technical blog](https://developer.nvidia.com/blog/leverage-our-latest-open-models-for-synthetic-data-generation-with-nvidia-nemotron-4-340b/). The pipeline is designed to create a preference dataset suitable for training a custom reward model using the [SteerLM method](https://docs.nvidia.com/nemo-framework/user-guide/latest/modelalignment/steerlm.html); however, consecutive responses (e.g. sample 1 with 2, 3 with 4, etc.) share the same prompt, so the dataset can also be used as preference pairs for training an RLHF Reward Model or for DPO - using the helpfulness score.
+
+![image](https://developer-blogs.nvidia.com/wp-content/uploads/2024/06/SDG-Pipeline-1-625x352.png)
+
+> NOTE: There are no specific dependencies outside of those outlined in the notebook for this tutorial!
diff --git a/tutorials/nemotron_340B_synthetic_datagen/synthetic_preference_data_generation_nemotron_4_340B.ipynb b/tutorials/nemotron_340B_synthetic_datagen/synthetic_preference_data_generation_nemotron_4_340B.ipynb
new file mode 100644
index 000000000..a39dba00a
--- /dev/null
+++ b/tutorials/nemotron_340B_synthetic_datagen/synthetic_preference_data_generation_nemotron_4_340B.ipynb
@@ -0,0 +1,868 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Synthetic Preference Data Generation Using Nemotron-4 340B\n",
+    "\n",
+    "The following notebook will demonstrate how to leverage [Nemotron-4 340B Instruct](https://build.nvidia.com/nvidia/nemotron-4-340b-instruct), and [Nemotron-4 340B Reward](https://build.nvidia.com/nvidia/nemotron-4-340b-reward) through [build.nvidia.com](https://build.nvidia.com/explore/discover).\n",
+    "\n",
+    "The build will be a demonstration of the following pipeline, as discussed in the [release blog](https://blogs.nvidia.com/blog/nemotron-4-synthetic-data-generation-llm-training/), and [technical blog](https://developer.nvidia.com/blog/leverage-our-latest-open-models-for-synthetic-data-generation-with-nvidia-nemotron-4-340b/). The pipeline is designed to create a preference dataset suitable for training a custom reward model using the [SteerLM method](https://docs.nvidia.com/nemo-framework/user-guide/latest/modelalignment/steerlm.html); however, consecutive responses (e.g. sample 1 with 2, 3 with 4, etc.) share the same prompt, so the dataset can also be used as preference pairs for training an RLHF Reward Model or for DPO - using the helpfulness score.\n",
+    "\n",
+    "![image](https://developer-blogs.nvidia.com/wp-content/uploads/2024/06/SDG-Pipeline-1-625x352.png)\n",
+    "\n",
+    "The flow will be split into 2 general parts: \n",
+    "\n",
+    "1. **Synthetic Response Generation**: A domain-specific input query will be provided by the developer - at which point Nemotron-4 340B Instruct will be leveraged to generate ~150 questions. Then, Nemotron-4 340B Instruct will be used to generate 2 responses for each question. \n",
+    "2. **Reward Model as a Judge**: Nemotron-4 340B Reward will be used to score the 2 responses per question to be used for further alignment training via [NeMo Aligner](https://github.com/NVIDIA/NeMo-Aligner)."
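+    ,
+    "\n",
+    "\n",
+    "For orientation, here is a rough sketch of what one line of the final `.jsonl` file produced at the end of this notebook will contain: the response text, the five reward attributes, and the originating question. The values below are made up for illustration only:\n",
+    "\n",
+    "```python\n",
+    "# Illustrative record only; real values come from the reward model at the end of the notebook.\n",
+    "example_record = {\n",
+    "    \"response\": \"Supervised learning uses labeled data to ...\",\n",
+    "    \"helpfulness\": 4.1,\n",
+    "    \"correctness\": 4.0,\n",
+    "    \"coherence\": 4.2,\n",
+    "    \"complexity\": 0.6,\n",
+    "    \"verbosity\": 1.1,\n",
+    "    \"question\": \"What is supervised learning?\",\n",
+    "}\n",
+    "```"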
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## build.nvidia.com API Key Set-up!\n", + "\n", + "In order to access the endpoints through [build.nvidia.com](https://build.nvidia.com/explore/discover), an API key is required. \n", + "\n", + "A trial API key is made available with 1,000 tokens (or 5,000 tokens for corporate emails) - the example below will leverage ~4,500 tokens of data, but can be extended beyond that limit using local instances of the models.\n", + "\n", + "There are two steps to get a trial API key:\n", + "\n", + "1. Login (or sign up) through [build.nvidia.com](https://build.nvidia.com/)\n", + "2. Click the `Get API Key` button available on the the `nvidia/nemotron-4-340b-instruct` page, found [here](https://build.nvidia.com/nvidia/nemotron-4-340b-instruct).\n", + "\n", + "![image](https://i.imgur.com/dM7AwKZ.png)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 1: Generate Subtopics, questions, and responses with Nemotron-4 340B Instruct\n", + "\n", + "The first part of the notebook will cover the creation of raw synthetic data from Nemotron-4 340B Instruct, due to the model's [permissive license](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf), the usage of the outputs of Nemotron-4 340B Instruct are permitted to be used for training, customization, etc." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prompt Templates for Synthetic Data Generation\n", + "\n", + "To generate questions and responses, there are a few prompt templates required:\n", + "\n", + "1. A prompt template to generate subtopics from a user provided topic\n", + "2. A prompt template to generate questions for a given subtopic\n", + "2. A prompt template to generate responses for a given question" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [], + "source": [ + "TOPIC_GENERATION_PROMPT_TEMPLATE = \"\"\"\\\n", + "Given a topic, generate a list of {n_subtopics} subtopics that are related to the topic.\n", + "\n", + "The topic is: {topic}\n", + "\n", + "The list must be without numbers, and without any description of the subtopics. The subtopics should be separated by a comma. There must be no other text than the list.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [], + "source": [ + "QUESTION_PROMPT_TEMPLATE = \"\"\"\\\n", + "Given a topic, generate {n_questions} questions that could be asked about that topic. Your response should be in a list format.\n", + "\n", + "The topic is: {sub_topic}\n", + "\n", + "The list must be without numbers. The questions should be separated by a newline character. There must be no other text than the list.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [], + "source": [ + "RESPONSE_PROMPT_TEMPLATE = \"\"\"\\\n", + "Given a question, generate 2 responses that could be given to that question. Your response should be in a list format.\n", + "\n", + "The question is: {question}\n", + "\n", + "The list must be in the format:\n", + "\n", + "RESPONSE A: Response A text here\n", + "RESPONSE B: Response B text here\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Defined below are the parameters that will be used throughout the notebook to generate numbers of datapoints. \n", + "\n", + "1. 
`n_subtopics`, for the given topic `10` sub-topics will be generated by Nemotron-4 340B Instruct\n", + "2. `n_questions`, for the given sub-topic, `10` questions will be generated by Nemotron-4 340B Instruct\n", + "\n", + "> NOTE: Using the default parameters above - there will be 10 sub-topics, each with 10 questions, each with 2 (hardcoded) responses. That is a total of an estimated ~200 rows of data. " + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [], + "source": [ + "n_subtopics = 10\n", + "n_questions = 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting OpenAI Client for Synthetic Data Generation\n", + "\n", + "Due to [build.nvidia.com](https://build.nvidia.com/)'s integration with the OpenAI API template - the OpenAI Python library can be used to interact with Nemotron-4 340B Instruct and Nemotron-4 340B Reward.\n", + "\n", + "To begin, install the [OpenAI Python library](https://github.com/openai/openai-python)." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -qU openai" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Provide the NVIDIA API key obtained above in order to ensure access to both models." + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import getpass\n", + "\n", + "os.environ[\"NVIDIA_API_KEY\"] = getpass.getpass(\"Please enter your OpenAI API key: \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the OpenAI Async client will enable quick and efficient data generation.\n", + "\n", + "It's as easy as pointing the `base_url` parameter to `https://integrate.api.nvidia.com/v1` - and providing the API key." + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [], + "source": [ + "from openai import AsyncOpenAI\n", + "\n", + "client = AsyncOpenAI(\n", + " base_url = \"https://integrate.api.nvidia.com/v1\",\n", + " api_key = os.environ[\"NVIDIA_API_KEY\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generating Subtopics\n", + "\n", + "To start things off, subtopics will be generated for the provided topic. \n", + "\n", + "> NOTE: The parameters of `temperature`, `top_p`, and `max_tokens` can be customized to individual preference." + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [], + "source": [ + "async def generate_subtopics(client, topic, n_subtopics):\n", + " prompt = TOPIC_GENERATION_PROMPT_TEMPLATE.format(topic=topic, n_subtopics=n_subtopics)\n", + " response = await client.chat.completions.create(\n", + " model=\"nvidia/nemotron-4-340b-instruct\",\n", + " messages=[\n", + " {\"role\" : \"user\",\n", + " \"content\" : prompt}\n", + " ],\n", + " temperature=0.2,\n", + " top_p=0.7,\n", + " max_tokens=1024,\n", + " )\n", + " return response" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The main topic can be defined below - for the example in the notebook, \"Machine Learning\" will be used." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": {}, + "outputs": [], + "source": [ + "topic = \"Machine Learning\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell below will call the Nemotron-4 340B Instruct endpoint - and return a list of subtopics separated by commas." + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "metadata": {}, + "outputs": [], + "source": [ + "responses = await generate_subtopics(client, topic=topic, n_subtopics=n_subtopics)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output conforms to the expected format below.\n", + "\n", + "> NOTE: It is possible that additional data cleaning, or formatting may be necessary depending on the prompt templates used. Be sure to confirm the format of the generated data at each step." + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Supervised learning, unsupervised learning, semi-supervised learning, reinforcement learning, deep learning, neural networks, natural language processing, computer vision, recommendation systems, anomaly detection.\n" + ] + } + ], + "source": [ + "print(responses.choices[0].message.content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Due to the data being generated in a comma separated list, Python's `.split(\",\")` will convert the string into a usable list for the following steps." + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": {}, + "outputs": [], + "source": [ + "subtopic_list = responses.choices[0].message.content.split(\",\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generating Questions from Subtopic List\n", + "\n", + "With a list of subtopics, the next step will be to generate `n_questions`, for each subtopic.\n", + "\n", + "First, there needs to be a function to generate \"batches\" of questions.\n", + "\n", + "> NOTE: It would suitable to generate a single question per topic at a time, but more care would be needed to confirm there were no duplicate questions in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [], + "source": [ + "async def generate_questions(client, sub_topic, n_questions):\n", + " prompt = QUESTION_PROMPT_TEMPLATE.format(sub_topic=sub_topic, n_questions=n_questions)\n", + " response = await client.chat.completions.create(\n", + " model=\"nvidia/nemotron-4-340b-instruct\",\n", + " messages=[\n", + " {\"role\" : \"user\",\n", + " \"content\" : prompt}\n", + " ],\n", + " temperature=0.2,\n", + " top_p=0.7,\n", + " max_tokens=1024,\n", + " )\n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This step leverages [`asyncio`](https://docs.python.org/3/library/asyncio.html) from Python's standard library for efficient API calls to [build.nvidia.com](https://build.nvidia.com/)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 182, + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "\n", + "async def question_generator(client, subtopic_list, n_question):\n", + " tasks = [generate_questions(client, subtopic, n_question) for subtopic in subtopic_list]\n", + " question_list = await asyncio.gather(*tasks)\n", + " return question_list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Due to running in a Colab environment - it is necessary to use `nest_asyncio` to run an event loop during the current Jupyter event loop." + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()\n", + "\n", + "question_list = asyncio.run(question_generator(client, subtopic_list, n_questions))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's time to examine the output of the above process!" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['What is supervised learning and how does it differ from unsupervised learning?\\nWhat are the key components of a supervised learning model?\\nCan you explain the concept of labeled data in supervised learning?\\nWhat are some popular algorithms used in supervised learning?\\nHow do you evaluate the performance of a supervised learning model?\\nWhat is overfitting and how can it be prevented in supervised learning?\\nHow do you choose the right supervised learning algorithm for a given problem?\\nWhat is the role of feature engineering in supervised learning?\\nCan you explain the concept of cross-validation in supervised learning?\\nWhat are some real-world applications of supervised learning?',\n", + " 'What is unsupervised learning and how does it differ from supervised learning?\\nWhat are some common algorithms used in unsupervised learning?\\nHow can unsupervised learning be used for anomaly detection?\\nWhat is clustering and what are some popular clustering algorithms?\\nHow can dimensionality reduction be achieved through unsupervised learning?\\nWhat is the difference between principal component analysis and t-distributed stochastic neighbor embedding?\\nHow can unsupervised learning be used for natural language processing tasks?\\nWhat is the role of unsupervised learning in deep learning?\\nHow can the performance of unsupervised learning algorithms be evaluated?\\nWhat are some real-world applications of unsupervised learning?',\n", + " 'What is semi-supervised learning and how does it differ from supervised and unsupervised learning?\\nHow can semi-supervised learning be used to improve the performance of machine learning models?\\nWhat are some common techniques used in semi-supervised learning, such as self-training and multi-view training?\\nHow can semi-supervised learning be applied to image and text classification tasks?\\nWhat are the advantages and disadvantages of using semi-supervised learning compared to other learning paradigms?\\nHow can the quality of unlabeled data be ensured in semi-supervised learning?\\nHow can semi-supervised learning be used to address the challenge of limited labeled data in real-world applications?\\nWhat are some recent advances and state-of-the-art methods in semi-supervised learning?\\nHow can the performance of semi-supervised learning algorithms be evaluated and compared?\\nWhat are some potential ethical concerns and 
biases that need to be considered when using semi-supervised learning in practice?',\n", + " 'What is reinforcement learning and how does it differ from other types of machine learning?\\nHow does reinforcement learning work, and what are the key components of a reinforcement learning system?\\nWhat are some of the most common applications of reinforcement learning in real-world scenarios?\\nHow can reinforcement learning be used to optimize decision-making processes in complex environments?\\nWhat are the challenges and limitations of reinforcement learning, and how can they be addressed?\\nHow does reinforcement learning handle the exploration-exploitation trade-off, and what are some common strategies for balancing these two objectives?\\nWhat are some of the most popular reinforcement learning algorithms, and how do they differ from one another?\\nHow can reinforcement learning be combined with other machine learning techniques, such as deep learning, to improve performance and efficiency?\\nWhat are some ethical considerations to keep in mind when deploying reinforcement learning systems in real-world applications?\\nHow is reinforcement learning being used in cutting-edge research and development, and what are some exciting new developments in this field?',\n", + " 'What is deep learning and how does it differ from traditional machine learning?\\nWhat are the key components of a deep learning model?\\nCan you explain the concept of backpropagation in deep learning?\\nHow do convolutional neural networks (CNNs) work in image recognition tasks?\\nWhat is the role of activation functions in deep learning?\\nHow can we prevent overfitting in deep learning models?\\nCan you explain the concept of transfer learning in deep learning?\\nWhat are some popular deep learning frameworks and libraries?\\nHow is deep learning being used in natural language processing (NLP)?\\nWhat are some ethical considerations when implementing deep learning models?',\n", + " 'What are the key components of a neural network?\\nHow do neural networks learn and adapt to new data?\\nWhat are the different types of neural networks and their applications?\\nHow can neural networks be used for image and speech recognition?\\nWhat are the challenges in designing and training deep neural networks?\\nHow can neural networks be optimized for better performance and accuracy?\\nWhat are the ethical considerations when using neural networks for decision-making?\\nHow can neural networks be used for natural language processing and generation?\\nWhat are the latest advancements and trends in neural network research?\\nHow can neural networks be integrated with other machine learning techniques for improved results?',\n", + " 'What is natural language processing and how does it work?\\nWhat are some common applications of natural language processing in everyday life?\\nHow can natural language processing be used to improve customer service?\\nWhat are the key challenges in developing accurate natural language processing systems?\\nHow does natural language processing differ from other forms of artificial intelligence?\\nWhat are some of the most popular natural language processing techniques and algorithms?\\nHow can natural language processing be used to analyze social media data?\\nWhat are the ethical considerations when using natural language processing to analyze personal data?\\nHow can natural language processing be used to improve language translation services?\\nWhat is the future of natural language processing and how 
will it continue to evolve?',\n", + " 'What is computer vision and how does it differ from image processing?\\nWhat are the key components of a computer vision system?\\nHow is machine learning used in computer vision?\\nWhat are some common applications of computer vision in industry?\\nHow does computer vision enable autonomous vehicles to navigate?\\nWhat are the challenges in developing accurate and reliable computer vision systems?\\nHow is computer vision used in medical imaging and diagnostics?\\nWhat are some popular datasets and benchmarks used in computer vision research?\\nHow does computer vision intersect with other fields such as natural language processing and robotics?\\nWhat are the ethical considerations in deploying computer vision systems, particularly in areas such as surveillance and facial recognition?',\n", + " 'What are the different types of recommendation systems?\\nHow do recommendation systems use collaborative filtering?\\nHow do recommendation systems use content-based filtering?\\nWhat are the benefits of using a hybrid recommendation system?\\nHow can you evaluate the performance of a recommendation system?\\nWhat are some common challenges in building recommendation systems?\\nHow do recommendation systems handle the cold start problem?\\nHow can recommendation systems incorporate user feedback?\\nWhat are some ethical considerations when building recommendation systems?\\nHow do recommendation systems impact user behavior and decision-making?',\n", + " 'What is anomaly detection and how does it work?\\nWhat are the different types of anomalies that can be detected?\\nWhat are some common algorithms used for anomaly detection?\\nHow can anomaly detection be used in cybersecurity?\\nWhat are the challenges in implementing anomaly detection in real-world scenarios?\\nHow can anomaly detection be used in predictive maintenance for industrial equipment?\\nWhat is the role of machine learning in anomaly detection?\\nHow can anomaly detection be used in fraud detection for financial transactions?\\nWhat are the best practices for evaluating the performance of anomaly detection models?\\nHow can anomaly detection be used in healthcare for early detection of diseases?']" + ] + }, + "execution_count": 184, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "question_list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The list for each question is now collected into a single long list. " + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [], + "source": [ + "question_list_formatted = []\n", + "\n", + "for question_set in question_list:\n", + " question_list_formatted += question_set.split(\"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 186, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(question_list_formatted)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generating Responses from Question List\n", + "\n", + "Using the question list, Nemotron-4 340B Instruct can be used to generate responses to the questions. \n", + "\n", + "The first things needed is a function that will be used to generate the response from [build.nvidia.com](https://build.nvidia.com/)!" 
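+    ,
+    "\n",
+    "\n",
+    "A practical aside, not part of the original flow: this step sends roughly 100 requests concurrently. If the endpoint starts rate-limiting, a minimal sketch (assuming the `generate_responses` helper defined in the next cell) is to cap concurrency with `asyncio.Semaphore`:\n",
+    "\n",
+    "```python\n",
+    "import asyncio\n",
+    "\n",
+    "# Allow at most 10 requests in flight at any one time.\n",
+    "semaphore = asyncio.Semaphore(10)\n",
+    "\n",
+    "async def generate_responses_limited(client, question):\n",
+    "    # Wait for a free slot before calling the model endpoint.\n",
+    "    async with semaphore:\n",
+    "        return await generate_responses(client, question)\n",
+    "```\n",
+    "\n",
+    "Swapping `generate_responses_limited` into `response_generator` below would bound the request rate without changing the rest of the flow."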
+ ] + }, + { + "cell_type": "code", + "execution_count": 216, + "metadata": {}, + "outputs": [], + "source": [ + "async def generate_responses(client, question):\n", + " prompt = RESPONSE_PROMPT_TEMPLATE.format(question=question)\n", + " response = await client.chat.completions.create(\n", + " model=\"nvidia/nemotron-4-340b-instruct\",\n", + " messages=[\n", + " {\"role\" : \"user\",\n", + " \"content\" : prompt}\n", + " ],\n", + " temperature=0.2,\n", + " top_p=0.7,\n", + " max_tokens=1024,\n", + " )\n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, the `asycio` library allows efficient use of the API." + ] + }, + { + "cell_type": "code", + "execution_count": 217, + "metadata": {}, + "outputs": [], + "source": [ + "async def response_generator(client, question_list):\n", + " tasks = [generate_responses(client, question) for question in question_list]\n", + " response_list = await asyncio.gather(*tasks)\n", + " return response_list" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "metadata": {}, + "outputs": [], + "source": [ + "question_response_list = asyncio.run(response_generator(client, question_list_formatted))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to move to the next stage, a dataset will be created in `.jsonl` format and will store questions with the responses generated." + ] + }, + { + "cell_type": "code", + "execution_count": 219, + "metadata": {}, + "outputs": [], + "source": [ + "question_response_pair_list = []\n", + "for question, response_set in zip(question_list_formatted, question_response_list):\n", + " question_response_pair_list.append(\n", + " {\n", + " \"question\" : question, \n", + " \"responses\" : {\n", + " \"response_a\" : {\"response\" : response_set.split(\"RESPONSE B:\")[0].replace(\"RESPONSE A:\", \"\").strip()},\n", + " \"response_b\" : {\"response\" : response_set.split(\"RESPONSE B:\")[-1].split(\"\\n\\n\")[0].strip()}\n", + " },\n", + " }\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset will be written out to a file called `synthetic_data.jsonl` below!" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "with open('synthetic_data.jsonl', 'w') as f:\n", + " for item in question_response_pair_list:\n", + " f.write(json.dumps(item))\n", + " f.write('\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Nemotron-4 340B Reward to Generate a Preference Dataset\n", + "\n", + "Equipped with a dataset that has questions that have response pairs, a preference dataset that is compatible with DPO training, SteerLM reward model training, and RLHF reward model training can be generated straightforwardly thanks to [Nemotron-4 340B Reward](https://build.nvidia.com/nvidia/nemotron-4-340b-reward) available through [build.nvidia.com](https://build.nvidia.com/)!\n", + "\n", + "First, an example of how to use the endpoint.\n", + "\n", + "1. You must both provide a user message, and an assistant message!\n", + "2. It will return a chat-style message with the scores, as well as the scores in the `logprogs` parameter.\n", + "\n", + "The response package will include scores related to five attributes:\n", + "\n", + "1. Helpfulness: Overall helpfulness of the response to the prompt.\n", + "2. Correctness: Inclusion of all pertinent facts without errors.\n", + "3. 
Coherence: Consistency and clarity of expression.\n", + "4. Complexity: Intellectual depth required to write response (i.e. whether the response can be written by anyone with basic language competency or requires deep domain expertise).\n", + "5. Verbosity: Amount of detail included in the response, relative to what is asked for in the prompt." + ] + }, + { + "cell_type": "code", + "execution_count": 221, + "metadata": {}, + "outputs": [], + "source": [ + "messages = [\n", + " {\n", + " \"role\" : \"user\",\n", + " \"content\" : \"Hello!\"\n", + " },\n", + " {\n", + " \"role\": \"assistant\",\n", + " \"content\": \"Hello! How can I help you today?\"\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 222, + "metadata": {}, + "outputs": [], + "source": [ + "response = await client.chat.completions.create(\n", + " model=\"nvidia/nemotron-4-340b-reward\",\n", + " messages=messages,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 223, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ChatCompletion(id='07548838-8ef6-4feb-a1ee-66dd97905b72', choices=[Choice(finish_reason='length', index=0, logprobs=ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='helpfulness', bytes=None, logprob=4.09375, top_logprobs=[]), ChatCompletionTokenLogprob(token='correctness', bytes=None, logprob=4.03125, top_logprobs=[]), ChatCompletionTokenLogprob(token='coherence', bytes=None, logprob=4.25, top_logprobs=[]), ChatCompletionTokenLogprob(token='complexity', bytes=None, logprob=0.5703125, top_logprobs=[]), ChatCompletionTokenLogprob(token='verbosity', bytes=None, logprob=1.109375, top_logprobs=[])]), message=[ChatCompletionMessage(content='helpfulness:4.09375,correctness:4.03125,coherence:4.25,complexity:0.5703125,verbosity:1.109375', role='assistant', function_call=None, tool_calls=None)])], created=None, model=None, object=None, service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=1, prompt_tokens=54, total_tokens=55))" + ] + }, + "execution_count": 223, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `logprobs` can be handled in a similar fashion to message content, as demonstrated below!" + ] + }, + { + "cell_type": "code", + "execution_count": 224, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[ChatCompletionTokenLogprob(token='helpfulness', bytes=None, logprob=4.09375, top_logprobs=[]),\n", + " ChatCompletionTokenLogprob(token='correctness', bytes=None, logprob=4.03125, top_logprobs=[]),\n", + " ChatCompletionTokenLogprob(token='coherence', bytes=None, logprob=4.25, top_logprobs=[]),\n", + " ChatCompletionTokenLogprob(token='complexity', bytes=None, logprob=0.5703125, top_logprobs=[]),\n", + " ChatCompletionTokenLogprob(token='verbosity', bytes=None, logprob=1.109375, top_logprobs=[])]" + ] + }, + "execution_count": 224, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response.choices[0].logprobs.content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's useful to define a simple helper function that can extract the scores to be used in the construction of a dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 225, + "metadata": {}, + "outputs": [], + "source": [ + "def get_scores_from_response(openai_response_template):\n", + " logprobs = openai_response_template.choices[0].logprobs.content\n", + " score_dict = {}\n", + " for score in logprobs:\n", + " score_dict[score.token] = score.logprob\n", + " return score_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 226, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'helpfulness': 4.09375,\n", + " 'correctness': 4.03125,\n", + " 'coherence': 4.25,\n", + " 'complexity': 0.5703125,\n", + " 'verbosity': 1.109375}" + ] + }, + "execution_count": 226, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_scores_from_response(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similar to the synthetic data generation above, using `asyncio` will help provide scores in a time-efficient manner." + ] + }, + { + "cell_type": "code", + "execution_count": 227, + "metadata": {}, + "outputs": [], + "source": [ + "async def get_response_and_scores(client, model, question, response_content):\n", + " messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": question\n", + " },\n", + " {\n", + " \"role\": \"assistant\",\n", + " \"content\": response_content\n", + " },\n", + " ]\n", + " \n", + " response = await client.chat.completions.create(\n", + " model=model,\n", + " messages=messages,\n", + " )\n", + " \n", + " scores = get_scores_from_response(response)\n", + " return scores" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copying the list is important to avoid overwriting or modifying the original data - though it can be reloaded from `JSONL`." + ] + }, + { + "cell_type": "code", + "execution_count": 228, + "metadata": {}, + "outputs": [], + "source": [ + "question_response_score_list = question_response_pair_list.copy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Scores are calculated efficiently using `asyncio`." + ] + }, + { + "cell_type": "code", + "execution_count": 231, + "metadata": {}, + "outputs": [], + "source": [ + "async def process_question_response_pairs(client, model, question_response_score_list):\n", + " tasks = []\n", + " for question_response_pair in question_response_score_list:\n", + " question = question_response_pair[\"question\"]\n", + " \n", + " task_a = get_response_and_scores(client, model, question, question_response_pair[\"responses\"][\"response_a\"][\"response\"])\n", + " task_b = get_response_and_scores(client, model, question, question_response_pair[\"responses\"][\"response_b\"][\"response\"])\n", + " \n", + " tasks.append((task_a, question_response_pair, \"response_a\"))\n", + " tasks.append((task_b, question_response_pair, \"response_b\"))\n", + " \n", + " results = await asyncio.gather(*[task[0] for task in tasks])\n", + " \n", + " for i, (result, task_info) in enumerate(zip(results, tasks)):\n", + " _, question_response_pair, response_key = task_info\n", + " question_response_pair[\"responses\"][response_key].update(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nothing left to do but fire it off!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 232, + "metadata": {}, + "outputs": [], + "source": [ + "await process_question_response_pairs(client, \"nvidia/nemotron-4-340b-reward\", question_response_score_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Quality can be relatively preserved by only keeping rows that have at least a `3.0` in the overall metric - in this case helpfulness. This will help ensure that the data remains high quality. " + ] + }, + { + "cell_type": "code", + "execution_count": 239, + "metadata": {}, + "outputs": [], + "source": [ + "threshold = 3.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "FInally, the dataset can be exported in `.JSONL` format for use in [NeMo Aligner](https://github.com/NVIDIA/NeMo-Aligner)." + ] + }, + { + "cell_type": "code", + "execution_count": 240, + "metadata": {}, + "outputs": [], + "source": [ + "with open(f'synthetic_data_with_scores_filtered-{threshold}.jsonl', 'w') as f:\n", + " for item in question_response_score_list:\n", + " question = item[\"question\"]\n", + " response_a = item[\"responses\"][\"response_a\"]\n", + " response_b = item[\"responses\"][\"response_b\"]\n", + " response_a[\"question\"] = question\n", + " response_b[\"question\"] = question\n", + " if response_a[\"helpfulness\"] < threshold and response_b[\"helpfulness\"] < threshold:\n", + " continue\n", + " f.write(json.dumps(response_a))\n", + " f.write('\\n')\n", + " f.write(json.dumps(response_b))\n", + " f.write('\\n')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nvidia-sdg", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 21af732ef49ba087e3b9aa201a433c10efdfff47 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 8 Jul 2024 19:05:29 -0700 Subject: [PATCH 10/19] Add Synthetic Data Generation Module (#136) * Begin implementation on OpenAI client Signed-off-by: Ryan Wolf * Fix relative import Signed-off-by: Ryan Wolf * Add temperature Signed-off-by: Ryan Wolf * Modify client interface and begin ultrachat Signed-off-by: Ryan Wolf * Change type annotation in openai client Signed-off-by: Ryan Wolf * Make imports easier Signed-off-by: Ryan Wolf * Reformat to match nemotron report Signed-off-by: Ryan Wolf * Add yaml conversion Signed-off-by: Ryan Wolf * Fix index error Signed-off-by: Ryan Wolf * Add error handling for yaml parsing Signed-off-by: Ryan Wolf * Fix error Signed-off-by: Ryan Wolf * Add additional yaml parsing check Signed-off-by: Ryan Wolf * Add more yaml error handling Signed-off-by: Ryan Wolf * Export conversion error Signed-off-by: Ryan Wolf * Change variable naming Signed-off-by: Ryan Wolf * Make error catching more general Signed-off-by: Ryan Wolf * Refactor list out of nemotron Signed-off-by: Ryan Wolf * Add prompt helper function Signed-off-by: Ryan Wolf * Add revisions and writing prompts Signed-off-by: Ryan Wolf * Fix default prompt templates Signed-off-by: Ryan Wolf * Add closed qa Signed-off-by: Ryan Wolf * Fix prompt Signed-off-by: Ryan Wolf * Add math and coding Signed-off-by: Ryan Wolf * Add problem generation Signed-off-by: Ryan Wolf * Rename function Signed-off-by: Ryan Wolf * Add dialogue support Signed-off-by: Ryan Wolf * Fix mispell 
Signed-off-by: Ryan Wolf * Add two turn generation Signed-off-by: Ryan Wolf * Add reward model as judge Signed-off-by: Ryan Wolf * Refactor reward query Signed-off-by: Ryan Wolf * Add error handling for non-reward models Signed-off-by: Ryan Wolf * Add error handling to sync client Signed-off-by: Ryan Wolf * Add open qa pipeline Signed-off-by: Ryan Wolf * Improve docs and add writing pipeline Signed-off-by: Ryan Wolf * Add closed qa pipeline Signed-off-by: Ryan Wolf * Add math pipeline Signed-off-by: Ryan Wolf * Add python pipeline Signed-off-by: Ryan Wolf * Add async nemotron generator Signed-off-by: Ryan Wolf * Fix await with index Signed-off-by: Ryan Wolf * Add seed parameter Signed-off-by: Ryan Wolf * Add missing await Signed-off-by: Ryan Wolf * Fix parameter names Signed-off-by: Ryan Wolf * Fix subscript await issues Signed-off-by: Ryan Wolf * Switch parsing method for reward model Signed-off-by: Ryan Wolf * Add initial docs Signed-off-by: Ryan Wolf * Add nemo deploy client Signed-off-by: Ryan Wolf * Add easy import Signed-off-by: Ryan Wolf * Move conversation formatter Signed-off-by: Ryan Wolf * Add other file Signed-off-by: Ryan Wolf * Update nemotron import Signed-off-by: Ryan Wolf * Update model client import Signed-off-by: Ryan Wolf * Remove model in query call Signed-off-by: Ryan Wolf * Add extra index Signed-off-by: Ryan Wolf * Fix response indexing Signed-off-by: Ryan Wolf * Add top k Signed-off-by: Ryan Wolf * Remove extras Signed-off-by: Ryan Wolf * Add safe import for nemo deploy Signed-off-by: Ryan Wolf * Add pandas conversions Signed-off-by: Ryan Wolf * Add partition default Signed-off-by: Ryan Wolf * Add no format Signed-off-by: Ryan Wolf * Move no format location Signed-off-by: Ryan Wolf * Use top_k in nemo client Signed-off-by: Ryan Wolf * Address vibhu's review Signed-off-by: Ryan Wolf * Add logging import Signed-off-by: Ryan Wolf * Fix import Signed-off-by: Ryan Wolf * Fix tqdm Signed-off-by: Ryan Wolf * Add missing awaits Signed-off-by: Ryan Wolf * Standardize names Signed-off-by: Ryan Wolf * Address Ayush nit Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf --- docs/user-guide/index.rst | 3 + docs/user-guide/syntheticdata.rst | 18 + nemo_curator/__init__.py | 7 + nemo_curator/datasets/doc_dataset.py | 40 +- nemo_curator/services/__init__.py | 26 + .../services/conversation_formatter.py | 28 + nemo_curator/services/model_client.py | 93 + nemo_curator/services/nemo_client.py | 100 + nemo_curator/services/openai_client.py | 162 ++ nemo_curator/synthetic/__init__.py | 73 + nemo_curator/synthetic/async_nemotron.py | 1685 +++++++++++++++++ nemo_curator/synthetic/error.py | 20 + nemo_curator/synthetic/mixtral.py | 38 + nemo_curator/synthetic/nemotron.py | 1472 ++++++++++++++ nemo_curator/synthetic/no_format.py | 34 + nemo_curator/synthetic/prompts.py | 58 + setup.py | 3 +- tests/test_dataset.py | 25 + 18 files changed, 3883 insertions(+), 2 deletions(-) create mode 100644 docs/user-guide/syntheticdata.rst create mode 100644 nemo_curator/services/__init__.py create mode 100644 nemo_curator/services/conversation_formatter.py create mode 100644 nemo_curator/services/model_client.py create mode 100644 nemo_curator/services/nemo_client.py create mode 100644 nemo_curator/services/openai_client.py create mode 100644 nemo_curator/synthetic/__init__.py create mode 100644 nemo_curator/synthetic/async_nemotron.py create mode 100644 nemo_curator/synthetic/error.py create mode 100644 nemo_curator/synthetic/mixtral.py create mode 100644 nemo_curator/synthetic/nemotron.py 
 create mode 100644 nemo_curator/synthetic/no_format.py
 create mode 100644 nemo_curator/synthetic/prompts.py
 create mode 100644 tests/test_dataset.py

diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst
index 31f29069c..1b0e63165 100644
--- a/docs/user-guide/index.rst
+++ b/docs/user-guide/index.rst
@@ -18,6 +18,9 @@
 :ref:`GPU Accelerated Exact and Fuzzy Deduplication `
   Both exact and fuzzy deduplication functionalities are supported in NeMo Curator and accelerated using RAPIDS cuDF.
 
+:ref:`Synthetic Data Generation `
+  Synthetic data generation tools and example pipelines are available within NeMo Curator.
+
 :ref:`Downstream Task Decontamination `
   After training, large language models are usually evaluated by their performance on downstream tasks consisting of unseen test data. When dealing with large datasets, there is a potential for leakage of this test data into the model’s training dataset. NeMo Curator allows you to remove sections of documents in your dataset that are present in downstream tasks.
 
diff --git a/docs/user-guide/syntheticdata.rst b/docs/user-guide/syntheticdata.rst
new file mode 100644
index 000000000..ae3206b4f
--- /dev/null
+++ b/docs/user-guide/syntheticdata.rst
@@ -0,0 +1,18 @@
+
+.. _data-curator-syntheticdata:
+
+======================================
+Synthetic Data Generation
+======================================
+--------------------------------------
+Background
+--------------------------------------
+Synthetic data generation has become increasingly useful in large language model training.
+It is used in pretraining, fine-tuning, and evaluation.
+Synthetically generated data can be useful for adapting an LLM to low-resource languages or domains, or for performing knowledge distillation from other models, among other purposes.
+There are a variety of ways to construct synthetic data generation pipelines, with numerous LLM and classical filters.
+
+NeMo Curator has a simple, easy-to-use set of tools that allow you to use prebuilt synthetic data generation pipelines or build your own.
+Any model inference service that uses the OpenAI API is compatible with the synthetic data generation module, allowing you to generate your data from any model.
+NeMo Curator has prebuilt synthetic data generation pipelines for supervised fine-tuning (SFT) and preference data that were used to generate data for the training of `Nemotron-4 340B `_.
+And, you can easily interweave filtering and deduplication steps in your synthetic data pipeline with the other modules in NeMo Curator.
\ No newline at end of file
diff --git a/nemo_curator/__init__.py b/nemo_curator/__init__.py
index b440156e1..9f1029316 100644
--- a/nemo_curator/__init__.py
+++ b/nemo_curator/__init__.py
@@ -34,6 +34,13 @@
 from .modules import *
 
+from .services import (
+    AsyncLLMClient,
+    AsyncOpenAIClient,
+    LLMClient,
+    NemoDeployClient,
+    OpenAIClient,
+)
 from .utils.distributed_utils import get_client
 
 # Dask will automatically convert the list score type
diff --git a/nemo_curator/datasets/doc_dataset.py b/nemo_curator/datasets/doc_dataset.py
index 32b23114c..488402700 100644
--- a/nemo_curator/datasets/doc_dataset.py
+++ b/nemo_curator/datasets/doc_dataset.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License. 
-from typing import List, Union +from typing import List, Optional, Union import dask.dataframe as dd @@ -130,6 +130,44 @@ def to_pickle( ): raise NotImplementedError("DocumentDataset does not support to_pickle yet") + @classmethod + def from_pandas( + cls, + data, + npartitions: Optional[int] = 1, + chunksize: Optional[int] = None, + sort: Optional[bool] = True, + name: Optional[str] = None, + ): + """ + Creates a document dataset from a pandas data frame. + For more information on the arguments see Dask's from_pandas documentation + https://docs.dask.org/en/stable/generated/dask.dataframe.from_pandas.html + + Args: + data: A pandas dataframe + Returns: + A document dataset with a pandas backend (on the CPU). + """ + return cls( + dd.from_pandas( + data=data, + npartitions=npartitions, + chunksize=chunksize, + sort=sort, + name=name, + ) + ) + + def to_pandas(self): + """ + Creates a pandas dataframe from a DocumentDataset + + Returns: + A pandas dataframe (on the CPU) + """ + return self.df.to_backend("pandas").compute() + def _read_json_or_parquet( input_files: Union[str, List[str]], diff --git a/nemo_curator/services/__init__.py b/nemo_curator/services/__init__.py new file mode 100644 index 000000000..ff769b2e9 --- /dev/null +++ b/nemo_curator/services/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .conversation_formatter import ConversationFormatter +from .model_client import AsyncLLMClient, LLMClient +from .nemo_client import NemoDeployClient +from .openai_client import AsyncOpenAIClient, OpenAIClient + +__all__ = [ + "AsyncLLMClient", + "LLMClient", + "AsyncOpenAIClient", + "OpenAIClient", + "NemoDeployClient", + "ConversationFormatter", +] diff --git a/nemo_curator/services/conversation_formatter.py b/nemo_curator/services/conversation_formatter.py new file mode 100644 index 000000000..c4db1cc22 --- /dev/null +++ b/nemo_curator/services/conversation_formatter.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from abc import ABC, abstractmethod
+from typing import List
+
+
+class ConversationFormatter(ABC):
+    """
+    Represents a way of formatting a conversation with an LLM
+    such that it can respond appropriately
+    """
+
+    @abstractmethod
+    def format_conversation(self, conv: List[dict]) -> str:
+        raise NotImplementedError(
+            "format_conversation must be implemented by subclasses"
+        )
diff --git a/nemo_curator/services/model_client.py b/nemo_curator/services/model_client.py
new file mode 100644
index 000000000..ca8ff6c43
--- /dev/null
+++ b/nemo_curator/services/model_client.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import ABC, abstractmethod
+from typing import Iterable, List, Optional, Union
+
+from nemo_curator.services.conversation_formatter import ConversationFormatter
+
+
+class LLMClient(ABC):
+    """
+    Interface representing a client connecting to an LLM inference server
+    and making requests synchronously
+    """
+
+    @abstractmethod
+    def query_model(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: Optional[ConversationFormatter] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = 1,
+        seed: Optional[int] = None,
+        stop: Union[Optional[str], List[str]] = None,
+        stream: bool = False,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+    ) -> List[str]:
+        raise NotImplementedError("Subclass of LLMClient must implement 'query_model'")
+
+    @abstractmethod
+    def query_reward_model(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: Optional[ConversationFormatter] = None,
+    ) -> dict:
+        raise NotImplementedError(
+            "Subclass of LLMClient must implement 'query_reward_model'"
+        )
+
+
+class AsyncLLMClient(ABC):
+    """
+    Interface representing a client connecting to an LLM inference server
+    and making requests asynchronously
+    """
+
+    @abstractmethod
+    async def query_model(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: Optional[ConversationFormatter] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = 1,
+        seed: Optional[int] = None,
+        stop: Union[Optional[str], List[str]] = None,
+        stream: bool = False,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+    ) -> List[str]:
+        raise NotImplementedError(
+            "Subclass of AsyncLLMClient must implement 'query_model'"
+        )
+
+    @abstractmethod
+    async def query_reward_model(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: Optional[ConversationFormatter] = None,
+    ) -> dict:
+        raise NotImplementedError(
+            "Subclass of AsyncLLMClient must implement 'query_reward_model'"
+        )
diff --git a/nemo_curator/services/nemo_client.py b/nemo_curator/services/nemo_client.py
new file mode 100644
index 000000000..f83ba6242
--- /dev/null
+++ b/nemo_curator/services/nemo_client.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from typing import Iterable, List, Optional, Union
+
+from nemo_curator.services.conversation_formatter import ConversationFormatter
+from nemo_curator.utils.import_utils import safe_import_from
+
+from .model_client import AsyncLLMClient, LLMClient
+
+NemoQueryLLM = safe_import_from("nemo.deploy.nlp", "NemoQueryLLM")
+
+
+class NemoDeployClient(LLMClient):
+    """
+    A wrapper around NemoQueryLLM for querying models in synthetic data generation
+    """
+
+    def __init__(self, nemo_deploy: NemoQueryLLM) -> None:
+        self.client = nemo_deploy
+
+    def query_model(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: Optional[ConversationFormatter] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        seed: Optional[int] = None,
+        stop: Union[Optional[str], List[str]] = None,
+        stream: bool = False,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+    ) -> List[str]:
+        if conversation_formatter is None:
+            raise ValueError(
+                "NemoDeployClient's query_model requires a conversation_formatter"
+            )
+
+        prompt = conversation_formatter.format_conversation(messages)
+        self.client.model_name = model
+
+        if n is not None:
+            warnings.warn("n is not supported in NemoDeployClient")
+        if stream:
+            warnings.warn("streaming is not supported in NemoDeployClient")
+
+        if isinstance(stop, str):
+            stop = [stop]
+
+        response = self.client.query_llm(
+            prompts=[prompt],
+            max_output_len=max_tokens,
+            random_seed=seed,
+            stop_words_list=stop,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+        )[0]
+
+        return self._postprocess_response(response, stop)
+
+    @staticmethod
+    def _postprocess_response(responses: List[str], stop_words: List[str]) -> List[str]:
+        processed_responses = []
+        for response in responses:
+            for stop in stop_words:
+                if response.endswith(stop):
+                    response = response[: -len(stop)]
+            processed_responses.append(response.strip())
+        return processed_responses
+
+    def query_reward_model(self, *, messages: Iterable, model: str) -> dict:
+        """
+        Prompts an LLM Reward model to score a conversation between a user and assistant
+        Args:
+            messages: The conversation to calculate a score for.
+                Should be formatted like:
+                    [{"role": "user", "content": "Write a sentence"}, {"role": "assistant", "content": "This is a sentence"}, ...]
+            model: The name of the model that should be used to calculate the reward.
+                Must be a reward model, cannot be a regular LLM.
+        Returns:
+            A mapping of score_name -> score
+        """
+        raise NotImplementedError(
+            "Reward model inference is not supported in NeMo Deploy Clients"
+        )
diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py
new file mode 100644
index 000000000..350eb3087
--- /dev/null
+++ b/nemo_curator/services/openai_client.py
@@ -0,0 +1,162 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
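As an aside, a hedged sketch of what a concrete ConversationFormatter could look like; the class name and prompt format here are hypothetical, but NemoDeployClient.query_model above requires some such formatter to turn a messages list into a single prompt string:

from typing import List

from nemo_curator.services import ConversationFormatter


class PlainChatFormatter(ConversationFormatter):
    # Hypothetical formatter: render each turn as "Role: text" and end with an
    # empty assistant turn for the model to complete.
    def format_conversation(self, conv: List[dict]) -> str:
        lines = [f"{turn['role'].capitalize()}: {turn['content']}" for turn in conv]
        return "\n".join(lines) + "\nAssistant:"

An instance of such a formatter would be passed as the conversation_formatter argument of NemoDeployClient.query_model.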
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings +from typing import Iterable, List, Optional, Union + +from openai import AsyncOpenAI, OpenAI +from openai._types import NOT_GIVEN, NotGiven + +from nemo_curator.services.conversation_formatter import ConversationFormatter + +from .model_client import AsyncLLMClient, LLMClient + + +class OpenAIClient(LLMClient): + """ + A wrapper around OpenAI's Python client for querying models + """ + + def __init__(self, openai_client: OpenAI) -> None: + self.client = openai_client + + def query_model( + self, + *, + messages: Iterable, + model: str, + conversation_formatter: Optional[ConversationFormatter] = None, + max_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN, + n: Union[Optional[int], NotGiven] = NOT_GIVEN, + seed: Union[Optional[int], NotGiven] = NOT_GIVEN, + stop: Union[Optional[str], List[str], NotGiven] = NOT_GIVEN, + stream: Union[Optional[bool], NotGiven] = False, + temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, + top_k: Optional[int] = None, + top_p: Union[Optional[float], NotGiven] = NOT_GIVEN, + ) -> List[str]: + + if conversation_formatter is not None: + warnings.warn("conversation_formatter is not used in an OpenAIClient") + if top_k is not None: + warnings.warn("top_k is not used in an OpenAIClient") + + response = self.client.chat.completions.create( + messages=messages, + model=model, + max_tokens=max_tokens, + n=n, + seed=seed, + stop=stop, + stream=stream, + temperature=temperature, + top_p=top_p, + ) + + return [choice.message.content for choice in response.choices] + + def query_reward_model(self, *, messages: Iterable, model: str) -> dict: + """ + Prompts an LLM Reward model to score a conversation between a user and assistant + Args: + messages: The conversation to calculate a score for. + Should be formatted like: + [{"role": "user", "content": "Write a sentence"}, {"role": "assistant", "content": "This is a sentence"}, ...] + model: The name of the model that should be used to calculate the reward. + Must be a reward model, cannot be a regular LLM. + Returns: + A mapping of score_name -> score + """ + response = self.client.chat.completions.create(messages=messages, model=model) + + if response.choices[0].logprobs is None: + raise ValueError( + f"Logprobs not found. {model} is likely not a reward model." 
+ ) + + scores = { + score.token: score.logprob for score in response.choices[0].logprobs.content + } + + return scores + + +class AsyncOpenAIClient(AsyncLLMClient): + """ + A wrapper around OpenAI's Python async client for querying models + """ + + def __init__(self, async_openai_client: AsyncOpenAI) -> None: + self.client = async_openai_client + + async def query_model( + self, + *, + messages: Iterable, + model: str, + conversation_formatter: Optional[ConversationFormatter] = None, + max_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN, + n: Union[Optional[int], NotGiven] = NOT_GIVEN, + seed: Union[Optional[int], NotGiven] = NOT_GIVEN, + stop: Union[Optional[str], List[str], NotGiven] = NOT_GIVEN, + stream: Union[Optional[bool], NotGiven] = False, + temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, + top_k: Optional[int] = None, + top_p: Union[Optional[float], NotGiven] = NOT_GIVEN, + ) -> List[str]: + + if conversation_formatter is not None: + warnings.warn("conversation_formatter is not used in an AsyncOpenAIClient") + if top_k is not None: + warnings.warn("top_k is not used in an AsyncOpenAIClient") + + response = await self.client.chat.completions.create( + messages=messages, + model=model, + max_tokens=max_tokens, + n=n, + seed=seed, + stop=stop, + stream=stream, + temperature=temperature, + top_p=top_p, + ) + + return [choice.message.content for choice in response.choices] + + async def query_reward_model(self, *, messages: Iterable, model: str) -> dict: + """ + Prompts an LLM Reward model to score a conversation between a user and assistant + Args: + messages: The conversation to calculate a score for. + Should be formatted like: + [{"role": "user", "content": "Write a sentence"}, {"role": "assistant", "content": "This is a sentence"}, ...] + model: The name of the model that should be used to calculate the reward. + Must be a reward model, cannot be a regular LLM. + Returns: + A mapping of score_name -> score + """ + response = await self.client.chat.completions.create( + messages=messages, model=model + ) + + if response.choices[0].logprobs is None: + raise ValueError( + f"Logprobs not found. {model} is likely not a reward model." + ) + + scores = { + score.token: score.logprob for score in response.choices[0].logprobs.content + } + + return scores diff --git a/nemo_curator/synthetic/__init__.py b/nemo_curator/synthetic/__init__.py new file mode 100644 index 000000000..44a4b6c12 --- /dev/null +++ b/nemo_curator/synthetic/__init__.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
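For context, a rough usage sketch of the synchronous OpenAI wrapper above; the endpoint URL, API key, and model name are placeholders for any OpenAI-compatible service:

from openai import OpenAI

from nemo_curator.services import OpenAIClient

# Point the OpenAI SDK at an OpenAI-compatible endpoint (values are illustrative).
openai_sdk = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key="<insert API key here>",
)
client = OpenAIClient(openai_sdk)

responses = client.query_model(
    messages=[{"role": "user", "content": "Suggest three topics for a short essay."}],
    model="mistralai/mixtral-8x7b-instruct-v0.1",
    temperature=0.5,
    max_tokens=256,
)
print(responses[0])

AsyncOpenAIClient works the same way, except it wraps openai.AsyncOpenAI and its query_model must be awaited.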
+from .async_nemotron import AsyncNemotronGenerator +from .error import YamlConversionError +from .mixtral import Mixtral8x7BFormatter +from .nemotron import NemotronFormatter, NemotronGenerator +from .no_format import NoFormat +from .prompts import ( + DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, + DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE, + DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE, + DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, + DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, + DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE, + DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE, + DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, + MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE, + MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE, + PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, + PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE, +) + +__all__ = [ + "NemotronGenerator", + "AsyncNemotronGenerator", + "NemotronFormatter", + "Mixtral8x7BFormatter", + "NoFormat", + "DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE", + "DEFAULT_SUBTOPICS_PROMPT_TEMPLATE", + "DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE", + "DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE", + "DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE", + "DEFAULT_WRITING_TASK_PROMPT_TEMPLATE", + "DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE", + "DEFAULT_CLOSED_QA_PROMPT_TEMPLATE", + "DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE", + "DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE", + "DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE", + "MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE", + "MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE", + "DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE", + "DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE", + "DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE", + "PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE", + "PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE", + "PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE", + "DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE", + "DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE", + "DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE", + "YamlConversionError", +] diff --git a/nemo_curator/synthetic/async_nemotron.py b/nemo_curator/synthetic/async_nemotron.py new file mode 100644 index 000000000..d52fd7559 --- /dev/null +++ b/nemo_curator/synthetic/async_nemotron.py @@ -0,0 +1,1685 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
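To show how these exports are meant to be combined, a hedged sketch of driving the asynchronous generator defined in the new file below end to end; the endpoint, API key, and model name are placeholders, and the pipeline arguments are kept small for brevity:

import asyncio

from openai import AsyncOpenAI

from nemo_curator.services import AsyncOpenAIClient
from nemo_curator.synthetic import AsyncNemotronGenerator


async def main():
    # Placeholder endpoint/key/model; any OpenAI-compatible chat endpoint works.
    client = AsyncOpenAIClient(
        AsyncOpenAI(base_url="https://integrate.api.nvidia.com/v1", api_key="<key>")
    )
    generator = AsyncNemotronGenerator(client, max_concurrent_requests=10)

    openlines = await generator.run_open_qa_pipeline(
        n_macro_topics=5,
        n_subtopics=3,
        n_openlines=2,
        n_revisions=2,
        model="nvidia/nemotron-4-340b-instruct",
        ignore_conversion_failure=True,
    )
    print(openlines[:5])


asyncio.run(main())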
+import asyncio
+import logging
+import os
+from typing import Any, Coroutine, List, Optional, Tuple, Union
+
+import yaml
+from tqdm.asyncio import tqdm
+
+from nemo_curator.log import create_logger
+from nemo_curator.services.model_client import AsyncLLMClient
+from nemo_curator.synthetic.error import YamlConversionError
+from nemo_curator.synthetic.prompts import (
+    DEFAULT_CLOSED_QA_PROMPT_TEMPLATE,
+    DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE,
+    DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE,
+    DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE,
+    DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE,
+    DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE,
+    DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE,
+    DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE,
+    DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE,
+    DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE,
+    DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE,
+    DEFAULT_SUBTOPICS_PROMPT_TEMPLATE,
+    DEFAULT_WRITING_TASK_PROMPT_TEMPLATE,
+    DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE,
+    DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE,
+    MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE,
+    PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE,
+)
+
+
+class AsyncNemotronGenerator:
+    """
+    Provides a collection of methods for generating synthetic data
+    described in the Nemotron-4 340B Technical Report
+    (https://arxiv.org/abs/2406.11704v1) and inspired by the
+    UltraChat paper (https://arxiv.org/abs/2305.14233)
+    """
+
+    def __init__(
+        self,
+        llm_client: AsyncLLMClient,
+        logger: Union[logging.LoggerAdapter, str] = "./",
+        max_concurrent_requests: Optional[int] = None,
+    ) -> None:
+        self.client = llm_client
+        self.max_concurrent_requests = max_concurrent_requests
+        if isinstance(logger, str):
+            self.logger = create_logger(
+                rank=0,
+                log_file=os.path.join(logger, "nemotron-generator.log"),
+                name="AsyncNemotronGenerator",
+            )
+        else:
+            self.logger = logger
+
+    async def _prompt(
+        self, model: str, prompt_template: str, prompt_kwargs: dict, model_kwargs: dict
+    ) -> List[str]:
+        prompt = prompt_template.format(**prompt_kwargs)
+        messages = [{"role": "user", "content": prompt}]
+
+        return await self.client.query_model(
+            messages=messages, model=model, **model_kwargs
+        )
+
+    async def convert_response_to_yaml_list(
+        self,
+        llm_response: str,
+        model: str,
+        prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE,
+        prompt_kwargs: dict = {},
+        model_kwargs: dict = {},
+    ) -> List[str]:
+        """
+        Converts a response of an LLM to a list of strings by querying an LLM
+        Args:
+            llm_response: The original unformatted response of the LLM
+            model: The name of the model that should be used to generate the response.
+                Must be available in the LLMClient passed in the constructor.
+            prompt_template: A format string of the prompt to use. It must have a {llm_response}
+                parameter that will be populated with the llm_response value passed in this function.
+            prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template.
+                None are needed for the default template.
+            model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call.
+ Returns: + A parsed list of elements from the original LLM response + """ + prompt_kwargs["llm_response"] = llm_response + yaml_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + try: + parsed_response = yaml.safe_load(yaml_response[0]) + except yaml.error.YAMLError as _: + raise YamlConversionError( + f"Error parsing yaml response: {yaml_response[0]}" + ) + + if not isinstance(parsed_response, list): + raise YamlConversionError( + f"Error: Parsed response was not a list: {parsed_response}" + ) + + for elem in parsed_response: + if not isinstance(elem, str): + raise YamlConversionError( + f"Error: Parsed response contains non-string elements in list: {parsed_response}" + ) + if elem not in llm_response: + raise YamlConversionError( + f"Conversion introduced hallucinations. Original response:\n{llm_response}\nConverted response:\n{parsed_response}\nHallucination:\n{elem}" + ) + + return parsed_response + + async def _try_convert_yaml_list( + self, + response: str, + model: str, + yaml_conversion_prompt_template: str, + conversion_model_kwargs: dict, + expected_length: int, + ignore_conversion_failure: bool, + ): + try: + parsed_list = await self.convert_response_to_yaml_list( + response, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_list) != expected_length: + raise YamlConversionError( + f"Error: Length of parsed list {len(parsed_list)} does not match expected length {expected_length}: {parsed_list}" + ) + except YamlConversionError as e: + if ignore_conversion_failure: + return [] + else: + raise e + + return parsed_list + + async def _gather( + self, requests: List[Coroutine[Any, Any, List[str]]] + ) -> List[str]: + max_requests = self.max_concurrent_requests + if max_requests is None: + max_requests = len(requests) + + final_list = [] + for i in tqdm(range(0, len(requests), max_requests)): + request_slice = requests[i : i + max_requests] + result = await tqdm.gather(*request_slice) + final_list.extend(result) + + return final_list + + async def generate_macro_topics( + self, + n_macro_topics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of macro topics about the world + Args: + n_macro_topics: The number of macro topics to generate. + model: The name of the model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
+ """ + prompt_kwargs["n_macro_topics"] = n_macro_topics + macro_topics = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return macro_topics + + async def generate_subtopics( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of subtopics relating to a macro topic + Args: + macro_topic: The macro topic to generate subtopics for. + n_subtopics: The number of subtopics to generate per macro topic + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with the macro_topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_subtopics"] = n_subtopics + prompt_kwargs["macro_topic"] = macro_topic + subtopics_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return subtopics_response + + async def generate_open_qa_from_topic( + self, + topic: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of open Q&A questions based on a topic + Args: + topic: The topic to generate questions for. + n_openlines: The number of questions to generate per topic. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_subtopics passed in this function + - topic: Will be populated with the topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
+ """ + prompt_kwargs["topic"] = topic + prompt_kwargs["n_openlines"] = n_openlines + openline_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + + async def revise_open_qa( + self, + openline: str, + n_revisions: Union[str, int], + model: str, + prompt_template: str = DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to revise an open Q&A question a given number of times + Args: + openline: An openline to revise + n_revisions: The number of revisions to generate for the question. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - openline: Will be populated with the openline passed in this function + - n_revisions: Will be populated with the n_revisions passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["openline"] = openline + prompt_kwargs["n_revisions"] = n_revisions + revisions = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return revisions + + async def generate_writing_tasks( + self, + topic: str, + text_material_type: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of writing tasks based on a topic and document type + Args: + topic: The topic to generate writing tasks for. + text_material_type: The type of the document the question should ask to generate (e.g., "Email", "Poem") + n_openlines: The number of tasks to generate per topic and text material pair. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - topic: Will be populated with the topic passed in this function + - text_material_type: Will be populated with the text_material_type passed in this function + - n_openlines: Will be populated with the n_openlines passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
+ """ + prompt_kwargs["topic"] = topic + prompt_kwargs["text_material_type"] = text_material_type + prompt_kwargs["n_openlines"] = n_openlines + writing_tasks = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return writing_tasks + + async def revise_writing_tasks( + self, + openline: str, + n_revisions: Union[str, int], + model: str, + prompt_template: str = DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to revise a writing task a given number of times + Args: + openline: An openline to revise + n_revisions: The number of revisions to generate for the task. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - openline: Will be populated with the openline passed in this function + - n_revisions: Will be populated with the n_revisions passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["openline"] = openline + prompt_kwargs["n_revisions"] = n_revisions + revisions = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return revisions + + async def generate_closed_qa_instructions( + self, + document: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of closed Q&A questions based on a reference document + Args: + document: The document to use when generating questions + n_openlines: The number of questions to generate per document. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - document: Will be populated with the document passed in this function + - n_openlines: Will be populated with the n_openlines passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
+ """ + prompt_kwargs["document"] = document + prompt_kwargs["n_openlines"] = n_openlines + openline_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + + async def generate_math_macro_topics( + self, + n_macro_topics: Union[int, str], + school_level: str, + model: str, + prompt_template: str = DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of macro topics about math + Args: + n_macro_topics: The number of macro topics to generate. Can be an integer like 5 or a string like "five". + school_level: The school level the math questions should be targeted at. + model: The name of the model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + - school_level: Will be populated with the school_level passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_macro_topics"] = n_macro_topics + prompt_kwargs["school_level"] = school_level + macro_topics = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return macro_topics + + async def generate_math_subtopics( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of subtopics relating to a math macro topic + Args: + macro_topic: The macro topic to generate subtopics for. + n_subtopics: The number of subtopics to generate per macro topic + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with the macro_topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
+ """ + prompt_kwargs["n_subtopics"] = n_subtopics + prompt_kwargs["macro_topic"] = macro_topic + subtopics_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return subtopics_response + + async def classify_math_entity( + self, + entity: str, + model: str, + prompt_template: str = DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs={}, + ) -> List[str]: + """ + Prompts an LLM to classify if an entity is related to math + Args: + entity: The entity to classify + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - entity: Will be populated with the entity passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["entity"] = entity + classification_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return classification_response + + async def generate_math_problem( + self, + topic: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of math problems based on a topic + Args: + topic: The topic to generate problems for. + n_openlines: The number of problems to generate per topic. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_subtopics passed in this function + - topic: Will be populated with the topic passed in this function + Some example templates found in nemo_curator.synthetic include: + - MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE + - MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["topic"] = topic + prompt_kwargs["n_openlines"] = n_openlines + openline_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + + async def generate_python_macro_topics( + self, + n_macro_topics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of macro topics about the Python programming language + Args: + n_macro_topics: The number of macro topics to generate. 
Can be an integer like 5 or a string like "five". + model: The name of the model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_macro_topics"] = n_macro_topics + macro_topics = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return macro_topics + + async def generate_python_subtopics( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of subtopics relating to a Python macro topic + Args: + macro_topic: The macro topic to generate subtopics for. + n_subtopics: The number of subtopics to generate per macro topic + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with the macro_topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_subtopics"] = n_subtopics + prompt_kwargs["macro_topic"] = macro_topic + subtopics_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return subtopics_response + + async def classify_python_entity( + self, + entity: str, + model: str, + prompt_template: str = DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to classify if an entity is related to Python + Args: + entity: The entity to classify + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - entity: Will be populated with the entity passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
+ """ + prompt_kwargs["entity"] = entity + classification_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return classification_response + + async def generate_python_problem( + self, + topic: str, + n_openlines: Union[str, int], + model: str, + language="Python", + prompt_template: str = PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of coding problems based on a topic + Args: + topic: The topic to generate problems for. + n_openlines: The number of problems to generate per topic. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + language: The programming language to target when generating these questions. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_subtopics passed in this function + - topic: Will be populated with the topic passed in this function + - language: Will be populated with the language passed in this function + Some example templates found in nemo_curator.synthetic include: + - PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE + - PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE + - PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["topic"] = topic + prompt_kwargs["n_openlines"] = n_openlines + prompt_kwargs["language"] = language + openline_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + + async def generate_dialogue( + self, + openline: str, + user_model: str, + assistant_model: str, + n_user_turns: int = 3, + prompt_template: str = DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + user_model_kwargs: dict = {}, + assistant_model_kwargs: dict = {}, + ) -> List[dict]: + """ + Prompts an LLM to generate a dialogue based on a given openline. + The LLM will alternate impersonating the user and the assistant. + Args: + openline: The openline that will comprise the first user turn. + user_model: The model that will be impersonating the user. + Must be available in the LLMClient passed in the constructor. + assistant_model: The model that will be impersonating the assistant + Must be available in the LLMClient passed in the constructor. + n_user_turns: The number of user turns to go through. The openline counts as 1 user turn. + Therefore, if there are 3 user turns, 2 will be generated by the LLM impersonating the user. + prompt_template: A format string of the prompt to use when impersonating the user. + It must have the following parameters: + - converstation_history: Will be populated with a formatted history of the dialogue up to that point. 
+ Some example templates found in nemo_curator.synthetic include: + - DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + user_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the user. + assistant_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the assistant. + Returns: + A conversation between a User and Assistant + """ + conversation_history = [{"role": "user", "content": openline}] + first_assistant_response = await self.client.query_model( + messages=conversation_history, + model=assistant_model, + **assistant_model_kwargs, + ) + first_assistant_response = first_assistant_response[0] + conversation_history.append( + {"role": "assistant", "content": first_assistant_response} + ) + for _ in range(n_user_turns - 1): + user_response = await self._impersonate_user( + conversation_history=conversation_history, + model=user_model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=user_model_kwargs, + ) + conversation_history.append({"role": "user", "content": user_response}) + assistant_response = await self.client.query_model( + messages=conversation_history, + model=assistant_model, + **assistant_model_kwargs, + ) + assistant_response = assistant_response[0] + conversation_history.append( + {"role": "assistant", "content": assistant_response} + ) + + return conversation_history + + async def generate_two_turn_prompt( + self, + openline: str, + user_model: str, + assistant_model: str, + prompt_template: str = DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + user_model_kwargs: dict = {}, + assistant_model_kwargs: dict = {}, + ) -> List[dict]: + """ + Prompts an LLM to generate a response as an assistant, then as the user based on a given openline. + The conversation will look like "User -> Assistant -> User" + Args: + openline: The openline that will comprise the first user turn. + user_model: The model that will be impersonating the user. + Must be available in the LLMClient passed in the constructor. + assistant_model: The model that will be impersonating the assistant + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use when impersonating the user. + It must have the following parameters: + - converstation_history: Will be populated with a formatted history of the dialogue up to that point. + Some example templates found in nemo_curator.synthetic include: + - DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + user_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the user. + assistant_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the assistant. 
+ Returns: + A conversation between a User and Assistant + """ + conversation_history = [{"role": "user", "content": openline}] + first_assistant_response = await self.client.query_model( + messages=conversation_history, + model=assistant_model, + **assistant_model_kwargs, + ) + first_assistant_response = first_assistant_response[0] + conversation_history.append( + {"role": "assistant", "content": first_assistant_response} + ) + + user_response = await self._impersonate_user( + conversation_history=conversation_history, + model=user_model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=user_model_kwargs, + ) + conversation_history.append({"role": "user", "content": user_response}) + + return conversation_history + + async def _impersonate_user( + self, + conversation_history: List[dict], + model: str, + prompt_template: str, + prompt_kwargs: dict, + model_kwargs: dict, + ) -> str: + # Convert the conversation history to a string + history_str = "" + for turn in conversation_history: + history_str += f"{turn['role'].capitalize()}: {turn['content']}" + prompt_kwargs["conversation_history"] = history_str + response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return response[0] + + async def run_open_qa_pipeline( + self, + n_macro_topics: Union[str, int], + n_subtopics: Union[str, int], + n_openlines: Union[str, int], + n_revisions: Union[str, int], + model: str, + macro_topic_prompt_template: str = DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + subtopic_prompt_template: str = DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + open_qa_from_topics_prompt_template: str = DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + revise_open_qa_prompt_template: str = DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + additional_macro_topics: List[str] = [], + additional_subtopics: List[str] = [], + ignore_conversion_failure: bool = False, + combine_topics: bool = True, + ) -> List[str]: + """ + Runs a pipeline for automatically generating Open Q&A openlines for a dialogue + Args: + n_macro_topics: The number of macro topics to generate + n_subtopics: The number of subtopics to generate per macro topic + n_openlines: The number of questions to generate per topic. + n_revisions: The number of revisions to generate per original question. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + macro_topic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + No additional parameters may be passed to this prompt template. + subtopic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with a generated macro topic + No additional parameters may be passed to this prompt template. + open_qa_from_topics_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - topic: Will be populated with a generated topic + No additional parameters may be passed to this prompt template. 
+ revise_open_qa_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_revisions: Will be populated with the n_revisions passed in this function + - openline: Will be populated with a generated open Q&A openline + No additional parameters may be passed to this prompt template. + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + combine_topics: If True, mixes the macro topics with the subtopics when generating openlines. + If False, only the subtopics are used. + Returns: + A list of synthetically generated open Q&A prompts + """ + self.logger.info("Starting open q&a pipeline") + # Generate the macro topics + self.logger.info("Starting macro topic generation") + responses = await self.generate_macro_topics( + n_macro_topics=n_macro_topics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=macro_topic_prompt_template, + ) + macro_topics = await self.convert_response_to_yaml_list( + responses[0], + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(macro_topics) != n_macro_topics and not ignore_conversion_failure: + raise YamlConversionError( + f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" + ) + macro_topics.extend(additional_macro_topics) + self.logger.info("Finished macro topic generation") + + # Generate the subtopics + raw_topics = [ + self._generate_parse_subtopic( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + subtopic_prompt_template=subtopic_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for macro_topic in macro_topics + ] + self.logger.info("Starting subtopic generation") + raw_topics = await self._gather(raw_topics) + topic_list = [item for subtopics in raw_topics for item in subtopics] + topic_list.extend(additional_subtopics) + self.logger.info("Finished subtopic generation") + + # Mix the macro topics with the subtopics + if combine_topics: + topic_list.extend(macro_topics) + + # Generate the openlines + raw_lines = [ + self._generate_parse_openline( + subtopic=subtopic, + n_openlines=n_openlines, + model=model, + open_qa_from_topics_prompt_template=open_qa_from_topics_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for subtopic in topic_list + ] + self.logger.info("Starting openline generation") + raw_lines = await self._gather(raw_lines) + openlines = [item for lines in raw_lines for item in lines] + self.logger.info("Finished 
openline generation") + + # Revise the openlines + raw_revisions = [ + self._revise_parse_openline( + openline=openline, + n_revisions=n_revisions, + model=model, + revise_open_qa_prompt_template=revise_open_qa_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for openline in openlines + ] + self.logger.info("Starting openline revision") + raw_revisions = await self._gather(raw_revisions) + revised_openlines = [item for revisions in raw_revisions for item in revisions] + self.logger.info("Finished openline revision") + self.logger.info("Finished open q&a pipeline") + + return revised_openlines + + async def _generate_parse_subtopic( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + subtopic_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + subtopic = await self.generate_subtopics( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=subtopic_prompt_template, + ) + subtopic = subtopic[0] + return await self._try_convert_yaml_list( + subtopic, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_subtopics, + ignore_conversion_failure=ignore_conversion_failure, + ) + + async def _generate_parse_openline( + self, + subtopic: str, + n_openlines: Union[int, str], + model: str, + open_qa_from_topics_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + openline = await self.generate_open_qa_from_topic( + topic=subtopic, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=open_qa_from_topics_prompt_template, + ) + openline = openline[0] + return await self._try_convert_yaml_list( + openline, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_openlines, + ignore_conversion_failure=ignore_conversion_failure, + ) + + async def _revise_parse_openline( + self, + openline: str, + n_revisions: Union[int, str], + model: str, + revise_open_qa_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + revised_openline = await self.revise_open_qa( + openline=openline, + n_revisions=n_revisions, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=revise_open_qa_prompt_template, + ) + revised_openline = revised_openline[0] + return await self._try_convert_yaml_list( + revised_openline, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_revisions, + ignore_conversion_failure=ignore_conversion_failure, + ) + + async def run_writing_pipeline( + self, + topics: List[str], + text_material_types: List[str], + n_openlines: Union[str, int], + n_revisions: Union[str, int], + model: str, + writing_task_prompt_template: str = DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, + revise_writing_task_prompt_template: str = 
DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + ignore_conversion_failure: bool = False, + ) -> List[str]: + """ + Runs a pipeline for automatically generating writing task openlines for a dialogue + Args: + topics: A list of topics to generate tasks for + text_material_types: A list of writing material types, like "Essay" or "Blog post" + n_openlines: The number of tasks to generate per (topic, text_material_type) pair. + n_revisions: The number of revisions to generate per original task. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + writing_task_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - topic: Will be populated with one element of the topics list passed in this function + - text_material_type: Will be populated with one element of the text_material_types list passed in this function + No additional parameters may be passed to this prompt template. + revise_writing_task_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_revisions: Will be populated with the n_revisions passed in this function + - openline: Will be populated with one of the writing tasks generated in the pipeline. + No additional parameters may be passed to this prompt template. + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. 
+ ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + Returns: + A list of synthetically generated writing task prompts + """ + self.logger.info("Starting writing pipeline") + # Generate the tasks + raw_writing_tasks = [] + for topic in topics: + for material in text_material_types: + raw_writing_tasks.append( + self._generate_parse_writing_task( + topic=topic, + material=material, + n_openlines=n_openlines, + model=model, + writing_task_prompt_template=writing_task_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + ) + self.logger.info("Starting writing task generation") + raw_writing_tasks = await self._gather(raw_writing_tasks) + writing_tasks = [item for tasks in raw_writing_tasks for item in tasks] + self.logger.info("Finished writing task generation") + + # Revise the tasks + raw_revised_openlines = [ + self._revise_parse_writing_task( + task=task, + n_revisions=n_revisions, + model=model, + revise_writing_task_prompt_template=revise_writing_task_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for task in writing_tasks + ] + self.logger.info("Starting writing task revision") + raw_revised_openlines = await self._gather(raw_revised_openlines) + revised_openlines = [item for lines in raw_revised_openlines for item in lines] + self.logger.info("Finished writing task revision") + self.logger.info("Finished writing pipeline") + + return revised_openlines + + async def _generate_parse_writing_task( + self, + topic: str, + material: str, + n_openlines: Union[int, str], + model: str, + writing_task_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_tasks = await self.generate_writing_tasks( + topic=topic, + text_material_type=material, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=writing_task_prompt_template, + ) + raw_tasks = raw_tasks[0] + return await self._try_convert_yaml_list( + raw_tasks, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_openlines, + ignore_conversion_failure=ignore_conversion_failure, + ) + + async def _revise_parse_writing_task( + self, + task: str, + n_revisions: Union[int, str], + model: str, + revise_writing_task_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_revision = await self.revise_writing_tasks( + openline=task, + n_revisions=n_revisions, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=revise_writing_task_prompt_template, + ) + raw_revision = raw_revision[0] + return await self._try_convert_yaml_list( + raw_revision, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_revisions, + ignore_conversion_failure=ignore_conversion_failure, + ) + + async def run_closed_qa_pipeline( + self, + 
documents: List[str], + n_openlines: Union[str, int], + model: str, + closed_qa_prompt_template: str = DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + ignore_conversion_failure: bool = False, + ) -> List[Tuple[int, str]]: + """ + Runs a pipeline for automatically generating closed Q&A openlines for a dialogue + Args: + documents: A list of documents to generate closed Q&A questions for + n_openlines: The number of questions to generate per document. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + closed_qa_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - document: Will be populated with one element of the documents list passed in this function + No additional parameters may be passed to this prompt template. + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + Returns: + A list of pairs where the first element represents the index of the document used to generate the question in the documents list + and the second element represents a synthetically generated closed Q&A prompt. Example: [(0, "Summarize this document"), ...] 
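+            Usage (an illustrative sketch; `generator` stands for an already-constructed instance
+            of this class and "model-name" is a placeholder for any model available in its LLMClient):
+                pairs = await generator.run_closed_qa_pipeline(
+                    documents=["NeMo Curator is a library for scalable data curation."],
+                    n_openlines=5,
+                    model="model-name",
+                    ignore_conversion_failure=True,
+                )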
+ """ + self.logger.info("Starting closed q&a pipeline") + raw_qa = [ + self._generate_parse_closed_qa( + document_id=i, + document=document, + n_openlines=n_openlines, + model=model, + closed_qa_prompt_template=closed_qa_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for i, document in enumerate(documents) + ] + raw_qa = await self._gather(raw_qa) + document_openline_pairs = [item for lines in raw_qa for item in lines] + self.logger.info("Finished closed q&a pipeline") + + return document_openline_pairs + + async def _generate_parse_closed_qa( + self, + document_id: int, + document: str, + n_openlines: Union[int, str], + model: str, + closed_qa_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_instruction = await self.generate_closed_qa_instructions( + document=document, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=closed_qa_prompt_template, + ) + raw_instruction = raw_instruction[0] + parsed_instructions = await self._try_convert_yaml_list( + raw_instruction, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_openlines, + ignore_conversion_failure=ignore_conversion_failure, + ) + + return [(document_id, inst) for inst in parsed_instructions] + + async def run_math_pipeline( + self, + n_macro_topics: Union[str, int], + school_level: str, + n_subtopics: Union[str, int], + n_openlines: Union[str, int], + model: str, + macro_topic_prompt_template: str = DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + subtopic_prompt_template: str = DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, + math_problem_prompt_template: str = MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + additional_macro_topics: List[str] = [], + additional_subtopics: List[str] = [], + ignore_conversion_failure: bool = False, + combine_topics: bool = True, + ) -> List[str]: + """ + Runs a pipeline for automatically generating math questions for a dialogue + Args: + n_macro_topics: The number of macro topics to generate. + school_level: The school level to target when generating macro topics. + n_subtopics: The number of subtopics to generate per macro topic. + n_openlines: The number of questions to generate per topic. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + macro_topic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + - school_level: Will be populated with the school_level passed in this function + No additional parameters may be passed to this prompt template. + subtopic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with a generated macro topic + No additional parameters may be passed to this prompt template. 
+ math_problem_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - topic: Will be populated with a generated topic + No additional parameters may be passed to this prompt template. + Some example templates found in nemo_curator.synthetic include: + - MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE + - MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + combine_topics: If True, mixes the macro topics with the subtopics when generating openlines. + If False, only the subtopics are used. + Returns: + A list of synthetically generated math prompts + """ + self.logger.info("Starting math pipeline") + # Generate the macro topics + self.logger.info("Starting math macro topic generation") + responses = await self.generate_math_macro_topics( + n_macro_topics=n_macro_topics, + school_level=school_level, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=macro_topic_prompt_template, + ) + macro_topics = await self.convert_response_to_yaml_list( + responses[0], + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(macro_topics) != n_macro_topics and not ignore_conversion_failure: + raise YamlConversionError( + f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" + ) + macro_topics.extend(additional_macro_topics) + self.logger.info("Finished math macro topic generation") + + # Generate the subtopics + raw_topics = [ + self._generate_parse_math_subtopic( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + subtopic_prompt_template=subtopic_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for macro_topic in macro_topics + ] + self.logger.info("Starting math subtopic generation") + raw_topics = await self._gather(raw_topics) + topic_list = [item for subtopics in raw_topics for item in subtopics] + topic_list.extend(additional_subtopics) + self.logger.info("Finished math subtopic generation") + + # Mix the macro topics with the subtopics + if combine_topics: + topic_list.extend(macro_topics) + + # Generate the openlines + raw_lines = [ + self._generate_parse_math_openline( + subtopic=subtopic, + n_openlines=n_openlines, + model=model, + math_problem_prompt_template=math_problem_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for subtopic in topic_list + ] + 
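+        # Resolve every per-topic openline request concurrently, then flatten the
+        # per-topic lists into a single list of math prompts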
self.logger.info("Starting math openline generation") + raw_lines = await self._gather(raw_lines) + openlines = [item for lines in raw_lines for item in lines] + self.logger.info("Finished math openline generation") + self.logger.info("Finished math pipeline") + + return openlines + + async def _generate_parse_math_subtopic( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + subtopic_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_topic = await self.generate_math_subtopics( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=subtopic_prompt_template, + ) + raw_topic = raw_topic[0] + return await self._try_convert_yaml_list( + raw_topic, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_subtopics, + ignore_conversion_failure=ignore_conversion_failure, + ) + + async def _generate_parse_math_openline( + self, + subtopic: str, + n_openlines: Union[int, str], + model: str, + math_problem_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_line = await self.generate_math_problem( + topic=subtopic, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=math_problem_prompt_template, + ) + raw_line = raw_line[0] + return await self._try_convert_yaml_list( + raw_line, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_openlines, + ignore_conversion_failure=ignore_conversion_failure, + ) + + async def run_python_pipeline( + self, + n_macro_topics: Union[str, int], + n_subtopics: Union[str, int], + n_openlines: Union[str, int], + model: str, + macro_topic_prompt_template: str = DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + subtopic_prompt_template: str = DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, + python_problem_prompt_template: str = PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + additional_macro_topics: List[str] = [], + additional_subtopics: List[str] = [], + ignore_conversion_failure: bool = False, + combine_topics: bool = True, + ) -> List[str]: + """ + Runs a pipeline for automatically generating Python questions for a dialogue + Args: + n_macro_topics: The number of macro topics to generate. + n_subtopics: The number of subtopics to generate per macro topic. + n_openlines: The number of questions to generate per topic. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + macro_topic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + No additional parameters may be passed to this prompt template. + subtopic_prompt_template: A format string of the prompt to use. 
It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with a generated macro topic + No additional parameters may be passed to this prompt template. + python_problem_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - language: Will be populated with "Python" + - topic: Will be populated with a generated topic + No additional parameters may be passed to this prompt template. + Some example templates found in nemo_curator.synthetic include: + - PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE + - PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE + - PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + combine_topics: If True, mixes the macro topics with the subtopics when generating openlines. + If False, only the subtopics are used. + Returns: + A list of synthetically generated Python prompts + """ + self.logger.info("Starting python pipeline") + # Generate the macro topics + self.logger.info("Starting python macro topic generation") + responses = await self.generate_python_macro_topics( + n_macro_topics=n_macro_topics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=macro_topic_prompt_template, + ) + macro_topics = await self.convert_response_to_yaml_list( + responses[0], + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(macro_topics) != n_macro_topics and not ignore_conversion_failure: + raise YamlConversionError( + f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" + ) + macro_topics.extend(additional_macro_topics) + self.logger.info("Finished python macro topic generation") + + # Generate the subtopics + raw_topics = [ + self._generate_parse_python_subtopic( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + subtopic_prompt_template=subtopic_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for macro_topic in macro_topics + ] + self.logger.info("Starting python subtopic generation") + raw_topics = await self._gather(raw_topics) + topic_list = [item for subtopics in raw_topics for item in subtopics] + topic_list.extend(additional_subtopics) + self.logger.info("Finished python subtopic generation") + + # Mix the macro topics with the subtopics + if combine_topics: + topic_list.extend(macro_topics) + + # Generate the openlines + raw_lines = [ + self._generate_parse_python_openline( + subtopic=subtopic, + n_openlines=n_openlines, 
+ model=model, + python_problem_prompt_template=python_problem_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for subtopic in topic_list + ] + self.logger.info("Starting python openline generation") + raw_lines = await self._gather(raw_lines) + openlines = [item for lines in raw_lines for item in lines] + self.logger.info("Finished python openline generation") + self.logger.info("Finished python pipeline") + + return openlines + + async def _generate_parse_python_subtopic( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + subtopic_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_topic = await self.generate_python_subtopics( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=subtopic_prompt_template, + ) + raw_topic = raw_topic[0] + return await self._try_convert_yaml_list( + raw_topic, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_subtopics, + ignore_conversion_failure=ignore_conversion_failure, + ) + + async def _generate_parse_python_openline( + self, + subtopic: str, + n_openlines: Union[int, str], + model: str, + python_problem_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_line = await self.generate_python_problem( + topic=subtopic, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=python_problem_prompt_template, + ) + raw_line = raw_line[0] + return await self._try_convert_yaml_list( + raw_line, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_openlines, + ignore_conversion_failure=ignore_conversion_failure, + ) diff --git a/nemo_curator/synthetic/error.py b/nemo_curator/synthetic/error.py new file mode 100644 index 000000000..b89792eaa --- /dev/null +++ b/nemo_curator/synthetic/error.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
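+"""Exception types raised by NeMo Curator's synthetic data generation utilities."""
+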
+class YamlConversionError(Exception): + def __init__(self, message): + self.message = message + super().__init__(self.message) + + def __str__(self): + return self.message diff --git a/nemo_curator/synthetic/mixtral.py b/nemo_curator/synthetic/mixtral.py new file mode 100644 index 000000000..71a6f372e --- /dev/null +++ b/nemo_curator/synthetic/mixtral.py @@ -0,0 +1,38 @@ +from typing import List + +from nemo_curator.services.conversation_formatter import ConversationFormatter + + +class Mixtral8x7BFormatter(ConversationFormatter): + + PROMPT_PREFIX = " [INST] \n" + + @staticmethod + def format_conversation(conv: List[dict]) -> str: + """ + Formats a conversation between a user and an assistant in the Mixtral-8x7B format + described here: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 + Args: + conv: A conversation between a user and assistant + Returns: + A conversation formatted as text + """ + prompt = Mixtral8x7BFormatter.PROMPT_PREFIX + + for i, turn in enumerate(conv): + user_turn = i % 2 == 0 + + if user_turn: + if turn["role"] != "user": + raise ValueError( + f"Conversation turn {i} is not 'user'. All even-numbered turns should be." + ) + prompt += turn["content"] + " [/INST]" + else: + if turn["role"] != "assistant": + raise ValueError( + f"Conversation turn {i} is not 'assistant'. All odd-numbered turns should be." + ) + prompt += turn["content"] + "[INST] " + + return prompt diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py new file mode 100644 index 000000000..fedf7c192 --- /dev/null +++ b/nemo_curator/synthetic/nemotron.py @@ -0,0 +1,1472 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+from typing import List, Tuple, Union + +import yaml + +from nemo_curator.services.conversation_formatter import ConversationFormatter +from nemo_curator.services.model_client import LLMClient +from nemo_curator.synthetic.error import YamlConversionError +from nemo_curator.synthetic.prompts import ( + DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, + DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE, + DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE, + DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, + DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, + DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, + MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, +) + + +class NemotronGenerator: + """ + Provides a collection of methods for generating synthetic data + described in the Nemotron-4 340B Technical Report + (https://arxiv.org/abs/2406.11704v1) and inspired by the + UltraChat paper (https://arxiv.org/abs/2305.14233) + """ + + def __init__(self, llm_client: LLMClient) -> None: + self.client = llm_client + + def _prompt( + self, model: str, prompt_template: str, prompt_kwargs: dict, model_kwargs: dict + ) -> List[str]: + prompt = prompt_template.format(**prompt_kwargs) + messages = [{"role": "user", "content": prompt}] + + return self.client.query_model(messages=messages, model=model, **model_kwargs) + + def convert_response_to_yaml_list( + self, + llm_response: str, + model: str, + prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Converts a response of an LLM to a list of strings by querying an LLM + Args: + llm_response: The original unformatted response of the LLM + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have a {llm_response} + parameter that will be populated with the llm_response value passed in this function. + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A parsed list of elements from the original LLM response + """ + prompt_kwargs["llm_response"] = llm_response + yaml_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + try: + parsed_response = yaml.safe_load(yaml_response[0]) + except yaml.error.YAMLError as _: + raise YamlConversionError( + f"Error parsing yaml response: {yaml_response[0]}" + ) + + if not isinstance(parsed_response, list): + raise YamlConversionError( + f"Error: Parsed response was not a list: {parsed_response}" + ) + + for elem in parsed_response: + if not isinstance(elem, str): + raise YamlConversionError( + f"Error: Parsed response contains non-string elements in list: {parsed_response}" + ) + if elem not in llm_response: + raise YamlConversionError( + f"Conversion introduced hallucinations. 
Original response:\n{llm_response}\nConverted response:\n{parsed_response}\nHallucination:\n{elem}" + ) + + return parsed_response + + def generate_macro_topics( + self, + n_macro_topics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of macro topics about the world + Args: + n_macro_topics: The number of macro topics to generate. + model: The name of the model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_macro_topics"] = n_macro_topics + macro_topics = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return macro_topics + + def generate_subtopics( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of subtopics relating to a macro topic + Args: + macro_topic: The macro topic to generate subtopics for. + n_subtopics: The number of subtopics to generate per macro topic + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with the macro_topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_subtopics"] = n_subtopics + prompt_kwargs["macro_topic"] = macro_topic + subtopics_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return subtopics_response + + def generate_open_qa_from_topic( + self, + topic: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of open Q&A questions based on a topic + Args: + topic: The topic to generate questions for. + n_openlines: The number of questions to generate per topic. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. 
It must have the following parameters: + - n_openlines: Will be populated with the n_subtopics passed in this function + - topic: Will be populated with the topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["topic"] = topic + prompt_kwargs["n_openlines"] = n_openlines + openline_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + + def revise_open_qa( + self, + openline: str, + n_revisions: Union[str, int], + model: str, + prompt_template: str = DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to revise an open Q&A question a given number of times + Args: + openline: An openline to revise + n_revisions: The number of revisions to generate for the question. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - openline: Will be populated with the openline passed in this function + - n_revisions: Will be populated with the n_revisions passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["openline"] = openline + prompt_kwargs["n_revisions"] = n_revisions + revisions = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return revisions + + def generate_writing_tasks( + self, + topic: str, + text_material_type: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of writing tasks based on a topic and document type + Args: + topic: The topic to generate writing tasks for. + text_material_type: The type of the document the question should ask to generate (e.g., "Email", "Poem") + n_openlines: The number of tasks to generate per topic and text material pair. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - topic: Will be populated with the topic passed in this function + - text_material_type: Will be populated with the text_material_type passed in this function + - n_openlines: Will be populated with the n_openlines passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. 
+ model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["topic"] = topic + prompt_kwargs["text_material_type"] = text_material_type + prompt_kwargs["n_openlines"] = n_openlines + writing_tasks = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return writing_tasks + + def revise_writing_tasks( + self, + openline: str, + n_revisions: Union[str, int], + model: str, + prompt_template: str = DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to revise a writing task a given number of times + Args: + openline: An openline to revise + n_revisions: The number of revisions to generate for the task. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - openline: Will be populated with the openline passed in this function + - n_revisions: Will be populated with the n_revisions passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["openline"] = openline + prompt_kwargs["n_revisions"] = n_revisions + revisions = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return revisions + + def generate_closed_qa_instructions( + self, + document: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of closed Q&A questions based on a reference document + Args: + document: The document to use when generating questions + n_openlines: The number of questions to generate per document. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - document: Will be populated with the document passed in this function + - n_openlines: Will be populated with the n_openlines passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
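+            Usage (an illustrative sketch; `generator` is a NemotronGenerator built with an LLMClient
+            and "model-name" is a placeholder for any model available in that client):
+                responses = generator.generate_closed_qa_instructions(
+                    document="The Eiffel Tower was completed in 1889.",
+                    n_openlines=5,
+                    model="model-name",
+                )
+                questions = generator.convert_response_to_yaml_list(
+                    responses[0], model="model-name"
+                )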
+ """ + prompt_kwargs["document"] = document + prompt_kwargs["n_openlines"] = n_openlines + openline_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + + def generate_math_macro_topics( + self, + n_macro_topics: Union[int, str], + school_level: str, + model: str, + prompt_template: str = DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of macro topics about math + Args: + n_macro_topics: The number of macro topics to generate. Can be an integer like 5 or a string like "five". + school_level: The school level the math questions should be targeted at. + model: The name of the model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + - school_level: Will be populated with the school_level passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_macro_topics"] = n_macro_topics + prompt_kwargs["school_level"] = school_level + macro_topics = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return macro_topics + + def generate_math_subtopics( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of subtopics relating to a math macro topic + Args: + macro_topic: The macro topic to generate subtopics for. + n_subtopics: The number of subtopics to generate per macro topic + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with the macro_topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
+ """ + prompt_kwargs["n_subtopics"] = n_subtopics + prompt_kwargs["macro_topic"] = macro_topic + subtopics_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return subtopics_response + + def classify_math_entity( + self, + entity: str, + model: str, + prompt_template: str = DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs={}, + ) -> List[str]: + """ + Prompts an LLM to classify if an entity is related to math + Args: + entity: The entity to classify + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - entity: Will be populated with the entity passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["entity"] = entity + classification_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return classification_response + + def generate_math_problem( + self, + topic: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of math problems based on a topic + Args: + topic: The topic to generate problems for. + n_openlines: The number of problems to generate per topic. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_subtopics passed in this function + - topic: Will be populated with the topic passed in this function + Some example templates found in nemo_curator.synthetic include: + - MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE + - MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["topic"] = topic + prompt_kwargs["n_openlines"] = n_openlines + openline_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + + def generate_python_macro_topics( + self, + n_macro_topics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of macro topics about the Python programming language + Args: + n_macro_topics: The number of macro topics to generate. Can be an integer like 5 or a string like "five". 
+ model: The name of the model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_macro_topics"] = n_macro_topics + macro_topics = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return macro_topics + + def generate_python_subtopics( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of subtopics relating to a Python macro topic + Args: + macro_topic: The macro topic to generate subtopics for. + n_subtopics: The number of subtopics to generate per macro topic + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with the macro_topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_subtopics"] = n_subtopics + prompt_kwargs["macro_topic"] = macro_topic + subtopics_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return subtopics_response + + def classify_python_entity( + self, + entity: str, + model: str, + prompt_template: str = DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to classify if an entity is related to Python + Args: + entity: The entity to classify + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - entity: Will be populated with the entity passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
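+            Usage (an illustrative sketch; `generator` is a NemotronGenerator and "model-name" is a
+            placeholder model id):
+                responses = generator.classify_python_entity(
+                    entity="List comprehensions", model="model-name"
+                )
+                # responses[0] holds the model's raw judgement on whether the entity relates to Python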
+ """ + prompt_kwargs["entity"] = entity + classification_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return classification_response + + def generate_python_problem( + self, + topic: str, + n_openlines: Union[str, int], + model: str, + language="Python", + prompt_template: str = PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of coding problems based on a topic + Args: + topic: The topic to generate problems for. + n_openlines: The number of problems to generate per topic. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + language: The programming language to target when generating these questions. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_subtopics passed in this function + - topic: Will be populated with the topic passed in this function + - language: Will be populated with the language passed in this function + Some example templates found in nemo_curator.synthetic include: + - PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE + - PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE + - PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["topic"] = topic + prompt_kwargs["n_openlines"] = n_openlines + prompt_kwargs["language"] = language + openline_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + + def generate_dialogue( + self, + openline: str, + user_model: str, + assistant_model: str, + n_user_turns: int = 3, + prompt_template: str = DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + user_model_kwargs: dict = {}, + assistant_model_kwargs: dict = {}, + ) -> List[dict]: + """ + Prompts an LLM to generate a dialogue based on a given openline. + The LLM will alternate impersonating the user and the assistant. + Args: + openline: The openline that will comprise the first user turn. + user_model: The model that will be impersonating the user. + Must be available in the LLMClient passed in the constructor. + assistant_model: The model that will be impersonating the assistant + Must be available in the LLMClient passed in the constructor. + n_user_turns: The number of user turns to go through. The openline counts as 1 user turn. + Therefore, if there are 3 user turns, 2 will be generated by the LLM impersonating the user. + prompt_template: A format string of the prompt to use when impersonating the user. + It must have the following parameters: + - converstation_history: Will be populated with a formatted history of the dialogue up to that point. 
+ Some example templates found in nemo_curator.synthetic include: + - DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + user_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the user. + assistant_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the assistant. + Returns: + A conversation between a User and Assistant + """ + conversation_history = [{"role": "user", "content": openline}] + first_assistant_response = self.client.query_model( + messages=conversation_history, + model=assistant_model, + **assistant_model_kwargs, + )[0] + conversation_history.append( + {"role": "assistant", "content": first_assistant_response} + ) + for _ in range(n_user_turns - 1): + user_response = self._impersonate_user( + conversation_history=conversation_history, + model=user_model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=user_model_kwargs, + ) + conversation_history.append({"role": "user", "content": user_response}) + assistant_response = self.client.query_model( + messages=conversation_history, + model=assistant_model, + **assistant_model_kwargs, + )[0] + conversation_history.append( + {"role": "assistant", "content": assistant_response} + ) + + return conversation_history + + def generate_two_turn_prompt( + self, + openline: str, + user_model: str, + assistant_model: str, + prompt_template: str = DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + user_model_kwargs: dict = {}, + assistant_model_kwargs: dict = {}, + ) -> List[dict]: + """ + Prompts an LLM to generate a response as an assistant, then as the user based on a given openline. + The conversation will look like "User -> Assistant -> User" + Args: + openline: The openline that will comprise the first user turn. + user_model: The model that will be impersonating the user. + Must be available in the LLMClient passed in the constructor. + assistant_model: The model that will be impersonating the assistant + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use when impersonating the user. + It must have the following parameters: + - converstation_history: Will be populated with a formatted history of the dialogue up to that point. + Some example templates found in nemo_curator.synthetic include: + - DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + user_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the user. + assistant_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the assistant. 
+ Returns: + A conversation between a User and Assistant + """ + conversation_history = [{"role": "user", "content": openline}] + first_assistant_response = self.client.query_model( + messages=conversation_history, + model=assistant_model, + **assistant_model_kwargs, + )[0] + conversation_history.append( + {"role": "assistant", "content": first_assistant_response} + ) + + user_response = self._impersonate_user( + conversation_history=conversation_history, + model=user_model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=user_model_kwargs, + ) + conversation_history.append({"role": "user", "content": user_response}) + + return conversation_history + + def _impersonate_user( + self, + conversation_history: List[dict], + model: str, + prompt_template: str, + prompt_kwargs: dict, + model_kwargs: dict, + ) -> str: + # Convert the conversation history to a string + history_str = "" + for turn in conversation_history: + history_str += f"{turn['role'].capitalize()}: {turn['content']}" + prompt_kwargs["conversation_history"] = history_str + response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return response[0] + + def run_open_qa_pipeline( + self, + n_macro_topics: Union[str, int], + n_subtopics: Union[str, int], + n_openlines: Union[str, int], + n_revisions: Union[str, int], + model: str, + macro_topic_prompt_template: str = DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + subtopic_prompt_template: str = DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + open_qa_from_topics_prompt_template: str = DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + revise_open_qa_prompt_template: str = DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + additional_macro_topics: List[str] = [], + additional_subtopics: List[str] = [], + ignore_conversion_failure: bool = False, + combine_topics: bool = True, + ) -> List[str]: + """ + Runs a pipeline for automatically generating Open Q&A openlines for a dialogue + Args: + n_macro_topics: The number of macro topics to generate + n_subtopics: The number of subtopics to generate per macro topic + n_openlines: The number of questions to generate per topic. + n_revisions: The number of revisions to generate per original question. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + macro_topic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + No additional parameters may be passed to this prompt template. + subtopic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with a generated macro topic + No additional parameters may be passed to this prompt template. + open_qa_from_topics_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - topic: Will be populated with a generated topic + No additional parameters may be passed to this prompt template. + revise_open_qa_prompt_template: A format string of the prompt to use. 
It must have the following parameters: + - n_revisions: Will be populated with the n_revisions passed in this function + - openline: Will be populated with a generated open Q&A openline + No additional parameters may be passed to this prompt template. + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + combine_topics: If True, mixes the macro topics with the subtopics when generating openlines. + If False, only the subtopics are used. + Returns: + A list of synthetically generated open Q&A prompts + """ + # Generate the macro topics + responses = self.generate_macro_topics( + n_macro_topics=n_macro_topics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=macro_topic_prompt_template, + ) + macro_topics = self.convert_response_to_yaml_list( + responses[0], + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(macro_topics) != n_macro_topics and not ignore_conversion_failure: + raise YamlConversionError( + f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" + ) + macro_topics.extend(additional_macro_topics) + + # Generate the subtopics + raw_topics = [ + self.generate_subtopics( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=subtopic_prompt_template, + )[0] + for macro_topic in macro_topics + ] + topic_list = [] + for topic in raw_topics: + try: + parsed_topics = self.convert_response_to_yaml_list( + topic, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_topics) != n_subtopics: + raise YamlConversionError( + f"Error: Length of subtopics {len(parsed_topics)} does not match desired n_subtopics {n_subtopics}: {parsed_topics}" + ) + topic_list.extend(parsed_topics) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + topic_list.extend(additional_subtopics) + + # Mix the macro topics with the subtopics + if combine_topics: + topic_list.extend(macro_topics) + + # Generate the openlines + raw_lines = [ + self.generate_open_qa_from_topic( + topic=t, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=open_qa_from_topics_prompt_template, + )[0] + for t in topic_list + ] + openlines = [] + for line in raw_lines: + try: + parsed_line = self.convert_response_to_yaml_list( + line, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_line) != n_openlines: + raise YamlConversionError( + f"Error: Length of openlines {len(parsed_line)} does not match desired n_openlines {n_openlines}: {parsed_line}" + ) + openlines.extend(parsed_line) + except YamlConversionError as e: + if 
ignore_conversion_failure: + continue + else: + raise e + + # Revise the openlines + raw_revisions = [ + self.revise_open_qa( + openline=line, + n_revisions=n_revisions, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=revise_open_qa_prompt_template, + )[0] + for line in openlines + ] + revised_openlines = [] + for line in raw_revisions: + try: + parsed_revision = self.convert_response_to_yaml_list( + line, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_revision) != n_revisions: + raise YamlConversionError( + f"Error: Length of revisions {len(parsed_revision)} does not match desired n_revisions {n_revisions}: {parsed_revision}" + ) + revised_openlines.extend(parsed_revision) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + return revised_openlines + + def run_writing_pipeline( + self, + topics: List[str], + text_material_types: List[str], + n_openlines: Union[str, int], + n_revisions: Union[str, int], + model: str, + writing_task_prompt_template: str = DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, + revise_writing_task_prompt_template: str = DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + ignore_conversion_failure: bool = False, + ) -> List[str]: + """ + Runs a pipeline for automatically generating writing task openlines for a dialogue + Args: + topics: A list of topics to generate tasks for + text_material_types: A list of writing material types, like "Essay" or "Blog post" + n_openlines: The number of tasks to generate per (topic, text_material_type) pair. + n_revisions: The number of revisions to generate per original task. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + writing_task_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - topic: Will be populated with one element of the topics list passed in this function + - text_material_type: Will be populated with one element of the text_material_types list passed in this function + No additional parameters may be passed to this prompt template. + revise_writing_task_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_revisions: Will be populated with the n_revisions passed in this function + - openline: Will be populated with one of the writing tasks generated in the pipeline. + No additional parameters may be passed to this prompt template. + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. 
+ ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + Returns: + A list of synthetically generated writing task prompts + """ + # Generate the tasks + writing_tasks = [] + for topic in topics: + for material in text_material_types: + raw_tasks = self.generate_writing_tasks( + topic=topic, + text_material_type=material, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=writing_task_prompt_template, + )[0] + try: + parsed_tasks = self.convert_response_to_yaml_list( + raw_tasks, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_tasks) != n_openlines: + raise YamlConversionError( + f"Error: Length of writing tasks {len(parsed_tasks)} does not match desired n_openlines {n_openlines}: {parsed_tasks}" + ) + writing_tasks.extend(parsed_tasks) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + # Revise the tasks + raw_revisions = [ + self.revise_writing_tasks( + openline=line, + n_revisions=n_revisions, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=revise_writing_task_prompt_template, + )[0] + for line in writing_tasks + ] + revised_openlines = [] + for line in raw_revisions: + try: + parsed_revision = self.convert_response_to_yaml_list( + line, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_revision) != n_revisions: + raise YamlConversionError( + f"Error: Length of revisions {len(parsed_revision)} does not match desired n_revisions {n_revisions}: {parsed_revision}" + ) + revised_openlines.extend(parsed_revision) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + return revised_openlines + + def run_closed_qa_pipeline( + self, + documents: List[str], + n_openlines: Union[str, int], + model: str, + closed_qa_prompt_template: str = DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + ignore_conversion_failure: bool = False, + ) -> List[Tuple[int, str]]: + """ + Runs a pipeline for automatically generating closed Q&A openlines for a dialogue + Args: + documents: A list of documents to generate closed Q&A questions for + n_openlines: The number of questions to generate per document. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + closed_qa_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - document: Will be populated with one element of the documents list passed in this function + No additional parameters may be passed to this prompt template. + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. 
+ conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + Returns: + A list of pairs where the first element represents the index of the document used to generate the question in the documents list + and the second element represents a synthetically generated closed Q&A prompt. Example: [(0, "Summarize this document"), ...] + """ + raw_instructions = [ + self.generate_closed_qa_instructions( + document=document, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=closed_qa_prompt_template, + )[0] + for document in documents + ] + document_openline_pairs = [] + for i, instruction in enumerate(raw_instructions): + try: + parsed_instructions = self.convert_response_to_yaml_list( + instruction, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_instructions) != n_openlines: + raise YamlConversionError( + f"Error: Length of openlines {len(parsed_instructions)} does not match desired n_openlines {n_openlines}: {parsed_instructions}" + ) + document_openline_pairs.extend( + [(i, inst) for inst in parsed_instructions] + ) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + return document_openline_pairs + + def run_math_pipeline( + self, + n_macro_topics: Union[str, int], + school_level: str, + n_subtopics: Union[str, int], + n_openlines: Union[str, int], + model: str, + macro_topic_prompt_template: str = DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + subtopic_prompt_template: str = DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, + math_problem_prompt_template: str = MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + additional_macro_topics: List[str] = [], + additional_subtopics: List[str] = [], + ignore_conversion_failure: bool = False, + combine_topics: bool = True, + ) -> List[str]: + """ + Runs a pipeline for automatically generating math questions for a dialogue + Args: + n_macro_topics: The number of macro topics to generate. + school_level: The school level to target when generating macro topics. + n_subtopics: The number of subtopics to generate per macro topic. + n_openlines: The number of questions to generate per topic. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + macro_topic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + - school_level: Will be populated with the school_level passed in this function + No additional parameters may be passed to this prompt template. + subtopic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with a generated macro topic + No additional parameters may be passed to this prompt template. + math_problem_prompt_template: A format string of the prompt to use. 
It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - topic: Will be populated with a generated topic + No additional parameters may be passed to this prompt template. + Some example templates found in nemo_curator.synthetic include: + - MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE + - MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + combine_topics: If True, mixes the macro topics with the subtopics when generating openlines. + If False, only the subtopics are used. + Returns: + A list of synthetically generated math prompts + """ + # Generate the macro topics + responses = self.generate_math_macro_topics( + n_macro_topics=n_macro_topics, + school_level=school_level, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=macro_topic_prompt_template, + ) + macro_topics = self.convert_response_to_yaml_list( + responses[0], + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(macro_topics) != n_macro_topics and not ignore_conversion_failure: + raise YamlConversionError( + f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" + ) + macro_topics.extend(additional_macro_topics) + + # Generate the subtopics + raw_topics = [ + self.generate_math_subtopics( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=subtopic_prompt_template, + )[0] + for macro_topic in macro_topics + ] + topic_list = [] + for topic in raw_topics: + try: + parsed_topics = self.convert_response_to_yaml_list( + topic, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_topics) != n_subtopics: + raise YamlConversionError( + f"Error: Length of subtopics {len(parsed_topics)} does not match desired n_subtopics {n_subtopics}: {parsed_topics}" + ) + topic_list.extend(parsed_topics) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + topic_list.extend(additional_subtopics) + + # Mix the macro topics with the subtopics + if combine_topics: + topic_list.extend(macro_topics) + + # Generate the openlines + raw_lines = [ + self.generate_math_problem( + topic=t, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=math_problem_prompt_template, + )[0] + for t in topic_list + ] + openlines = [] + for line in raw_lines: + try: + parsed_line = self.convert_response_to_yaml_list( + line, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_line) != n_openlines: + raise YamlConversionError( + f"Error: Length of openlines {len(parsed_line)} does 
not match desired n_openlines {n_openlines}: {parsed_line}" + ) + openlines.extend(parsed_line) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + return openlines + + def run_python_pipeline( + self, + n_macro_topics: Union[str, int], + n_subtopics: Union[str, int], + n_openlines: Union[str, int], + model: str, + macro_topic_prompt_template: str = DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + subtopic_prompt_template: str = DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, + python_problem_prompt_template: str = PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + additional_macro_topics: List[str] = [], + additional_subtopics: List[str] = [], + ignore_conversion_failure: bool = False, + combine_topics: bool = True, + ) -> List[str]: + """ + Runs a pipeline for automatically generating Python questions for a dialogue + Args: + n_macro_topics: The number of macro topics to generate. + n_subtopics: The number of subtopics to generate per macro topic. + n_openlines: The number of questions to generate per topic. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + macro_topic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + No additional parameters may be passed to this prompt template. + subtopic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with a generated macro topic + No additional parameters may be passed to this prompt template. + python_problem_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - language: Will be populated with "Python" + - topic: Will be populated with a generated topic + No additional parameters may be passed to this prompt template. + Some example templates found in nemo_curator.synthetic include: + - PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE + - PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE + - PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + combine_topics: If True, mixes the macro topics with the subtopics when generating openlines. + If False, only the subtopics are used. 
+ Returns: + A list of synthetically generated Python prompts + """ + # Generate the macro topics + responses = self.generate_python_macro_topics( + n_macro_topics=n_macro_topics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=macro_topic_prompt_template, + ) + macro_topics = self.convert_response_to_yaml_list( + responses[0], + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(macro_topics) != n_macro_topics and not ignore_conversion_failure: + raise YamlConversionError( + f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" + ) + macro_topics.extend(additional_macro_topics) + + # Generate the subtopics + raw_topics = [ + self.generate_python_subtopics( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=subtopic_prompt_template, + )[0] + for macro_topic in macro_topics + ] + topic_list = [] + for topic in raw_topics: + try: + parsed_topics = self.convert_response_to_yaml_list( + topic, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_topics) != n_subtopics: + raise YamlConversionError( + f"Error: Length of subtopics {len(parsed_topics)} does not match desired n_subtopics {n_subtopics}: {parsed_topics}" + ) + topic_list.extend(parsed_topics) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + topic_list.extend(additional_subtopics) + + # Mix the macro topics with the subtopics + if combine_topics: + topic_list.extend(macro_topics) + + # Generate the openlines + raw_lines = [ + self.generate_python_problem( + topic=t, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=python_problem_prompt_template, + )[0] + for t in topic_list + ] + openlines = [] + for line in raw_lines: + try: + parsed_line = self.convert_response_to_yaml_list( + line, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_line) != n_openlines: + raise YamlConversionError( + f"Error: Length of openlines {len(parsed_line)} does not match desired n_openlines {n_openlines}: {parsed_line}" + ) + openlines.extend(parsed_line) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + return openlines + + +class NemotronFormatter(ConversationFormatter): + + PROMPT_PREFIX = "System\n\nUser\n" + + @staticmethod + def format_conversation(conv: List[dict]) -> str: + """ + Formats a converstation between a user and assistant in the Nemotron 340B format + described here: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/nemotron-4-340b-instruct + Args: + conv: A conversation between a user and assistant + Returns: + A conversation formatted as text + """ + prompt = NemotronFormatter.PROMPT_PREFIX + + for i, turn in enumerate(conv): + user_turn = i % 2 == 0 + + if user_turn: + if turn["role"] != "user": + raise ValueError( + f"Conversation turn {i} is not 'user'. All even number turns should be." + ) + prompt += turn["content"] + "\nAssistant\n" + else: + if turn["role"] != "assistant": + raise ValueError( + f"Conversation turn {i} is not 'assistant'. All odd number turns should be." 
+ ) + prompt += turn["content"] + "\nUser\n" + + return prompt diff --git a/nemo_curator/synthetic/no_format.py b/nemo_curator/synthetic/no_format.py new file mode 100644 index 000000000..744c87b34 --- /dev/null +++ b/nemo_curator/synthetic/no_format.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List + +from nemo_curator.services.conversation_formatter import ConversationFormatter + + +class NoFormat(ConversationFormatter): + + def format_conversation(self, conv: List[dict]) -> str: + if len(conv) != 1: + raise ValueError( + "There must be exactly one turn in the conversation to use NoFormat" + ) + + turn = conv[0] + + if turn["role"] != "user": + raise ValueError( + "Conversation turn 0 is not 'user'. All even number turns should be." + ) + + return turn["content"] diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py new file mode 100644 index 000000000..fbe7e026a --- /dev/null +++ b/nemo_curator/synthetic/prompts.py @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE = "The following document contains a list of items. Parse the list of items into a yaml list of strings. Do not parse any other part of the document. There should be no additional formatting to your response, just the yaml list of strings.\n\n {llm_response}" + +DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE = "Can you generate {n_macro_topics} comprehensive topics that encompass various aspects of our daily life, the world, and science? Your answer should be a list of topics. Make the topics as diverse as possible.For example, 1. Food and drinks. \n2. Technology.\n" + +DEFAULT_SUBTOPICS_PROMPT_TEMPLATE = "Can you generate {n_subtopics} comprehensive topics that encompass various aspects of {macro_topic}? Your answer should be a list of topics. Make the topics as diverse as possible." + +DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE = "Can you generate {n_openlines} questions or requests related to {topic}? The questions and requests should be as diverse possible. Your answer should be a list." + +DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE = "Question: {openline}\n\nCan you revise the question above to include more contexts or details? The revised questions can be any of the follows:\n1. Adding some context to the original question. 
The context might state the importance of the question, explain background knowledge, or add other reasonable information.\n2. Change the questions into a different format or style, e.g., imperative statements, length requirements for the answer, etc.\n3. Elongated questions that require to elaborate on specific topic or discuss a certain point.\n4. Any other related questions or statements.\n\nThe revised question should contain two, three, or four sentences. You should generate {n_revisions} revised questions or statements in a list. Make them as diverse as possible." + +DEFAULT_WRITING_TASK_PROMPT_TEMPLATE = 'Can you generate {n_openlines} tasks, each of which requires to create a "{text_material_type}" related to {topic}? Each task should be concise and include one or two sentences only. The tasks should be as diverse as possible. Your answer should be a list of tasks.' + +DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE = "TASK: {openline}\n\nCan you revise the task above to include more detailed requirements? These requirements can be any of the follows:\n1. Require to elaborate on a specific topic or discuss a certain point.\n2. Require to include some examples, data points, or references.\n3. Require to follow specific formats or styles, e.g., no more than 300 words, including specific words, etc.\n4. Any other reasonable requests to make the task more detailed.\n\nThe revised task should contain two, three, or four sentences. You should generate {n_revisions} revised tasks in a list. Make the tasks as diverse as possible." + +DEFAULT_CLOSED_QA_PROMPT_TEMPLATE = "TEXT: {document}\n\nGiven the text above, can you come up with {n_openlines} questions or tasks? They can be any of the follows:\n1. Asking certain information in the text;\n2. Summarizing, repharsing or explaining the text;\n3. Writing something similar to the text;\n4. Any other reasonable requests related to the text.\n\nMake the questions or tasks as diverse as possible." + +DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE = "Can you generate {n_macro_topics} comprehensive topics that encompass the mathematics knowledge taughted in {school_level}? Your answer should be a list of topics. Make the topics as diverse as possible." + +DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE = 'List {n_subtopics} mathemathics topics that encompass various aspects of "{macro_topic}". Your answer should be a list of topics. Make the topics as diverse as possible.' + +DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE = 'Does the concept "{entity}" belong to one of the following categories?\n- Math concepts taught at elementary school, middle school, high school, and univiersity.\n- Important mathematics axioms, theorems, algorithms, equations, or inequalities.\n- Representative math problems, functions, and applications.\n\nYour answer should start with "Yes" or "No".' + +MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE = 'Generate {n_openlines} mathematics problems which are related to "{topic}" or can be addressed using "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' + +MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE = 'Generate {n_openlines} mathematics problems which are related to "{topic}" or can be addressed using "{topic}". These problems should be suitable for beginners who just learnt "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' + +DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE = ( + "List {n_macro_topics} important concepts in the python language." 
+) + +DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE = 'List {n_subtopics} important concepts related to "{macro_topic}" in the python language.' + +DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE = 'Does the concept "{entity}" belong to one of the following categories?\n- Programming concepts like loops, functions, and data structures in python.\n- Important functions, objects, or libraries in python.\n- Mathematical concepts like linear algebra which can be implemented in python.\n- Basic algorithms or problems in computer science likes Greedy Search and Dynamics programming which can be addressed in python.\n\nYour answer should start with "Yes" or "No".' + +PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE = 'Generate {n_openlines} {language} coding problems related to "{topic}". These problems should be suitable for beginners who just learnt "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' + +PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE = 'Generate {n_openlines} {language} coding problems related to "{topic}". These problems should be suitable for medium-level programmers with some experiences of "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' + +PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE = 'Generate {n_openlines} {language} coding problems related to "{topic}". These problems should be suitable for advanced programmers with solid knowledge and experiences of "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' + +DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE = "Here is a conversation between a user and an assistant.\n<|The Start of Assistant's Conversation with User|>\n{conversation_history}\n<|The End of Assistant's Conversation with User|>\n\nGiven the conversation above, generate a followup request or question in the tone of User. Directly give me the question without extraneous words." + +DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE = "Here is a conversation between a user and an assistant.\n<|The Start of Assistant's Conversation with User|>\n{conversation_history}\n<|The End of Assistant's Conversation with User|>\n\nGiven the conversation above, generate a followup request or question in the tone of User. Make sure the question is complex and diverse enough and suitable as a followup question. Directly give me the question without extraneous words." + +DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE = "Here is a conversation between a user and an assistant.\n<|The Start of Assistant's Conversation with User|>\n{conversation_history}\n<|The End of Assistant's Conversation with User|>\n\nGiven the conversation above, generate a followup request or question in the toneof User. Be critical. Make sure the question is concise and has a real-life tone. Directly give me the question without extraneous words." 
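For reference, the dialogue user-turn templates above expose a single {conversation_history} format parameter. Below is a minimal, illustrative sketch of how such a template could be filled in: the history string is built the same way as in the _impersonate_user helper earlier in this patch, the import path follows the docstrings' note that these templates are exposed from nemo_curator.synthetic, and the example conversation itself is invented for illustration.

    # Illustrative sketch: fill a dialogue user-turn prompt template.
    # Assumes the template is importable from nemo_curator.synthetic (as the docstrings state);
    # the conversation content here is made up.
    from nemo_curator.synthetic import DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE

    conversation = [
        {"role": "user", "content": "Suggest a dataset format for LLM pretraining."},
        {"role": "assistant", "content": "JSONL with one document per line is a common choice."},
    ]

    # Mirror the history formatting used by _impersonate_user above.
    history_str = ""
    for turn in conversation:
        history_str += f"{turn['role'].capitalize()}: {turn['content']}"

    # The resulting prompt is sent as a single user message to the model that
    # impersonates the user for the next turn.
    prompt = DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE.format(conversation_history=history_str)
    print(prompt)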
diff --git a/setup.py b/setup.py index 205087046..63ec91035 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,7 @@ # Numpy 2.0 breaks with spacy https://github.com/explosion/spaCy/issues/13528 # TODO: Remove when issue is fixed "numpy<2", + "openai", ], extras_require={ "cuda12x": [ @@ -75,7 +76,7 @@ "cugraph-cu12>=24.2", "dask-cuda>=24.2", "spacy[cuda12x]>=3.6.0, <4.0.0", - ] + ], }, entry_points={ "console_scripts": [ diff --git a/tests/test_dataset.py b/tests/test_dataset.py new file mode 100644 index 000000000..f16d49b22 --- /dev/null +++ b/tests/test_dataset.py @@ -0,0 +1,25 @@ +import dask.dataframe as dd +import pandas as pd + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset + + +def all_equal(left_result: pd.DataFrame, right_result: pd.DataFrame): + l_cols = set(left_result.columns) + r_cols = set(right_result.columns) + assert l_cols == r_cols + for col in left_result.columns: + left = left_result[col].reset_index(drop=True) + right = right_result[col].reset_index(drop=True) + assert all(left == right), f"Mismatch in {col} column.\n{left}\n{right}\n" + + +class TestDocumentDataset: + def test_to_from_pandas(self): + original_df = pd.DataFrame( + {"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]} + ) + dataset = DocumentDataset.from_pandas(original_df) + converted_df = dataset.to_pandas() + all_equal(original_df, converted_df) From 1e6acd8302a7a095436a1f7505ecd4c93d74f487 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Tue, 9 Jul 2024 22:03:17 +0530 Subject: [PATCH 11/19] Skip excomms for dask-cuda 24.06 (#147) Signed-off-by: Ayush Dattagupta --- nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py b/nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py index e104ee0ca..9d1915603 100644 --- a/nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py +++ b/nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py @@ -25,7 +25,10 @@ get_agg_text_bytes_df, ) -USE_EXCOMMS = Version(dask_cuda.__version__) >= Version("23.10") +dask_cuda_version = Version(dask_cuda.__version__) +USE_EXCOMMS = ( + dask_cuda_version >= Version("23.10") and dask_cuda_version < Version("24.06") +) or dask_cuda_version >= Version("24.08") def write_partitioned_file(df, output_path, partition_on, batch_id): From 0cbe44784a2e2466f7b82af125fccf68558e534e Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 9 Jul 2024 09:49:57 -0700 Subject: [PATCH 12/19] Add support for NeMo SDK (#131) * Begin docs Signed-off-by: Ryan Wolf * Add slurm sdk example Signed-off-by: Ryan Wolf * Use safe import Signed-off-by: Ryan Wolf * Fix bugs in sdk Signed-off-by: Ryan Wolf * Update docs and tweak scripts Signed-off-by: Ryan Wolf * Add interface helper function Signed-off-by: Ryan Wolf * Update docs Signed-off-by: Ryan Wolf * Fix formatting Signed-off-by: Ryan Wolf * Add config docstring Signed-off-by: Ryan Wolf * Address comments Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf --- docs/user-guide/index.rst | 4 + docs/user-guide/nemosdk.rst | 127 ++++++++++++++++++++++++ examples/nemo_sdk/launch_slurm.py | 56 +++++++++++ examples/slurm/container-entrypoint.sh | 8 +- examples/slurm/start-slurm.sh | 6 +- nemo_curator/__init__.py | 2 +- nemo_curator/nemo_sdk/__init__.py | 17 ++++ nemo_curator/nemo_sdk/slurm.py | 110 ++++++++++++++++++++ nemo_curator/utils/distributed_utils.py | 13 ++- setup.py | 3 +- 10 files changed, 338 insertions(+), 8 deletions(-) create mode 100644 
docs/user-guide/nemosdk.rst create mode 100644 examples/nemo_sdk/launch_slurm.py create mode 100644 nemo_curator/nemo_sdk/__init__.py create mode 100644 nemo_curator/nemo_sdk/slurm.py diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index 1b0e63165..327b0d223 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -30,6 +30,9 @@ :ref:`NeMo Curator on Kubernetes ` Demonstration of how to run the NeMo Curator on a Dask Cluster deployed on top of Kubernetes +:ref:`NeMo Curator with NeMo SDK ` + Example of how to use NeMo Curator with NeMo SDK to run on various platforms + `Tutorials `__ To get started, you can explore the NeMo Curator GitHub repository and follow the available tutorials and notebooks. These resources cover various aspects of data curation, including training from scratch and Parameter-Efficient Fine-Tuning (PEFT). @@ -49,3 +52,4 @@ personalidentifiableinformationidentificationandremoval.rst distributeddataclassification.rst kubernetescurator.rst + nemosdk.rst diff --git a/docs/user-guide/nemosdk.rst b/docs/user-guide/nemosdk.rst new file mode 100644 index 000000000..dbf78c17a --- /dev/null +++ b/docs/user-guide/nemosdk.rst @@ -0,0 +1,127 @@ +.. _data-curator-nemo-sdk: + +====================================== +NeMo Curator with NeMo SDK +====================================== +----------------------------------------- +NeMo SDK +----------------------------------------- + +The NeMo SDK is a general purpose tool for configuring and executing Python functions and scripts across various computing environments. +It is used across the NeMo Framework for managing machine learning experiments. +One of the key features of the NeMo SDK is the ability to run code locally or on platforms like SLURM with minimal changes. + +----------------------------------------- +Usage +----------------------------------------- + +We recommend getting slightly familiar with NeMo SDK before jumping into this. The documentation can be found here. + +Let's walk through an example of how you can launch a Slurm job using `examples/launch_slurm.py `_. + +.. code-block:: python + + + import nemo_sdk as sdk + from nemo_sdk.core.execution import SlurmExecutor + + from nemo_curator.nemo_sdk import SlurmJobConfig + + @sdk.factory + def nemo_curator_slurm_executor() -> SlurmExecutor: + """ + Configure the following function with the details of your SLURM cluster + """ + return SlurmExecutor( + job_name_prefix="nemo-curator", + account="my-account", + nodes=2, + exclusive=True, + time="04:00:00", + container_image="nvcr.io/nvidia/nemo:dev", + container_mounts=["/path/on/machine:/path/in/container"], + ) + +First, we need to define a factory that can produce a ``SlurmExecutor``. +This executor is where you define all your cluster parameters. Note: NeMo SDK only supports running on SLURM clusters with `Pyxis `_ right now. +After this, there is the main function: + +.. code-block:: python + + # Path to NeMo-Curator/examples/slurm/container_entrypoint.sh on the SLURM cluster + container_entrypoint = "/cluster/path/slurm/container_entrypoint.sh" + # The NeMo Curator command to run + curator_command = "text_cleaning --input-data-dir=/path/to/data --output-clean-dir=/path/to/output" + curator_job = SlurmJobConfig( + job_dir="/home/user/jobs", + container_entrypoint=container_entrypoint, + script_command=curator_command, + ) + +First, we need to specify the path to `examples/slurm/container-entrypoint.sh `_ on the cluster.
+This shell script is responsible for setting up the Dask cluster on Slurm and will be the main script run. +Therefore, we need to define the path to it. + +Second, we need to establish the NeMo Curator script we want to run. +This can be a command line utility like ``text_cleaning`` we have above, or it can be your own custom script run with ``python path/to/script.py`` + + +Finally, we combine all of these into a ``SlurmJobConfig``. This config has many options for configuring the Dask cluster. +We'll highlight a couple of important ones: + +* ``device="cpu"`` determines the type of Dask cluster to initialize. If you are using GPU modules, please set this equal to ``"gpu"``. +* ``interface="eth0"`` specifies the network interface to use for communication within the Dask cluster. It will likely be different for your Slurm cluster, so please modify as needed. You can determine what interfaces are available by running the following function on your cluster. + + .. code-block:: python + + from nemo_curator import get_network_interfaces + + print(get_network_interfaces()) + +.. code-block:: python + + executor = sdk.resolve(SlurmExecutor, "nemo_curator_slurm_executor") + with sdk.Experiment("example_nemo_curator_exp", executor=executor) as exp: + exp.add(curator_job.to_script(), tail_logs=True) + exp.run(detach=False) + +After configuring the job, we can finally run it. +First, we use the sdk to resolve our custom factory. +Next, we use it to begin an experiment named "example_nemo_curator_exp" running on our Slurm executor. + +``exp.add(curator_job.to_script(), tail_logs=True)`` adds the NeMo Curator script to be part of the experiment. +It converts the ``SlurmJobConfig`` to a ``sdk.Script``. +This ``curator_job.to_script()`` has two important parameters. +* ``add_scheduler_file=True`` +* ``add_device=True`` + +Both of these modify the command specified in ``curator_command``. +Setting both to ``True`` (the default) transforms the original command from: + +.. code-block:: bash + + # Original command + text_cleaning \ + --input-data-dir=/path/to/data \ + --output-clean-dir=/path/to/output + +to: + +.. code-block:: bash + + # Modified command + text_cleaning \ + --input-data-dir=/path/to/data \ + --output-clean-dir=/path/to/output \ + --scheduler-file=/path/to/scheduler/file \ + --device="cpu" + + +As you can see, ``add_scheduler_file=True`` causes ``--scheduler-file=/path/to/scheduler/file`` to be appended to the command, and ``add_device=True`` causes ``--device="cpu"`` (or whatever the device is set to) to be appended. +``/path/to/scheduler/file`` is determined by ``SlurmJobConfig``, and ``device`` is what the user specified in the ``device`` parameter previously. + +The scheduler file argument is necessary to connect to the Dask cluster on Slurm. +All NeMo Curator scripts accept both arguments, so the default is to automatically add them. +If your script is configured differently, feel free to turn these off. + +The final line ``exp.run(detach=False)`` starts the experiment on the Slurm cluster. \ No newline at end of file diff --git a/examples/nemo_sdk/launch_slurm.py b/examples/nemo_sdk/launch_slurm.py new file mode 100644 index 000000000..8bceb9702 --- /dev/null +++ b/examples/nemo_sdk/launch_slurm.py @@ -0,0 +1,56 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import nemo_sdk as sdk +from nemo_sdk.core.execution import SlurmExecutor + +from nemo_curator.nemo_sdk import SlurmJobConfig + + +@sdk.factory +def nemo_curator_slurm_executor() -> SlurmExecutor: + """ + Configure the following function with the details of your SLURM cluster + """ + return SlurmExecutor( + job_name_prefix="nemo-curator", + account="my-account", + nodes=2, + exclusive=True, + time="04:00:00", + container_image="nvcr.io/nvidia/nemo:dev", + container_mounts=["/path/on/machine:/path/in/container"], + ) + + +def main(): + # Path to NeMo-Curator/examples/slurm/container_entrypoint.sh on the SLURM cluster + container_entrypoint = "/cluster/path/slurm/container_entrypoint.sh" + # The NeMo Curator command to run + # This command can be susbstituted with any NeMo Curator command + curator_command = "text_cleaning --input-data-dir=/path/to/data --output-clean-dir=/path/to/output" + curator_job = SlurmJobConfig( + job_dir="/home/user/jobs", + container_entrypoint=container_entrypoint, + script_command=curator_command, + ) + + executor = sdk.resolve(SlurmExecutor, "nemo_curator_slurm_executor") + with sdk.Experiment("example_nemo_curator_exp", executor=executor) as exp: + exp.add(curator_job.to_script(), tail_logs=True) + exp.run(detach=False) + + +if __name__ == "__main__": + main() diff --git a/examples/slurm/container-entrypoint.sh b/examples/slurm/container-entrypoint.sh index 8bc6a9a39..e6e143b3d 100755 --- a/examples/slurm/container-entrypoint.sh +++ b/examples/slurm/container-entrypoint.sh @@ -16,6 +16,12 @@ # Start the scheduler on the rank 0 node if [[ -z "$SLURM_NODEID" ]] || [[ $SLURM_NODEID == 0 ]]; then + # Make the directories needed + echo "Making log directory $LOGDIR" + mkdir -p $LOGDIR + echo "Making profile directory $PROFILESDIR" + mkdir -p $PROFILESDIR + echo "Starting scheduler" if [[ $DEVICE == 'cpu' ]]; then dask scheduler \ @@ -58,7 +64,7 @@ fi sleep 60 if [[ -z "$SLURM_NODEID" ]] || [[ $SLURM_NODEID == 0 ]]; then - echo "Starting $SCRIPT_PATH" + echo "Starting $SCRIPT_COMMAND" bash -c "$SCRIPT_COMMAND" touch $DONE_MARKER fi diff --git a/examples/slurm/start-slurm.sh b/examples/slurm/start-slurm.sh index ab4074657..9e684298b 100644 --- a/examples/slurm/start-slurm.sh +++ b/examples/slurm/start-slurm.sh @@ -28,7 +28,8 @@ export BASE_JOB_DIR=`pwd`/nemo-curator-jobs export JOB_DIR=$BASE_JOB_DIR/$SLURM_JOB_ID -# Logging information +# Directory for Dask cluster communication and logging +# Must be paths inside the container that are accessible across nodes export LOGDIR=$JOB_DIR/logs export PROFILESDIR=$JOB_DIR/profiles export SCHEDULER_FILE=$LOGDIR/scheduler.json @@ -74,9 +75,6 @@ export DASK_DATAFRAME__QUERY_PLANNING=False # End easy customization # ================================================================= -mkdir -p $LOGDIR -mkdir -p $PROFILESDIR - # Start the container srun \ --container-mounts=${MOUNTS} \ diff --git a/nemo_curator/__init__.py b/nemo_curator/__init__.py index 9f1029316..80af4d698 100644 --- a/nemo_curator/__init__.py +++ b/nemo_curator/__init__.py @@ -41,7 +41,7 @@ NemoDeployClient, OpenAIClient, ) -from 
.utils.distributed_utils import get_client +from .utils.distributed_utils import get_client, get_network_interfaces # Dask will automatically convert the list score type # to a string without this option. diff --git a/nemo_curator/nemo_sdk/__init__.py b/nemo_curator/nemo_sdk/__init__.py new file mode 100644 index 000000000..fe4cd0291 --- /dev/null +++ b/nemo_curator/nemo_sdk/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .slurm import SlurmJobConfig + +__all__ = ["SlurmJobConfig"] diff --git a/nemo_curator/nemo_sdk/slurm.py b/nemo_curator/nemo_sdk/slurm.py new file mode 100644 index 000000000..ccefaffd5 --- /dev/null +++ b/nemo_curator/nemo_sdk/slurm.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Dict + +from nemo_curator.utils.import_utils import safe_import + +sdk = safe_import("nemo_sdk") + + +@dataclass +class SlurmJobConfig: + """ + Configuration for running a NeMo Curator script on a SLURM cluster using + NeMo SDK + + Args: + job_dir: The base directory where all the files related to setting up + the Dask cluster for NeMo Curator will be written + container_entrypoint: A path to the container-entrypoint.sh script + on the cluster. container-entrypoint.sh is found in the repo + here: https://github.com/NVIDIA/NeMo-Curator/blob/main/examples/slurm/container-entrypoint.sh + script_command: The NeMo Curator CLI tool to run. Pass any additional arguments + needed directly in this string. + device: The type of script that will be running, and therefore the type + of Dask cluster that will be created. Must be either "cpu" or "gpu". + interface: The network interface the Dask cluster will communicate over. + Use nemo_curator.get_network_interfaces() to get a list of available ones. + protocol: The networking protocol to use. Can be either "tcp" or "ucx". + Setting to "ucx" is recommended for GPU jobs if your cluster supports it. + cpu_worker_memory_limit: The maximum memory per process that a Dask worker can use. + "5GB" or "5000M" are examples. "0" means no limit. + rapids_no_initialize: Will delay or disable the CUDA context creation of RAPIDS libraries, + allowing for improved compatibility with UCX-enabled clusters and preventing runtime warnings. 
+ cudf_spill: Enables automatic spilling (and “unspilling”) of buffers from device to host to + enable out-of-memory computation, i.e., computing on objects that occupy more memory + than is available on the GPU. + rmm_scheduler_pool_size: Sets a small pool of GPU memory for message transfers when + the scheduler is using ucx + rmm_worker_pool_size: The amount of GPU memory each GPU worker process may use. + Recommended to set at 80-90% of available GPU memory. 72GiB is good for A100/H100 + libcudf_cufile_policy: Allows reading/writing directly from storage to GPU. + """ + + job_dir: str + container_entrypoint: str + script_command: str + device: str = "cpu" + interface: str = "eth0" + protocol: str = "tcp" + cpu_worker_memory_limit: str = "0" + rapids_no_initialize: str = "1" + cudf_spill: str = "1" + rmm_scheduler_pool_size: str = "1GB" + rmm_worker_pool_size: str = "72GiB" + libcudf_cufile_policy: str = "OFF" + + def to_script(self, add_scheduler_file: bool = True, add_device: bool = True): + """ + Converts to a script object executable by NeMo SDK + Args: + add_scheduler_file: Automatically appends a '--scheduler-file' argument to the + script_command where the value is job_dir/logs/scheduler.json. All + scripts included in NeMo Curator accept and require this argument to scale + properly on SLURM clusters. + add_device: Automatically appends a '--device' argument to the script_command + where the value is the device member variable. All scripts included in + NeMo Curator accept and require this argument. + Returns: + A NeMo SDK Script that will initialize a Dask cluster and run the specified command. + It is designed to be executed on a SLURM cluster + """ + env_vars = self._build_env_vars() + + if add_scheduler_file: + env_vars[ + "SCRIPT_COMMAND" + ] += f" --scheduler-file={env_vars['SCHEDULER_FILE']}" + if add_device: + env_vars["SCRIPT_COMMAND"] += f" --device={env_vars['DEVICE']}" + + # Surround the command in quotes so the variable gets set properly + env_vars["SCRIPT_COMMAND"] = f"\"{env_vars['SCRIPT_COMMAND']}\"" + + return sdk.Script(path=self.container_entrypoint, env=env_vars) + + def _build_env_vars(self) -> Dict[str, str]: + env_vars = vars(self) + # Convert to uppercase to match container_entrypoint.sh + env_vars = {key.upper(): val for key, val in env_vars.items()} + + env_vars["LOGDIR"] = f"{self.job_dir}/logs" + env_vars["PROFILESDIR"] = f"{self.job_dir}/profiles" + env_vars["SCHEDULER_FILE"] = f"{env_vars['LOGDIR']}/scheduler.json" + env_vars["SCHEDULER_LOG"] = f"{env_vars['LOGDIR']}/scheduler.log" + env_vars["DONE_MARKER"] = f"{env_vars['LOGDIR']}/done.txt" + + return env_vars diff --git a/nemo_curator/utils/distributed_utils.py b/nemo_curator/utils/distributed_utils.py index 629cc387e..ef69963cf 100644 --- a/nemo_curator/utils/distributed_utils.py +++ b/nemo_curator/utils/distributed_utils.py @@ -21,11 +21,12 @@ import warnings from contextlib import nullcontext from pathlib import Path -from typing import Union +from typing import List, Union import dask.dataframe as dd import numpy as np import pandas as pd +import psutil from dask.distributed import Client, LocalCluster, get_worker, performance_report from nemo_curator.utils.gpu_utils import GPU_INSTALL_STRING, is_cudf_type @@ -611,3 +612,13 @@ def seed_all(seed: int = 42): # Ensure deterministic behavior for CUDA algorithms torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False + + +def get_network_interfaces() -> List[str]: + """ + Gets a list of all valid network interfaces on a
machine + + Returns: + A list of all valid network interfaces on a machine + """ + return list(psutil.net_if_addrs().keys()) diff --git a/setup.py b/setup.py index 63ec91035..185fb3afe 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ setup( name="nemo_curator", - version="0.3.0", + version="0.4.0", description="Scalable Data Preprocessing Tool for " "Training Large Language Models", long_description=long_description, @@ -34,6 +34,7 @@ classifiers=[ "Development Status :: 3 - Alpha", "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", ], packages=find_packages(), python_requires=">=3.10, <3.11", From eee50650c2ea214384acf5502abc92cd76d56753 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Fri, 12 Jul 2024 13:39:19 -0700 Subject: [PATCH 13/19] Add tests and fix bugs found during testing (#151) Signed-off-by: Vibhu Jawa --- nemo_curator/modules/semantic_dedup.py | 4 +- nemo_curator/utils/semdedup_utils.py | 2 +- tests/test_semdedup.py | 82 ++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 3 deletions(-) create mode 100644 tests/test_semdedup.py diff --git a/nemo_curator/modules/semantic_dedup.py b/nemo_curator/modules/semantic_dedup.py index 5b95692f1..0d653e285 100644 --- a/nemo_curator/modules/semantic_dedup.py +++ b/nemo_curator/modules/semantic_dedup.py @@ -525,8 +525,8 @@ def __init__( cache_dir = config.cache_dir self.embedding_creator = EmbeddingCreator( embedding_model_name_or_path=config.embedding_model_name_or_path, - max_memory=config.embedding_max_mem_gb, - batch_size=config.embedding_batch_size, + embedding_max_mem_gb=config.embedding_max_mem_gb, + embedding_batch_size=config.embedding_batch_size, input_column=config.input_column, embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc), logger=logger, diff --git a/nemo_curator/utils/semdedup_utils.py b/nemo_curator/utils/semdedup_utils.py index be7b6e5aa..986720984 100644 --- a/nemo_curator/utils/semdedup_utils.py +++ b/nemo_curator/utils/semdedup_utils.py @@ -157,7 +157,7 @@ def rank_within_cluster( sort_descending = keep_hard cluster_sorted = sorted( zip(example_id, cluster_dists_to_cent, cluster_label), - key=lambda x: x[2], + key=lambda x: x[1], reverse=sort_descending, ) # -- sort_descending = True for descending sort diff --git a/tests/test_semdedup.py b/tests/test_semdedup.py new file mode 100644 index 000000000..b7543d7c3 --- /dev/null +++ b/tests/test_semdedup.py @@ -0,0 +1,82 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +import pytest + +os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" +from dask.dataframe.utils import assert_eq +from distributed import Client + +from nemo_curator import SemDedup, SemDedupConfig +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.import_utils import gpu_only_import, gpu_only_import_from + +cudf = gpu_only_import("cudf") +dask_cudf = gpu_only_import("dask_cudf") +LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster") + + +@pytest.fixture +def dedup_data(): + df = cudf.DataFrame( + { + "id": [1, 2, 3, 4, 100, 200, 300], + "text": [ + "The quick brown fox jumps over the lazy dog", + "The quick brown foxes jumps over the lazy dog", + "The quick brown wolf jumps over the lazy dog", + "The quick black cat jumps over the lazy dog", + "A test string", + "Another test string", + "A different object", + ], + } + ) + df = dask_cudf.from_cudf(df, 2) + return DocumentDataset(df) + + +@pytest.mark.gpu +class TestFuzzyDuplicates: + @pytest.fixture(autouse=True, scope="class") + def gpu_client(self, request): + with LocalCUDACluster(n_workers=1) as cluster, Client(cluster) as client: + request.cls.client = client + request.cls.cluster = cluster + yield + + def test_fuzzy_dedup( + self, + dedup_data, + tmpdir, + ): + print("client", self.client) + cache_dir = os.path.join(tmpdir, "test_sem_dedup_cache") + config = SemDedupConfig( + cache_dir=cache_dir, + id_col_name="id", + id_col_type="int", + input_column="text", + seed=42, + n_clusters=3, + eps_thresholds=[0.10], + eps_to_extract=0.10, + ) + sem_duplicates = SemDedup(config=config) + result = sem_duplicates(dedup_data) + result_df = result.df.compute() + duplicate_docs = [2, 3, 4, 200, 300] + expected_df = cudf.Series(duplicate_docs, name="id") + assert_eq(result_df["id"].sort_values(), expected_df, check_index=False) From 70acdbfaacf99f928f77a732a95741204b6dbdcd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:11:16 -0700 Subject: [PATCH 14/19] [pre-commit.ci] pre-commit suggestions (#135) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/pre-commit-hooks: v4.5.0 → v4.6.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.5.0...v4.6.0) - [github.com/psf/black: 24.3.0 → 24.4.2](https://github.com/psf/black/compare/24.3.0...24.4.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) mode change 100755 => 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100755 new mode 100644 index 7b599a9de..1aef77856 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ ci: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v4.6.0 hooks: - id: check-added-large-files args: ['--maxkb=1000'] @@ -35,7 +35,7 @@ repos: - id: trailing-whitespace - repo: https://github.com/psf/black - rev: 24.3.0 + rev: 24.4.2 hooks: - id: black name: Format code From fb630517ee8cc4a061ea1e447ae516294b0ca32f Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Thu, 18 Jul 2024 13:51:18 -0700 Subject: [PATCH 15/19] Fix bug with torch rmm and nemo (#155) * Fix bug with torch rmm and nemo Signed-off-by: Ryan Wolf * Change pycld2 version pin Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf --- 
nemo_curator/services/nemo_client.py | 5 ++---
 setup.py                             | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/nemo_curator/services/nemo_client.py b/nemo_curator/services/nemo_client.py
index f83ba6242..d84c72f12 100644
--- a/nemo_curator/services/nemo_client.py
+++ b/nemo_curator/services/nemo_client.py
@@ -11,16 +11,15 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+from __future__ import annotations
+
 import warnings
 from typing import Iterable, List, Optional, Union
 
 from nemo_curator.services.conversation_formatter import ConversationFormatter
-from nemo_curator.utils.import_utils import safe_import_from
 
 from .model_client import AsyncLLMClient, LLMClient
 
-NemoQueryLLM = safe_import_from("nemo.deploy.nlp", "NemoQueryLLM")
-
 
 class NemoDeployClient(LLMClient):
     """
diff --git a/setup.py b/setup.py
index 185fb3afe..c3a16271d 100644
--- a/setup.py
+++ b/setup.py
@@ -45,7 +45,7 @@
     "charset_normalizer>=3.1.0",
     "awscli>=1.22.55",
     "fasttext==0.9.2",
-    "pycld2==0.41",
+    "pycld2",
     "justext==3.0.1",
     "resiliparse",
     "ftfy==6.1.1",
From ce7391ae5f88ed05438e18b26d0d3ece03ab51c9 Mon Sep 17 00:00:00 2001
From: Vibhu Jawa
Date: Fri, 19 Jul 2024 11:14:16 -0700
Subject: [PATCH 16/19] Prevent plugging an allocator twice (#154)

* Prevent plugging an allocator twice

Signed-off-by: Vibhu Jawa

* Remove extra import

Signed-off-by: Vibhu Jawa

* Fix defaults for RMM-POOL and other style fixes

Signed-off-by: Vibhu Jawa

* Switch rmm_pytorch off by default

Signed-off-by: Vibhu Jawa

---------

Signed-off-by: Vibhu Jawa
---
 .../scripts/domain_classifier_inference.py    |  9 +++-
 nemo_curator/scripts/find_exact_duplicates.py |  1 -
 .../fuzzy_deduplication/compute_minhashes.py  |  1 -
 .../connected_components.py                   |  1 -
 .../fuzzy_deduplication/minhash_lsh.py        |  1 -
 .../scripts/quality_classifier_inference.py   |  9 +++-
 nemo_curator/utils/distributed_utils.py       | 15 ++++--
 nemo_curator/utils/script_utils.py            | 48 +++++++++++++++----
 8 files changed, 66 insertions(+), 19 deletions(-)

diff --git a/nemo_curator/scripts/domain_classifier_inference.py b/nemo_curator/scripts/domain_classifier_inference.py
index 837435043..59aa5fd7a 100644
--- a/nemo_curator/scripts/domain_classifier_inference.py
+++ b/nemo_curator/scripts/domain_classifier_inference.py
@@ -43,8 +43,15 @@ def main():
     if not os.path.exists(args.output_data_dir):
         os.makedirs(args.output_data_dir)
 
+    # Sometimes jsonl files are stored as .json
+    # So to handle that case we can pass the input_file_extension
+    if args.input_file_extension is not None:
+        input_file_extension = args.input_file_extension
+    else:
+        input_file_extension = args.input_file_type
+
     input_files = get_remaining_files(
-        args.input_data_dir, args.output_data_dir, args.input_file_type
+        args.input_data_dir, args.output_data_dir, input_file_extension
     )
     print(f"Total input files {len(input_files)}", flush=True)
 
diff --git a/nemo_curator/scripts/find_exact_duplicates.py b/nemo_curator/scripts/find_exact_duplicates.py
index 7f241f182..bae30edd8 100644
--- a/nemo_curator/scripts/find_exact_duplicates.py
+++ b/nemo_curator/scripts/find_exact_duplicates.py
@@ -37,7 +37,6 @@ def main(args):
     logger.info(f"Starting workflow with args:\n {args}")
 
     assert args.hash_method == "md5", "Currently only md5 hash is supported"
-    args.set_torch_to_use_rmm = False
     client = get_client(**ArgumentHelper.parse_client_args(args))
     logger.info(f"Client Created {client}")
     if args.device == 
"gpu": diff --git a/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py index 01baee051..2add7587f 100644 --- a/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py +++ b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py @@ -41,7 +41,6 @@ def main(args): assert args.hash_bytes in {4, 8}, "Currently only 32bit/64bit hashes are supported" assert args.device == "gpu" - args.set_torch_to_use_rmm = False client = get_client(**ArgumentHelper.parse_client_args(args)) logger.info(f"Client Created {client}") client.run(pre_imports) diff --git a/nemo_curator/scripts/fuzzy_deduplication/connected_components.py b/nemo_curator/scripts/fuzzy_deduplication/connected_components.py index 7ec8c5ab5..abaccdbda 100644 --- a/nemo_curator/scripts/fuzzy_deduplication/connected_components.py +++ b/nemo_curator/scripts/fuzzy_deduplication/connected_components.py @@ -31,7 +31,6 @@ def main(args): """ st = time.time() output_path = os.path.join(args.output_dir, "connected_components.parquet") - args.set_torch_to_use_rmm = False args.enable_spilling = True client = get_client(**ArgumentHelper.parse_client_args(args)) diff --git a/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py b/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py index d7575835f..d9740f412 100644 --- a/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py +++ b/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py @@ -39,7 +39,6 @@ def main(args): logger.info(f"Starting workflow with args:\n {args}") assert args.device == "gpu" - args.set_torch_to_use_rmm = False client = get_client(**ArgumentHelper.parse_client_args(args)) logger.info(f"Client Created {client}") client.run(pre_imports) diff --git a/nemo_curator/scripts/quality_classifier_inference.py b/nemo_curator/scripts/quality_classifier_inference.py index 173a8f924..c3260bff5 100644 --- a/nemo_curator/scripts/quality_classifier_inference.py +++ b/nemo_curator/scripts/quality_classifier_inference.py @@ -43,8 +43,15 @@ def main(): if not os.path.exists(args.output_data_dir): os.makedirs(args.output_data_dir) + # Some time jsonl files are stored as .json + # So to handle that case we can pass the input_file_extension + if args.input_file_extension is not None: + input_file_extension = args.input_file_extension + else: + input_file_extension = args.input_file_type + input_files = get_remaining_files( - args.input_data_dir, args.output_data_dir, args.input_file_type + args.input_data_dir, args.output_data_dir, input_file_extension ) print(f"Total input files {len(input_files)}", flush=True) diff --git a/nemo_curator/utils/distributed_utils.py b/nemo_curator/utils/distributed_utils.py index ef69963cf..e8b37fee2 100644 --- a/nemo_curator/utils/distributed_utils.py +++ b/nemo_curator/utils/distributed_utils.py @@ -105,14 +105,14 @@ def get_client( protocol="tcp", rmm_pool_size="1024M", enable_spilling=True, - set_torch_to_use_rmm=True, + set_torch_to_use_rmm=False, ) -> Client: """ Initializes or connects to a Dask cluster. The Dask cluster can be CPU-based or GPU-based (if GPUs are available). The intialization ensures maximum memory efficiency for the GPU by: - 1. Ensuring the PyTorch memory pool is the same as the RAPIDS memory pool. - 2. Enabling spilling for cuDF. + 1. Ensuring the PyTorch memory pool is the same as the RAPIDS memory pool. (If `set_torch_to_use_rmm` is True) + 2. Enabling spilling for cuDF. (If `enable_spilling` is True) Args: cluster_type: The type of cluster to set up. 
Either "cpu" or "gpu". Defaults to "cpu". @@ -171,11 +171,18 @@ def _set_torch_to_use_rmm(): See article: https://medium.com/rapids-ai/pytorch-rapids-rmm-maximize-the-memory-efficiency-of-your-workflows-f475107ba4d4 - """ + import torch from rmm.allocators.torch import rmm_torch_allocator + if torch.cuda.get_allocator_backend() == "pluggable": + warnings.warn( + "PyTorch allocator already plugged in, not switching to RMM. " + "Please ensure you have not already swapped it." + ) + return + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py index 107ccd5e2..84f8609d9 100644 --- a/nemo_curator/utils/script_utils.py +++ b/nemo_curator/utils/script_utils.py @@ -281,6 +281,22 @@ def add_arg_text_ddf_blocksize(self): help="The block size for chunking jsonl files for text ddf in mb", ) + def add_arg_model_path(self, help="The path to the model file"): + self.parser.add_argument( + "--model-path", + type=str, + help=help, + required=True, + ) + + def add_arg_autocaset(self, help="Whether to use autocast or not"): + ArgumentHelper.attach_bool_arg( + parser=self.parser, + flag_name="autocast", + default=True, + help=help, + ) + def add_distributed_args(self) -> argparse.ArgumentParser: """ Adds default set of arguments that are needed for Dask cluster setup @@ -392,10 +408,26 @@ def parse_distributed_classifier_args( description, formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - parser = ArgumentHelper(parser).add_distributed_args() + argumentHelper = ArgumentHelper(parser) + argumentHelper.add_distributed_args() + argumentHelper.add_arg_input_data_dir(required=True) + argumentHelper.add_arg_output_data_dir(help="The path of the output files") + argumentHelper.add_arg_input_file_type() + argumentHelper.add_arg_input_file_extension() + argumentHelper.add_arg_output_file_type() + argumentHelper.add_arg_input_text_field() + argumentHelper.add_arg_enable_spilling() + argumentHelper.add_arg_set_torch_to_use_rmm() + argumentHelper.add_arg_batch_size( + help="The batch size to be used for inference" + ) + argumentHelper.add_arg_model_path() + argumentHelper.add_arg_autocaset() + # Set low default RMM pool size for classifier # to allow pytorch to grow its memory usage # by default +<<<<<<< HEAD parser.set_defaults(rmm_pool_size="512MB") parser.add_argument( "--input-data-dir", @@ -443,16 +475,13 @@ def parse_distributed_classifier_args( help="Whether to enable spilling or not", ) +======= + argumentHelper.parser.set_defaults(rmm_pool_size="512MB") +>>>>>>> fb12646 (Prevent plugging an allocator twice (#154)) # Setting to False makes it more stable for long running jobs # possibly because of memory fragmentation - ArgumentHelper.attach_bool_arg( - parser, - "set-torch-to-use-rmm", - default=False, - help="Whether to set torch to use RMM or not", - ) - - return parser + argumentHelper.parser.set_defaults(set_torch_to_use_rmm=False) + return argumentHelper.parser @staticmethod def parse_gpu_dedup_args(description: str) -> argparse.ArgumentParser: @@ -472,6 +501,7 @@ def parse_gpu_dedup_args(description: str) -> argparse.ArgumentParser: # Set default device to GPU for dedup argumentHelper.parser.set_defaults(device="gpu") + argumentHelper.parser.set_defaults(set_torch_to_use_rmm=False) argumentHelper.parser.add_argument( "--input-data-dirs", type=str, From ba031061186ea76df0438c7658afbebd4190b7ce Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 19 Jul 2024 14:45:12 -0700 Subject: [PATCH 17/19] type 
hints Signed-off-by: Sarah Yurick --- .../modules/distributed_data_classifier.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/nemo_curator/modules/distributed_data_classifier.py b/nemo_curator/modules/distributed_data_classifier.py index 4b8c6a887..1e2abb237 100644 --- a/nemo_curator/modules/distributed_data_classifier.py +++ b/nemo_curator/modules/distributed_data_classifier.py @@ -17,6 +17,7 @@ os.environ["RAPIDS_NO_INITIALIZE"] = "1" from abc import ABC, abstractmethod from dataclasses import dataclass +from typing import List import torch import torch.nn as nn @@ -48,7 +49,12 @@ class QualityModelConfig: # TODO: Remove this class after Quality Model is uploaded to HuggingFace class NCCustomModel(nn.Module): def __init__( - self, config, out_dim, config_path=None, pretrained=False, autocast=False + self, + config: dataclass, + out_dim: int, + config_path: str = None, + pretrained: bool = False, + autocast: bool = False, ): super().__init__() self.config = config @@ -102,7 +108,7 @@ def forward(self, batch): class HFCustomModel(nn.Module, PyTorchModelHubMixin): - def __init__(self, config): + def __init__(self, config: dataclass): super(HFCustomModel, self).__init__() self.model = AutoModel.from_pretrained(config["base_model"]) self.dropout = nn.Dropout(config["fc_dropout"]) @@ -179,7 +185,7 @@ def _filter_documents( raise TypeError("filter_by must be a string or list type") - def get_labels(self): + def get_labels(self) -> List[str]: return self.labels @@ -235,7 +241,7 @@ def _run_classifier_helper( class DomainModel(HFModel): - def __init__(self, config, autocast=False): + def __init__(self, config: dataclass, autocast: bool = False): self.config = config self.autocast = autocast super().__init__(self.config.model) From c38ae32fc00514c322fa6fad7491e6a4d856d1a1 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Mon, 22 Jul 2024 13:52:01 -0700 Subject: [PATCH 18/19] cluster_type gpu Signed-off-by: Sarah Yurick --- .../distributed_data_classification.ipynb | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tutorials/distributed_data_classification/distributed_data_classification.ipynb b/tutorials/distributed_data_classification/distributed_data_classification.ipynb index eea0276de..a2701b8ff 100644 --- a/tutorials/distributed_data_classification/distributed_data_classification.ipynb +++ b/tutorials/distributed_data_classification/distributed_data_classification.ipynb @@ -20,7 +20,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "env: PYTHONWARNINGS=ignore\n" + "env: PYTHONWARNINGS=ignore\n", + "env: DASK_DATAFRAME__QUERY_PLANNING=False\n" ] } ], @@ -28,6 +29,7 @@ "# Silence Warnings (HuggingFace internal warnings)\n", "\n", "%env PYTHONWARNINGS=ignore\n", + "%env DASK_DATAFRAME__QUERY_PLANNING=False\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] @@ -56,7 +58,7 @@ "metadata": {}, "outputs": [], "source": [ - "client = get_client()" + "client = get_client(cluster_type=\"gpu\")" ] }, { @@ -208,11 +210,15 @@ "name": "stderr", "output_type": "stream", "text": [ +<<<<<<< HEAD <<<<<<< HEAD "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.23it/s]" ======= "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:02<00:00, 3.62it/s]" >>>>>>> 19692e0 (add dataframe example (#137)) +======= + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.12it/s]\n" +>>>>>>> 80eb2d2 (cluster_type gpu) ] }, { @@ -220,6 +226,7 @@ "output_type": "stream", "text": [ "Writing to disk complete for 1 partitions\n", 
+<<<<<<< HEAD <<<<<<< HEAD "CPU times: user 4.69 s, sys: 5.13 s, total: 9.82 s\n", "Wall time: 12.7 s\n" @@ -238,6 +245,10 @@ ======= "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:03<00:00, 3.30it/s]\n" >>>>>>> 19692e0 (add dataframe example (#137)) +======= + "CPU times: user 393 ms, sys: 244 ms, total: 638 ms\n", + "Wall time: 6.04 s\n" +>>>>>>> 80eb2d2 (cluster_type gpu) ] } ], From e121329b417ac049c3be8047983eb36c964730fd Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Tue, 23 Jul 2024 11:53:30 -0700 Subject: [PATCH 19/19] signed commits Signed-off-by: Sarah Yurick
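
The new tests/test_semdedup.py added in PATCH 13 doubles as the most compact usage reference for the SemDedup pipeline. Below is a rough standalone sketch of the same flow outside pytest; the cluster setup, cache directory, and toy data are illustrative assumptions rather than part of the patch series, and a GPU environment with cudf and dask_cuda installed is assumed.

# Illustrative sketch only: mirrors the data and config of tests/test_semdedup.py.
# Paths and cluster settings are assumptions, not part of the patches above.
import os

# Query planning still has to be disabled before dask.dataframe / dask_cudf are
# imported, exactly as the new test module does.
os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import cudf
import dask_cudf
from dask_cuda import LocalCUDACluster
from distributed import Client

from nemo_curator import SemDedup, SemDedupConfig
from nemo_curator.datasets import DocumentDataset

if __name__ == "__main__":
    cluster = LocalCUDACluster(n_workers=1)
    client = Client(cluster)

    df = cudf.DataFrame(
        {
            "id": [1, 2, 3, 4, 100, 200, 300],
            "text": [
                "The quick brown fox jumps over the lazy dog",
                "The quick brown foxes jumps over the lazy dog",
                "The quick brown wolf jumps over the lazy dog",
                "The quick black cat jumps over the lazy dog",
                "A test string",
                "Another test string",
                "A different object",
            ],
        }
    )
    dataset = DocumentDataset(dask_cudf.from_cudf(df, 2))

    config = SemDedupConfig(
        cache_dir="./sem_dedup_cache",  # assumed scratch directory
        id_col_name="id",
        id_col_type="int",
        input_column="text",
        seed=42,
        n_clusters=3,
        eps_thresholds=[0.10],
        eps_to_extract=0.10,
    )
    # The first run may download the default embedding model.
    duplicates = SemDedup(config=config)(dataset)
    print(duplicates.df.compute())

    client.close()
    cluster.close()

If preferred, get_client(cluster_type="gpu") from nemo_curator.utils.distributed_utils can replace the manual LocalCUDACluster/Client pair; after the allocator guard added in PATCH 16 it no longer swaps in the RMM PyTorch allocator unless set_torch_to_use_rmm=True is passed explicitly.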