Merge branch 'main' into rywolf/justext

Signed-off-by: Ryan Wolf <[email protected]>
NVIDIA · Jul 8, 2024 · f513b4c · f513b4c
2 parents c888411 + e557ee3
commit f513b4c
Show file tree

Hide file tree

Showing 18 changed files with 1,683 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -39,8 +39,9 @@ NeMo Curator provides a collection of scalable data-mining modules. Some of the
 
 - [Document-level deduplication](docs/user-guide/gpudeduplication.rst)
 
-  - Both exact and fuzzy (near-identical) deduplication are accelerated using cuDF and Dask
+  - Exact and fuzzy (near-identical) deduplication are accelerated using cuDF and Dask
   - For fuzzy deduplication, our implementation follows the method described in [Microsoft Turing NLG 530B](https://arxiv.org/abs/2201.11990)
+  - For semantic deduplication, our implementation follows the method described in [SemDeDup](https://arxiv.org/pdf/2303.09540) by Meta AI (FAIR) (https://github.com/facebookresearch/SemDeDup)
 
 - [Multilingual downstream-task decontamination](docs/user-guide/taskdecontamination.rst) following the approach of [OpenAI GPT3](https://arxiv.org/pdf/2005.14165.pdf) and [Microsoft Turing NLG 530B](https://arxiv.org/abs/2201.11990)
 

diff --git a/config/sem_dedup_config.yaml b/config/sem_dedup_config.yaml
@@ -0,0 +1,32 @@
+# Configuration file for semantic dedup
+cache_dir: "semdedup_cache"
+num_files: 16
+id_col_name: "id"
+id_col_type: "int"
+input_column: "text"
+
+# Embeddings configuration
+embeddings_save_loc: "embeddings"
+embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
+embedding_batch_size: 128
+embedding_max_mem_gb: 25
+
+# Clustering configuration
+clustering_save_loc: "clustering_results"
+n_clusters: 1000
+seed: 1234
+max_iter: 100
+kmeans_with_cos_dist: false
+
+# Semdedup configuration
+which_to_keep: "hard"
+largest_cluster_size_to_process: 100000
+sim_metric: "cosine"
+
+# Extract dedup configuration
+eps_thresholds:
+  - 0.01
+  - 0.001
+
+# Which threshold to use for extracting deduped data
+eps_to_extract: 0.01
diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst
@@ -46,4 +46,3 @@
    personalidentifiableinformationidentificationandremoval.rst
    distributeddataclassification.rst
    kubernetescurator.rst
-
diff --git a/examples/semdedup_example.py b/examples/semdedup_example.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import time
+
+from nemo_curator.datasets import DocumentDataset
+from nemo_curator.log import create_logger
+from nemo_curator.modules.config import SemDedupConfig
+from nemo_curator.modules.semantic_dedup import SemDedup
+from nemo_curator.utils.distributed_utils import get_client, read_data
+from nemo_curator.utils.file_utils import (
+    expand_outdir_and_mkdir,
+    get_all_files_paths_under,
+)
+from nemo_curator.utils.script_utils import ArgumentHelper
+
+
+def silence_hf_warnings():
+    from transformers.utils import logging
+
+    logging.set_verbosity_error()
+
+
+def main(args):
+    semdedup_config = SemDedupConfig.from_yaml(args.config_file)
+    client = get_client(**ArgumentHelper.parse_client_args(args))
+
+    silence_hf_warnings()
+    client.run(silence_hf_warnings)
+
+    expand_outdir_and_mkdir(semdedup_config.cache_dir)
+    logger = create_logger(
+        rank=0,
+        name="logger-end-to_end-semdup",
+        log_file=os.path.join(semdedup_config.cache_dir, "compute_embeddings.log"),
+        log_level=logging.INFO,
+        stdout=True,
+    )
+    st = time.time()
+    input_files = get_all_files_paths_under(
+        root=args.input_data_dir,
+    )
+    if semdedup_config.num_files > 0:
+        input_files = input_files[: semdedup_config.num_files]
+    logger.info(f"Processing {len(input_files)} files")
+    ddf = read_data(
+        input_files=input_files,
+        file_type=args.input_file_type,
+        add_filename=False,
+        backend="cudf",
+    )
+    dataset = DocumentDataset(ddf)
+    semdup = SemDedup(semdedup_config, logger=logger)
+    dedup_ids = semdup(dataset)
+    print(dedup_ids.df.head())
+    logger.info(f"Time taken: {time.time() - st}")
+    client.cancel(client.futures, force=True)
+    client.close()
+
+
+def attach_args():
+    parser = ArgumentHelper.parse_semdedup_args(add_input_args=True)
+    return parser
+
+
+def console_script():
+    main(attach_args().parse_args())
+
+
+if __name__ == "__main__":
+    main(attach_args().parse_args())
diff --git a/nemo_curator/log.py b/nemo_curator/log.py
@@ -19,7 +19,7 @@
 from nemo_curator.utils.file_utils import expand_outdir_and_mkdir
 
 
-def create_logger(rank, log_file, name="logger", log_level=logging.INFO):
+def create_logger(rank, log_file, name="logger", log_level=logging.INFO, stdout=False):
     # Create the logger
     logger = logging.getLogger(name)
     logger.setLevel(log_level)
@@ -36,8 +36,12 @@ def create_logger(rank, log_file, name="logger", log_level=logging.INFO):
     file_handler.setFormatter(formatter)
     logger.addHandler(file_handler)
 
-    logger = logging.LoggerAdapter(logger, extra)
+    if stdout:
+        stdout_handler = logging.StreamHandler()
+        stdout_handler.setFormatter(formatter)
+        logger.addHandler(stdout_handler)
 
+    logger = logging.LoggerAdapter(logger, extra)
     return logger
 
 

diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py
@@ -22,7 +22,7 @@
 from nemo_curator.utils.import_utils import gpu_only_import_from
 
 from .add_id import AddId
-from .config import FuzzyDuplicatesConfig
+from .config import FuzzyDuplicatesConfig, SemDedupConfig
 from .dataset_ops import blend_datasets, Shuffle
 from .exact_dedup import ExactDuplicates
 from .filter import Filter, Score, ScoreFilter
@@ -36,10 +36,19 @@
 FuzzyDuplicates = gpu_only_import_from(
     "nemo_curator.modules.fuzzy_dedup", "FuzzyDuplicates"
 )
-
 # Pytorch related imports must come after all imports that require cugraph,
 # because of context cleanup issues b/w pytorch and cugraph
 # See this issue: https://github.com/rapidsai/cugraph/issues/2718
+SemDedup = gpu_only_import_from("nemo_curator.modules.semantic_dedup", "SemDedup")
+EmbeddingCreator = gpu_only_import_from(
+    "nemo_curator.modules.semantic_dedup", "EmbeddingCreator"
+)
+ClusteringModel = gpu_only_import_from(
+    "nemo_curator.modules.semantic_dedup", "ClusteringModel"
+)
+SemanticClusterLevelDedup = gpu_only_import_from(
+    "nemo_curator.modules.semantic_dedup", "SemanticClusterLevelDedup"
+)
 from .distributed_data_classifier import DomainClassifier, QualityClassifier
 
 __all__ = [
@@ -59,4 +68,9 @@
     "AddId",
     "blend_datasets",
     "Shuffle",
+    "SemDedup",
+    "SemDedupConfig",
+    "EmbeddingCreator",
+    "ClusteringModel",
+    "SemanticClusterLevelDedup",
 ]
diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py
@@ -13,7 +13,8 @@
 # limitations under the License.
 
 import warnings
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+from typing import List
 
 import yaml
 
@@ -98,3 +99,70 @@ def __post_init__(self):
             raise ValueError("Jaccard Threshold must be between [0,1]")
         if self.buckets_per_shuffle <= 0:
             raise ValueError("Buckets per shuffle must be greater than 0")
+
+
+@dataclass
+class SemDedupConfig(BaseConfig):
+    """
+    Configuration for Semantic Deduplication.
+
+    Attributes:
+        cache_dir (str): Directory to store cache.
+        num_files (int): Number of files. Default is -1, meaning all files.
+        id_col_name (str): Column name for ID.
+        id_col_type (str): Column type for ID.
+        input_column (str): Input column for embeddings.
+        embeddings_save_loc (str): Location to save embeddings.
+        embedding_model_name_or_path (str): Model name or path for embeddings.
+        embedding_batch_size (int): Initial batch size for processing embeddings.
+        embedding_max_mem_gb (int): Maximum memory in GB for embeddings.
+        clustering_save_loc (str): Location to save clustering results.
+        n_clusters (int): Number of clusters.
+        seed (int): Seed for clustering.
+        max_iter (int): Maximum iterations for clustering.
+        kmeans_with_cos_dist (bool): Use KMeans with cosine distance.
+        which_to_keep (str): Which duplicates to keep.
+        largest_cluster_size_to_process (int): Largest cluster size to process.
+        sim_metric (str): Similarity metric for deduplication.
+        eps_thresholds (List[float]): Epsilon thresholds to calculate if semantically similar or not.
+        eps_to_extract (float): Epsilon value to extract deduplicated data.
+    """
+
+    cache_dir: str
+    num_files: int = -1
+    id_col_name: str = "id"
+    id_col_type: str = "str"
+    input_column: str = "text"
+
+    # Embeddings
+    embeddings_save_loc: str = "embeddings"
+    embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2"
+    embedding_batch_size: int = 128
+    embedding_max_mem_gb: int = 25
+
+    # Clustering config
+    clustering_save_loc: str = "clustering_results"
+    n_clusters: int = 1000
+    seed: int = 1234
+    max_iter: int = 100
+    kmeans_with_cos_dist: bool = False
+
+    # Semdedup config
+    which_to_keep: str = "hard"
+    largest_cluster_size_to_process: int = 100000
+    sim_metric: str = "cosine"
+
+    # Extract dedup config
+    eps_thresholds: List[float] = field(default_factory=lambda: [0.01, 0.001])
+    eps_to_extract: float = 0.01
+
+    def __post_init__(self):
+        if self.cache_dir is None:
+            raise ValueError(
+                "Finding sem-dedup requires a cache directory accessible via all workers to store intermediates"
+            )
+
+        if self.eps_to_extract not in self.eps_thresholds:
+            raise ValueError(
+                f"Epsilon to extract {self.eps_to_extract} must be in eps_thresholds {self.eps_thresholds}"
+            )
Original file line number	Diff line number	Diff line change
Expand Up		@@ -46,4 +46,3 @@
		personalidentifiableinformationidentificationandremoval.rst
		distributeddataclassification.rst
		kubernetescurator.rst