Skip to content

Commit

Permalink
Merge branch 'main' into rywolf/justext
Browse files Browse the repository at this point in the history
Signed-off-by: Ryan Wolf <[email protected]>
  • Loading branch information
ryantwolf committed Jul 8, 2024
2 parents c888411 + e557ee3 commit f513b4c
Show file tree
Hide file tree
Showing 18 changed files with 1,683 additions and 12 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ NeMo Curator provides a collection of scalable data-mining modules. Some of the

- [Document-level deduplication](docs/user-guide/gpudeduplication.rst)

- Both exact and fuzzy (near-identical) deduplication are accelerated using cuDF and Dask
- Exact and fuzzy (near-identical) deduplication are accelerated using cuDF and Dask
- For fuzzy deduplication, our implementation follows the method described in [Microsoft Turing NLG 530B](https://arxiv.org/abs/2201.11990)
- For semantic deduplication, our implementation follows the method described in [SemDeDup](https://arxiv.org/pdf/2303.09540) by Meta AI (FAIR) (https://github.com/facebookresearch/SemDeDup)

- [Multilingual downstream-task decontamination](docs/user-guide/taskdecontamination.rst) following the approach of [OpenAI GPT3](https://arxiv.org/pdf/2005.14165.pdf) and [Microsoft Turing NLG 530B](https://arxiv.org/abs/2201.11990)

Expand Down
32 changes: 32 additions & 0 deletions config/sem_dedup_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Configuration file for semantic dedup
cache_dir: "semdedup_cache"
num_files: 16
id_col_name: "id"
id_col_type: "int"
input_column: "text"

# Embeddings configuration
embeddings_save_loc: "embeddings"
embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
embedding_batch_size: 128
embedding_max_mem_gb: 25

# Clustering configuration
clustering_save_loc: "clustering_results"
n_clusters: 1000
seed: 1234
max_iter: 100
kmeans_with_cos_dist: false

# Semdedup configuration
which_to_keep: "hard"
largest_cluster_size_to_process: 100000
sim_metric: "cosine"

# Extract dedup configuration
eps_thresholds:
- 0.01
- 0.001

# Which threshold to use for extracting deduped data
eps_to_extract: 0.01
1 change: 0 additions & 1 deletion docs/user-guide/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,3 @@
personalidentifiableinformationidentificationandremoval.rst
distributeddataclassification.rst
kubernetescurator.rst

84 changes: 84 additions & 0 deletions examples/semdedup_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import time

from nemo_curator.datasets import DocumentDataset
from nemo_curator.log import create_logger
from nemo_curator.modules.config import SemDedupConfig
from nemo_curator.modules.semantic_dedup import SemDedup
from nemo_curator.utils.distributed_utils import get_client, read_data
from nemo_curator.utils.file_utils import (
expand_outdir_and_mkdir,
get_all_files_paths_under,
)
from nemo_curator.utils.script_utils import ArgumentHelper


def silence_hf_warnings():
    """Set the ``transformers`` logging verbosity to error level."""
    # Aliased so it is not confused with the stdlib ``logging`` module that
    # this file imports at the top.
    from transformers.utils import logging as hf_logging

    hf_logging.set_verbosity_error()


def main(args):
    """Run the end-to-end semantic deduplication example.

    Loads the SemDedup configuration from ``args.config_file``, creates a
    client from the client-related arguments, reads the input files with the
    cuDF backend, runs ``SemDedup`` on the resulting dataset and prints the
    head of the returned dataframe.

    The client's futures are cancelled and the client is closed even when a
    step fails, so a failing run does not leave the client open.

    Args:
        args: Parsed command-line arguments. This function reads
            ``config_file``, ``input_data_dir`` and ``input_file_type``
            directly; the whole namespace is also handed to
            ``ArgumentHelper.parse_client_args``.
    """
    semdedup_config = SemDedupConfig.from_yaml(args.config_file)
    client = get_client(**ArgumentHelper.parse_client_args(args))

    try:
        # Silence warnings in this process and, via the client, remotely.
        silence_hf_warnings()
        client.run(silence_hf_warnings)

        expand_outdir_and_mkdir(semdedup_config.cache_dir)
        logger = create_logger(
            rank=0,
            name="logger-end-to_end-semdup",
            log_file=os.path.join(semdedup_config.cache_dir, "compute_embeddings.log"),
            log_level=logging.INFO,
            stdout=True,
        )
        st = time.time()
        input_files = get_all_files_paths_under(
            root=args.input_data_dir,
        )
        # A non-positive ``num_files`` means "use every file found".
        if semdedup_config.num_files > 0:
            input_files = input_files[: semdedup_config.num_files]
        logger.info(f"Processing {len(input_files)} files")
        ddf = read_data(
            input_files=input_files,
            file_type=args.input_file_type,
            add_filename=False,
            backend="cudf",
        )
        dataset = DocumentDataset(ddf)
        semdup = SemDedup(semdedup_config, logger=logger)
        dedup_ids = semdup(dataset)
        print(dedup_ids.df.head())
        logger.info(f"Time taken: {time.time() - st}")
    finally:
        # Always release the client, including on failure.
        client.cancel(client.futures, force=True)
        client.close()


def attach_args():
    """Build the argument parser for this example.

    Returns:
        The parser produced by ``ArgumentHelper.parse_semdedup_args`` with
        ``add_input_args=True``.
    """
    return ArgumentHelper.parse_semdedup_args(add_input_args=True)


def console_script():
    """Console entry point: parse the command-line arguments and run ``main``."""
    cli_parser = attach_args()
    parsed_args = cli_parser.parse_args()
    main(parsed_args)


if __name__ == "__main__":
    # console_script() parses the arguments and calls main(), which is exactly
    # what running this file as a script needs to do.
    console_script()
8 changes: 6 additions & 2 deletions nemo_curator/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from nemo_curator.utils.file_utils import expand_outdir_and_mkdir


def create_logger(rank, log_file, name="logger", log_level=logging.INFO):
def create_logger(rank, log_file, name="logger", log_level=logging.INFO, stdout=False):
# Create the logger
logger = logging.getLogger(name)
logger.setLevel(log_level)
Expand All @@ -36,8 +36,12 @@ def create_logger(rank, log_file, name="logger", log_level=logging.INFO):
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

logger = logging.LoggerAdapter(logger, extra)
if stdout:
stdout_handler = logging.StreamHandler()
stdout_handler.setFormatter(formatter)
logger.addHandler(stdout_handler)

logger = logging.LoggerAdapter(logger, extra)
return logger


Expand Down
18 changes: 16 additions & 2 deletions nemo_curator/modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from nemo_curator.utils.import_utils import gpu_only_import_from

from .add_id import AddId
from .config import FuzzyDuplicatesConfig
from .config import FuzzyDuplicatesConfig, SemDedupConfig
from .dataset_ops import blend_datasets, Shuffle
from .exact_dedup import ExactDuplicates
from .filter import Filter, Score, ScoreFilter
Expand All @@ -36,10 +36,19 @@
FuzzyDuplicates = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup", "FuzzyDuplicates"
)

# PyTorch-related imports must come after all imports that require cuGraph,
# because of context cleanup issues between PyTorch and cuGraph.
# See this issue: https://github.com/rapidsai/cugraph/issues/2718
SemDedup = gpu_only_import_from("nemo_curator.modules.semantic_dedup", "SemDedup")
EmbeddingCreator = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup", "EmbeddingCreator"
)
ClusteringModel = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup", "ClusteringModel"
)
SemanticClusterLevelDedup = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup", "SemanticClusterLevelDedup"
)
from .distributed_data_classifier import DomainClassifier, QualityClassifier

__all__ = [
Expand All @@ -59,4 +68,9 @@
"AddId",
"blend_datasets",
"Shuffle",
"SemDedup",
"SemDedupConfig",
"EmbeddingCreator",
"ClusteringModel",
"SemanticClusterLevelDedup",
]
70 changes: 69 additions & 1 deletion nemo_curator/modules/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
# limitations under the License.

import warnings
from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import List

import yaml

Expand Down Expand Up @@ -98,3 +99,70 @@ def __post_init__(self):
raise ValueError("Jaccard Threshold must be between [0,1]")
if self.buckets_per_shuffle <= 0:
raise ValueError("Buckets per shuffle must be greater than 0")


@dataclass
class SemDedupConfig(BaseConfig):
    """
    Settings for semantic deduplication.

    Attributes:
        cache_dir (str): Directory used to store cache/intermediate results.
            Required; must not be None.
        num_files (int): Number of files. Defaults to -1, meaning all files.
        id_col_name (str): Name of the ID column. Defaults to "id".
        id_col_type (str): Type of the ID column. Defaults to "str".
        input_column (str): Input column used for embeddings. Defaults to
            "text".
        embeddings_save_loc (str): Location to save embeddings. Defaults to
            "embeddings".
        embedding_model_name_or_path (str): Model name or path used for
            embeddings. Defaults to "sentence-transformers/all-MiniLM-L6-v2".
        embedding_batch_size (int): Initial batch size for processing
            embeddings. Defaults to 128.
        embedding_max_mem_gb (int): Maximum memory in GB for embeddings.
            Defaults to 25.
        clustering_save_loc (str): Location to save clustering results.
            Defaults to "clustering_results".
        n_clusters (int): Number of clusters. Defaults to 1000.
        seed (int): Seed for clustering. Defaults to 1234.
        max_iter (int): Maximum iterations for clustering. Defaults to 100.
        kmeans_with_cos_dist (bool): Use KMeans with cosine distance.
            Defaults to False.
        which_to_keep (str): Which duplicates to keep. Defaults to "hard".
        largest_cluster_size_to_process (int): Largest cluster size to
            process. Defaults to 100000.
        sim_metric (str): Similarity metric for deduplication. Defaults to
            "cosine".
        eps_thresholds (List[float]): Epsilon thresholds used to decide
            whether documents are semantically similar. Defaults to
            [0.01, 0.001].
        eps_to_extract (float): Epsilon value used to extract deduplicated
            data; must be one of ``eps_thresholds``. Defaults to 0.01.
    """

    cache_dir: str
    num_files: int = -1
    id_col_name: str = "id"
    id_col_type: str = "str"
    input_column: str = "text"

    # Embedding settings
    embeddings_save_loc: str = "embeddings"
    embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_batch_size: int = 128
    embedding_max_mem_gb: int = 25

    # Clustering settings
    clustering_save_loc: str = "clustering_results"
    n_clusters: int = 1000
    seed: int = 1234
    max_iter: int = 100
    kmeans_with_cos_dist: bool = False

    # Semantic dedup settings
    which_to_keep: str = "hard"
    largest_cluster_size_to_process: int = 100000
    sim_metric: str = "cosine"

    # Settings for extracting the deduplicated data
    eps_thresholds: List[float] = field(default_factory=lambda: [0.01, 0.001])
    eps_to_extract: float = 0.01

    def __post_init__(self):
        """Validate the configuration right after dataclass initialization.

        Raises:
            ValueError: If ``cache_dir`` is None, or if ``eps_to_extract`` is
                not one of the values in ``eps_thresholds``.
        """
        cache_dir_missing = self.cache_dir is None
        if cache_dir_missing:
            raise ValueError(
                "Finding sem-dedup requires a cache directory accessible via all workers to store intermediates"
            )

        # Nothing more to check when the extraction epsilon is a listed threshold.
        if self.eps_to_extract in self.eps_thresholds:
            return

        raise ValueError(
            f"Epsilon to extract {self.eps_to_extract} must be in eps_thresholds {self.eps_thresholds}"
        )
Loading

0 comments on commit f513b4c

Please sign in to comment.