diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md
index 2b51b3ba4b57..95d6ae407882 100644
--- a/examples/research_projects/codeparrot/README.md
+++ b/examples/research_projects/codeparrot/README.md
@@ -37,20 +37,25 @@ Additionally, sure you have git-lfs installed. You can find instructions for how
 The source of the dataset is the GitHub dump available on Google's [BigQuery](https://cloud.google.com/blog/topics/public-datasets/github-on-bigquery-analyze-all-the-open-source-code). The database was queried for all Python files with less than 1MB in size resulting in a 180GB dataset with over 20M files. The dataset is available on the Hugging Face Hub [here](https://huggingface.co/datasets/transformersbook/codeparrot).

 ### Preprocessing
-The raw dataset contains many duplicates. We deduplicated and filtered the dataset using the heuristics proposed in OpenAI's Codex [paper](https://arxiv.org/abs/2107.03374):
+The raw dataset contains many duplicates. We deduplicated and filtered the dataset using the heuristics proposed in OpenAI's Codex [paper](https://arxiv.org/abs/2107.03374) and some new ones:
 - exact deduplication using each file's hash
 - filtering files with max line length > 1000
 - filtering files with mean line length > 100
 - fraction of alphanumeric characters < 0.25
 - containing the word "auto-generated" or similar in the first 5 lines
+- filtering with a probability of 0.7 of files with a mention of "test file" or "configuration file" or similar in the first 5 lines
+- filtering with a probability of 0.7 of files with high occurrence of the keywords "test " or "config"
+- filtering with a probability of 0.7 of files without a mention of the keywords `def`, `for`, `while` and `class`
+- filtering files that use the assignment operator `=` less than 5 times
+- filtering files with a ratio of characters to tokens after tokenization < 1.5 (the average ratio is 3.6)

-The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/lvwerra/codeparrot-clean-train) and [validation](https://huggingface.co/datasets/lvwerra/codeparrot-clean-valid) splits are also available on the Hub if you want to skip this step or use the data for another project.
+The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-train-v2) and [validation](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-valid-v2) splits are also available on the Hub if you want to skip this step or use the data for another project.

 To execute the preprocessing run the following command:
 ```bash
 python scripts/preprocessing.py \
---dataset_name lvwerra/codeparrot \
+--dataset_name transformersbook/codeparrot \
 --output_dir codeparrot-clean
 ```

 During preprocessing the dataset is downloaded and stored locally as well as caches of the computations. Make sure you have more than 500GB free disk space to execute it.
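A note on the last new bullet: the character/token ratio is just the file length in characters divided by the number of tokens the CodeParrot tokenizer produces for it, so files that tokenize inefficiently (encoded data, near-random strings) fall well below 1.5, while typical Python averages around 3.6. Below is a minimal standalone sketch of that check (illustration only, not part of this diff; the PR's own implementation is `char_token_ratio` in `scripts/preprocessing.py` further down, and the tokenizer name is the default of the new `tokenizer` argument):

```python
from transformers import AutoTokenizer

# Tokenizer assumed here: the default of the new `tokenizer` argument added in arguments.py.
tokenizer = AutoTokenizer.from_pretrained("lvwerra/codeparrot")


def char_token_ratio(content: str) -> float:
    """Characters per token: ordinary Python averages ~3.6; files below 1.5 get filtered."""
    input_ids = tokenizer(content, truncation=False)["input_ids"]
    return len(content) / len(input_ids)


print(char_token_ratio("def add(a, b):\n    return a + b\n"))
```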
diff --git a/examples/research_projects/codeparrot/scripts/arguments.py b/examples/research_projects/codeparrot/scripts/arguments.py
index a94cda2d2f1b..c05e06f5640a 100644
--- a/examples/research_projects/codeparrot/scripts/arguments.py
+++ b/examples/research_projects/codeparrot/scripts/arguments.py
@@ -130,7 +130,7 @@ class PreprocessingArguments:
         },
     )
     dataset_name: Optional[str] = field(
-        default="codeparrot", metadata={"help": "Folder or name of dataset to process."}
+        default="transformersbook/codeparrot", metadata={"help": "Folder or name of dataset to process."}
     )
     output_dir: Optional[str] = field(
         default="codeparrot-clean", metadata={"help": "Folder to save processed processed dataset."}
@@ -148,6 +148,16 @@ class PreprocessingArguments:
     alpha_frac: Optional[float] = field(
         default=0.25, metadata={"help": "Maximum fraction of non-alphanumeric characters, otherwise file is filtered."}
     )
+    min_token_ratio: Optional[float] = field(
+        default=1.5, metadata={"help": "Minimum character-to-token ratio for the file, otherwise file is filtered."}
+    )
+    filter_proba: Optional[float] = field(
+        default=0.7, metadata={"help": "Probability for filtering config, test and uncommon files."}
+    )
+    tokenizer: Optional[str] = field(
+        default="lvwerra/codeparrot",
+        metadata={"help": "Name or path to the tokenizer."},
+    )


 @dataclass
diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py
index 4e09379a943f..0e5899f5de9a 100644
--- a/examples/research_projects/codeparrot/scripts/preprocessing.py
+++ b/examples/research_projects/codeparrot/scripts/preprocessing.py
@@ -9,7 +9,7 @@
 from datasets import load_dataset

 from arguments import PreprocessingArguments
-from transformers import HfArgumentParser
+from transformers import AutoTokenizer, HfArgumentParser


 def get_hash(example):
@@ -50,18 +50,77 @@ def is_autogenerated(example, scan_width=5):
     return {"autogenerated": False}


+def is_config_or_test(example, scan_width=5, coeff=0.05):
+    """Check if file is a configuration file or a unit test by:
+    1- looking for keywords in the first few lines of the file.
+    2- counting the number of occurrences of the words 'config' and 'test' with respect to the number of lines.
+ """ + + keywords = ["unit tests", "test file", "configuration file"] + lines = example["content"].splitlines() + count_config = 0 + count_test = 0 + # first test + for _, line in zip(range(scan_width), lines): + for keyword in keywords: + if keyword in line.lower(): + return {"config_or_test": True} + # second test + nlines = example["content"].count("\n") + threshold = int(coeff * nlines) + for line in lines: + count_config += line.lower().count("config") + count_test += line.lower().count("test") + if count_config > threshold or count_test > threshold: + return {"config_or_test": True} + return {"config_or_test": False} + + +def has_no_keywords(example): + """Check if a python file has none of the keywords for: funcion, class, for loop, while loop.""" + keywords = ["def ", "class ", "for ", "while "] + lines = example["content"].splitlines() + for line in lines: + for keyword in keywords: + if keyword in line.lower(): + return {"has_no_keywords": False} + return {"has_no_keywords": True} + + +def has_few_assignments(example, minimum=4): + """Check if file uses symbol '=' less than `minimum` times.""" + lines = example["content"].splitlines() + counter = 0 + for line in lines: + counter += line.lower().count("=") + if counter > minimum: + return {"has_few_assignments": False} + return {"has_few_assignments": True} + + +def char_token_ratio(example): + """Compute character/token ratio of the file with tokenizer.""" + input_ids = tokenizer(example["content"], truncation=False)["input_ids"] + ratio = len(example["content"]) / len(input_ids) + return {"ratio": ratio} + + def preprocess(example): """Chain all preprocessing steps into one function to not fill cache.""" results = dict() results.update(get_hash(example)) results.update(line_stats(example)) results.update(alpha_stats(example)) + results.update(char_token_ratio(example)) results.update(is_autogenerated(example)) + results.update(is_config_or_test(example)) + results.update(has_no_keywords(example)) + results.update(has_few_assignments(example)) return results def filter(example, uniques, args): - """Filter dataset with heuristics.""" + """Filter dataset with heuristics. Config, test and has_no_keywords files are removed with a given probability.""" if not check_uniques(example, uniques): return False elif example["autogenerated"]: @@ -72,6 +131,14 @@ def filter(example, uniques, args): return False elif example["alpha_frac"] < args.alpha_frac: return False + elif example["ratio"] < args.min_token_ratio: + return False + elif example["config_or_test"] and np.random.rand() <= args.filter_proba: + return False + elif example["has_no_keywords"] and np.random.rand() <= args.filter_proba: + return False + elif example["has_few_assignments"]: + return False else: return True @@ -89,6 +156,7 @@ def compress_file(file_path): args = parser.parse_args() if args.num_workers is None: args.num_workers = multiprocessing.cpu_count() +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir) # Load dataset t_start = time.time()