From 02f3ee062e046c6a7e3e7d8ddb0718783c10c123 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Tue, 26 Apr 2022 11:34:26 +0000 Subject: [PATCH 01/12] add new preprocessing arguments --- .../codeparrot/scripts/arguments.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/research_projects/codeparrot/scripts/arguments.py b/examples/research_projects/codeparrot/scripts/arguments.py index a94cda2d2f1b..5bf2fa0efd72 100644 --- a/examples/research_projects/codeparrot/scripts/arguments.py +++ b/examples/research_projects/codeparrot/scripts/arguments.py @@ -130,7 +130,7 @@ class PreprocessingArguments: }, ) dataset_name: Optional[str] = field( - default="codeparrot", metadata={"help": "Folder or name of dataset to process."} + default="transformersbook/codeparrot", metadata={"help": "Folder or name of dataset to process."} ) output_dir: Optional[str] = field( default="codeparrot-clean", metadata={"help": "Folder to save processed processed dataset."} @@ -148,6 +148,16 @@ class PreprocessingArguments: alpha_frac: Optional[float] = field( default=0.25, metadata={"help": "Maximum fraction of non-alphanumeric characters, otherwise file is filtered."} ) + min_token_ratio: Optional[float] = field( + default=1.5, metadata={"help": "Minimum character token ratio for the file, otherwise file is filtered."} + ) + filter_proba: Optional[float] = field( + default=0.7, metadata={"help": "Probability for filtering config, test and uncommon files."} + ) + tokenizer_dir: Optional[str] = field( + default="lvwerra/codeparrot", + metadata={"help": "Name or path to the tokenizer."}, + ) @dataclass From 10b8e5c0e375bbf527be027850bc94990c86a889 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Tue, 26 Apr 2022 11:35:00 +0000 Subject: [PATCH 02/12] add new filters --- .../codeparrot/scripts/preprocessing.py | 72 ++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py 
b/examples/research_projects/codeparrot/scripts/preprocessing.py index 4e09379a943f..5ef12b430c98 100644 --- a/examples/research_projects/codeparrot/scripts/preprocessing.py +++ b/examples/research_projects/codeparrot/scripts/preprocessing.py @@ -9,7 +9,7 @@ from datasets import load_dataset from arguments import PreprocessingArguments -from transformers import HfArgumentParser +from transformers import AutoTokenizer, HfArgumentParser def get_hash(example): @@ -50,18 +50,77 @@ def is_autogenerated(example, scan_width=5): return {"autogenerated": False} +def is_config_test(example, scan_width=5, max_config=4, max_test=4, max_len=8000): + """Check if file is a configuration file or a unit test by : + 1- looking for keywords in the first few lines of the file + 2- counting number of occurence of the words 'config' and 'test'. + Remark: for the second test we only check files with less than 8000 characters""" + + keywords = ["unit tests", "test file", "configuration file"] + lines = example["content"].splitlines() + count_config = 0 + count_test = 0 + # first test + for _, line in zip(range(scan_width), lines): + for keyword in keywords: + if keyword in line.lower(): + return {"config_test": True} + # second test + if len(example["content"]) < max_len: + for line in lines: + count_config += line.lower().count("config") + count_test += line.lower().count("test") + if count_config > max_config or count_test > max_test: + return {"config_test": True} + return {"config_test": False} + + +def is_uncommon(example): + """Check if a python file is uncommon, we define uncommun files as those without at + least one of the following: funcion, class, for loop, while loop""" + keywords = ["def ", "class ", "for ", "while "] + lines = example["content"].splitlines() + for line in lines: + for keyword in keywords: + if keyword in line.lower(): + return {"uncommon": False} + return {"uncommon": True} + + +def is_unusual(example, min=4): + """Check if file is unusual, uses symbol '=' 
less than min times""" + lines = example["content"].splitlines() + counter = 0 + for line in lines: + counter += line.lower().count("=") + if counter > min: + return {"unusual": False} + return {"unusual": True} + + +def get_char_token_ratio(example): + """Get character/token ratio of the file using BPE tokenizer.""" + input_ids = tokenizer(example["content"], truncation=False)["input_ids"] + ratio = len(example["content"]) / len(input_ids) + return {"ratio": ratio} + + def preprocess(example): """Chain all preprocessing steps into one function to not fill cache.""" results = dict() results.update(get_hash(example)) results.update(line_stats(example)) results.update(alpha_stats(example)) + results.update(get_char_token_ratio(example)) results.update(is_autogenerated(example)) + results.update(is_config_test(example)) + results.update(is_uncommon(example)) + results.update(is_unusual(example)) return results def filter(example, uniques, args): - """Filter dataset with heuristics.""" + """Filter dataset with heuristics. 
Config, test and uncommon files are removed with a given probability""" if not check_uniques(example, uniques): return False elif example["autogenerated"]: @@ -72,6 +131,14 @@ def filter(example, uniques, args): return False elif example["alpha_frac"] < args.alpha_frac: return False + elif example["ratio"] < args.min_token_ratio: + return False + elif example["config_test"] and np.random.rand() <= args.filter_proba: + return False + elif example["uncommon"] and np.random.rand() <= args.filter_proba: + return False + elif example["unusual"]: + return False else: return True @@ -89,6 +156,7 @@ def compress_file(file_path): args = parser.parse_args() if args.num_workers is None: args.num_workers = multiprocessing.cpu_count() +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir) # Load dataset t_start = time.time() From d947a33da263e198b7bec53dea8470bcb7e7d5d2 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Tue, 26 Apr 2022 12:24:59 +0000 Subject: [PATCH 03/12] add new filters to readme --- examples/research_projects/codeparrot/README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md index 2b51b3ba4b57..f21a1c458475 100644 --- a/examples/research_projects/codeparrot/README.md +++ b/examples/research_projects/codeparrot/README.md @@ -37,7 +37,7 @@ Additionally, sure you have git-lfs installed. You can find instructions for how The source of the dataset is the GitHub dump available on Google's [BigQuery](https://cloud.google.com/blog/topics/public-datasets/github-on-bigquery-analyze-all-the-open-source-code). The database was queried for all Python files with less than 1MB in size resulting in a 180GB dataset with over 20M files. The dataset is available on the Hugging Face Hub [here](https://huggingface.co/datasets/transformersbook/codeparrot). ### Preprocessing -The raw dataset contains many duplicates. 
We deduplicated and filtered the dataset using the heuristics proposed in OpenAI's Codex [paper](https://arxiv.org/abs/2107.03374): +The raw dataset contains many duplicates. We deduplicated and filtered the dataset using the heuristics proposed in OpenAI's Codex [paper](https://arxiv.org/abs/2107.03374) and some new ones: - exact deduplication using each file's hash - filtering files with max line length > 1000 @@ -45,12 +45,18 @@ The raw dataset contains many duplicates. We deduplicated and filtered the datas - fraction of alphanumeric characters < 0.25 - containing the word "auto-generated" or similar in the first 5 lines +- filtering files with ratio between number of characters and number of tokens after tokenization < 1.5 (the average ratio is 3.6) +- filtering with a probability of 0.7 of files with a mention of "test file" or "configuration file" or similar in the first 5 lines +- filtering with a probability of 0.7 of short and medium sized files that mention the words "test " or "config" more than 4 times +- filtering with a probability of 0.7 of files without a mention of the keywords `def` , `for`, `while` and `class` +- filtering files that use the assignment operator `=` less than 5 times + The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/lvwerra/codeparrot-clean-train) and [validation](https://huggingface.co/datasets/lvwerra/codeparrot-clean-valid) splits are also available on the Hub if you want to skip this step or use the data for another project. To execute the preprocessing run the following command: ```bash python scripts/preprocessing.py \ ---dataset_name lvwerra/codeparrot \ +--dataset_name transformersbook/codeparrot \ --output_dir codeparrot-clean ``` During preprocessing the dataset is downloaded and stored locally as well as caches of the computations. 
Make sure you have more than 500GB free disk space to execute it. From 9fb2e4b1b983e5cf2c62959c9e270416932076e7 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Wed, 27 Apr 2022 09:56:43 +0000 Subject: [PATCH 04/12] fix config and test count, update function names and docstrings --- .../codeparrot/scripts/preprocessing.py | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py index 5ef12b430c98..e181ce088557 100644 --- a/examples/research_projects/codeparrot/scripts/preprocessing.py +++ b/examples/research_projects/codeparrot/scripts/preprocessing.py @@ -50,11 +50,11 @@ def is_autogenerated(example, scan_width=5): return {"autogenerated": False} -def is_config_test(example, scan_width=5, max_config=4, max_test=4, max_len=8000): +def is_config_test(example, scan_width=5, coeff=5): """Check if file is a configuration file or a unit test by : - 1- looking for keywords in the first few lines of the file - 2- counting number of occurence of the words 'config' and 'test'. - Remark: for the second test we only check files with less than 8000 characters""" + 1- looking for keywords in the first few lines of the file. + 2- counting number of occurence of the words 'config' and 'test' with respect to number of lines. 
+ """ keywords = ["unit tests", "test file", "configuration file"] lines = example["content"].splitlines() @@ -66,40 +66,40 @@ def is_config_test(example, scan_width=5, max_config=4, max_test=4, max_len=8000 if keyword in line.lower(): return {"config_test": True} # second test - if len(example["content"]) < max_len: - for line in lines: - count_config += line.lower().count("config") - count_test += line.lower().count("test") - if count_config > max_config or count_test > max_test: - return {"config_test": True} + nlines = example["content"].count('\n') + threshold = coeff if nlines < 300 else coeff * (nlines//300) + for line in lines: + count_config += line.lower().count("config") + count_test += line.lower().count("test") + if count_config > threshold or count_test > threshold: + return {"config_test": True} return {"config_test": False} -def is_uncommon(example): - """Check if a python file is uncommon, we define uncommun files as those without at - least one of the following: funcion, class, for loop, while loop""" +def has_no_keywords(example): + """Check if a python file has none of the keywords for: funcion, class, for loop, while loop.""" keywords = ["def ", "class ", "for ", "while "] lines = example["content"].splitlines() for line in lines: for keyword in keywords: if keyword in line.lower(): - return {"uncommon": False} - return {"uncommon": True} + return {"has_no_keywords": False} + return {"has_no_keywords": True} -def is_unusual(example, min=4): - """Check if file is unusual, uses symbol '=' less than min times""" +def few_assignments(example, minimum=4): + """Check if file uses symbol '=' less than `minimum` times.""" lines = example["content"].splitlines() counter = 0 for line in lines: counter += line.lower().count("=") - if counter > min: - return {"unusual": False} - return {"unusual": True} + if counter > minimum: + return {"few_assignments": False} + return {"few_assignments": True} -def get_char_token_ratio(example): - """Get character/token 
ratio of the file using BPE tokenizer.""" +def char_token_ratio(example): + """Compute character/token ratio of the file with tokenizer.""" input_ids = tokenizer(example["content"], truncation=False)["input_ids"] ratio = len(example["content"]) / len(input_ids) return {"ratio": ratio} @@ -111,16 +111,16 @@ def preprocess(example): results.update(get_hash(example)) results.update(line_stats(example)) results.update(alpha_stats(example)) - results.update(get_char_token_ratio(example)) + results.update(char_token_ratio(example)) results.update(is_autogenerated(example)) results.update(is_config_test(example)) - results.update(is_uncommon(example)) - results.update(is_unusual(example)) + results.update(has_no_keywords(example)) + results.update(few_assignments(example)) return results def filter(example, uniques, args): - """Filter dataset with heuristics. Config, test and uncommon files are removed with a given probability""" + """Filter dataset with heuristics. Config, test and has_no_keywords files are removed with a given probability.""" if not check_uniques(example, uniques): return False elif example["autogenerated"]: @@ -135,9 +135,9 @@ def filter(example, uniques, args): return False elif example["config_test"] and np.random.rand() <= args.filter_proba: return False - elif example["uncommon"] and np.random.rand() <= args.filter_proba: + elif example["has_no_keywords"] and np.random.rand() <= args.filter_proba: return False - elif example["unusual"]: + elif example["few_assignments"]: return False else: return True From aebdd3a720b505647f92c606cb5053f4a55043f5 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Wed, 27 Apr 2022 10:06:43 +0000 Subject: [PATCH 05/12] reformat code --- .../research_projects/codeparrot/scripts/preprocessing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py index 
e181ce088557..a038ff68c90b 100644 --- a/examples/research_projects/codeparrot/scripts/preprocessing.py +++ b/examples/research_projects/codeparrot/scripts/preprocessing.py @@ -52,8 +52,8 @@ def is_autogenerated(example, scan_width=5): def is_config_test(example, scan_width=5, coeff=5): """Check if file is a configuration file or a unit test by : - 1- looking for keywords in the first few lines of the file. - 2- counting number of occurence of the words 'config' and 'test' with respect to number of lines. + 1- looking for keywords in the first few lines of the file. + 2- counting number of occurence of the words 'config' and 'test' with respect to number of lines. """ keywords = ["unit tests", "test file", "configuration file"] @@ -66,8 +66,8 @@ def is_config_test(example, scan_width=5, coeff=5): if keyword in line.lower(): return {"config_test": True} # second test - nlines = example["content"].count('\n') - threshold = coeff if nlines < 300 else coeff * (nlines//300) + nlines = example["content"].count("\n") + threshold = coeff if nlines < 300 else coeff * (nlines // 300) for line in lines: count_config += line.lower().count("config") count_test += line.lower().count("test") From bb5a5dedd371aad27ba8687a354e98072d4dad5a Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Thu, 28 Apr 2022 01:48:24 +0200 Subject: [PATCH 06/12] update readme --- examples/research_projects/codeparrot/README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md index f21a1c458475..5541a89027e2 100644 --- a/examples/research_projects/codeparrot/README.md +++ b/examples/research_projects/codeparrot/README.md @@ -44,14 +44,13 @@ The raw dataset contains many duplicates. 
We deduplicated and filtered the datas - filtering files with mean line length > 100 - fraction of alphanumeric characters < 0.25 - containing the word "auto-generated" or similar in the first 5 lines - -- filtering files with ratio between number of characters and number of tokens after tokenization < 1.5 (the average ratio is 3.6) - filtering with a probability of 0.7 of files with a mention of "test file" or "configuration file" or similar in the first 5 lines -- filtering with a probability of 0.7 of short and medium sized files that mention the words "test " or "config" more than 4 times +- filtering with a probability of 0.7 of files with high occurrence of the keywords "test " or "config" - filtering with a probability of 0.7 of files without a mention of the keywords `def` , `for`, `while` and `class` - filtering files that use the assignment operator `=` less than 5 times +- filtering files with ratio between number of characters and number of tokens after tokenization < 1.5 (the average ratio is 3.6) -The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/lvwerra/codeparrot-clean-train) and [validation](https://huggingface.co/datasets/lvwerra/codeparrot-clean-valid) splits are also available on the Hub if you want to skip this step or use the data for another project. +The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-train-new) and [validation](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-valid-new) splits are also available on the Hub if you want to skip this step or use the data for another project.
To execute the preprocessing run the following command: ```bash From 954eaefb7e19c5e7688f10957930b59d27f8bd83 Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Thu, 28 Apr 2022 01:58:44 +0200 Subject: [PATCH 07/12] Update readme --- examples/research_projects/codeparrot/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md index 5541a89027e2..95d6ae407882 100644 --- a/examples/research_projects/codeparrot/README.md +++ b/examples/research_projects/codeparrot/README.md @@ -50,7 +50,7 @@ The raw dataset contains many duplicates. We deduplicated and filtered the datas - filtering files that use the assignment operator `=` less than 5 times - filtering files with ratio between number of characters and number of tokens after tokenization < 1.5 (the average ratio is 3.6) -The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-train-new) and [validation](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-valid-new) splits are also available on the Hub if you want to skip this step or use the data for another project. +The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-train-v2) and [validation](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-valid-v2) splits are also available on the Hub if you want to skip this step or use the data for another project. 
To execute the preprocessing run the following command: ```bash From 9c0f87111b2e3a64cfbfefc0f16263c48afe0df3 Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Wed, 4 May 2022 11:01:36 +0200 Subject: [PATCH 08/12] rename config_test filter Co-authored-by: Leandro von Werra --- examples/research_projects/codeparrot/scripts/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py index a038ff68c90b..f6ca9eadf67a 100644 --- a/examples/research_projects/codeparrot/scripts/preprocessing.py +++ b/examples/research_projects/codeparrot/scripts/preprocessing.py @@ -50,7 +50,7 @@ def is_autogenerated(example, scan_width=5): return {"autogenerated": False} -def is_config_test(example, scan_width=5, coeff=5): +def is_config_or_test(example, scan_width=5, coeff=5): """Check if file is a configuration file or a unit test by : 1- looking for keywords in the first few lines of the file. 2- counting number of occurence of the words 'config' and 'test' with respect to number of lines. 
From c23a30f04a826238c9d86dd49134331e6a3d24ef Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Wed, 4 May 2022 11:02:05 +0200 Subject: [PATCH 09/12] rename few_assignments filter Co-authored-by: Leandro von Werra --- examples/research_projects/codeparrot/scripts/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py index f6ca9eadf67a..72ea440231bc 100644 --- a/examples/research_projects/codeparrot/scripts/preprocessing.py +++ b/examples/research_projects/codeparrot/scripts/preprocessing.py @@ -87,7 +87,7 @@ def has_no_keywords(example): return {"has_no_keywords": True} -def few_assignments(example, minimum=4): +def has_few_assignments(example, minimum=4): """Check if file uses symbol '=' less than `minimum` times.""" lines = example["content"].splitlines() counter = 0 From edbde3c12777918d91543a44ee56a56e0c4cbefc Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Wed, 4 May 2022 11:02:59 +0200 Subject: [PATCH 10/12] rename tokenizer in arguments Co-authored-by: Leandro von Werra --- examples/research_projects/codeparrot/scripts/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/codeparrot/scripts/arguments.py b/examples/research_projects/codeparrot/scripts/arguments.py index 5bf2fa0efd72..c05e06f5640a 100644 --- a/examples/research_projects/codeparrot/scripts/arguments.py +++ b/examples/research_projects/codeparrot/scripts/arguments.py @@ -154,7 +154,7 @@ class PreprocessingArguments: filter_proba: Optional[float] = field( default=0.7, metadata={"help": "Probability for filtering config, test and uncommon files."} ) - tokenizer_dir: Optional[str] = field( + tokenizer: Optional[str] = field( default="lvwerra/codeparrot", metadata={"help": "Name or path to the 
tokenizer."}, ) From b22a73f58e2774ea5cc93c01a0354cb090a43aec Mon Sep 17 00:00:00 2001 From: Loubna ben allal Date: Wed, 4 May 2022 09:30:24 +0000 Subject: [PATCH 11/12] rename functions and add limit_line argument for config_test filter --- .../codeparrot/scripts/preprocessing.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py index 72ea440231bc..c15a03dac256 100644 --- a/examples/research_projects/codeparrot/scripts/preprocessing.py +++ b/examples/research_projects/codeparrot/scripts/preprocessing.py @@ -50,7 +50,7 @@ def is_autogenerated(example, scan_width=5): return {"autogenerated": False} -def is_config_or_test(example, scan_width=5, coeff=5): +def is_config_or_test(example, scan_width=5, coeff=5, limit_lines=300): """Check if file is a configuration file or a unit test by : 1- looking for keywords in the first few lines of the file. 2- counting number of occurence of the words 'config' and 'test' with respect to number of lines. 
@@ -64,16 +64,16 @@ def is_config_or_test(example, scan_width=5, coeff=5): for _, line in zip(range(scan_width), lines): for keyword in keywords: if keyword in line.lower(): - return {"config_test": True} + return {"config_or_test": True} # second test nlines = example["content"].count("\n") - threshold = coeff if nlines < 300 else coeff * (nlines // 300) + threshold = coeff if nlines < limit_lines else coeff * (nlines // limit_lines) for line in lines: count_config += line.lower().count("config") count_test += line.lower().count("test") if count_config > threshold or count_test > threshold: - return {"config_test": True} - return {"config_test": False} + return {"config_or_test": True} + return {"config_or_test": False} def has_no_keywords(example): @@ -94,8 +94,8 @@ def has_few_assignments(example, minimum=4): for line in lines: counter += line.lower().count("=") if counter > minimum: - return {"few_assignments": False} - return {"few_assignments": True} + return {"has_few_assignments": False} + return {"has_few_assignments": True} def char_token_ratio(example): @@ -113,9 +113,9 @@ def preprocess(example): results.update(alpha_stats(example)) results.update(char_token_ratio(example)) results.update(is_autogenerated(example)) - results.update(is_config_test(example)) + results.update(is_config_or_test(example)) results.update(has_no_keywords(example)) - results.update(few_assignments(example)) + results.update(has_few_assignments(example)) return results @@ -133,11 +133,11 @@ def filter(example, uniques, args): return False elif example["ratio"] < args.min_token_ratio: return False - elif example["config_test"] and np.random.rand() <= args.filter_proba: + elif example["config_or_test"] and np.random.rand() <= args.filter_proba: return False elif example["has_no_keywords"] and np.random.rand() <= args.filter_proba: return False - elif example["few_assignments"]: + elif example["has_few_assignments"]: return False else: return True From 
74fe7fa7ed6aaf294e0b72d9e0ecec7fe79d4ed0 Mon Sep 17 00:00:00 2001 From: Loubna ben allal Date: Wed, 4 May 2022 13:07:10 +0000 Subject: [PATCH 12/12] update threshold for config_test filter --- .../research_projects/codeparrot/scripts/preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py index c15a03dac256..0e5899f5de9a 100644 --- a/examples/research_projects/codeparrot/scripts/preprocessing.py +++ b/examples/research_projects/codeparrot/scripts/preprocessing.py @@ -50,7 +50,7 @@ def is_autogenerated(example, scan_width=5): return {"autogenerated": False} -def is_config_or_test(example, scan_width=5, coeff=5, limit_lines=300): +def is_config_or_test(example, scan_width=5, coeff=0.05): """Check if file is a configuration file or a unit test by : 1- looking for keywords in the first few lines of the file. 2- counting number of occurence of the words 'config' and 'test' with respect to number of lines. @@ -67,7 +67,7 @@ def is_config_or_test(example, scan_width=5, coeff=5, limit_lines=300): return {"config_or_test": True} # second test nlines = example["content"].count("\n") - threshold = coeff if nlines < limit_lines else coeff * (nlines // limit_lines) + threshold = int(coeff * nlines) for line in lines: count_config += line.lower().count("config") count_test += line.lower().count("test")