|
16 | 16 | import re
|
17 | 17 | from pathlib import Path
|
18 | 18 |
|
| 19 | +import pandas as pd |
19 | 20 | import pytest
|
| 21 | +from dask import dataframe as dd |
| 22 | +from dask.distributed import Client, LocalCluster |
20 | 23 |
|
| 24 | +import nemo_curator as nc |
| 25 | +from nemo_curator.datasets import DocumentDataset |
| 26 | +from nemo_curator.filters import DocumentFilter |
| 27 | +from nemo_curator.modifiers import PiiModifier |
21 | 28 | from nemo_curator.pii.algorithm import PiiDeidentifier
|
| 29 | +from nemo_curator.utils.decorators import batched |
22 | 30 |
|
23 | 31 | LOGGER = logging.getLogger(__name__)
|
24 | 32 |
|
@@ -118,3 +126,63 @@ def test_batch_accuracy(self):
|
118 | 126 | match = all(compare_outputs(x, y) for x, y in zip(outputs, targets))
|
119 | 127 | print("Matches:", "No" if not match else "Yes")
|
120 | 128 | assert match == True
|
| 129 | + |
| 130 | + |
class BatchedLengthFilter(DocumentFilter):
    """
    Document filter that retains texts whose character count falls inside
    the inclusive range ``[min_length, max_length]``.

    Both hooks are decorated with ``@batched`` and apply vectorized
    operations to their argument instead of handling one document at a time.
    """

    def __init__(self, min_length=5, max_length=10):
        super().__init__()
        self.min_length, self.max_length = min_length, max_length

    @batched
    def score_document(self, df):
        # Despite its name, ``df`` must expose the ``.str`` accessor
        # (presumably a Series of document texts -- confirm against caller).
        lengths = df.str.len()
        return lengths

    @batched
    def keep_document(self, scores):
        # Both bounds are inclusive.
        return (scores >= self.min_length) & (scores <= self.max_length)
| 150 | + |
| 151 | + |
class TestPIIModule:
    """Checks that PII masking composes with a batched filter in one pipeline."""

    def test_filter_chain(self):
        """Run a length filter followed by PII masking and verify the text.

        The pipeline first applies ``BatchedLengthFilter(min_length=0,
        max_length=25)`` and then ``PiiModifier`` with the ``mask`` action.
        The expected output has one row fewer than the input because the
        third input is 36 characters long and exceeds the length limit.
        """
        inputs = [
            "Alice goes on a walk",
            "Bob goes on a walk",
            "Someone named Charlie goes on a walk",
            "A human walking is David",
            "A human walking is Eliza",
        ]
        targets = [
            "***** goes on a walk",
            "*** goes on a walk",
            "A human walking is *****",
            "A human walking is *****",
        ]
        input_df = pd.DataFrame({"text": inputs})
        with LocalCluster(n_workers=1, threads_per_worker=1) as cluster, Client(
            cluster
        ):
            input_dataset = DocumentDataset(dd.from_pandas(input_df, npartitions=1))
            pipeline = nc.Sequential(
                [
                    nc.ScoreFilter(
                        BatchedLengthFilter(min_length=0, max_length=25)
                    ),
                    nc.Modify(
                        PiiModifier(
                            language="en", anonymize_action="mask", device="cpu"
                        )
                    ),
                ]
            )
            output_dataset = pipeline(input_dataset)

            # Materialize while the client is still active so the lazy
            # dask graph is executed on the local cluster.
            output_df = output_dataset.df.compute().reset_index(drop=True)

        # Compare plain lists rather than reducing an element-wise Series
        # comparison with all(): a wrong row count then fails as an ordinary
        # assertion (instead of a pandas label-mismatch ValueError) and
        # pytest reports which entries differ.
        assert output_df["text"].tolist() == targets
0 commit comments