16
16
import pandas as pd
17
17
import pytest
18
18
19
- import nemo_curator
19
+ import nemo_curator as nc
20
20
from nemo_curator .datasets import DocumentDataset
21
21
22
22
@@ -41,10 +41,10 @@ def two_partition_dataset():
41
41
)
42
42
43
43
44
- class TestPrepareTaskData :
44
+ class TestAddId :
45
45
def test_basic_id (self , single_partition_dataset ):
46
46
id_field = "id"
47
- add_id = nemo_curator .AddId (id_field )
47
+ add_id = nc .AddId (id_field , start_index = 0 )
48
48
id_dataset = add_id (single_partition_dataset )
49
49
actual_ids = id_dataset .df [id_field ].compute ()
50
50
expected_ids = pd .Series (
@@ -63,7 +63,7 @@ def test_basic_id(self, single_partition_dataset):
63
63
64
64
def test_two_partitions (self , two_partition_dataset ):
65
65
id_field = "id"
66
- add_id = nemo_curator .AddId (id_field )
66
+ add_id = nc .AddId (id_field , start_index = 0 )
67
67
id_dataset = add_id (two_partition_dataset )
68
68
actual_ids = id_dataset .df [id_field ].compute ()
69
69
expected_ids = pd .Series (
@@ -83,7 +83,7 @@ def test_two_partitions(self, two_partition_dataset):
83
83
def test_id_prefix (self , two_partition_dataset ):
84
84
id_field = "id"
85
85
id_prefix = "my_id"
86
- add_id = nemo_curator .AddId (id_field , id_prefix = id_prefix )
86
+ add_id = nc .AddId (id_field , id_prefix = id_prefix , start_index = 0 )
87
87
id_dataset = add_id (two_partition_dataset )
88
88
actual_ids = id_dataset .df [id_field ].compute ()
89
89
expected_ids = pd .Series (
@@ -103,7 +103,7 @@ def test_id_prefix(self, two_partition_dataset):
103
103
def test_start_index (self , two_partition_dataset ):
104
104
id_field = "id"
105
105
start_index = 13
106
- add_id = nemo_curator .AddId (id_field , start_index = start_index )
106
+ add_id = nc .AddId (id_field , start_index = start_index )
107
107
id_dataset = add_id (two_partition_dataset )
108
108
actual_ids = id_dataset .df [id_field ].compute ()
109
109
expected_ids = pd .Series (
@@ -119,3 +119,41 @@ def test_start_index(self, two_partition_dataset):
119
119
assert all (
120
120
expected_ids == actual_ids
121
121
), f"Expected: { expected_ids } , got: { actual_ids } "
122
+
123
+ def test_fast_id_single_partition (self , single_partition_dataset ):
124
+ id_field = "id"
125
+ add_id = nc .AddId (id_field )
126
+ id_dataset = add_id (single_partition_dataset )
127
+ actual_ids = id_dataset .df [id_field ].compute ()
128
+ expected_ids = pd .Series (
129
+ [
130
+ "doc_id-00" ,
131
+ "doc_id-10" ,
132
+ "doc_id-20" ,
133
+ "doc_id-30" ,
134
+ "doc_id-40" ,
135
+ ]
136
+ )
137
+
138
+ assert all (
139
+ expected_ids == actual_ids
140
+ ), f"Expected: { expected_ids } , got: { actual_ids } "
141
+
142
+ def test_fast_id_two_partitions (self , two_partition_dataset ):
143
+ id_field = "id"
144
+ add_id = nc .AddId (id_field )
145
+ id_dataset = add_id (two_partition_dataset )
146
+ actual_ids = id_dataset .df [id_field ].compute ()
147
+ expected_ids = pd .Series (
148
+ [
149
+ "doc_id-00" ,
150
+ "doc_id-10" ,
151
+ "doc_id-20" ,
152
+ "doc_id-01" ,
153
+ "doc_id-11" ,
154
+ ]
155
+ )
156
+
157
+ assert all (
158
+ expected_ids == actual_ids
159
+ ), f"Expected: { expected_ids } , got: { actual_ids } "
0 commit comments