diff --git a/examples/Error_Types.ipynb b/examples/Error_Types.ipynb
index 0a7f83b..5aabf75 100644
--- a/examples/Error_Types.ipynb
+++ b/examples/Error_Types.ipynb
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 17,
"id": "e031b356-c92e-4e6c-9422-ea968c81aa64",
"metadata": {},
"outputs": [],
@@ -41,7 +41,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 18,
"id": "eaf1bf17-08f2-4627-a6bb-318b60e3528d",
"metadata": {},
"outputs": [],
@@ -57,7 +57,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 19,
"id": "18931616-22c3-4f63-8e7d-c0710d924716",
"metadata": {},
"outputs": [],
@@ -110,7 +110,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 20,
"id": "351080b6-841f-4e5e-809f-0a237eaa4fe5",
"metadata": {},
"outputs": [
@@ -172,10 +172,10 @@
"
3.0 | \n",
" Alice | \n",
" 1984 | \n",
- " 3.0 | \n",
- " False | \n",
+ " 3.1 | \n",
" False | \n",
" False | \n",
+ " True | \n",
" \n",
" \n",
" | 2 | \n",
@@ -196,10 +196,10 @@
" 4.0 | \n",
" Bob | \n",
" The Great Gatsby | \n",
- " 4.1 | \n",
+ " 4.0 | \n",
+ " False | \n",
" False | \n",
" False | \n",
- " True | \n",
"
\n",
" \n",
" | 4 | \n",
@@ -242,14 +242,14 @@
" error_mask \n",
" rating typist book_title rating \n",
"0 1.0 False False False \n",
- "1 3.0 False False False \n",
+ "1 3.1 False False True \n",
"2 3.0 False False False \n",
- "3 4.1 False False True \n",
+ "3 4.0 False False False \n",
"4 2.0 False False False \n",
"5 1.0 False False False "
]
},
- "execution_count": 5,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -272,7 +272,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 21,
"id": "ec59d663-3e0a-45a8-a662-9ca8674f192c",
"metadata": {},
"outputs": [
@@ -346,10 +346,10 @@
" 3.0 | \n",
" Alice | \n",
" Pride and Prejudice | \n",
- " 3.0 | \n",
- " False | \n",
+ " 3.1 | \n",
" False | \n",
" False | \n",
+ " True | \n",
"
\n",
" \n",
" | 3 | \n",
@@ -370,10 +370,10 @@
" 2.0 | \n",
" Bob | \n",
" Moby-Dick | \n",
- " 2.1 | \n",
+ " 2.0 | \n",
+ " False | \n",
" False | \n",
" False | \n",
- " True | \n",
"
\n",
" \n",
" | 5 | \n",
@@ -405,13 +405,13 @@
" rating typist book_title rating \n",
"0 1.0 False False False \n",
"1 3.0 False False False \n",
- "2 3.0 False False False \n",
+ "2 3.1 False False True \n",
"3 4.0 False False False \n",
- "4 2.1 False False True \n",
+ "4 2.0 False False False \n",
"5 1.0 False False False "
]
},
- "execution_count": 6,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@@ -438,7 +438,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 22,
"id": "bcfc46a3-16dc-4427-b580-6d1dd09b9d87",
"metadata": {},
"outputs": [
@@ -577,7 +577,7 @@
"5 1.0 False False False "
]
},
- "execution_count": 7,
+ "execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
@@ -600,7 +600,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 23,
"id": "a07d9c15-c43c-48e0-beb4-b074855e557c",
"metadata": {},
"outputs": [
@@ -650,10 +650,10 @@
" 1.0 | \n",
" Alice | \n",
" To Kill a Mockingbird | \n",
- " 1.0 | \n",
- " False | \n",
+ " 9999.0 | \n",
" False | \n",
" False | \n",
+ " True | \n",
"
\n",
" \n",
" | 1 | \n",
@@ -698,10 +698,10 @@
" 2.0 | \n",
" Bob | \n",
" Moby-Dick | \n",
- " 9999.0 | \n",
+ " 2.0 | \n",
+ " False | \n",
" False | \n",
" False | \n",
- " True | \n",
"
\n",
" \n",
" | 5 | \n",
@@ -731,15 +731,15 @@
"\n",
" error_mask \n",
" rating typist book_title rating \n",
- "0 1.0 False False False \n",
+ "0 9999.0 False False True \n",
"1 3.0 False False False \n",
"2 3.0 False False False \n",
"3 4.0 False False False \n",
- "4 9999.0 False False True \n",
+ "4 2.0 False False False \n",
"5 1.0 False False False "
]
},
- "execution_count": 8,
+ "execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
@@ -770,7 +770,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 24,
"id": "4834399f-c5b6-4d3d-8bd8-2869ec4637a5",
"metadata": {},
"outputs": [
@@ -885,11 +885,11 @@
" 3 | \n",
" Clara | \n",
" ¡Nos vemos mañana! | \n",
- " 11/10 1 p.m. | \n",
+ " 1 p.m. | \n",
+ " False | \n",
" False | \n",
" False | \n",
" False | \n",
- " True | \n",
"
\n",
" \n",
" | 5 | \n",
@@ -900,11 +900,11 @@
" 4 | \n",
" David | \n",
" Ich hätte Hunger. | \n",
- " 1 p.m. | \n",
- " False | \n",
+ " 11/10 1 p.m. | \n",
" False | \n",
" False | \n",
" False | \n",
+ " True | \n",
"
\n",
" \n",
"\n",
@@ -926,11 +926,11 @@
"1 Привет, как дела? 3 p.m. False False False False \n",
"2 今日はどうですか 3 p.m. False False False False \n",
"3 Ça va bien, merci. 11/10 4 a.m. False False False True \n",
- "4 ¡Nos vemos mañana! 11/10 1 p.m. False False False True \n",
- "5 Ich hätte Hunger. 1 p.m. False False False False "
+ "4 ¡Nos vemos mañana! 1 p.m. False False False False \n",
+ "5 Ich hätte Hunger. 11/10 1 p.m. False False False True "
]
},
- "execution_count": 9,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@@ -956,7 +956,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 25,
"id": "948c980c-9b16-4bde-b63b-9e450a7e1ac5",
"metadata": {},
"outputs": [
@@ -1116,7 +1116,7 @@
"5 Ich hätte Hunger. 1 p.m. False False False False "
]
},
- "execution_count": 10,
+ "execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
@@ -1141,7 +1141,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 26,
"id": "b9606b2a-b26f-493d-bba1-f61803aa2a4f",
"metadata": {},
"outputs": [
@@ -1195,7 +1195,7 @@
" 12 a.m. | \n",
" 1 | \n",
" Alice | \n",
- " 驴C贸mo est谩s? | \n",
+ " Ź˘ÄCŹ«Ńmo estŹ«ˇs? | \n",
" 12 a.m. | \n",
" False | \n",
" False | \n",
@@ -1240,7 +1240,7 @@
" 4 a.m. | \n",
" 2 | \n",
" Bob | \n",
- " a va bien, merci. | \n",
+ " ŹŞ®a va bien, merci. | \n",
" 4 a.m. | \n",
" False | \n",
" False | \n",
@@ -1270,7 +1270,7 @@
" 1 p.m. | \n",
" 4 | \n",
" David | \n",
- " Ich h盲tte Hunger. | \n",
+ " Ich hŹ«Łtte Hunger. | \n",
" 1 p.m. | \n",
" False | \n",
" False | \n",
@@ -1291,17 +1291,17 @@
"4 3 Clara ¡Nos vemos mañana! 1 p.m. 3 Clara \n",
"5 4 David Ich hätte Hunger. 1 p.m. 4 David \n",
"\n",
- " error_mask \n",
- " content timestamp user_id user content timestamp \n",
- "0 驴C贸mo est谩s? 12 a.m. False False True False \n",
- "1 Привет, как дела? 3 p.m. False False False False \n",
- "2 今日はどうですか 3 p.m. False False False False \n",
- "3 a va bien, merci. 4 a.m. False False True False \n",
- "4 ¡Nos vemos mañana! 1 p.m. False False False False \n",
- "5 Ich h盲tte Hunger. 1 p.m. False False True False "
+ " error_mask \n",
+ " content timestamp user_id user content timestamp \n",
+ "0 Ź˘ÄCŹ«Ńmo estŹ«ˇs? 12 a.m. False False True False \n",
+ "1 Привет, как дела? 3 p.m. False False False False \n",
+ "2 今日はどうですか 3 p.m. False False False False \n",
+ "3 ŹŞ®a va bien, merci. 4 a.m. False False True False \n",
+ "4 ¡Nos vemos mañana! 1 p.m. False False False False \n",
+ "5 Ich hŹ«Łtte Hunger. 1 p.m. False False True False "
]
},
- "execution_count": 11,
+ "execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@@ -1326,7 +1326,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 27,
"id": "2c3b59e7-f651-424b-94b0-dcd485e0c15d",
"metadata": {},
"outputs": [
@@ -1376,10 +1376,10 @@
" 1.0 | \n",
" Alice | \n",
" To Kill a Mockingbird | \n",
- " -2.576077 | \n",
+ " 1.000000 | \n",
+ " False | \n",
" False | \n",
" False | \n",
- " True | \n",
" \n",
" \n",
" | 1 | \n",
@@ -1388,10 +1388,10 @@
" 3.0 | \n",
" Alice | \n",
" 1984 | \n",
- " 3.000000 | \n",
- " False | \n",
+ " 7.827799 | \n",
" False | \n",
" False | \n",
+ " True | \n",
"
\n",
" \n",
" | 2 | \n",
@@ -1412,7 +1412,7 @@
" 4.0 | \n",
" Bob | \n",
" The Great Gatsby | \n",
- " 7.164860 | \n",
+ " 7.830657 | \n",
" False | \n",
" False | \n",
" True | \n",
@@ -1424,7 +1424,7 @@
" 2.0 | \n",
" Bob | \n",
" Moby-Dick | \n",
- " -1.803358 | \n",
+ " -2.476614 | \n",
" False | \n",
" False | \n",
" True | \n",
@@ -1457,15 +1457,15 @@
"\n",
" error_mask \n",
" rating typist book_title rating \n",
- "0 -2.576077 False False True \n",
- "1 3.000000 False False False \n",
+ "0 1.000000 False False False \n",
+ "1 7.827799 False False True \n",
"2 3.000000 False False False \n",
- "3 7.164860 False False True \n",
- "4 -1.803358 False False True \n",
+ "3 7.830657 False False True \n",
+ "4 -2.476614 False False True \n",
"5 1.000000 False False False "
]
},
- "execution_count": 12,
+ "execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
@@ -1490,7 +1490,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 28,
"id": "affc303b-d72f-41fc-bc4c-ddac9303a289",
"metadata": {},
"outputs": [
@@ -1530,14 +1530,14 @@
"
\n",
" | 0 | \n",
" service-A-2024-02-01 | \n",
- " service-A-2024-02-01 | \n",
- " False | \n",
+ " A-2024-02-service-01 | \n",
+ " True | \n",
"
\n",
" \n",
" | 1 | \n",
" service-A-2024-02-02 | \n",
- " service-A-2024-02-02 | \n",
- " False | \n",
+ " A-02-service-2024-02 | \n",
+ " True | \n",
"
\n",
" \n",
" | 2 | \n",
@@ -1554,32 +1554,32 @@
"
\n",
" | 4 | \n",
" service-B-2024-02-02 | \n",
- " 2024-service-02-02-B | \n",
+ " service-2024-02-B-02 | \n",
" True | \n",
"
\n",
" \n",
" | 5 | \n",
" service-B-2024-02-03 | \n",
- " 02-2024-B-service-03 | \n",
+ " service-B-2024-03-02 | \n",
" True | \n",
"
\n",
" \n",
" | 6 | \n",
" service-C-2024-02-01 | \n",
- " 02-2024-C-01-service | \n",
+ " 2024-02-service-C-01 | \n",
" True | \n",
"
\n",
" \n",
" | 7 | \n",
" service-C-2024-02-02 | \n",
- " 2024-service-02-02-C | \n",
- " True | \n",
+ " service-C-2024-02-02 | \n",
+ " False | \n",
"
\n",
" \n",
" | 8 | \n",
" service-C-2024-02-03 | \n",
- " 2024-C-03-service-02 | \n",
- " True | \n",
+ " service-C-2024-02-03 | \n",
+ " False | \n",
"
\n",
" \n",
"\n",
@@ -1588,18 +1588,18 @@
"text/plain": [
" original perturbed error_mask\n",
" service service service\n",
- "0 service-A-2024-02-01 service-A-2024-02-01 False\n",
- "1 service-A-2024-02-02 service-A-2024-02-02 False\n",
+ "0 service-A-2024-02-01 A-2024-02-service-01 True\n",
+ "1 service-A-2024-02-02 A-02-service-2024-02 True\n",
"2 service-A-2024-02-03 service-A-2024-02-03 False\n",
"3 service-A-2024-02-01 service-A-2024-02-01 False\n",
- "4 service-B-2024-02-02 2024-service-02-02-B True\n",
- "5 service-B-2024-02-03 02-2024-B-service-03 True\n",
- "6 service-C-2024-02-01 02-2024-C-01-service True\n",
- "7 service-C-2024-02-02 2024-service-02-02-C True\n",
- "8 service-C-2024-02-03 2024-C-03-service-02 True"
+ "4 service-B-2024-02-02 service-2024-02-B-02 True\n",
+ "5 service-B-2024-02-03 service-B-2024-03-02 True\n",
+ "6 service-C-2024-02-01 2024-02-service-C-01 True\n",
+ "7 service-C-2024-02-02 service-C-2024-02-02 False\n",
+ "8 service-C-2024-02-03 service-C-2024-02-03 False"
]
},
- "execution_count": 13,
+ "execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
@@ -1622,7 +1622,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 29,
"id": "eae650d0-d6a9-4599-8c0f-8f4216ea7b63",
"metadata": {},
"outputs": [
@@ -1662,32 +1662,32 @@
" \n",
" | 0 | \n",
" service-A-2024-02-01 | \n",
- " A-01-2024-service-02 | \n",
+ " 02-01-2024-service-A | \n",
" True | \n",
"
\n",
" \n",
" | 1 | \n",
" service-A-2024-02-02 | \n",
- " service-A-2024-02-02 | \n",
- " False | \n",
+ " 02-02-2024-service-A | \n",
+ " True | \n",
"
\n",
" \n",
" | 2 | \n",
" service-A-2024-02-03 | \n",
- " A-03-2024-service-02 | \n",
+ " 02-03-2024-service-A | \n",
" True | \n",
"
\n",
" \n",
" | 3 | \n",
" service-A-2024-02-01 | \n",
- " service-A-2024-02-01 | \n",
- " False | \n",
+ " 02-01-2024-service-A | \n",
+ " True | \n",
"
\n",
" \n",
" | 4 | \n",
" service-B-2024-02-02 | \n",
- " B-02-2024-service-02 | \n",
- " True | \n",
+ " service-B-2024-02-02 | \n",
+ " False | \n",
"
\n",
" \n",
" | 5 | \n",
@@ -1698,8 +1698,8 @@
"
\n",
" | 6 | \n",
" service-C-2024-02-01 | \n",
- " C-01-2024-service-02 | \n",
- " True | \n",
+ " service-C-2024-02-01 | \n",
+ " False | \n",
"
\n",
" \n",
" | 7 | \n",
@@ -1710,7 +1710,7 @@
"
\n",
" | 8 | \n",
" service-C-2024-02-03 | \n",
- " C-03-2024-service-02 | \n",
+ " 02-03-2024-service-C | \n",
" True | \n",
"
\n",
" \n",
@@ -1720,18 +1720,18 @@
"text/plain": [
" original perturbed error_mask\n",
" service service service\n",
- "0 service-A-2024-02-01 A-01-2024-service-02 True\n",
- "1 service-A-2024-02-02 service-A-2024-02-02 False\n",
- "2 service-A-2024-02-03 A-03-2024-service-02 True\n",
- "3 service-A-2024-02-01 service-A-2024-02-01 False\n",
- "4 service-B-2024-02-02 B-02-2024-service-02 True\n",
+ "0 service-A-2024-02-01 02-01-2024-service-A True\n",
+ "1 service-A-2024-02-02 02-02-2024-service-A True\n",
+ "2 service-A-2024-02-03 02-03-2024-service-A True\n",
+ "3 service-A-2024-02-01 02-01-2024-service-A True\n",
+ "4 service-B-2024-02-02 service-B-2024-02-02 False\n",
"5 service-B-2024-02-03 service-B-2024-02-03 False\n",
- "6 service-C-2024-02-01 C-01-2024-service-02 True\n",
+ "6 service-C-2024-02-01 service-C-2024-02-01 False\n",
"7 service-C-2024-02-02 service-C-2024-02-02 False\n",
- "8 service-C-2024-02-03 C-03-2024-service-02 True"
+ "8 service-C-2024-02-03 02-03-2024-service-C True"
]
},
- "execution_count": 14,
+ "execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
@@ -1756,7 +1756,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 30,
"id": "0d36eba7-9e7b-42c3-8f3e-8d42588c53e7",
"metadata": {},
"outputs": [
@@ -1796,20 +1796,20 @@
" \n",
" | 0 | \n",
" service-A-2024-02-01 | \n",
- " service-A-2024-02-01 | \n",
- " False | \n",
+ " service_A_2024_02_01 | \n",
+ " True | \n",
"
\n",
" \n",
" | 1 | \n",
" service-A-2024-02-02 | \n",
- " service_A_2024_02_02 | \n",
- " True | \n",
+ " service-A-2024-02-02 | \n",
+ " False | \n",
"
\n",
" \n",
" | 2 | \n",
" service-A-2024-02-03 | \n",
- " service-A-2024-02-03 | \n",
- " False | \n",
+ " service_A_2024_02_03 | \n",
+ " True | \n",
"
\n",
" \n",
" | 3 | \n",
@@ -1820,14 +1820,14 @@
"
\n",
" | 4 | \n",
" service-B-2024-02-02 | \n",
- " service-B-2024-02-02 | \n",
- " False | \n",
+ " service_B_2024_02_02 | \n",
+ " True | \n",
"
\n",
" \n",
" | 5 | \n",
" service-B-2024-02-03 | \n",
- " service_B_2024_02_03 | \n",
- " True | \n",
+ " service-B-2024-02-03 | \n",
+ " False | \n",
"
\n",
" \n",
" | 6 | \n",
@@ -1838,8 +1838,8 @@
"
\n",
" | 7 | \n",
" service-C-2024-02-02 | \n",
- " service_C_2024_02_02 | \n",
- " True | \n",
+ " service-C-2024-02-02 | \n",
+ " False | \n",
"
\n",
" \n",
" | 8 | \n",
@@ -1854,18 +1854,18 @@
"text/plain": [
" original perturbed error_mask\n",
" service service service\n",
- "0 service-A-2024-02-01 service-A-2024-02-01 False\n",
- "1 service-A-2024-02-02 service_A_2024_02_02 True\n",
- "2 service-A-2024-02-03 service-A-2024-02-03 False\n",
+ "0 service-A-2024-02-01 service_A_2024_02_01 True\n",
+ "1 service-A-2024-02-02 service-A-2024-02-02 False\n",
+ "2 service-A-2024-02-03 service_A_2024_02_03 True\n",
"3 service-A-2024-02-01 service_A_2024_02_01 True\n",
- "4 service-B-2024-02-02 service-B-2024-02-02 False\n",
- "5 service-B-2024-02-03 service_B_2024_02_03 True\n",
+ "4 service-B-2024-02-02 service_B_2024_02_02 True\n",
+ "5 service-B-2024-02-03 service-B-2024-02-03 False\n",
"6 service-C-2024-02-01 service_C_2024_02_01 True\n",
- "7 service-C-2024-02-02 service_C_2024_02_02 True\n",
+ "7 service-C-2024-02-02 service-C-2024-02-02 False\n",
"8 service-C-2024-02-03 service-C-2024-02-03 False"
]
},
- "execution_count": 15,
+ "execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
@@ -1890,7 +1890,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 31,
"id": "246551cb-946b-4732-92e9-6806b46f1d26",
"metadata": {},
"outputs": [
@@ -1939,7 +1939,7 @@
" To Kill a Mockingbird | \n",
" 1.0 | \n",
" Alice | \n",
- " Yo Kill a Mockingbird | \n",
+ " To Kill w Mockingbird | \n",
" 1.0 | \n",
" False | \n",
" True | \n",
@@ -1951,10 +1951,10 @@
" 1984 | \n",
" 3.0 | \n",
" Alice | \n",
- " 1984 | \n",
+ " 1i84 | \n",
" 3.0 | \n",
" False | \n",
- " False | \n",
+ " True | \n",
" False | \n",
"
\n",
" \n",
@@ -1987,10 +1987,10 @@
" | Moby-Dick | \n",
" 2.0 | \n",
" Bob | \n",
- " Moby-D9ck | \n",
+ " Moby-Dick | \n",
" 2.0 | \n",
" False | \n",
- " True | \n",
+ " False | \n",
" False | \n",
"
\n",
" \n",
@@ -1999,7 +1999,7 @@
" | The Catcher in the Rye | \n",
" 1.0 | \n",
" Bob | \n",
- " Ghe Catcher in the Rye | \n",
+ " The Catcher in tne Rye | \n",
" 1.0 | \n",
" False | \n",
" True | \n",
@@ -2012,24 +2012,24 @@
"text/plain": [
" original perturbed \\\n",
" typist book_title rating typist book_title \n",
- "0 Alice To Kill a Mockingbird 1.0 Alice Yo Kill a Mockingbird \n",
- "1 Alice 1984 3.0 Alice 1984 \n",
+ "0 Alice To Kill a Mockingbird 1.0 Alice To Kill w Mockingbird \n",
+ "1 Alice 1984 3.0 Alice 1i84 \n",
"2 Alice Pride and Prejudice 3.0 Alice Pride and Prejudice \n",
"3 Bob The Great Gatsby 4.0 Bob The Great Gatsby \n",
- "4 Bob Moby-Dick 2.0 Bob Moby-D9ck \n",
- "5 Bob The Catcher in the Rye 1.0 Bob Ghe Catcher in the Rye \n",
+ "4 Bob Moby-Dick 2.0 Bob Moby-Dick \n",
+ "5 Bob The Catcher in the Rye 1.0 Bob The Catcher in tne Rye \n",
"\n",
" error_mask \n",
" rating typist book_title rating \n",
"0 1.0 False True False \n",
- "1 3.0 False False False \n",
+ "1 3.0 False True False \n",
"2 3.0 False False False \n",
"3 4.0 False False False \n",
- "4 2.0 False True False \n",
+ "4 2.0 False False False \n",
"5 1.0 False True False "
]
},
- "execution_count": 16,
+ "execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
@@ -2054,7 +2054,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 32,
"id": "fbb7c9fd-6d14-4bff-a6c9-780117e6218b",
"metadata": {},
"outputs": [
@@ -2104,10 +2104,10 @@
" 1.0 | \n",
" Alice | \n",
" To Kill a Mockingbird | \n",
- " 1.0 | \n",
- " False | \n",
+ " 10.0 | \n",
" False | \n",
" False | \n",
+ " True | \n",
"
\n",
" \n",
" | 1 | \n",
@@ -2152,10 +2152,10 @@
" 2.0 | \n",
" Bob | \n",
" Moby-Dick | \n",
- " 20.0 | \n",
+ " 2.0 | \n",
+ " False | \n",
" False | \n",
" False | \n",
- " True | \n",
"
\n",
" \n",
" | 5 | \n",
@@ -2185,15 +2185,15 @@
"\n",
" error_mask \n",
" rating typist book_title rating \n",
- "0 1.0 False False False \n",
+ "0 10.0 False False True \n",
"1 3.0 False False False \n",
"2 30.0 False False True \n",
"3 4.0 False False False \n",
- "4 20.0 False False True \n",
+ "4 2.0 False False False \n",
"5 10.0 False False True "
]
},
- "execution_count": 17,
+ "execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
diff --git a/tab_err/error_type/_config.py b/tab_err/error_type/_config.py
index 0cc7810..3e45df9 100644
--- a/tab_err/error_type/_config.py
+++ b/tab_err/error_type/_config.py
@@ -47,12 +47,13 @@ class ErrorTypeConfig:
add_delta_value (Any | None): Value that is added to the value by the AddDelta Error Type. Defaults to None.
- outlier_coin_flip_threshold (float): Coin flip determines the direction (positive, negative) of the outlier. Defaults to 0.5.
+ outlier_coin_flip_threshold (float): Probability of a negative outlier. Defaults to 0.5.
- outlier_coefficient (float): Coefficient that determines the magnitude of the outliers for the Outlier Error Type. Defaults to 1.0.
+ outlier_coefficient (float): Coefficient that determines how many times the IQR is added to or subtracted from the median for the Outlier Error Type.
+ Defaults to 3.0.
- outlier_noise_coeff (float): Coefficient that influences the standard deviation of the noise added to the outliers for the Outlier Error Type.
- Defaults to 0.1.
+ outlier_noise_coeff (float): Coefficient that influences the standard deviation of the Gaussian noise added to/subtracted from the outliers for the
+ Outlier Error Type. Defaults to 0.1.
"""
encoding_sender: str | None = None
@@ -82,7 +83,7 @@ class ErrorTypeConfig:
add_delta_value: float | int | None = None
outlier_coin_flip_threshold: float = 0.5
- outlier_coefficient: float = 1.0
+ outlier_coefficient: float = 3.0
outlier_noise_coeff: float = 0.1
def to_dict(self: ErrorTypeConfig) -> dict[str, Any]:
diff --git a/tab_err/error_type/_outlier.py b/tab_err/error_type/_outlier.py
index 4c2303b..ed4b904 100644
--- a/tab_err/error_type/_outlier.py
+++ b/tab_err/error_type/_outlier.py
@@ -2,7 +2,7 @@
import numpy as np
import pandas as pd
-from pandas.api.types import is_datetime64_dtype, is_integer_dtype, is_numeric_dtype
+from pandas.api.types import is_datetime64_dtype, is_numeric_dtype
from tab_err._utils import get_column
@@ -10,20 +10,9 @@
class Outlier(ErrorType):
- """Inserts outliers into a column by pushing data points outside the interquartile range (IQR) boundaries.
-
- - Data points below the mean are pushed towards lower outliers, while those above the mean are pushed towards upper outliers.
- - The `outlier_coefficient` controls how far values are pushed relative to the IQR. An `outlier_coefficient` of 1.0 means the
- push is equal to half of the IQR, shifting the mean value exactly to the edge of the IQR. Values that deviate more from the
- mean will be pushed beyond the IQR boundary. When `outlier_coefficient` is less than 1.0, values—including the mean—are pushed
- less drastically, potentially keeping them within the IQR.
- - The push is calculated as:
- push = outlier_coefficient * |upper_boundary - mean_value|
- - Values above the mean are pushed towards the upper boundary, and values below the mean are pushed towards the lower boundary.
- If a value equals the mean, a coin flip decides whether it is pushed towards the upper or lower boundary.
- - After this process, Gaussian noise is added to simulate measurement errors and make the outliers appear more realistic. The
- amount of noise can be controlled via the `outlier_noise_coeff` parameter and is scaled with the IQR to ensure it is proportional
- to the data's spread.
+ """Inserts outliers into a column by setting selected values to the column median +/- (outlier_coefficient * IQR + Gaussian noise).
+
+ A coin toss per selected row picks the side of the median; with more than one selected row, at least one outlier lands on each side.
"""
@staticmethod
@@ -58,44 +47,43 @@ def _apply(self: Outlier, data: pd.DataFrame, error_mask: pd.DataFrame, column:
series = series.astype("int64")
was_datetime = True
- mean_value = series.mean()
- q1 = series.quantile(0.25)
- q3 = series.quantile(0.75)
- iqr = q3 - q1
-
- upper_boundary = q3 + 1.5 * iqr
- lower_boundary = q1 - 1.5 * iqr
-
- # Pre-compute the perturbations
- perturbation_upper = self.config.outlier_coefficient * (upper_boundary - mean_value)
- perturbation_lower = self.config.outlier_coefficient * (mean_value - lower_boundary)
-
- if is_integer_dtype(series): # round float to int when series is int
- perturbation_upper = np.ceil(perturbation_upper)
- perturbation_lower = np.floor(perturbation_lower)
-
- # Get masks for the different outlier types depending on the mean
- mask_lower = (series < mean_value) & series_mask
- mask_upper = (series > mean_value) & series_mask
- mask_equal = (series == mean_value) & series_mask
-
- # Apply the constant perturbation to the respective mask
- series.loc[mask_lower] -= perturbation_lower
- series.loc[mask_upper] += perturbation_upper
-
- # Handle the mean values with a coin flip
- coin_flips = self._random_generator.random(mask_equal.sum())
- series.loc[mask_equal] += np.where(coin_flips > self.config.outlier_coin_flip_threshold, perturbation_upper, -perturbation_lower)
-
- # Apply Gaussian noise to simulate the increase in measurement error of the outliers
- noise_std = self.config.outlier_noise_coeff * iqr
-
- if is_integer_dtype(series): # round float to int when series is int
- series.loc[series_mask] += np.rint(self._random_generator.normal(loc=0, scale=noise_std, size=series_mask.sum()))
- else:
- series.loc[series_mask] += self._random_generator.normal(loc=0, scale=noise_std, size=series_mask.sum())
+ # Set up the necessary values
+ median_value = series.median()
+ iqr = series.quantile(0.75) - series.quantile(0.25)
+ if iqr == 0: # To not impute the median +/- noise
+ iqr = 1e-9
+
+ # Decide which outliers are above/below the median - at least one is above/below
+ coin_tosses = self._random_generator.random(series_mask.sum()) < self.config.outlier_coin_flip_threshold
+ if series_mask.sum() > 1:
+ if not coin_tosses.any():
+ coin_tosses[self._random_generator.integers(0, len(coin_tosses))] = True
+ elif coin_tosses.all():
+ coin_tosses[self._random_generator.integers(0, len(coin_tosses))] = False
+
+ neg_outliers = series_mask.copy()
+ neg_outliers[series_mask] = coin_tosses
+ pos_outliers = series_mask & ~neg_outliers
+
+ neg_noise = (
+ self._random_generator.normal(loc=0, scale=self.config.outlier_noise_coeff * iqr, size=neg_outliers.sum())
+ if neg_outliers.sum() > 0
+ else np.array([])
+ )
+ pos_noise = (
+ self._random_generator.normal(loc=0, scale=self.config.outlier_noise_coeff * iqr, size=pos_outliers.sum())
+ if pos_outliers.sum() > 0
+ else np.array([])
+ )
+
+ # Apply outliers
+ if neg_noise.size > 0:
+ series[neg_outliers] = median_value - (self.config.outlier_coefficient * iqr) - neg_noise
+ if pos_noise.size > 0:
+ series[pos_outliers] = median_value + (self.config.outlier_coefficient * iqr) + pos_noise
if was_datetime: # Handle datetime objects
+ series = series.clip(lower=pd.Timestamp.min.value, upper=pd.Timestamp.max.value)
series = pd.to_datetime(series)
return series