diff --git a/examples/Error_Types.ipynb b/examples/Error_Types.ipynb index 0a7f83b..5aabf75 100644 --- a/examples/Error_Types.ipynb +++ b/examples/Error_Types.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 17, "id": "e031b356-c92e-4e6c-9422-ea968c81aa64", "metadata": {}, "outputs": [], @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 18, "id": "eaf1bf17-08f2-4627-a6bb-318b60e3528d", "metadata": {}, "outputs": [], @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 19, "id": "18931616-22c3-4f63-8e7d-c0710d924716", "metadata": {}, "outputs": [], @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 20, "id": "351080b6-841f-4e5e-809f-0a237eaa4fe5", "metadata": {}, "outputs": [ @@ -172,10 +172,10 @@ " 3.0\n", " Alice\n", " 1984\n", - " 3.0\n", - " False\n", + " 3.1\n", " False\n", " False\n", + " True\n", " \n", " \n", " 2\n", @@ -196,10 +196,10 @@ " 4.0\n", " Bob\n", " The Great Gatsby\n", - " 4.1\n", + " 4.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 4\n", @@ -242,14 +242,14 @@ " error_mask \n", " rating typist book_title rating \n", "0 1.0 False False False \n", - "1 3.0 False False False \n", + "1 3.1 False False True \n", "2 3.0 False False False \n", - "3 4.1 False False True \n", + "3 4.0 False False False \n", "4 2.0 False False False \n", "5 1.0 False False False " ] }, - "execution_count": 5, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -272,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 21, "id": "ec59d663-3e0a-45a8-a662-9ca8674f192c", "metadata": {}, "outputs": [ @@ -346,10 +346,10 @@ " 3.0\n", " Alice\n", " Pride and Prejudice\n", - " 3.0\n", - " False\n", + " 3.1\n", " False\n", " False\n", + " True\n", " \n", " \n", " 3\n", @@ -370,10 +370,10 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " 2.1\n", + " 2.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 5\n", @@ -405,13 +405,13 @@ " rating typist book_title rating \n", "0 1.0 False False False \n", "1 3.0 False False False \n", - "2 3.0 False False False \n", + "2 3.1 False False True \n", "3 4.0 False False False \n", - "4 2.1 False False True \n", + "4 2.0 False False False \n", "5 1.0 False False False " ] }, - "execution_count": 6, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -438,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 22, "id": "bcfc46a3-16dc-4427-b580-6d1dd09b9d87", "metadata": {}, "outputs": [ @@ -577,7 +577,7 @@ "5 1.0 False False False " ] }, - "execution_count": 7, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -600,7 +600,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 23, "id": "a07d9c15-c43c-48e0-beb4-b074855e557c", "metadata": {}, "outputs": [ @@ -650,10 +650,10 @@ " 1.0\n", " Alice\n", " To Kill a Mockingbird\n", - " 1.0\n", - " False\n", + " 9999.0\n", " False\n", " False\n", + " True\n", " \n", " \n", " 1\n", @@ -698,10 +698,10 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " 9999.0\n", + " 2.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 5\n", @@ -731,15 +731,15 @@ "\n", " error_mask \n", " rating typist book_title rating \n", - "0 1.0 False False False \n", + "0 9999.0 False False True \n", "1 3.0 False False False \n", "2 3.0 False False False \n", "3 4.0 False False False \n", - "4 9999.0 False False True \n", + "4 2.0 False False False \n", "5 1.0 False False False " ] }, - "execution_count": 8, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -770,7 +770,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 24, "id": "4834399f-c5b6-4d3d-8bd8-2869ec4637a5", "metadata": {}, "outputs": [ @@ -885,11 +885,11 @@ " 3\n", " Clara\n", " ¡Nos vemos mañana!\n", - " 11/10 1 p.m.\n", + " 1 p.m.\n", + " False\n", " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 5\n", @@ -900,11 +900,11 @@ " 4\n", " David\n", " Ich hätte Hunger.\n", - " 1 p.m.\n", - " False\n", + " 11/10 1 p.m.\n", " False\n", " False\n", " False\n", + " True\n", " \n", " \n", "\n", @@ -926,11 +926,11 @@ "1 Привет, как дела? 3 p.m. False False False False \n", "2 今日はどうですか 3 p.m. False False False False \n", "3 Ça va bien, merci. 11/10 4 a.m. False False False True \n", - "4 ¡Nos vemos mañana! 11/10 1 p.m. False False False True \n", - "5 Ich hätte Hunger. 1 p.m. False False False False " + "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", + "5 Ich hätte Hunger. 11/10 1 p.m. False False False True " ] }, - "execution_count": 9, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -956,7 +956,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 25, "id": "948c980c-9b16-4bde-b63b-9e450a7e1ac5", "metadata": {}, "outputs": [ @@ -1116,7 +1116,7 @@ "5 Ich hätte Hunger. 1 p.m. False False False False " ] }, - "execution_count": 10, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1141,7 +1141,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 26, "id": "b9606b2a-b26f-493d-bba1-f61803aa2a4f", "metadata": {}, "outputs": [ @@ -1195,7 +1195,7 @@ " 12 a.m.\n", " 1\n", " Alice\n", - " 驴C贸mo est谩s?\n", + " Ź˘ÄCŹ«Ńmo estŹ«ˇs?\n", " 12 a.m.\n", " False\n", " False\n", @@ -1240,7 +1240,7 @@ " 4 a.m.\n", " 2\n", " Bob\n", - " a va bien, merci.\n", + " ŹŞ®a va bien, merci.\n", " 4 a.m.\n", " False\n", " False\n", @@ -1270,7 +1270,7 @@ " 1 p.m.\n", " 4\n", " David\n", - " Ich h盲tte Hunger.\n", + " Ich hŹ«Łtte Hunger.\n", " 1 p.m.\n", " False\n", " False\n", @@ -1291,17 +1291,17 @@ "4 3 Clara ¡Nos vemos mañana! 1 p.m. 3 Clara \n", "5 4 David Ich hätte Hunger. 1 p.m. 4 David \n", "\n", - " error_mask \n", - " content timestamp user_id user content timestamp \n", - "0 驴C贸mo est谩s? 12 a.m. False False True False \n", - "1 Привет, как дела? 3 p.m. False False False False \n", - "2 今日はどうですか 3 p.m. False False False False \n", - "3 a va bien, merci. 4 a.m. False False True False \n", - "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", - "5 Ich h盲tte Hunger. 1 p.m. False False True False " + " error_mask \n", + " content timestamp user_id user content timestamp \n", + "0 Ź˘ÄCŹ«Ńmo estŹ«ˇs? 12 a.m. False False True False \n", + "1 Привет, как дела? 3 p.m. False False False False \n", + "2 今日はどうですか 3 p.m. False False False False \n", + "3 ŹŞ®a va bien, merci. 4 a.m. False False True False \n", + "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", + "5 Ich hŹ«Łtte Hunger. 1 p.m. False False True False " ] }, - "execution_count": 11, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1326,7 +1326,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 27, "id": "2c3b59e7-f651-424b-94b0-dcd485e0c15d", "metadata": {}, "outputs": [ @@ -1376,10 +1376,10 @@ " 1.0\n", " Alice\n", " To Kill a Mockingbird\n", - " -2.576077\n", + " 1.000000\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 1\n", @@ -1388,10 +1388,10 @@ " 3.0\n", " Alice\n", " 1984\n", - " 3.000000\n", - " False\n", + " 7.827799\n", " False\n", " False\n", + " True\n", " \n", " \n", " 2\n", @@ -1412,7 +1412,7 @@ " 4.0\n", " Bob\n", " The Great Gatsby\n", - " 7.164860\n", + " 7.830657\n", " False\n", " False\n", " True\n", @@ -1424,7 +1424,7 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " -1.803358\n", + " -2.476614\n", " False\n", " False\n", " True\n", @@ -1457,15 +1457,15 @@ "\n", " error_mask \n", " rating typist book_title rating \n", - "0 -2.576077 False False True \n", - "1 3.000000 False False False \n", + "0 1.000000 False False False \n", + "1 7.827799 False False True \n", "2 3.000000 False False False \n", - "3 7.164860 False False True \n", - "4 -1.803358 False False True \n", + "3 7.830657 False False True \n", + "4 -2.476614 False False True \n", "5 1.000000 False False False " ] }, - "execution_count": 12, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1490,7 +1490,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 28, "id": "affc303b-d72f-41fc-bc4c-ddac9303a289", "metadata": {}, "outputs": [ @@ -1530,14 +1530,14 @@ " \n", " 0\n", " service-A-2024-02-01\n", - " service-A-2024-02-01\n", - " False\n", + " A-2024-02-service-01\n", + " True\n", " \n", " \n", " 1\n", " service-A-2024-02-02\n", - " service-A-2024-02-02\n", - " False\n", + " A-02-service-2024-02\n", + " True\n", " \n", " \n", " 2\n", @@ -1554,32 +1554,32 @@ " \n", " 4\n", " service-B-2024-02-02\n", - " 2024-service-02-02-B\n", + " service-2024-02-B-02\n", " True\n", " \n", " \n", " 5\n", " service-B-2024-02-03\n", - " 02-2024-B-service-03\n", + " service-B-2024-03-02\n", " True\n", " \n", " \n", " 6\n", " service-C-2024-02-01\n", - " 02-2024-C-01-service\n", + " 2024-02-service-C-01\n", " True\n", " \n", " \n", " 7\n", " service-C-2024-02-02\n", - " 2024-service-02-02-C\n", - " True\n", + " service-C-2024-02-02\n", + " False\n", " \n", " \n", " 8\n", " service-C-2024-02-03\n", - " 2024-C-03-service-02\n", - " True\n", + " service-C-2024-02-03\n", + " False\n", " \n", " \n", "\n", @@ -1588,18 +1588,18 @@ "text/plain": [ " original perturbed error_mask\n", " service service service\n", - "0 service-A-2024-02-01 service-A-2024-02-01 False\n", - "1 service-A-2024-02-02 service-A-2024-02-02 False\n", + "0 service-A-2024-02-01 A-2024-02-service-01 True\n", + "1 service-A-2024-02-02 A-02-service-2024-02 True\n", "2 service-A-2024-02-03 service-A-2024-02-03 False\n", "3 service-A-2024-02-01 service-A-2024-02-01 False\n", - "4 service-B-2024-02-02 2024-service-02-02-B True\n", - "5 service-B-2024-02-03 02-2024-B-service-03 True\n", - "6 service-C-2024-02-01 02-2024-C-01-service True\n", - "7 service-C-2024-02-02 2024-service-02-02-C True\n", - "8 service-C-2024-02-03 2024-C-03-service-02 True" + "4 service-B-2024-02-02 service-2024-02-B-02 True\n", + "5 service-B-2024-02-03 service-B-2024-03-02 True\n", + "6 service-C-2024-02-01 2024-02-service-C-01 True\n", + "7 service-C-2024-02-02 service-C-2024-02-02 False\n", + "8 service-C-2024-02-03 service-C-2024-02-03 False" ] }, - "execution_count": 13, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1622,7 +1622,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 29, "id": "eae650d0-d6a9-4599-8c0f-8f4216ea7b63", "metadata": {}, "outputs": [ @@ -1662,32 +1662,32 @@ " \n", " 0\n", " service-A-2024-02-01\n", - " A-01-2024-service-02\n", + " 02-01-2024-service-A\n", " True\n", " \n", " \n", " 1\n", " service-A-2024-02-02\n", - " service-A-2024-02-02\n", - " False\n", + " 02-02-2024-service-A\n", + " True\n", " \n", " \n", " 2\n", " service-A-2024-02-03\n", - " A-03-2024-service-02\n", + " 02-03-2024-service-A\n", " True\n", " \n", " \n", " 3\n", " service-A-2024-02-01\n", - " service-A-2024-02-01\n", - " False\n", + " 02-01-2024-service-A\n", + " True\n", " \n", " \n", " 4\n", " service-B-2024-02-02\n", - " B-02-2024-service-02\n", - " True\n", + " service-B-2024-02-02\n", + " False\n", " \n", " \n", " 5\n", @@ -1698,8 +1698,8 @@ " \n", " 6\n", " service-C-2024-02-01\n", - " C-01-2024-service-02\n", - " True\n", + " service-C-2024-02-01\n", + " False\n", " \n", " \n", " 7\n", @@ -1710,7 +1710,7 @@ " \n", " 8\n", " service-C-2024-02-03\n", - " C-03-2024-service-02\n", + " 02-03-2024-service-C\n", " True\n", " \n", " \n", @@ -1720,18 +1720,18 @@ "text/plain": [ " original perturbed error_mask\n", " service service service\n", - "0 service-A-2024-02-01 A-01-2024-service-02 True\n", - "1 service-A-2024-02-02 service-A-2024-02-02 False\n", - "2 service-A-2024-02-03 A-03-2024-service-02 True\n", - "3 service-A-2024-02-01 service-A-2024-02-01 False\n", - "4 service-B-2024-02-02 B-02-2024-service-02 True\n", + "0 service-A-2024-02-01 02-01-2024-service-A True\n", + "1 service-A-2024-02-02 02-02-2024-service-A True\n", + "2 service-A-2024-02-03 02-03-2024-service-A True\n", + "3 service-A-2024-02-01 02-01-2024-service-A True\n", + "4 service-B-2024-02-02 service-B-2024-02-02 False\n", "5 service-B-2024-02-03 service-B-2024-02-03 False\n", - "6 service-C-2024-02-01 C-01-2024-service-02 True\n", + "6 service-C-2024-02-01 service-C-2024-02-01 False\n", "7 service-C-2024-02-02 service-C-2024-02-02 False\n", - "8 service-C-2024-02-03 C-03-2024-service-02 True" + "8 service-C-2024-02-03 02-03-2024-service-C True" ] }, - "execution_count": 14, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1756,7 +1756,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 30, "id": "0d36eba7-9e7b-42c3-8f3e-8d42588c53e7", "metadata": {}, "outputs": [ @@ -1796,20 +1796,20 @@ " \n", " 0\n", " service-A-2024-02-01\n", - " service-A-2024-02-01\n", - " False\n", + " service_A_2024_02_01\n", + " True\n", " \n", " \n", " 1\n", " service-A-2024-02-02\n", - " service_A_2024_02_02\n", - " True\n", + " service-A-2024-02-02\n", + " False\n", " \n", " \n", " 2\n", " service-A-2024-02-03\n", - " service-A-2024-02-03\n", - " False\n", + " service_A_2024_02_03\n", + " True\n", " \n", " \n", " 3\n", @@ -1820,14 +1820,14 @@ " \n", " 4\n", " service-B-2024-02-02\n", - " service-B-2024-02-02\n", - " False\n", + " service_B_2024_02_02\n", + " True\n", " \n", " \n", " 5\n", " service-B-2024-02-03\n", - " service_B_2024_02_03\n", - " True\n", + " service-B-2024-02-03\n", + " False\n", " \n", " \n", " 6\n", @@ -1838,8 +1838,8 @@ " \n", " 7\n", " service-C-2024-02-02\n", - " service_C_2024_02_02\n", - " True\n", + " service-C-2024-02-02\n", + " False\n", " \n", " \n", " 8\n", @@ -1854,18 +1854,18 @@ "text/plain": [ " original perturbed error_mask\n", " service service service\n", - "0 service-A-2024-02-01 service-A-2024-02-01 False\n", - "1 service-A-2024-02-02 service_A_2024_02_02 True\n", - "2 service-A-2024-02-03 service-A-2024-02-03 False\n", + "0 service-A-2024-02-01 service_A_2024_02_01 True\n", + "1 service-A-2024-02-02 service-A-2024-02-02 False\n", + "2 service-A-2024-02-03 service_A_2024_02_03 True\n", "3 service-A-2024-02-01 service_A_2024_02_01 True\n", - "4 service-B-2024-02-02 service-B-2024-02-02 False\n", - "5 service-B-2024-02-03 service_B_2024_02_03 True\n", + "4 service-B-2024-02-02 service_B_2024_02_02 True\n", + "5 service-B-2024-02-03 service-B-2024-02-03 False\n", "6 service-C-2024-02-01 service_C_2024_02_01 True\n", - "7 service-C-2024-02-02 service_C_2024_02_02 True\n", + "7 service-C-2024-02-02 service-C-2024-02-02 False\n", "8 service-C-2024-02-03 service-C-2024-02-03 False" ] }, - "execution_count": 15, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1890,7 +1890,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 31, "id": "246551cb-946b-4732-92e9-6806b46f1d26", "metadata": {}, "outputs": [ @@ -1939,7 +1939,7 @@ " To Kill a Mockingbird\n", " 1.0\n", " Alice\n", - " Yo Kill a Mockingbird\n", + " To Kill w Mockingbird\n", " 1.0\n", " False\n", " True\n", @@ -1951,10 +1951,10 @@ " 1984\n", " 3.0\n", " Alice\n", - " 1984\n", + " 1i84\n", " 3.0\n", " False\n", - " False\n", + " True\n", " False\n", " \n", " \n", @@ -1987,10 +1987,10 @@ " Moby-Dick\n", " 2.0\n", " Bob\n", - " Moby-D9ck\n", + " Moby-Dick\n", " 2.0\n", " False\n", - " True\n", + " False\n", " False\n", " \n", " \n", @@ -1999,7 +1999,7 @@ " The Catcher in the Rye\n", " 1.0\n", " Bob\n", - " Ghe Catcher in the Rye\n", + " The Catcher in tne Rye\n", " 1.0\n", " False\n", " True\n", @@ -2012,24 +2012,24 @@ "text/plain": [ " original perturbed \\\n", " typist book_title rating typist book_title \n", - "0 Alice To Kill a Mockingbird 1.0 Alice Yo Kill a Mockingbird \n", - "1 Alice 1984 3.0 Alice 1984 \n", + "0 Alice To Kill a Mockingbird 1.0 Alice To Kill w Mockingbird \n", + "1 Alice 1984 3.0 Alice 1i84 \n", "2 Alice Pride and Prejudice 3.0 Alice Pride and Prejudice \n", "3 Bob The Great Gatsby 4.0 Bob The Great Gatsby \n", - "4 Bob Moby-Dick 2.0 Bob Moby-D9ck \n", - "5 Bob The Catcher in the Rye 1.0 Bob Ghe Catcher in the Rye \n", + "4 Bob Moby-Dick 2.0 Bob Moby-Dick \n", + "5 Bob The Catcher in the Rye 1.0 Bob The Catcher in tne Rye \n", "\n", " error_mask \n", " rating typist book_title rating \n", "0 1.0 False True False \n", - "1 3.0 False False False \n", + "1 3.0 False True False \n", "2 3.0 False False False \n", "3 4.0 False False False \n", - "4 2.0 False True False \n", + "4 2.0 False False False \n", "5 1.0 False True False " ] }, - "execution_count": 16, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -2054,7 +2054,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 32, "id": "fbb7c9fd-6d14-4bff-a6c9-780117e6218b", "metadata": {}, "outputs": [ @@ -2104,10 +2104,10 @@ " 1.0\n", " Alice\n", " To Kill a Mockingbird\n", - " 1.0\n", - " False\n", + " 10.0\n", " False\n", " False\n", + " True\n", " \n", " \n", " 1\n", @@ -2152,10 +2152,10 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " 20.0\n", + " 2.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 5\n", @@ -2185,15 +2185,15 @@ "\n", " error_mask \n", " rating typist book_title rating \n", - "0 1.0 False False False \n", + "0 10.0 False False True \n", "1 3.0 False False False \n", "2 30.0 False False True \n", "3 4.0 False False False \n", - "4 20.0 False False True \n", + "4 2.0 False False False \n", "5 10.0 False False True " ] }, - "execution_count": 17, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } diff --git a/tab_err/error_type/_config.py b/tab_err/error_type/_config.py index 0cc7810..3e45df9 100644 --- a/tab_err/error_type/_config.py +++ b/tab_err/error_type/_config.py @@ -47,12 +47,13 @@ class ErrorTypeConfig: add_delta_value (Any | None): Value that is added to the value by the AddDelta Error Type. Defaults to None. - outlier_coin_flip_threshold (float): Coin flip determines the direction (positive, negative) of the outlier. Defaults to 0.5. + outlier_coin_flip_threshold (float): Probability of a negative outlier. Defaults to 0.5. - outlier_coefficient (float): Coefficient that determines the magnitude of the outliers for the Outlier Error Type. Defaults to 1.0. + outlier_coefficient (float): Coefficient that determines how many times the iqr should be added/subtracted from the median for the Outlier Error Type. + Defaults to 3.0. - outlier_noise_coeff (float): Coefficient that influences the standard deviation of the noise added to the outliers for the Outlier Error Type. - Defaults to 0.1. + outlier_noise_coeff (float): Coefficient that influences the standard deviation of the gaussian noise added/subtracted to the outliers for the + Outlier Error Type. Defaults to 0.1. """ encoding_sender: str | None = None @@ -82,7 +83,7 @@ class ErrorTypeConfig: add_delta_value: float | int | None = None outlier_coin_flip_threshold: float = 0.5 - outlier_coefficient: float = 1.0 + outlier_coefficient: float = 3.0 outlier_noise_coeff: float = 0.1 def to_dict(self: ErrorTypeConfig) -> dict[str, Any]: diff --git a/tab_err/error_type/_outlier.py b/tab_err/error_type/_outlier.py index 4c2303b..ed4b904 100644 --- a/tab_err/error_type/_outlier.py +++ b/tab_err/error_type/_outlier.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from pandas.api.types import is_datetime64_dtype, is_integer_dtype, is_numeric_dtype +from pandas.api.types import is_datetime64_dtype, is_numeric_dtype from tab_err._utils import get_column @@ -10,20 +10,9 @@ class Outlier(ErrorType): - """Inserts outliers into a column by pushing data points outside the interquartile range (IQR) boundaries. - - - Data points below the mean are pushed towards lower outliers, while those above the mean are pushed towards upper outliers. - - The `outlier_coefficient` controls how far values are pushed relative to the IQR. An `outlier_coefficient` of 1.0 means the - push is equal to half of the IQR, shifting the mean value exactly to the edge of the IQR. Values that deviate more from the - mean will be pushed beyond the IQR boundary. When `outlier_coefficient` is less than 1.0, values—including the mean—are pushed - less drastically, potentially keeping them within the IQR. - - The push is calculated as: - push = outlier_coefficient * |upper_boundary - mean_value| - - Values above the mean are pushed towards the upper boundary, and values below the mean are pushed towards the lower boundary. - If a value equals the mean, a coin flip decides whether it is pushed towards the upper or lower boundary. - - After this process, Gaussian noise is added to simulate measurement errors and make the outliers appear more realistic. The - amount of noise can be controlled via the `outlier_noise_coeff` parameter and is scaled with the IQR to ensure it is proportional - to the data's spread. + """Inserts outliers into a column by adding/subtracting (k * iqr + noise) to the median of the given column. + + Determines if an outlier is above or below the median by tossing a coin for each row to be errored. """ @staticmethod @@ -58,44 +47,43 @@ def _apply(self: Outlier, data: pd.DataFrame, error_mask: pd.DataFrame, column: series = series.astype("int64") was_datetime = True - mean_value = series.mean() - q1 = series.quantile(0.25) - q3 = series.quantile(0.75) - iqr = q3 - q1 - - upper_boundary = q3 + 1.5 * iqr - lower_boundary = q1 - 1.5 * iqr - - # Pre-compute the perturbations - perturbation_upper = self.config.outlier_coefficient * (upper_boundary - mean_value) - perturbation_lower = self.config.outlier_coefficient * (mean_value - lower_boundary) - - if is_integer_dtype(series): # round float to int when series is int - perturbation_upper = np.ceil(perturbation_upper) - perturbation_lower = np.floor(perturbation_lower) - - # Get masks for the different outlier types depending on the mean - mask_lower = (series < mean_value) & series_mask - mask_upper = (series > mean_value) & series_mask - mask_equal = (series == mean_value) & series_mask - - # Apply the constant perturbation to the respective mask - series.loc[mask_lower] -= perturbation_lower - series.loc[mask_upper] += perturbation_upper - - # Handle the mean values with a coin flip - coin_flips = self._random_generator.random(mask_equal.sum()) - series.loc[mask_equal] += np.where(coin_flips > self.config.outlier_coin_flip_threshold, perturbation_upper, -perturbation_lower) - - # Apply Gaussian noise to simulate the increase in measurement error of the outliers - noise_std = self.config.outlier_noise_coeff * iqr - - if is_integer_dtype(series): # round float to int when series is int - series.loc[series_mask] += np.rint(self._random_generator.normal(loc=0, scale=noise_std, size=series_mask.sum())) - else: - series.loc[series_mask] += self._random_generator.normal(loc=0, scale=noise_std, size=series_mask.sum()) + # Set up the necessary values + median_value = series.median() + iqr = series.quantile(0.75) - series.quantile(0.25) + if iqr == 0: # To not impute the median +/- noise + iqr = 1e-9 + + # Decide which outliers are above/below the median - at least one is above/below + coin_tosses = self._random_generator.random(series_mask.sum()) < self.config.outlier_coin_flip_threshold + if series_mask.sum() > 1: + if not coin_tosses.any(): + coin_tosses[self._random_generator.integers(0, len(coin_tosses))] = True + elif coin_tosses.all(): + coin_tosses[self._random_generator.integers(0, len(coin_tosses))] = False + + neg_outliers = series_mask.copy() + neg_outliers[series_mask] = coin_tosses + pos_outliers = series_mask & ~neg_outliers + + neg_noise = ( + self._random_generator.normal(loc=0, scale=self.config.outlier_noise_coeff * iqr, size=neg_outliers.sum()) + if neg_outliers.sum() > 0 + else np.array([]) + ) + pos_noise = ( + self._random_generator.normal(loc=0, scale=self.config.outlier_noise_coeff * iqr, size=pos_outliers.sum()) + if pos_outliers.sum() > 0 + else np.array([]) + ) + + # Apply outliers + if neg_noise.size > 0: + series[neg_outliers] = median_value - (self.config.outlier_coefficient * iqr) - neg_noise + if pos_noise.size > 0: + series[pos_outliers] = median_value + (self.config.outlier_coefficient * iqr) + pos_noise if was_datetime: # Handle datetime objects + series = series.clip(lower=pd.Timestamp.min.value, upper=pd.Timestamp.max.value) series = pd.to_datetime(series) return series