diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 000000000..4a2d1e78a
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,30 @@
+version: 2.0
+jobs:
+  check:
+    docker:
+      - image: circleci/python:3.7.2
+    steps:
+      - run:
+          name: Add python user PATH into PATH
+          command: echo "export PATH=$PATH:$HOME/.local/bin" >> $BASH_ENV
+      - run:
+          name: Install python tools
+          command: pip install --user pipenv
+      - checkout
+      - run:
+          name: Install dependencies
+          command: pipenv install -d
+      - run:
+          name: Type check the project
+          command: pipenv run pytype dataprep && pipenv run mypy dataprep --strict --ignore-missing-imports
+      - run:
+          name: Test the project
+          command: pipenv run pytest dataprep
+      - run:
+          name: Style check the project
+          command: pipenv run pylint dataprep
+workflows:
+  version: 2
+  build_and_test:
+    jobs:
+      - check
diff --git a/.gitignore b/.gitignore
index 894a44cc0..280c0f5cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,6 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+# pytype
+.pytype/
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 000000000..94a25f7f4
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 000000000..8a3fc8a93
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,10 @@
+[packages]
+"dask[complete]" = "*"
+pandas = "*"
+numpy = "*"
+
+[dev-packages]
+pylint = "*"
+pytype = "*"
+pytest = "*"
+mypy = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 000000000..635953682
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,316 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "c5215368a09766f83ec69b2d2f1856b3fdd1f70d07fd55dba9400d3d7407c7ad"
+        },
+        "pipfile-spec": 6,
+        "requires": {},
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "dask": {
+            "hashes": [
+                "sha256:5e7876bae2a01b355d1969b73aeafa23310febd8c353163910b73e93dc7e492c",
+                "sha256:942edbbaceb914be3427fc6f1d5da98a31c3e9eceddcf3158a74e1d4d6fcc67c"
+            ],
+            "index": "pypi",
+            "version": "==1.2.2"
+        },
+        "numpy": {
+            "hashes": [
+                "sha256:0778076e764e146d3078b17c24c4d89e0ecd4ac5401beff8e1c87879043a0633",
+                "sha256:141c7102f20abe6cf0d54c4ced8d565b86df4d3077ba2343b61a6db996cefec7",
+                "sha256:14270a1ee8917d11e7753fb54fc7ffd1934f4d529235beec0b275e2ccf00333b",
+                "sha256:27e11c7a8ec9d5838bc59f809bfa86efc8a4fd02e58960fa9c49d998e14332d5",
+                "sha256:2a04dda79606f3d2f760384c38ccd3d5b9bb79d4c8126b67aff5eb09a253763e",
+                "sha256:3c26010c1b51e1224a3ca6b8df807de6e95128b0908c7e34f190e7775455b0ca",
+                "sha256:52c40f1a4262c896420c6ea1c6fda62cf67070e3947e3307f5562bd783a90336",
+                "sha256:6e4f8d9e8aa79321657079b9ac03f3cf3fd067bf31c1cca4f56d49543f4356a5",
+                "sha256:7242be12a58fec245ee9734e625964b97cf7e3f2f7d016603f9e56660ce479c7",
+                "sha256:7dc253b542bfd4b4eb88d9dbae4ca079e7bf2e2afd819ee18891a43db66c60c7",
+                "sha256:94f5bd885f67bbb25c82d80184abbf7ce4f6c3c3a41fbaa4182f034bba803e69",
+                "sha256:a89e188daa119ffa0d03ce5123dee3f8ffd5115c896c2a9d4f0dbb3d8b95bfa3",
+                "sha256:ad3399da9b0ca36e2f24de72f67ab2854a62e623274607e37e0ce5f5d5fa9166",
+                "sha256:b0348be89275fd1d4c44ffa39530c41a21062f52299b1e3ee7d1c61f060044b8",
+                "sha256:b5554368e4ede1856121b0dfa35ce71768102e4aa55e526cb8de7f374ff78722",
+                "sha256:cbddc56b2502d3f87fda4f98d948eb5b11f36ff3902e17cb6cc44727f2200525",
+                "sha256:d79f18f41751725c56eceab2a886f021d70fd70a6188fd386e29a045945ffc10",
"sha256:dc2ca26a19ab32dc475dbad9dfe723d3a64c835f4c23f625c2b6566ca32b9f29", + "sha256:dd9bcd4f294eb0633bb33d1a74febdd2b9018b8b8ed325f861fffcd2c7660bb8", + "sha256:e8baab1bc7c9152715844f1faca6744f2416929de10d7639ed49555a85549f52", + "sha256:ec31fe12668af687b99acf1567399632a7c47b0e17cfb9ae47c098644ef36797", + "sha256:f12b4f7e2d8f9da3141564e6737d79016fe5336cc92de6814eba579744f65b0a", + "sha256:f58ac38d5ca045a377b3b377c84df8175ab992c970a53332fa8ac2373df44ff7" + ], + "index": "pypi", + "version": "==1.16.4" + }, + "pandas": { + "hashes": [ + "sha256:071e42b89b57baa17031af8c6b6bbd2e9a5c68c595bc6bf9adabd7a9ed125d3b", + "sha256:17450e25ae69e2e6b303817bdf26b2cd57f69595d8550a77c308be0cd0fd58fa", + "sha256:17916d818592c9ec891cbef2e90f98cc85e0f1e89ed0924c9b5220dc3209c846", + "sha256:2538f099ab0e9f9c9d09bbcd94b47fd889bad06dc7ae96b1ed583f1dc1a7a822", + "sha256:366f30710172cb45a6b4f43b66c220653b1ea50303fbbd94e50571637ffb9167", + "sha256:42e5ad741a0d09232efbc7fc648226ed93306551772fc8aecc6dce9f0e676794", + "sha256:4e718e7f395ba5bfe8b6f6aaf2ff1c65a09bb77a36af6394621434e7cc813204", + "sha256:4f919f409c433577a501e023943e582c57355d50a724c589e78bc1d551a535a2", + "sha256:4fe0d7e6438212e839fc5010c78b822664f1a824c0d263fd858f44131d9166e2", + "sha256:5149a6db3e74f23dc3f5a216c2c9ae2e12920aa2d4a5b77e44e5b804a5f93248", + "sha256:627594338d6dd995cfc0bacd8e654cd9e1252d2a7c959449228df6740d737eb8", + "sha256:83c702615052f2a0a7fb1dd289726e29ec87a27272d775cb77affe749cca28f8", + "sha256:8c872f7fdf3018b7891e1e3e86c55b190e6c5cee70cab771e8f246c855001296", + "sha256:90f116086063934afd51e61a802a943826d2aac572b2f7d55caaac51c13db5b5", + "sha256:a3352bacac12e1fc646213b998bce586f965c9d431773d9e91db27c7c48a1f7d", + "sha256:bcdd06007cca02d51350f96debe51331dec429ac8f93930a43eb8fb5639e3eb5", + "sha256:c1bd07ebc15285535f61ddd8c0c75d0d6293e80e1ee6d9a8d73f3f36954342d0", + "sha256:c9a4b7c55115eb278c19aa14b34fcf5920c8fe7797a09b7b053ddd6195ea89b3", + "sha256:cc8fc0c7a8d5951dc738f1c1447f71c43734244453616f32b8aa0ef6013a5dfb", + "sha256:d7b460bc316064540ce0c41c1438c416a40746fd8a4fb2999668bf18f3c4acf1" + ], + "index": "pypi", + "version": "==0.24.2" + }, + "python-dateutil": { + "hashes": [ + "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", + "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e" + ], + "version": "==2.8.0" + }, + "pytz": { + "hashes": [ + "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda", + "sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141" + ], + "version": "==2019.1" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + } + }, + "develop": { + "astroid": { + "hashes": [ + "sha256:6560e1e1749f68c64a4b5dee4e091fce798d2f0d84ebe638cf0e0585a343acf4", + "sha256:b65db1bbaac9f9f4d190199bb8680af6f6f84fd3769a5ea883df8a91fe68b4c4" + ], + "version": "==2.2.5" + }, + "atomicwrites": { + "hashes": [ + "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", + "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" + ], + "version": "==1.3.0" + }, + "attrs": { + "hashes": [ + "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", + "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" + ], + "version": "==19.1.0" + }, + "decorator": { + "hashes": [ + 
"sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de", + "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6" + ], + "version": "==4.4.0" + }, + "importlab": { + "hashes": [ + "sha256:d855350d19dc10a17aabd2fe6f4b428ff1a936071f692fbf686a73694d26a51c" + ], + "version": "==0.5.1" + }, + "importlib-metadata": { + "hashes": [ + "sha256:a9f185022cfa69e9ca5f7eabfd5a58b689894cb78a11e3c8c89398a8ccbb8e7f", + "sha256:df1403cd3aebeb2b1dcd3515ca062eecb5bd3ea7611f18cba81130c68707e879" + ], + "version": "==0.17" + }, + "isort": { + "hashes": [ + "sha256:c40744b6bc5162bbb39c1257fe298b7a393861d50978b565f3ccd9cb9de0182a", + "sha256:f57abacd059dc3bd666258d1efb0377510a89777fda3e3274e3c01f7c03ae22d" + ], + "version": "==4.3.20" + }, + "lazy-object-proxy": { + "hashes": [ + "sha256:159a745e61422217881c4de71f9eafd9d703b93af95618635849fe469a283661", + "sha256:23f63c0821cc96a23332e45dfaa83266feff8adc72b9bcaef86c202af765244f", + "sha256:3b11be575475db2e8a6e11215f5aa95b9ec14de658628776e10d96fa0b4dac13", + "sha256:3f447aff8bc61ca8b42b73304f6a44fa0d915487de144652816f950a3f1ab821", + "sha256:4ba73f6089cd9b9478bc0a4fa807b47dbdb8fad1d8f31a0f0a5dbf26a4527a71", + "sha256:4f53eadd9932055eac465bd3ca1bd610e4d7141e1278012bd1f28646aebc1d0e", + "sha256:64483bd7154580158ea90de5b8e5e6fc29a16a9b4db24f10193f0c1ae3f9d1ea", + "sha256:6f72d42b0d04bfee2397aa1862262654b56922c20a9bb66bb76b6f0e5e4f9229", + "sha256:7c7f1ec07b227bdc561299fa2328e85000f90179a2f44ea30579d38e037cb3d4", + "sha256:7c8b1ba1e15c10b13cad4171cfa77f5bb5ec2580abc5a353907780805ebe158e", + "sha256:8559b94b823f85342e10d3d9ca4ba5478168e1ac5658a8a2f18c991ba9c52c20", + "sha256:a262c7dfb046f00e12a2bdd1bafaed2408114a89ac414b0af8755c696eb3fc16", + "sha256:acce4e3267610c4fdb6632b3886fe3f2f7dd641158a843cf6b6a68e4ce81477b", + "sha256:be089bb6b83fac7f29d357b2dc4cf2b8eb8d98fe9d9ff89f9ea6012970a853c7", + "sha256:bfab710d859c779f273cc48fb86af38d6e9210f38287df0069a63e40b45a2f5c", + "sha256:c10d29019927301d524a22ced72706380de7cfc50f767217485a912b4c8bd82a", + "sha256:dd6e2b598849b3d7aee2295ac765a578879830fb8966f70be8cd472e6069932e", + "sha256:e408f1eacc0a68fed0c08da45f31d0ebb38079f043328dce69ff133b95c29dc1" + ], + "version": "==1.4.1" + }, + "mccabe": { + "hashes": [ + "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", + "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" + ], + "version": "==0.6.1" + }, + "more-itertools": { + "hashes": [ + "sha256:2112d2ca570bb7c3e53ea1a35cd5df42bb0fd10c45f0fb97178679c3c03d64c7", + "sha256:c3e4748ba1aad8dba30a4886b0b1a2004f9a863837b8654e7059eebf727afa5a" + ], + "markers": "python_version > '2.7'", + "version": "==7.0.0" + }, + "networkx": { + "hashes": [ + "sha256:8311ddef63cf5c5c5e7c1d0212dd141d9a1fe3f474915281b73597ed5f1d4e3d" + ], + "version": "==2.3" + }, + "ninja": { + "hashes": [ + "sha256:0184e69a70bb055621935b935f967b3dc4e189c8f1494d9ea0b90ed15d0308c4", + "sha256:0d700c1471f9771978415cab503dc6b55e6267dc21428865c9a8f1a906f3a06d", + "sha256:35d3c2fd77e9271bbfb01beb2c8b733ca647356369da41bb095e14b0369ea3cf", + "sha256:6ef795816ef3cd3a2def4c4b8e5f1fb7e470bb913c0bae7bb38afe498d0075aa", + "sha256:75ebbbaeb1b3298bf001cc7866555d88ba33bbdab4cb99eae1e2a59efe23f47b", + "sha256:9090b6695d86643354cbd394ef835f40c0179cc24969d09446eae7931b702f12", + "sha256:965bf62d59f3794306b40dc08e31a9286650cff0b11b44acd0a61e61f6030553", + "sha256:a8503e5fc4f742520e5b3389324e9710eecbc9fa60956b7034adaf1f0650ba3f", + "sha256:a998d98ffd7262e03be4655e742fa918af93fb19ac36e9140afc0fe8190920a6", + 
"sha256:db31cef1eb979e4fe4539046cf04311e00f271f8687bde7dfb64d85f4e4d2b1e", + "sha256:fd72664f0e2506f2c8002f2ee67ddd50b87604fe8c1bd04d2108dfeacc82420d" + ], + "version": "==1.9.0.post1" + }, + "pluggy": { + "hashes": [ + "sha256:0825a152ac059776623854c1543d65a4ad408eb3d33ee114dff91e57ec6ae6fc", + "sha256:b9817417e95936bf75d85d3f8767f7df6cdde751fc40aed3bb3074cbcb77757c" + ], + "version": "==0.12.0" + }, + "py": { + "hashes": [ + "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", + "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" + ], + "version": "==1.8.0" + }, + "pylint": { + "hashes": [ + "sha256:5d77031694a5fb97ea95e828c8d10fc770a1df6eb3906067aaed42201a8a6a09", + "sha256:723e3db49555abaf9bf79dc474c6b9e2935ad82230b10c1138a71ea41ac0fff1" + ], + "index": "pypi", + "version": "==2.3.1" + }, + "pytest": { + "hashes": [ + "sha256:1a8aa4fa958f8f451ac5441f3ac130d9fc86ea38780dd2715e6d5c5882700b24", + "sha256:b8bf138592384bd4e87338cb0f256bf5f615398a649d4bd83915f0e4047a5ca6" + ], + "index": "pypi", + "version": "==4.5.0" + }, + "pytype": { + "hashes": [ + "sha256:ca9f387b818d5d397f0030d86aef9228a5c3f7e6319915f45bd34b9918be7b9a" + ], + "index": "pypi", + "version": "==2019.5.24" + }, + "pyyaml": { + "hashes": [ + "sha256:1adecc22f88d38052fb787d959f003811ca858b799590a5eaa70e63dca50308c", + "sha256:436bc774ecf7c103814098159fbb84c2715d25980175292c648f2da143909f95", + "sha256:460a5a4248763f6f37ea225d19d5c205677d8d525f6a83357ca622ed541830c2", + "sha256:5a22a9c84653debfbf198d02fe592c176ea548cccce47553f35f466e15cf2fd4", + "sha256:7a5d3f26b89d688db27822343dfa25c599627bc92093e788956372285c6298ad", + "sha256:9372b04a02080752d9e6f990179a4ab840227c6e2ce15b95e1278456664cf2ba", + "sha256:a5dcbebee834eaddf3fa7366316b880ff4062e4bcc9787b78c7fbb4a26ff2dd1", + "sha256:aee5bab92a176e7cd034e57f46e9df9a9862a71f8f37cad167c6fc74c65f5b4e", + "sha256:c51f642898c0bacd335fc119da60baae0824f2cde95b0330b56c0553439f0673", + "sha256:c68ea4d3ba1705da1e0d85da6684ac657912679a649e8868bd850d2c299cce13", + "sha256:e23d0cc5299223dcc37885dae624f382297717e459ea24053709675a976a3e19" + ], + "version": "==5.1" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "typed-ast": { + "hashes": [ + "sha256:132eae51d6ef3ff4a8c47c393a4ef5ebf0d1aecc96880eb5d6c8ceab7017cc9b", + "sha256:18141c1484ab8784006c839be8b985cfc82a2e9725837b0ecfa0203f71c4e39d", + "sha256:2baf617f5bbbfe73fd8846463f5aeafc912b5ee247f410700245d68525ec584a", + "sha256:3d90063f2cbbe39177e9b4d888e45777012652d6110156845b828908c51ae462", + "sha256:4304b2218b842d610aa1a1d87e1dc9559597969acc62ce717ee4dfeaa44d7eee", + "sha256:4983ede548ffc3541bae49a82675996497348e55bafd1554dc4e4a5d6eda541a", + "sha256:5315f4509c1476718a4825f45a203b82d7fdf2a6f5f0c8f166435975b1c9f7d4", + "sha256:6cdfb1b49d5345f7c2b90d638822d16ba62dc82f7616e9b4caa10b72f3f16649", + "sha256:7b325f12635598c604690efd7a0197d0b94b7d7778498e76e0710cd582fd1c7a", + "sha256:8d3b0e3b8626615826f9a626548057c5275a9733512b137984a68ba1598d3d2f", + "sha256:8f8631160c79f53081bd23446525db0bc4c5616f78d04021e6e434b286493fd7", + "sha256:912de10965f3dc89da23936f1cc4ed60764f712e5fa603a09dd904f88c996760", + "sha256:b010c07b975fe853c65d7bbe9d4ac62f1c69086750a574f6292597763781ba18", + "sha256:c908c10505904c48081a5415a1e295d8403e353e0c14c42b6d67f8f97fae6616", + "sha256:c94dd3807c0c0610f7c76f078119f4ea48235a953512752b9175f9f98f5ae2bd", + 
"sha256:ce65dee7594a84c466e79d7fb7d3303e7295d16a83c22c7c4037071b059e2c21", + "sha256:eaa9cfcb221a8a4c2889be6f93da141ac777eb8819f077e1d09fb12d00a09a93", + "sha256:f3376bc31bad66d46d44b4e6522c5c21976bf9bca4ef5987bb2bf727f4506cbb", + "sha256:f9202fa138544e13a4ec1a6792c35834250a85958fde1251b6a22e07d1260ae7" + ], + "markers": "implementation_name == 'cpython'", + "version": "==1.3.5" + }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + }, + "wrapt": { + "hashes": [ + "sha256:4aea003270831cceb8a90ff27c4031da6ead7ec1886023b80ce0dfe0adf61533" + ], + "version": "==1.11.1" + }, + "zipp": { + "hashes": [ + "sha256:8c1019c6aad13642199fbe458275ad6a84907634cc9f0989877ccc4a2840139d" + ], + "version": "==0.5.1" + } + } +} diff --git a/dataprep/__init__.py b/dataprep/__init__.py index e69de29bb..4ef038aea 100644 --- a/dataprep/__init__.py +++ b/dataprep/__init__.py @@ -0,0 +1,7 @@ + +"""Docstring + Data preparation module +""" +import logging + +logging.basicConfig(level=logging.INFO, format="%(message)") diff --git a/dataprep/eda/__init__.py b/dataprep/eda/__init__.py new file mode 100644 index 000000000..86cd58c76 --- /dev/null +++ b/dataprep/eda/__init__.py @@ -0,0 +1,11 @@ + +"""Docstring + Data preparation module +""" +import logging + +# Dask Default partitions +DEFAULT_PARTITIONS = 1 + +logging.basicConfig(level=logging.INFO, format="%(message)") +LOGGER = logging.getLogger(__name__) diff --git a/dataprep/eda/eda_plot.py b/dataprep/eda/eda_plot.py new file mode 100644 index 000000000..121a53a8e --- /dev/null +++ b/dataprep/eda/eda_plot.py @@ -0,0 +1,445 @@ +""" + This module implements the plot(df) function. +""" +from enum import Enum +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +import dask +import dask.array as da +import dask.dataframe as dd +import numpy as np +import pandas as pd + +from .__init__ import LOGGER, DEFAULT_PARTITIONS + + +class DataType(Enum): + """ + Enumeration for storing the different types of data possible in a column + """ + TYPE_NUM = 1 + TYPE_CAT = 2 + TYPE_UNSUP = 3 + + +# Type aliasing +StringList = List[str] + + +def _calc_box_stats(grp_series: Any) -> Dict[str, Any]: + stats: Dict[str, Any] = dict() + quantiles = grp_series.quantile([.25, .50, .75]).compute() + stats["25%"], stats["50%"], stats["75%"] = quantiles[.25], quantiles[.50], quantiles[.75] + stats["iqr"] = stats["75%"] - stats["25%"] + + outliers = list() + grp_series = grp_series.compute() + if len(grp_series) == 1: + stats["min"] = grp_series.reset_index().iloc[0, 1] + stats["max"] = stats["min"] + else: + min_value, max_value = np.inf, -np.inf + + for value in grp_series: + if (stats["25%"] - 1.5 * stats["iqr"]) < value < ( + stats["75%"] + 1.5 * stats["iqr"]): # data is in the bound + min_value = min(value, min_value) + max_value = max(value, max_value) + else: # otherwise, outliers + outliers.append(value) + + stats["min"] = min_value + stats["max"] = max_value + stats["outliers"] = outliers + return stats + + +def _calc_box( + dataframe: dd.DataFrame, + col_x: str, + col_y: Optional[str] = None +) -> Dict[str, Dict[str, Any]]: + """ + Returns intermediate stats of the box plot + of columns col_x and col_y. 
+
+    PARAMETERS
+    __________
+    dataframe: the input dataframe
+    col_x : a valid column name of the dataframe
+    col_y : an optional second column name; when omitted, a single box plot
+    is calculated for col_x
+
+    RETURNS
+    __________
+    a (column_name: data) dict storing the intermediate results
+    """
+    res: Dict[str, Any] = dict()
+    cat_col, num_col = (
+        col_x, col_y) if (get_type(dataframe[col_x]) == DataType.TYPE_CAT) else (col_y, col_x)
+
+    if col_y is None:
+        col_series = dataframe[col_x]
+        res = _calc_box_stats(col_series)
+    else:
+        for group in dataframe[cat_col].unique().compute():
+            grp_series = dataframe.groupby(cat_col).get_group(group)[num_col]
+            res[group] = _calc_box_stats(grp_series)
+
+    return {"box_plot": res}
+
+
+def _calc_stacked(
+        dataframe: dd.DataFrame,
+        col_x: str,
+        col_y: str
+) -> Dict[str, Dict[Tuple[Any, Any], int]]:
+    """ Returns intermediate stats of the stacked column plot
+    of columns col_x and col_y.
+
+    PARAMETERS
+    __________
+    dataframe: the input dataframe
+    col_x : a valid column name of the dataframe
+    col_y : a valid column name of the dataframe
+
+    RETURNS
+    __________
+    a (column_name: data) dict storing the intermediate results
+    """
+    grp_object = dataframe.groupby([col_x, col_y])
+
+    grp_series = grp_object.count().compute().iloc[:, 0]
+    return {"stacked_column_plot": dict(grp_series)}
+
+
+def _calc_scatter(
+        dataframe: dd.DataFrame,
+        col_x: str,
+        col_y: str
+) -> Dict[str, Dict[Union[int, float], Union[int, float]]]:
+    """
+    TODO: for a very large number of points, implement a heat map instead.
+    Returns intermediate stats of the scatter plot
+    of columns col_x and col_y.
+
+    PARAMETERS
+    __________
+    dataframe: the input dataframe
+    col_x : a valid column name of the dataframe
+    col_y : a valid column name of the dataframe
+
+    RETURNS
+    __________
+    a (column_name: data) dict storing the intermediate results
+    """
+    series_x = dataframe[col_x].compute()
+    series_y = dataframe[col_y].compute()
+
+    # deduplicate the (x, y) pairs before converting them to a dict
+    res = set(zip(series_x, series_y))
+
+    return {"scatter_plot": dict(res)}
+
+
+def _calc_pie(dataframe: dd.DataFrame, col: str) -> Dict[str, Dict[str, float]]:
+    """ Returns a dict {category: percentage} for the
+    categorical column given as the second argument
+
+    Parameters
+    __________
+    dataframe : the input dask dataframe
+    col : the str column of dataframe for which the percentages are calculated
+
+    Returns
+    __________
+    dict : a dict of (category : percentage) for the input col
+    """
+    grp_object = (dataframe.groupby(col)[col].count() / dataframe[col].size) * 100
+    return {"pie_plot": dict(grp_object.compute())}
+
+
+def _calc_bar(dataframe: dd.DataFrame, col: str) -> Dict[str, Dict[str, int]]:
+    """ Returns a dict {category: category_count} for the
+    categorical column given as the second argument
+
+    Parameters
+    __________
+    dataframe : the input dask dataframe
+    col : the str column of dataframe for which count needs to be calculated
+
+    Returns
+    __________
+    dict : a dict of (category : count) for the input col
+    """
+    grp_object = dataframe.groupby(col)[col].count()
+    return {"bar_plot": dict(grp_object.compute())}
+
+
+def _calc_hist_by_group(
+        dataframe: dd.DataFrame,
+        col_x: str,
+        col_y: str,
+        nbins: int = 10) -> Dict[str, Dict[str, Tuple[Any, Any]]]:
+    """Returns the histogram arrays for the values of the numerical column,
+    grouped by the categorical column.
+    TODO: write test
+    Parameters
+    __________
+    dataframe : the input dask dataframe
+    col_x, col_y : the categorical and numerical columns for which the grouped
+    histograms need to be calculated
+
+    Returns
+    __________
+    dict : a dict of (group : histogram) for the input columns
+    """
+    col_cat, col_num = (col_x, col_y) if (get_type(dataframe[col_x]) == DataType.TYPE_CAT) \
+        else (col_y, col_x)
+
+    grp_hist: Dict[str, Tuple[Any, Any]] = dict()
+    hist_interm: List[Any] = list()
+    grp_name_list: List[str] = list()
+
+    for group in dataframe[col_cat].unique().compute():
+        grp_series = dataframe.groupby(col_cat).get_group(group)[col_num]
+        minv = grp_series.min().compute()
+        maxv = grp_series.max().compute()
+        hist = da.histogram(grp_series, range=[minv, maxv], bins=nbins)
+        hist_interm.append(hist)
+        grp_name_list.append(group)
+
+    hist_interm, = dask.compute(hist_interm)
+
+    for grp_name, hist in zip(grp_name_list, hist_interm):
+        grp_hist[grp_name] = hist
+
+    return {"histogram": grp_hist}
+
+
+def _calc_hist(
+        dataframe: dd.DataFrame,
+        col: str,
+        nbins: int = 10) -> Dict[str, Tuple[List[Union[int, float]], List[Union[int, float]]]]:
+    """Returns the histogram array for the continuous
+    distribution of values in the column given as the second argument
+
+    Parameters
+    __________
+    dataframe : the input dask dataframe
+    col : the str column of dataframe for which hist array needs to be
+    calculated
+
+    Returns
+    __________
+    dict : a (counts, bin_edges) tuple representing the histogram for the input col
+    """
+    minv = dataframe[col].min()
+    maxv = dataframe[col].max()
+    dframe = dd.from_array(dataframe[col]).dropna()
+    hist_array, bins = da.histogram(dframe.values, range=[minv, maxv], bins=nbins)
+    hist_array = hist_array.compute()
+
+    if hist_array.size != 0:
+        return {'histogram': (hist_array, bins)}
+    return {'histogram': (list(), list())}
+
+
+def _calc_qqnorm(
+        dataframe: dd.DataFrame,
+        col: str,
+        qrange: Optional[List[int]] = None) -> Dict[str, List[Tuple[float, float]]]:
+    """
+    Calculates points of the QQ plot of the given column of the data frame.
+    :param dataframe: the input dataframe
+    :param col: the input column of the dataframe
+    :param qrange: the list of quantiles to be calculated. By default, all the
+    percentiles are calculated.
+    """
+    points = list()
+    if qrange is None:
+        qrange = list(range(1, 101))
+
+    dask_series = dataframe[col]
+    try:
+        size_ = dask_series.size.compute()
+        np.random.seed(0)
+        normal_points = np.sort(np.random.standard_normal(size=(size_, )))
+        x_points = np.percentile(normal_points, q=qrange)
+        y_points = dask_series.compute().sort_values().quantile([x / 100 for x in qrange])
+        for point in zip(x_points, y_points):
+            points.append(point)
+    except TypeError:
+        # TODO: handle non-numerical columns explicitly
+        pass
+
+    return {"qq_norm_plot": points}
+
+
+def get_type(data: dd.Series) -> DataType:
+    """ Returns the type of the input data.
+    Identified types are according to the DataType Enumeration.
+
+    Parameter
+    __________
+    data : the data for which the type needs to be identified.
+
+    Returns
+    __________
+    the DataType of the input data.
+ """ + + col_type = DataType.TYPE_UNSUP + try: + if pd.api.types.is_bool_dtype(data): + col_type = DataType.TYPE_CAT + elif pd.api.types.is_numeric_dtype(data) and data.dropna().unique().size.compute() == 2: + col_type = DataType.TYPE_CAT + elif pd.api.types.is_numeric_dtype(data): + col_type = DataType.TYPE_NUM + else: + col_type = DataType.TYPE_CAT + except NotImplementedError as error: # TO-DO + LOGGER.info("Type cannot be determined due to : %s", error) + + return col_type + + +def plot_df( + data_frame: dd.DataFrame, + force_cat: Optional[StringList] = None, + force_num: Optional[StringList] = None +) -> Dict[str, Union[Dict[str, Union[List[Any], Dict[Any, Any]]], Tuple[Any], List[Any], + Dict[Any, Any]]]: + """ + Supporting funtion to the main plot function + :param data_frame: dask dataframe + :param force_cat: list of categorical columns defined explicitly + :param force_num: list of numerical columns defined explicitly + :return: + """ + col_list = list() + dask_result: List[Any] = list() + + for col in data_frame.columns: + if data_frame[col].count().compute() == 0: + col_list.append(col) + dask_result.append(data_frame[col]) + + elif get_type(data_frame[col]) == DataType.TYPE_CAT or ( + force_cat is not None and col in force_cat): + cnt_series = dask.delayed(_calc_bar)(data_frame, col) + dask_result.append(cnt_series) + col_list.append(col) + + elif get_type(data_frame[col]) == DataType.TYPE_NUM or ( + force_num is not None and col in force_num): + hist = dask.delayed(_calc_hist)(data_frame, col) + dask_result.append(hist) + col_list.append(col) + + column_dict = dict() + computed_res, = dask.compute(dask_result) + + for each in zip(col_list, computed_res): + column_dict[each[0]] = each[1] + + return column_dict + + +def plot( + pd_data_frame: pd.DataFrame, + col_x: Optional[str] = None, + col_y: Optional[str] = None, + force_cat: Optional[StringList] = None, + force_num: Optional[StringList] = None +) -> Dict[str, Union[Dict[str, Union[List[Any], Dict[Any, Any]]], Tuple[Any], List[Any], + Dict[Any, Any]]]: + """ + Returns an intermediate representation for the plots of + different columns in the data_frame. + + Parameters + data_frame: the pandas data_frame for which plots are calculated for each + column. + col_x : A column in the data_frame. + col_y : A column in the data_frame. + force_cat: the list of columns which have to considered of type "TYPE_CAT" + force_num: the list of columns which have to considered of type "TYPE_NUM" + kwargs : TO-DO + + Returns + __________ + dict : A (column: [array/dict]) dict to encapsulate the + intermediate results. 
+ """ + data_frame: dd.DataFrame = dd.from_pandas(pd_data_frame, npartitions=DEFAULT_PARTITIONS) + + result: Dict[str, Union[Dict[str, Union[List[Any], Dict[Any, Any]]], Tuple[Any], List[Any], + Dict[Any, Any]]] = dict() + + if col_x is None and col_y is None: + result = plot_df(data_frame, force_cat, force_num) + + elif (col_x is None and col_y is not None) or (col_x is not None and col_y is None): + + target_col: str = cast(str, col_x if col_y is None else col_y) + dask_result: List[Any] = list() + + if data_frame[target_col].count() == 0: + dask_result.append([]) + + elif get_type(data_frame[target_col]) == DataType.TYPE_CAT or ( + force_cat is not None and target_col in force_cat): + # BAR_PLOT + dask_result.append(dask.delayed(_calc_bar)(data_frame, target_col)) + # PIE_CHART + dask_result.append(dask.delayed(_calc_pie)(data_frame, target_col)) + + elif get_type(data_frame[target_col]) == DataType.TYPE_NUM or ( + force_num is not None and target_col in force_num): + # HISTOGRAM + dask_result.append(dask.delayed(_calc_hist)(data_frame, target_col)) + # BOX_PLOT + dask_result.append(dask.delayed(_calc_bar)(data_frame, target_col)) + # QQ-NORM + dask_result.append(dask.delayed(_calc_qqnorm)(data_frame, target_col)) + + column_dict = {target_col: dask.compute(dask_result)} + result = column_dict + + elif col_x is not None and col_y is not None: + type_x = get_type(data_frame[col_x]) + type_y = get_type(data_frame[col_y]) + temp_dask_result: Dict[str, Any] = dict() + + try: + if type_y == DataType.TYPE_CAT and type_x == DataType.TYPE_NUM or \ + type_y == DataType.TYPE_NUM and type_x == DataType.TYPE_CAT: + # BOX_PER_GROUP + temp_dask_result.update(_calc_box(data_frame, col_x, col_y)) + # HISTOGRAM_PER_GROUP + temp_dask_result.update(_calc_hist_by_group(data_frame, col_x, col_y)) + + elif type_x == DataType.TYPE_CAT and type_y == DataType.TYPE_CAT: + temp_dask_result.update(_calc_statcked(data_frame, col_x, col_y)) + + elif type_x == DataType.TYPE_NUM and type_y == DataType.TYPE_NUM: + temp_dask_result.update(_calc_scatter(data_frame, col_x, col_y)) + else: + pass + # WARNING: _TODO + result, = dask.compute(temp_dask_result) + except NotImplementedError as error: # _TODO + LOGGER.info("Plot could not be obtained due to : %s", error) + else: + pass + # _TODO to be added + + return result diff --git a/dataprep/tests/__init__.py b/dataprep/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dataprep/tests/eda/__init__.py b/dataprep/tests/eda/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dataprep/tests/eda/test_eda.py b/dataprep/tests/eda/test_eda.py new file mode 100644 index 000000000..bb3a7fddb --- /dev/null +++ b/dataprep/tests/eda/test_eda.py @@ -0,0 +1,227 @@ +""" + module for testing plot(df, x, y) function. 
+""" +import datetime +from typing import Any, Dict, Union, cast, Tuple + +import numpy as np +import pandas as pd +from pandas import Timestamp + +from ...eda.eda_plot import plot # dataprep.tests.eda.test_eda + + +def test_normal() -> None: + """ + + :return: + """ + data_1 = { + + "id": [chr(97 + c) for c in range(1, 10)], + + "x": [50, 50, -10, 0, 0, 5, 15, -3, None], + + "y": [0.000001, 654.152, None, 15.984512, 3122, -3.1415926535, 111, + 15.9, 13.5], + + "s1": np.ones(9), + + "somedate": [datetime.date(2011, 7, 4), + datetime.datetime(2022, 1, 1, 13, 57), + datetime.datetime(1990, 12, 9), np.nan, + datetime.datetime(1990, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1898, 1, 2), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9)], + + "bool_tf": [True, True, False, True, False, True, True, False, + True], + + "bool_tf_with_nan": [True, False, False, False, False, True, True, + False, np.nan], + + "bool_01": [1, 1, 0, 1, 1, 0, 0, 0, 1], + + "bool_01_with_nan": [1, 0, 1, 0, 0, 1, 1, 0, np.nan], + + "mixed": [1, 2, "a", 4, 5, 6, 7, 8, 9] + + } + + df_1 = pd.DataFrame(data_1) + + df_1_expected: Dict[str, Dict[str, Union[Dict[Any, Any], Tuple[Any, Any]]]] = \ + {"bool_01": {"bar_plot": {0: 4, 1: 5}}, + "bool_01_with_nan": {"bar_plot": {0.0: 4, 1.0: 4}}, + "bool_tf": {"bar_plot": {False: 3, True: 6}}, + "bool_tf_with_nan": {"bar_plot": {False: 5, True: 3}}, + "s1": {"bar_plot": {1.0: 9}}, + "x": {"histogram": (np.array([1, 3, 1, 0, 1, 0, 0, 0, 0, 2], dtype=np.int64), + np.array([-10., -4., 2., 8., 14., 20., 26., + 32., 38., 44., 50.]))}, + 'y': {'histogram': (np.array([6, 0, 1, 0, 0, 0, 0, 0, 0, 1], dtype=np.int64), + np.array([-3.14159265, 309.37256661, 621.88672588, + 934.40088514, 1246.91504441, 1559.42920367, + 1871.94336294, 2184.4575222, 2496.97168147, + 2809.48584073, 3122.]))} + } + res = cast(Dict[str, Dict[str, Union[Dict[Any, Any], Tuple[Any, Any]]]], plot(df_1, + force_cat=[ + "bool_01", + "bool_01_ \ + with_nan", + "s1"])) + + assert res["bool_01"] == df_1_expected["bool_01"] + assert res["bool_01_with_nan"] == df_1_expected["bool_01_with_nan"] + assert res["bool_tf"] == df_1_expected["bool_tf"] + assert res["bool_tf_with_nan"] == df_1_expected["bool_tf_with_nan"] + assert res["s1"] == df_1_expected["s1"] + assert np.allclose(res["x"]["histogram"][0], df_1_expected["x"]["histogram"][0], equal_nan=True) + assert np.allclose(res["x"]["histogram"][1], df_1_expected["x"]["histogram"][1], equal_nan=True) + assert np.allclose(res["y"]["histogram"][0], df_1_expected["y"]["histogram"][0], equal_nan=True) + assert np.allclose(res["y"]["histogram"][1], df_1_expected["y"]["histogram"][1], equal_nan=True) + + data = { + + "id": [chr(97 + c) for c in range(1, 21)], + + "x": ["d", "c", "b", "a", "b", "d", "c", "a", "a", "a", "c", "b", + "c", "a", "d", "b", "b", "b", "b", "b"], + + "y": [794, 652, 158, 134, 448, 682, 135, 795, 353, 395, 403, 498, + 622, 80, 654, 772, 867, 676, 670, 736], + + "s1": np.ones(20), + + "somedate": [datetime.date(2011, 7, 4), + datetime.datetime(1898, 1, 2), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1898, 1, 2), + datetime.datetime(1990, 12, 9), np.nan, + datetime.datetime(1990, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1898, 1, 2), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), 
+                     datetime.datetime(1950, 12, 9),
+                     datetime.datetime(1950, 12, 9),
+                     datetime.datetime(1950, 12, 9)],
+    }
+
+    df_data = pd.DataFrame(data)
+
+    df_expected: Dict[str, Dict[str, Any]] \
+        = {"box_plot": {"a": {"25%": 134.0,
+                              "50%": 353.0,
+                              "75%": 395.0,
+                              "iqr": 261.0,
+                              "max": 395,
+                              "min": 80,
+                              "outliers": [795]},
+                        "b": {"25%": 485.5,
+                              "50%": 673.0,
+                              "75%": 745.0,
+                              "iqr": 259.5,
+                              "max": 867,
+                              "min": 158,
+                              "outliers": []},
+                        "c": {"25%": 336.0,
+                              "50%": 512.5,
+                              "75%": 629.5,
+                              "iqr": 293.5,
+                              "max": 652,
+                              "min": 135,
+                              "outliers": []},
+                        "d": {"25%": 668.0,
+                              "50%": 682.0,
+                              "75%": 738.0,
+                              "iqr": 70.0,
+                              "max": 794,
+                              "min": 654,
+                              "outliers": []}},
+           "histogram": {"d": (np.array([1, 0, 1, 0, 0, 0, 0, 0, 0, 1], dtype=np.int64),
+                               np.array([654., 668., 682., 696., 710., 724., 738., 752.,
+                                         766., 780., 794.])),
+                         "c": (np.array([1, 0, 0, 0, 0, 1, 0, 0, 0, 2], dtype=np.int64),
+                               np.array([135., 186.7, 238.4, 290.1, 341.8, 393.5,
+                                         445.2, 496.9, 548.6, 600.3, 652.])),
+                         "b": (np.array([1, 0, 0, 0, 2, 0, 0, 2, 2, 1], dtype=np.int64),
+                               np.array([158., 228.9, 299.8, 370.7, 441.6, 512.5, 583.4,
+                                         654.3, 725.2, 796.1, 867.])),
+                         "a": (np.array([2, 0, 0, 1, 1, 0, 0, 0, 0, 1], dtype=np.int64),
+                               np.array([80., 151.5, 223., 294.5, 366., 437.5, 509.,
+                                         580.5, 652., 723.5, 795.]))}
+           }
+    another_res = cast(Dict[str, Dict[str, Any]], plot(df_data, "y", "x"))
+
+    assert another_res["box_plot"]["a"] == df_expected["box_plot"]["a"]
+    assert another_res["box_plot"]["b"] == df_expected["box_plot"]["b"]
+    assert another_res["box_plot"]["c"] == df_expected["box_plot"]["c"]
+    assert another_res["box_plot"]["d"] == df_expected["box_plot"]["d"]
+
+    assert np.allclose(another_res["histogram"]["a"][0], df_expected["histogram"]["a"][0],
+                       equal_nan=True)
+    assert np.allclose(another_res["histogram"]["b"][0], df_expected["histogram"]["b"][0],
+                       equal_nan=True)
+    assert np.allclose(another_res["histogram"]["c"][0], df_expected["histogram"]["c"][0],
+                       equal_nan=True)
+    assert np.allclose(another_res["histogram"]["d"][0], df_expected["histogram"]["d"][0],
+                       equal_nan=True)
+
+    df_expected_2 = {"stacked_column_plot": {("a", Timestamp("1898-01-02 00:00:00")): 1,
+                                             ("a", Timestamp("1950-12-09 00:00:00")): 3,
+                                             ("a", Timestamp("1990-12-09 00:00:00")): 1,
+                                             ("b", Timestamp("1898-01-02 00:00:00")): 1,
+                                             ("b", Timestamp("1950-12-09 00:00:00")): 7,
+                                             ("c", Timestamp("1898-01-02 00:00:00")): 1,
+                                             ("c", Timestamp("1950-12-09 00:00:00")): 2,
+                                             ("d", Timestamp("1950-12-09 00:00:00")): 1,
+                                             ("d", Timestamp("1990-12-09 00:00:00")): 1,
+                                             ("d", Timestamp("2011-07-04 00:00:00")): 1
+                                             }
+                     }
+
+    res_2 = plot(df_data, "x", "somedate")
+    assert df_expected_2["stacked_column_plot"] == res_2["stacked_column_plot"]
+
+
+def test_corner() -> None:
+    """
+    Tests plot() on corner cases: all-NaN, constant, and empty columns.
+    """
+    df_2 = pd.DataFrame(
+        {"all_nan": [np.nan for _ in range(10)], "all_one": np.ones(10),
+         "all_zeros": np.zeros(10), "random": np.array(
+             [0.38538395, 0.13609054, 0.15973238, 0.96192966, 0.03708882,
+              0.03633855, 0.25260128, 0.72139843, 0.74553949,
+              0.41102021])})
+
+    df_1_expected = {"all_one": {"bar_plot": {1.0: 10}},
+                     "all_zeros": {"bar_plot": {0.0: 10}},
+                     "random": {"bar_plot": np.array([2, 2, 1, 1, 1, 0, 0, 2, 0, 1],
+                                                     dtype=np.int64)}}
+
+    res = plot(df_2, force_cat=["all_one", "all_zeros"])
+
+    assert res["all_one"] == df_1_expected["all_one"]
+    assert res["all_zeros"] == df_1_expected["all_zeros"]
+
+    df_2 = pd.DataFrame({
+        "empty": [],
+        "another_empty": []
+    })
+
+    df_2_expected: Dict[str, Any] = {'scatter_plot': {}}
+
+    res = plot(df_2, "empty", "another_empty")
"another_empty") + assert res == df_2_expected diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 000000000..49f577be3 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,8 @@ + + +[mypy] +mypy_path = "venv/Lib/site-packages" +ignore_missing_imports = False + +[mypy-dataprep.tests.*] +ignore_errors = False \ No newline at end of file diff --git a/pytype.cfg b/pytype.cfg new file mode 100644 index 000000000..5247f9b01 --- /dev/null +++ b/pytype.cfg @@ -0,0 +1,38 @@ +# NOTE: All relative paths are relative to the location of this file. + +[pytype] + +# Space-separated list of files or directories to exclude. +exclude = + **/*_test.py + **/test_*.py + +# Space-separated list of files or directories to process. +inputs = + . + +# Keep going past errors to analyze as many files as possible. +keep_going = False + +# All pytype output goes here. +output = .pytype + +# Paths to source code directories, separated by ':'. +pythonpath = + . + +# Python version (major.minor) of the target code. +python_version = 3.7 + +# Comma separated list of error names to ignore. +disable = + pyi-error + +# Don't report errors. +report_errors = True + +# Experimental: solve unknown types to label with structural types. +protocols = False + +# Experimental: Only load submodules that are explicitly imported. +strict_import = False diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 000000000..a4f3b40ac --- /dev/null +++ b/setup.cfg @@ -0,0 +1,6 @@ +[pytype] +inputs = dataprep + +[pep8] +ignore = +max-line-length = 80 \ No newline at end of file