diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 000000000..4a2d1e78a
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,30 @@
+version: 2.0
+jobs:
+  check:
+    docker:
+      - image: circleci/python:3.7.2
+    steps:
+      - run:
+          name: Add python user PATH into PATH
+          command: echo "export PATH=$PATH:$HOME/.local/bin" >> $BASH_ENV
+      - run:
+          name: Install python tools
+          command: pip install --user pipenv
+      - checkout
+      - run:
+          name: Install dependencies
+          command: pipenv install -d
+      - run:
+          name: Type check the project
+          command: pipenv run pytype dataprep && pipenv run mypy dataprep --strict --ignore-missing-imports
+      - run:
+          name: Test the project
+          command: pipenv run pytest dataprep
+      - run:
+          name: Style check the project
+          command: pipenv run pylint dataprep
+workflows:
+  version: 2
+  build_and_test:
+    jobs:
+      - check
diff --git a/.gitignore b/.gitignore
index 894a44cc0..280c0f5cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,6 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+# pytype
+.pytype/
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 000000000..94a25f7f4
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 000000000..8a3fc8a93
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,10 @@
+[packages]
+"dask[complete]" = "*"
+pandas = "*"
+numpy = "*"
+
+[dev-packages]
+pylint = "*"
+pytype = "*"
+pytest = "*"
+mypy = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 000000000..635953682
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,316 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "c5215368a09766f83ec69b2d2f1856b3fdd1f70d07fd55dba9400d3d7407c7ad"
+        },
+        "pipfile-spec": 6,
+        "requires": {},
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "dask": {
+            "hashes": [
+                "sha256:5e7876bae2a01b355d1969b73aeafa23310febd8c353163910b73e93dc7e492c",
+                "sha256:942edbbaceb914be3427fc6f1d5da98a31c3e9eceddcf3158a74e1d4d6fcc67c"
+            ],
+            "index": "pypi",
+            "version": "==1.2.2"
+        },
+        "numpy": {
+            "hashes": [
+                "sha256:0778076e764e146d3078b17c24c4d89e0ecd4ac5401beff8e1c87879043a0633",
+                "sha256:141c7102f20abe6cf0d54c4ced8d565b86df4d3077ba2343b61a6db996cefec7",
+                "sha256:14270a1ee8917d11e7753fb54fc7ffd1934f4d529235beec0b275e2ccf00333b",
+                "sha256:27e11c7a8ec9d5838bc59f809bfa86efc8a4fd02e58960fa9c49d998e14332d5",
+                "sha256:2a04dda79606f3d2f760384c38ccd3d5b9bb79d4c8126b67aff5eb09a253763e",
+                "sha256:3c26010c1b51e1224a3ca6b8df807de6e95128b0908c7e34f190e7775455b0ca",
+                "sha256:52c40f1a4262c896420c6ea1c6fda62cf67070e3947e3307f5562bd783a90336",
+                "sha256:6e4f8d9e8aa79321657079b9ac03f3cf3fd067bf31c1cca4f56d49543f4356a5",
+                "sha256:7242be12a58fec245ee9734e625964b97cf7e3f2f7d016603f9e56660ce479c7",
+                "sha256:7dc253b542bfd4b4eb88d9dbae4ca079e7bf2e2afd819ee18891a43db66c60c7",
+                "sha256:94f5bd885f67bbb25c82d80184abbf7ce4f6c3c3a41fbaa4182f034bba803e69",
+                "sha256:a89e188daa119ffa0d03ce5123dee3f8ffd5115c896c2a9d4f0dbb3d8b95bfa3",
+                "sha256:ad3399da9b0ca36e2f24de72f67ab2854a62e623274607e37e0ce5f5d5fa9166",
+                "sha256:b0348be89275fd1d4c44ffa39530c41a21062f52299b1e3ee7d1c61f060044b8",
+                "sha256:b5554368e4ede1856121b0dfa35ce71768102e4aa55e526cb8de7f374ff78722",
+                "sha256:cbddc56b2502d3f87fda4f98d948eb5b11f36ff3902e17cb6cc44727f2200525",
+                "sha256:d79f18f41751725c56eceab2a886f021d70fd70a6188fd386e29a045945ffc10",
"sha256:dc2ca26a19ab32dc475dbad9dfe723d3a64c835f4c23f625c2b6566ca32b9f29", + "sha256:dd9bcd4f294eb0633bb33d1a74febdd2b9018b8b8ed325f861fffcd2c7660bb8", + "sha256:e8baab1bc7c9152715844f1faca6744f2416929de10d7639ed49555a85549f52", + "sha256:ec31fe12668af687b99acf1567399632a7c47b0e17cfb9ae47c098644ef36797", + "sha256:f12b4f7e2d8f9da3141564e6737d79016fe5336cc92de6814eba579744f65b0a", + "sha256:f58ac38d5ca045a377b3b377c84df8175ab992c970a53332fa8ac2373df44ff7" + ], + "index": "pypi", + "version": "==1.16.4" + }, + "pandas": { + "hashes": [ + "sha256:071e42b89b57baa17031af8c6b6bbd2e9a5c68c595bc6bf9adabd7a9ed125d3b", + "sha256:17450e25ae69e2e6b303817bdf26b2cd57f69595d8550a77c308be0cd0fd58fa", + "sha256:17916d818592c9ec891cbef2e90f98cc85e0f1e89ed0924c9b5220dc3209c846", + "sha256:2538f099ab0e9f9c9d09bbcd94b47fd889bad06dc7ae96b1ed583f1dc1a7a822", + "sha256:366f30710172cb45a6b4f43b66c220653b1ea50303fbbd94e50571637ffb9167", + "sha256:42e5ad741a0d09232efbc7fc648226ed93306551772fc8aecc6dce9f0e676794", + "sha256:4e718e7f395ba5bfe8b6f6aaf2ff1c65a09bb77a36af6394621434e7cc813204", + "sha256:4f919f409c433577a501e023943e582c57355d50a724c589e78bc1d551a535a2", + "sha256:4fe0d7e6438212e839fc5010c78b822664f1a824c0d263fd858f44131d9166e2", + "sha256:5149a6db3e74f23dc3f5a216c2c9ae2e12920aa2d4a5b77e44e5b804a5f93248", + "sha256:627594338d6dd995cfc0bacd8e654cd9e1252d2a7c959449228df6740d737eb8", + "sha256:83c702615052f2a0a7fb1dd289726e29ec87a27272d775cb77affe749cca28f8", + "sha256:8c872f7fdf3018b7891e1e3e86c55b190e6c5cee70cab771e8f246c855001296", + "sha256:90f116086063934afd51e61a802a943826d2aac572b2f7d55caaac51c13db5b5", + "sha256:a3352bacac12e1fc646213b998bce586f965c9d431773d9e91db27c7c48a1f7d", + "sha256:bcdd06007cca02d51350f96debe51331dec429ac8f93930a43eb8fb5639e3eb5", + "sha256:c1bd07ebc15285535f61ddd8c0c75d0d6293e80e1ee6d9a8d73f3f36954342d0", + "sha256:c9a4b7c55115eb278c19aa14b34fcf5920c8fe7797a09b7b053ddd6195ea89b3", + "sha256:cc8fc0c7a8d5951dc738f1c1447f71c43734244453616f32b8aa0ef6013a5dfb", + "sha256:d7b460bc316064540ce0c41c1438c416a40746fd8a4fb2999668bf18f3c4acf1" + ], + "index": "pypi", + "version": "==0.24.2" + }, + "python-dateutil": { + "hashes": [ + "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", + "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e" + ], + "version": "==2.8.0" + }, + "pytz": { + "hashes": [ + "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda", + "sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141" + ], + "version": "==2019.1" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + } + }, + "develop": { + "astroid": { + "hashes": [ + "sha256:6560e1e1749f68c64a4b5dee4e091fce798d2f0d84ebe638cf0e0585a343acf4", + "sha256:b65db1bbaac9f9f4d190199bb8680af6f6f84fd3769a5ea883df8a91fe68b4c4" + ], + "version": "==2.2.5" + }, + "atomicwrites": { + "hashes": [ + "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", + "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" + ], + "version": "==1.3.0" + }, + "attrs": { + "hashes": [ + "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", + "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" + ], + "version": "==19.1.0" + }, + "decorator": { + "hashes": [ + 
"sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de", + "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6" + ], + "version": "==4.4.0" + }, + "importlab": { + "hashes": [ + "sha256:d855350d19dc10a17aabd2fe6f4b428ff1a936071f692fbf686a73694d26a51c" + ], + "version": "==0.5.1" + }, + "importlib-metadata": { + "hashes": [ + "sha256:a9f185022cfa69e9ca5f7eabfd5a58b689894cb78a11e3c8c89398a8ccbb8e7f", + "sha256:df1403cd3aebeb2b1dcd3515ca062eecb5bd3ea7611f18cba81130c68707e879" + ], + "version": "==0.17" + }, + "isort": { + "hashes": [ + "sha256:c40744b6bc5162bbb39c1257fe298b7a393861d50978b565f3ccd9cb9de0182a", + "sha256:f57abacd059dc3bd666258d1efb0377510a89777fda3e3274e3c01f7c03ae22d" + ], + "version": "==4.3.20" + }, + "lazy-object-proxy": { + "hashes": [ + "sha256:159a745e61422217881c4de71f9eafd9d703b93af95618635849fe469a283661", + "sha256:23f63c0821cc96a23332e45dfaa83266feff8adc72b9bcaef86c202af765244f", + "sha256:3b11be575475db2e8a6e11215f5aa95b9ec14de658628776e10d96fa0b4dac13", + "sha256:3f447aff8bc61ca8b42b73304f6a44fa0d915487de144652816f950a3f1ab821", + "sha256:4ba73f6089cd9b9478bc0a4fa807b47dbdb8fad1d8f31a0f0a5dbf26a4527a71", + "sha256:4f53eadd9932055eac465bd3ca1bd610e4d7141e1278012bd1f28646aebc1d0e", + "sha256:64483bd7154580158ea90de5b8e5e6fc29a16a9b4db24f10193f0c1ae3f9d1ea", + "sha256:6f72d42b0d04bfee2397aa1862262654b56922c20a9bb66bb76b6f0e5e4f9229", + "sha256:7c7f1ec07b227bdc561299fa2328e85000f90179a2f44ea30579d38e037cb3d4", + "sha256:7c8b1ba1e15c10b13cad4171cfa77f5bb5ec2580abc5a353907780805ebe158e", + "sha256:8559b94b823f85342e10d3d9ca4ba5478168e1ac5658a8a2f18c991ba9c52c20", + "sha256:a262c7dfb046f00e12a2bdd1bafaed2408114a89ac414b0af8755c696eb3fc16", + "sha256:acce4e3267610c4fdb6632b3886fe3f2f7dd641158a843cf6b6a68e4ce81477b", + "sha256:be089bb6b83fac7f29d357b2dc4cf2b8eb8d98fe9d9ff89f9ea6012970a853c7", + "sha256:bfab710d859c779f273cc48fb86af38d6e9210f38287df0069a63e40b45a2f5c", + "sha256:c10d29019927301d524a22ced72706380de7cfc50f767217485a912b4c8bd82a", + "sha256:dd6e2b598849b3d7aee2295ac765a578879830fb8966f70be8cd472e6069932e", + "sha256:e408f1eacc0a68fed0c08da45f31d0ebb38079f043328dce69ff133b95c29dc1" + ], + "version": "==1.4.1" + }, + "mccabe": { + "hashes": [ + "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", + "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" + ], + "version": "==0.6.1" + }, + "more-itertools": { + "hashes": [ + "sha256:2112d2ca570bb7c3e53ea1a35cd5df42bb0fd10c45f0fb97178679c3c03d64c7", + "sha256:c3e4748ba1aad8dba30a4886b0b1a2004f9a863837b8654e7059eebf727afa5a" + ], + "markers": "python_version > '2.7'", + "version": "==7.0.0" + }, + "networkx": { + "hashes": [ + "sha256:8311ddef63cf5c5c5e7c1d0212dd141d9a1fe3f474915281b73597ed5f1d4e3d" + ], + "version": "==2.3" + }, + "ninja": { + "hashes": [ + "sha256:0184e69a70bb055621935b935f967b3dc4e189c8f1494d9ea0b90ed15d0308c4", + "sha256:0d700c1471f9771978415cab503dc6b55e6267dc21428865c9a8f1a906f3a06d", + "sha256:35d3c2fd77e9271bbfb01beb2c8b733ca647356369da41bb095e14b0369ea3cf", + "sha256:6ef795816ef3cd3a2def4c4b8e5f1fb7e470bb913c0bae7bb38afe498d0075aa", + "sha256:75ebbbaeb1b3298bf001cc7866555d88ba33bbdab4cb99eae1e2a59efe23f47b", + "sha256:9090b6695d86643354cbd394ef835f40c0179cc24969d09446eae7931b702f12", + "sha256:965bf62d59f3794306b40dc08e31a9286650cff0b11b44acd0a61e61f6030553", + "sha256:a8503e5fc4f742520e5b3389324e9710eecbc9fa60956b7034adaf1f0650ba3f", + "sha256:a998d98ffd7262e03be4655e742fa918af93fb19ac36e9140afc0fe8190920a6", + 
"sha256:db31cef1eb979e4fe4539046cf04311e00f271f8687bde7dfb64d85f4e4d2b1e", + "sha256:fd72664f0e2506f2c8002f2ee67ddd50b87604fe8c1bd04d2108dfeacc82420d" + ], + "version": "==1.9.0.post1" + }, + "pluggy": { + "hashes": [ + "sha256:0825a152ac059776623854c1543d65a4ad408eb3d33ee114dff91e57ec6ae6fc", + "sha256:b9817417e95936bf75d85d3f8767f7df6cdde751fc40aed3bb3074cbcb77757c" + ], + "version": "==0.12.0" + }, + "py": { + "hashes": [ + "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", + "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" + ], + "version": "==1.8.0" + }, + "pylint": { + "hashes": [ + "sha256:5d77031694a5fb97ea95e828c8d10fc770a1df6eb3906067aaed42201a8a6a09", + "sha256:723e3db49555abaf9bf79dc474c6b9e2935ad82230b10c1138a71ea41ac0fff1" + ], + "index": "pypi", + "version": "==2.3.1" + }, + "pytest": { + "hashes": [ + "sha256:1a8aa4fa958f8f451ac5441f3ac130d9fc86ea38780dd2715e6d5c5882700b24", + "sha256:b8bf138592384bd4e87338cb0f256bf5f615398a649d4bd83915f0e4047a5ca6" + ], + "index": "pypi", + "version": "==4.5.0" + }, + "pytype": { + "hashes": [ + "sha256:ca9f387b818d5d397f0030d86aef9228a5c3f7e6319915f45bd34b9918be7b9a" + ], + "index": "pypi", + "version": "==2019.5.24" + }, + "pyyaml": { + "hashes": [ + "sha256:1adecc22f88d38052fb787d959f003811ca858b799590a5eaa70e63dca50308c", + "sha256:436bc774ecf7c103814098159fbb84c2715d25980175292c648f2da143909f95", + "sha256:460a5a4248763f6f37ea225d19d5c205677d8d525f6a83357ca622ed541830c2", + "sha256:5a22a9c84653debfbf198d02fe592c176ea548cccce47553f35f466e15cf2fd4", + "sha256:7a5d3f26b89d688db27822343dfa25c599627bc92093e788956372285c6298ad", + "sha256:9372b04a02080752d9e6f990179a4ab840227c6e2ce15b95e1278456664cf2ba", + "sha256:a5dcbebee834eaddf3fa7366316b880ff4062e4bcc9787b78c7fbb4a26ff2dd1", + "sha256:aee5bab92a176e7cd034e57f46e9df9a9862a71f8f37cad167c6fc74c65f5b4e", + "sha256:c51f642898c0bacd335fc119da60baae0824f2cde95b0330b56c0553439f0673", + "sha256:c68ea4d3ba1705da1e0d85da6684ac657912679a649e8868bd850d2c299cce13", + "sha256:e23d0cc5299223dcc37885dae624f382297717e459ea24053709675a976a3e19" + ], + "version": "==5.1" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "typed-ast": { + "hashes": [ + "sha256:132eae51d6ef3ff4a8c47c393a4ef5ebf0d1aecc96880eb5d6c8ceab7017cc9b", + "sha256:18141c1484ab8784006c839be8b985cfc82a2e9725837b0ecfa0203f71c4e39d", + "sha256:2baf617f5bbbfe73fd8846463f5aeafc912b5ee247f410700245d68525ec584a", + "sha256:3d90063f2cbbe39177e9b4d888e45777012652d6110156845b828908c51ae462", + "sha256:4304b2218b842d610aa1a1d87e1dc9559597969acc62ce717ee4dfeaa44d7eee", + "sha256:4983ede548ffc3541bae49a82675996497348e55bafd1554dc4e4a5d6eda541a", + "sha256:5315f4509c1476718a4825f45a203b82d7fdf2a6f5f0c8f166435975b1c9f7d4", + "sha256:6cdfb1b49d5345f7c2b90d638822d16ba62dc82f7616e9b4caa10b72f3f16649", + "sha256:7b325f12635598c604690efd7a0197d0b94b7d7778498e76e0710cd582fd1c7a", + "sha256:8d3b0e3b8626615826f9a626548057c5275a9733512b137984a68ba1598d3d2f", + "sha256:8f8631160c79f53081bd23446525db0bc4c5616f78d04021e6e434b286493fd7", + "sha256:912de10965f3dc89da23936f1cc4ed60764f712e5fa603a09dd904f88c996760", + "sha256:b010c07b975fe853c65d7bbe9d4ac62f1c69086750a574f6292597763781ba18", + "sha256:c908c10505904c48081a5415a1e295d8403e353e0c14c42b6d67f8f97fae6616", + "sha256:c94dd3807c0c0610f7c76f078119f4ea48235a953512752b9175f9f98f5ae2bd", + 
"sha256:ce65dee7594a84c466e79d7fb7d3303e7295d16a83c22c7c4037071b059e2c21", + "sha256:eaa9cfcb221a8a4c2889be6f93da141ac777eb8819f077e1d09fb12d00a09a93", + "sha256:f3376bc31bad66d46d44b4e6522c5c21976bf9bca4ef5987bb2bf727f4506cbb", + "sha256:f9202fa138544e13a4ec1a6792c35834250a85958fde1251b6a22e07d1260ae7" + ], + "markers": "implementation_name == 'cpython'", + "version": "==1.3.5" + }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + }, + "wrapt": { + "hashes": [ + "sha256:4aea003270831cceb8a90ff27c4031da6ead7ec1886023b80ce0dfe0adf61533" + ], + "version": "==1.11.1" + }, + "zipp": { + "hashes": [ + "sha256:8c1019c6aad13642199fbe458275ad6a84907634cc9f0989877ccc4a2840139d" + ], + "version": "==0.5.1" + } + } +} diff --git a/dataprep/__init__.py b/dataprep/__init__.py index e69de29bb..4ef038aea 100644 --- a/dataprep/__init__.py +++ b/dataprep/__init__.py @@ -0,0 +1,7 @@ + +"""Docstring + Data preparation module +""" +import logging + +logging.basicConfig(level=logging.INFO, format="%(message)") diff --git a/dataprep/eda/__init__.py b/dataprep/eda/__init__.py new file mode 100644 index 000000000..86cd58c76 --- /dev/null +++ b/dataprep/eda/__init__.py @@ -0,0 +1,11 @@ + +"""Docstring + Data preparation module +""" +import logging + +# Dask Default partitions +DEFAULT_PARTITIONS = 1 + +logging.basicConfig(level=logging.INFO, format="%(message)") +LOGGER = logging.getLogger(__name__) diff --git a/dataprep/eda/eda_plot.py b/dataprep/eda/eda_plot.py new file mode 100644 index 000000000..121a53a8e --- /dev/null +++ b/dataprep/eda/eda_plot.py @@ -0,0 +1,445 @@ +""" + This module implements the plot(df) function. +""" +from enum import Enum +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +import dask +import dask.array as da +import dask.dataframe as dd +import numpy as np +import pandas as pd + +from .__init__ import LOGGER, DEFAULT_PARTITIONS + + +class DataType(Enum): + """ + Enumeration for storing the different types of data possible in a column + """ + TYPE_NUM = 1 + TYPE_CAT = 2 + TYPE_UNSUP = 3 + + +# Type aliasing +StringList = List[str] + + +def _calc_box_stats(grp_series: Any) -> Dict[str, Any]: + stats: Dict[str, Any] = dict() + quantiles = grp_series.quantile([.25, .50, .75]).compute() + stats["25%"], stats["50%"], stats["75%"] = quantiles[.25], quantiles[.50], quantiles[.75] + stats["iqr"] = stats["75%"] - stats["25%"] + + outliers = list() + grp_series = grp_series.compute() + if len(grp_series) == 1: + stats["min"] = grp_series.reset_index().iloc[0, 1] + stats["max"] = stats["min"] + else: + min_value, max_value = np.inf, -np.inf + + for value in grp_series: + if (stats["25%"] - 1.5 * stats["iqr"]) < value < ( + stats["75%"] + 1.5 * stats["iqr"]): # data is in the bound + min_value = min(value, min_value) + max_value = max(value, max_value) + else: # otherwise, outliers + outliers.append(value) + + stats["min"] = min_value + stats["max"] = max_value + stats["outliers"] = outliers + return stats + + +def _calc_box( + dataframe: dd.DataFrame, + col_x: str, + col_y: Optional[str] = None +) -> Dict[str, Dict[str, Any]]: + """ + Returns intermediate stats of the box plot + of columns col_x and col_y. 
+
+    PARAMETERS
+    __________
+    dataframe: the input dataframe
+    col_x : a valid column name of the dataframe
+    col_y : an optional second column name; when omitted, a single box plot
+    is calculated for col_x
+
+    RETURNS
+    __________
+    a (column_name: data) dict storing the intermediate results
+    """
+    res: Dict[str, Any] = dict()
+    cat_col, num_col = (
+        col_x, col_y) if (get_type(dataframe[col_x]) == DataType.TYPE_CAT) else (col_y, col_x)
+
+    if col_y is None:
+        col_series = dataframe[col_x]
+        res = _calc_box_stats(col_series)
+    else:
+        for group in dataframe[cat_col].unique().compute():
+            grp_series = dataframe.groupby(cat_col).get_group(group)[num_col]
+            res[group] = _calc_box_stats(grp_series)
+
+    return {"box_plot": res}
+
+
+def _calc_stacked(
+        dataframe: dd.DataFrame,
+        col_x: str,
+        col_y: str
+) -> Dict[str, Dict[Tuple[Any, Any], int]]:
+    """ Returns intermediate stats of the stacked column plot
+    of columns col_x and col_y.
+
+    PARAMETERS
+    __________
+    dataframe: the input dataframe
+    col_x : a valid column name of the dataframe
+    col_y : a valid column name of the dataframe
+
+    RETURNS
+    __________
+    a (column_name: data) dict storing the intermediate results
+    """
+    grp_object = dataframe.groupby([col_x, col_y])
+
+    grp_series = grp_object.count().compute().iloc[:, 0]
+    return {"stacked_column_plot": dict(grp_series)}
+
+
+def _calc_scatter(
+        dataframe: dd.DataFrame,
+        col_x: str,
+        col_y: str
+) -> Dict[str, Dict[Union[int, float], Union[int, float]]]:
+    """
+    TODO: for a very large number of points, implement a heat map instead.
+    Returns intermediate stats of the scatter plot
+    of columns col_x and col_y.
+
+    PARAMETERS
+    __________
+    dataframe: the input dataframe
+    col_x : a valid column name of the dataframe
+    col_y : a valid column name of the dataframe
+
+    RETURNS
+    __________
+    a (column_name: data) dict storing the intermediate results
+    """
+    series_x = dataframe[col_x].compute()
+    series_y = dataframe[col_y].compute()
+
+    # deduplicate the (x, y) pairs before converting them to a dict
+    res = set(zip(series_x, series_y))
+
+    return {"scatter_plot": dict(res)}
+
+
+def _calc_pie(dataframe: dd.DataFrame, col: str) -> Dict[str, Dict[str, float]]:
+    """ Returns a dict {category: percentage} for the
+    categorical column given as the second argument
+
+    Parameters
+    __________
+    dataframe : the input dask dataframe
+    col : the str column of dataframe for which the percentages are calculated
+
+    Returns
+    __________
+    dict : a dict of (category : percentage) for the input col
+    """
+    grp_object = (dataframe.groupby(col)[col].count() / dataframe[col].size) * 100
+    return {"pie_plot": dict(grp_object.compute())}
+
+
+def _calc_bar(dataframe: dd.DataFrame, col: str) -> Dict[str, Dict[str, int]]:
+    """ Returns a dict {category: category_count} for the
+    categorical column given as the second argument
+
+    Parameters
+    __________
+    dataframe : the input dask dataframe
+    col : the str column of dataframe for which count needs to be calculated
+
+    Returns
+    __________
+    dict : a dict of (category : count) for the input col
+    """
+    grp_object = dataframe.groupby(col)[col].count()
+    return {"bar_plot": dict(grp_object.compute())}
+
+
+def _calc_hist_by_group(
+        dataframe: dd.DataFrame,
+        col_x: str,
+        col_y: str,
+        nbins: int = 10) -> Dict[str, Dict[str, Tuple[Any, Any]]]:
+    """Returns the histogram arrays for the values of the numerical column,
+    grouped by the categorical column.
+    TODO: write test
+    Parameters
+    __________
+    dataframe : the input dask dataframe
+    col_x, col_y : the categorical and numerical columns for which the grouped
+    histograms need to be calculated
+
+    Returns
+    __________
+    dict : a dict of (group : histogram) for the input columns
+    """
+    col_cat, col_num = (col_x, col_y) if (get_type(dataframe[col_x]) == DataType.TYPE_CAT) \
+        else (col_y, col_x)
+
+    grp_hist: Dict[str, Tuple[Any, Any]] = dict()
+    hist_interm: List[Any] = list()
+    grp_name_list: List[str] = list()
+
+    for group in dataframe[col_cat].unique().compute():
+        grp_series = dataframe.groupby(col_cat).get_group(group)[col_num]
+        minv = grp_series.min().compute()
+        maxv = grp_series.max().compute()
+        hist = da.histogram(grp_series, range=[minv, maxv], bins=nbins)
+        hist_interm.append(hist)
+        grp_name_list.append(group)
+
+    hist_interm, = dask.compute(hist_interm)
+
+    for grp_name, hist in zip(grp_name_list, hist_interm):
+        grp_hist[grp_name] = hist
+
+    return {"histogram": grp_hist}
+
+
+def _calc_hist(
+        dataframe: dd.DataFrame,
+        col: str,
+        nbins: int = 10) -> Dict[str, Tuple[List[Union[int, float]], List[Union[int, float]]]]:
+    """Returns the histogram array for the continuous
+    distribution of values in the column given as the second argument
+
+    Parameters
+    __________
+    dataframe : the input dask dataframe
+    col : the str column of dataframe for which hist array needs to be
+    calculated
+
+    Returns
+    __________
+    dict : a (counts, bin_edges) tuple representing the histogram for the input col
+    """
+    minv = dataframe[col].min()
+    maxv = dataframe[col].max()
+    dframe = dd.from_array(dataframe[col]).dropna()
+    hist_array, bins = da.histogram(dframe.values, range=[minv, maxv], bins=nbins)
+    hist_array = hist_array.compute()
+
+    if hist_array.size != 0:
+        return {'histogram': (hist_array, bins)}
+    return {'histogram': (list(), list())}
+
+
+def _calc_qqnorm(
+        dataframe: dd.DataFrame,
+        col: str,
+        qrange: Optional[List[int]] = None) -> Dict[str, List[Tuple[float, float]]]:
+    """
+    Calculates points of the QQ plot of the given column of the data frame.
+    :param dataframe: the input dataframe
+    :param col: the input column of the dataframe
+    :param qrange: the list of quantiles to be calculated. By default, all the
+    percentiles are calculated.
+    """
+    points = list()
+    if qrange is None:
+        qrange = list(range(1, 101))
+
+    dask_series = dataframe[col]
+    try:
+        size_ = dask_series.size.compute()
+        np.random.seed(0)
+        normal_points = np.sort(np.random.standard_normal(size=(size_, )))
+        x_points = np.percentile(normal_points, q=qrange)
+        y_points = dask_series.compute().sort_values().quantile([x / 100 for x in qrange])
+        for point in zip(x_points, y_points):
+            points.append(point)
+    except TypeError:
+        # TODO: handle non-numerical columns explicitly
+        pass
+
+    return {"qq_norm_plot": points}
+
+
+def get_type(data: dd.Series) -> DataType:
+    """ Returns the type of the input data.
+    Identified types are according to the DataType Enumeration.
+
+    Parameter
+    __________
+    data : the data for which the type needs to be identified.
+
+    Returns
+    __________
+    the DataType of the input data.
+ """ + + col_type = DataType.TYPE_UNSUP + try: + if pd.api.types.is_bool_dtype(data): + col_type = DataType.TYPE_CAT + elif pd.api.types.is_numeric_dtype(data) and data.dropna().unique().size.compute() == 2: + col_type = DataType.TYPE_CAT + elif pd.api.types.is_numeric_dtype(data): + col_type = DataType.TYPE_NUM + else: + col_type = DataType.TYPE_CAT + except NotImplementedError as error: # TO-DO + LOGGER.info("Type cannot be determined due to : %s", error) + + return col_type + + +def plot_df( + data_frame: dd.DataFrame, + force_cat: Optional[StringList] = None, + force_num: Optional[StringList] = None +) -> Dict[str, Union[Dict[str, Union[List[Any], Dict[Any, Any]]], Tuple[Any], List[Any], + Dict[Any, Any]]]: + """ + Supporting funtion to the main plot function + :param data_frame: dask dataframe + :param force_cat: list of categorical columns defined explicitly + :param force_num: list of numerical columns defined explicitly + :return: + """ + col_list = list() + dask_result: List[Any] = list() + + for col in data_frame.columns: + if data_frame[col].count().compute() == 0: + col_list.append(col) + dask_result.append(data_frame[col]) + + elif get_type(data_frame[col]) == DataType.TYPE_CAT or ( + force_cat is not None and col in force_cat): + cnt_series = dask.delayed(_calc_bar)(data_frame, col) + dask_result.append(cnt_series) + col_list.append(col) + + elif get_type(data_frame[col]) == DataType.TYPE_NUM or ( + force_num is not None and col in force_num): + hist = dask.delayed(_calc_hist)(data_frame, col) + dask_result.append(hist) + col_list.append(col) + + column_dict = dict() + computed_res, = dask.compute(dask_result) + + for each in zip(col_list, computed_res): + column_dict[each[0]] = each[1] + + return column_dict + + +def plot( + pd_data_frame: pd.DataFrame, + col_x: Optional[str] = None, + col_y: Optional[str] = None, + force_cat: Optional[StringList] = None, + force_num: Optional[StringList] = None +) -> Dict[str, Union[Dict[str, Union[List[Any], Dict[Any, Any]]], Tuple[Any], List[Any], + Dict[Any, Any]]]: + """ + Returns an intermediate representation for the plots of + different columns in the data_frame. + + Parameters + data_frame: the pandas data_frame for which plots are calculated for each + column. + col_x : A column in the data_frame. + col_y : A column in the data_frame. + force_cat: the list of columns which have to considered of type "TYPE_CAT" + force_num: the list of columns which have to considered of type "TYPE_NUM" + kwargs : TO-DO + + Returns + __________ + dict : A (column: [array/dict]) dict to encapsulate the + intermediate results. 
+ """ + data_frame: dd.DataFrame = dd.from_pandas(pd_data_frame, npartitions=DEFAULT_PARTITIONS) + + result: Dict[str, Union[Dict[str, Union[List[Any], Dict[Any, Any]]], Tuple[Any], List[Any], + Dict[Any, Any]]] = dict() + + if col_x is None and col_y is None: + result = plot_df(data_frame, force_cat, force_num) + + elif (col_x is None and col_y is not None) or (col_x is not None and col_y is None): + + target_col: str = cast(str, col_x if col_y is None else col_y) + dask_result: List[Any] = list() + + if data_frame[target_col].count() == 0: + dask_result.append([]) + + elif get_type(data_frame[target_col]) == DataType.TYPE_CAT or ( + force_cat is not None and target_col in force_cat): + # BAR_PLOT + dask_result.append(dask.delayed(_calc_bar)(data_frame, target_col)) + # PIE_CHART + dask_result.append(dask.delayed(_calc_pie)(data_frame, target_col)) + + elif get_type(data_frame[target_col]) == DataType.TYPE_NUM or ( + force_num is not None and target_col in force_num): + # HISTOGRAM + dask_result.append(dask.delayed(_calc_hist)(data_frame, target_col)) + # BOX_PLOT + dask_result.append(dask.delayed(_calc_bar)(data_frame, target_col)) + # QQ-NORM + dask_result.append(dask.delayed(_calc_qqnorm)(data_frame, target_col)) + + column_dict = {target_col: dask.compute(dask_result)} + result = column_dict + + elif col_x is not None and col_y is not None: + type_x = get_type(data_frame[col_x]) + type_y = get_type(data_frame[col_y]) + temp_dask_result: Dict[str, Any] = dict() + + try: + if type_y == DataType.TYPE_CAT and type_x == DataType.TYPE_NUM or \ + type_y == DataType.TYPE_NUM and type_x == DataType.TYPE_CAT: + # BOX_PER_GROUP + temp_dask_result.update(_calc_box(data_frame, col_x, col_y)) + # HISTOGRAM_PER_GROUP + temp_dask_result.update(_calc_hist_by_group(data_frame, col_x, col_y)) + + elif type_x == DataType.TYPE_CAT and type_y == DataType.TYPE_CAT: + temp_dask_result.update(_calc_statcked(data_frame, col_x, col_y)) + + elif type_x == DataType.TYPE_NUM and type_y == DataType.TYPE_NUM: + temp_dask_result.update(_calc_scatter(data_frame, col_x, col_y)) + else: + pass + # WARNING: _TODO + result, = dask.compute(temp_dask_result) + except NotImplementedError as error: # _TODO + LOGGER.info("Plot could not be obtained due to : %s", error) + else: + pass + # _TODO to be added + + return result diff --git a/dataprep/tests/__init__.py b/dataprep/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dataprep/tests/eda/__init__.py b/dataprep/tests/eda/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dataprep/tests/eda/test_eda.py b/dataprep/tests/eda/test_eda.py new file mode 100644 index 000000000..bb3a7fddb --- /dev/null +++ b/dataprep/tests/eda/test_eda.py @@ -0,0 +1,227 @@ +""" + module for testing plot(df, x, y) function. 
+""" +import datetime +from typing import Any, Dict, Union, cast, Tuple + +import numpy as np +import pandas as pd +from pandas import Timestamp + +from ...eda.eda_plot import plot # dataprep.tests.eda.test_eda + + +def test_normal() -> None: + """ + + :return: + """ + data_1 = { + + "id": [chr(97 + c) for c in range(1, 10)], + + "x": [50, 50, -10, 0, 0, 5, 15, -3, None], + + "y": [0.000001, 654.152, None, 15.984512, 3122, -3.1415926535, 111, + 15.9, 13.5], + + "s1": np.ones(9), + + "somedate": [datetime.date(2011, 7, 4), + datetime.datetime(2022, 1, 1, 13, 57), + datetime.datetime(1990, 12, 9), np.nan, + datetime.datetime(1990, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1898, 1, 2), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9)], + + "bool_tf": [True, True, False, True, False, True, True, False, + True], + + "bool_tf_with_nan": [True, False, False, False, False, True, True, + False, np.nan], + + "bool_01": [1, 1, 0, 1, 1, 0, 0, 0, 1], + + "bool_01_with_nan": [1, 0, 1, 0, 0, 1, 1, 0, np.nan], + + "mixed": [1, 2, "a", 4, 5, 6, 7, 8, 9] + + } + + df_1 = pd.DataFrame(data_1) + + df_1_expected: Dict[str, Dict[str, Union[Dict[Any, Any], Tuple[Any, Any]]]] = \ + {"bool_01": {"bar_plot": {0: 4, 1: 5}}, + "bool_01_with_nan": {"bar_plot": {0.0: 4, 1.0: 4}}, + "bool_tf": {"bar_plot": {False: 3, True: 6}}, + "bool_tf_with_nan": {"bar_plot": {False: 5, True: 3}}, + "s1": {"bar_plot": {1.0: 9}}, + "x": {"histogram": (np.array([1, 3, 1, 0, 1, 0, 0, 0, 0, 2], dtype=np.int64), + np.array([-10., -4., 2., 8., 14., 20., 26., + 32., 38., 44., 50.]))}, + 'y': {'histogram': (np.array([6, 0, 1, 0, 0, 0, 0, 0, 0, 1], dtype=np.int64), + np.array([-3.14159265, 309.37256661, 621.88672588, + 934.40088514, 1246.91504441, 1559.42920367, + 1871.94336294, 2184.4575222, 2496.97168147, + 2809.48584073, 3122.]))} + } + res = cast(Dict[str, Dict[str, Union[Dict[Any, Any], Tuple[Any, Any]]]], plot(df_1, + force_cat=[ + "bool_01", + "bool_01_ \ + with_nan", + "s1"])) + + assert res["bool_01"] == df_1_expected["bool_01"] + assert res["bool_01_with_nan"] == df_1_expected["bool_01_with_nan"] + assert res["bool_tf"] == df_1_expected["bool_tf"] + assert res["bool_tf_with_nan"] == df_1_expected["bool_tf_with_nan"] + assert res["s1"] == df_1_expected["s1"] + assert np.allclose(res["x"]["histogram"][0], df_1_expected["x"]["histogram"][0], equal_nan=True) + assert np.allclose(res["x"]["histogram"][1], df_1_expected["x"]["histogram"][1], equal_nan=True) + assert np.allclose(res["y"]["histogram"][0], df_1_expected["y"]["histogram"][0], equal_nan=True) + assert np.allclose(res["y"]["histogram"][1], df_1_expected["y"]["histogram"][1], equal_nan=True) + + data = { + + "id": [chr(97 + c) for c in range(1, 21)], + + "x": ["d", "c", "b", "a", "b", "d", "c", "a", "a", "a", "c", "b", + "c", "a", "d", "b", "b", "b", "b", "b"], + + "y": [794, 652, 158, 134, 448, 682, 135, 795, 353, 395, 403, 498, + 622, 80, 654, 772, 867, 676, 670, 736], + + "s1": np.ones(20), + + "somedate": [datetime.date(2011, 7, 4), + datetime.datetime(1898, 1, 2), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1898, 1, 2), + datetime.datetime(1990, 12, 9), np.nan, + datetime.datetime(1990, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1898, 1, 2), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), + datetime.datetime(1950, 12, 9), 
+                     datetime.datetime(1950, 12, 9),
+                     datetime.datetime(1950, 12, 9),
+                     datetime.datetime(1950, 12, 9)],
+    }
+
+    df_data = pd.DataFrame(data)
+
+    df_expected: Dict[str, Dict[str, Any]] \
+        = {"box_plot": {"a": {"25%": 134.0,
+                              "50%": 353.0,
+                              "75%": 395.0,
+                              "iqr": 261.0,
+                              "max": 395,
+                              "min": 80,
+                              "outliers": [795]},
+                        "b": {"25%": 485.5,
+                              "50%": 673.0,
+                              "75%": 745.0,
+                              "iqr": 259.5,
+                              "max": 867,
+                              "min": 158,
+                              "outliers": []},
+                        "c": {"25%": 336.0,
+                              "50%": 512.5,
+                              "75%": 629.5,
+                              "iqr": 293.5,
+                              "max": 652,
+                              "min": 135,
+                              "outliers": []},
+                        "d": {"25%": 668.0,
+                              "50%": 682.0,
+                              "75%": 738.0,
+                              "iqr": 70.0,
+                              "max": 794,
+                              "min": 654,
+                              "outliers": []}},
+           "histogram": {"d": (np.array([1, 0, 1, 0, 0, 0, 0, 0, 0, 1], dtype=np.int64),
+                               np.array([654., 668., 682., 696., 710., 724., 738., 752.,
+                                         766., 780., 794.])),
+                         "c": (np.array([1, 0, 0, 0, 0, 1, 0, 0, 0, 2], dtype=np.int64),
+                               np.array([135., 186.7, 238.4, 290.1, 341.8, 393.5,
+                                         445.2, 496.9, 548.6, 600.3, 652.])),
+                         "b": (np.array([1, 0, 0, 0, 2, 0, 0, 2, 2, 1], dtype=np.int64),
+                               np.array([158., 228.9, 299.8, 370.7, 441.6, 512.5, 583.4,
+                                         654.3, 725.2, 796.1, 867.])),
+                         "a": (np.array([2, 0, 0, 1, 1, 0, 0, 0, 0, 1], dtype=np.int64),
+                               np.array([80., 151.5, 223., 294.5, 366., 437.5, 509.,
+                                         580.5, 652., 723.5, 795.]))}
+           }
+    another_res = cast(Dict[str, Dict[str, Any]], plot(df_data, "y", "x"))
+
+    assert another_res["box_plot"]["a"] == df_expected["box_plot"]["a"]
+    assert another_res["box_plot"]["b"] == df_expected["box_plot"]["b"]
+    assert another_res["box_plot"]["c"] == df_expected["box_plot"]["c"]
+    assert another_res["box_plot"]["d"] == df_expected["box_plot"]["d"]
+
+    assert np.allclose(another_res["histogram"]["a"][0], df_expected["histogram"]["a"][0],
+                       equal_nan=True)
+    assert np.allclose(another_res["histogram"]["b"][0], df_expected["histogram"]["b"][0],
+                       equal_nan=True)
+    assert np.allclose(another_res["histogram"]["c"][0], df_expected["histogram"]["c"][0],
+                       equal_nan=True)
+    assert np.allclose(another_res["histogram"]["d"][0], df_expected["histogram"]["d"][0],
+                       equal_nan=True)
+
+    df_expected_2 = {"stacked_column_plot": {("a", Timestamp("1898-01-02 00:00:00")): 1,
+                                             ("a", Timestamp("1950-12-09 00:00:00")): 3,
+                                             ("a", Timestamp("1990-12-09 00:00:00")): 1,
+                                             ("b", Timestamp("1898-01-02 00:00:00")): 1,
+                                             ("b", Timestamp("1950-12-09 00:00:00")): 7,
+                                             ("c", Timestamp("1898-01-02 00:00:00")): 1,
+                                             ("c", Timestamp("1950-12-09 00:00:00")): 2,
+                                             ("d", Timestamp("1950-12-09 00:00:00")): 1,
+                                             ("d", Timestamp("1990-12-09 00:00:00")): 1,
+                                             ("d", Timestamp("2011-07-04 00:00:00")): 1
+                                             }
+                     }
+
+    res_2 = plot(df_data, "x", "somedate")
+    assert df_expected_2["stacked_column_plot"] == res_2["stacked_column_plot"]
+
+
+def test_corner() -> None:
+    """
+    Tests plot() on corner cases: all-NaN, constant, and empty columns.
+    """
+    df_2 = pd.DataFrame(
+        {"all_nan": [np.nan for _ in range(10)], "all_one": np.ones(10),
+         "all_zeros": np.zeros(10), "random": np.array(
+             [0.38538395, 0.13609054, 0.15973238, 0.96192966, 0.03708882,
+              0.03633855, 0.25260128, 0.72139843, 0.74553949,
+              0.41102021])})
+
+    df_1_expected = {"all_one": {"bar_plot": {1.0: 10}},
+                     "all_zeros": {"bar_plot": {0.0: 10}},
+                     "random": {"bar_plot": np.array([2, 2, 1, 1, 1, 0, 0, 2, 0, 1],
+                                                     dtype=np.int64)}}
+
+    res = plot(df_2, force_cat=["all_one", "all_zeros"])
+
+    assert res["all_one"] == df_1_expected["all_one"]
+    assert res["all_zeros"] == df_1_expected["all_zeros"]
+
+    df_2 = pd.DataFrame({
+        "empty": [],
+        "another_empty": []
+    })
+
+    df_2_expected: Dict[str, Any] = {'scatter_plot': {}}
+
+    res = plot(df_2, "empty", "another_empty")
"another_empty") + assert res == df_2_expected diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 000000000..49f577be3 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,8 @@ + + +[mypy] +mypy_path = "venv/Lib/site-packages" +ignore_missing_imports = False + +[mypy-dataprep.tests.*] +ignore_errors = False \ No newline at end of file diff --git a/pytype.cfg b/pytype.cfg new file mode 100644 index 000000000..5247f9b01 --- /dev/null +++ b/pytype.cfg @@ -0,0 +1,38 @@ +# NOTE: All relative paths are relative to the location of this file. + +[pytype] + +# Space-separated list of files or directories to exclude. +exclude = + **/*_test.py + **/test_*.py + +# Space-separated list of files or directories to process. +inputs = + . + +# Keep going past errors to analyze as many files as possible. +keep_going = False + +# All pytype output goes here. +output = .pytype + +# Paths to source code directories, separated by ':'. +pythonpath = + . + +# Python version (major.minor) of the target code. +python_version = 3.7 + +# Comma separated list of error names to ignore. +disable = + pyi-error + +# Don't report errors. +report_errors = True + +# Experimental: solve unknown types to label with structural types. +protocols = False + +# Experimental: Only load submodules that are explicitly imported. +strict_import = False diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 000000000..a4f3b40ac --- /dev/null +++ b/setup.cfg @@ -0,0 +1,6 @@ +[pytype] +inputs = dataprep + +[pep8] +ignore = +max-line-length = 80 \ No newline at end of file