dataset scripts

emilwallner · Feb 23, 2019 · 8415053 · 8415053
1 parent 20f14d2
commit 8415053
Show file tree

Hide file tree

Showing 12 changed files with 1,646 additions and 0 deletions.
diff --git a/download_and_clean_data_scripts/Copy_50_files_to_another_folder.ipynb b/download_and_clean_data_scripts/Copy_50_files_to_another_folder.ipynb
@@ -0,0 +1,61 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from shutil import copyfile\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "img_dir = r\"./clipart_new/\"\n",
+    "dest_dir = '/home/ubuntu/storage/clipart/'\n",
+    "\n",
+    "# Number of files to copy.\n",
+    "# NOTE(review): the notebook name says 50 but this cell has always used 100;\n",
+    "# confirm which one is intended.\n",
+    "max_files = 100\n",
+    "\n",
+    "# Only the first max_files directory entries are visited, so the loop no\n",
+    "# longer walks the rest of the listing doing nothing. os.listdir returns\n",
+    "# entries in arbitrary order, so this is an arbitrary sample of the folder.\n",
+    "for filename in os.listdir(img_dir)[:max_files]:\n",
+    "    filepath = os.path.join(img_dir, filename)\n",
+    "    copyfile(filepath, dest_dir + filename)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Environment (conda_tensorflow_p36)",
+   "language": "python",
+   "name": "conda_tensorflow_p36"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/download_and_clean_data_scripts/Create crops.ipynb b/download_and_clean_data_scripts/Create crops.ipynb
diff --git a/download_and_clean_data_scripts/Create_random_cropped_images_from_large_images.ipynb b/download_and_clean_data_scripts/Create_random_cropped_images_from_large_images.ipynb
diff --git a/download_and_clean_data_scripts/Remove_Black_and_White_Photos.ipynb b/download_and_clean_data_scripts/Remove_Black_and_White_Photos.ipynb
@@ -0,0 +1,105 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from PIL import Image\n",
+    "from random import randint\n",
+    "import uuid\n",
+    "from multiprocessing import Pool\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/PIL/TiffImagePlugin.py:764: UserWarning: Corrupt EXIF data.  Expecting to read 4 bytes but only got 0. \n",
+      "  warnings.warn(str(msg))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done!\n"
+     ]
+    }
+   ],
+   "source": [
+    "def remove_bw(filepath_filename):\n",
+    "    \"\"\"Move an image to BW_images when it looks black and white.\n",
+    "\n",
+    "    filepath_filename is a [filepath, filename] pair. Images in mode 'L' are\n",
+    "    moved straight away. Otherwise 1000 random pixels are sampled and the\n",
+    "    file is moved when more than 990 of them are grey, i.e. all three\n",
+    "    channels are equal. An image whose pixels do not have exactly three\n",
+    "    values is also moved to BW_images. Any file that cannot be processed is\n",
+    "    moved to error_files instead.\n",
+    "    \"\"\"\n",
+    "    filepath, filename = filepath_filename\n",
+    "    bw_path = '/home/ubuntu/storage/BW_images/' + filename\n",
+    "    try:\n",
+    "        with Image.open(filepath) as im:\n",
+    "            x, y = im.size\n",
+    "            pixels = im.load()\n",
+    "            # NOTE(review): the int check is kept from the original; it is\n",
+    "            # presumably never true for a loaded image - confirm and drop.\n",
+    "            if im.mode == 'L' or isinstance(pixels, int):\n",
+    "                os.rename(filepath, bw_path)\n",
+    "            else:\n",
+    "                bw_pixel = 0\n",
+    "                for _ in range(1000):\n",
+    "                    pixel = pixels[randint(0, x - 1), randint(0, y - 1)]\n",
+    "                    if len(pixel) != 3:\n",
+    "                        # One such pixel already pushes the count past the\n",
+    "                        # 990 threshold, so there is no point sampling on.\n",
+    "                        bw_pixel += 1000\n",
+    "                        break\n",
+    "                    # Grey only when all three channels agree. The earlier\n",
+    "                    # 'or' test also counted pixels such as (10, 10, 200).\n",
+    "                    if pixel[0] == pixel[1] == pixel[2]:\n",
+    "                        bw_pixel += 1\n",
+    "                if bw_pixel > 990:\n",
+    "                    os.rename(filepath, bw_path)\n",
+    "    except Exception:\n",
+    "        # Exception rather than a bare except, so KeyboardInterrupt and\n",
+    "        # SystemExit still stop the worker.\n",
+    "        print('Error!')\n",
+    "        try:\n",
+    "            os.rename(filepath, '/home/ubuntu/storage/error_files/' + filename)\n",
+    "        except OSError:\n",
+    "            # A failed move must not escape the worker: it would abort the\n",
+    "            # whole pool.map run for every remaining image.\n",
+    "            print('Could not move file:', filepath)\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    img_dir = r\"./pixa_ready/\"\n",
+    "\n",
+    "    # One [filepath, filename] pair per directory entry, which is the shape\n",
+    "    # remove_bw unpacks.\n",
+    "    images = [[os.path.join(img_dir, filename), filename]\n",
+    "              for filename in os.listdir(img_dir)]\n",
+    "\n",
+    "    # The context manager shuts the 16 workers down once map() has returned,\n",
+    "    # so re-running this cell does not leave idle processes behind.\n",
+    "    with Pool(processes=16) as pool:\n",
+    "        pool.map(remove_bw, images)\n",
+    "\n",
+    "    print(\"Done!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Environment (conda_tensorflow_p36)",
+   "language": "python",
+   "name": "conda_tensorflow_p36"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/download_and_clean_data_scripts/Remove_clipart_images_from_dataset.ipynb b/download_and_clean_data_scripts/Remove_clipart_images_from_dataset.ipynb
@@ -0,0 +1,128 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from PIL import Image\n",
+    "from random import randint\n",
+    "import uuid\n",
+    "from multiprocessing import Pool\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/PIL/TiffImagePlugin.py:764: UserWarning: Corrupt EXIF data.  Expecting to read 4 bytes but only got 0. \n",
+      "  warnings.warn(str(msg))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done!\n"
+     ]
+    }
+   ],
+   "source": [
+    "def remove_clipart(filepath_filename):\n",
+    "    \"\"\"Move images that look like clipart out of the dataset folder.\n",
+    "\n",
+    "    filepath_filename is a [filepath, filename] pair. Images in mode 'L' or\n",
+    "    smaller than 200 pixels on either side are moved to error_files. For the\n",
+    "    rest, a horizontal run of 30 pixels is read near each of the four\n",
+    "    corners; the file is moved when any run is entirely light (every channel\n",
+    "    above 246) or when a pixel does not have exactly three values. Files\n",
+    "    that cannot be processed are moved to error_files.\n",
+    "    \"\"\"\n",
+    "    filepath, filename = filepath_filename\n",
+    "    error_path = '/home/ubuntu/storage/error_files/' + filename\n",
+    "    try:\n",
+    "        with Image.open(filepath) as im:\n",
+    "            x, y = im.size\n",
+    "            pixels = im.load()\n",
+    "            if im.mode == 'L' or isinstance(pixels, int) or x < 200 or y < 200:\n",
+    "                os.rename(filepath, error_path)\n",
+    "                return\n",
+    "\n",
+    "            # (x, y) of the first pixel of the run scanned at each corner.\n",
+    "            # NOTE(review): the right-hand runs start at x - 35 and so end 6\n",
+    "            # pixels before the edge; kept as in the original - confirm.\n",
+    "            run_starts = ((0, 0), (x - 35, 0), (0, y - 1), (x - 35, y - 1))\n",
+    "\n",
+    "            status = False\n",
+    "            for x_start, y_pixel in run_starts:\n",
+    "                light_pixel = 0\n",
+    "                for x_pixel in range(x_start, x_start + 30):\n",
+    "                    pixel = pixels[x_pixel, y_pixel]\n",
+    "                    if len(pixel) != 3:\n",
+    "                        status = True\n",
+    "                        break\n",
+    "                    if pixel[0] > 246 and pixel[1] > 246 and pixel[2] > 246:\n",
+    "                        light_pixel += 1\n",
+    "                if light_pixel == 30:\n",
+    "                    status = True\n",
+    "                if status:\n",
+    "                    # The outcome is already decided; skip the other corners.\n",
+    "                    break\n",
+    "\n",
+    "            if status:\n",
+    "                # NOTE(review): clipart lands in BW_images, as in the\n",
+    "                # original - confirm this is the intended folder.\n",
+    "                os.rename(filepath, '/home/ubuntu/storage/BW_images/' + filename)\n",
+    "    except Exception:\n",
+    "        # Previously only IOError was caught, so e.g. a TypeError from len()\n",
+    "        # on a non-tuple pixel escaped the worker and aborted pool.map.\n",
+    "        try:\n",
+    "            os.rename(filepath, error_path)\n",
+    "        except OSError:\n",
+    "            # A failed move must not take the rest of the batch down.\n",
+    "            print('Could not move file:', filepath)\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    img_dir = r\"./pixa_ready/\"\n",
+    "\n",
+    "    # One [filepath, filename] pair per directory entry, which is the shape\n",
+    "    # remove_clipart unpacks.\n",
+    "    images = [[os.path.join(img_dir, filename), filename]\n",
+    "              for filename in os.listdir(img_dir)]\n",
+    "\n",
+    "    # The context manager shuts the 16 workers down once map() has returned,\n",
+    "    # so re-running this cell does not leave idle processes behind.\n",
+    "    with Pool(processes=16) as pool:\n",
+    "        pool.map(remove_clipart, images)\n",
+    "\n",
+    "    print(\"Done!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Environment (conda_tensorflow_p36)",
+   "language": "python",
+   "name": "conda_tensorflow_p36"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/download_and_clean_data_scripts/Remove_dups.ipynb b/download_and_clean_data_scripts/Remove_dups.ipynb
@@ -0,0 +1,92 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from PIL import Image\n",
+    "from shutil import copyfile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "img_dir = r\"./all_images/\"\n",
+    "\n",
+    "images = sorted(os.listdir(img_dir))\n",
+    "\n",
+    "\n",
+    "def get_orgname(org_name):\n",
+    "    \"\"\"Return the part of a file name that identifies the original image.\n",
+    "\n",
+    "    That is the text before the first space or, when the name contains no\n",
+    "    space, the text before the first dot.\n",
+    "    \"\"\"\n",
+    "    name = org_name.partition(\" \")[0]\n",
+    "    if name == org_name:\n",
+    "        name = org_name.partition(\".\")[0]\n",
+    "    return name\n",
+    "\n",
+    "\n",
+    "def get_size(filepath):\n",
+    "    \"\"\"Return the pixel area (width * height) of the image at filepath.\"\"\"\n",
+    "    # Open the path that was passed in. The earlier version read the global\n",
+    "    # first_filepath, so both files of a pair always reported the same size\n",
+    "    # and the larger duplicate was never selected.\n",
+    "    with Image.open(filepath) as im:\n",
+    "        x, y = im.size\n",
+    "    return x * y\n",
+    "\n",
+    "\n",
+    "# Collect every file under its original name. Grouping by key instead of by\n",
+    "# neighbouring entries in the sorted list also catches duplicates that the\n",
+    "# sort order separates, e.g. 'a 1.jpg', 'a-b.jpg', 'a.jpg'.\n",
+    "groups = {}\n",
+    "for filename in images:\n",
+    "    groups.setdefault(get_orgname(filename), []).append(filename)\n",
+    "\n",
+    "# Copy exactly one file per original name. Every group is handled, including\n",
+    "# the last one, which the earlier index-based loop never copied.\n",
+    "for names in groups.values():\n",
+    "    if len(names) == 1:\n",
+    "        # Nothing to compare against, so the image is not opened at all.\n",
+    "        keep_name = names[0]\n",
+    "    else:\n",
+    "        # max() returns the first of several equally large files, matching\n",
+    "        # the strict '<' comparison used before.\n",
+    "        keep_name = max(names, key=lambda name: get_size(os.path.join(img_dir, name)))\n",
+    "    copyfile(os.path.join(img_dir, keep_name), '/home/ubuntu/storage/org_large/' + keep_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Environment (conda_tensorflow_p36)",
+   "language": "python",
+   "name": "conda_tensorflow_p36"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}