Skip to content

Commit

Permalink
dataset scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
emilwallner committed Feb 23, 2019
1 parent 20f14d2 commit 8415053
Show file tree
Hide file tree
Showing 12 changed files with 1,646 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from shutil import copyfile\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"img_dir = r\"./clipart_new/\"\n",
"\n",
"k = 0\n",
"s = 0\n",
"for filename in os.listdir(img_dir):\n",
" \n",
" if s < 100:\n",
" filepath = os.path.join(img_dir, filename)\n",
" copyfile(filepath, '/home/ubuntu/storage/clipart/' + filename)\n",
" s += 1\n",
" k += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Environment (conda_tensorflow_p36)",
"language": "python",
"name": "conda_tensorflow_p36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
424 changes: 424 additions & 0 deletions download_and_clean_data_scripts/Create crops.ipynb

Large diffs are not rendered by default.

Large diffs are not rendered by default.

105 changes: 105 additions & 0 deletions download_and_clean_data_scripts/Remove_Black_and_White_Photos.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from PIL import Image\n",
"from random import randint\n",
"import uuid\n",
"from multiprocessing import Pool\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/PIL/TiffImagePlugin.py:764: UserWarning: Corrupt EXIF data. Expecting to read 4 bytes but only got 0. \n",
" warnings.warn(str(msg))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done!\n"
]
}
],
"source": [
"def remove_bw(filepath_filename):\n",
" try:\n",
" with Image.open(filepath_filename[0]) as im:\n",
" x, y = im.size\n",
" pixels = im.load()\n",
" if im.mode == 'L' or type(pixels) == int:\n",
" os.rename(filepath_filename[0], '/home/ubuntu/storage/BW_images/' + filepath_filename[1])\n",
" else:\n",
" bw_pixel = 0\n",
" for i in range(1000):\n",
" x_pixel = randint(0, (x-1))\n",
" y_pixel = randint(0, (y-1))\n",
" if len(pixels[x_pixel, y_pixel]) != 3:\n",
" bw_pixel += 1000\n",
" else:\n",
" match_pixel = pixels[x_pixel, y_pixel][0]\n",
" if match_pixel == pixels[x_pixel, y_pixel][1] or match_pixel == pixels[x_pixel, y_pixel][2]:\n",
" bw_pixel += 1\n",
" if bw_pixel > 990:\n",
" os.rename(filepath_filename[0], '/home/ubuntu/storage/BW_images/' + filepath_filename[1])\n",
" except:\n",
" print('Error!')\n",
" os.rename(filepath_filename[0], '/home/ubuntu/storage/error_files/' + filepath_filename[1])\n",
"\n",
"if __name__ == \"__main__\":\n",
" img_dir = r\"./pixa_ready/\"\n",
" images = []\n",
"\n",
" for filename in os.listdir(img_dir):\n",
" filepath = os.path.join(img_dir, filename)\n",
" images.append([filepath, filename])\n",
"\n",
" pool = Pool(processes=16) \n",
" pool.map(remove_bw, images)\n",
"\n",
" print(\"Done!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Environment (conda_tensorflow_p36)",
"language": "python",
"name": "conda_tensorflow_p36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from PIL import Image\n",
"from random import randint\n",
"import uuid\n",
"from multiprocessing import Pool\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/PIL/TiffImagePlugin.py:764: UserWarning: Corrupt EXIF data. Expecting to read 4 bytes but only got 0. \n",
" warnings.warn(str(msg))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done!\n"
]
}
],
"source": [
"def remove_clipart(filepath_filename):\n",
" try:\n",
" with Image.open(filepath_filename[0]) as im:\n",
" x, y = im.size\n",
" pixels = im.load()\n",
" if im.mode == 'L' or type(pixels) == int or x < 200 or y < 200:\n",
" os.rename(filepath_filename[0], '/home/ubuntu/storage/error_files/' + filepath_filename[1])\n",
" else: \n",
" status = False\n",
"\n",
" for i in range(4):\n",
" light_pixel = 0\n",
"\n",
" if i == 0:\n",
" x_pixel = 0\n",
" y_pixel = 0\n",
"\n",
" if i == 1:\n",
" x_pixel = x - 35\n",
" y_pixel = 0\n",
"\n",
" if i == 2:\n",
" x_pixel = 0\n",
" y_pixel = y - 1\n",
"\n",
" if i == 3:\n",
" x_pixel = x - 35\n",
" y_pixel = y - 1\n",
"\n",
" for k in range(30):\n",
" if len(pixels[x_pixel, y_pixel]) != 3:\n",
" status = True\n",
" else:\n",
" first = pixels[x_pixel, y_pixel][0]\n",
" second = pixels[x_pixel, y_pixel][1]\n",
" third = pixels[x_pixel, y_pixel][2]\n",
" if first > 246 and second > 246 and third > 246:\n",
" light_pixel += 1\n",
" x_pixel += 1\n",
" if light_pixel == 30:\n",
" status = True\n",
"\n",
" if status:\n",
" os.rename(filepath_filename[0], '/home/ubuntu/storage/BW_images/' + filepath_filename[1])\n",
" except IOError:\n",
" os.rename(filepath_filename[0], '/home/ubuntu/storage/error_files/' + filepath_filename[1])\n",
"\n",
"if __name__ == \"__main__\":\n",
" img_dir = r\"./pixa_ready/\"\n",
" images = []\n",
"\n",
" for filename in os.listdir(img_dir):\n",
" filepath = os.path.join(img_dir, filename)\n",
" images.append([filepath, filename])\n",
"\n",
" pool = Pool(processes=16) \n",
" pool.map(remove_clipart, images)\n",
"\n",
" print(\"Done!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Environment (conda_tensorflow_p36)",
"language": "python",
"name": "conda_tensorflow_p36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
92 changes: 92 additions & 0 deletions download_and_clean_data_scripts/Remove_dups.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from PIL import Image\n",
"from shutil import copyfile"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"img_dir = r\"./all_images/\"\n",
"\n",
"images = []\n",
"for filename in os.listdir(img_dir):\n",
" images.append(filename)\n",
"\n",
"images.sort()\n",
"\n",
"def get_orgname(org_name):\n",
" name = org_name.partition(\" \")[0]\n",
" if name == org_name:\n",
" name = org_name.partition(\".\")[0]\n",
" return name\n",
"\n",
"def get_size(filepath):\n",
" with Image.open(first_filepath) as im:\n",
" x, y = im.size\n",
" return x * y\n",
" \n",
"\n",
"curr_name = ''\n",
"for i in range(len(images) - 1):\n",
" if curr_name == '':\n",
" curr_name = images[i]\n",
" \n",
" second_name = images[i + 1]\n",
" \n",
" if get_orgname(curr_name) == get_orgname(second_name):\n",
" first_filepath = os.path.join(img_dir, curr_name)\n",
" second_filepath = os.path.join(img_dir, second_name)\n",
" first_size = get_size(first_filepath)\n",
" second_size = get_size(second_filepath)\n",
" \n",
" if first_size < second_size:\n",
" curr_name = second_name\n",
" i += 1\n",
" else:\n",
" copyfile(os.path.join(img_dir, curr_name), '/home/ubuntu/storage/org_large/' + curr_name)\n",
" curr_name = ''\n",
" i += 1\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Environment (conda_tensorflow_p36)",
"language": "python",
"name": "conda_tensorflow_p36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 8415053

Please sign in to comment.