PotatoSpudowski
diff --git a/Diff for: ‎.gitignore
+11 b/Diff for: ‎.gitignore
+11
diff --git a/Diff for: ‎.ipynb_checkpoints/Data_collection-checkpoint.ipynb
+6 b/Diff for: ‎.ipynb_checkpoints/Data_collection-checkpoint.ipynb
+6
diff --git a/Diff for: ‎Data/Validation_GCC-1.1.0-Validation.tsv
+15,840 b/Diff for: ‎Data/Validation_GCC-1.1.0-Validation.tsv
+15,840
diff --git a/Diff for: ‎Data/download_pascal_sentence_data.py
+83 b/Diff for: ‎Data/download_pascal_sentence_data.py
+83
diff --git a/Diff for: ‎Notebooks/Data_collection/download_google_conceptual_captions_data.ipynb
+5,283 b/Diff for: ‎Notebooks/Data_collection/download_google_conceptual_captions_data.ipynb
+5,283
diff --git a/Diff for: ‎Notebooks/Data_collection/download_pascal_sentence_data.ipynb
+1 b/Diff for: ‎Notebooks/Data_collection/download_pascal_sentence_data.ipynb
+1
diff --git a/Diff for: ‎__pycache__/utils.cpython-36.pyc
340 Bytes b/Diff for: ‎__pycache__/utils.cpython-36.pyc
340 Bytes
diff --git a/Diff for: ‎clean_data_GCC.py
+16 b/Diff for: ‎clean_data_GCC.py
+16
diff --git a/Diff for: ‎utils/utils.py
+18 b/Diff for: ‎utils/utils.py
+18
@@ -0,0 +1,11 @@
+.idea/
+.vs/
+.env
+*.pk
+*.pem
+*.pub
+
+# Data folders and files
+Data/GCC_data/**
+Data/Pascal_sencence_data/**
+Data/Train_GCC-training.tsv
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,83 @@
+from urllib.parse import urljoin
+from pyquery import PyQuery
+import os
+import requests
+import csv
+
+
+class PascalSentenceDataSet():
+
+    DATASET_DIR = 'dataset/'
+    SENTENCE_DIR = 'sentence/'
+    PASCAL_SENTENCE_DATASET_URL = 'http://vision.cs.uiuc.edu/pascal-sentences/'
+
+    def __init__(self):
+        self.url = PascalSentenceDataSet.PASCAL_SENTENCE_DATASET_URL
+
+    def download_images(self):
+        dom = PyQuery(self.url)
+        for img in dom('img').items():
+            img_src = img.attr['src']
+            category, img_file_name = os.path.split(img_src)
+
+            output_dir = PascalSentenceDataSet.DATASET_DIR + category
+            print(output_dir)
+            if not os.path.isdir(output_dir):
+                os.makedirs(output_dir)
+
+            output = os.path.join(output_dir, img_file_name)
+            print(output)
+            if img_src.startswith('http'):
+                img_url = img_src
+            else:
+                img_url = urljoin(self.url, img_src)
+            if os.path.isfile(output):
+                print("Already downloaded, Skipping: %s" % output)
+                continue
+            print("Downloading: %s" % output)
+            with open(output,'wb') as f:
+
+                while True:
+                    result = requests.get(img_url)
+                    raw = result.content
+                    if result.status_code == 200:
+                        f.write(raw)
+                        break
+                    print("error occurred while fetching img")
+                    print("retry...")
+
+
+    def download_sentences(self):
+        dom = PyQuery(self.url)
+        for tr in dom('body>table>tr').items():
+            img_src = tr('img').attr['src']
+            category, img_file_name = os.path.split(img_src)
+
+            output_dir = PascalSentenceDataSet.SENTENCE_DIR + category
+            if not os.path.isdir(output_dir):
+                os.makedirs(output_dir)
+
+            head, tail = os.path.splitext(img_file_name)
+            sentence_file_name = head + "txt"
+            output = os.path.join(output_dir, sentence_file_name)
+            if os.path.isfile(output):
+                print("Already downloaded, Skipping: %s" % output)
+                continue
+            print("Downloading: %s" % output)
+            with open(output,'w') as f:
+                for td in tr('table tr td').items():
+                    f.write(td.text() + "\n")
+
+    def create_correspondence_data(self):
+        dom = PyQuery(self.url)
+        writer = csv.writer(open('correspondence.csv', 'wb'))
+        for i, img in enumerate(dom('img').items()):
+            img_src = img.attr['src']
+            print("%d => %s" % (i + 1, img_src))
+            writer.writerow([i + 1, img_src])
+
+if __name__=="__main__":
+
+    dataset = PascalSentenceDataSet()
+    dataset.download_images()
+    dataset.download_sentences()
@@ -0,0 +1,16 @@
+import pandas as pd
+from tqdm import tqdm
+from utils import utils
+
+df = pd.read_csv("Data/GCC_data/val_data.csv")
+ignore_list = []
+
+for i in tqdm(range(len(df))):
+    image = df["images"][i]
+    if utils.verify_image("Data/GCC_data/"+image):
+        continue
+    else: 
+        ignore_list.append(image)
+
+df2 = pd.DataFrame(ignore_list)
+df2.to_csv("Data/GCC_data/Ignore_list.csv")
@@ -0,0 +1,18 @@
+from skimage import io
+
+def verify_image(img_file):
+    try:
+        img = io.imread(img_file)
+    except:
+        return False
+    return True
+
+def preprocess_image(image):
+    """Load an image and turn it into a single-item batch for ``model``.
+
+    The image is read, center-cropped and resized to ``model.input_shape[1]``,
+    passed through ``preprocess_input`` and given a new leading axis.
+
+    NOTE(review): ``imread``, ``model``, ``center_crop_and_resize``,
+    ``preprocess_input`` and ``np`` are not defined or imported in the
+    visible part of this module (only ``from skimage import io`` is), so
+    calling this function as-is presumably raises NameError - confirm where
+    these names are meant to come from.
+
+    Args:
+        image: Passed straight to ``imread``; presumably a file path - TODO
+            confirm.
+
+    Returns:
+        The preprocessed image with an extra axis inserted at position 0.
+    """
+    image = imread(image)
+    # Target size is taken from the model's input shape (index 1).
+    image_size = model.input_shape[1]
+    x = center_crop_and_resize(image, image_size=image_size)
+    x = preprocess_input(x)
+    # Add a leading axis so the result has a batch dimension of one.
+    x = np.expand_dims(x, 0)
+
+    return x
+