Skip to content

Commit 94ac8ba

Browse files
Added files
1 parent ac47bc0 commit 94ac8ba

9 files changed

+21258
-0
lines changed

Diff for: .gitignore

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
.idea/
2+
.vs/
3+
.env
4+
.pk
5+
.pem
6+
.pub
7+
8+
#datafolder
9+
Data/GCC_data/**
10+
Data/Pascal_sencence_data/**
11+
Data/Train_GCC-training.tsv

Diff for: .ipynb_checkpoints/Data_collection-checkpoint.ipynb

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"cells": [],
3+
"metadata": {},
4+
"nbformat": 4,
5+
"nbformat_minor": 2
6+
}

Diff for: Data/Validation_GCC-1.1.0-Validation.tsv

+15,840
Large diffs are not rendered by default.

Diff for: Data/download_pascal_sentence_data.py

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
from urllib.parse import urljoin
2+
from pyquery import PyQuery
3+
import os
4+
import requests
5+
import csv
6+
7+
8+
class PascalSentenceDataSet():
    """Downloader for the Pascal Sentence dataset index page.

    The index page at ``PASCAL_SENTENCE_DATASET_URL`` is parsed with PyQuery.
    Images are saved under ``DATASET_DIR`` and caption text files under
    ``SENTENCE_DIR``; both are relative to the current working directory and
    mirror the directory part of each image's ``src`` attribute.
    """

    DATASET_DIR = 'dataset/'
    SENTENCE_DIR = 'sentence/'
    PASCAL_SENTENCE_DATASET_URL = 'http://vision.cs.uiuc.edu/pascal-sentences/'

    def __init__(self):
        self.url = PascalSentenceDataSet.PASCAL_SENTENCE_DATASET_URL

    def _fetch(self, img_url, max_retries, timeout):
        """Return the body of *img_url*, or None after *max_retries* failed attempts."""
        for attempt in range(1, max_retries + 1):
            try:
                result = requests.get(img_url, timeout=timeout)
            except requests.RequestException as err:
                print("error occurred while fetching img: %s" % err)
            else:
                if result.status_code == 200:
                    return result.content
                print("error occurred while fetching img")
            if attempt < max_retries:
                print("retry...")
        return None

    def download_images(self, max_retries=5, timeout=30):
        """Download every image referenced by the index page.

        Each image is written to ``DATASET_DIR/<dir part of src>/<file name>``.
        Files that already exist are skipped. The output file is created only
        after its content was fetched successfully, so a failed download never
        leaves an empty file that a later run would treat as already
        downloaded.

        Args:
            max_retries: Maximum number of fetch attempts per image; after
                that the image is skipped with a message.
            timeout: Timeout in seconds passed to ``requests.get``.
        """
        dom = PyQuery(self.url)
        for img in dom('img').items():
            img_src = img.attr['src']
            if not img_src:
                # <img> without a src attribute: nothing to download.
                continue
            category, img_file_name = os.path.split(img_src)

            output_dir = PascalSentenceDataSet.DATASET_DIR + category
            print(output_dir)
            os.makedirs(output_dir, exist_ok=True)

            output = os.path.join(output_dir, img_file_name)
            print(output)
            if os.path.isfile(output):
                print("Already downloaded, Skipping: %s" % output)
                continue

            # urljoin returns img_src unchanged when it is already an
            # absolute http(s) URL, so no separate startswith check is needed.
            img_url = urljoin(self.url, img_src)

            print("Downloading: %s" % output)
            raw = self._fetch(img_url, max_retries, timeout)
            if raw is None:
                print("Giving up after %d attempts: %s" % (max_retries, img_url))
                continue
            with open(output, 'wb') as f:
                f.write(raw)

    def download_sentences(self):
        """Write the caption text of every table row to a ``.txt`` file.

        For each ``body>table>tr`` row, the text of every ``table tr td`` cell
        inside it is written, one cell per line, to
        ``SENTENCE_DIR/<dir part of img src>/<image base name>.txt``.
        Existing files are skipped, and rows without an image are ignored.
        """
        dom = PyQuery(self.url)
        for tr in dom('body>table>tr').items():
            img_src = tr('img').attr['src']
            if not img_src:
                # Row has no image (e.g. a header row): no file name to derive.
                continue
            category, img_file_name = os.path.split(img_src)

            output_dir = PascalSentenceDataSet.SENTENCE_DIR + category
            os.makedirs(output_dir, exist_ok=True)

            # splitext drops the dot from the stem, so it must be re-added.
            head, _ = os.path.splitext(img_file_name)
            sentence_file_name = head + ".txt"
            output = os.path.join(output_dir, sentence_file_name)
            if os.path.isfile(output):
                print("Already downloaded, Skipping: %s" % output)
                continue
            print("Downloading: %s" % output)
            with open(output, 'w', encoding='utf-8') as f:
                for td in tr('table tr td').items():
                    f.write(td.text() + "\n")

    def create_correspondence_data(self):
        """Write ``correspondence.csv`` mapping a 1-based index to each image src.

        Rows are ``[index, src]`` in the order the ``<img>`` elements appear on
        the index page. The file is created in the current working directory.
        """
        dom = PyQuery(self.url)
        # csv.writer needs a text-mode file opened with newline='' on Python 3.
        with open('correspondence.csv', 'w', newline='') as csv_file:
            writer = csv.writer(csv_file)
            for i, img in enumerate(dom('img').items(), start=1):
                img_src = img.attr['src']
                print("%d => %s" % (i, img_src))
                writer.writerow([i, img_src])
78+
79+
if __name__ == "__main__":
    # Script entry point: fetch the images first, then the caption files.
    pascal = PascalSentenceDataSet()
    pascal.download_images()
    pascal.download_sentences()

Diff for: Notebooks/Data_collection/download_google_conceptual_captions_data.ipynb

+5,283
Large diffs are not rendered by default.

Diff for: Notebooks/Data_collection/download_pascal_sentence_data.ipynb

+1
Large diffs are not rendered by default.

Diff for: __pycache__/utils.cpython-36.pyc

340 Bytes
Binary file not shown.

Diff for: clean_data_GCC.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import pandas as pd
2+
from tqdm import tqdm
3+
from utils import utils
4+
5+
# Load the validation manifest; its "images" column holds paths that are
# resolved relative to Data/GCC_data/ below.
df = pd.read_csv("Data/GCC_data/val_data.csv")

# Keep every image entry that utils.verify_image reports as unreadable.
ignore_list = [
    image
    for image in tqdm(df["images"], total=len(df))
    if not utils.verify_image("Data/GCC_data/" + image)
]

# Persist the unreadable entries (single unnamed column plus the default index).
df2 = pd.DataFrame(ignore_list)
df2.to_csv("Data/GCC_data/Ignore_list.csv")

Diff for: utils/utils.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from skimage import io
2+
3+
def verify_image(img_file):
    """Return True if *img_file* can be read with ``io.imread``, else False.

    Args:
        img_file: Path (or any other value ``io.imread`` accepts) of the
            image to check.

    Returns:
        bool: False when reading raises any ``Exception``; True otherwise.
    """
    try:
        io.imread(img_file)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must propagate instead of being reported as an unreadable image.
        return False
    return True
9+
10+
def preprocess_image(image):
    """Load *image* and turn it into a single-item batch for ``model``.

    The image is read with ``imread``, center-cropped and resized to
    ``model.input_shape[1]``, passed through ``preprocess_input`` and given a
    new leading axis with ``np.expand_dims``.

    NOTE(review): ``imread``, ``model``, ``center_crop_and_resize``,
    ``preprocess_input`` and ``np`` are not imported or defined in the visible
    part of this module; unless they are provided elsewhere, calling this
    function raises NameError. Confirm the intended imports.
    """
    pixels = imread(image)
    # Presumably input_shape is (batch, height, width, channels) - TODO confirm.
    target_size = model.input_shape[1]
    cropped = center_crop_and_resize(pixels, image_size=target_size)
    prepared = preprocess_input(cropped)
    return np.expand_dims(prepared, 0)
18+

0 commit comments

Comments
 (0)