-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathbing_image_collector.py
144 lines (110 loc) · 4.39 KB
/
bing_image_collector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# -*- coding: utf-8 -*-
import http.client
import json
import re
import requests
import os
import math
import pickle
import urllib
import hashlib
import sha3
import configparser # for Python3
def make_dir(path):
    """Create the directory ``path`` unless it already exists.

    Missing parent directories are created as well, and an existing
    directory is left untouched.

    Args:
        path (str): Directory to create.

    Raises:
        OSError: If ``path`` exists but is not a directory, or if the
            directory cannot be created.
    """
    # exist_ok=True avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(path, exist_ok=True)
def make_correspondence_table(correspondence_table, original_url, hashed_url):
    """Record which hash value belongs to which original URL.

    The table is modified in place; an existing entry for the same URL is
    overwritten.

    Args:
        correspondence_table (dict): Mapping to update.
        original_url (str): URL used as the key.
        hashed_url (str): Hash value stored under ``original_url``.
    """
    correspondence_table[original_url] = hashed_url
def make_img_path(save_dir_path, url):
    """Hash the image URL and build the path the image is saved under.

    The file name is the SHA3-256 hex digest of the URL followed by the
    lower-cased image extension, placed in ``<save_dir_path>/imgs``.

    Side effects: creates the ``imgs`` directory, and records
    ``url -> digest`` in the module-level ``correspondence_table``, which
    must therefore be defined before this function is called.

    Args:
        save_dir_path (str): Path to save image dir.
        url (str): An url of image.

    Returns:
        str: Path of hashed image URL.

    Raises:
        ValueError: If the URL does not end in a supported image extension.
    """
    save_img_path = os.path.join(save_dir_path, 'imgs')
    make_dir(save_img_path)

    allowed_extensions = ('.jpg', '.jpeg', '.gif', '.png', '.bmp')
    # Check the URL without its query string / fragment first, so that
    # ".../cat.jpg?w=640" is recognised as ".jpg". The raw URL is kept as a
    # fallback so that every URL accepted before (e.g. "...?file=cat.jpg")
    # is still accepted.
    url_without_query = url.split('#', 1)[0].split('?', 1)[0]
    candidates = (
        os.path.splitext(url_without_query)[-1],
        os.path.splitext(url)[-1],
    )
    for candidate in candidates:
        file_extension = candidate.lower()
        if file_extension in allowed_extensions:
            break
    else:
        raise ValueError('Not applicable file extension')

    encoded_url = url.encode('utf-8')  # required encoding for hashed
    hashed_url = hashlib.sha3_256(encoded_url).hexdigest()
    make_correspondence_table(correspondence_table, url, hashed_url)
    return os.path.join(save_img_path, hashed_url + file_extension)
def download_image(url, timeout=10):
    """Download ``url`` and return the response body if it is an image.

    Redirects are followed. Exceptions raised by ``requests.get`` itself
    (timeouts, connection errors, ...) propagate unchanged.

    Args:
        url (str): Address of the image to download.
        timeout (int): Timeout handed to ``requests.get``.

    Returns:
        bytes: The raw response body.

    Raises:
        Exception: If the HTTP status is not 200, or the Content-Type
            header is missing or does not contain ``image``.
    """
    response = requests.get(url, allow_redirects=True, timeout=timeout)
    if response.status_code != 200:
        # status_code is an int: format it instead of concatenating, which
        # raised TypeError and hid the real HTTP status.
        raise Exception("HTTP status: {}".format(response.status_code))

    # A missing header is treated like a non-image type instead of
    # surfacing as a bare KeyError.
    content_type = response.headers.get("content-type", "")
    if 'image' not in content_type:
        raise Exception("Content-Type: " + content_type)

    return response.content
def save_image(filename, image):
    """Write the raw image bytes to ``filename``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.

    Args:
        filename (str): Destination path.
        image (bytes): Image data to store.
    """
    with open(filename, mode="wb") as image_file:
        image_file.write(image)
if __name__ == '__main__':
    # Script entry point: query the Bing image search API page by page,
    # collect the image URLs found in the responses, download every image,
    # and finally write a JSON table mapping original URL -> hashed name.

    # The API key is read from the [auth] section of authentication.ini.
    config = configparser.ConfigParser()
    config.read('authentication.ini')
    bing_api_key = config['auth']['bing_api_key']

    save_dir_path = '/root/share/local_data/sandbox'
    make_dir(save_dir_path)

    num_imgs_required = 300  # Number of images you want.
    num_imgs_per_transaction = 150  # default 30, Max 150 images
    # Round up: with floor, asking for fewer images than one page holds
    # would issue no request at all.
    offset_count = math.ceil(num_imgs_required / num_imgs_per_transaction)

    url_list = []
    # Must remain a module-level name: make_img_path() records into it.
    correspondence_table = {}

    headers = {
        # Request headers
        'Content-Type': 'multipart/form-data',
        'Ocp-Apim-Subscription-Key': bing_api_key,  # API key
    }

    # Loop invariants.
    save_res_path = os.path.join(save_dir_path, 'pickle_files')
    pattern = re.compile(r"&r=(http.+)&p=")  # extract an URL of image

    for offset in range(offset_count):
        params = urllib.parse.urlencode({
            # Request parameters
            'q': '猫',
            'mkt': 'ja-JP',
            'count': num_imgs_per_transaction,
            'offset': offset * num_imgs_per_transaction  # increment offset by 'num_imgs_per_transaction' (for example 0, 150, 300)
        })

        conn = http.client.HTTPSConnection('api.cognitive.microsoft.com')
        try:
            conn.request("POST", "/bing/v5.0/images/search?%s" % params, "{body}", headers)
            response = conn.getresponse()
            data = response.read()

            # Keep the raw response bytes of every page on disk.
            make_dir(save_res_path)
            with open(os.path.join(save_res_path, '{}.pickle'.format(offset)), mode='wb') as f:
                pickle.dump(data, f)
        except OSError as err:
            # Only OSError (socket / file errors) carries errno and strerror.
            print("[Errno {0}] {1}".format(err.errno, err.strerror))
            continue
        except Exception as err:
            # Anything else has no errno; reading it here used to raise
            # AttributeError from inside the handler.
            print("%s" % (err))
            continue
        finally:
            # Close the connection on the failure path as well.
            conn.close()

        decode_res = data.decode('utf-8')
        data = json.loads(decode_res)

        if 'value' not in data:
            # Presumably an API error payload; report it and move on
            # instead of crashing with KeyError.
            print('no "value" in response for offset {}'.format(offset))
            continue

        for values in data['value']:
            unquoted_url = urllib.parse.unquote(values['contentUrl'])
            img_url = pattern.search(unquoted_url)
            if img_url:
                url_list.append(img_url.group(1))

    for url in url_list:
        try:
            img_path = make_img_path(save_dir_path, url)
            image = download_image(url)
            save_image(img_path, image)
            print('saved image... {}'.format(url))
        except KeyboardInterrupt:
            # Stop downloading but still write the table below.
            break
        except Exception as err:
            # Best effort: report the failure and continue with the next URL.
            print("%s" % (err))

    correspondence_table_path = os.path.join(save_dir_path, 'corr_table')
    make_dir(correspondence_table_path)

    with open(os.path.join(correspondence_table_path, 'corr_table.json'), mode='w') as f:
        json.dump(correspondence_table, f)