# 4cdl.py
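# Download all images and the JSON of a 4chan thread, polling until the thread
# dies. Resumed and multi-threaded downloads are supported; from the saved JSON
# and images the original HTML can be generated. Written for Python 2
# (urllib2 / httplib).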
from urlparse import urlparse
import argparse
import httplib
import urllib2
import re
import time
import json
import os
import threading
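
# Poll interval between thread checks and the back-off while every download
# thread is busy (both in seconds), plus the default cap on download threads.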
sleep_time = 10
wait_thread_sleep_time = 2
cache_string = "Run out of free thread. Retry after" + \
str(wait_thread_sleep_time) + "second"
number_of_thread = 10
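
# Worker thread that downloads a single image into the thread's output folder.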
class downloadThread(threading.Thread):
    def __init__(self, url, folder):
        threading.Thread.__init__(self)
        self.url = url
        self.folder = folder

    def run(self):
        print "Starting download thread for " + self.url
        download(self.url, self.folder)
        print "Exiting download thread for " + self.url
def download(url, folder):
    file_name = '.\\' + folder + '\\' + url.split('/')[-1]
    if not os.path.exists('.\\' + folder + '\\'):
        os.makedirs('.\\' + folder + '\\')
    headers = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(url, None, headers)
    u = urllib2.urlopen(req)
    meta = u.info()
    file_size = int(meta.getheaders("Content-Length")[0])
    # Check if the file is already downloaded
    if os.path.isfile(file_name) and file_size == os.stat(file_name).st_size:
        print "File " + file_name + " is already downloaded"
        return
    # Begin download
    file_size_dl = 0
    block_sz = 1024
    with open(file_name, 'wb') as f:
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            file_size_dl += len(buffer)
            f.write(buffer)
            status = r" [%3.2f%%]" % (file_size_dl * 100. / file_size)
            print "Downloading: " + file_name + status
def check_thread(board, sid):
    prev_img_list = []
    while True:
        myConnection = httplib.HTTPSConnection("a.4cdn.org")
        myConnection.request("GET", "/" + board + "/thread/" + sid + ".json")
        reply = myConnection.getresponse()
        print reply.status, reply.reason
        if reply.status == 404:
            print "404 Not found. Please check the URL again!"
            break
        temp_json = reply.read()
        img_list = re.findall(r'"filename":".+?".+?"tim":.+?,', temp_json)
        if not os.path.exists('.\\' + board + sid + '\\'):
            os.makedirs('.\\' + board + sid + '\\')
        with open('.\\' + board + sid + '\\' + sid + ".json", 'wb') as f:
            f.write(temp_json)
        myConnection.close()
        # Only download the images that appeared since the previous poll
        for i in img_list[len(prev_img_list):]:
            j = json.loads('{' + i[:-1] + '}')
            download_link = \
                "http://i.4cdn.org/" + board + "/" + str(j['tim']) + j['ext']
            print download_link
            while threading.activeCount() == number_of_thread:
                print cache_string
                time.sleep(wait_thread_sleep_time)
            downloadThread(download_link, board + sid).start()
        prev_img_list = img_list
        time.sleep(sleep_time)
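
# Split a thread URL such as http://boards.4chan.org/biz/thread/1873336 into
# its board and thread id components ('biz', '1873336' in that example).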
def parse_thread_URL(url):
    url_components = urlparse(url).path.split('/')
    return url_components[1], url_components[3]
prog_description = 'Download all images and the JSON of a 4chan thread until ' \
    'the thread dies. Resumed and multi-threaded downloads are supported. ' \
    'From the JSON and the images, the original HTML can be generated.'
parser = argparse.ArgumentParser(description=prog_description)
parser.add_argument('threadURL', metavar='Thread_URL',
                    help='The thread URL, for example '
                         'http://boards.4chan.org/biz/thread/1873336')
parser.add_argument('-t', '--thread_num', metavar='number', type=int, default=10,
                    help='The number of download threads, default is 10')
args = parser.parse_args()
number_of_thread = args.thread_num
board, thread_id = parse_thread_URL(args.threadURL)
check_thread(board, thread_id)
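
# Example invocation (Python 2 interpreter assumed), saving into .\biz1873336\:
#   python 4cdl.py http://boards.4chan.org/biz/thread/1873336 -t 5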