webcrawler_main.py
__author__ = 'Piotr Chmiel'

import os
from json import dump, load
from multiprocessing import Lock, Process

from web_crawler.rss_provider import FeedProvider, get_source
from web_crawler.website_critera import CriteriaManager
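
# Overview (inferred from the code below, not documented in the original):
# get_source("rss_source.txt") is assumed to return a mapping of the form
# {category: [rss_feed_url, ...]}. Each feed is crawled and every article is
# saved as Articles/<category>/<title>.txt; index.txt records filename -> URL.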


def valid_filename(filename):
    """Strip characters that are unsafe for use in a filename."""
    valid_chars = '-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    return ''.join(c for c in filename if c in valid_chars)
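
# Illustrative example (hypothetical input): characters outside the whitelist
# are dropped rather than replaced, so valid_filename('News: "AI" rises?')
# returns 'News AI rises'.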


def create_file(path, article):
    """Write the article text to a UTF-8 encoded file."""
    with open(path, "w", encoding="utf-8") as file_handler:
        file_handler.write(article)


def write_index(url, filename):
    """Record the saved filename and its source URL in the JSON index."""
    with open("index.txt", encoding="utf-8") as index_fh:
        data = load(index_fh)
    if filename not in data:
        data[filename] = url
    with open("index.txt", "w", encoding="utf-8") as index_fh:
        dump(data, index_fh)
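
# index.txt is a flat JSON object mapping saved filenames to source URLs, for
# example (illustrative values only): {"Some Article.txt": "http://example.com/item"}.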


def get_articles(rss_link, category, lock):
    """Download every article listed in the RSS feed and save it under its category."""
    feed_provider = FeedProvider(rss_link)
    urls = feed_provider.get_article_urls()
    print(urls)
    for url in urls:
        if url is None:
            continue
        try:
            parser = CriteriaManager.get_parser(url)
        except Exception as err:
            print("Parser error\n" + str(err))
            continue
        if parser is None:
            print("Parser is None")
            continue
        print(url)
        article_title = parser.get_title()
        if article_title is None:
            print("Title is None")
            continue
        path = os.path.join(os.getcwd(), "Articles", category,
                            valid_filename(article_title) + ".txt")
        # Skip articles that were already downloaded on a previous run.
        if os.path.exists(path):
            print("Path exists")
            continue
        article = parser.get_article()
        if article is None:
            print("Article is None")
            continue
        print(article_title)
        create_file(path, article)
        # Hold the lock while updating the shared index so that concurrent
        # crawler processes cannot corrupt index.txt.
        with lock:
            write_index(url, valid_filename(article_title) + ".txt")


def main():
    processes = []
    lock = Lock()
    print("Start Web Crawler")
    rss_link_by_category = get_source("rss_source.txt")
    print(rss_link_by_category)
    if not os.path.isdir(os.path.join(os.getcwd(), "Articles")):
        os.mkdir("Articles")
    # Create an empty JSON index on the first run.
    if not os.path.isfile(os.path.join(os.getcwd(), "index.txt")):
        with open(os.path.join(os.getcwd(), "index.txt"), "w") as fh:
            fh.write("{}")
    for category in rss_link_by_category.keys():
        if not os.path.isdir(os.path.join(os.getcwd(), "Articles", category)):
            os.mkdir(os.path.join("Articles", category))
        for rss_link in rss_link_by_category[category]:
            # Feeds are crawled sequentially; the multiprocessing variant is disabled.
            # processes.append(Process(target=get_articles, args=(rss_link, category, lock)))
            get_articles(rss_link, category, lock)
    # Disabled along with the Process-based crawling above:
    # for process in processes:
    #     process.start()
    # for process in processes:
    #     process.join()


if __name__ == '__main__':
    main()