-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprovider.py
61 lines (49 loc) · 2.04 KB
/
provider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import logging
import timeit
from models import Source
from scrapy.crawler import CrawlerRunner, Settings
from scrapy_crawler import settings as local_crawler_settings
from scrapy_crawler.pipelines import NewsCrawlerPipeline
from scrapy_crawler.spiders.nld import NguoiLaoDongSpider
from scrapy_crawler.spiders.tuoitre import TuoiTreSpider
from scrapy_crawler.spiders.vnexpress import VnExpressSpider
from twisted.internet import reactor
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class Provider:
    """Run the registered news spiders for a set of sources.

    ``start_crawling`` is the entry point: it schedules one crawl per spider
    that has a matching source, runs the Twisted reactor until all crawls are
    done, and then passes ``NewsCrawlerPipeline.articles_by_topics`` to
    ``self._handle_crawled_articles``.

    NOTE(review): ``_handle_crawled_articles`` is not defined in this class;
    presumably a subclass or mixin provides it -- confirm.
    """

    # Spider classes eligible to run. A spider is only scheduled when some
    # source's ``editor_id`` equals the spider's ``name`` (see _setup_spiders).
    spiders = [
        NguoiLaoDongSpider,
        TuoiTreSpider,
        VnExpressSpider,
    ]

    def start_crawling(self, sources: list[Source]) -> None:
        """Crawl all sources that have a matching spider and process the result.

        Blocks while the reactor is running. The start timestamp is kept on
        ``self.start_time`` and the elapsed time (rounded to 4 decimals) is
        logged at the end.

        Args:
            sources: Sources to crawl; each must expose ``editor_id``.
        """
        self.start_time = timeit.default_timer()
        logger.info(">> Start crawling...")

        source_dict = self._get_source_dict(sources)
        crawler = self._setup_crawler()
        self._setup_spiders(crawler, source_dict)
        self._crawl(crawler)
        self._handle_crawled_articles(NewsCrawlerPipeline.articles_by_topics)

        elapsed_time = round(timeit.default_timer() - self.start_time, 4)
        logger.info(f">> Elapsed time: {elapsed_time}")

    def _get_source_dict(self, sources: list[Source]) -> dict[str, Source]:
        """Index *sources* by their ``editor_id``.

        When several sources share an ``editor_id``, the last one wins.

        Args:
            sources: Sources to index.

        Returns:
            Mapping of ``editor_id`` to the corresponding source.
        """
        # NOTE(review): if several sources per editor were meant to be kept,
        # this needs a list-valued mapping and a matching change in
        # _setup_spiders -- confirm the intent.
        return {source.editor_id: source for source in sources}

    def _setup_crawler(self) -> CrawlerRunner:
        """Create a ``CrawlerRunner`` configured from the local settings module."""
        crawler_settings = Settings()
        crawler_settings.setmodule(local_crawler_settings)
        return CrawlerRunner(settings=crawler_settings)

    def _setup_spiders(
        self, crawler: CrawlerRunner, sources: dict[str, Source]
    ) -> None:
        """Schedule a crawl for every registered spider that has a source.

        Args:
            crawler: Runner on which the crawls are scheduled.
            sources: Mapping keyed by the value compared against ``spider.name``.
        """
        for spider in self.spiders:
            if spider.name not in sources:
                # No source was supplied for this spider; nothing to crawl.
                continue
            # The source is forwarded to the spider as a positional argument.
            crawler.crawl(spider, sources[spider.name])

    def _crawl(self, crawler: CrawlerRunner) -> None:
        """Run the reactor until every scheduled crawl has finished.

        Args:
            crawler: Runner whose scheduled crawls are awaited.
        """
        d = crawler.join()
        # join() may already have fired (e.g. no spider was scheduled). Calling
        # reactor.stop() directly from the callback would then raise
        # ReactorNotRunning inside the Deferred and reactor.run() below would
        # never return. callWhenRunning() stops right away when the reactor is
        # running and otherwise postpones the stop until it has started.
        d.addBoth(lambda _: reactor.callWhenRunning(reactor.stop))
        # Run without installing Twisted's signal handlers.
        reactor.run(installSignalHandlers=False)