import scrapy
from scrapy_app.spider_common import common_parser


class CrawlItem(scrapy.Item):
    """One crawled anchor: its text node(s) and href target(s)."""
    name = scrapy.Field()  # anchor text, as extracted (list of strings)
    link = scrapy.Field()  # @href values, as extracted (list of strings)


# default spider for retrieve href in the given URL
class TorOnionSpider(scrapy.Spider):
    """Collect every <a> element found at the given start URL.

    Subclasses ``scrapy.Spider`` rather than ``CrawlSpider``: this spider
    overrides ``parse()`` and defines no ``rules``, and Scrapy documents
    that ``CrawlSpider`` reserves ``parse()`` for its own rule machinery.

    Keyword arguments (all passed via ``-a`` / crawler kwargs):
        url:      page to fetch (required; becomes the sole start URL)
        domain:   optional allowed domain for the offsite filter
        settings: optional value forwarded to ``common_parser``
    """

    name = 'tor_onion'

    def __init__(self, *args, **kwargs):
        self.url = kwargs.get('url')
        self.domain = kwargs.get('domain')
        # Fail fast: without this, a missing 'url' puts None into
        # start_urls and the crash surfaces much later with an opaque
        # scheduler/request error.
        if not self.url:
            raise ValueError("TorOnionSpider requires a 'url' argument")
        self.start_urls = [self.url]
        # A [None] allowed_domains list breaks the offsite middleware;
        # omit the filter entirely when no domain was supplied.
        self.allowed_domains = [self.domain] if self.domain else []
        # Stored under a private name on purpose: Scrapy's from_crawler()
        # assigns crawler settings to ``spider.settings`` *after*
        # __init__, so the original ``self.settings = ...`` was silently
        # clobbered before parse() ever ran.
        self._parser_settings = kwargs.get('settings')

        super().__init__(*args, **kwargs)

    def parse(self, response):
        """Yield a single parsed item whose 'data' holds all anchors.

        ``common_parser`` is assumed to return a dict-like item that
        accepts a 'data' key — TODO confirm against spider_common.
        """
        parsed_item = common_parser(self._parser_settings)
        crawled_data = []
        for sel in response.xpath('//a'):
            item = CrawlItem()
            item['name'] = sel.xpath('text()').extract()
            item['link'] = sel.xpath('@href').extract()
            crawled_data.append(item)

        parsed_item['data'] = crawled_data
        yield parsed_item