Skip to content

Commit

Permalink
Add basic tor onion spider
Browse files Browse the repository at this point in the history
  • Loading branch information
DrifterKaru committed Jul 17, 2022
1 parent 4b53e5f commit 81bef55
Showing 1 changed file with 34 additions and 0 deletions.
34 changes: 34 additions & 0 deletions scrapy_app/scrapy_app/spiders/tor_onion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import scrapy
from scrapy.spiders import CrawlSpider
from scrapy_app.spider_common import common_parser


class CrawlItem(scrapy.Item):
    """Scrapy item describing a single anchor (``<a>``) element.

    Declares exactly two fields, ``name`` and ``link``.
    """

    # Text content extracted from the anchor.
    name = scrapy.Field()
    # Value(s) of the anchor's ``href`` attribute.
    link = scrapy.Field()


# Default spider: retrieves every anchor's text and href from the given URL.
class TorOnionSpider(CrawlSpider):
    """Spider that collects the ``<a>`` elements of a single start URL.

    The spider is configured entirely through keyword arguments (``url``,
    ``domain`` and ``settings``) and emits one item per response whose
    ``'data'`` entry is the list of anchors found on the page.

    NOTE(review): the CrawlSpider documentation warns against using ``parse``
    as a rule callback; this class defines no ``rules`` and overrides
    ``parse`` directly -- confirm this is intended for the Scrapy version in
    use.
    """

    name = 'tor_onion'

    def __init__(self, *args, **kwargs):
        """Configure the crawl target from keyword arguments.

        Keyword Args:
            url: Page to fetch; becomes the only entry of ``start_urls``.
            domain: Domain the crawl is restricted to; becomes the only
                entry of ``allowed_domains``.
            settings: Value later handed to ``common_parser`` in ``parse``.

        A missing or empty ``url`` / ``domain`` yields an empty list rather
        than a list holding ``None``, so the spider simply has nothing to
        crawl instead of failing later on an invalid start URL.
        """
        self.url = kwargs.get('url')
        self.domain = kwargs.get('domain')
        # Never leave a ``None``/empty placeholder in the lists Scrapy iterates.
        self.start_urls = [self.url] if self.url else []
        self.allowed_domains = [self.domain] if self.domain else []
        # NOTE(review): Scrapy's ``from_crawler`` is understood to bind the
        # crawler's settings onto ``spider.settings`` after construction, which
        # would replace this value -- confirm which object ``parse`` receives.
        self.settings = kwargs.get('settings')

        super(TorOnionSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        """Yield one item holding every anchor found in ``response``.

        The container comes from ``common_parser(self.settings)``; its
        ``'data'`` key is set to a list of ``CrawlItem`` objects, one per
        ``<a>`` element, in document order. Each item's ``name`` and ``link``
        are the lists returned by ``extract()`` for the anchor's direct text
        nodes and its ``href`` attribute.
        """
        parsed_item = common_parser(self.settings)
        parsed_item['data'] = [
            CrawlItem(
                name=anchor.xpath('text()').extract(),
                link=anchor.xpath('@href').extract(),
            )
            for anchor in response.xpath('//a')
        ]
        yield parsed_item

0 comments on commit 81bef55

Please sign in to comment.