# crawler-conf.yaml

# Custom configuration for StormCrawler
# This is used to override the default values from crawler-default.yaml and provide additional ones
# for your custom components.
# Use this file with the parameter -config when launching your extension of ConfigurableTopology.
# This file does not contain all the key values but only the most frequently used ones. See crawler-default.yaml for an extensive list.

config:
  # --- Storm topology settings ---
  topology.workers: 2
  topology.message.timeout.secs: 300
  topology.max.spout.pending: 250
  topology.debug: false

  # JVM options for the worker processes: 5 GB max heap, IPv4 stack preferred
  topology.worker.childopts: "-Xmx5g -Djava.net.preferIPv4Stack=true"

  # --- fetcher settings ---
  fetcher.threads.number: 250
  fetcher.timeout.queue: 300

  # 52428800 = 50 * 1024 * 1024, i.e. 50MB, large enough to handle sitemaps
  http.content.limit: 52428800

  partition.url.mode: byDomain

  # mandatory when using Flux
  topology.kryo.register:
    - com.digitalpebble.stormcrawler.Metadata

  # metadata to transfer to the outlinks
  # used by Fetcher for redirections, sitemapparser, etc...
  # these are also persisted for the parent document (see below)
  # metadata.transfer:
  #   - customMetadataName

  metadata.track.path: true
  metadata.track.depth: true

  # lists the metadata to persist to storage
  # these are not transferred to the outlinks
  metadata.persist:
    - _redirTo
    - error.cause
    - error.source
    - isSitemap
    - isFeed

  # text extraction for JSoupParserBolt
  # patterns selecting the elements to extract text from
  textextractor.include.pattern:
    - DIV[id="maincontent"]
    - DIV[itemprop="articleBody"]
    - ARTICLE

  # tags excluded from the extracted text
  textextractor.exclude.tags:
    - STYLE
    - SCRIPT

  # regular expressions over MIME types (Word, Excel, PowerPoint, PDF)
  # NOTE(review): presumably only matching documents are passed to the parser - confirm
  parser.mimetype.whitelist:
    - application/.+word.*
    - application/.+excel.*
    - application/.+powerpoint.*
    - application/.*pdf.*

  indexer.text.maxlength: 100000

  # identity of the crawler (the http.agent.* keys)
  # NOTE(review): presumably combined into the User-Agent sent with requests - confirm
  # the version is quoted so that it stays a string rather than a float
  http.agent.name: "DigitalPebble Crawler"
  http.agent.version: "1.14"
  http.agent.description: "StormCrawler Benchmark"
  http.agent.url: "http://digitalpebble.com/"
  http.agent.email: "crawler@digitalpebble.com"

  # http and https are both handled by the same okhttp protocol class
  http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
  https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"

  # NOTE(review): looks like a cache spec for robots rules (soft values,
  # entries expiring 6h after being written) - confirm against the consumer
  robots.cache.spec: "softValues,expireAfterWrite=6h"

  # FetcherBolt queue dump: uncomment to activate
  # if a file exists on the worker machine with the corresponding port number
  # the FetcherBolt will log the content of its internal queues to the logs
  # fetcherbolt.queue.debug.filepath: "/tmp/fetcher-dump-{port}"

  # files holding the parse filter and URL filter definitions
  parsefilters.config.file: "parsefilters.json"
  urlfilters.config.file: "urlfilters.json"

  # sitemap handling
  sitemap.discovery: true
  sitemap.sniffContent: true

  # NOTE(review): unit of this delay is not stated in this file - confirm
  sitemap.schedule.delay: 30

  feed.sniffContent: true

  detect.charset.maxlength: 10000

  # never revisit a page by default (value in minutes)
  # -1 means never refetch; use e.g. 1440 to revisit a page daily
  fetchInterval.default: -1

  jsoup.treat.non.html.as.error: false

  # revisit a page with a fetch error after 2 hours (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.fetch.error: 120

  # never revisit a page with an error (or set a value in minutes)
  fetchInterval.error: -1

  # custom fetch interval to be used when a document has the key/value in its metadata
  # and has been fetched successfully (value in minutes)
  # fetchInterval.isFeed=true: 10

  # NOTE(review): looks like a cache spec for the status updater (at most
  # 500000 entries, expiring 1h after last access) - confirm against the consumer
  status.updater.cache.spec: "maximumSize=500000,expireAfterAccess=1h"

  # names of the fields the indexer writes the URL, text and canonical URL to
  indexer.url.fieldname: "url"
  indexer.text.fieldname: "content"
  indexer.canonical.name: "canonical"

  # one entry per mapping, written as <metadata key>=<index field name>
  indexer.md.mapping:
    - parse.title=title
    - parse.keywords=keywords
    - parse.description=description
    - lang=lang
    - domain=domain
    - format=format