Hi Julien,
Thanks for your quick response. I don't think I explained myself clearly. I would like to do just the opposite: let the crawler jump from site A to site B, in order to implement a basic web finder.
This is my urlfilter.json:
{
"com.digitalpebble.storm.crawler.filtering.URLFilters": [
{
"class": "com.digitalpebble.storm.crawler.filtering.depth.MaxDepthFilter",
"name": "MaxDepthFilter",
"params": {
"maxDepth": 100
}
},
{
"class": "com.digitalpebble.storm.crawler.filtering.basic.BasicURLNormalizer",
"name": "BasicURLNormalizer",
"params": {
"removeAnchorPart": true,
"unmangleQueryString": true,
"checkValidURI": true
}
},
{
"class": "com.digitalpebble.storm.crawler.filtering.host.HostURLFilter",
"name": "HostURLFilter",
"params": {
"ignoreOutsideDomain": false
}
},
{
"class": "com.digitalpebble.storm.crawler.filtering.regex.RegexURLNormalizer",
"name": "RegexURLNormalizer",
"params": {
"regexNormalizerFile": "default-regex-normalizers.xml"
}
},
{
"class": "com.digitalpebble.storm.crawler.filtering.regex.RegexURLFilter",
"name": "RegexURLFilter",
"params": {
"regexFilterFile": "default-regex-filters.txt"
}
},
{
"class": "com.digitalpebble.storm.crawler.filtering.basic.SelfURLFilter",
"name": "SelfURLFilter"
}
]
}
and the crawl.yaml file:
# Default configuration for StormCrawler
# This is used to make the default values explicit and list the most common configurations.
# Do not modify this file but instead provide a custom one with the parameter -config
# when launching your extension of ConfigurableTopology.
fetcher.server.delay: 1.0
fetcher.server.min.delay: 0.0
fetcher.queue.mode: "byHost"
fetcher.threads.per.queue: 1
fetcher.threads.number: 10
# time bucket to use for the metrics sent by the Fetcher
fetcher.metrics.time.bucket.secs: 10
partition.url.mode: "byHost"
# lists the metadata to transfer to the outlinks
# used by Fetcher for redirections, sitemapparser, etc...
# metadata.transfer:
# - key1
# - key2
# - key3
http.agent.version: "1.0"
http.agent.description: "a Storm-based crawler"
http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3"
http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
http.content.limit: 65536
http.store.responsetime: true
http.timeout: 10000
http.robots.403.allow: true
# should the URLs be removed when a page is marked as noFollow
robots.noFollow.strict: true
protocols: "http,https"
http.protocol.implementation: "com.digitalpebble.storm.crawler.protocol.httpclient.HttpProtocol"
https.protocol.implementation: "com.digitalpebble.storm.crawler.protocol.httpclient.HttpProtocol"
parsefilters.config.file: "parsefilters.json"
urlfilters.config.file: "urlfilters.json"
# whether the sitemap parser should try to
# determine whether a page is a sitemap based
# on its content if it is missing the K/V in the metadata
sitemap.sniffContent: false
# filters URLs in sitemaps based on their modified Date (if any)
sitemap.filter.hours.since.modified: -1
# whether to add any sitemaps found in the robots.txt to the status stream
# used by fetcher bolts. sitemap.sniffContent must be set to true if the
# discovery is enabled
sitemap.discovery: false
# Default implementation of Scheduler
scheduler.class: "com.digitalpebble.storm.crawler.persistence.DefaultScheduler"
# revisit a page daily (value in minutes)
fetchInterval.default: 1440
# revisit a page with a fetch error after 2 hours (value in minutes)
fetchInterval.fetch.error: 120
# revisit a page with an error every month (value in minutes)
fetchInterval.error: 44640
# max number of successive fetch errors before changing status to ERROR
max.fetch.errors: 3
# configuration for the classes extending AbstractIndexerBolt
# indexer.md.filter: "someKey=aValue"
indexer.url.fieldname: "url"
indexer.text.fieldname: "content"
indexer.md.mapping:
- parse.title=page_title
- parse.keywords=keywords
- parse.description=description
- html_title=html_title
- doc_type=doc_type
metadata.track.path: true
metadata.track.depth: false
I have been crawling for 6 hours and all discovered files are under the seed host.
Is there a problem with my configuration?
Thanks in advance for your help,
Rodrigo