Hello
I am trying to override the SchedulePageLinks method of the WebCrawler class. The code is shown below (I only added a PageBag property); the rest of the code is exactly the same as the original. When debugging, the overridden method is never reached.
Any ideas on what could cause this?
best regards
Ghislain Borremans
/// <summary>
/// Crawler whose link scheduling additionally stores each link's raw href value
/// in the scheduled page's <c>PageBag</c> (as <c>RawHrefText</c>).
/// </summary>
/// <remarks>
/// NOTE(review): an override is only invoked on instances of this class (or of types derived from it).
/// If the crawl is started on a different crawler type, this method is never reached;
/// confirm which type the calling code instantiates.
/// </remarks>
public class MySchedulePageLinks : WebCrawler
{
    /// <summary>
    /// Walks the parsed links of <paramref name="crawledPage"/> and adds every link that is not yet
    /// known and passes the scheduling checks to the scheduler, tagging each scheduled page with
    /// the raw href it was created from.
    /// </summary>
    /// <param name="crawledPage">The page whose <c>ParsedLinks</c> are evaluated for scheduling.</param>
    protected override void SchedulePageLinks(CrawledPage crawledPage)
    {
        var linksToCrawl = 0;
        foreach (var hyperLink in crawledPage.ParsedLinks)
        {
            // First validate that the link was not already visited or added to the list of pages to visit,
            // so we don't make the same validation and fire the same events twice.
            if (!_scheduler.IsUriKnown(hyperLink.HrefValue) &&
                (ShouldScheduleLinkDecisionMaker == null || ShouldScheduleLinkDecisionMaker.Invoke(hyperLink.HrefValue, crawledPage, _crawlContext)))
            {
                // The try/catch was added due to a bug in the Uri class, see
                // http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed
                try
                {
                    var page = new PageToCrawl(hyperLink.HrefValue);
                    page.ParentUri = crawledPage.Uri;
                    page.CrawlDepth = crawledPage.CrawlDepth + 1;
                    page.IsInternal = IsInternalUri(hyperLink.HrefValue);
                    page.IsRoot = false;

                    // Added statement: keep the raw href value available on the scheduled page.
                    page.PageBag.RawHrefText = hyperLink.HrefValue;

                    if (ShouldSchedulePageLink(page))
                    {
                        _scheduler.Add(page);
                        linksToCrawl++;
                    }

                    if (!ShouldScheduleMorePageLink(linksToCrawl))
                    {
                        Log.Information("MaxLinksPerPage has been reached. No more links will be scheduled for current page [{0}].", crawledPage.Uri);

                        // Leaves the loop immediately, so the current link is not registered as known below.
                        break;
                    }
                }
                catch (System.Exception ex)
                {
                    // Scheduling stays best-effort: a failing link is skipped, but the failure is
                    // logged instead of being silently discarded.
                    Log.Information("Link [{0}] could not be scheduled and was skipped: {1}", hyperLink.HrefValue, ex.Message);
                }
            }

            // Add this link to the list of known Urls so validations are not duplicated in the future.
            _scheduler.AddKnownUri(hyperLink.HrefValue);
        }
    }
}