using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Mvc;
using Abot.Crawler;
using Abot.Poco;
using System.Net;
namespace WebCrawler.Web.Controllers
{
public class HomeController : Controller
{
    /// <summary>Renders the landing page.</summary>
    public IActionResult Index()
    {
        return View();
    }

    /// <summary>
    /// Crawls the site rooted at http://localhost:1111/ with Abot's
    /// PoliteWebCrawler, logging progress through the event handlers below,
    /// then returns a placeholder JSON payload.
    /// </summary>
    /// <returns>A JSON result containing the placeholder string "test".</returns>
    public async Task<JsonResult> ScrapUrl()
    {
        // BUG FIX: the original created a second, unused CrawlConfiguration
        // ("config") alongside this one; the dead instance and the large
        // block of commented-out settings have been removed. Tune the
        // crawl here if needed, e.g.:
        //   crawlConfig.MaxPagesToCrawl = 10;
        //   crawlConfig.MinCrawlDelayPerDomainMilliSeconds = 1000;
        CrawlConfiguration crawlConfig = new CrawlConfiguration();

        // Passing nulls lets Abot supply its default implementations for the
        // scheduler, throttler, robots.txt handling, etc.
        PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
        crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
        crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
        crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
        crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

        // Only follow links found on pages with a non-trivial body.
        crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
        {
            CrawlDecision decision = new CrawlDecision { Allow = true };
            if (crawledPage.Content.Bytes.Length < 100)
                return new CrawlDecision { Allow = false, Reason = "Just crawl links in pages that have at least 100 bytes" };
            return decision;
        });

        // BUG FIX: the original had a physical line break inside the URI
        // string literal, which does not compile; it is now one literal.
        // Note: despite the old comment claiming this is "synchronous", the
        // call is awaited — it simply completes only when the whole crawl is
        // finished.
        CrawlResult result = await crawler.CrawlAsync(new Uri("http://localhost:1111/"));

        if (result.ErrorOccurred)
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        else
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        return Json("test");
    }

    /// <summary>Renders the About page.</summary>
    public IActionResult About()
    {
        ViewData["Message"] = "Your application description page.";
        return View();
    }

    /// <summary>Renders the Contact page.</summary>
    public IActionResult Contact()
    {
        ViewData["Message"] = "Your contact page.";
        return View();
    }

    /// <summary>Renders the generic error page.</summary>
    public IActionResult Error()
    {
        return View();
    }

    /// <summary>Logs each page just before Abot crawls it.</summary>
    void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
    {
        PageToCrawl pageToCrawl = e.PageToCrawl;
        // BUG FIX: ParentUri can be null (e.g. for the root page), which
        // previously threw a NullReferenceException here.
        Console.WriteLine("About to crawl link {0} which was found on page {1}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri?.AbsoluteUri);
    }

    /// <summary>
    /// Logs the outcome of each crawled page and iterates its parsed links.
    /// </summary>
    void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
    {
        CrawledPage crawledPage = e.CrawledPage;
        // BUG FIX: HttpWebResponse can be null when the request failed, so
        // the status check uses the null-conditional operator.
        if (crawledPage.HttpRequestException != null || crawledPage.HttpWebResponse?.StatusCode != HttpStatusCode.OK)
            Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
        else
            Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
        if (string.IsNullOrEmpty(crawledPage.Content.Text))
            Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);

        // Parsed documents are available via crawledPage.HtmlDocument
        // (Html Agility Pack) and crawledPage.AngleSharpHtmlDocument
        // (AngleSharp) when HTML processing is added here.

        // BUG FIX: ParsedLinks can be null for pages with no links, which
        // previously threw in the foreach.
        foreach (Uri uri in e.CrawledPage.ParsedLinks ?? Enumerable.Empty<Uri>())
        {
            try
            {
                PageToCrawl page = new PageToCrawl(uri);
                // logic & stuff to format/save goes here
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: one bad link must not abort the
                // rest, but the failure is now logged instead of being
                // silently swallowed by an empty catch.
                Console.WriteLine("Failed to process link {0}: {1}", uri, ex.Message);
            }
        }
    }

    /// <summary>Logs pages whose links Abot decided not to crawl.</summary>
    void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
    {
        CrawledPage crawledPage = e.CrawledPage;
        Console.WriteLine("Did not crawl the links on page {0} due to {1}", crawledPage.Uri.AbsoluteUri, e.DisallowedReason);
    }

    /// <summary>Logs pages Abot decided not to crawl at all.</summary>
    void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
    {
        PageToCrawl pageToCrawl = e.PageToCrawl;
        Console.WriteLine("Did not crawl page {0} due to {1}", pageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
    }
}
}