How to scrape multiple pages in .NET Core? How to use Abot in a .NET Core MVC application for dynamic page crawling?

Jay Patel

Apr 18, 2017, 5:10:26 PM
to Abot Web Crawler
I have used the following code in a controller, but I am only able to scrape a single URL. I want to scrape multiple pages using Abot. I took the simulator example and extended it into a .NET Core MVC project. A sketch of what I am trying to achieve follows the code below.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Mvc;
using Abot.Crawler;
using Abot.Poco;
using System.Net;

namespace WebCrawler.Web.Controllers
{
    public class HomeController : Controller
    {
        public IActionResult Index()
        {
            return View();
        }

        public async Task<JsonResult> ScrapUrl()
        {
            //Create a config object manually; these settings control how far and how politely the crawl goes
            CrawlConfiguration crawlConfig = new CrawlConfiguration();
            //crawlConfig.CrawlTimeoutSeconds = 0;
            //crawlConfig.DownloadableContentTypes = "text/html, text/plain";
            //crawlConfig.IsExternalPageCrawlingEnabled = false;
            //crawlConfig.IsExternalPageLinksCrawlingEnabled = false;
            //crawlConfig.IsRespectRobotsDotTextEnabled = false;
            //crawlConfig.IsUriRecrawlingEnabled = false;
            //crawlConfig.MaxConcurrentThreads = 10;
            //crawlConfig.MaxPagesToCrawl = 10;
            //crawlConfig.MaxPagesToCrawlPerDomain = 100;
            //crawlConfig.MinCrawlDelayPerDomainMilliSeconds = 1000;

            //PoliteWebCrawler crawler = new PoliteWebCrawler();
            //Passing nulls makes Abot fall back to its default implementation for each dependency
            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            //CrawlResult result = crawler.Crawl(new Uri("http://localhost:1111/"));

            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                //Only follow links on pages that have at least 100 bytes of content
                if (crawledPage.Content?.Bytes == null || crawledPage.Content.Bytes.Length < 100)
                    return new CrawlDecision { Allow = false, Reason = "Only crawl links on pages that have at least 100 bytes" };

                return new CrawlDecision { Allow = true };
            });

            CrawlResult result = await crawler.CrawlAsync(new Uri("http://localhost:1111/")); //Awaited, so execution only continues once the crawl has completed

            if (result.ErrorOccurred)
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            else
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);

            return Json("test");
        }

        public IActionResult About()
        {
            ViewData["Message"] = "Your application description page.";

            return View();
        }

        public IActionResult Contact()
        {
            ViewData["Message"] = "Your contact page.";

            return View();
        }

        public IActionResult Error()
        {
            return View();
        }

        void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;
            Console.WriteLine("About to crawl link {0} which was found on page {1}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri);
        }

        void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.HttpRequestException != null || crawledPage.HttpWebResponse == null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            else
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);

            var htmlAgilityPackDocument = crawledPage.HtmlDocument; //Html Agility Pack parser
            var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser

            //ParsedLinks can be null when link crawling was disallowed for this page
            foreach (Uri uri in e.CrawledPage.ParsedLinks ?? Enumerable.Empty<Uri>())
            {
                try
                {
                    PageToCrawl page = new PageToCrawl(uri);
                    // logic & stuff to format/save goes here
                }
                catch { } //swallowing pages that cannot be queued; log here if needed
            }

        }

        void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;
            Console.WriteLine("Did not crawl the links on page {0} due to {1}", crawledPage.Uri.AbsoluteUri, e.DisallowedReason);
        }

        void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;
            Console.WriteLine("Did not crawl page {0} due to {1}", pageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
        }
    }
}
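
What I am trying to get to is something like the sketch below. This is only a rough idea, not working code: the seed URLs are placeholders, and I am assuming each seed needs its own PoliteWebCrawler instance, since a crawl appears to be tied to one root URI. Raising MaxPagesToCrawl should let Abot follow links beyond the first page of each seed.

public async Task<JsonResult> ScrapUrls()
{
    //Placeholder seed URLs - in the MVC app these could come from user input
    var seedUris = new List<Uri>
    {
        new Uri("http://localhost:1111/"),
        new Uri("http://localhost:1111/products")
    };

    var crawlConfig = new CrawlConfiguration
    {
        MaxPagesToCrawl = 10,                      //follow links, up to 10 pages per seed
        MinCrawlDelayPerDomainMilliSeconds = 1000  //stay polite between requests
    };

    foreach (Uri seed in seedUris)
    {
        //A fresh crawler per seed; the nulls fall back to Abot's default components
        var crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
        crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

        CrawlResult result = await crawler.CrawlAsync(seed);
        Console.WriteLine("Crawl of {0} completed. Error occurred: {1}", seed.AbsoluteUri, result.ErrorOccurred);
    }

    return Json("done");
}

Is this the right approach, or does Abot have a built-in way to queue multiple seed URLs into a single crawl?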
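For the dynamic page crawling part, I assume the seed URL can simply be bound from the query string by standard MVC model binding - something like this hypothetical action (ScrapUrlDynamic is a made-up name):

public async Task<JsonResult> ScrapUrlDynamic(string url)
{
    var crawlConfig = new CrawlConfiguration { MaxPagesToCrawl = 10 };
    var crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

    //Crawl whatever page the caller passed in, e.g. /Home/ScrapUrlDynamic?url=http://localhost:1111/
    CrawlResult result = await crawler.CrawlAsync(new Uri(url));
    return Json(new { url, error = result.ErrorOccurred });
}

Would that be the recommended way to drive Abot from an MVC action?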

