Hi guys,
I want to crawl a lot of links, but I find that a single long-lived casperjs/phantomjs instance uses up a lot of CPU and memory.
So I wanted to try running the whole casper process inside a while loop, calling casper.page.close() on each iteration to minimize resource usage.
There is something wrong in my loop, though; my sample script is below:
// Holds the links scraped from the most recently crawled page
// (assigned inside crawl()'s evaluate step).
var links = [];

// Work queue: URLs that still need to be crawled.
var pendingUrls = ['http://casperjs.org/', 'http://phantomjs.org/'];
/**
 * Collects the href attribute of every <a> element on the page.
 * Runs inside the remote page context via casper.evaluate(), where
 * __utils__ is the client-side utility object CasperJS injects.
 * (Kept as ES5 — PhantomJS's WebKit does not support arrow functions.)
 * @returns {Array} href strings, one per anchor element
 */
function getLinks() {
    var anchors = __utils__.findAll('a');
    return Array.prototype.map.call(anchors, function (anchor) {
        return anchor.getAttribute('href');
    });
}
/**
 * Crawls a single URL with a fresh casper instance, echoes the page
 * title and all links found, closes the page to free memory, then
 * chains to the next URL in pendingUrls (if any).
 *
 * Bug fixes vs. the original:
 *  - `.exit()` was chained onto echo() inside the first step, which
 *    terminated PhantomJS after the FIRST url's output — that is why
 *    only one pendingUrls entry ever produced output.
 *  - run()'s callback unconditionally called this.exit(); it now only
 *    exits when the queue is empty, and otherwise crawls the next URL.
 *
 * @param {String} url - the page to crawl
 */
function crawl(url) {
    var casper = require('casper').create();
    casper.start(url);
    casper.then(function () {
        this.echo(this.getTitle());
        links = this.evaluate(getLinks);
        this.echo(links.length + ' links found:');
        // Do NOT chain .exit() here — it would kill the whole process
        // before the remaining queued URLs are crawled.
        this.echo(' - ' + links.join('\n - '));
    });
    casper.then(function () {
        // Release the page's memory before moving on.
        casper.page.close();
    });
    casper.run(function () {
        // casper.run is asynchronous, so the next crawl must be chained
        // from here rather than driven by a synchronous outer loop.
        if (pendingUrls.length > 0) {
            crawl(pendingUrls.shift());
        } else {
            this.exit();
        }
    });
}
// Drain the queue and start a crawl for each URL.
// Fix: `nextURL` was an implicit global (no `var`) — declare it.
// NOTE(review): crawl() is asynchronous (casper.run returns before the
// page finishes), so this loop starts every crawl at once instead of
// one after another; chaining the next URL from crawl()'s run()
// callback is the reliable sequential approach.
var nextURL;
while (pendingUrls.length > 0) {
    nextURL = pendingUrls.shift();
    crawl(nextURL);
}
In my output, only one of the URLs in the pendingUrls list is crawled; the other one produces no output at all. Any ideas?