The USERAGENT_DEBUG option showed connection errors; however, after going back to the cookbook example and modifying it some more, it now seems to be running and exiting OK:
# Shared user agent for all crawler workers: follow up to 8 redirects,
# identify as 'xyz', and give up after 15s of inactivity.
my $ua = Mojo::UserAgent->new;
$ua->max_redirects(8);
$ua->name('xyz');
$ua->inactivity_timeout(15);
# Delay object that keeps the event loop alive until every crawler has
# finished; each in-flight request is bracketed by $delay->begin/end.
my $delay = Mojo::IOLoop->delay(sub {
    # Bug fix: the original `my $delay = @_;` evaluated @_ in scalar
    # context, assigning the argument COUNT, not the delay object.
    my ($delay) = @_;
});
# Crawler: each invocation pulls one URL off @urls, fetches it, writes an
# XML record for the corpus, and re-schedules itself while URLs remain.
my $crawl;
$crawl = sub {
    my $id = shift;

    # If there are no more URLs in the array, remove crawler id.
    # Bug fix: this early return used to come AFTER $delay->begin, leaking
    # a begin without a matching end, which could make $delay->wait hang
    # once the URL list ran dry.
    # NOTE(review): $id is just 1..10 from the startup loop, not a real
    # event-loop handle — confirm Mojo::IOLoop->remove($id) is intended.
    return Mojo::IOLoop->remove($id) unless my $url_ref = shift @urls;

    # Each element of @urls is a single-pair hashref: { $url_id => $url }.
    my ($url_id, $url) = %$url_ref;

    $delay->begin;
    $ua->get($url => sub {
        my ($ua, $tx) = @_;
        if ($tx->success) {
            # Store destination URL (after redirects).
            my $uri = $tx->req->url;

            # Store title, or fall back to the URL if no title is found
            # (look the node up once instead of twice).
            my $title_node = $tx->res->dom->at('head > title');
            my $title = $title_node ? $title_node->text : $uri;

            # Clean out script and other extraneous tags we don't want
            # turned into text.
            my $content = $tx->res->body;
            $content =~ s!<(script|style|iframe)[^>]*>.*?</\1>!!gis;

            # Turn back into a DOM object to retrieve the visible text,
            # then strip punctuation that would pollute the corpus.
            my $clean_content = Mojo::DOM->new($content)->all_text;
            $clean_content =~ s![<>"',.&*\!$()^]! !g;

            # Collect h1/h2 headings.
            my $headings = '';
            $tx->res->dom('h1, h2')->each(sub {
                $headings .= shift->all_text . " ";
            });

            ## Update DB — write one XML record per fetched page.
            # Lexical filehandle + checked close (buffered write errors
            # only surface at close time).
            my $ts   = time;
            my $path = catfile($corpus_source, "$ts-$url_id.xml");
            open(my $xml_fh, '>:utf8', $path) or die "open $path: $!";
            print {$xml_fh} "<uri>$uri</uri><url_id>$url_id</url_id><title>$title</title><headings>$headings</headings><content>$clean_content</content>";
            close $xml_fh or die "close $path: $!";
        } else {
            # NOTE(review): list-context $tx->error is the pre-6.0
            # Mojolicious API; newer versions return a hashref — verify
            # against the installed Mojolicious version.
            my ($message, $code) = $tx->error;
            say $code ? "$code response: $message" : "Connection error: $message";
            ## Update DB
        }
        $delay->end;

        # Next - *only* if there's more to crawl.
        $crawl->($id) if @urls;
    });
};
# Start a bunch of parallel crawlers sharing the same user agent.
for my $worker_id (1 .. 10) {
    $crawl->($worker_id);
}

# Start the reactor if it is not already running.
$delay->wait unless Mojo::IOLoop->is_running;