WWW::Scraper::Lite is an HTTP scraper module written in Perl.
SYNOPSIS
# Crawl one site, following only links that stay on that site.
# Assumes WWW::Scraper::Lite has already been loaded (use WWW::Scraper::Lite;).
my $domain  = 'http://devsite.local/';
my $scraper = WWW::Scraper::Lite->new();

# As used here, crawl() is given the start URL and a hashref that maps XPath
# expressions to callbacks; each callback is handed the scraper object and an
# arrayref of the nodes matched by its XPath expression.
$scraper->crawl($domain,
    {
        '//a' => sub { # handler for all 'a' tags
            my ($scraper, $nodes) = @_;
            $scraper->enqueue(
                # only this domain; \Q...\E quotes the URL so that '.' and any
                # other regex metacharacters in it are matched literally
                grep { $_ =~ m{^\Q$domain\E} }
                map  { $scraper->url_remove_anchor($_) } # only index pages without #anchor
                map  { $scraper->url_make_absolute($_) } # indexer needs absolute URLs
                map  { $_->{href} }                      # pull href out of the 'a' DOM node
                @{$nodes}
            );
        },
        '/*' => sub { # handler for all content
            my ($scraper, $nodes) = @_;
            print $scraper->{current}->{response}->content; # do something useful with HTTP response
        },
    }
);
Product's homepage
Requirements:
· Perl
· strict
· warnings
· LWP::UserAgent
· HTML::TreeBuilder::XPath