#!/usr/bin/perl
#
# Copyright (C) 2007 Peteris Krumins (peter@catonmat.net)
# http://www.catonmat.net - good coders code, great reuse
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
use warnings;
use strict;
#
# This program scrapes given site(s) extracting posts matching
# given pattern(s)
#
# A new site can be added by writing a plugin for it.
#
# Patterns are specified as the last argument which should be
# path to the file containing the patterns.
#
# This program is a part of picurls.com website data miner.
# More about it at:
# http://www.catonmat.net/blog/making-of-picurls-popurls-for-pictures-part-one
#
use File::Find;
# Map of site name => plugin package, discovered at startup.
# Kept at file scope because print_sites() reads it.
my %plugins = load_plugins();

# No arguments at all: show the help text and signal misuse.
unless (@ARGV) {
    usage();
    exit 1;
}

if ($ARGV[0] eq "--sites") {
    print_sites();
    exit 0;
}
elsif ($ARGV[0] eq "--help") {
    usage();
    exit 0;
}

# %sites maps a site name to a list of scrape requests; each request is a
# hashref with 'pages' (number of pages to crawl) and optionally 'vars'
# (hashref of plugin variables).
my %sites;
my $pattern_file;

foreach my $idx (0 .. $#ARGV) {
    my $arg = $ARGV[$idx];

    # The last argument is taken as the pattern file if it is readable.
    if ($idx == $#ARGV && -r $arg) {
        $pattern_file = $arg;
        last;
    }

    # Argument format: site[:M][:{var1=val1; var2=val2}]
    #
    # A trailing '{...}' block is detached before splitting on ':' so that
    # variable values may themselves contain colons (URLs, for instance).
    my @parts;
    if ($arg =~ /^(.*?):\s*(\{.*\})\s*$/s) {
        my ($head, $var_block) = ($1, $2);
        @parts = split /:\s*/, $head;
        @parts = ('') unless @parts;    # keep the (empty) site slot
        push @parts, $var_block;
    }
    else {
        @parts = split /:\s*/, $arg;
    }

    my $site = $parts[0];

    if (@parts == 1) {                  # just the site specified
        push @{ $sites{$site} }, { pages => 1 };
    }
    elsif (@parts == 2) {               # either page count or variables
        if ($parts[1] =~ /\{\s*(.+)\s*\}/) {
            my %vars = parse_vars($1);
            push @{ $sites{$site} }, {
                pages => 1,
                vars  => \%vars,
            };
        }
        elsif ($parts[1] =~ /^\d+$/) {
            push @{ $sites{$site} }, { pages => $parts[1] };
        }
        else {
            print STDERR "Invalid argument: '$arg'. Ignoring!\n";
        }
    }
    elsif (@parts == 3) {               # page count and variables
        unless ($parts[1] =~ /^\d+$/) {
            print STDERR "Invalid argument: '$arg'. Ignoring!\n";
            next;
        }
        if ($parts[2] =~ /\{\s*(.+)\s*\}/) {
            my %vars = parse_vars($1);
            push @{ $sites{$site} }, {
                pages => $parts[1],
                vars  => \%vars,
            };
        }
        else {
            print STDERR "Invalid argument: '$arg'. Ignoring!\n";
        }
    }
    else {
        print STDERR "Invalid argument: '$arg'. Ignoring!\n";
    }
}

# Nothing usable was given on the command line.
unless (keys %sites) {
    usage();
    exit 1;
}

# Refuse to start unless every requested site has a plugin. This is a
# diagnostic, so it goes to STDERR (newline-terminated) like the others.
foreach my $site (sort keys %sites) {
    unless (exists $plugins{$site}) {
        print STDERR "Plugin for '$site' does not exist!\n";
        exit 1;
    }
}

# Run one scraper per request; sites are visited in sorted order so that
# runs are reproducible.
foreach my $site (sort keys %sites) {
    foreach my $entry (@{ $sites{$site} }) {
        my $scraper = $plugins{$site}->new(
            pages        => $entry->{pages},
            vars         => $entry->{vars} || {},
            pattern_file => $pattern_file
        );
        $scraper->scrape_verbose();
    }
}
#
# parse_vars
#
# Parses a string in the format 'var1=val1; var2=val2' and returns a hash
# of var => val pairs.
#
# Entries are separated by ';'. Within an entry, the name is a run of word
# characters before the first '=', and the value is everything after it
# with surrounding whitespace removed. Entries that do not look like
# 'name=value' are reported on STDERR and skipped, so they never produce
# an undefined key or value. Later duplicates overwrite earlier ones.
#
sub parse_vars {
    my $varvals = shift;
    my %rethash;

    return %rethash unless defined $varvals;

    foreach my $pair (split /\s*;\s*/, $varvals) {
        # A leading ';' yields an empty field; that is not an error.
        next unless $pair =~ /\S/;

        my ($var, $val) = $pair =~ /(\w+)\s*=(.+)/;
        unless (defined $var) {
            print STDERR "Invalid variable: '$pair'. Ignoring!\n";
            next;
        }

        $val =~ s/^\s+|\s+$//g;
        $rethash{$var} = $val;
    }

    return %rethash;
}
#
# load_plugins
#
# Loads the existing site plugins.
#
# Searches the 'sites' directory (relative to the current directory) for
# '*.pm' files, skipping any whose name contains 'scraper.pm', and requires
# each one as package 'sites::<basename>'. Every plugin that loads is asked
# for its name via the class method site_name().
#
# Returns a hash of site name => plugin package name.
#
# A plugin that fails to compile, lacks site_name(), or returns an undefined
# name is reported on STDERR and skipped; it never aborts the program.
#
sub load_plugins {
    my @plugin_files;
    my %ret_plugs;

    # $_ is the file's base name inside the File::Find callback.
    find(sub { push @plugin_files, $_ if /\.pm$/ && !/scraper\.pm/ }, 'sites');

    # Sorted so that, if two plugins claim the same site name, the winner
    # does not depend on directory order.
    foreach my $file (sort @plugin_files) {
        (my $module = $file) =~ s/\.pm$//;
        my $package = 'sites::' . $module;

        # Block eval with a path-form require: nothing taken from a file
        # name is ever compiled as Perl code, and a failure in either the
        # require or the site_name() call is caught here.
        my $site_name;
        my $loaded = eval {
            require "sites/$file";
            $site_name = $package->site_name;
            1;
        };

        unless ($loaded) {
            my $error = $@ || 'unknown error';
            print STDERR "Failed loading $package: $error\n";
            next;
        }
        unless (defined $site_name) {
            print STDERR "Failed loading $package: site_name returned no name\n";
            next;
        }

        $ret_plugs{$site_name} = $package;
    }

    return %ret_plugs;
}
#
# print_sites
#
# Writes a heading line followed by the names of all loaded site plugins,
# sorted and separated by single spaces, to standard output.
#
sub print_sites {
    my @site_names = sort keys %plugins;
    my $listing    = join ' ', @site_names;

    print "Available sites:\n" . $listing . "\n";
}
#
# usage
#
# Prints the program's usage to standard output.
#
# The first usage line spells out the per-site argument format
# site[:M][:{var1=val1; var2=val2 ...}], which the explanatory lines about
# "M" and "{ }" refer to.
#
sub usage {
    print "Program by Peteris Krumins (peter\@catonmat.net)\n";
    print "http://www.catonmat.net - good coders code, great reuse\n";
    print "\n";
    print "Usage: $0 <site[:M][:{var1=val1; var2=val2 ...}]> ... [/path/to/pattern_file]\n";
    print "Crawls given sites extracting entries matching optional patterns in pattern_file\n";
    print "Optional argument M specifies how many pages to crawl, default 1\n";
    print "Arguments (variables) for plugins can be passed via an optional { }\n";
    print "\nor\n";
    print "Usage: $0 [--sites|--help]\n";
    print "Prints all installed site plugins (--sites), or prints this message (--help)\n";
    print "\n";
}