#!/usr/bin/perl
#
# Copyright (C) 2007 Peteris Krumins (peter@catonmat.net)
# http://www.catonmat.net - good coders code, great reuse
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
use warnings;
use strict;
#
# This program was written as a part of "digpicz: the missing digg's picture section"
# website generator.
# This website can be viewed here: http://digpicz.com
#
# See http://www.catonmat.net/designing-digg-picture-website for more info.
#
use LWP;
use POSIX;
use XML::Simple;
binmode(STDOUT, ":utf8");
# Each request to Digg API requires an appkey which is a a valid absolute URI
# that identifies the application making the request.
# This constant defines that key.
# Read more: http://apidoc.digg.com/ApplicationKeys
#
use constant DIGG_APPKEY => 'http://digpicz.com';
# This constant defines now many votes a digg post has had to have received
# to be included in the results. Use 0 to include all posts.
#
#use constant VOTE_THRESHOLD => 60;
use constant ITEMS_PER_REQUEST => 15;
# These regex patterns match common picture titles on Digg.
# It's an array to maintain the order of plural regexes vs. singular regexes.
#
my @extract_patterns = (
# pattern type
"[[(].*pictures.*[])]" => 'pictures',
"[[(].*picture.*[])]" => 'picture',
"[[(].*pics.*[])]" => 'pictures',
"[[(].*pic.*[])]" => 'picture',
"[[(].*images.*[])]" => 'pictures',
"[[(].*image.*[])]" => 'picture',
"[[(].*photos.*[])]" => 'pictures',
"[[(].*photo.*[])]" => 'picture',
"[[(].*comics.*[])]" => 'pictures',
"[[(].*comic.*[])]" => 'picture',
"[[(].*charts.*[])]" => 'pictures',
"[[(].*chart.*[])]" => 'picture',
);
# These regex patterns match domains which usually contain only images
# and videos.
my @extract_domains = (
'photobucket.com' => 'picture',
'photo.livevideo.com' => 'picture',
'flickr.com' => 'picture',
'xkcd.com' => 'picture'
);
my $ua = LWP::UserAgent->new(
agent => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Gecko/20070515 Firefox/2.0.0.4'
);
my $reqs_to_get = shift || 'all'; # number of requests to get (ITEMS_PER_REQUEST per request)
my $xml_posts = get_posts($ua, 1);
my $posts = parse_posts($xml_posts);
extract_and_print($posts->{story});
$reqs_to_get-- if $reqs_to_get =~ /\d+/;
my $offset = 2;
if ($reqs_to_get eq 'all') {
do {
$xml_posts = get_posts($ua, $offset++);
$posts = parse_posts($xml_posts);
extract_and_print($posts->{story});
} while (exists $posts->{story} and @{$posts->{story}});
}
else {
while ($reqs_to_get--) {
# note: it doesn't matter that we duplicate code, this program is so small
# that i typed it in matter of minutes
$xml_posts = get_posts($ua, $offset++);
$posts = parse_posts($xml_posts);
extract_and_print($posts->{story});
exit 0 unless exists $posts->{story} and @{$posts->{story}};
}
}
#
# extract_and_print
#
# Given a hashref data structure of posts, find posts matching @extract_patterns and
# @extract_domains and prints them out
#
sub extract_and_print {
my $posts = shift;
my @to_print;
POST:
foreach my $post (@$posts) { # naive algorithm, we don't care about complexity
foreach my $idx (grep { $_ % 2 == 0 } 0..$#extract_patterns) {
# foreach extract pattern
if ($post->{title} =~ /$extract_patterns[$idx]/i ||
$post->{description} =~ /$extract_patterns[$idx]/i)
{
push @to_print, {
entry => $post,
type => $extract_patterns[$idx+1]
};
next POST;
}
}
foreach my $idx (grep { $_ % 2 == 0 } 0..$#extract_domains) {
my $uri = URI->new($post->{link});
my $host;
next unless $uri->can('host');
$host = $uri->host;
if ($host =~ /$extract_domains[$idx]/i) {
push @to_print, {
entry => $post,
type => $extract_domains[$idx+1]
};
next POST;
}
}
}
print_entries(\@to_print);
}
#
# print_entries
#
# Given a arrayref of entries, prints one by one in our desired format.
# The format is:
# title: story title
# type: story type
# desc: story description
# url: story url
# digg_url: url to original story on digg
# category: digg category of the story
# short_category: short cateogry name
# user: username
# user_pic: url to user pic
# date: date story appeared on digg YYYY-MM-DD HH::MM::SS
#
#
sub print_entries {
my $entries = shift;
foreach (@$entries) {
print "title: $_->{entry}->{title}\n";
print "type: $_->{type}\n";
print "desc: $_->{entry}->{description}\n";
print "url: $_->{entry}->{link}\n";
print "digg_url: $_->{entry}->{href}\n";
print "category: $_->{entry}->{topic}->{name}\n";
print "short_category: $_->{entry}->{topic}->{short_name}\n";
print "user: $_->{entry}->{user}->{name}\n";
print "user_pic: $_->{entry}->{user}->{icon}\n";
print "date: " . strftime("%Y-%m-%d %H:%M:%S", localtime $_->{entry}->{promote_date}) . "\n";
print "\n";
}
}
#
# parse_posts
#
# Given XML posts, returns a hashref data structure with them
#
sub parse_posts {
my $xml = shift;
return XMLin($xml, KeyAttr => [], ForceArray => ['story']);
}
#
# get_posts
#
# Gets front page ITEMS_PER_REQUEST posts at (offset - 1) * ITEMS_PER_REQUEST
#
sub get_posts {
my ($ua, $offset) = @_;
my $service_url = "http://services.digg.com/stories/popular";
$service_url .= "?appkey=" . DIGG_APPKEY;
$service_url .= "&offset=" . ($offset - 1) * ITEMS_PER_REQUEST;
$service_url .= "&count=" . ITEMS_PER_REQUEST;
return get_page($ua, $service_url);
}
#
# get_page
#
# Given an URL, the subroutine returns content of the resource located at URL.
# die()s if getting the URL fails
#
sub get_page {
my ($ua, $url) = @_;
my $response = $ua->get($url);
unless ($response->is_success) {
die "Failed getting $url: ", $response->status_line;
}
return $response->content;
}