#!/usr/bin/perl # # Copyright (C) 2007 Peteris Krumins (peter@catonmat.net) # http://www.catonmat.net - good coders code, great reuse # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . use warnings; use strict; # # This program was written as a part of "reddit media: intelligent fun online" # website generator. # This website can be viewed here: http://redditmedia.com # # See http://www.catonmat.net/designing-reddit-media-website for more info. # use LWP::UserAgent; use HTML::TreeBuilder; use HTML::Entities; use URI; # # This script accesses reddit.com website and goes through all the pages # it can find, looking for link titles matching patterns specified # in %extract_patterns hash or domains specified in %extract_domains hash. # use constant VOTE_THRESHOLD => 10; # include only titles with at least this # much votes # These regex patterns match common picture and video reddit titles # notice the order of plural and singular. my @extract_patterns = ( # pattern type "[[(].*pictures.*[])]" => 'pictures', "[[(].*picture.*[])]" => 'picture', "[[(].*pics.*[])]" => 'pictures', "[[(].*pic.*[])]" => 'picture', "[[(].*images.*[])]" => 'pictures', "[[(].*image.*[])]" => 'picture', "[[(].*photos.*[])]" => 'pictures', "[[(].*photo.*[])]" => 'picture', "[[(].*comics.*[])]" => 'pictures', "[[(].*comic.*[])]" => 'picture', "[[(].*charts.*[])]" => 'pictures', "[[(].*chart.*[])]" => 'picture', "[[(].*vids.*[])]" => 'videos', "[[(].*vid.*[])]" => 'video', "[[(].*videos.*[])]" => 'videos', "[[(].*video.*[])]" => 'video', "[[(].*clips.*[])]" => 'videos', "[[(].*clip.*[])]" => 'video', "[[(].*films.*[])]" => 'videos', "[[(].*film.*[])]" => 'video', "[[(].*movies.*[])]" => 'videos', "[[(].*movie.*[])]" => 'video' ); # These regex patterns match domains which usually contain only images # and videos. my @extract_domains = ( # video sites 'youtube.com' => 'video', 'video.google.com' => 'video', 'liveleak.com' => 'video', 'break.com' => 'video', 'metacafe.com' => 'video', 'brightcove.com' => 'video', 'dailymotion.com' => 'video', 'dailymotion.alice.it' => 'video', 'flicklife.com' => 'video', 'flixya.com' => 'video', 'flurl.com' => 'video', 'gofish.com' => 'video', 'ifilm.com' => 'video', 'livevideo.com' => 'video', 'video.yahoo.com' => 'video', # image sites 'photobucket.com' => 'picture', 'photo.livevideo.com' => 'picture', 'flickr.com' => 'picture', 'xkcd.com' => 'picture' ); # compile regex extract pattern #my $joined_patterns = join '|', @extract_patterns; #my $c_extract_patterns = qr{$joined_patterns}i; # compile regex extract pattern #$joined_patterns = join '|', @extract_domains; #my $c_extract_domains = qr{$joined_patterns}i; my $pages_to_get = shift || 'all'; # exit successfully if we do not want any pages to be parsed exit 0 unless $pages_to_get; my $ua = LWP::UserAgent->new( agent => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Gecko/20070515 Firefox/2.0.0.4' ); my $reddit_page = get_page($ua, 'http://reddit.com'); extract_and_print($reddit_page); $pages_to_get-- if $pages_to_get =~ /\d+/; my $next_page; if ($pages_to_get eq 'all') { while ($next_page = get_next_page_url($reddit_page)) { $reddit_page = get_page($ua, $next_page); extract_and_print($reddit_page); } } else { while ($pages_to_get--) { $next_page = get_next_page_url($reddit_page); $reddit_page = get_page($ua, $next_page); extract_and_print($reddit_page); } } # # get_page # # Given an URL, the subroutine returns content of the resource located at URL. # die()s if getting the URL fails # sub get_page { my ($ua, $url) = @_; my $response = $ua->get($url); unless ($response->is_success) { die "Failed getting $url: ", $response->status_line; } return $response->content; } # # extract_and_print # # Subroutine takes html content and extracts reddit links, titles and domains # from the content. Then it tests each title and domain against # extract patterns (both domain and title). If matched, it will # print a human readable output in format: # # title (type, user, reddit id, url) # # The output is used by a script which imports it into sqlite database. # sub extract_and_print { my $content = shift or die "Error: no content provided to extract_and_print"; my @posts = extract_posts($content); my @to_print; POST: foreach my $post (@posts) { # naive algorithm, we don't care about complexity foreach my $idx (grep { $_ % 2 == 0 } 0..$#extract_patterns) { # foreach extract pattern if ($post->{title} =~ /$extract_patterns[$idx]/i) { push @to_print, { entry => $post, type => $extract_patterns[$idx+1] }; next POST; } } foreach my $idx (grep { $_ % 2 == 0 } 0..$#extract_domains) { my $uri = URI->new($post->{url}); my $host; next unless $uri->can('host'); $host = $uri->host; if ($host =~ /$extract_domains[$idx]/i) { push @to_print, { entry => $post, type => $extract_domains[$idx+1] }; next POST; } } } print_entries(\@to_print); } # # print_entries # # Given a arrayref of entries, prints one by one in our desired format. # sub print_entries { my $entries = shift; foreach (@$entries) { print "$_->{entry}->{title} ($_->{type}, $_->{entry}->{user}, $_->{entry}->{id}, $_->{entry}->{url})\n"; } } # # extract_posts # # Subroutine takes HTML content of reddit's page and returns an array # of hashes containing information about each post on the page. # sub extract_posts { my $content = shift or die "error: no content provided to extract_posts"; my @posts; my $tree = HTML::TreeBuilder->new; $tree->parse($content); # if we look how the reddit is made in FireBug, we see that each link # is a row of a big HTML table. # each row has 'class' attribute named 'evenRow' or 'oddRow' # my %post_entry; my @trs = $tree->look_down("_tag" => "tr", "class" => qr{evenRow|oddRow}); foreach my $tr (@trs) { my $link = $tr->look_down("_tag" => "a", "id" => qr{title\d+}); unless (defined $link) { # if link is not defined, it means that we have the row containing user info # and id of reddit url for comments, and score my ($ulink, $clink) = $tr->look_down("_tag" => "a"); next unless (defined $ulink and defined $clink); my $user = $ulink->as_text; $user =~ s/\s+$//g; $post_entry{user} = $user; my $chref = $clink->attr('href'); next unless $chref; if ($chref =~ m{/info/([^/]+)/comments}) { $post_entry{id} = $1; } my $score_span = $tr->look_down("_tag" => "span", id => qr{score\d+}); if ($score_span) { my $score_text = $score_span->as_text; if ($score_text =~ /(\d+) point/) { $post_entry{score} = $1; } } unless (exists $post_entry{score}) { # could be that the entry was posted less than hour ago, # then it has no score visible $post_entry{score} = 0; } if ($post_entry{score} >= VOTE_THRESHOLD) { push @posts, { %post_entry }; } next; } # get the title, strip leading spaces my $title = decode_entities($link->as_text); $title =~ s/^\s+//g; my $url = $link->attr('href'); $post_entry{title} = $title; $post_entry{url} = $url; } $tree->delete; return @posts; } # # get_next_page_url # # Given HTML content of a reddit page, extracts url to the next page. # sub get_next_page_url { my $content = shift or die "error: no content provided to get_next_page"; my $next_url; if ($content =~ m{next »}) { $next_url = "http://reddit.com" . $1; } return $next_url; }