#!/usr/bin/perl
#
# Copyright (C) 2007 Peteris Krumins (peter@catonmat.net)
# http://www.catonmat.net - good coders code, great reuse
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
use warnings;
use strict;
package ThumbExtractor;
#
# This package extracts thumbnail images for a given URL to a video or picture.
#
use LWP::UserAgent;
use HTML::Entities;
use HTML::TreeBuilder;
use File::MMagic;
use File::Temp 'mktemp';
use URI;
#
# Here are handlers for various video and image sites.
# The only way to extract a thumbnail from a video site is to analyze how the
# site itself displays its thumbnail.
#
# For video sites I wrote find_best_image in ImageCacher's package which finds
# the best image on the site.
#
# It is a very expensive function (it requires fetching all images, converting
# them to pnm format, calculating their areas, etc.).
#
# For the most popular sites (from top 10) I wrote handlers manually.
#
# Site dispatch table. It is kept as a flat list of (site => handler) pairs
# rather than a hash so that the order of the entries is preserved.
# get_thumbnail matches each site name against the host of the URL and calls
# the handler of the first site that matches.
my @thumb_handlers = (
    'youtube.com'      => \&_youtube_handler,
    'video.google.com' => \&_video_google_handler,
    'flickr.com'       => \&_flickr_handler,
    'metacafe.com'     => \&_metacafe_handler,
    'liveleak.com'     => \&_liveleak_handler,
    'xkcd.com'         => \&_xkcd_handler,
    'bestpicever.com'  => \&_bestpicever_handler,
    'blogger.com'      => \&_blogger_handler,
);
#
# Constructor. May be invoked on the class name or on an existing object, in
# which case the new object is blessed into that object's class.
#
# The object holds a single LWP::UserAgent under the "ua" key, configured
# with a fixed Firefox-like User-Agent string.
#
sub new {
    my ($proto) = @_;

    my $user_agent = LWP::UserAgent->new(
        agent => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Gecko/20070515 Firefox/2.0.0.4'
    );

    return bless { ua => $user_agent }, ref($proto) || $proto;
}
#
# get_thumbnail($url)
#
# Finds a thumbnail for $url and returns it as a ThumbExtractor::Thumb object,
# or undef when none could be determined.
#
# Lookup order:
#   1. If the host of the URL belongs to a site listed in @thumb_handlers,
#      that site's handler is called with ($url, $ua) and its result is
#      returned (undef when the handler found nothing).
#   2. Otherwise, if the URL ends in a known image extension, the resource is
#      downloaded and its type is checked with File::MMagic. When it really
#      is an image, the URL itself is returned, flagged as not being a
#      thumbnail.
#
# Failure is reported with an explicit undef (not an empty list), exactly as
# before, so callers in list context still receive one element.
#
sub get_thumbnail {
    my ($self, $url) = @_;

    my $host = $self->_get_host($url);
    return undef if !defined $host || $host eq '' || $host eq "unknown";

    # Find a handler for the host. @thumb_handlers is a flat list of
    # (site => handler) pairs, so walk it two elements at a time.
    # The site name is quoted (its dots are literal) and has to match the
    # whole host or a dot-separated suffix of it, so that e.g. "notxkcd.com"
    # does not trigger the "xkcd.com" handler.
    for (my $idx = 0; $idx < $#thumb_handlers; $idx += 2) {
        my ($site, $handler) = @thumb_handlers[$idx, $idx + 1];
        if ($host =~ /(?:\A|\.)\Q$site\E\z/i) {
            # call in scalar context so a handler's bare "return" becomes undef
            my $thumb = $handler->($url, $self->{ua});
            return $thumb;
        }
    }

    # There was no handler; only URLs ending in an image extension go on.
    return undef unless $url =~ /(?:jpg|jpeg|gif|png)$/i;

    # Some sites have URLs ending with an image extension that really serve
    # an HTML page, so download the resource and inspect what was sent.
    # Checking just the first chunk in memory did not work with File::MMagic,
    # hence the detour through a temporary file.
    my $tmp_file = $self->_get_temp_file();
    my $response = $self->{ua}->get($url, ':content_file' => $tmp_file);

    my $type;
    if ($response->is_success) {
        $type = File::MMagic->new->checktype_filename($tmp_file);
    }
    unlink $tmp_file;    # clean up whether or not the download succeeded

    if (defined $type && $type =~ /image/) {    # image, yumm, ok!
        return ThumbExtractor::Thumb->new($url, 0);
    }
    return undef;    # unknown url or not an image
}
#
# _get_page($ua, $url)
#
# Issues a GET request for $url through $ua and hands back the body of the
# response. Yields undef when the response is not successful.
#
sub _get_page {
    my ($ua, $url) = @_;

    my $response = $ua->get($url);
    return undef unless $response->is_success;

    return $response->content;
}
#
# _get_temp_file()
#
# Creates an empty temporary file named /tmp/imageTEXXXXXXXX (the X's are
# replaced with random characters) and returns its path. Removing the file
# is up to the caller.
#
# The file is created atomically with File::Temp::tempfile instead of only
# picking an unused name with mktemp: between mktemp returning a name and the
# caller opening it, another process could create that path (for example as
# a symlink).
#
sub _get_temp_file {
    my ($fh, $path) = File::Temp::tempfile('imageTEXXXXXXXX', DIR => '/tmp', UNLINK => 0);
    close $fh;    # only the path is needed; the caller reopens it by name
    return $path;
}
#
# I use regexes for extracting because parsing each tree would be much slower and would
# take me 5 times longer to write the code and I can't see any reason to do it.
# I want the site to be running asap ;)
#
#
# _youtube_handler($url)
#
# Builds the thumbnail URL for a YouTube watch URL such as
#   http://www.youtube.com/watch?v=qSNcVjpX-9Q&NR=1
# from the video id in its "v" query parameter. No page is fetched.
#
# Returns a ThumbExtractor::Thumb flagged as a thumbnail, or nothing when the
# URL carries no "v" parameter.
#
sub _youtube_handler {
    my ($url) = @_;

    # Require a parameter separator in front of "v=" so that parameters that
    # merely end in "v" (e.g. "prev=...") are not mistaken for the video id.
    if ($url =~ /[?&;]v=([A-Za-z0-9_-]+)/) {
        return ThumbExtractor::Thumb->new("http://img.youtube.com/vi/$1/1.jpg", 1);
    }
    return;
}
# _video_google_handler($url, $ua) - thumbnail handler for video.google.com.
#
# NOTE(review): the body of this sub is incomplete as it stands. $thumb_url is
# used without ever being declared or assigned, "new" is called without a
# class, $ua is never used, and there is one more closing brace than there
# are opening ones. Presumably the page fetch and the matching code were
# lost -- restore them from the original source before relying on this
# handler.
sub _video_google_handler {
my ($url, $ua) = @_;
# google video can either have their own thumbnail or youtube's thumbnail
#
#
new($thumb_url, 1);
}
return;
}
# _flickr_handler($url, $ua) - thumbnail handler for flickr.com URLs.
#
# - A URL matching /static.flickr.com/ is returned as-is, not flagged as a
#   thumbnail.
# - A URL containing id=<digits>, or failing that a /<digits>/ path segment,
#   is resolved through $flickr_extract with that number as the photo id.
# - Any other URL yields an empty return.
sub _flickr_handler {
my ($url, $ua) = @_;
# Fetches the photo_zoom.gne page (size=sq) for photo $id and builds a Thumb
# flagged as a thumbnail from it; returns nothing when the page cannot be
# fetched or does not match.
# NOTE(review): the pattern m{Download} has no capture group, yet $1 is used
# as the thumbnail URL. The pattern looks truncated -- TODO confirm against
# the original source.
my $flickr_extract = sub {
my $id = shift;
my $content = _get_page($ua, "http://flickr.com/photo_zoom.gne?id=$id&size=sq");
if (defined $content) {
if ($content =~ m{Download}) {
return ThumbExtractor::Thumb->new($1, 1);
}
return;
}
return;
};
# NOTE(review): the dots in this pattern are unescaped, so each one matches
# any character.
if ($url =~ /static.flickr.com/) {
return ThumbExtractor::Thumb->new($url, 0); # not a thumb yet
}
elsif ($url =~ /id=(\d+)/) {
# http://flickr.com/photo_zoom.gne?id=346049991&size=sq
return $flickr_extract->($1);
}
elsif ($url =~ m{/(\d+)/}) {
# http://www.flickr.com/photos/kielbryant/118020322/in/set-72057594137096110/
return $flickr_extract->($1);
}
return;
}
#
# _metacafe_handler($url)
#
# Builds the thumbnail URL for a Metacafe URL of the form
# metacafe.com/watch/<id> or metacafe.com/w/<id> from the numeric id.
# No page is fetched.
#
# Returns a ThumbExtractor::Thumb flagged as a thumbnail, or nothing when the
# URL has neither form.
#
sub _metacafe_handler {
    my ($url) = @_;

    # Both URL forms carry the numeric id right after the path prefix, so a
    # single pattern covers them; the dot of the host name is literal.
    if ($url =~ m{metacafe\.com/(?:watch|w)/(\d+)}) {
        return ThumbExtractor::Thumb->new("http://www.metacafe.com/thumb/$1.jpg", 1);
    }
    return;
}
# _liveleak_handler($url, $ua) - thumbnail handler for liveleak.com.
#
# Fetches the page at $url and, when the pattern matches, returns a Thumb
# built from $1 and flagged as a thumbnail; otherwise returns nothing.
#
# NOTE(review): the pattern m{} is empty and contains no capture group, so $1
# cannot hold a URL taken from the page. (An empty pattern makes Perl reuse
# the last successfully matched pattern.) The pattern looks truncated --
# TODO restore it from the original source.
sub _liveleak_handler {
my ($url, $ua) = @_;
my $content = _get_page($ua, $url);
if (defined $content) {
if ($content =~ m{}) {
return ThumbExtractor::Thumb->new($1, 1);
}
}
return;
}
# _xkcd_handler($url, $ua) - thumbnail handler for xkcd.com.
#
# Fetches the page at $url and returns undef when that fails.
#
# NOTE(review): the rest of the body is damaged. The m{ pattern opened on the
# "if" line is not closed on that line, so as written everything up to the
# next "}" is taken as the pattern and the "if (" is never closed.
# Presumably the pattern and the ThumbExtractor::Thumb-> class name in front
# of "new" were lost -- TODO restore them from the original source.
sub _xkcd_handler {
my ($url, $ua) = @_;
my $content = _get_page($ua, $url);
return undef unless defined $content;
if ($content =~ m{
new($1, 0);
}
return;
}
#
# _bestpicever_handler($url, $ua)
#
# Fetches the page at $url, parses it into an HTML tree and takes the "src"
# of the first <img> found inside the <div id="img-holder-reg"> element.
#
# Returns a ThumbExtractor::Thumb that is not flagged as a thumbnail, or
# nothing when the page cannot be fetched, the div or the image is missing,
# or the image has no "src" attribute.
#
sub _bestpicever_handler {
    my ($url, $ua) = @_;

    my $content = _get_page($ua, $url);
    return unless defined $content;

    my $tree = HTML::TreeBuilder->new;
    $tree->parse($content);
    $tree->eof;    # parse() alone leaves the document open; eof completes the tree

    my $img_url;
    my $holder = $tree->look_down(_tag => 'div', id => 'img-holder-reg');
    if (defined $holder) {
        my $img = $holder->look_down(_tag => 'img');
        $img_url = $img->attr('src') if defined $img;
    }
    $tree->delete;    # dispose of the tree on every path before returning

    return unless defined $img_url;
    return ThumbExtractor::Thumb->new($img_url, 0);
}
#
# _blogger_handler($url, $ua)
#
# Downloads the page at $url and takes the target of the first
# img src="..." attribute found in it as the image.
#
# Returns a ThumbExtractor::Thumb that is not flagged as a thumbnail, undef
# when the page could not be fetched, and nothing when the page contains no
# such attribute.
#
sub _blogger_handler {
    my ($url, $ua) = @_;

    my $page = _get_page($ua, $url);
    defined $page or return undef;

    return unless $page =~ /img src="(.+?)"/;
    return ThumbExtractor::Thumb->new($1, 0);
}
#
# _get_host($url)
#
# Returns the host part of $url. Returns the string "unknown" when the URI
# object built from $url has no "host" method, or when the URL carries no
# host at all (the host is undefined or empty), so that callers always get a
# defined, non-empty string back.
#
sub _get_host {
    my ($self, $url) = @_;

    my $uri = URI->new($url);
    if ($uri->can('host')) {
        my $host = $uri->host;
        return $host if defined $host && length $host;
    }
    return "unknown";
}
package ThumbExtractor::Thumb;
#
# Small value object describing one extracted image: its URL and a flag
# telling whether that URL is already a thumbnail.
#

# new($url, $is_thumb) - stores both arguments unchanged.
sub new {
    my ($class, $url, $is_thumb) = @_;
    return bless { url => $url, is_thumb => $is_thumb }, $class;
}

# Returns the flag that was given to the constructor.
sub is_thumb {
    return $_[0]->{is_thumb};
}

# Returns the URL that was given to the constructor.
sub url {
    return $_[0]->{url};
}
1;