#!/usr/bin/python
# 
# Created by Peter Krumins (peter@catonmat.net, @pkrumins on twitter)
# www.catonmat.net -- good coders code, great coders reuse
#
# Released under GNU GPL
#
# Developed as a part of reddit top program.
# Read how it was designed:
# http://www.catonmat.net/blog/follow-reddit-from-the-console
#

import re
import sys
import time
import socket
import urllib2
import datetime
from BeautifulSoup import BeautifulSoup

# Version string of this module.
version = "1.0"

# Address of the Reddit front page; also used as the prefix when turning
# site-relative links ('/...') into absolute URLs.
reddit_url = 'http://www.reddit.com/'
# Address template of a subreddit; '%s' is replaced with the subreddit name.
subreddit_url = 'http://www.reddit.com/r/%s/'

class RedesignError(Exception):
    """
    Raised when a page does not have the structure the parser expects,
    which most likely means that Reddit has changed its HTML.
    """

class SeriousError(Exception):
    """
    Raised when something unexpected goes wrong, for example when a page
    cannot be downloaded.
    """

class Story(dict):
    """
    Encapsulates the information about a single Reddit story.

    The information is kept in plain instance attributes (not in dict
    items) that are assigned by the code building the story:
    * position
    * reddit_name
    * id
    * title
    * url
    * user
    * score
    * human_time
    * unix_time
    * comments

    A freshly constructed Story has none of them set.
    """

    # Attributes in the order __repr__ prints them. The flag tells whether
    # the value is passed through str() before being repr()'ed.
    _repr_fields = (
        ('position', False),
        ('reddit_name', True),
        ('id', True),
        ('title', True),
        ('url', True),
        ('user', True),
        ('score', False),
        ('human_time', True),
        ('unix_time', False),
        ('comments', False),
    )

    def __repr__(self):
        """
        Returns '{position, reddit_name, id, title, url, user, score,
        human_time, unix_time, comments}' with every value repr()'ed.

        Attributes that have not been assigned yet are shown as None
        instead of raising AttributeError, so that a partially filled
        story can still be printed while debugging.
        """
        parts = []
        for name, as_str in self._repr_fields:
            try:
                value = getattr(self, name)
            except AttributeError:
                parts.append(repr(None))
                continue
            if as_str:
                value = str(value)
            parts.append(repr(value))
        return ''.join(('{', ', '.join(parts), '}'))

def stories_per_page():
    """ Tells how many stories a single web page holds """
    per_page = 25
    return per_page

def get_stories(subreddit='front_page', pages=1, new=False):
    """
    Collects the stories found across up to 'pages' pages of a 'subreddit'
    and returns them as a list of Story objects.

    A 'subreddit' of 'front_page' means http://www.reddit.com/, any other
    value means http://www.reddit.com/r/<subreddit>/

    With 'new' set to True the new stories are retrieved instead, that is
    http://www.reddit.com/new/ or http://www.reddit.com/r/<subreddit>/new/

    Paging stops early if a page has no link to a next page. Each returned
    story gets its 1-based 'position' in the list and 'reddit_name' set to
    'subreddit'.
    """

    if subreddit != 'front_page':
        page_url = subreddit_url % subreddit
    else:
        page_url = reddit_url
    if new:
        page_url = page_url + 'new'

    collected = []
    for _ in range(pages):
        html = _get_page(page_url)
        collected += _extract_stories(html)
        page_url = _get_next_page(html)
        if not page_url:
            # no 'next' link - this was the last page
            break

    for index, story in enumerate(collected):
        story.reddit_name = subreddit
        story.position = index + 1

    return collected

def _extract_stories(content):
    """
    Given an HTML page, extracts all the stories and returns a list of Story
    objects representing stories.

    Four parallel lists of tags are collected from the page (score <div>s,
    title <a>s, tagline <p>s and comment <a>s); the n-th element of each
    list is taken to describe the n-th story.

    The returned stories have id, title, url, score, comments, user,
    unix_time and human_time set. The position and reddit_name attributes
    are not set here.

    Raises RedesignError if the lists differ in length or if an expected
    piece of markup is missing.
    """

    # reddit puts literal '&#32;' entities around the words of the tagline
    # rather than plain whitespace; compiled once here, not once per story
    posted_re = re.compile(r'&#32;(.+)&#32;ago')

    stories = []
    soup = BeautifulSoup(content)

    def tagline_finder(tag):
        # matches <p class="tagline"> located directly in <div class="entry">
        if tag.name == 'p' and tag.parent and tag.parent.name == 'div':
            try:
                if tag['class'] == 'tagline' and tag.parent['class'] == 'entry':
                    return True
            except KeyError:
                # one of the two tags has no class attribute
                pass
        return False

    score_divs = soup.findAll('div', attrs = { 'class': re.compile(r'^score')})
    title_as = soup.findAll('a', id=re.compile(r'^title_'))
    tagline_ps = soup.findAll(tagline_finder)
    comment_as = soup.findAll('a', id=re.compile(r'^comment_'))

    if not len(score_divs) == len(title_as) == len(tagline_ps) == len(comment_as):
        raise RedesignError("lengths of score, title, tagline and comment lists do not match")

    for score_div, title_a, tagline_p, comment_a in zip(score_divs, title_as, tagline_ps, comment_as):
        # NOTE: BeautifulSoup's .string can be None (e.g. for a tag that
        # holds nested markup), so it is checked before being used as text.
        score_text = score_div.string
        if score_text and re.match(r'^\d+$', score_text):
            score = int(score_text)
        else:
            # score is not a plain number (or is missing altogether)
            score = -1

        if title_a.string is None:
            raise RedesignError("unable to extract story title")
        title = title_a.string.strip()
        url = title_a['href']
        if url.startswith('/'):
            # site-relative link, make it absolute
            url = reddit_url + url[1:]

        m = re.search(r'title_t\d_(.+)', title_a['id'])
        if not m:
            raise RedesignError("title did not contain a reddit id")
        story_id = m.group(1)

        posted_text = tagline_p.find(text=posted_re)
        if not posted_text:
            raise RedesignError("unable to extract 'ago' text")
        posted_ago = posted_re.search(posted_text).group(1)
        unix_time = _ago_to_unix(posted_ago)
        if not unix_time:
            raise RedesignError("unable to extract story date")
        human_time = time.ctime(unix_time)

        user_a = tagline_p.find('a', href=re.compile(r'/user/'))
        if not user_a or user_a.string is None:
            raise RedesignError("unable to find <a> containing username")
        user = user_a.string

        comment_text = comment_a.string
        m = None
        if comment_text:
            m = re.search(r'(\d+) comment', comment_text)
        if m:
            comments = int(m.group(1))
        else:
            # the link text carries no count, so the story has no comments
            comments = 0

        story = Story()
        story.id = story_id
        story.title = title.encode('utf8')
        story.url = url.encode('utf8')
        story.score = score
        story.comments = comments
        story.user = user.encode('utf8')
        story.unix_time = unix_time
        story.human_time = human_time.encode('utf8')

        stories.append(story)

    return stories

def _ago_to_unix(ago):
    """
    Converts a relative age such as '5 hours' or '1 month' to an
    approximate unix timestamp by subtracting it from the current local time.

    The first '<number> <unit>' pair found in 'ago' is used. The unit may be
    singular or plural and is matched case-insensitively. A month is counted
    as 30 days and a year as 365 days.

    Returns 0 if 'ago' holds no such pair, if the unit is not recognized or
    if the resulting date cannot be represented.
    """
    m = re.search(r'(\d+) (\w+)', ago)
    if not m:
        return 0

    delta = int(m.group(1))
    units = m.group(2).lower()

    if not units.endswith('s'): # singular
        units += 's' # append 's' to make it plural

    if units == "months":
        units = "days"
        delta *= 30        # lets take 30 days in a month
    elif units == "years":
        units = "days"
        delta *= 365
    elif units not in ('microseconds', 'milliseconds', 'seconds',
                       'minutes', 'hours', 'days', 'weeks'):
        # not a keyword datetime.timedelta understands
        return 0

    try:
        # str() because keyword names taken from unicode input are rejected
        # by old Python 2 releases; the unit is plain ASCII at this point
        dt = datetime.datetime.now() - datetime.timedelta(**{str(units): delta})
        return int(time.mktime(dt.timetuple()))
    except (OverflowError, ValueError):
        # the age is too large to be turned into a date
        return 0

def _get_page(url, timeout=10):
    """
    Gets and returns a web page at url with timeout 'timeout'.

    The timeout (in seconds) is applied by temporarily changing the
    process-wide default socket timeout; the previous default is put back
    before returning, whether the download succeeded or not.

    Returns the body of the response as read from the connection.

    Raises SeriousError, wrapping the original exception, if the page
    can't be retrieved.
    """

    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')

    # socket.setdefaulttimeout() returns None, so the current value has to
    # be read explicitly to be able to restore it afterwards
    old_timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(timeout)

    try:
        response = urllib2.urlopen(request)
        try:
            content = response.read()
        finally:
            # don't leave the connection open
            response.close()
    except (urllib2.HTTPError, urllib2.URLError, socket.error, socket.sslerror) as e:
        raise SeriousError(e)
    finally:
        socket.setdefaulttimeout(old_timeout)

    return content

def _get_next_page(content):
    """
    Given an HTML page, finds the <a> whose text is 'next' and returns the
    URL it points to.

    A site-relative link ('/...') is made absolute by prefixing it with
    reddit_url; any other link is returned unchanged.

    Returns None if the page has no such link or the link has no href.
    """
    soup = BeautifulSoup(content)
    a = soup.find(lambda tag: tag.name == 'a' and tag.string == 'next')
    if not a:
        return None

    href = a.get('href')
    if not href:
        return None
    if href.startswith('/'):
        return reddit_url + href[1:]
    # already absolute - cutting off the first character would corrupt it
    return href

def print_stories_paragraph(stories):
    """
    Writes each story of 'stories' to standard output as a paragraph of
    'name: value' lines, one line per attribute, followed by an empty line.
    """

    # attribute names in the order they are printed
    fields = ('position', 'reddit_name', 'id', 'title', 'url', 'score',
              'comments', 'user', 'unix_time', 'human_time')

    for story in stories:
        for field in fields:
            print('%s: %s' % (field, getattr(story, field)))
        print('')

if __name__ == '__main__':
    # Command line entry point: parse the options, fetch the stories and
    # print them. Any failure is reported on stderr with exit status 1.
    from optparse import OptionParser

    parser = OptionParser(
        usage="%prog [options]",
        description="A program by Peteris Krumins (http://www.catonmat.net)")
    parser.add_option(
        "-s", dest="subreddit", action="store", default="front_page",
        help="Subreddit to retrieve stories from. Default: front_page.")
    parser.add_option(
        "-p", dest="pages", action="store", type="int", default=1,
        help="How many pages of stories to output. Default: 1.")
    parser.add_option(
        "-n", dest="new", action="store_true",
        help="Retrieve new stories. Default: nope.")
    options, args = parser.parse_args()

    try:
        stories = get_stories(options.subreddit, options.pages, options.new)
    except (RedesignError, SeriousError) as e:
        if isinstance(e, RedesignError):
            template = "Reddit has redesigned: %s!"
        else:
            template = "Serious error: %s!"
        sys.stderr.write(template % e + "\n")
        sys.exit(1)

    print_stories_paragraph(stories)

