#!/usr/bin/python # # Peteris Krumins (peter@catonmat.net) # http://www.catonmat.net -- good coders code, great reuse # # Released under GNU GPL # # Developed as a part of reddit top program. # Read how it was designed: # http://www.catonmat.net/blog/follow-reddit-from-the-console # import re import sys import time import socket import urllib2 import datetime from BeautifulSoup import BeautifulSoup version = "1.0" reddit_url = 'http://www.reddit.com/' subreddit_url = 'http://www.reddit.com/r/%s/' class RedesignError(Exception): """ An exception class thrown when it seems that Reddit has redesigned """ pass class SeriousError(Exception): """ An exception class thrown when something unexpected happened """ pass class Story(dict): """ Encapsulates the information about a single Reddit story. After the object is constructed it contains the following attributes: * position * reddit_name * id * title * url * user * score * human_time * unix_time * comments """ def __repr__(self): inner = ', '.join([repr(x) for x in (self.position, str(self.reddit_name), str(self.id), str(self.title), str(self.url), str(self.user), self.score, str(self.human_time), self.unix_time, self.comments)]) return ''.join(('{', inner, '}')) def stories_per_page(): """ Returns stories per single web page """ return 25 def get_stories(subreddit='front_page', pages=1, new=False): """ Finds all stories accross 'pages' pages on a 'subreddit' and returns a list of Story objects representing stories. If the 'subreddit' is 'front_page' gets stories from http://www.reddit.com/ Otherwise gets stories from http://www.reddit.com/r// If 'new' is True, gets new stories from http://www.reddit.com/new/ If 'new' is True and 'subreddit' is set, gets stories from http://www.reddit.com/r//new/ """ stories = [] if subreddit == 'front_page': url = reddit_url else: url = subreddit_url % subreddit if new: url += 'new' for i in range(pages): content = _get_page(url) entries = _extract_stories(content) stories.extend(entries) url = _get_next_page(content) if not url: break for pos, story in enumerate(stories): story.position = pos+1 story.reddit_name = subreddit return stories; def _extract_stories(content): """ Given an HTML page, extracts all the stories and returns a list of Story objects representing stories. """ stories = [] soup = BeautifulSoup(content) def tagline_finder(tag): if tag.name == 'p' and tag.parent and tag.parent.name == 'div': try: if tag['class'] == 'tagline' and tag.parent['class'] == 'entry': return True except KeyError: pass score_divs = soup.findAll('div', attrs = { 'class': re.compile(r'^score')}) title_as = soup.findAll('a', id=re.compile(r'^title_')) tagline_ps = soup.findAll(tagline_finder) comment_as = soup.findAll('a', id=re.compile(r'^comment_')) if not len(score_divs) == len(title_as) == len(tagline_ps) == len(comment_as): raise RedesignError, "lengths of score, title, tagline and comment lists do not match" for score_div, title_a, tagline_p, comment_a in zip(score_divs, title_as, tagline_ps, comment_as): score = score_div.string if not re.match(r'^\d+$', score): score = -1 else: score = int(score) title = title_a.string.strip() url = title_a['href'] if url.startswith('/'): url = reddit_url + url[1:] m = re.search(r'title_t\d_(.+)', title_a['id']) if not m: raise RedesignError, "title did not contain a reddit id" id = m.group(1) # posted_re = re.compile(r'\s+(.+)\s+ago') # huh, reddit has all over the place here posted_re = re.compile(r' (.+) ago') posted_text = tagline_p.find(text=posted_re) if not posted_text: raise RedesignError, "unable to extract 'ago' text" m = posted_re.search(posted_text); posted_ago = m.group(1) unix_time = _ago_to_unix(posted_ago) if not unix_time: raise RedesignError, "unable to extract story date" human_time = time.ctime(unix_time) user_a = tagline_p.find('a', href=re.compile(r'/user/')) if not user_a: raise RedesignError, "unable to find containing username" user = user_a.string m = re.search(r'(\d+) comment', comment_a.string) if not m: comments = 0 else: comments = int(m.group(1)) story = Story() story.id = id story.title = title.encode('utf8') story.url = url.encode('utf8') story.score = score story.comments = comments story.user = user.encode('utf8') story.unix_time = unix_time story.human_time = human_time.encode('utf8') stories.append(story) return stories def _ago_to_unix(ago): m = re.search(r'(\d+) (\w+)', ago, re.IGNORECASE) if not m: return 0 delta = int(m.group(1)) units = m.group(2) if not units.endswith('s'): # singular units += 's' # append 's' to make it plural if units == "months": units = "days" delta *= 30 # lets take 30 days in a month elif units == "years": units = "days" delta *= 365 dt = datetime.datetime.now() - datetime.timedelta(**{units: delta}) return int(time.mktime(dt.timetuple())) def _get_page(url, timeout=10): """ Gets and returns a web page at url with timeout 'timeout'. """ old_timeout = socket.setdefaulttimeout(timeout) request = urllib2.Request(url) request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)') try: response = urllib2.urlopen(request) content = response.read() except (urllib2.HTTPError, urllib2.URLError, socket.error, socket.sslerror), e: socket.setdefaulttimeout(old_timeout) raise SeriousError, e socket.setdefaulttimeout(old_timeout) return content def _get_next_page(content): soup = BeautifulSoup(content) a = soup.find(lambda tag: tag.name == 'a' and tag.string == 'next') if a: return reddit_url + a['href'][1:] def print_stories_paragraph(stories): """ Given a list of Stories, prints them out paragraph by paragraph """ for story in stories: print 'position:', story.position print 'reddit_name:', story.reddit_name print 'id:', story.id print 'title:', story.title print 'url:', story.url print 'score:', story.score print 'comments:', story.comments print 'user:', story.user print 'unix_time:', story.unix_time print 'human_time:', story.human_time print if __name__ == '__main__': from optparse import OptionParser description = "A program by Peteris Krumins (http://www.catonmat.net)" usage = "%prog [options]" parser = OptionParser(description=description, usage=usage) parser.add_option("-s", action="store", dest="subreddit", default="front_page", help="Subreddit to retrieve stories from. Default: front_page.") parser.add_option("-p", action="store", type="int", dest="pages", default=1, help="How many pages of stories to output. Default: 1.") parser.add_option("-n", action="store_true", dest="new", help="Retrieve new stories. Default: nope.") options, args = parser.parse_args() try: stories = get_stories(options.subreddit, options.pages, options.new) except RedesignError, e: print >>sys.stderr, "Reddit has redesigned: %s!" % e sys.exit(1) except SeriousError, e: print >>sys.stderr, "Serious error: %s!" % e sys.exit(1) print_stories_paragraph(stories)