#!/usr/bin/python
# -*- coding: iso-8859-1 -*-
#
# ASPN Python cookbook scraper 1.0.3
#
# Purpose:
#   ASPN Python cookbook is a great site with tons of recipes for Python.
#   http://aspn.activestate.com/ASPN/Cookbook/Python
#   Shame is: You cannot browse these snippets offline.
#   This program downloads all the recipes and packs them in a single html file.
#   Nice for offline browsing while travelling, or taking on a USB key.
#   This is also handy for quick full-text search.
#
# License:
#   This program is in public domain.
#
# Author:
#   Sébastien SAUVAGE, webmaster of http://sebsauvage.net
#
#
import re
import time
import urllib

# Single-argument parenthesised print behaves the same as the print
# statement on Python 2 and keeps the line parseable on Python 3.
print('ASPN Python cookbook scraper 1.0.3')

# In case of proxy, uncomment the following line (or define the HTTP_PROXY
# variable in your environment.)
#import os; os.environ['HTTP_PROXY'] = 'http://proxy.myisp.com:3128'

recipeids = []  # Recipe identifiers.

# STEP 1 : Get the identifier of Python recipes by browsing the pages.
#
# NOTE(review): the pattern below is empty in this copy of the file --
# presumably the original expression was lost when the source was extracted.
# With an empty pattern findall() yields only empty strings, so the int()
# conversion further down raises ValueError. Restore the real pattern
# (it must capture the numeric recipe id) before running.
re_recipeids = re.compile(r'',re.IGNORECASE|re.DOTALL)

# query_start is the 1-based index of the first recipe on a listing page;
# the loop advances it by 20 per page.
currentPage = 1
while True:
    # Floor division keeps the page number an integer on every Python version.
    print("Getting recipes page %d" % ((currentPage-1)//20+1))
    listing = urllib.urlopen('http://aspn.activestate.com/ASPN/Cookbook/Python?query_start=%d' % currentPage)
    try:
        # Read at most 500000 bytes of the listing page.
        page = listing.read(500000)
    finally:
        # Always release the connection, even if the read fails.
        listing.close()
    recipeids += re_recipeids.findall(page)
    # The last listing page is the one without a "next >>" link.
    if "next >>" not in page:
        break
    currentPage += 20

# Make sure we have unique recipeids (as integers, in ascending order):
recipeids = sorted(set(int(i) for i in recipeids))
print("I got %s recipes." % len(recipeids))

# STEP 2 : Get each recipe (title+body+comments)
recipes = {}       # key= recipeid, value=the recipe itself (html)
recipeTitles = {}  # key= recipeid, value=the title of the recipe

# regexp to convert image rating (eg. 5 stars) to text.
re_stars = re.compile(r'',re.DOTALL|re.IGNORECASE) for recipeid in recipeids: print "Getting recipe %d" % recipeid # Get the page page = urllib.urlopen('http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/%d' % recipeid).read(800000) # Extract the recipe body start = page.find('') end = page.find('',start) recipeBody = page[start:end] # Cleanup the html start = recipeBody.find('