#!/usr/bin/python
# -*- coding: iso-8859-1 -*-
#
# ASPN Python cookbook scraper 1.0.3
#
# Purpose:
# ASPN Python cookbook is a great site with tons of recipes for Python.
# http://aspn.activestate.com/ASPN/Cookbook/Python
# Unfortunately, you cannot browse these snippets offline.
# This program downloads all the recipes and packs them in a single html file.
# Nice for offline browsing while travelling, or for carrying on a USB key.
# This is also handy for quick full-text search.
#
# License:
# This program is in public domain.
#
# Author:
# Sébastien SAUVAGE, webmaster of http://sebsauvage.net
#
#
import urllib,re,time
print 'ASPN Python cookbook scraper 1.0.3'
# In case of proxy, uncomment the following line (or define the HTTP_PROXY variable in your environment.)
#import os; os.environ['HTTP_PROXY'] = 'http://proxy.myisp.com:3128'
recipeids = [] # Recipe identifiers.
# STEP 1 : Get the identifier of Python recipes by browsing the pages.
re_recipeids = re.compile(r'',re.IGNORECASE|re.DOTALL)
currentPage = 1
while True:
print "Getting recipes page %d" % ((currentPage-1)/20+1)
page = urllib.urlopen('http://aspn.activestate.com/ASPN/Cookbook/Python?query_start=%d' % currentPage).read(500000)
recipeids += re_recipeids.findall(page)
if not "next >>" in page:
break
currentPage += 20
# Make sure we have unique recipeids:
recipeids = sorted(dict([(int(i),0) for i in recipeids]).keys())
print "I got %s recipes." % len(recipeids)
# STEP 2 : Get each recipe (title+body+comments)
# Two dictionaries, both keyed by recipe identifier, hold the downloaded data.
recipes = {} # key= recipeid, value=the recipe itself (html)
recipeTitles = {} # key= recipeid, value=the title of the recipe
# regexp to convert image rating (eg. 5 stars) to text.
# NOTE(review): the pattern is an empty string in this copy of the file;
# presumably the original expression was lost -- confirm and restore it.
re_stars = re.compile(r'',re.DOTALL|re.IGNORECASE)
for recipeid in recipeids:
print "Getting recipe %d" % recipeid
# Get the page
page = urllib.urlopen('http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/%d' % recipeid).read(800000)
# Extract the recipe body
start = page.find('')
end = page.find('',start)
recipeBody = page[start:end]
# Cleanup the html
start = recipeBody.find('