#!/usr/bin/python
# -*- coding: iso-8859-1 -*-
#
# ASPN Python cookbook scraper 1.0.3
#
# Purpose:
#   ASPN Python cookbook is a great site with tons of recipes for Python.
#   http://aspn.activestate.com/ASPN/Cookbook/Python
#   Shame is: You cannot browse these snippets offline.
#   This program downloads all the recipes and packs them in a single html file.
#   Nice for offline browsing while travelling, or taking on a USB key.
#   This is also handy for quick full-text search.
#
# License:
#   This program is in public domain.
#
# Author:
#   Sébastien SAUVAGE, webmaster of http://sebsauvage.net
#
# NOTE(review): the stored copy of this script had been collapsed onto a few
# lines and run through an HTML stripper, so every string literal that held
# markup was emptied.  Literals marked "reconstructed" below are best guesses
# backed by the surviving code; literals marked "LOST" could not be inferred
# and MUST be restored from a pristine copy before the script can run.

import io
import re
import time
import urllib
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2 fallback

# In case of proxy, define the HTTP_PROXY variable in your environment, or
# uncomment the following line.
#import os; os.environ['HTTP_PROXY'] = 'http://proxy.myisp.com:3128'

BASE_URL = 'http://aspn.activestate.com/ASPN/Cookbook/Python'
PAGE_SIZE = 20              # step between successive query_start values
LISTING_READ_LIMIT = 500000  # max bytes read from a listing page
RECIPE_READ_LIMIT = 800000   # max bytes read from a recipe page
OUTPUT_FILE = 'recipes.html'

# The header declares iso-8859-1.  Decoding and re-encoding with it maps every
# byte to itself, so downloaded bytes reach the output file unchanged.
PAGE_ENCODING = 'iso-8859-1'

# Text whose presence on a listing page means another page follows.
# NOTE(review): the stripper may have decoded entities; the raw page may have
# contained 'next &gt;&gt;' instead -- confirm.
NEXT_PAGE_MARKER = 'next >>'

# Reconstructed: the pattern was empty in the stored copy.  Inferred from the
# recipe URL shape and the root-relative 'href="/ASPN/' links handled below.
# NOTE(review): confirm against the original script.
RECIPE_ID_PATTERN = r'href="/ASPN/Cookbook/Python/Recipe/(\d+)'

# Rating conversion ("x stars" image -> text).
# NOTE(review): both literals are exactly what survived stripping; the pattern
# presumably matched a whole <img> tag and the replacement presumably ended
# with a tag where a newline now appears -- confirm.
STARS_RE = re.compile(r'(\d) stars', re.DOTALL | re.IGNORECASE)
STARS_REPLACEMENT = '\\1 stars\n'

# LOST markers: the stored copy shows '' (or a bare line break) for each.
BODY_START_MARKER = ''   # start of the recipe inside the downloaded page
BODY_END_MARKER = ''     # end of the recipe inside the downloaded page
CLEAN_START_MARKER = ''  # first thing to keep inside the extracted body
CLEAN_END_ANCHOR = ''    # last occurrence bounds the search for the end marker
CLEAN_END_MARKER = ''    # SOURCE adds a hard-coded 8, so presumably 8 chars
TITLE_END_MARKER = ''    # terminates the title text

# NOTE(review): only 'Title: ' (7 chars) survived, yet the offset is 19, so
# about 12 characters of markup are presumably missing from this marker.
TITLE_START_MARKER = 'Title: '
TITLE_START_OFFSET = 19

# Reconstructed from the "Remove rating forms" comment and the surviving
# 'end+7' (the length of '</form>').  NOTE(review): confirm.
FORM_OPEN = '<form'
FORM_CLOSE = '</form>'

# Reconstructed output templates: only their visible text and their argument
# lists survived, the tags are new.  NOTE(review): confirm layout.
INDEX_ENTRY = '<li><a href="#%d">%s</a></li>\n'
BODY_HEADER = ('<hr>\n<a name="%d"></a>\n'
               '<h2><a href="' + BASE_URL + '/Recipe/%d">%s</a></h2>\n')
BODY_FOOTER = '\n'
PAGE_TEMPLATE = '''<html>
<head><title>ASPN Python Cookbook</title></head>
<body>
<h1>ASPN Python Cookbook</h1>
<p>%s recipes downloaded on %s from
<a href="http://aspn.activestate.com/ASPN/Cookbook/Python/">http://aspn.activestate.com/ASPN/Cookbook/Python/</a></p>
<h2>Table of content</h2>
<ul>
%s
</ul>
%s
<p>-- end of document --</p>
</body>
</html>
'''

_LOST_MARKER_NAMES = (
    'BODY_START_MARKER', 'BODY_END_MARKER', 'CLEAN_START_MARKER',
    'CLEAN_END_ANCHOR', 'CLEAN_END_MARKER', 'TITLE_END_MARKER',
)


def missing_markers():
    """Return the names of the LOST markers that are still empty."""
    return [name for name in _LOST_MARKER_NAMES if not globals()[name]]


def fetch(url, max_bytes):
    """Download at most *max_bytes* from *url* and return them as text.

    The connection is always closed, including when reading fails.
    """
    response = urlopen(url)
    try:
        return response.read(max_bytes).decode(PAGE_ENCODING)
    finally:
        response.close()


def unique_recipe_ids(raw_ids):
    """Return the distinct integer ids in *raw_ids*, in ascending order."""
    return sorted(set(int(raw) for raw in raw_ids))


def collect_recipe_ids():
    """STEP 1: walk the listing pages and return all recipe ids, sorted."""
    pattern = re.compile(RECIPE_ID_PATTERN, re.IGNORECASE | re.DOTALL)
    found = []
    offset = 1
    while True:
        # Floor division keeps the page number an integer on Python 3.
        print('Getting recipes page %d' % ((offset - 1) // PAGE_SIZE + 1))
        page = fetch('%s?query_start=%d' % (BASE_URL, offset),
                     LISTING_READ_LIMIT)
        found += pattern.findall(page)
        if NEXT_PAGE_MARKER not in page:
            break
        offset += PAGE_SIZE
    return unique_recipe_ids(found)


def slice_recipe_body(page):
    """Cut the recipe out of *page* using the start/end markers.

    Mirrors the stored script: a marker that is not found is not detected
    here, it simply yields a wrong slice.
    """
    start = page.find(BODY_START_MARKER)
    end = page.find(BODY_END_MARKER, start)
    body = page[start:end]
    start = body.find(CLEAN_START_MARKER)
    end = body.rfind(CLEAN_END_ANCHOR)
    # The stored script adds a literal 8 here; len() is the same value as long
    # as the restored marker is 8 characters long.
    end = body.rfind(CLEAN_END_MARKER, 0, end - 1) + len(CLEAN_END_MARKER)
    return body[start:end]


def neutralize_external_refs(html):
    """Disable references to external objects (images, site-relative links).

    Attributes are renamed (src -> nosrc, href -> nohref) so the saved page
    does not try to load anything when viewed offline.
    """
    replacements = (
        ('src="http://www.activestate.com/',
         'nosrc="http://www.activestate.com/'),
        ('src="\nhttp://www.activestate.com/',
         'nosrc="http://www.activestate.com/'),
        ('src="/ASPN/', 'nosrc="/ASPN/'),
        ('href="/ASPN/', 'nohref="/ASPN/'),
    )
    for old, new in replacements:
        html = html.replace(old, new)
    return html


def strip_forms(html):
    """Remove every FORM_OPEN ... FORM_CLOSE span (the rating forms).

    A form that is never closed is dropped together with the rest of the
    text, so a malformed page cannot make the removal loop forever.
    """
    kept = []
    pos = 0
    while True:
        start = html.find(FORM_OPEN, pos)
        if start < 0:
            kept.append(html[pos:])
            break
        kept.append(html[pos:start])
        end = html.find(FORM_CLOSE, start)
        if end < 0:
            break
        pos = end + len(FORM_CLOSE)
    return ''.join(kept)


def clean_recipe_page(page):
    """STEP 2 (per recipe): return the cleaned recipe html found in *page*."""
    body = slice_recipe_body(page)
    body = STARS_RE.sub(STARS_REPLACEMENT, body)
    body = neutralize_external_refs(body)
    return strip_forms(body)


def extract_title(body):
    """Return the recipe title found in the cleaned recipe *body*."""
    start = body.find(TITLE_START_MARKER) + TITLE_START_OFFSET
    end = body.find(TITLE_END_MARKER, start)
    return body[start:end].strip()


def build_html(recipes, titles, date):
    """STEP 3: return the single-page document.

    recipes -- dict: recipe id -> recipe html
    titles  -- dict: recipe id -> recipe title
    date    -- download date, already formatted as text

    Recipes are emitted in ascending id order so the output does not depend
    on dictionary ordering.
    """
    ordered = sorted(recipes)
    index = ''.join(INDEX_ENTRY % (rid, titles[rid]) for rid in ordered)
    body = ''.join(
        BODY_HEADER % (rid, rid, titles[rid]) + recipes[rid] + BODY_FOOTER
        for rid in ordered
    )
    return PAGE_TEMPLATE % (len(recipes), date, index, body)


def main():
    """Download every recipe and write them all to OUTPUT_FILE."""
    print('ASPN Python cookbook scraper 1.0.3')

    unset = missing_markers()
    if unset:
        # Without these markers every recipe would be extracted as garbage.
        raise SystemExit(
            'Cannot run: these markers were lost from the stored script and '
            'must be restored first: ' + ', '.join(unset))

    recipe_ids = collect_recipe_ids()
    print('I got %s recipes.' % len(recipe_ids))

    recipes = {}  # key = recipe id, value = the recipe itself (html)
    titles = {}   # key = recipe id, value = the title of the recipe
    for recipe_id in recipe_ids:
        print('Getting recipe %d' % recipe_id)
        page = fetch('%s/Recipe/%d' % (BASE_URL, recipe_id),
                     RECIPE_READ_LIMIT)
        body = clean_recipe_page(page)
        titles[recipe_id] = extract_title(body)
        recipes[recipe_id] = body

    html = build_html(recipes, titles, time.strftime('%Y-%m-%d'))
    # newline='' writes the text as-is, like the binary mode used originally.
    with io.open(OUTPUT_FILE, 'w', encoding=PAGE_ENCODING, newline='') as out:
        out.write(html)


if __name__ == '__main__':
    main()