#!/usr/bin/python
# -*- coding: iso-8859-1 -*-
'''
bashfr_download.py 1.0.0
This program downloads the whole bashfr.org archive into a single HTML file.
http://www.bashfr.org/?sort=browse&p=1
Why: I wanted to read bashfr.org offline.
License: This program is public domain.
Author: Sébastien SAUVAGE (webmaster of http://sebsauvage.net)
'''
# Use this if you use a proxy (or define the HTTP_PROXY variable in your environment)
#import os; os.environ['HTTP_PROXY'] = 'http://proxy.myisp.com:3128'
import sys,urllib2,re
# Page counter for the paginated "browse" listing; it is substituted into the
# request URL, so the first page fetched is page 1.
currentPage = 1
print 'bashfr_download 1.0.0'
# Start of the progress line: one '.' is appended to it per downloaded page.
sys.stdout.write('Downloading page...')
# Pattern applied to every downloaded page with findall(). Each match is
# unpacked as a (quote number, quote HTML) pair, so the pattern must have
# exactly two capture groups.
# NOTE(review): the pattern text below is reduced to a bare line break, which
# is not a valid single-quoted literal -- the HTML markup it matched appears to
# have been stripped from this copy. Restore it from a pristine copy of the
# script; TODO confirm.
re_quote = re.compile('
',re.IGNORECASE|re.DOTALL)
quotes = {} # Quotes collected so far (a dict). key=quote number (int), value=the quote itself (HTML code, unicode)
# Walk the paginated archive, one HTTP request per page, until a page that
# does not contain the '>>>' marker is reached.
while True:
    # urllib2 responses are not context managers in Python 2, so the
    # connection is released explicitly, even when read() raises.
    response = urllib2.urlopen('http://www.bashfr.org/?sort=browse&p=%d' % currentPage)
    try:
        # At most 200000 bytes of each page are read.
        html = response.read(200000)
    finally:
        response.close()
    sys.stdout.write('.')  # progress: one dot per fetched page
    # A page lacking '>>>' ends the download.
    # NOTE(review): presumably '>>>' is the text of the "next page" link. If
    # so, the final page still holds quotes that are skipped by breaking
    # before the extraction below -- confirm against the live site.
    if '>>>' not in html:
        break
    # Quotes are keyed by number, so a quote that appears on two pages is
    # stored only once. Pages are served as ISO-8859-1 bytes and are decoded
    # to unicode here.
    for (number, quote) in re_quote.findall(html):
        quotes[int(number)] = quote.decode('ISO-8859-1')
    currentPage += 1
# Concatenate every collected quote, in ascending quote-number order, into a
# single unicode HTML fragment.
# NOTE(review): the u'...' literals in this section are split across lines and
# contain no markup, so they are not valid Python as shown; the HTML tags they
# held appear to have been stripped from this copy, and the loop body has lost
# its indentation. The remaining %d / %s placeholders receive the quote number
# and the quote's HTML. TODO restore from a pristine copy of the script.
body = u''
for number in sorted(quotes.keys()):
body += u'
%d
%s' % (number,quotes[number])
body += u'
'
# Page template: the accumulated quotes replace the single %s placeholder.
html = u'''
http://www.bashfr.org/
http://www.bashfr.org/%s
''' % body
# Write the finished page to disk as UTF-8 bytes. The with-statement closes
# the file even if write() raises, and the handle no longer shadows the
# Python 2 builtin `file`.
with open('bashfr_quotes.html','w+b') as output:
    output.write(html.encode('utf-8'))