# Hello, this script is written in Python - http://python.org
"""fastUnixMailbox.py - A fast UnixMailbox reader class.

Version 0.1 DEV3 (2002-06-09)

WARNING : This is a development/alpha version. It's unfinished, it contains
known bugs. Not recommended for production environments.
Use it at your own risk.

This class is a handy replacement of the UnixMailbox module.
fastUnixMailbox allows a much, much faster access to Unix-like mailboxes,
and a random-access to messages.

Differences are:
  - fastUnixMailbox takes a filename, not a file pointer like UnixMailbox.
  - fastUnixMailbox expects the read(), seek() and tell() methods to be
    available on file.
  - fastUnixMailbox does not read file in text mode, but in binary mode.
    This results in a much faster access to messages (much faster than
    xreadlines()).
  - With UnixMailbox, you have to access messages sequentially with .next().
    fastUnixMailbox allows random access to messages.
    All messages are accessed through their 'Message-ID'.
  - rfc822 objects are not required to access header or body of messages.
  - Upon creation, fastUnixMailbox indexes the mailbox: Further message
    retrieval is almost instant (there is a file.seek() directly at the
    message start).
  - fastUnixMailbox identifies messages from the 'Message-ID: ' tag in
    headers. This should be more reliable than the 'From '
    (and - YES - I know it's not RFC2822-compliant).
  - The file will remain opened all through the class life.
  - fastUnixMailbox uses only the built-in modules. No import necessary.

This class was developed to have a faster access to big mailboxes, like
the Python mailing-list/newsgroups archive (a nice 394 Mb file :-).

Example:

    import fastUnixMailbox
    # You can download it at http://mail.python.org/pipermail/python-list/
    filename = '2002-May.txt'
    print("Indexing messages from mailbox file %s ..." % filename)
    mb = fastUnixMailbox.fastUnixMailbox(filename)
    print("Found %s message in file." % len(mb.getListOfMessages()))
    print(repr(mb.getListOfMessages()[:20]))  # Display 20 messages
    print(mb.getMessageBody('<some-message-id>'))
    print(mb.getMessageHeader('<some-message-id>'))

Versions :
    0.1 DEV1 (2002-05-19):
            - First version
    0.1 DEV2 (2002-05-22):
            - Changed body length detection from 'Lines:' to next message
              header (more reliable).
            - Added example.
            - Added close() and reindex() methods.
            - Wrapped manual to column 78.
            - First distribution, packed with pyPack 0.0.2
              (one of my scripts)
    0.1 DEV3 (2002-06-09):
            - FIXED : case of EOF in next message detection in
              getMessageBody()
            - FIXED : removed useless empty lines in body beginning

To do/ideas:
    - FIXME: develop getMessage() to return a rfc822 object ?
    - FIXME: get closer to the RFC2822 (eg. support for multi-line header
             fields, which is not currently the case).

License : This source code is public domain.

Author : Sebastien SAUVAGE  http://sebsauvage.net
         My Python page is here : http://sebsauvage.net/python/

Further versions of this class will probably be published at the previous
URL, fr.comp.lang.python or comp.lang.python.

If you are interested in this class, I would like to hear from you :-)
"""


class fastUnixMailbox:
    """A fast UnixMailbox reader class.

    The mailbox file is opened in binary mode and indexed once, at
    construction time: every 'Message-ID: ' header line is mapped to the
    file offset at which the header of its message starts.

    The code runs unchanged on Python 2 (2.6+) and Python 3. On Python 3
    the bytes read from the file are decoded as latin-1 (a lossless
    byte-to-character mapping), so message-IDs, headers and bodies are
    always returned as native 'str' objects.
    """

    # readbuffersize : read chunk size for mailbox indexing.
    # Reducing it will reduce memory usage, but will increase disk access.
    readbuffersize = 512000     # 512 kb read buffer for indexing

    # maxheadersize : Dirty guess of maximum header size in a message.
    # Increase if you think this class is missing messages in a mailbox
    # (*highly* unlikely).
    maxheadersize = 4096        # bytes

    # Message body read chunk size.
    bodyreadchunksize = 4096    # bytes

    # Marker searched (in lower-cased data) to locate a message:
    # RFC says 'Message-ID:' may not be case-constant.
    _idpattern = b'\nmessage-id: '

    def __init__(self, filename):
        """Create a fastUnixMailbox object and index the mailbox.

        filename is the name of the mailbox file. The file is opened in
        binary mode and stays open until close() is called.
        Errors raised by open() or by the indexing are propagated.
        """
        self.filename = filename            # Name of mailbox file
        # messagesoffsets : { message-id : offset of header start in file }
        self.messagesoffsets = {}
        self.file = open(filename, 'rb')    # Mailbox file
        try:
            self._getMessagesOffsets()
        except Exception:
            # Do not leak the file handle when indexing fails.
            self.file.close()
            self.file = None
            raise

    def _toText(self, data):
        """Return data as a native string.

        On Python 3, bytes are decoded as latin-1. On Python 2 (where
        bytes *is* str) and for any other object, data is returned as is.
        """
        if isinstance(data, bytes) and not isinstance(data, str):
            return data.decode('latin-1')
        return data

    def _getMessagesOffsets(self):
        """Index the mailbox file (Unix-style mailbox, plain text).

        Fills self.messagesoffsets with a dictionary of the form
        { message-id : offset of header start in file }.
        The dictionary is left empty (never None) if the mailbox is closed
        or contains no message.
        """
        messages = {}
        self.messagesoffsets = messages
        if self.file is None:
            return

        # Get file size (without the sys.stats module ! :-)
        self.file.seek(0, 2)
        filesize = self.file.tell()
        self.file.seek(0)

        pattern = self._idpattern
        chunksize = max(1, self.readbuffersize)
        # The walk-back must stay smaller than the chunk itself, otherwise
        # the scan would never advance (or seek to a negative offset).
        overlap = max(0, min(self.maxheadersize, chunksize - 1))

        while True:
            chunkoffset = self.file.tell()  # Offset of current chunk in file
            chunk = self.file.read(chunksize)
            if not chunk:
                break
            lowered = chunk.lower()

            start = lowered.find(pattern)   # '\n' before a 'Message-ID:' line
            while start > -1:
                lineend = lowered.find(b'\n', start + 1)
                if lineend < 0:
                    # Line cut by the chunk boundary: the overlapping
                    # re-read below will see it entirely.
                    break
                messageid = self._toText(
                    chunk[start + len(pattern):lineend].strip())
                # The header of the message starts right after the last
                # empty line found before the 'Message-ID:' line.
                blank = lowered.rfind(b'\n\n', 0, start)
                if blank > -1:
                    messages[messageid] = chunkoffset + blank + 2
                elif chunkoffset == 0:
                    # First message of the file: nothing precedes its
                    # header, so it starts at offset 0.
                    messages[messageid] = 0
                # else: header starts before this chunk; the message was
                # indexed while scanning the previous chunk.
                start = lowered.find(pattern, lineend + 1)

            if self.file.tell() >= filesize:
                break   # After chunk processing, stop at end-of-file.
            # Walk back in file to make sure we are not missing a pattern
            # on a chunk boundary. (The dictionary makes sure we do not
            # index the same message-ID twice.)
            self.file.seek(self.file.tell() - overlap)

    def reindex(self):
        """Re-index the mailbox file.

        You can call this method if the mailbox file has changed.
        On a closed mailbox the index is simply left empty.
        """
        self._getMessagesOffsets()

    def hasMessage(self, messageid):
        """Return True if this mailbox has this message-ID."""
        return self._toText(messageid) in self.messagesoffsets

    def getMessage(self, messageid):
        """Not implemented yet: always returns None.

        FIXME: return a rfc822 object ?
        """
        return None

    def getMessageOffset(self, messageid):
        """Return the offset of a message in file.

        Returns None if the message is not found.
        """
        return self.messagesoffsets.get(self._toText(messageid))

    def _readRawHeader(self, offset):
        """Return the raw header (bytes) of the message starting at offset.

        The header ends at the first empty line. If end-of-file is reached
        before any empty line, everything up to end-of-file is returned.
        """
        self.file.seek(offset)
        raw = b''
        while True:
            data = self.file.read(self.maxheadersize)
            if not data:
                return raw      # EOF: do not loop forever
            raw += data
            end = raw.find(b'\n\n')
            if end > -1:
                return raw[:end]

    def getMessageHeader(self, messageid):
        """Return the header of a message.

        Returns None if the message is not found.
        """
        offset = self.getMessageOffset(messageid)
        if offset is None:
            return None
        return self._toText(self._readRawHeader(offset))

    def getMessageBody(self, messageid):
        """Return the body of a message.

        Returns None if the message is not found.
        Returns an error message if a problem occurred while reading the
        body of the message.
        """
        offset = self.getMessageOffset(messageid)
        if offset is None:
            return None

        # Go to the end of the header (beginning of body) in file...
        header = self._readRawHeader(offset)
        self.file.seek(offset + len(header))

        # ... and search for the beginning of the header of the next
        # message: it is the first empty line before the next 'Message-ID:'
        # line. This is more reliable than searching for 'From ' (which
        # could appear in the message body), and more reliable than using
        # the 'Lines:' header, because it's not always present.
        pattern = self._idpattern
        body = b''
        found = -1
        while found < 0:
            data = self.file.read(self.bodyreadchunksize)
            if not data:
                break           # EOF
            # Only the new data (plus enough of the old tail to catch a
            # pattern cut by the chunk boundary) needs to be searched.
            tail = max(0, len(body) - len(pattern) + 1)
            body += data
            hit = body[tail:].lower().find(pattern)
            if hit > -1:
                found = tail + hit

        if found < 0:
            # EOF and no other 'Message-ID:': the rest of the file is the
            # body. [2:] skips the empty line which ends the header.
            return self._toText(body[2:])

        blank = body[:found].rfind(b'\n\n')
        if blank > -1:
            # Stop at the empty line before the next 'Message-ID:' line.
            return self._toText(body[2:blank])
        return ("[fastUnixMailbox.getMessageBody() ERROR : Could not determine"
                " body's length of message %s]" % (self._toText(messageid),))

    def getListOfMessages(self):
        """Return the list of messages present in the mailbox file.

        This is a list of strings. Each string is a Message-ID, which is
        supposed to be unique in the world. Messages are unsorted.
        """
        return list(self.messagesoffsets)

    def close(self):
        """Close the mailbox.

        This is not needed in most cases (the garbage collector will take
        care of this point), except when you want to explicitly release
        the file handle. Calling close() more than once is harmless.
        """
        self.filename = ''
        self.messagesoffsets = {}
        if self.file is not None:
            self.file.close()
            self.file = None


if __name__ == '__main__':
    print(__doc__)