# Hello, this script is written in Python - http://python.org
"""fastUnixMailbox.py - A fast UnixMailbox reader class.
Version 0.1 DEV3 (2002-06-09)

WARNING : This is development/alpha version.
          It's unfinished, it contains known bugs.
          Not recommended for production environments.
          Use it at your own risk.

This class is a handy replacement of the UnixMailbox module.
fastUnixMailbox allows a much, much faster access to Unix-like mailboxes,
and a random-access to messages.

Differences are:
    - fastUnixMailbox takes a filename, not a file pointer like UnixMailbox.
    - fastUnixMailbox expects the read(), seek() and tell() methods to be
      available on file.
    - fastUnixMailbox does not read file in text mode, but in binary mode.
      This results in much faster access to messages (much faster than
      xreadlines()).
    - With UnixMailbox, you have to access messages sequentially with .next().
      fastUnixMailbox allows random access to messages.
      All messages are accessed through their 'Message-ID'.
    - rfc822 objects are not required to access header or body of messages.
    - Upon creation, fastUnixMailbox indexes the mailbox:
      Further message retrieval is almost instant (there is a file.seek()
      directly at the message start).
    - fastUnixMailbox identifies messages from the 'Message-ID: ' tag in
      headers. This should be more reliable than the 'From ' (and - YES -
      I know it's not RFC2822-compliant).
    - The file will remain opened all through the class life.
    - fastUnixMailbox relies only on Python built-ins.
      No import necessary.

This class was developed to provide faster access to big mailboxes,
like the Python mailing-list/newsgroups archive (a nice 394 Mb file :-).

Example:
    from fastUnixMailbox import fastUnixMailbox
    filename = '2002-May.txt'
    # You can download it at http://mail.python.org/pipermail/python-list/
    print "Indexing messages from mailbox file %s ..." % filename
    mb = fastUnixMailbox(filename)
    print "Found %s message in file." % len(mb.getListOfMessages())
    print repr(mb.getListOfMessages()[:20]) # Display 20 messages
    print mb.getMessageBody('<j44rhj3o96.fsf@informatik.hu-berlin.de>')
    print mb.getMessageHeader('<j44rhj3o96.fsf@informatik.hu-berlin.de>')

Versions :
    0.1 DEV1 (2002-05-19):
        - First version
    0.1 DEV2 (2002-05-22):
        - Changed body length detection from 'Lines:' to next message
          header (more reliable).
        - Added example.
        - Added close() and reindex() methods.
        - Wrapped manual to column 78.
        - First distribution, packed with pyPack 0.0.2 (one of my scripts)
    0.1 DEV3 (2002-06-09):
        - FIXED : case of EOF in next message detection in getMessageBody()
        - FIXED : removed useless empty lines in body beginning

To do/ideas:
    - FIXME: develop getMessage() to return an rfc822 object?
    - FIXME: get closer to the RFC2822 (eg. support for multi-line header
             fields, which is not currently the case).

License :
    This source code is public domain.

Author :
    Sebastien SAUVAGE <sebsauvage at sebsauvage dot net>
    http://sebsauvage.net
    My Python page is here : http://sebsauvage.net/python/
    Further version of this class will probably be published at the
    previous URL, fr.comp.lang.python or comp.lang.python.

    If you are interested in this class, I would like to hear from you :-)
"""

class fastUnixMailbox:
    """A fast, random-access reader for Unix-style mailbox files.

    The mailbox file is opened in binary mode and indexed once when the
    object is created: every 'Message-ID: ' header line found is mapped to
    the file offset at which the header of its message starts.  Messages are
    then fetched by Message-ID with a direct seek().

    All text handed back to the caller is a native str.  When the file's
    read() yields a distinct bytes type, the data is decoded as latin-1;
    that codec maps each byte to exactly one character, so string indexes
    stay equal to file offsets and .encode('latin-1') gives the raw bytes
    back.
    """
    # readbuffersize : read chunk size for mailbox indexing.
    # Reducing it will reduce memory usage, but will increase disk access.
    readbuffersize = 512000  # 512 kb read buffer for indexing
    # maxheadersize : guess of the maximum header size of a message.  It is
    # used as the overlap between two consecutive indexing buffers and as the
    # read chunk size when fetching a header.
    # Increase it if messages seem to be missing from the index.
    maxheadersize = 4096  # bytes
    # Message body read chunk size.
    bodyreadchunksize = 4096  # bytes

    def __init__(self, filename):
        """Open the mailbox file `filename` and index its messages.

        Errors raised by open() are propagated unchanged.
        """
        self.filename = filename  # Name of mailbox file
        self.file = open(filename, 'rb')  # Mailbox file, kept open until close()
        self.messagesoffsets = {}  # { message-id : offset of header start in file }
        self._getMessagesOffsets()

    def _read(self, size):
        """Read up to `size` bytes from the mailbox file as a native str.

        Returns '' at end of file.
        """
        data = self.file.read(size)
        if not isinstance(data, str):
            # read() returned a separate bytes type: latin-1 is a 1:1
            # byte-to-character mapping, so len() and indexes still match
            # file offsets.
            data = data.decode('latin-1')
        return data

    def _getMessagesOffsets(self):
        """Scan the mailbox file and rebuild self.messagesoffsets.

        The result is stored as a dictionary of the form
        { message-id : offset of header start in file }; it is empty when no
        message is found.  Nothing is returned.  If the mailbox has been
        closed, the method returns without touching the index.
        """
        if self.file is None:
            return

        # Get the file size by seeking to the end of the file.
        self.file.seek(0, 2)
        filesize = self.file.tell()
        self.file.seek(0)

        pattern = "\nmessage-id: "
        messages = {}
        # Consecutive buffers overlap so that a header cut by a buffer
        # boundary is seen whole in the next buffer.  The overlap must stay
        # smaller than the buffer itself, otherwise the scan would never move
        # forward (or would seek to a negative offset).
        overlap = max(0, min(self.maxheadersize, self.readbuffersize - 1))

        buffer = self._read(self.readbuffersize)
        while 1:
            # Header names are matched case-insensitively.
            bufferlow = buffer.lower()
            # Offset of the current buffer in the file.
            currentoffset = self.file.tell() - len(buffer)
            start = bufferlow.find(pattern)
            while start > -1:
                end = bufferlow.find('\n', start + 1)  # End of the 'Message-ID:' line
                if end < 0:
                    # Line cut by the end of the buffer: leave it to the next
                    # (overlapping) buffer.
                    break
                messageid = buffer[start + len(pattern):end].strip()
                # The header of a message starts right after the closest
                # empty line preceding its 'Message-ID:' line.
                blank = bufferlow.rfind('\n\n', 0, start)
                if blank > -1:
                    messages[messageid] = currentoffset + blank + 2
                elif currentoffset == 0:
                    # No empty line anywhere before this line in the file:
                    # this is the first message, whose header starts at the
                    # very beginning of the mailbox.
                    messages[messageid] = 0
                # Otherwise the header started before this buffer; the
                # previous buffer was responsible for indexing it.
                start = bufferlow.find(pattern, end + 1)

            position = self.file.tell()
            if position >= filesize:
                break  # Whole file processed.
            if len(buffer) <= overlap:
                break  # Short read before the expected end: stepping back would not progress.
            # Walk back in the file so a pattern lying on the buffer boundary
            # is not missed.  Re-indexing a message seen twice is harmless:
            # it maps to the same offset.
            self.file.seek(position - overlap)
            buffer = self._read(self.readbuffersize)

        # Always a dictionary, even when empty, so every lookup method keeps
        # working on a mailbox without messages.
        self.messagesoffsets = messages

    def reindex(self):
        """Re-index the mailbox file.

        Call this method if the mailbox file has changed.
        """
        self.messagesoffsets = {}
        self._getMessagesOffsets()

    def hasMessage(self, messageid):
        """Return a true value if this mailbox has this Message-ID."""
        return messageid in self.messagesoffsets

    def getMessage(self, messageid):  # FIXME
        """Not implemented yet: does nothing and returns None."""
        pass  # rfc822 object ?

    def getMessageOffset(self, messageid):
        """Return the offset of the header of a message in the file.

        Returns None if the message is not found.
        """
        return self.messagesoffsets.get(messageid)

    def getMessageHeader(self, messageid):
        """Return the header of a message, without the empty line ending it.

        Returns None if the message is not found.  If the end of the file is
        reached before an empty line, everything from the header start to the
        end of the file is returned.
        """
        offset = self.getMessageOffset(messageid)
        if offset is None:
            return None
        self.file.seek(offset)  # Go to header start offset in file
        header = ''
        end = -1
        while end < 0:  # Read until an empty line is found (which is the header end).
            data = self._read(self.maxheadersize)
            if not data:
                return header  # End of file: no empty line will ever be found.
            # Resume the search one character back: the two newlines may be
            # split between the previous chunk and this one.
            scanfrom = max(0, len(header) - 1)
            header += data
            end = header.find('\n\n', scanfrom)
        return header[:end]  # Return only the header

    def getMessageBody(self, messageid):
        """Return the body of a message.

        The body runs from the empty line ending the header up to the empty
        line preceding the header of the next message, or up to the end of
        the file for the last message.

        Returns None if the message is not found.
        Returns an error message (as a string) if the end of the body could
        not be determined.
        """
        offset = self.getMessageOffset(messageid)
        if offset is None:
            return None
        header = self.getMessageHeader(messageid)
        self.file.seek(offset + len(header))  # Go to the empty line ending the header
        # The next message is located through its 'Message-ID:' line rather
        # than through 'From ' (which could appear in the body) or 'Lines:'
        # (which is not always present).
        pattern = '\nmessage-id: '
        body = ''
        found = -1
        while found < 0:
            data = self._read(self.bodyreadchunksize)
            if not data:
                break  # End of file.
            # Only the new data needs scanning, plus enough of the previous
            # tail to catch a pattern split between two chunks.
            scanfrom = max(0, len(body) - len(pattern) + 1)
            body += data
            hit = body[scanfrom:].lower().find(pattern)
            if hit > -1:
                found = scanfrom + hit
        if found < 0:
            # End of file and no further 'Message-ID:': the rest of the file
            # is the body.  [2:] drops the empty line ending the header.
            return body[2:]
        blank = body.rfind('\n\n', 0, found)  # Empty line before the next 'Message-ID:'
        if blank > -1:
            return body[2:blank]
        return "[fastUnixMailbox.getMessageBody() ERROR : Could not determine body's length of message %s]" % messageid

    def getListOfMessages(self):
        """Return the list of messages present in the mailbox file.

        This is a list of strings.  Each string is a Message-ID.
        Messages are unsorted.
        """
        return list(self.messagesoffsets)

    def close(self):
        """Close the mailbox and release the file handle.

        The index is emptied, so every message lookup returns "not found"
        afterwards.  Calling close() more than once is harmless.
        """
        self.filename = ''
        self.messagesoffsets = {}
        if self.file is not None:
            self.file.close()
            self.file = None

if __name__ == '__main__':
    # Run as a script: display the module manual.
    # The parenthesised form is accepted both as a print statement and as a
    # call to the print() function, with the same output.
    print(__doc__)

