Processing emails using IMAP and Thunderbird files

February 24th, 2024

Processing messages using an IMAP connection

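Replace the following placeholders in the code below with your own values: IMAP_HOSTNAME, IMAP_USERNAME, IMAP_PASSWORD, IMAP_FOLDER and local_filename (the path where an attachment is saved).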

from imapclient import IMAPClient
import ssl
import email
import email.header
from email import policy

# Walk through all messages of one IMAP folder, print some metadata and
# save the attachments.
# IMAP_HOSTNAME, IMAP_USERNAME, IMAP_PASSWORD and IMAP_FOLDER are placeholders
# that have to be defined before running this snippet.
ssl_context = ssl.create_default_context()

with IMAPClient(IMAP_HOSTNAME, ssl_context=ssl_context) as server:
    server.login(IMAP_USERNAME, IMAP_PASSWORD)

    # show capabilities of the IMAP server:
    #print(server.capabilities())

    # select a folder for processing (read-only: the folder is not modified)
    select_info = server.select_folder(IMAP_FOLDER, readonly=True)

    # get a list of all messages (adapt as needed)
    # total_messages is informational only, it is not used below
    total_messages = select_info[b'EXISTS']
    messages = server.search()

    # you can fetch all messages at once or a subset of them based on a filter
    # in this case, messages will be fetched and processed one-by-one
    for msg in messages:
        # fetch the message, normally only one message should be returned here
        for uid, message_data in server.fetch(msg, ['RFC822', 'FLAGS', 'INTERNALDATE', 'UID']).items():
            # parse the message, missing policy falls back to compat32 api
            # the new API takes care of handling basic types and decoding strings (e.g. UTF-8 headers)
            email_message = email.message_from_bytes(message_data[b'RFC822'], policy=policy.default)

            print('    - SEQ', message_data[b'SEQ'])
            print('    - FLAGS', message_data[b'FLAGS'])

            # the internal date is the timestamp, when the message arrived at the server (and it is assigned by the server)
            # it can be used for forensic work in order to find some mismatches
            internaldate_raw = message_data[b'INTERNALDATE']
            internaldate_ts = internaldate_raw.timestamp()
            print('    - INTERNALDATE', internaldate_raw, internaldate_ts)

            print('    - From:', email_message.get('From'))
            print('    - Subject:', email_message.get('Subject'))

            # the Date header may be missing (get() returns None) or unparseable
            # (its .datetime attribute is None), so do not call .timestamp() blindly
            date_header = email_message.get('Date')
            date_parsed = getattr(date_header, 'datetime', None)
            date_ts = date_parsed.timestamp() if date_parsed is not None else None
            print('    - Date:', date_header, date_ts)

            print('    - Message-ID:', email_message.get('Message-ID'))
            print('    - In-Reply-To:', email_message.get('In-Reply-To'))

            # iterate over parts, e.g. to find relevant attachments
            # see documentation: there are convenience methods for iterating over specific parts only
            for part in email_message.walk():
                print('        - part', part.get_content_type(), 'filename', part.get_filename())

                # most of the real attachments have a declared file name (but this is not a 100% rule!)
                if part.get_filename() is not None:
                    # placeholder: choose the local target path yourself
                    # NOTE: the declared file name comes from the sender, do not use it unchecked as a path
                    local_filename = ...

                    # the API delivers one of: bytes, EmailMessage or str, depending on the Content-Type
                    content = part.get_content()
                    if isinstance(content, email.message.EmailMessage):
                        # an attached message: serialize it back to its raw form
                        content = content.as_bytes()
                    elif isinstance(content, str):
                        # text content: encode it with the declared charset (UTF-8 if none is declared)
                        charset = part.get_content_charset()
                        print('            - charset', charset)
                        content = content.encode(charset or 'utf-8')

                    # save the content to a file
                    with open(local_filename, 'wb') as f:
                        f.write(content)

Compat32 API

When using the legacy (compat32) API, a few things work differently. The same applies to the MBOX approach below.

# The legacy API does not decode headers automatically.
# decode_header() returns a list of (subject, subject_encoding) tuples, one per
# fragment; a fragment is bytes if the header contains encoded words and str otherwise.
subject = ''.join(
    fragment.decode(fragment_encoding or 'utf-8') if isinstance(fragment, bytes) else fragment
    for fragment, fragment_encoding in email.header.decode_header(email_message.get('Subject'))
)
# The decoded payload of a part is returned as bytes (second argument: decode=True).
part.get_payload(None, True)

Processing Thunderbird messages

If you don't need live interaction with the application (for that, see the plugin approach via the WebExtension API or the "Hello World" Extension Tutorial), there is a much easier approach: access the files directly. Thunderbird stores e-mails in the simple MBOX format.

import mailbox
import datetime

# Walk through all messages of an MBOX file, print some metadata and save
# the attachments.
# MBOX_FILE is a placeholder for the path of the MBOX file to read.
mb = mailbox.mbox(MBOX_FILE, create=False)
for email_message in mb:
    print('    - From:', email_message.get('From'))
    print('    - Subject:', email_message.get('Subject'))
    date_header = email_message.get('Date')
    print('    - Date:', date_header)

    # you need to parse the Date header manually
    # in most cases, the given format is correct
    # date_ts stays 0.0 if the message has no Date header at all
    date_ts = 0.0
    fmt = '%a, %d %b %Y %H:%M:%S %z'
    if date_header is not None:
        try:
            date_ts = datetime.datetime.strptime(date_header, fmt).timestamp()
        except ValueError as v:
            # it can be that the header contains additional data (e.g. the name of the time zone)
            # assuming English-based exception messages and no changes in the message format,
            # we cut the excessive data and try again
            ulr = len(v.args[0].partition('unconverted data remains: ')[2])
            if not ulr:
                # still no luck, you need to adapt the code to your situation
                raise
            date_ts = datetime.datetime.strptime(date_header[:-ulr], fmt).timestamp()

    print('    - Date:', date_ts)
    print('    - Message-ID:', email_message.get('Message-ID'))
    print('    - In-Reply-To:', email_message.get('In-Reply-To'))
    #print('    - Received:', email_message.get('Received'), type(email_message.get('Received')))

    # same as in the IMAP example above
    for part in email_message.walk():
        print('        - part', part.get_content_type(), 'filename', part.get_filename())
        if part.get_filename() is not None:
            # placeholder: choose the local target path yourself
            # NOTE: the declared file name comes from the sender, do not use it unchecked as a path
            local_filename = ...

            # mailbox generates mailbox.mboxMessage objects not EmailMessage objects
            # (legacy compat32 API): with decode=True the payload is returned as bytes,
            # or None if the payload of the part is a list of sub-messages
            content = part.get_payload(None, True)
            if content is None:
                if part.get_content_type() != 'message/rfc822':
                    # a container without a payload of its own, nothing to save
                    continue
                # an attached e-mail: serialize the embedded message back to its raw form
                content = part.get_payload(0).as_bytes()

            with open(local_filename, 'wb') as f:
                f.write(content)

References


Next: SMS TPDU Transfer Protocol Data Unit

Previous: Home Automation without a Cloud

Main Menu