First commit.

thomir · Feb 17, 2015 · bddda49 · bddda49
commit bddda49
Show file tree

Hide file tree

Showing 10 changed files with 414 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,31 @@
+The FreeBSD Copyright
+
+Copyright 1992-2012 The FreeBSD Project. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE FREEBSD PROJECT ``AS IS'' AND ANY EXPRESS
+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
+NO EVENT SHALL THE FREEBSD PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation
+are those of the authors and should not be interpreted as representing
+official policies, either expressed or implied, of the FreeBSD
+Project.
+
diff --git a/gmailfilter/__init__.py b/gmailfilter/__init__.py
@@ -0,0 +1,2 @@
+
+
diff --git a/gmailfilter/_command.py b/gmailfilter/_command.py
@@ -0,0 +1,44 @@
+
+import logging
+import os
+import sys
+from argparse import ArgumentParser
+
+from gmailfilter._connection import IMAPServer
+
+
+def run():
+    """Main entry point for command line executable."""
+    args = configure_argument_parser()
+    log_level = logging.DEBUG if args.verbose else logging.INFO
+    logging.basicConfig(level=log_level, stream=sys.stdout)
+    rules_path = get_filter_file_or_raise()
+
+    with open(rules_path) as f:
+        code = compile(f.read(), rules_path, 'exec')
+        exec(code, get_rule_globals_dict())
+
+
+def configure_argument_parser():
+    parser = ArgumentParser(
+        prog="gmailfilter",
+        description="Filter IMAP emails the easy way!"
+    )
+    parser.add_argument('-v', '--verbose', action='store_true', help="Be more verbose")
+    return parser.parse_args()
+
+
+def get_filter_file_or_raise():
+    path = os.path.expanduser('~/.config/gmailfilter/rules')
+    if not os.path.exists(path):
+        raise IOError("Rules file %r does not exist" % path)
+    # TODO: Check for readability?
+    return path
+
+
+def get_rule_globals_dict():
+    rule_globals = {
+        'IMAPServer': IMAPServer
+    }
+    return rule_globals
+
diff --git a/gmailfilter/_connection.py b/gmailfilter/_connection.py
@@ -0,0 +1,167 @@
+from contextlib import contextmanager
+import logging
+
+from imapclient import IMAPClient
+
+from gmailfilter._message import Message
+
+
+# TODO: Accept config from command line, encapsulate in a dict and pass
+# in to the connection class.
+
+
+
+class IMAPServer(object):
+
+    def __init__(self, server=None, username=None, password=None, port=993, ssl=True):
+        if (
+            server is None or
+            username is None or
+            password is None
+        ):
+            raise ValueError("server and username and password cannot be None")
+
+
+        self._client = IMAPClient(
+            host=server,
+            port=port,
+            use_uid=False,
+            ssl=ssl
+            )
+        # self._client.debug = True
+        self._client.login(
+            username,
+            password,
+        )
+
+    def get_messages(self):
+        """A generator that yields Message instances, one for every message
+        in the users inbox.
+
+        """
+        # TODO - perahps the user wants to filter a different folder?
+        mbox_details = self._client.select_folder("INBOX")
+        total_messages = mbox_details['EXISTS']
+        logging.info("Scanning inbox, found %d messages" % total_messages)
+        # TODO: Research best chunk size - maybe let user tweak this from
+        # config file?:
+        i = 0
+        with self.use_sequence():
+            for chunk in sequence_chunk(total_messages, optimal_chunk_size(1000)):
+                logging.info("Fetching: " + chunk)
+                data = self._client.fetch(
+                    chunk,
+                    ['UID', 'BODY.PEEK[HEADER]', 'INTERNALDATE', 'FLAGS']
+                )
+                for msg_seq in data:
+                    logging.debug("Processing %d / %d", i, total_messages)
+                    proxy = MessageConnectionProxy(self, data[msg_seq])
+                    yield Message(proxy)
+                    i += 1
+                self._do_chunk_cleanup()
+
+    def move_message(self, message, folder):
+        """Move a message to a folder, creating the folder if it doesn't exist.
+
+        :param message: An instance of gmailfilter.Message
+        :param folder: A string descriving the folder.
+
+        """
+        # TODO: optimise this by trying the copy, and if we get 'NO' with
+        # 'TRYCREATE' then, and only then try and create the folder. Removes the
+        # overhead of the existance check for every message,
+        if not self._client.folder_exists(folder):
+            status = self._client.create_folder(folder)
+            assert status.lower() == "success", "Unable to create folder %s" % folder
+        with self.use_uid():
+            self._client.copy(str(message.uid()), folder)
+            self.delete_message(message)
+
+    def delete_message(self, message):
+        with self.use_uid():
+            uid_string = str(message.uid())
+            logging.info("Deleting %s" % uid_string)
+            self._client.delete_messages(uid_string)
+
+    def _do_chunk_cleanup(self):
+        # self._client.expunge()
+        pass
+
+
+    @contextmanager
+    def use_uid(self):
+        old = self._client.use_uid
+        self._client.use_uid = True
+        try:
+            yield
+        finally:
+            self._client.use_uid = old
+
+    @contextmanager
+    def use_sequence(self):
+        old = self._client.use_uid
+        self._client.use_uid = False
+        try:
+            yield
+        finally:
+            self._client.use_uid = old
+
+
+def sequence_chunk(num_messages, chunk_size):
+    assert chunk_size >= 1
+    start = 1
+    while start <= num_messages:
+        end = min(start + chunk_size - 1, num_messages)
+        if end > start:
+            if end != num_messages:
+                yield '%d:%d' % (start, end)
+            else:
+                yield '%d:*' % start
+        else:
+            yield '%d' % (start)
+        start += chunk_size
+
+
+def optimal_chunk_size(total_messages):
+    """Work out the optimal chunk size for an inbox with total_messages."""
+    # use 1000 (maximum sensible chunk size), or 10 retrieval operations,
+    # whichever is smaller:
+    return min(1000, total_messages / 10)
+
+class MessageConnectionProxy(object):
+
+    """A class that knows how to retrieve additional message parts."""
+
+    def __init__(self, connection, initial_data):
+        assert 'UID' in initial_data
+        self._connection = connection
+        self._data = initial_data
+
+    def get_message_part(self, part_name):
+        """Get a part of a message, possibly from memory.
+
+        'part_name' will be one of ENVELOPE, RFC822, UID, BODY etc.
+
+        """
+        # transform 'BODY.PEEK[HEADER]' into 'BODY[HEADER]'
+        if part_name.startswith('BODY.PEEK'):
+            retrieve_key = 'BODY' + part_name[9:]
+        else:
+            retrieve_key = part_name
+
+        # ask the server for 'part_name', but look in our dictionary with
+        # 'retrieve_key'
+        if retrieve_key not in self._data:
+            with self._connection.use_uid():
+                msg_uid = self._data['UID']
+                # for some reason, sometimes a fetch call returns an empty dict.
+                # until I find out why, I'll simply retry this:
+                data = {}
+                for i in range(3):
+                    data = self._connection._client.fetch(msg_uid, part_name)
+                    if data:
+                        self._data.update(data[msg_uid])
+                        break
+                assert msg_uid in data, ("Server gave us back some other data: %d %r" % (msg_uid, data))
+        return self._data[retrieve_key]
+
diff --git a/gmailfilter/_message.py b/gmailfilter/_message.py
@@ -0,0 +1,58 @@
+import email
+from email.utils import parseaddr
+
+
+class Message(object):
+
+    """An interface to represent an email message.
+
+    The message is lazily-created. Methods such as 'subject' cause network
+    traffic the first time they're called. After that, the results are cached.
+    """
+
+    def __init__(self, connection_proxy):
+        self._connection_proxy = connection_proxy
+        self._message = None
+
+    def _get_email(self):
+        if self._message is None:
+            self._message = email.message_from_string(
+                self._connection_proxy.get_message_part('BODY.PEEK[HEADER]')
+            )
+        return self._message
+
+    def subject(self):
+        return self._get_email()['Subject']
+
+    def from_(self):
+        return self._get_email()['From']
+
+    def is_list_message(self):
+        return 'List-Id' in self._get_email()
+
+    def list_id(self):
+        # Returns None if key is not found, does not raise KeyError:
+        list_id = self._get_email()['List-Id']
+        return parse_list_id(list_id) if list_id is not None else None
+
+    def uid(self):
+        return self._connection_proxy.get_message_part('UID')
+
+    def get_headers(self):
+        # TODO: email objects are dictionaries for the headers, but also expose
+        # the body contents, attachments etc. etc. It'd be nice if we could
+        # *only* expose the headers here...
+        return self._get_email()
+
+    def get_date(self):
+        return self._connection_proxy.get_message_part('INTERNALDATE')
+
+    def get_flags(self):
+        return self._connection_proxy.get_message_part('FLAGS')
+
+    def __repr__(self):
+        return repr(self.subject())
+
+
+def parse_list_id(id_string):
+    return parseaddr(id_string)[1]
diff --git a/gmailfilter/test/__init__.py b/gmailfilter/test/__init__.py
diff --git a/gmailfilter/test/test_connection.py b/gmailfilter/test/test_connection.py
@@ -0,0 +1,33 @@
+from unittest import TestCase
+
+
+from gmailfilter import _connection as c
+
+class SequenceChunkTests(TestCase):
+
+    def assertSequenceChunk(self, messages, chunk_size, expected):
+        observed = list(c.sequence_chunk(messages, chunk_size))
+        self.assertEqual(expected, observed)
+
+    def test_no_messages(self):
+        self.assertSequenceChunk(0, 10, [])
+
+    def test_single_message(self):
+        self.assertSequenceChunk(1, 10, ['1'])
+
+    def test_two_messages(self):
+        self.assertSequenceChunk(2, 10, ['1:2'])
+
+    def test_one_chunk(self):
+        self.assertSequenceChunk(10, 10, ['1:10'])
+
+    def test_one_and_a_bit_chunks(self):
+        self.assertSequenceChunk(11, 10, ['1:10', '11'])
+
+    def test_two_chunks(self):
+        self.assertSequenceChunk(20, 10, ['1:10', '11:20'])
+
+    def test_with_no_chunking(self):
+        self.assertSequenceChunk(5, 1, ['1', '2', '3', '4', '5'])
+
+
diff --git a/gmailfilter/test/test_message.py b/gmailfilter/test/test_message.py
@@ -0,0 +1,27 @@
+
+
+from unittest import TestCase
+
+from gmailfilter._message import parse_list_id
+
+class ListIdParsingTestCase(TestCase):
+
+    def test_list_id_equality_as_string(self):
+        l = parse_list_id('mail.asana.com')
+        self.assertEqual('mail.asana.com', str(l))
+        self.assertEqual('mail.asana.com', l)
+
+    def test_list_id_inequality_as_string(self):
+        l = parse_list_id('mail.asana.com')
+        self.assertNotEqual('foo.com', str(l))
+        self.assertNotEqual('foo.com', l)
+
+    def test_can_extract_list_id_from_description(self):
+        l = parse_list_id('Some list description <some.list.id>')
+        self.assertEqual('some.list.id', l)
+
+    def test_list_ids_with_different_descriptions_are_equal(self):
+        self.assertEqual(
+            parse_list_id('some description <list.id>'),
+            parse_list_id('some other description <list.id>'),
+            )
diff --git a/readme.md b/readme.md
@@ -0,0 +1,16 @@
+# What is this?
+
+This is the source code to my personal IMAP-based mail filtering tool. I use it every day, and it has proven to be useful for me. You are more than welcome to use and contribute to it.
+
+# How does it work?
+
+You must create a `rules` configuration file which tells `gmailfilter` how to connect to your IMAP mail server, and what to do with all the mail in your inbox. The approach `gmailfilter` takes to mail filtering is that your inbox should be virtually empty at the end of a filter run - only messages which need "active processing" should remain (usually this means "unread and flagged (starred) messages"). `gmailfilter` takes care of iterating over the messages in your inbox, and will run your rules over any new messages that arrive. 
+
+You can use `gmailfilter` to achieve the following:
+
+ * Automatically move mailing list messages to a separate folder.
+ * Automatically delete spam messages from automated services such as jenkins.
+ * Move messages that are older than a certain age to a different folder.
+ * ...much much more!
+
+ Rules are written in python, so you can do pretty much whatever you want!