Basic refactoring and Tom's recommendations
author Plamen Dimitrov <plamen.dimitrov@intra2net.com>
Tue, 26 Jun 2012 15:18:47 +0000 (17:18 +0200)
committer Plamen Dimitrov <plamen.dimitrov@intra2net.com>
Tue, 26 Jun 2012 15:18:47 +0000 (17:18 +0200)
caching_data.py
date_interpreter.py
fix_imap_internaldate.py
mail_iterator.py
mailbox_state.py [new file with mode: 0644]
unit_tester.py

index ea3fb8c..e7c5f9e 100644 (file)
@@ -14,82 +14,90 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 '''
-
+import os, tempfile
 import pickle
+from mailbox_state import MailboxState
+
+CACHING_FILENAME = "caching_data.dat"
 
 class CachingData:
     """This class is responsible for the caching of data."""
+    
+    # class attributes
+    # integer for version of the cache
+    version = None
+    # dictionary of usernames as keys and dictionaries as values
+    # the inner dictionaries have unique mailbox keys as keys and MailboxState objects as values
+    data = None
 
     def __init__(self):
         try:
-            cachingfile = open('caching_data.pkl', 'rb')
-            self.data = pickle.load(cachingfile)
-            #print(len(self.data), "users found.")
-            self.save_flag = {}
-            for user in self.data:
-                self.save_flag[user] = {}
-                for uid_key in self.data[user]:
-                    self.save_flag[user][uid_key] = False
+            cachingfile = open(CACHING_FILENAME, 'rb')
+            self.version, self.data = pickle.load(cachingfile)
+            print("Cache version", self.version)
+            print(len(self.data), "users found.")
         except IOError:
+            self.version = 0
             self.data = {}
-            self.save_flag = {}
-            with open('caching_data.pkl', 'wb') as cachingfile:
-                pickle.dump(self.data, cachingfile)
+            with open(CACHING_FILENAME, 'wb') as cachingfile:
+                pickle.dump((0, self.data), cachingfile)
 
     def __del__(self):
-        with open('caching_data.pkl', 'wb') as cachingfile:
-            # prepare data based on a save flag
-            for user in self.save_flag:
-                for uid_key in self.save_flag[user]:
-                    if(not self.save_flag[user][uid_key]):
-                        del self.data[user][uid_key]
-                        #print(uidvalidity, "deleted from cache.")
-                if(len(self.data[user])==0):
-                    del self.data[user]
-                    #print(user, "deleted from cache.")
-
-            # serialize in file
-            pickle.dump(self.data, cachingfile)
-
-        #print(len(self.data), "users stored.")
-    
-    def _cache_new_mailbox(self, username, uid_key):
-        """Store the mailbox as integer uidvalidity"""
-        if(username not in self.data):
-            self.data[username] = {}
-            self.save_flag[username] = {}
-            #print(username, "created.")
-        if(uid_key not in self.data[username]):
-            self.data[username][uid_key] = []
-            self.save_flag[username][uid_key] = False
-            #print(uid_key, "created.")
-        return
-
-    def sync_cached_mailbox(self, username, uid_key, list_ids):
-        """Adds new messages to the cache and returns a list of them.
-        Confirm the changes to a mailbox to finally save it."""
-        new_ids = []
+        # create temporary file first
+        location = os.path.dirname(CACHING_FILENAME)    
+        file_descriptor, tmpname = tempfile.mkstemp(dir=location)
+        cachingfile = os.fdopen(file_descriptor, 'wb')
 
-        if(username not in self.data or \
-           uid_key not in self.data[username]):
-            self._cache_new_mailbox(username, uid_key)
-            new_ids = list_ids
-        else:
-            for uid in list_ids:
-                try:
-                    self.data[username][uid_key].index(uid)
-                    #print("found", uid, uid_key)
-                except ValueError:
-                    #print("new", uid, uid_key)
-                    new_ids.append(uid)
+        # prepare data based on a save flag
+        saved_data = {}
+        for user in self.data:
+            saved_data[user] = {}
+            for box_key in self.data[user]:
+                if(self.data[user][box_key].needs_save):
+                    saved_data[user][box_key] = self.data[user][box_key]
+                    print(saved_data[user][box_key].name, "will be saved.")
+            if(len(saved_data[user])==0):
+                del saved_data[user]
+                print(user, "will not be saved.")
+        self.data = saved_data
 
-        # update cached_mailbox
-        self.data[username][uid_key] = list_ids
+        # avoid test mode or cases where nothing needs saving
+        if(len(saved_data)==0):
+            os.unlink(tmpname)
+            return
+        # serialize in file
+        self.version += 1
+        pickle.dump((self.version, self.data), cachingfile)
+        print(len(self.data), "users stored.")
+        cachingfile.close()
+        os.rename(tmpname, CACHING_FILENAME)
 
-        return new_ids
-
-    def commit_cached_mailbox(self, username, uid_key):
-        """Confirm the chages to the cached mailbox."""
-        self.save_flag[username][uid_key] = True
-        #print(username, uid_key, "committed.") 
+    def retrieve_cached_mailbox(self, name, uidvalidity, user):
+        """Retrieve a cached mailbox or create it."""
+        box_key = name.strip('"') + uidvalidity
+        if(user not in self.data):
+            self.data[user] = {}
+            #print(user, "created.")
+        if(box_key not in self.data[user]):
+            self.data[user][box_key] = MailboxState(name, uidvalidity, user)
+            #print(box_key, "created.")
+        return self.data[user][box_key]
+    
+    def report_date_conflicts(self):
+        """Write a date conflicts report in a file."""
+        with open("conflict_stats.txt", 'w') as statsfile:
+            owner_total_conflicts = {}
+            owner_total_missing = {}
+            for user in self.data:
+                owner_total_conflicts[user] = 0
+                owner_total_missing[user] = 0
+                for box_key in self.data[user]:
+                    owner_total_conflicts[user] += self.data[user][box_key].date_conflicts
+                    owner_total_missing[user] += self.data[user][box_key].no_received_field
+                    statsfile.write("Total date conflicts to be corrected in a mailbox {0} are {1}.\n"\
+                                    .format(self.data[user][box_key].name, self.data[user][box_key].date_conflicts))
+                    statsfile.write("Total missing received headers in a mailbox {0} are {1}.\n"\
+                                    .format(self.data[user][box_key].name, self.data[user][box_key].no_received_field))
+                statsfile.write("Total date conflicts to be corrected for user {0} are {1}.\n\n"\
+                                .format(user, owner_total_missing[user]))
         return
index b494df6..f2eeedc 100644 (file)
@@ -13,15 +13,10 @@ This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
-
-Add '-t' argument when running the module for a test mode.
-For a detailed list of each message with a date conflict change
-the 'log_level' in the configuration file from '30' to '20'.
 '''
 
-import datetime
+import datetime, time
 import re
-import time
 
 #reg expressions
 RECEIVED_DATE = re.compile(r'(0?[1-9]|[1-2][0-9]|3[01])\s+([A-Z][a-z][a-z])\s+'
@@ -33,7 +28,8 @@ INTERNAL_DATE = re.compile(r'(?P<day>[ 0123][0-9])-(?P<mon>[A-Z][a-z][a-z])-(?P<
 CONTROL_SYMBOLS = re.compile(r'[\n\r\t]')
 
 class DateInterpreter:
-    """This class extracts dates from imap server responses and compares them."""
+    """This class extracts dates from imap server responses and compares them.
+    This class contains only static methods."""
 
     def __init__(self):
         return
index 7aa0675..bd95732 100644 (file)
@@ -13,40 +13,41 @@ This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
-
-Add '-t' argument when running the module for a test mode.
-For a detailed list of each message with a date conflict change
-the 'log_level' in the configuration file from '30' to '20'.
 '''
 
 import sys
 import csv
-import logging
-import configparser
+import logging, configparser
 from date_interpreter import DateInterpreter
 from mail_iterator import MailIterator
 from caching_data import CachingData
 
 def main():
     """Iterates through csv list of users and their mailboxes"""
-    if (len(sys.argv) > 1 and sys.argv[1]=="-t"):
-        test_mode = 1
+    if(len(sys.argv) > 1):
+        if(sys.argv[1]=="--h"):
+            print("The default mode of the script is test mode."
+                  "Add '--u' argument to exit to modify messages."
+                  "For a detailed list of each message with a date conflict change"
+                  "change the 'log_level' in the configuration file from '30' to '20'.")
+            return
+        if(sys.argv[1]=="--u"):
+            test_mode = False
     else:
-        test_mode = 0
+        test_mode = True
 
     config = load_configuration()
-    logging.basicConfig(filename='mailscript.log',
+    logging.basicConfig(filename='fix_imap_internaldate.log',
                         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                         level=config.getint('basic_settings', 'log_level'))
 
     date_interp = DateInterpreter()
-    cashing_data = CachingData()
-    logging.warning(cashing_data)
+    caching_data = CachingData()
+    logging.warning("Cache version %s loaded.\n\n", caching_data.version)
     user_reader = csv.DictReader(open("userdata.csv", "r"), delimiter=',')
 
     server = config.get('basic_settings', 'imap_server')
     tolerance = config.getint('basic_settings', 'tolerance')
-    total_per_box = {}
 
     for user in user_reader:
         try:
@@ -56,11 +57,10 @@ def main():
             continue
         for mailbox in session:
             try:
-                #special key to ensure better mailbox uniqueness
-                mailbox_key = mailbox[0].strip('"') + mailbox[1]
+                box = caching_data.retrieve_cached_mailbox(mailbox[0], mailbox[1], user['username'])
                 mail_ids = session.fetch_messages()
-                new_ids = cashing_data.sync_cached_mailbox(user['username'], mailbox_key, mail_ids)
-                #print(len(new_ids), "new out of", len(mail_ids), "in", mailbox)
+                new_ids = box.synchronize(mail_ids)
+                logging.warning("%s non-cached messages found out of %s in %s.\n", len(new_ids), len(mail_ids), box.name)
             except UserWarning as ex:
                 logging.error(ex)
                 continue
@@ -71,8 +71,9 @@ def main():
                     fetched_received_date = session.fetch_received_date(mid)
                     received_date = date_interp.extract_received_date(fetched_received_date)
                     if(received_date==""):
-                        logging.warning("No received date could be found in message uid: %s - mailbox: %s - user: %s.\n",
-                                        mid.decode("utf-8"), mailbox[0], user['username'])
+                        logging.info("No received date could be found in message uid: %s - mailbox: %s - user: %s.\n",
+                                        mid.decode("utf-8"), box.name, box.owner)
+                        box.no_received_field += 1
                         continue
                 except UserWarning as ex:
                     logging.error(ex)
@@ -81,29 +82,24 @@ def main():
                     #print(received_date, internal_date)
                     if(test_mode==0):
                         try:
-                            session.update_message(mid, mailbox[0], received_date)
+                            session.update_message(mid, box.name, received_date)
                         except UserWarning as ex:
                             logging.error(ex)
                             continue
                     else:
                         logging.info("Date conflict found in message uid: %s - mailbox: %s - user: %s.\nInternal date %s is different from received date %s from RECEIVED header:\n%s.",
-                                        mid.decode("utf-8"), mailbox[0], user['username'],
+                                        mid.decode("utf-8"), box.name, box.owner,
                                         internal_date.strftime("%d %b %Y %H:%M:%S"),
                                         received_date.strftime("%d %b %Y %H:%M:%S"),
                                         fetched_received_date[0][1].decode("utf-8").split("Received:")[1])
                     # count total emails for every user and mailbox
-                    user_key = user['username']+'|'+mailbox[0].strip('"')
-                    total_per_box[user_key] = 1 + total_per_box.get(user_key, 0)
+                    box.date_conflicts += 1
             # if all messages were successfully fixed confirm caching
-            cashing_data.commit_cached_mailbox(user['username'], mailbox_key)
+            if(not test_mode):
+                box.confirm_change()
+        
         # final report on date conflicts
-        total_per_user = 0
-        for warning in total_per_box:
-            total_per_user += total_per_box[warning]
-            logging.warning("Total date conflicts to be corrected in a mailbox %s are %s.",
-                         warning.split('|')[1], total_per_box[warning])
-        logging.warning("Total date conflicts to be corrected for user %s are %s.\n",
-                     user['username'], total_per_user)
+        caching_data.report_date_conflicts()
 
 def load_configuration():
     """Loads the script configuration from a file or creates such."""
index 1119945..be7964d 100644 (file)
@@ -13,10 +13,6 @@ This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
-
-Add '-t' argument when running the module for a test mode.
-For a detailed list of each message with a date conflict change
-the 'log_level' in the configuration file from '30' to '20'.
 '''
 
 import imaplib
@@ -29,6 +25,12 @@ UIDVAL_RESP = re.compile(r'(?P<name>.*) \(UIDVALIDITY (?P<uidval>.*)\)')
 class MailIterator:
     """This class communicates with the e-mail server."""
 
+    # class attributes
+    # IMAP4_SSL for connection with an IMAP server
+    mail_con = None
+    # list of tuples (mailboxname, uidvalidity) for the retrieved mailboxes
+    mailboxes = None
+
     def __init__(self, server, username, password):
         """Creates a connection and a user session."""
         self.mail_con = imaplib.IMAP4_SSL(server)
diff --git a/mailbox_state.py b/mailbox_state.py
new file mode 100644 (file)
index 0000000..e2bd86a
--- /dev/null
@@ -0,0 +1,108 @@
+'''
+mailbox_state.py - The module contains the MailboxState class.
+
+Copyright (c) 2012 Intra2net AG
+Author: Plamen Dimitrov
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+'''
+
+class MailboxState:
+    """This class is responsible for containing and updating a mailbox data."""
+
+    # class attributes
+    # string with quotation marks for the mailbox name
+    name = None
+    # string for the mailbox uidvalidity
+    uidvalidity = None
+    # string for user owning the mailbox
+    owner = None
+    # list of bytes for last cached mail uids
+    uids = None
+    # boolean flag for committing state changes
+    needs_save = None
+    # integer for found date conflicts
+    date_conflicts = None
+    # integer for found messages with missing received headers
+    no_received_field = None
+    # unique key for a mailbox
+    key = None
+
+    def __init__(self, name, uidvalidity, owner):
+        self.name = name
+        self.uidvalidity = uidvalidity
+        self.owner = owner
+
+        self.uids = []
+        self.needs_save = False
+
+        self.date_conflicts = 0
+        self.no_received_field = 0
+
+        #special key to ensure better mailbox uniqueness
+        self.key = self.name.strip('"') + self.uidvalidity
+
+        return
+
+    def __getstate__(self):
+        """Prepares the MailboxState instance for pickling."""
+        changed_dict = self.__dict__.copy()
+        # remove the following attributes for pickling
+        del changed_dict['needs_save']
+        del changed_dict['date_conflicts']
+        del changed_dict['no_received_field']
+        #print("pickling preparation complete")
+        return changed_dict
+
+    def __setstate__(self, dict):
+        """Prepares the MailboxState instance for unpickling."""
+        self.name = dict["name"]
+        self.uidvalidity = dict["uidvalidity"]
+        self.owner = dict["owner"]
+
+        self.uids = dict["uids"]
+        self.needs_save = False
+
+        self.date_conflicts = 0
+        self.no_received_field = 0
+        
+        self.key = dict["key"]
+        
+        #print("unpickling preparation complete")
+        return    
+
+    def __str__(self):
+        """Makes the class printable."""
+        return self.key
+
+    def synchronize(self, list_ids):
+        """Adds new messages to the cache and returns a list of them.
+        Confirm the changes to a mailbox to finally save it."""
+        new_ids = []
+        if(len(self.uids)==0):
+            new_ids = list_ids
+        else:
+            for uid in list_ids:
+                try:
+                    self.uids.index(uid)
+                    #print("found", uid, self.key)
+                except ValueError:
+                    new_ids.append(uid)
+                    #print("new", uid, self.key)
+        # update this mailbox potential uids
+        self.uids = list_ids
+        return new_ids
+
+    def confirm_change(self):
+        """Confirm the changes to the cached mailbox."""
+        self.needs_save = True
+        #print(self.owner, self.key, "committed.") 
+        return
index b7d240b..13da94b 100644 (file)
@@ -16,48 +16,87 @@ GNU General Public License for more details.
 '''
 
 import unittest
-import datetime
-import date_interpreter
+import datetime, date_interpreter
 
 class MailScriptTester(unittest.TestCase):
 
+    # class attributes
+    # DateInterpreter instance testing the DateInterpreter methods
+    date_interp = None
+    # datetime for comparison with extracted datetimes and assertions
+    true_date = None
+
     def setUp(self):
         self.date_interp = date_interpreter.DateInterpreter()
         self.true_date = datetime.datetime(2007, 12, 11, 18, 24, 35)
 
-    def test_received_header1(self):
+    def test_received_date_extraction1(self):
         """Tests the date extraction method."""
         date = [[0, b"Tue, 11 Dec 2007 18:24:35 +0100"]]
         extracted_date = self.date_interp.extract_received_date(date)
         self.assertEqual(extracted_date, self.true_date, "Failed date format 1")
 
-    def test_received_header2(self):
+    def test_received_date_extraction2(self):
         """Tests the date extraction method."""
         date = [[0, b"11 Dec 2007 \r\n18:24:35 +0100"]]
         extracted_date = self.date_interp.extract_received_date(date)
         self.assertEqual(extracted_date, self.true_date, "Failed date format 2")
         return
 
-    def test_received_header3(self):
+    def test_received_date_extraction3(self):
         """Tests the date extraction method."""  
         date = [[0, b"11 Dec 2007 18:24:35 +0100"]]
         extracted_date = self.date_interp.extract_received_date(date)
         self.assertEqual(extracted_date, self.true_date, "Failed date format 3")
 
-    def test_received_header4(self):
+    def test_received_date_extraction4(self):
         """Tests the date extraction method."""
         date = [[0, b"11 Dec 2007 18:24:35"]]
         extracted_date = self.date_interp.extract_received_date(date)
         #should not be equal because of time zone assumption
         self.assertNotEqual(extracted_date, self.true_date, "Failed date format 4")
 
-    def test_received_header5(self):
+    def test_received_date_extraction5(self):
         """Tests the received date extraction method."""
         date = [[0, b"11 Dec 2007 18:24:35 GMT"]]
         extracted_date = self.date_interp.extract_received_date(date)
         #should not be equal because of time zone assumption
         self.assertNotEqual(extracted_date, self.true_date, "Failed date format 5")
 
+    def test_received_date_extraction6(self):
+        """Tests the received date extraction method."""
+        date = [[0, b'Received: from intranator.m.i2n ([unix socket])'
+                b'by intranator.m.i2n with LMTPA; Tue, 11 Dec 2007 18:24:35'
+                b'+0100Received: from localhost (intranator.m.i2n [127.0.0.1])'
+                b'by localhost (Postfix) with ESMTP id 895812AC54for <intra2net_thomas@intranator.m.i2n>;'
+                b'Sun, 13 Mar 2011 18:47:18 +0100 (CET)Received: from re04.intra2net.com '
+                b'(re04.intra2net.com [82.165.46.26])(using TLSv1 with cipher ADH-AES256-SHA '
+                b'(256/256 bits))(No client certificate requested)by intranator.m.i2n (Postfix) with '
+                b'ESMTPS id 28DB92AC53for <thomas.jarosch@intra2net.com>; Sun, 13 Mar 2011 18:47:15 +0100 '
+                b'(CET)Received: from postfix.charite.de (postfix.charite.de [141.42.206.35])(using TLSv1 '
+                b'with cipher ADH-AES256-SHA (256/256 bits))(No client certificate requested)by '
+                b're04.intra2net.com (Postfix) with ESMTP id C054A3010Afor <thomas.jarosch@intra2net.com>; '
+                b'Sun, 13 Mar 2011 18:47:14 +0100 (CET)Received: from localhost (localhost [127.0.0.1])by '
+                b'de.postfix.org (Postfix) with ESMTP id 7FCCFF7879for <thomas.jarosch@intra2net.com>; '
+                b'Sun, 13 Mar 2011 18:47:14 +0100 (CET)Received: from de.postfix.org ([127.0.0.1])by '
+                b'localhost (de.postfix.org [127.0.0.1]) (amavisd-new, port 10026)with LMTP id '
+                b'YSXF-vf3+6E1 for <thomas.jarosch@intra2net.com>;Sun, 13 Mar 2011 18:47:14 +0100 (CET)'
+                b'Received: from de.postfix.org (localhost [127.0.0.1])by de.postfix.org (Postfix) with '
+                b'ESMTP id 3C3123DF1Efor <thomas.jarosch@intra2net.com>; Sun, 13 Mar 2011 18:46:33 +0100 '
+                b'(CET)Received: from localhost (localhost [127.0.0.1])by de.postfix.org (Postfix) with '
+                b'ESMTP id AB6CE3DBD2for <amavis-users@amavis.org>; Sun, 13 Mar 2011 18:45:57 +0100 (CET)'
+                b'Received: from de.postfix.org ([127.0.0.1])by localhost (de.postfix.org [127.0.0.1]) '
+                b'(amavisd-new, port 10024)with ESMTP id mBYiZO8wREeS for <amavis-users@amavis.org>;Sun, '
+                b'13 Mar 2011 18:45:56 +0100 (CET)Received: from mail.inetmsg.com (mail.inetmsg.com '
+                b'[173.10.94.185])by de.postfix.org (Postfix) with ESMTPSfor <amavis-users@amavis.org>; '
+                b'Sun, 13 Mar 2011 18:45:55 +0100 (CET)Received: from [192.168.1.107] (fw1.inetmsg.com '
+                b'[10.20.30.253])(using TLSv1 with cipher DHE-RSA-CAMELLIA256-SHA (256/256 bits))'
+                b'(No client certificate requested)by mail.inetmsg.com (INetMsg Mail Service) with ESMTPSA '
+                b'id 0B95326CD1for <amavis-users@amavis.org>; Sun, 13 Mar 2011 10:45:41 -0700 (PDT)"]]']]
+        extracted_date = self.date_interp.extract_received_date(date)
+        #should be equal: the first Received date in the header is expected to match true_date
+        self.assertEqual(extracted_date, self.true_date, "Failed date format 6")
+
     def test_compare_dates(self):
         """Tests the date comparison method."""
         self.true_date2 = datetime.datetime(2007, 12, 11, 18, 34, 35)