From: Plamen Dimitrov Date: Tue, 26 Jun 2012 15:18:47 +0000 (+0200) Subject: Basic refactoring and Tom's recommendations X-Git-Url: http://developer.intra2net.com/git/?p=imap-fix-internaldate;a=commitdiff_plain;h=7a1d4c35586b37483eeba433eeb3ca46334095ef Basic refactoring and Tom's recommendations --- diff --git a/caching_data.py b/caching_data.py index ea3fb8c..e7c5f9e 100644 --- a/caching_data.py +++ b/caching_data.py @@ -14,82 +14,90 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ''' - +import os, tempfile import pickle +from mailbox_state import MailboxState + +CACHING_FILENAME = "caching_data.dat" class CachingData: """This class is responsible for the caching of data.""" + + # class attributes + # integer for version of the cache + version = None + # dictionary of usernames as keys and dictionaries as values + # the second dictionaries have unique mailbox keys and mailboxes as values + data = None def __init__(self): try: - cachingfile = open('caching_data.pkl', 'rb') - self.data = pickle.load(cachingfile) - #print(len(self.data), "users found.") - self.save_flag = {} - for user in self.data: - self.save_flag[user] = {} - for uid_key in self.data[user]: - self.save_flag[user][uid_key] = False + cachingfile = open(CACHING_FILENAME, 'rb') + self.version, self.data = pickle.load(cachingfile) + print("Cache version", self.version) + print(len(self.data), "users found.") except IOError: + self.version = 0 self.data = {} - self.save_flag = {} - with open('caching_data.pkl', 'wb') as cachingfile: - pickle.dump(self.data, cachingfile) + with open(CACHING_FILENAME, 'wb') as cachingfile: + pickle.dump((0, self.data), cachingfile) def __del__(self): - with open('caching_data.pkl', 'wb') as cachingfile: - # prepare data based on a save flag - for user in self.save_flag: - for uid_key in self.save_flag[user]: - if(not self.save_flag[user][uid_key]): 
- del self.data[user][uid_key] - #print(uidvalidity, "deleted from cache.") - if(len(self.data[user])==0): - del self.data[user] - #print(user, "deleted from cache.") - - # serialize in file - pickle.dump(self.data, cachingfile) - - #print(len(self.data), "users stored.") - - def _cache_new_mailbox(self, username, uid_key): - """Store the mailbox as integer uidvalidity""" - if(username not in self.data): - self.data[username] = {} - self.save_flag[username] = {} - #print(username, "created.") - if(uid_key not in self.data[username]): - self.data[username][uid_key] = [] - self.save_flag[username][uid_key] = False - #print(uid_key, "created.") - return - - def sync_cached_mailbox(self, username, uid_key, list_ids): - """Adds new messages to the cache and returns a list of them. - Confirm the changes to a mailbox to finally save it.""" - new_ids = [] + # create temporary file first + location = os.path.dirname(CACHING_FILENAME) + file_descriptor, tmpname = tempfile.mkstemp(dir=location) + cachingfile = os.fdopen(file_descriptor, 'wb') - if(username not in self.data or \ - uid_key not in self.data[username]): - self._cache_new_mailbox(username, uid_key) - new_ids = list_ids - else: - for uid in list_ids: - try: - self.data[username][uid_key].index(uid) - #print("found", uid, uid_key) - except ValueError: - #print("new", uid, uid_key) - new_ids.append(uid) + # prepare data based on a save flag + saved_data = {} + for user in self.data: + saved_data[user] = {} + for box_key in self.data[user]: + if(self.data[user][box_key].needs_save): + saved_data[user][box_key] = self.data[user][box_key] + print(saved_data[user][box_key].name, "will be saved.") + if(len(saved_data[user])==0): + del saved_data[user] + print(user, "will not be saved.") + self.data = saved_data - # update cached_mailbox - self.data[username][uid_key] = list_ids + # avoid test mode or cases where nothing needs saving + if(len(saved_data)==0): + os.unlink(tmpname) + return + # serialize in file + 
self.version += 1 + pickle.dump((self.version, self.data), cachingfile) + print(len(self.data), "users stored.") + cachingfile.close() + os.rename(tmpname, CACHING_FILENAME) - return new_ids - - def commit_cached_mailbox(self, username, uid_key): - """Confirm the chages to the cached mailbox.""" - self.save_flag[username][uid_key] = True - #print(username, uid_key, "committed.") + def retrieve_cached_mailbox(self, name, uidvalidity, user): + """Retrieve a cached mailbox or create it.""" + box_key = name.strip('"') + uidvalidity + if(user not in self.data): + self.data[user] = {} + #print(user, "created.") + if(box_key not in self.data[user]): + self.data[user][box_key] = MailboxState(name, uidvalidity, user) + #print(box_key, "created.") + return self.data[user][box_key] + + def report_date_conflicts(self): + """Write a date conflicts report in a file.""" + with open("conflict_stats.txt", 'w') as statsfile: + owner_total_conflicts = {} + owner_total_missing = {} + for user in self.data: + owner_total_conflicts[user] = 0 + owner_total_missing[user] = 0 + for box_key in self.data[user]: + owner_total_conflicts[user] += self.data[user][box_key].date_conflicts + owner_total_missing[user] += self.data[user][box_key].no_received_field + statsfile.write("Total date conflicts to be corrected in a mailbox {0} are {1}.\n"\ + .format(self.data[user][box_key].name, self.data[user][box_key].date_conflicts)) + statsfile.write("Total missing received headers in a mailbox {0} are {1}.\n"\ + .format(self.data[user][box_key].name, self.data[user][box_key].no_received_field)) + statsfile.write("Total date conflicts to be corrected for user {0} are {1}.\n\n"\ + .format(user, owner_total_missing[user])) return diff --git a/date_interpreter.py b/date_interpreter.py index b494df6..f2eeedc 100644 --- a/date_interpreter.py +++ b/date_interpreter.py @@ -13,15 +13,10 @@ This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 
of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -Add '-t' argument when running the module for a test mode. -For a detailed list of each message with a date conflict change -the 'log_level' in the configuration file from '30' to '20'. ''' -import datetime +import datetime, time import re -import time #reg expressions RECEIVED_DATE = re.compile(r'(0?[1-9]|[1-2][0-9]|3[01])\s+([A-Z][a-z][a-z])\s+' @@ -33,7 +28,8 @@ INTERNAL_DATE = re.compile(r'(?P[ 0123][0-9])-(?P[A-Z][a-z][a-z])-(?P< CONTROL_SYMBOLS = re.compile(r'[\n\r\t]') class DateInterpreter: - """This class extracts dates from imap server responses and compares them.""" + """This class extracts dates from imap server responses and compares them. + This class contains only static methods.""" def __init__(self): return diff --git a/fix_imap_internaldate.py b/fix_imap_internaldate.py index 7aa0675..bd95732 100644 --- a/fix_imap_internaldate.py +++ b/fix_imap_internaldate.py @@ -13,40 +13,41 @@ This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -Add '-t' argument when running the module for a test mode. -For a detailed list of each message with a date conflict change -the 'log_level' in the configuration file from '30' to '20'. ''' import sys import csv -import logging -import configparser +import logging, configparser from date_interpreter import DateInterpreter from mail_iterator import MailIterator from caching_data import CachingData def main(): """Iterates through csv list of users and their mailboxes""" - if (len(sys.argv) > 1 and sys.argv[1]=="-t"): - test_mode = 1 + if(len(sys.argv) > 1): + if(sys.argv[1]=="--h"): + print("The default mode of the script is test mode." + "Add '--u' argument to exit to modify messages." 
+ "For a detailed list of each message with a date conflict change" + "change the 'log_level' in the configuration file from '30' to '20'.") + return + if(sys.argv[1]=="--u"): + test_mode = False else: - test_mode = 0 + test_mode = True config = load_configuration() - logging.basicConfig(filename='mailscript.log', + logging.basicConfig(filename='fix_imap_internaldate.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=config.getint('basic_settings', 'log_level')) date_interp = DateInterpreter() - cashing_data = CachingData() - logging.warning(cashing_data) + caching_data = CachingData() + logging.warning("Cache version %s loaded.\n\n", caching_data.version) user_reader = csv.DictReader(open("userdata.csv", "r"), delimiter=',') server = config.get('basic_settings', 'imap_server') tolerance = config.getint('basic_settings', 'tolerance') - total_per_box = {} for user in user_reader: try: @@ -56,11 +57,10 @@ def main(): continue for mailbox in session: try: - #special key to ensure better mailbox uniqueness - mailbox_key = mailbox[0].strip('"') + mailbox[1] + box = caching_data.retrieve_cached_mailbox(mailbox[0], mailbox[1], user['username']) mail_ids = session.fetch_messages() - new_ids = cashing_data.sync_cached_mailbox(user['username'], mailbox_key, mail_ids) - #print(len(new_ids), "new out of", len(mail_ids), "in", mailbox) + new_ids = box.synchronize(mail_ids) + logging.warning("%s non-cached messages found out of %s in %s.\n", len(new_ids), len(mail_ids), box.name) except UserWarning as ex: logging.error(ex) continue @@ -71,8 +71,9 @@ def main(): fetched_received_date = session.fetch_received_date(mid) received_date = date_interp.extract_received_date(fetched_received_date) if(received_date==""): - logging.warning("No received date could be found in message uid: %s - mailbox: %s - user: %s.\n", - mid.decode("utf-8"), mailbox[0], user['username']) + logging.info("No received date could be found in message uid: %s - mailbox: %s - user: 
%s.\n", + mid.decode("utf-8"), box.name, box.owner) + box.no_received_field += 1 continue except UserWarning as ex: logging.error(ex) @@ -81,29 +82,24 @@ def main(): #print(received_date, internal_date) if(test_mode==0): try: - session.update_message(mid, mailbox[0], received_date) + session.update_message(mid, box.name, received_date) except UserWarning as ex: logging.error(ex) continue else: logging.info("Date conflict found in message uid: %s - mailbox: %s - user: %s.\nInternal date %s is different from received date %s from RECEIVED header:\n%s.", - mid.decode("utf-8"), mailbox[0], user['username'], + mid.decode("utf-8"), box.name, box.owner, internal_date.strftime("%d %b %Y %H:%M:%S"), received_date.strftime("%d %b %Y %H:%M:%S"), fetched_received_date[0][1].decode("utf-8").split("Received:")[1]) # count total emails for every user and mailbox - user_key = user['username']+'|'+mailbox[0].strip('"') - total_per_box[user_key] = 1 + total_per_box.get(user_key, 0) + box.date_conflicts += 1 # if all messages were successfully fixed confirm caching - cashing_data.commit_cached_mailbox(user['username'], mailbox_key) + if(not test_mode): + box.confirm_change() + # final report on date conflicts - total_per_user = 0 - for warning in total_per_box: - total_per_user += total_per_box[warning] - logging.warning("Total date conflicts to be corrected in a mailbox %s are %s.", - warning.split('|')[1], total_per_box[warning]) - logging.warning("Total date conflicts to be corrected for user %s are %s.\n", - user['username'], total_per_user) + caching_data.report_date_conflicts() def load_configuration(): """Loads the script configuration from a file or creates such.""" diff --git a/mail_iterator.py b/mail_iterator.py index 1119945..be7964d 100644 --- a/mail_iterator.py +++ b/mail_iterator.py @@ -13,10 +13,6 @@ This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
PURPOSE. See the GNU General Public License for more details. - -Add '-t' argument when running the module for a test mode. -For a detailed list of each message with a date conflict change -the 'log_level' in the configuration file from '30' to '20'. ''' import imaplib @@ -29,6 +25,12 @@ UIDVAL_RESP = re.compile(r'(?P.*) \(UIDVALIDITY (?P.*)\)') class MailIterator: """This class communicates with the e-mail server.""" + # class attributes + # IMAP4_SSL for connection with an IMAP server + mail_con = None + # list of tuples (uidvalidity, mailboxname) for the retrieved mailboxes + mailboxes = None + def __init__(self, server, username, password): """Creates a connection and a user session.""" self.mail_con = imaplib.IMAP4_SSL(server) diff --git a/mailbox_state.py b/mailbox_state.py new file mode 100644 index 0000000..e2bd86a --- /dev/null +++ b/mailbox_state.py @@ -0,0 +1,108 @@ +''' +mailbox_state.py - The module contains the MailboxState class. + +Copyright (c) 2012 Intra2net AG +Author: Plamen Dimitrov + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+''' + +class MailboxState: + """This class is responsible for containing and updating a mailbox data.""" + + # class attributes + # string with quotation marks for the mailbox name + name = None + # string for the mailbox uidvalidity + uidvalidity = None + # string for user owning the mailbox + owner = None + # list of bytes for last cached mail uids + uids = None + # boolean flag for committing state changes + needs_save = None + # integer for found date conflicts + date_conflicts = None + # integer for found messages with missing received headers + no_received_field = None + # unique key for a mailbox + key = None + + def __init__(self, name, uidvalidity, owner): + self.name = name + self.uidvalidity = uidvalidity + self.owner = owner + + self.uids = [] + self.needs_save = False + + self.date_conflicts = 0 + self.no_received_field = 0 + + #special key to ensure better mailbox uniqueness + self.key = self.name.strip('"') + self.uidvalidity + + return + + def __getstate__(self): + """Prepares the MailboxState instance for pickling.""" + changed_dict = self.__dict__.copy() + # remove the following attributes for pickling + del changed_dict['needs_save'] + del changed_dict['date_conflicts'] + del changed_dict['no_received_field'] + #print("pickling preparation complete") + return changed_dict + + def __setstate__(self, dict): + """Prepares the MailboxState instance for unpickling.""" + self.name = dict["name"] + self.uidvalidity = dict["uidvalidity"] + self.owner = dict["owner"] + + self.uids = dict["uids"] + self.needs_save = False + + self.date_conflicts = 0 + self.no_received_field = 0 + + self.key = dict["key"] + + #print("unpickling preparation complete") + return + + def __str__(self): + """Makes the class printable.""" + return self.key + + def synchronize(self, list_ids): + """Adds new messages to the cache and returns a list of them. 
+ Confirm the changes to a mailbox to finally save it.""" + new_ids = [] + if(len(self.uids)==0): + new_ids = list_ids + else: + for uid in list_ids: + try: + self.uids.index(uid) + #print("found", uid, self.key) + except ValueError: + new_ids.append(uid) + #print("new", uid, self.key) + # update this mailbox potential uids + self.uids = list_ids + return new_ids + + def confirm_change(self): + """Confirm the changes to the cached mailbox.""" + self.needs_save = True + #print(self.owner, self.key, "committed.") + return diff --git a/unit_tester.py b/unit_tester.py index b7d240b..13da94b 100644 --- a/unit_tester.py +++ b/unit_tester.py @@ -16,48 +16,87 @@ GNU General Public License for more details. ''' import unittest -import datetime -import date_interpreter +import datetime, date_interpreter class MailScriptTester(unittest.TestCase): + # class attributes + # DateInterpreter instance testing the DateInterpreter methods + date_interp = None + # datetime for comparison with extracted datetimes and assertions + true_date = None + def setUp(self): self.date_interp = date_interpreter.DateInterpreter() self.true_date = datetime.datetime(2007, 12, 11, 18, 24, 35) - def test_received_header1(self): + def test_received_date_extraction1(self): """Tests the date extraction method.""" date = [[0, b"Tue, 11 Dec 2007 18:24:35 +0100"]] extracted_date = self.date_interp.extract_received_date(date) self.assertEqual(extracted_date, self.true_date, "Failed date format 1") - def test_received_header2(self): + def test_received_date_extraction2(self): """Tests the date extraction method.""" date = [[0, b"11 Dec 2007 \r\n18:24:35 +0100"]] extracted_date = self.date_interp.extract_received_date(date) self.assertEqual(extracted_date, self.true_date, "Failed date format 2") return - def test_received_header3(self): + def test_received_date_extraction3(self): """Tests the date extraction method.""" date = [[0, b"11 Dec 2007 18:24:35 +0100"]] extracted_date = 
self.date_interp.extract_received_date(date) self.assertEqual(extracted_date, self.true_date, "Failed date format 3") - def test_received_header4(self): + def test_received_date_extraction4(self): """Tests the date extraction method.""" date = [[0, b"11 Dec 2007 18:24:35"]] extracted_date = self.date_interp.extract_received_date(date) #should not be equal because of time zone assumption self.assertNotEqual(extracted_date, self.true_date, "Failed date format 4") - def test_received_header5(self): + def test_received_date_extraction5(self): """Tests the received date extraction method.""" date = [[0, b"11 Dec 2007 18:24:35 GMT"]] extracted_date = self.date_interp.extract_received_date(date) #should not be equal because of time zone assumption self.assertNotEqual(extracted_date, self.true_date, "Failed date format 5") + def test_received_date_extraction6(self): + """Tests the received date extraction method.""" + date = [[0, b'Received: from intranator.m.i2n ([unix socket])' + b'by intranator.m.i2n with LMTPA; Tue, 11 Dec 2007 18:24:35' + b'+0100Received: from localhost (intranator.m.i2n [127.0.0.1])' + b'by localhost (Postfix) with ESMTP id 895812AC54for ;' + b'Sun, 13 Mar 2011 18:47:18 +0100 (CET)Received: from re04.intra2net.com ' + b'(re04.intra2net.com [82.165.46.26])(using TLSv1 with cipher ADH-AES256-SHA ' + b'(256/256 bits))(No client certificate requested)by intranator.m.i2n (Postfix) with ' + b'ESMTPS id 28DB92AC53for ; Sun, 13 Mar 2011 18:47:15 +0100 ' + b'(CET)Received: from postfix.charite.de (postfix.charite.de [141.42.206.35])(using TLSv1 ' + b'with cipher ADH-AES256-SHA (256/256 bits))(No client certificate requested)by ' + b're04.intra2net.com (Postfix) with ESMTP id C054A3010Afor ; ' + b'Sun, 13 Mar 2011 18:47:14 +0100 (CET)Received: from localhost (localhost [127.0.0.1])by ' + b'de.postfix.org (Postfix) with ESMTP id 7FCCFF7879for ; ' + b'Sun, 13 Mar 2011 18:47:14 +0100 (CET)Received: from de.postfix.org ([127.0.0.1])by ' + b'localhost 
(de.postfix.org [127.0.0.1]) (amavisd-new, port 10026)with LMTP id ' + b'YSXF-vf3+6E1 for ;Sun, 13 Mar 2011 18:47:14 +0100 (CET)' + b'Received: from de.postfix.org (localhost [127.0.0.1])by de.postfix.org (Postfix) with ' + b'ESMTP id 3C3123DF1Efor ; Sun, 13 Mar 2011 18:46:33 +0100 ' + b'(CET)Received: from localhost (localhost [127.0.0.1])by de.postfix.org (Postfix) with ' + b'ESMTP id AB6CE3DBD2for ; Sun, 13 Mar 2011 18:45:57 +0100 (CET)' + b'Received: from de.postfix.org ([127.0.0.1])by localhost (de.postfix.org [127.0.0.1]) ' + b'(amavisd-new, port 10024)with ESMTP id mBYiZO8wREeS for ;Sun, ' + b'13 Mar 2011 18:45:56 +0100 (CET)Received: from mail.inetmsg.com (mail.inetmsg.com ' + b'[173.10.94.185])by de.postfix.org (Postfix) with ESMTPSfor ; ' + b'Sun, 13 Mar 2011 18:45:55 +0100 (CET)Received: from [192.168.1.107] (fw1.inetmsg.com ' + b'[10.20.30.253])(using TLSv1 with cipher DHE-RSA-CAMELLIA256-SHA (256/256 bits))' + b'(No client certificate requested)by mail.inetmsg.com (INetMsg Mail Service) with ESMTPSA ' + b'id 0B95326CD1for ; Sun, 13 Mar 2011 10:45:41 -0700 (PDT)"]]']] + extracted_date = self.date_interp.extract_received_date(date) + #should be equal to the date of the first received header in the chain + self.assertEqual(extracted_date, self.true_date, "Failed date format 6") + def test_compare_dates(self): """Tests the date comparison method.""" self.true_date2 = datetime.datetime(2007, 12, 11, 18, 34, 35)