From: Christian Herdtweck <christian.herdtweck@intra2net.com>
Date: Mon, 15 Jul 2019 11:06:08 +0000 (+0200)
Subject: Add option to parse newer EmailMessage type from mails
X-Git-Tag: v1.6~1^2~1
X-Git-Url: http://developer.intra2net.com/git/?a=commitdiff_plain;h=4b44f515a98d16114153437c9782185dafcdc80c;p=pyi2ncommon

Add option to parse newer EmailMessage type from mails

So far, the parser policy was not specified, so "compat32" was used for
compatibility with python 3.2. Changing to the more modern "default" policy
returns a different type of message (EmailMessage instead of Message) that
differs in quite a few ways. For example, EmailMessage.get(header_name) no
longer returns a str but a Header object. This would break lots of code, so
keep the old compat32 as the default.
---

diff --git a/src/mail_utils.py b/src/mail_utils.py
index 19adb76..ed128df 100644
--- a/src/mail_utils.py
+++ b/src/mail_utils.py
@@ -43,6 +43,7 @@ import re
 import logging
 from email.utils import parsedate_to_datetime
 from email.parser import BytesParser
+from email import policy
 
 from . import arnied_wrapper
 
@@ -123,32 +124,51 @@ def create_users(usernames, config_file, params):
     log.info("%s users successfully created!", len(usernames))
 
 
-def parse_mail_file(file_name, headers_only=True, attachment_filenames=False):
+def parse_mail_file(file_name, headers_only=True, attachment_filenames=False,
+                    raise_on_defect=False, new_message_type=False):
     """
     Parse given email file (e.g. a banned message).
 
     This is basically a `email.parser.BytesParser().parse(...)` with given
-    `headers_only`, that can handle BSMTP. As an extra bonus, you can just
-    request headers plus the names of attached files.
+    `headers_only` and policy selection, that can also handle BSMTP. As an
+    extra bonus, you can just request headers plus the names of attached files.
 
     Removes the SMTP envelope surrounding the email if present. Only left-over
     might be a line with a '.' at end of non-multipart messages if
      `headers_only` is False.
 
-    :param str file_name: file name for the email
+    :param str file_name: path to the file that contains the email text
     :param bool headers_only: whether to parse only the email headers; set this
                               to False, e.g. if you want to check for
                               attachments using message.walk()
     :param bool attachment_filenames: if you just want headers and names of
                                       attached files, set `headers_only` and
                                       this to True.
+    :param bool raise_on_defect: whether to raise an error if email parser
+                                 encounters a defect (email policy `strict`) or
+                                 just add the defect to message's `defects`
+                                 attribute
+    :param bool new_message_type: whether to return the older
+                                  :py:class:`email.message.Message` (policy
+                                  `compat32`, our default), or the newer
+                                  :py:class:`email.message.EmailMessage` type
+                                  (policy `default`). Big difference!
     :returns: either msg or 2-tuple `(msg, filenames)` if requested per arg
               `attachment_filenames`
     :rtype: :py:class:`email.message.Message` or
-             (:py:class:`email.message.Message`, (str))
+             (:py:class:`email.message.Message`, (str)) or
+             one of these two with :py:class:`email.message.EmailMessage`
     """
     msg = None
     start_pos = 0
+
+    if new_message_type:
+        mail_policy = policy.default
+    else:
+        mail_policy = policy.compat32
+    if raise_on_defect:
+        mail_policy += policy.strict
+
     with open(file_name, 'rb') as read_handle:
         line = read_handle.readline()
         if line.startswith(b'EHLO'):
@@ -160,7 +180,8 @@ def parse_mail_file(file_name, headers_only=True, attachment_filenames=False):
         else:
             read_handle.seek(0)  # forget we read the first line already
         start_pos = read_handle.tell()
-        msg = BytesParser().parse(read_handle, headersonly=headers_only)
+        msg = BytesParser(policy=mail_policy).parse(read_handle,
+                                                    headersonly=headers_only)
 
     if not attachment_filenames:
         return msg
@@ -169,7 +190,8 @@ def parse_mail_file(file_name, headers_only=True, attachment_filenames=False):
     if headers_only:
         with open(file_name, 'rb') as read_handle:
             read_handle.seek(start_pos)
-            full_msg = BytesParser().parse(read_handle, headersonly=False)
+            full_msg = BytesParser(policy=mail_policy).parse(read_handle,
+                                                             headersonly=False)
     else:
         full_msg = msg
     filenames = [get_filename(part) for part in full_msg.walk()]
@@ -378,7 +400,8 @@ def get_filename(message, failobj=None, do_unwrap=True):
 
     :param message: message part, e.g. from
                     :py:meth:`email.message.Message.walk`
-    :type message: :py:class:`email.message.Message`
+    :type message: :py:class:`email.message.Message` or
+                   :py:class:`email.message.EmailMessage`
     :param failobj: object to return in case of failure (defaults to None)
     :param bool do_unwrap: undo line-break inserted by mail-creator; may remove
                            whitespace from file name; only applies to ascii
@@ -406,7 +429,7 @@ def get_filename(message, failobj=None, do_unwrap=True):
     # '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?='
 
     # This may be a re-implementation of email.utils.collapse_rfc2231_value()
-    # as mentioned in email.message.Message.get_param()
+    # as mentioned in email.message.EmailMessage.get_param()
 
     # The form is: "=?charset?encoding?encoded text?="
     SPLIT_REGEX = '\r?\n *'    # should be CRNL but some files miss the \r