From: Christian Herdtweck Date: Mon, 15 Jul 2019 11:06:08 +0000 (+0200) Subject: Add option to parse newer EmailMessage type from mails X-Git-Tag: v1.6~1^2~1 X-Git-Url: http://developer.intra2net.com/git/?a=commitdiff_plain;h=4b44f515a98d16114153437c9782185dafcdc80c;p=pyi2ncommon Add option to parse newer EmailMessage type from mails So far, the parser policy was not specified, so "compat32" was used for compatibility with Python 3.2. Changing to the more modern "default" policy returns a different type of message (EmailMessage instead of Message) that differs in quite a few ways. For example, EmailMessage.get(header_name) no longer returns a str but a Header object. This would break lots of code, so still use the old compat32 as the default. --- diff --git a/src/mail_utils.py b/src/mail_utils.py index 19adb76..ed128df 100644 --- a/src/mail_utils.py +++ b/src/mail_utils.py @@ -43,6 +43,7 @@ import re import logging from email.utils import parsedate_to_datetime from email.parser import BytesParser +from email import policy from . import arnied_wrapper @@ -123,32 +124,51 @@ def create_users(usernames, config_file, params): log.info("%s users successfully created!", len(usernames)) -def parse_mail_file(file_name, headers_only=True, attachment_filenames=False): +def parse_mail_file(file_name, headers_only=True, attachment_filenames=False, + raise_on_defect=False, new_message_type=False): """ Parse given email file (e.g. a banned message). This is basically a `email.parser.BytesParser().parse(...)` with given - `headers_only`, that can handle BSMTP. As an extra bonus, you can just - request headers plus the names of attached files. + `headers_only` and policy selection, that can also handle BSMTP. As an + extra bonus, you can just request headers plus the names of attached files. Removes the SMTP envelope surrounding the email if present. Only left-over might be a line with a '.' at end of non-multipart messages if `headers_only` is False. 
- :param str file_name: file name for the email + :param str file_name: path to the file that contains the email text :param bool headers_only: whether to parse only the email headers; set this to False, e.g. if you want to check for attachments using message.walk() :param bool attachment_filenames: if you just want headers and names of attached files, set `headers_only` and this to True. + :param bool raise_on_defect: whether to raise an error if email parser + encounters a defect (email policy `strict`) or + just add the defect to message's `defect` + attribute + :param bool new_message_type: whether to return the older + :py:class:`email.message.Message` (policy + `compat32`, our default), or the newer + :py:class:`email.message.EmailMessage` type + (policy `default`). Big difference! :returns: either msg or 2-tuple `(msg, filenames)` if requested per arg `attachment_filenames` :rtype: :py:class:`email.message.Message` or - (:py:class:`email.message.Message`, (str)) + (:py:class:`email.message.Message`, (str)) or + one of these two with :py:class:`email.message.EmailMessage` """ msg = None start_pos = 0 + + if new_message_type: + mail_policy = policy.default + else: + mail_policy = policy.compat32 + if raise_on_defect: + mail_policy += policy.strict + with open(file_name, 'rb') as read_handle: line = read_handle.readline() if line.startswith(b'EHLO'): @@ -160,7 +180,8 @@ def parse_mail_file(file_name, headers_only=True, attachment_filenames=False): else: read_handle.seek(0) # forget we read the first line already start_pos = read_handle.tell() - msg = BytesParser().parse(read_handle, headersonly=headers_only) + msg = BytesParser(policy=mail_policy).parse(read_handle, + headersonly=headers_only) if not attachment_filenames: return msg @@ -169,7 +190,8 @@ def parse_mail_file(file_name, headers_only=True, attachment_filenames=False): if headers_only: with open(file_name, 'rb') as read_handle: read_handle.seek(start_pos) - full_msg = BytesParser().parse(read_handle, 
headersonly=False) + full_msg = BytesParser(policy=mail_policy).parse(read_handle, + headersonly=False) else: full_msg = msg filenames = [get_filename(part) for part in full_msg.walk()] @@ -378,7 +400,8 @@ def get_filename(message, failobj=None, do_unwrap=True): :param message: message part, e.g. from :py:meth:`email.message.Message.walk` - :type message: :py:class:`email.message.Message` + :type message: :py:class:`email.message.Message` or + :py:class:`email.message.EmailMessage` :param failobj: object to return in case of failure (defaults to None) :param bool do_unwrap: undo line-break inserted by mail-creator; may remove whitespace from file name; only applies to ascii @@ -406,7 +429,7 @@ def get_filename(message, failobj=None, do_unwrap=True): # '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?=' # This may be a re-implementation of email.utils.collapse_rfc2231_value() - # as mentioned in email.message.Message.get_param() + # as mentioned in email.message.EmailMessage.get_param() # The form is: "=?charset?encoding?encoded text?=" SPLIT_REGEX = '\r?\n *' # should be CRNL but some files miss the \r