def get_filename(message, failobj=None, do_unwrap=True):
    """
    Get filename of a message part, even if it is MIME-encoded

    For attachments whose whole Content-Disposition header is an encoded
    word (base64 or quoted-printable),
    :py:func:`email.message.Message.get_filename()` does not work. This
    function tries that first and, if it fails, tries to interpret the
    Content-Disposition of the message part. If all fails, returns `failobj`.

    Only for ascii filenames: also unwraps file names if they are line-wrapped.
    But note that this may remove too much whitespace from the filename if
    line-wrapping happened in the same position as the filename's whitespace.
    To get the original, still wrapped version, set param `do_unwrap` to
    `False`.

    See also: https://en.wikipedia.org/wiki/MIME#Encoded-Word

    :param message: message part, e.g. from
                    :py:meth:`email.message.Message.walk`
    :type message: :py:class:`email.message.Message`
    :param failobj: object to return in case of failure (defaults to None)
    :param bool do_unwrap: undo line-break inserted by mail-creator; may remove
                           whitespace from file name; only applies to ascii
                           file names
    :returns: either a string or failobj
    :raises ValueError: if an encoded word uses an encoding other than B or Q
                        or if its payload cannot be decoded
    :raises LookupError: if an encoded word names an unknown charset
    """
    import quopri   # only needed here; keeps the module's import block as is

    # Try the old way first. A private sentinel is used instead of `failobj`
    # so that a string `failobj` is never mistaken for a file name that the
    # standard parser actually found.
    missing = object()
    filename = message.get_filename(missing)

    if isinstance(filename, bytes) and not filename.startswith(b'=?') \
            and not filename.endswith(b'?='):
        filename = filename.decode('utf8')

    if isinstance(filename, str):
        if do_unwrap:
            return re.sub(r'[\r\n]+', '', filename)
        return filename

    if 'Content-Disposition' not in message:
        return failobj

    # try parsing content-disposition. e.g.:
    # attachment; filename="2018年度公开课计划表.xlsx"   -->
    # '=?utf-8?b?YXR0YWNobWVudDsgZmlsZW5hbWU9IjIwMTjlubTluqY=?=\r\n =?utf-8?b?'
    # '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?='

    # This may be a re-implementation of email.utils.collapse_rfc2231_value()
    # as mentioned in email.message.get_param()

    # The form is: "=?charset?encoding?encoded text?="
    SPLIT_REGEX = '\r?\n *'    # should be CRNL but some files miss the \r
    ENCODED_WORD_REGEX = r'\s*=\?([^?]+)\?([^?]+)\?(.*)\?=\s*$'
    # The optional quote is captured as '("?)' rather than '(")?': a
    # backreference to a group that did not take part in the match never
    # matches in python, so unquoted file names would always be rejected.
    LINE_REGEX = r'attachment\s*;\s*filename=("?)(.+)\1\s*$'

    decoded = []
    # str(): the header may be a Header object instead of a plain string
    for word in re.split(SPLIT_REGEX, str(message['Content-Disposition'])):
        match = re.match(ENCODED_WORD_REGEX, word)
        if not match:
            break     # not an encoded word; use what was decoded so far
        charset, encoding, data = match.groups()
        encoding = encoding.lower()
        if encoding == 'b':
            temp = b64decode(data)
        elif encoding == 'q':
            # header=True makes quopri turn "_" into a space
            temp = quopri.decodestring(data.encode('ascii'), header=True)
        else:
            raise ValueError('not allowed according to wikipedia: "{}"'
                             .format(encoding))
        decoded.append(temp.decode(charset))
    decoded = u''.join(decoded)

    match = re.match(LINE_REGEX, decoded)
    if match:
        return match.group(2)
    return failobj