[pyi2ncommon] / src / mail_utils.py

# This Python file uses the following encoding: utf-8

# The software in this package is distributed under the GNU General
# Public License version 2 (with a special exception described below).
#
# A copy of GNU General Public License (GPL) is included in this distribution,
# in the file COPYING.GPL.
#
# As a special exception, if other files instantiate templates or use macros
# or inline functions from this file, or you compile this file and link it
# with other works to produce a work based on this file, this file
# does not by itself cause the resulting work to be covered
# by the GNU General Public License.
#
# However the source code for this file must still be made available
# in accordance with section (3) of the GNU General Public License.
#
# This exception does not invalidate any other reasons why a work based
# on this file might be covered by the GNU General Public License.
#
# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>

"""
Utilities for dealing with email.

.. seealso:: :py:mod:`pyi2ncommon.mail_validator`,
             :py:mod:`pyi2ncommon.imap_mailbox`

Copyright: Intra2net AG
"""

import logging
import os
import quopri
import re
from base64 import b64decode
from email import policy
from email.parser import BytesParser
from email.utils import parsedate_to_datetime

# outsourced source, import required for compatibility
from .imap_mailbox import ImapMailbox           # pylint: disable=unused-import
from .mail_validator import *                   # pylint: disable=unused-import
from .sysmisc import replace_file_regex

# module-level logger named after this module
log = logging.getLogger('pyi2ncommon.mail_utils')


def prep_email_header(email_file, value, regex=None, criterion="envelopeto"):
    """
    Replace a header value in a provided email file (in place).

    For criterion 'envelopeto' the work is delegated to
    :py:func:`replace_file_regex`. For criterion 'received' the file is read,
    every match of `regex` is replaced by `value` (semantics of
    :py:func:`re.sub`, so group references in `value` are expanded) and the
    result is written back to the same file.

    :param str email_file: file to use for the replacement
    :param str value: replacement text
    :param regex: regular expression to use when replacing a header value;
                  required for criterion 'received' (:py:func:`re.sub` does
                  not accept None as pattern)
    :type regex: str or None
    :param str criterion: criterion to use for replacement, one
                          of 'envelopeto' or 'received'
    :raises: :py:class:`ValueError` if the choice of criterion is invalid

    .. todo:: In some cases this function is reusing arnied wrapper's cnf
              value preparation but for email headers.
    """
    if criterion == "envelopeto":
        logging.debug("Updating test emails' EnvelopeTo header")
        replace_file_regex(email_file, value, regex=regex)
    elif criterion == "received":
        logging.debug("Updating test emails' Received header")
        with open(email_file, "r") as file_handle:
            email_text = file_handle.read()
        # Substitute exactly once: running the same substitution a second time
        # would modify the text again whenever the replacement itself matches
        # the regex.
        email_text = re.sub(regex, value, email_text)
        with open(email_file, "w") as file_handle:
            file_handle.write(email_text)
    else:
        raise ValueError("Invalid header preparation criterion '%s'"
                         % criterion)


def parse_mail_file(file_name, headers_only=True, attachment_filenames=False,
                    raise_on_defect=False, new_message_type=False):
    """
    Parse given email file (e.g. a banned message).

    This is basically a `email.parser.BytesParser().parse(...)` with given
    `headers_only` and policy selection, that can also handle BSMTP. As an
    extra bonus, you can just request headers plus the names of attached files.

    Removes the SMTP envelope surrounding the email if present. Only left-over
    might be a line with a '.' at end of non-multipart messages if
    `headers_only` is False.

    :param str file_name: path to the file that contains the email text
    :param bool headers_only: whether to parse only the email headers; set this
                              to False, e.g. if you want to check for
                              attachments using message.walk()
    :param bool attachment_filenames: if you just want headers and names of
                                      attached files, set `headers_only` and
                                      this to True.
    :param bool raise_on_defect: whether to raise an error if email parser
                                 encounters a defect (email policy `strict`) or
                                 just add the defect to message's `defect`
                                 attribute
    :param bool new_message_type: whether to return the older
                                  :py:class:`email.message.Message` (policy
                                  `compat32`, our default), or the newer
                                  :py:class:`email.message.EmailMessage` type
                                  (policy `default`). Big difference!
    :returns: either msg or 2-tuple `(msg, filenames)` if requested per arg
              `attachment_filenames`
    :rtype: :py:class:`email.message.Message` or
             (:py:class:`email.message.Message`, (str)) or
             one of these two with :py:class:`email.message.EmailMessage`
    """
    msg = None
    start_pos = 0

    if new_message_type:
        mail_policy = policy.default
    else:
        mail_policy = policy.compat32
    if raise_on_defect:
        mail_policy += policy.strict

    with open(file_name, 'rb') as read_handle:
        line = read_handle.readline()
        if line.startswith(b'EHLO'):
            # there is a smtp header. skip to its end
            while line.strip() != b'DATA':
                line = read_handle.readline()
            # the rest is the email plus a trailing '.' (ignored by parser if
            # multipart)
        else:
            read_handle.seek(0)  # forget we read the first line already
        start_pos = read_handle.tell()
        msg = BytesParser(policy=mail_policy).parse(read_handle,
                                                    headersonly=headers_only)

    if not attachment_filenames:
        return msg

    # otherwise need to parse complete message to get attachment file names
    if headers_only:
        with open(file_name, 'rb') as read_handle:
            read_handle.seek(start_pos)
            full_msg = BytesParser(policy=mail_policy).parse(read_handle,
                                                             headersonly=False)
    else:
        full_msg = msg
    filenames = [get_filename(part) for part in full_msg.walk()]
    return msg, tuple(filename for filename in filenames
                      if filename is not None)


def parse_mail_date(message):
    """
    Convert the 'Date' header of the given message to a datetime.

    Thin wrapper around :py:func:`email.utils.parsedate_to_datetime` that
    copes with a missing or empty header.

    Not needed for the newer :py:class:`email.message.EmailMessage`, whose
    `Date` header is already parsed to a
    :py:class:`email.headerregistry.DateHeader`.

    :param message: Email message
    :type message: :py:class:`email.message.Message`
    :returns: datetime from Email "Date" header or None if header is missing
              or empty
    :rtype: :py:class:`datetime.datetime` or None
    """
    raw_date = message.get('Date', '')
    return parsedate_to_datetime(raw_date) if raw_date else None


def get_user_mail_files(user, mailbox='INBOX'):
    """
    Yield the file names of all mails in a mailbox folder of a user.

    Looks at the local cyrus file system, does not talk to an imap server.
    Only directory entries whose name starts with digits followed by a dot
    are considered to be mails.

    :param str user: Name of user whose mailbox is analyzed
    :param str mailbox: name of mailbox to use, INBOX (default) for base
                        folder; each path component is converted using
                        :py:func:`cyrus_escape`
    :returns: nothing; but yields full path to messages on disc
    """
    user_dir = os.path.join('/datastore', 'imap-mails', 'user', user)

    # a leading "INBOX" component is implicit on disc:
    # "INBOX/sub/dir" is stored in "sub/dir"
    components = mailbox.split('/')
    if components[0].upper() == 'INBOX':
        components = components[1:]
    mail_dir = os.path.join(user_dir,
                            *(cyrus_escape(name) for name in components))

    for entry in os.listdir(mail_dir):
        if re.match(r'\d+\.', entry):
            yield os.path.join(mail_dir, entry)


def get_user_mail(user, mailbox='INBOX', **kwargs):
    """
    Yield the parsed mails in a mailbox folder of a user.

    :param str user: see :py:func:`get_user_mail_files`
    :param str mailbox: see :py:func:`get_user_mail_files`
    :param dict kwargs: all other args are forwarded to
                        :py:func:`parse_mail_file`
    :returns: nothing; but yields 2-tuples (path, email_msg) where first is the
              full path to the message on disc, and the latter is the outcome
              of :py:func:`parse_mail_file` for that file
    """
    for mail_path in get_user_mail_files(user, mailbox):
        parsed = parse_mail_file(mail_path, **kwargs)
        yield mail_path, parsed


def get_message_text(filename, fallback_encoding='iso8859-1',
                     include_all_text=False):
    """
    Return the text of an email message file as string(s).

    Meant to be used together with get_user_mail, e.g. ::

        for filename, msg in get_user_mail(user):
            # rough filtering based on headers
            if msg['Subject'] != 'Expected Subject':
                continue
            # get message text for closer inspection
            text = get_message_text(filename)
            if 'Expected Text' not in text:
                continue
            ...

    The preferred part is the first one of type `text/plain`; if there is
    none, the first part of any other `text/*` type is used; if there is none
    of those either, the first non-multipart part is used. Parts are decoded
    using the charset given in the mail or otherwise the fallback encoding.

    If include_all_text is True, all `text/*` parts are returned, the
    `text/plain` ones first.

    :param str filename: complete path of message file in filesystem
    :param str fallback_encoding: Encoding of email text if none is specified
                                  in mail.
    :param bool include_all_text: include all `text/*` parts in returned text
    :returns: text(s) of message
    :rtype: [str] if include_all_text else str
    """
    def decoded_text(part):
        """Decode payload of a message part to str."""
        charset = part.get_content_charset(fallback_encoding)
        return part.get_payload(decode=True).decode(charset)

    message = parse_mail_file(filename, headers_only=False)

    # first choice: text/plain
    texts = [decoded_text(part) for part in message.walk()
             if part.get_content_type() == 'text/plain']
    if texts and not include_all_text:
        return texts[0]

    # second choice (or addition, if all text was requested): other "text/"
    for part in message.walk():
        content_type = part.get_content_type()
        if content_type == 'text/plain':
            continue
        if not content_type.startswith('text/'):
            continue
        texts.append(decoded_text(part))

    if texts:
        return texts if include_all_text else texts[0]

    # no "text/" at all: descend to the very first non-multipart part
    first_part = message
    while first_part.is_multipart():
        first_part = first_part.get_payload(0)

    only_text = decoded_text(first_part)
    return [only_text] if include_all_text else only_text


def cyrus_escape(user_or_folder, keep_path=False, regex=False):
    """
    Convert names of users or mailbox folders to cyrus format.

    This is quite a hack that only applies a fixed list of replacements:

    * . --> ^
    * ü --> &APw-  ,  ö --> &APY-  ,  ä --> &AOQ-
      (if need more: this is modified utf-7)
    * inbox -->   (the empty string; also for INBOX and Inbox)
    * / --> .  (except if keep_path is True)

    A general modified utf-7-encoder/decoder would be nicer but python has
    none builtin (see https://bugs.python.org/issue5305) and an extra lib like
    https://bitbucket.org/mjs0/imapclient/ would be overkill. After all, we
    control the input to this function via params and this is enough umlaut-
    testing I think...

    :param str user_or_folder: name of the user or folder string to escape
    :param bool keep_path: do not replace '/' with '.' so can still use result
                           as path name
    :param bool regex: result is used in grep or other regex, so ^, &, . and $
                       are escaped again with a backslash
    :returns: escaped user or folder string
    :rtype: str

    .. seealso:: :py:func:`cyrus_unescape`
    """
    # replacements are applied one after the other, so their order matters
    substitutions = [
        ('.', '^'),
        ('ü', '&APw-'),
        ('ä', '&AOQ-'),
        ('ö', '&APY-'),
        ('inbox', ''),
        ('INBOX', ''),
        ('Inbox', ''),
    ]
    if not keep_path:
        substitutions.append(('/', '.'))
    if regex:
        substitutions.extend([
            ('^', r'\^'),
            ('&', r'\&'),
            ('.', r'\.'),
            ('$', r'\$'),
        ])

    escaped = user_or_folder
    for old, new in substitutions:
        escaped = escaped.replace(old, new)
    return escaped


def cyrus_unescape(user_or_folder):
    """
    Revert (some of) the replacements done by :py:func:`cyrus_escape`.

    The empty string becomes 'inbox'; otherwise '.' is turned back into '/'
    and '^' (with or without a preceding backslash) back into '.'.

    :param str user_or_folder: name of the user or folder string to unescape
    :returns: unescaped user or folder string
    :rtype: str
    """
    if user_or_folder == '':
        return 'inbox'
    # dots must become slashes before the carets are turned into dots
    with_slashes = user_or_folder.replace('.', '/')
    without_regex_escapes = with_slashes.replace(r'\^', '.')
    return without_regex_escapes.replace('^', '.')


def get_filename(message, failobj=None, do_unwrap=True):
    """
    Get filename of a message part, even if it is base64-encoded.

    For attachments whose complete Content-Disposition header is given as
    MIME encoded-words, :py:func:`email.message.Message.get_filename()` does
    not work. This function tries that first and if it fails tries to
    interprete the Content-Disposition of the message part, decoding both
    base64 ("B") and quoted-printable ("Q") encoded-words. If all fails,
    returns `failobj`.

    Only for ascii filenames: also unwraps file names if they are line-wrapped.
    But note that this may remove too much whitespace from the filename if
    line-wrapping happened in the same position as the filename's whitespace.
    To keep the file name as it is, set param `do_unwrap` to `False`.

    See also: https://en.wikipedia.org/wiki/MIME#Encoded-Word

    :param message: message part, e.g. from
                    :py:meth:`email.message.Message.walk`
    :type message: :py:class:`email.message.Message` or
                   :py:class:`email.message.EmailMessage`
    :param failobj: object to return in case of failure (defaults to None)
    :param bool do_unwrap: undo line-break inserted by mail-creator; may remove
                           whitespace from file name; only applies to ascii
                           file names
    :returns: either a string or failobj
    :raises: :py:class:`ValueError` if an encoded-word uses an encoding other
             than "B" or "Q"
    """
    # try the old way and unwrap
    filename = message.get_filename(failobj)

    if isinstance(filename, bytes) and not filename.startswith(b'=?') \
            and not filename.endswith(b'?='):
        filename = filename.decode('utf8')

    if isinstance(filename, str):
        if do_unwrap:
            return re.sub('[\\r\\n]+', '', filename)
        return filename

    if 'Content-Disposition' not in message:
        return failobj

    # try parsing content-disposition. e.g.:
    # attachment; filename="2018年度公开课计划表.xlsx"   -->
    # '=?utf-8?b?YXR0YWNobWVudDsgZmlsZW5hbWU9IjIwMTjlubTluqY=?=\r\n =?utf-8?b?'
    # '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?='

    # This may be a re-implementation of email.utils.collapse_rfc2231_value()
    # as mentioned in email.message.EmailMessage.get_param()

    # The form is: "=?charset?encoding?encoded text?="
    split_regex = '\r?\n *'    # should be CRNL but some files miss the \r
    encoded_word_regex = r'\s*=\?([^?]+)\?([^?]+)\?(.*)\?=\s*$'
    # The quote group must always take part in the match (possibly empty):
    # a back-reference to a group that did not match never matches, which
    # would make all unquoted file names fail.
    line_regex = r'attachment\s*;\s*filename=("?)(.+)\1\s*$'

    # the header value may be an email.header.Header object (compat32 policy
    # with non-ascii data), so make sure we split a real string
    header_text = str(message['Content-Disposition'])

    decoded_words = []
    for word in re.split(split_regex, header_text):
        match = re.match(encoded_word_regex, word)
        if not match:
            break
        charset, encoding, data = match.groups()
        encoding = encoding.lower()
        if encoding == 'b':
            raw = b64decode(data)
        elif encoding == 'q':
            # header=True: underscores encode spaces inside encoded-words
            raw = quopri.decodestring(data.encode('ascii'), header=True)
        else:
            raise ValueError('not allowed according to wikipedia: "{}"'
                             .format(encoding))
        decoded_words.append(raw.decode(charset))

    match = re.match(line_regex, ''.join(decoded_words))
    if match:
        return match.group(2)
    return failobj
Commit	Line	Data
f49f6323	1	# This Python file uses the following encoding: utf-8
11cbb815 PD	2
	3	# The software in this package is distributed under the GNU General
	4	# Public License version 2 (with a special exception described below).
	5	#
	6	# A copy of GNU General Public License (GPL) is included in this distribution,
	7	# in the file COPYING.GPL.
	8	#
	9	# As a special exception, if other files instantiate templates or use macros
	10	# or inline functions from this file, or you compile this file and link it
	11	# with other works to produce a work based on this file, this file
	12	# does not by itself cause the resulting work to be covered
	13	# by the GNU General Public License.
	14	#
	15	# However the source code for this file must still be made available
	16	# in accordance with section (3) of the GNU General Public License.
	17	#
	18	# This exception does not invalidate any other reasons why a work based
	19	# on this file might be covered by the GNU General Public License.
	20	#
	21	# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
	22
f49f6323	23	"""
fcec8a63	24	Utilities for dealing with email.
2ed7100d CH	25
	26	.. seealso:: :py:mod:`pyi2ncommon.mail_validator`,
	27	:py:mod:`pyi2ncommon.imap_mailbox`
f49f6323 PD	28
f49f6323 PD	29	Copyright: Intra2net AG
f49f6323 PD	30	"""
f49f6323 PD	31
b36398e7	32	from base64 import b64decode
67177844	33	from email.utils import parsedate_to_datetime
1d21262c	34	from email.parser import BytesParser
4b44f515	35	from email import policy
f49f6323	36
67177844 CH	37	# outsourced source, import required for compatiblity
	38	from .imap_mailbox import ImapMailbox # pylint: disable=unused-import
	39	from .mail_validator import * # pylint: disable=unused-import
4965c436	40	from .sysmisc import replace_file_regex
f49f6323	41
67177844	42	log = logging.getLogger('pyi2ncommon.mail_utils')
f49f6323 PD	43
	44
	45	def prep_email_header(email_file, value, regex=None, criterion="envelopeto"):
	46	"""
	47	Replace value in a provided email file.
	48
	49	:param str email_file: file to use for the replacement
	50	:param str value: value to replace the first matched group with
	51	:param regex: regular expression to use when replacing a header value
	52	:type regex: str or None
	53	:param str criterion: criterion to use for replacement, one
	54	of 'envelopeto' or 'received'
	55	:raises: :py:class:`ValueError` if the choice of criterion is invalid
	56
fd562d9b PD	57	..todo:: In some cases this function is reusing arnied wrapper's cnf
fd562d9b PD	58	value preparation but for email headers.
f49f6323 PD	59	"""
	60	if criterion == "envelopeto":
	61	logging.debug("Updating test emails' EnvelopeTo header")
4965c436	62	replace_file_regex(email_file, value, regex=regex)
f49f6323 PD	63	elif criterion == "received":
f49f6323 PD	64	logging.debug("Updating test emails' Received header")
e108b7d4 CH	65	with open(email_file, "r") as file_handle:
e108b7d4 CH	66	email_text = file_handle.read()
f49f6323 PD	67	email_text = re.sub(regex, value, email_text)
f49f6323 PD	68	email_text = re.sub(regex, value, email_text)
e108b7d4 CH	69	with open(email_file, "w") as file_handle:
e108b7d4 CH	70	file_handle.write(email_text)
f49f6323	71	else:
e108b7d4 CH	72	raise ValueError("Invalid header preparation criterion '%s'"
e108b7d4 CH	73	% criterion)
f49f6323 PD	74
f49f6323 PD	75
4b44f515 CH	76	def parse_mail_file(file_name, headers_only=True, attachment_filenames=False,
4b44f515 CH	77	raise_on_defect=False, new_message_type=False):
f49f6323 PD	78	"""
	79	Parse given email file (e.g. a banned message).
	80
1d21262c	81	This is basically a `email.parser.BytesParser().parse(...)` with given
4b44f515 CH	82	`headers_only` and policy selection, that can also handle BSMTP. As an
4b44f515 CH	83	extra bonus, you can just request headers plus the names of attached files.
f49f6323 PD	84
f49f6323 PD	85	Removes the SMTP envelope surrounding the email if present. Only left-over
e108b7d4	86	might be a line with a '.' at end of non-multipart messages if
df036fbe	87	`headers_only` is False.
b359b15c	88
4b44f515	89	:param str file_name: path to the file that contains the email text
b359b15c CH	90	:param bool headers_only: whether to parse only the email headers; set this
	91	to False, e.g. if you want to check for
	92	attachments using message.walk()
	93	:param bool attachment_filenames: if you just want headers and names of
	94	attached files, set `headers_only` and
	95	this to True.
4b44f515 CH	96	:param bool raise_on_defect: whether to raise an error if email parser
	97	encounters a defect (email policy `strict`) or
	98	just add the defect to message's `defect`
	99	attribute
	100	:param bool new_message_type: whether to return the older
	101	:py:class:`email.message.Message` (policy
	102	`compat32`, our default), or the newer
	103	:py:class:`email.message.EmailMessage` type
	104	(policy `default`). Big difference!
b359b15c CH	105	:returns: either msg or 2-tuple `(msg, filenames)` if requested per arg
	106	`attachment_filenames`
	107	:rtype: :py:class:`email.message.Message` or
4b44f515 CH	108	(:py:class:`email.message.Message`, (str)) or
4b44f515 CH	109	one of these two with :py:class:`email.message.EmailMessage`
f49f6323	110	"""
b359b15c CH	111	msg = None
b359b15c CH	112	start_pos = 0
4b44f515 CH	113
	114	if new_message_type:
	115	mail_policy = policy.default
	116	else:
	117	mail_policy = policy.compat32
	118	if raise_on_defect:
	119	mail_policy += policy.strict
	120
1d21262c	121	with open(file_name, 'rb') as read_handle:
f49f6323	122	line = read_handle.readline()
1d21262c	123	if line.startswith(b'EHLO'):
f49f6323	124	# there is a smtp header. skip to its end
1d21262c	125	while line.strip() != b'DATA':
f49f6323 PD	126	line = read_handle.readline()
	127	# the rest is the email plus a trailing '.' (ignored by parser if
	128	# multipart)
	129	else:
	130	read_handle.seek(0) # forget we read the first line already
b359b15c	131	start_pos = read_handle.tell()
4b44f515 CH	132	msg = BytesParser(policy=mail_policy).parse(read_handle,
4b44f515 CH	133	headersonly=headers_only)
b359b15c CH	134
	135	if not attachment_filenames:
	136	return msg
	137
	138	# otherwise need to parse complete message to get attachment file names
	139	if headers_only:
1d21262c	140	with open(file_name, 'rb') as read_handle:
b359b15c	141	read_handle.seek(start_pos)
4b44f515 CH	142	full_msg = BytesParser(policy=mail_policy).parse(read_handle,
4b44f515 CH	143	headersonly=False)
b359b15c CH	144	else:
	145	full_msg = msg
	146	filenames = [get_filename(part) for part in full_msg.walk()]
	147	return msg, tuple(filename for filename in filenames
	148	if filename is not None)
f49f6323 PD	149
f49f6323 PD	150
58414aec CH	151	def parse_mail_date(message):
	152	"""
	153	Parse the 'Date' header of the given message.
	154
	155	Shortcut for :py:func:`email.utils.parsedate_to_datetime`.
	156
	157	This is no longer necessary for newer
	158	:py:class:`email.message.EmailMessage` since the `Date` Header is
	159	automatically parsed to a :py:class:`email.headerregistry.DateHeader`.
	160
	161	:param message: Email message
	162	:type message: :py:class:`email.message.Message`
	163	:returns: datetime from Email "Date" header or None if header not present
	164	:rtype: :py:class:`datetime.datetime` or None
	165	"""
	166	date_str = message.get('Date', '')
	167	if not date_str:
	168	return None
	169	return parsedate_to_datetime(date_str)
	170
	171
f44055b0 CH	172	def get_user_mail_files(user, mailbox='INBOX'):
f44055b0 CH	173	"""
2ed7100d CH	174	Iterate over mails in given folder of given user; yields file names.
	175
	176	Works on local cyrus file system, not on imap server.
f44055b0	177
2ed7100d CH	178	:param str user: Name of user whose mailbox is analyzed
	179	:param str mailbox: name of mailbox to use, INBOX (default) for base
	180	folder; name is modified using :py:func:`cyrus_escape`
f44055b0 CH	181	:returns: nothing; but yields full path to messages on disc
	182	"""
	183	# base folder of user mail
	184	folder = os.path.join('/datastore', 'imap-mails', 'user', user)
	185
2ed7100d	186	# adapt paths like "INBOX/sub/dir" to "sub/dir"
f44055b0 CH	187	subdirs = mailbox.split('/')
	188	if subdirs[0].upper() == 'INBOX':
	189	subdirs = subdirs[1:]
	190	folder = os.path.join(folder,
	191	*(cyrus_escape(subdir) for subdir in subdirs))
	192
	193	for filename in os.listdir(folder):
	194	if not re.match(r'\d+\.', filename):
	195	continue
	196	full_path = os.path.join(folder, filename)
	197	yield full_path
	198
	199
f49f6323 PD	200	def get_user_mail(user, mailbox='INBOX', **kwargs):
f49f6323 PD	201	"""
e108b7d4	202	Iterate over mails in given folder of given user; yields parsed mails.
f49f6323	203
2ed7100d CH	204	:param str user: see :py:func:`get_user_mail_files`
2ed7100d CH	205	:param str mailbox: see :py:func:`get_user_mail_files`
f49f6323 PD	206	:param dict kwargs: all other args are forwarded to
	207	:py:func:`parse_mail_file`
	208	:returns: nothing; but yields 2-tuples (path, email_msg) where first is the
	209	full path to the message on disc, and the latter is the outcome
	210	of :py:func:`parse_mail_file` for that file
	211	"""
f44055b0 CH	212	for full_path in get_user_mail_files(user, mailbox):
f44055b0 CH	213	yield full_path, parse_mail_file(full_path, **kwargs)
f49f6323 PD	214
f49f6323 PD	215
f4dec410 CH	216	def get_message_text(filename, fallback_encoding='iso8859-1',
	217	include_all_text=False):
	218	"""
	219	Extract message text as string from email message.
	220
	221	Intended as complementary addition to get_user_mail, e.g. ::
	222
	223	for filename, msg in get_user_mail(user):
	224	# rough filtering based on headers
	225	if msg['Subject'] != 'Expected Subject':
	226	continue
	227	# get message text for closer inspection
	228	text = get_message_text(filename)
	229	if 'Expected Text' not in text:
	230	continue
	231	...
	232
df036fbe	233	Finds the first part in message that is of type `text/plain` and decodes it
f4dec410	234	using encoding specified in mail or otherwise fallback encoding. If none
df036fbe	235	found takes first part of type `text/*`, or otherwise just the first part.
f4dec410	236
df036fbe	237	If include_all_text is True, all `text/*` parts are included, with `text/plain`
f4dec410 CH	238	being the first.
	239
	240	:param str filename: complete path of message file in filesystem
2ed7100d CH	241	:param str fallback_encoding: Encoding of email text if none is specified
2ed7100d CH	242	in mail.
df036fbe	243	:param bool include_all_text: include all `text/*` parts in returned text
f4dec410 CH	244	:returns: text(s) of message
	245	:rtype: [str] if include_all_text else str
	246	"""
	247	result = []
	248	msg = parse_mail_file(filename, headers_only=False)
	249	for part in msg.walk():
	250	if part.get_content_type() != 'text/plain':
	251	continue
	252	encoding = part.get_content_charset(fallback_encoding)
	253	result.append(part.get_payload(decode=True).decode(encoding))
	254
	255	if result and not include_all_text:
	256	return result[0]
	257
	258	# no text/plain found. Try only "text/":
	259	for part in msg.walk():
	260	cont_type = part.get_content_type()
	261	if cont_type.startswith('text/') and cont_type != 'text/plain':
	262	encoding = part.get_content_charset(fallback_encoding)
	263	result.append(part.get_payload(decode=True).decode(encoding))
	264
	265	if result:
	266	if not include_all_text:
	267	return result[0]
	268	return result
	269
	270	# no "text/" found. Just take first part
	271	while msg.is_multipart():
	272	msg = msg.get_payload(0)
	273
	274	encoding = msg.get_content_charset(fallback_encoding)
	275	if include_all_text:
	276	return [msg.get_payload(decode=True).decode(encoding), ]
	277	return msg.get_payload(decode=True).decode(encoding)
	278
	279
f49f6323 PD	280	def cyrus_escape(user_or_folder, keep_path=False, regex=False):
f49f6323 PD	281	"""
e108b7d4	282	Convert names of users or mailbox folders to cyrus format.
f49f6323 PD	283
	284	quite a hack, just does the following hard-coded replacements:
	285
	286	* . --> ^
	287	* / --> . (except if keep_path is True)
	288	* "u --> &APw- , "o --> &APY- , "a --> &AOQ-
	289	(if need more: this is modified utf-7)
	290	* inbox --> (the empty string)
	291
	292	Would like to use a general modified utf-7-encoder/decoder but python has
7628bc48	293	none builtin (see https://bugs.python.org/issue5305) and an extra lib like
f49f6323 PD	294	https://bitbucket.org/mjs0/imapclient/ would be overkill. After all, we
	295	control the input to this function via params and this is enough umlaut-
	296	testing I think...
	297
	298	:param str user_or_folder: name of the user or folder string to escape
	299	:param bool keep_path: do not replace '/' with '.' so can still use result
	300	as path name
	301	:param bool regex: result is used in grep or other regex, so ^, . and & are
	302	escaped again with a backslash
	303	:returns: escaped user or folder string
	304	:rtype: str
	305
	306	.. seealso:: :py:func:`cyrus_unescape`
	307	"""
	308	temp = user_or_folder.replace('.', '^') \
	309	.replace('ü', '&APw-').replace('ä', '&AOQ-') \
	310	.replace('ö', '&APY-') \
	311	.replace('inbox', '').replace('INBOX', '').replace('Inbox', '')
	312	if not keep_path:
	313	temp = temp.replace('/', '.')
	314	if regex:
	315	return temp.replace('^', r'\^').replace('&', r'\&') \
	316	.replace('.', r'\.').replace('$', r'\$')
2ed7100d	317	return temp
f49f6323 PD	318
	319
	320	def cyrus_unescape(user_or_folder):
	321	"""
	322	Undo effects of :py:func:`cyrus_escape` (but not all of them).
	323
	324	:param str user_or_folder: name of the user or folder string to unescape
	325	:returns: unescaped user or folder string
	326	:rtype: str
	327	"""
	328	if user_or_folder == '':
	329	return 'inbox'
	330	return user_or_folder.replace('.', '/')\
	331	.replace(r'\^', '.').replace('^', '.')
b36398e7 CH	332
	333
	334	def get_filename(message, failobj=None, do_unwrap=True):
	335	"""
e108b7d4	336	Get filename of a message part, even if it is base64-encoded.
b36398e7 CH	337
b36398e7 CH	338	For attachments with base64-encoded file name, the
2ed7100d CH	339	:py:func:`email.message.Message.get_filename()` does not work. This
	340	function tries that first and if it fails tries to interprete the
	341	Content-Disposition of the message part. If all fails, returns `failobj`.
b36398e7 CH	342
	343	Only for ascii filenames: also unwraps file names if they are line-wrapped.
	344	But note that this may remove too much whitespace from the filename if
7628bc48	345	line-wrapping happened in the same position as the filename's whitespace.
b36398e7 CH	346	To get unwrapped version, set param `do_unwrap` to `False`.
	347
	348	See also: https://en.wikipedia.org/wiki/MIME#Encoded-Word
	349
	350	:param message: message part, e.g. from
	351	:py:meth:`email.message.Message.walk`
4b44f515 CH	352	:type message: :py:class:`email.message.Message` or
4b44f515 CH	353	:py:class:`email.message.EmailMessage`
b36398e7 CH	354	:param failobj: object to return in case of failure (defaults to None)
	355	:param bool do_unwrap: undo line-break inserted by mail-creator; may remove
	356	whitespace from file name; only applies to ascii
	357	file names
	358	:returns: either a string or failobj
	359	"""
	360	# try the old way and unwrap
	361	filename = message.get_filename(failobj)
	362
	363	if isinstance(filename, bytes) and not filename.startswith(b'=?') \
	364	and not filename.endswith(b'?='):
	365	filename = filename.decode('utf8')
	366
	367	if isinstance(filename, str):
	368	if do_unwrap:
	369	return re.sub('[\\r\\n]+', '', filename)
	370	return filename
	371
	372	if 'Content-Disposition' not in message:
	373	return failobj
	374
	375	# try parsing content-disposition. e.g.:
	376	# attachment; filename="2018年度公开课计划表.xlsx" -->
	377	# '=?utf-8?b?YXR0YWNobWVudDsgZmlsZW5hbWU9IjIwMTjlubTluqY=?=\r\n =?utf-8?b?'
	378	# '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?='
	379
	380	# This may be a re-implementation of email.utils.collapse_rfc2231_value()
4b44f515	381	# as mentioned in email.message.EmailMessage.get_param()
b36398e7 CH	382
	383	# The form is: "=?charset?encoding?encoded text?="
	384	SPLIT_REGEX = '\r?\n *' # should be CRNL but some files miss the \r
	385	ENCODED_WORD_REGEX = r'\s=\?([^?]+)\?([^?]+)\?(.)\?=\s*$'
	386	LINE_REGEX = r'attachment\s;\sfilename=(")?(.+)\1\s*$'
	387	decoded = []
	388	for word in re.split(SPLIT_REGEX, message['Content-Disposition']):
	389	match = re.match(ENCODED_WORD_REGEX, word)
	390	if not match:
	391	break
	392	charset, encoding, data = match.groups()
	393	if encoding.lower() == 'b':
	394	temp = b64decode(data)
	395	elif encoding.lower() == 'q':
	396	raise NotImplementedError('use quopri.decodestring, handle _')
	397	else:
	398	raise ValueError('not allowed according to wikipedia: "{}"'
	399	.format(encoding))
	400	decoded.append(temp.decode(charset))
	401	decoded = u''.join(decoded)
	402
	403	match = re.match(LINE_REGEX, decoded)
	404	if match:
	405	return match.groups()[1]
	406	return failobj