[pyi2ncommon] / src / mail_utils.py

# This Python file uses the following encoding: utf-8

# The software in this package is distributed under the GNU General
# Public License version 2 (with a special exception described below).
#
# A copy of GNU General Public License (GPL) is included in this distribution,
# in the file COPYING.GPL.
#
# As a special exception, if other files instantiate templates or use macros
# or inline functions from this file, or you compile this file and link it
# with other works to produce a work based on this file, this file
# does not by itself cause the resulting work to be covered
# by the GNU General Public License.
#
# However the source code for this file must still be made available
# in accordance with section (3) of the GNU General Public License.
#
# This exception does not invalidate any other reasons why a work based
# on this file might be covered by the GNU General Public License.
#
# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>

"""

SUMMARY
------------------------------------------------------
Utilities for dealing with email

.. seealso:: :py:mod:`pyi2ncommon.mail_validator`,
             :py:mod:`pyi2ncommon.imap_mailbox`

Copyright: Intra2net AG


INTERFACE
------------------------------------------------------

"""

from base64 import b64decode
from email.utils import parsedate_to_datetime
from email.parser import BytesParser
from email import policy

# outsourced source, import required for compatibility
from .imap_mailbox import ImapMailbox           # pylint: disable=unused-import
from .mail_validator import *                   # pylint: disable=unused-import

# module-level logger of the mail utilities
log = logging.getLogger('pyi2ncommon.mail_utils')


def prep_email_header(email_file, value, regex=None, criterion="envelopeto"):
    """
    Replace value in a provided email file.

    :param str email_file: file to use for the replacement
    :param str value: value to replace the first matched group with; for
                      criterion 'received' it is handed to :py:func:`re.sub`
                      as replacement for every match of `regex`
    :param regex: regular expression to use when replacing a header value;
                  required (must not be None) for criterion 'received'
    :type regex: str or None
    :param str criterion: criterion to use for replacement, one
                          of 'envelopeto' or 'received'
    :raises: :py:class:`ValueError` if the choice of criterion is invalid

    For criterion 'envelopeto' this function is reusing arnied wrapper's cnf
    value preparation but for email headers. For criterion 'received' the file
    is read, modified in memory and written back completely.
    """
    if criterion == "envelopeto":
        logging.debug("Updating test emails' EnvelopeTo header")
        arnied_wrapper.prep_cnf_value(email_file, value, regex=regex)
    elif criterion == "received":
        logging.debug("Updating test emails' Received header")
        with open(email_file, "r") as file_handle:
            email_text = file_handle.read()
        # substitute exactly once: a second pass would replace again whenever
        # the new value itself still matches the regex
        email_text = re.sub(regex, value, email_text)
        with open(email_file, "w") as file_handle:
            file_handle.write(email_text)
    else:
        raise ValueError("Invalid header preparation criterion '%s'"
                         % criterion)


def create_users(usernames, config_file, params):
    """
    Set up new cyrus users based on a template user configuration file.

    Works in three rounds: first make sure none of the users has a mail folder
    yet, then apply the configuration once per user, finally verify that a
    mail folder has appeared for each of them.

    :param usernames: usernames of the users to create
    :type usernames: [str]
    :param str config_file: absolute path to the template config file to use
                            for each user configuration
    :param params: parameters handed on to the arnied wrapper; key
                   "cyrus_user_path" (optional) is read from it, keys "user"
                   and "user_fullname" are overwritten for every user
    :type params: {str, str}
    :raises: :py:class:`RuntimeError` if the user exists already or cannot be
              created
    """
    def has_mail_folder(name):
        """Tell whether a folder for `name` exists below the cyrus base."""
        # a "." in the user name appears as "^" in the folder name
        folder_name = name.replace(".", "^")
        return os.path.exists(os.path.join(user_base, folder_name))

    log.info("Creating new cyrus users %s", ", ".join(usernames))
    user_base = params.get("cyrus_user_path",
                           "/datastore/imap-mails/user/")

    # round 1: refuse to touch anything if one of the users is there already
    for name in usernames:
        if has_mail_folder(name):
            raise RuntimeError("The user %s was already created" % name)

    # round 2: configure one user after the other
    for name in usernames:
        params["user"] = '%i: "%s"' % (-1, name)
        params["user_fullname"] = name
        params_regex = {"user": r'%s,(-?\d+: ".*")'}
        arnied_wrapper.set_cnf_semidynamic([config_file],
                                           params, params_regex)

    # round 3: every user must have a mail folder by now
    for name in usernames:
        if has_mail_folder(name):
            log.info("Added new user %s", name)
        else:
            raise RuntimeError("The user %s could not be created" % name)
    log.info("%s users successfully created!", len(usernames))


def parse_mail_file(file_name, headers_only=True, attachment_filenames=False,
                    raise_on_defect=False, new_message_type=False):
    """
    Parse given email file (e.g. a banned message).

    This is basically a `email.parser.BytesParser().parse(...)` with given
    `headers_only` and policy selection, that can also handle BSMTP. As an
    extra bonus, you can just request headers plus the names of attached files.

    Removes the SMTP envelope surrounding the email if present. Only left-over
    might be a line with a '.' at end of non-multipart messages if
    `headers_only` is False.

    An envelope is assumed if the file starts with `EHLO`; everything up to
    and including the `DATA` line is skipped then. If the file ends before a
    `DATA` line is found, the complete file is parsed as the email instead.

    :param str file_name: path to the file that contains the email text
    :param bool headers_only: whether to parse only the email headers; set this
                              to False, e.g. if you want to check for
                              attachments using message.walk()
    :param bool attachment_filenames: if you just want headers and names of
                                      attached files, set `headers_only` and
                                      this to True.
    :param bool raise_on_defect: whether to raise an error if email parser
                                 encounters a defect (email policy `strict`) or
                                 just add the defect to message's `defect`
                                 attribute
    :param bool new_message_type: whether to return the older
                                  :py:class:`email.message.Message` (policy
                                  `compat32`, our default), or the newer
                                  :py:class:`email.message.EmailMessage` type
                                  (policy `default`). Big difference!
    :returns: either msg or 2-tuple `(msg, filenames)` if requested per arg
              `attachment_filenames`
    :rtype: :py:class:`email.message.Message` or
             (:py:class:`email.message.Message`, (str)) or
             one of these two with :py:class:`email.message.EmailMessage`
    """
    msg = None
    start_pos = 0

    if new_message_type:
        mail_policy = policy.default
    else:
        mail_policy = policy.compat32
    if raise_on_defect:
        mail_policy += policy.strict

    with open(file_name, 'rb') as read_handle:
        line = read_handle.readline()
        if line.startswith(b'EHLO'):
            # there is a smtp header. skip to its end; readline() returns b''
            # at end of file, so stop there instead of looping forever
            while line and line.strip() != b'DATA':
                line = read_handle.readline()
            if not line:
                # no DATA line at all: not a complete envelope, so treat the
                # whole file as the email
                read_handle.seek(0)
            # the rest is the email plus a trailing '.' (ignored by parser if
            # multipart)
        else:
            read_handle.seek(0)  # forget we read the first line already
        # remember where the email starts in case it must be parsed again
        start_pos = read_handle.tell()
        msg = BytesParser(policy=mail_policy).parse(read_handle,
                                                    headersonly=headers_only)

    if not attachment_filenames:
        return msg

    # otherwise need to parse complete message to get attachment file names
    if headers_only:
        with open(file_name, 'rb') as read_handle:
            read_handle.seek(start_pos)
            full_msg = BytesParser(policy=mail_policy).parse(read_handle,
                                                             headersonly=False)
    else:
        full_msg = msg
    filenames = [get_filename(part) for part in full_msg.walk()]
    return msg, tuple(filename for filename in filenames
                      if filename is not None)


def parse_mail_date(message):
    """
    Convert the 'Date' header of the given message to a datetime.

    Thin wrapper around :py:func:`email.utils.parsedate_to_datetime` that
    copes with a missing header.

    Not needed for the newer :py:class:`email.message.EmailMessage`, whose
    `Date` header is automatically parsed to a
    :py:class:`email.headerregistry.DateHeader`.

    :param message: Email message
    :type message: :py:class:`email.message.Message`
    :returns: datetime from Email "Date" header or None if header not present
              (or empty)
    :rtype: :py:class:`datetime.datetime` or None
    """
    raw_date = message.get('Date', '')
    return parsedate_to_datetime(raw_date) if raw_date else None


def get_user_mail_files(user, mailbox='INBOX'):
    """
    Yield the file names of all mails in the given folder of the given user.

    Works on local cyrus file system, not on imap server.

    :param str user: Name of user whose mailbox is analyzed
    :param str mailbox: name of mailbox to use, INBOX (default) for base
                        folder; each path component is modified using
                        :py:func:`cyrus_escape`
    :returns: nothing; but yields full path to messages on disc
    """
    # a leading "INBOX" is just the user's base folder, so
    # "INBOX/sub/dir" and "sub/dir" name the same place
    components = mailbox.split('/')
    if components[0].upper() == 'INBOX':
        del components[0]
    escaped = [cyrus_escape(component) for component in components]

    folder = os.path.join('/datastore', 'imap-mails', 'user', user, *escaped)

    # only entries named "<digits>.<anything>" are reported
    for entry in os.listdir(folder):
        if re.match(r'\d+\.', entry):
            yield os.path.join(folder, entry)


def get_user_mail(user, mailbox='INBOX', **kwargs):
    """
    Yield the parsed mails found in the given folder of the given user.

    Each file reported by :py:func:`get_user_mail_files` is parsed only when
    the corresponding item is requested from the generator.

    :param str user: see :py:func:`get_user_mail_files`
    :param str mailbox: see :py:func:`get_user_mail_files`
    :param dict kwargs: all other args are forwarded to
                        :py:func:`parse_mail_file`
    :returns: nothing; but yields 2-tuples (path, email_msg) where first is the
              full path to the message on disc, and the latter is the outcome
              of :py:func:`parse_mail_file` for that file
    """
    mail_files = get_user_mail_files(user, mailbox)
    yield from ((mail_file, parse_mail_file(mail_file, **kwargs))
                for mail_file in mail_files)


def get_message_text(filename, fallback_encoding='iso8859-1',
                     include_all_text=False):
    """
    Return the text of an email message file as string(s).

    Meant to be combined with get_user_mail, e.g. ::

        for filename, msg in get_user_mail(user):
            # rough filtering based on headers
            if msg['Subject'] != 'Expected Subject':
                continue
            # get message text for closer inspection
            text = get_message_text(filename)
            if 'Expected Text' not in text:
                continue
            ...

    The message is parsed completely and searched in this order:

    1. parts of type text/plain,
    2. parts of any other type "text/*",
    3. the first non-multipart part, whatever its type.

    Each part is decoded using the charset given in the mail or, if there is
    none, the fallback encoding. Without `include_all_text`, the first hit is
    returned. With `include_all_text`, all text/plain parts followed by all
    other "text/*" parts are returned; step 3 then only applies if neither
    exists.

    :param str filename: complete path of message file in filesystem
    :param str fallback_encoding: Encoding of email text if none is specified
                                  in mail.
    :param bool include_all_text: include all "text/*" parts in returned text
    :returns: text(s) of message
    :rtype: [str] if include_all_text else str
    """
    def text_of(part):
        """Decode payload of a single message part to str."""
        charset = part.get_content_charset(fallback_encoding)
        return part.get_payload(decode=True).decode(charset)

    msg = parse_mail_file(filename, headers_only=False)

    # step 1: text/plain
    texts = [text_of(part) for part in msg.walk()
             if part.get_content_type() == 'text/plain']
    if texts and not include_all_text:
        return texts[0]

    # step 2: all other "text/*", appended behind the text/plain parts
    for part in msg.walk():
        cont_type = part.get_content_type()
        if cont_type.startswith('text/') and cont_type != 'text/plain':
            texts.append(text_of(part))
    if texts:
        return texts if include_all_text else texts[0]

    # step 3: no text at all, descend to the very first leaf part
    first_part = msg
    while first_part.is_multipart():
        first_part = first_part.get_payload(0)
    only_text = text_of(first_part)
    return [only_text, ] if include_all_text else only_text


def cyrus_escape(user_or_folder, keep_path=False, regex=False):
    """
    Translate a user name or mailbox folder name to its cyrus form.

    This is a hack consisting of a fixed list of replacements that are applied
    one after the other:

    * . --> ^
    * "u --> &APw-  ,  "a --> &AOQ-  ,  "o --> &APY-
      (lower-case umlauts only; if need more: this is modified utf-7)
    * inbox, INBOX, Inbox -->   (the empty string)
    * / --> .  (except if keep_path is True)

    A general modified utf-7-encoder/decoder would be nicer but python has
    none builtin (see https://bugs.python.org/issue5305) and an extra lib like
    https://bitbucket.org/mjs0/imapclient/ would be overkill. The input to this
    function is controlled via params, so this is considered enough.

    :param str user_or_folder: name of the user or folder string to escape
    :param bool keep_path: do not replace '/' with '.' so can still use result
                           as path name
    :param bool regex: result is used in grep or other regex, so ^, &, . and $
                       are escaped again with a backslash
    :returns: escaped user or folder string
    :rtype: str

    .. seealso:: :py:func:`cyrus_unescape`
    """
    # order matters: "." must become "^" before "/" becomes "."
    replacements = [
        ('.', '^'),
        ('ü', '&APw-'),
        ('ä', '&AOQ-'),
        ('ö', '&APY-'),
        ('inbox', ''),
        ('INBOX', ''),
        ('Inbox', ''),
    ]
    if not keep_path:
        replacements.append(('/', '.'))
    if regex:
        replacements.extend([
            ('^', r'\^'),
            ('&', r'\&'),
            ('.', r'\.'),
            ('$', r'\$'),
        ])

    escaped = user_or_folder
    for old, new in replacements:
        escaped = escaped.replace(old, new)
    return escaped


def cyrus_unescape(user_or_folder):
    """
    Revert :py:func:`cyrus_escape` as far as possible (not completely).

    The empty string becomes 'inbox'. Otherwise '.' is turned into '/' and
    afterwards '^' (with or without a preceding backslash) into '.'.

    :param str user_or_folder: name of the user or folder string to unescape
    :returns: unescaped user or folder string
    :rtype: str
    """
    if user_or_folder != '':
        unescaped = user_or_folder.replace('.', '/')
        # backslash-escaped carets first, else the backslash would remain
        for caret in (r'\^', '^'):
            unescaped = unescaped.replace(caret, '.')
        return unescaped
    return 'inbox'


def get_filename(message, failobj=None, do_unwrap=True):
    """
    Get filename of a message part, even if it is an encoded word.

    For attachments whose complete Content-Disposition is given as MIME
    encoded word(s), :py:func:`email.message.Message.get_filename()` does not
    work. This function tries that first and if it fails tries to interprete
    the Content-Disposition of the message part: its encoded words (base64
    "B" as well as quoted-printable "Q" encoding) are decoded, joined and
    searched for `attachment; filename=...`. If all fails, returns `failobj`.

    Only for ascii filenames: also unwraps file names if they are line-wrapped.
    But note that this may remove too much whitespace from the filename if
    line-wrapping happened in the same position as the filename's whitespace.
    To get the file name exactly as found (not unwrapped), set param
    `do_unwrap` to `False`.

    See also: https://en.wikipedia.org/wiki/MIME#Encoded-Word

    :param message: message part, e.g. from
                    :py:meth:`email.message.Message.walk`
    :type message: :py:class:`email.message.Message` or
                   :py:class:`email.message.EmailMessage`
    :param failobj: object to return in case of failure (defaults to None)
    :param bool do_unwrap: undo line-break inserted by mail-creator; may remove
                           whitespace from file name; only applies to ascii
                           file names
    :returns: either a string or failobj
    :raises: :py:class:`ValueError` if an encoded word uses an encoding other
             than "B" or "Q"
    """
    # only needed here, so imported locally
    import quopri

    # try the old way and unwrap
    filename = message.get_filename(failobj)

    if isinstance(filename, bytes) and not filename.startswith(b'=?') \
            and not filename.endswith(b'?='):
        filename = filename.decode('utf8')

    if isinstance(filename, str):
        if do_unwrap:
            return re.sub('[\\r\\n]+', '', filename)
        return filename

    if 'Content-Disposition' not in message:
        return failobj

    # try parsing content-disposition. e.g.:
    # attachment; filename="2018年度公开课计划表.xlsx"   -->
    # '=?utf-8?b?YXR0YWNobWVudDsgZmlsZW5hbWU9IjIwMTjlubTluqY=?=\r\n =?utf-8?b?'
    # '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?='

    # This may be a re-implementation of email.utils.collapse_rfc2231_value()
    # as mentioned in email.message.EmailMessage.get_param()

    # The form is: "=?charset?encoding?encoded text?="
    SPLIT_REGEX = '\r?\n *'    # should be CRNL but some files miss the \r
    ENCODED_WORD_REGEX = r'\s*=\?([^?]+)\?([^?]+)\?(.*)\?=\s*$'
    LINE_REGEX = r'attachment\s*;\s*filename=(")?(.+)\1\s*$'
    decoded = []
    for word in re.split(SPLIT_REGEX, message['Content-Disposition']):
        match = re.match(ENCODED_WORD_REGEX, word)
        if not match:
            # first line that is no encoded word ends the encoded part
            break
        charset, encoding, data = match.groups()
        if encoding.lower() == 'b':
            temp = b64decode(data)
        elif encoding.lower() == 'q':
            # quoted-printable variant for headers: "_" stands for a space
            temp = quopri.decodestring(data.encode('ascii'), header=True)
        else:
            raise ValueError('not allowed according to wikipedia: "{}"'
                             .format(encoding))
        decoded.append(temp.decode(charset))
    decoded = u''.join(decoded)

    # group 1 is the optional opening quote, group 2 the file name itself
    match = re.match(LINE_REGEX, decoded)
    if match:
        return match.groups()[1]
    return failobj
Commit	Line	Data
f49f6323	1	# This Python file uses the following encoding: utf-8
11cbb815 PD	2
	3	# The software in this package is distributed under the GNU General
	4	# Public License version 2 (with a special exception described below).
	5	#
	6	# A copy of GNU General Public License (GPL) is included in this distribution,
	7	# in the file COPYING.GPL.
	8	#
	9	# As a special exception, if other files instantiate templates or use macros
	10	# or inline functions from this file, or you compile this file and link it
	11	# with other works to produce a work based on this file, this file
	12	# does not by itself cause the resulting work to be covered
	13	# by the GNU General Public License.
	14	#
	15	# However the source code for this file must still be made available
	16	# in accordance with section (3) of the GNU General Public License.
	17	#
	18	# This exception does not invalidate any other reasons why a work based
	19	# on this file might be covered by the GNU General Public License.
	20	#
	21	# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
	22
f49f6323 PD	23	"""
	24
	25	SUMMARY
	26	------------------------------------------------------
2ed7100d CH	27	Utilities for dealing with email
	28
	29	.. seealso:: :py:mod:`pyi2ncommon.mail_validator`,
	30	:py:mod:`pyi2ncommon.imap_mailbox`
f49f6323 PD	31
	32	Copyright: Intra2net AG
	33
	34
	35	INTERFACE
	36	------------------------------------------------------
	37
	38	"""
	39
b36398e7	40	from base64 import b64decode
67177844	41	from email.utils import parsedate_to_datetime
1d21262c	42	from email.parser import BytesParser
4b44f515	43	from email import policy
f49f6323	44
67177844 CH	45	# outsourced source, import required for compatiblity
	46	from .imap_mailbox import ImapMailbox # pylint: disable=unused-import
	47	from .mail_validator import * # pylint: disable=unused-import
f49f6323	48
67177844	49	log = logging.getLogger('pyi2ncommon.mail_utils')
f49f6323 PD	50
	51
	52	def prep_email_header(email_file, value, regex=None, criterion="envelopeto"):
	53	"""
	54	Replace value in a provided email file.
	55
	56	:param str email_file: file to use for the replacement
	57	:param str value: value to replace the first matched group with
	58	:param regex: regular expression to use when replacing a header value
	59	:type regex: str or None
	60	:param str criterion: criterion to use for replacement, one
	61	of 'envelopeto' or 'received'
	62	:raises: :py:class:`ValueError` if the choice of criterion is invalid
	63
	64	In some cases this function is reusing arnied wrapper's cnf value
	65	preparation but for email headers.
	66	"""
	67	if criterion == "envelopeto":
	68	logging.debug("Updating test emails' EnvelopeTo header")
	69	arnied_wrapper.prep_cnf_value(email_file, value, regex=regex)
	70	elif criterion == "received":
	71	logging.debug("Updating test emails' Received header")
e108b7d4 CH	72	with open(email_file, "r") as file_handle:
e108b7d4 CH	73	email_text = file_handle.read()
f49f6323 PD	74	email_text = re.sub(regex, value, email_text)
f49f6323 PD	75	email_text = re.sub(regex, value, email_text)
e108b7d4 CH	76	with open(email_file, "w") as file_handle:
e108b7d4 CH	77	file_handle.write(email_text)
f49f6323	78	else:
e108b7d4 CH	79	raise ValueError("Invalid header preparation criterion '%s'"
e108b7d4 CH	80	% criterion)
f49f6323 PD	81
	82
	83	def create_users(usernames, config_file, params):
	84	"""
	85	Create cyrus users from an absolute path to a user configuration file.
	86
	87	:param usernames: usernames of the created users
	88	:type usernames: [str]
e108b7d4 CH	89	:param str config_file: template config file to use for each user
e108b7d4 CH	90	configuration
f49f6323 PD	91	:param params: template config file to use for each user configuration
f49f6323 PD	92	:type params: {str, str}
e108b7d4 CH	93	:raises: :py:class:`RuntimeError` if the user exists already or cannot be
e108b7d4 CH	94	created
f49f6323 PD	95	"""
f49f6323 PD	96	log.info("Creating new cyrus users %s", ", ".join(usernames))
e108b7d4 CH	97	cyrus_user_path = params.get("cyrus_user_path",
e108b7d4 CH	98	"/datastore/imap-mails/user/")
f49f6323 PD	99
	100	# check for existence round
	101	for username in usernames:
	102	if os.path.exists(os.path.join(cyrus_user_path,
	103	username.replace(".", "^"))):
	104	raise RuntimeError("The user %s was already created" % username)
	105
	106	for username in usernames:
	107	params["user"] = '%i: "%s"' % (-1, username)
	108	params["user_fullname"] = username
e108b7d4	109	params_regex = {"user": r'%s,(-?\d+: ".*")'}
f49f6323 PD	110	arnied_wrapper.set_cnf_semidynamic([config_file],
	111	params, params_regex)
	112
	113	for username in usernames:
e108b7d4 CH	114	if not os.path.exists(os.path.join(cyrus_user_path,
e108b7d4 CH	115	username.replace(".", "^"))):
f49f6323 PD	116	raise RuntimeError("The user %s could not be created" % username)
	117	else:
	118	log.info("Added new user %s", username)
	119	log.info("%s users successfully created!", len(usernames))
	120
	121
4b44f515 CH	122	def parse_mail_file(file_name, headers_only=True, attachment_filenames=False,
4b44f515 CH	123	raise_on_defect=False, new_message_type=False):
f49f6323 PD	124	"""
	125	Parse given email file (e.g. a banned message).
	126
1d21262c	127	This is basically a `email.parser.BytesParser().parse(...)` with given
4b44f515 CH	128	`headers_only` and policy selection, that can also handle BSMTP. As an
4b44f515 CH	129	extra bonus, you can just request headers plus the names of attached files.
f49f6323 PD	130
f49f6323 PD	131	Removes the SMTP envelope surrounding the email if present. Only left-over
e108b7d4 CH	132	might be a line with a '.' at end of non-multipart messages if
e108b7d4 CH	133	`headers_only` is False.
b359b15c	134
4b44f515	135	:param str file_name: path to the file that contains the email text
b359b15c CH	136	:param bool headers_only: whether to parse only the email headers; set this
	137	to False, e.g. if you want to check for
	138	attachments using message.walk()
	139	:param bool attachment_filenames: if you just want headers and names of
	140	attached files, set `headers_only` and
	141	this to True.
4b44f515 CH	142	:param bool raise_on_defect: whether to raise an error if email parser
	143	encounters a defect (email policy `strict`) or
	144	just add the defect to message's `defect`
	145	attribute
	146	:param bool new_message_type: whether to return the older
	147	:py:class:`email.message.Message` (policy
	148	`compat32`, our default), or the newer
	149	:py:class:`email.message.EmailMessage` type
	150	(policy `default`). Big difference!
b359b15c CH	151	:returns: either msg or 2-tuple `(msg, filenames)` if requested per arg
	152	`attachment_filenames`
	153	:rtype: :py:class:`email.message.Message` or
4b44f515 CH	154	(:py:class:`email.message.Message`, (str)) or
4b44f515 CH	155	one of these two with :py:class:`email.message.EmailMessage`
f49f6323	156	"""
b359b15c CH	157	msg = None
b359b15c CH	158	start_pos = 0
4b44f515 CH	159
	160	if new_message_type:
	161	mail_policy = policy.default
	162	else:
	163	mail_policy = policy.compat32
	164	if raise_on_defect:
	165	mail_policy += policy.strict
	166
1d21262c	167	with open(file_name, 'rb') as read_handle:
f49f6323	168	line = read_handle.readline()
1d21262c	169	if line.startswith(b'EHLO'):
f49f6323	170	# there is a smtp header. skip to its end
1d21262c	171	while line.strip() != b'DATA':
f49f6323 PD	172	line = read_handle.readline()
	173	# the rest is the email plus a trailing '.' (ignored by parser if
	174	# multipart)
	175	else:
	176	read_handle.seek(0) # forget we read the first line already
b359b15c	177	start_pos = read_handle.tell()
4b44f515 CH	178	msg = BytesParser(policy=mail_policy).parse(read_handle,
4b44f515 CH	179	headersonly=headers_only)
b359b15c CH	180
	181	if not attachment_filenames:
	182	return msg
	183
	184	# otherwise need to parse complete message to get attachment file names
	185	if headers_only:
1d21262c	186	with open(file_name, 'rb') as read_handle:
b359b15c	187	read_handle.seek(start_pos)
4b44f515 CH	188	full_msg = BytesParser(policy=mail_policy).parse(read_handle,
4b44f515 CH	189	headersonly=False)
b359b15c CH	190	else:
	191	full_msg = msg
	192	filenames = [get_filename(part) for part in full_msg.walk()]
	193	return msg, tuple(filename for filename in filenames
	194	if filename is not None)
f49f6323 PD	195
f49f6323 PD	196
58414aec CH	197	def parse_mail_date(message):
	198	"""
	199	Parse the 'Date' header of the given message.
	200
	201	Shortcut for :py:func:`email.utils.parsedate_to_datetime`.
	202
	203	This is no longer necessary for newer
	204	:py:class:`email.message.EmailMessage` since the `Date` Header is
	205	automatically parsed to a :py:class:`email.headerregistry.DateHeader`.
	206
	207	:param message: Email message
	208	:type message: :py:class:`email.message.Message`
	209	:returns: datetime from Email "Date" header or None if header not present
	210	:rtype: :py:class:`datetime.datetime` or None
	211	"""
	212	date_str = message.get('Date', '')
	213	if not date_str:
	214	return None
	215	return parsedate_to_datetime(date_str)
	216
	217
f44055b0 CH	218	def get_user_mail_files(user, mailbox='INBOX'):
f44055b0 CH	219	"""
2ed7100d CH	220	Iterate over mails in given folder of given user; yields file names.
	221
	222	Works on local cyrus file system, not on imap server.
f44055b0	223
2ed7100d CH	224	:param str user: Name of user whose mailbox is analyzed
	225	:param str mailbox: name of mailbox to use, INBOX (default) for base
	226	folder; name is modified using :py:func:`cyrus_escape`
f44055b0 CH	227	:returns: nothing; but yields full path to messages on disc
	228	"""
	229	# base folder of user mail
	230	folder = os.path.join('/datastore', 'imap-mails', 'user', user)
	231
2ed7100d	232	# adapt paths like "INBOX/sub/dir" to "sub/dir"
f44055b0 CH	233	subdirs = mailbox.split('/')
	234	if subdirs[0].upper() == 'INBOX':
	235	subdirs = subdirs[1:]
	236	folder = os.path.join(folder,
	237	*(cyrus_escape(subdir) for subdir in subdirs))
	238
	239	for filename in os.listdir(folder):
	240	if not re.match(r'\d+\.', filename):
	241	continue
	242	full_path = os.path.join(folder, filename)
	243	yield full_path
	244
	245
f49f6323 PD	246	def get_user_mail(user, mailbox='INBOX', **kwargs):
f49f6323 PD	247	"""
e108b7d4	248	Iterate over mails in given folder of given user; yields parsed mails.
f49f6323	249
2ed7100d CH	250	:param str user: see :py:func:`get_user_mail_files`
2ed7100d CH	251	:param str mailbox: see :py:func:`get_user_mail_files`
f49f6323 PD	252	:param dict kwargs: all other args are forwarded to
	253	:py:func:`parse_mail_file`
	254	:returns: nothing; but yields 2-tuples (path, email_msg) where first is the
	255	full path to the message on disc, and the latter is the outcome
	256	of :py:func:`parse_mail_file` for that file
	257	"""
f44055b0 CH	258	for full_path in get_user_mail_files(user, mailbox):
f44055b0 CH	259	yield full_path, parse_mail_file(full_path, **kwargs)
f49f6323 PD	260
f49f6323 PD	261
f4dec410 CH	262	def get_message_text(filename, fallback_encoding='iso8859-1',
	263	include_all_text=False):
	264	"""
	265	Extract message text as string from email message.
	266
	267	Intended as complementary addition to get_user_mail, e.g. ::
	268
	269	for filename, msg in get_user_mail(user):
	270	# rough filtering based on headers
	271	if msg['Subject'] != 'Expected Subject':
	272	continue
	273	# get message text for closer inspection
	274	text = get_message_text(filename)
	275	if 'Expected Text' not in text:
	276	continue
	277	...
	278
	279	Finds the first part in message that is of type text/plain and decodes it
	280	using encoding specified in mail or otherwise fallback encoding. If none
	281	found takes first part of type "text/*", or otherwise just the first part.
	282
	283	If include_all_text is True, all text/* parts are included, with text/plain
	284	being the first.
	285
	286	:param str filename: complete path of message file in filesystem
2ed7100d CH	287	:param str fallback_encoding: Encoding of email text if none is specified
2ed7100d CH	288	in mail.
f4dec410 CH	289	:param bool include_all_text: include all "text/*" parts in returned text
	290	:returns: text(s) of message
	291	:rtype: [str] if include_all_text else str
	292	"""
	293	result = []
	294	msg = parse_mail_file(filename, headers_only=False)
	295	for part in msg.walk():
	296	if part.get_content_type() != 'text/plain':
	297	continue
	298	encoding = part.get_content_charset(fallback_encoding)
	299	result.append(part.get_payload(decode=True).decode(encoding))
	300
	301	if result and not include_all_text:
	302	return result[0]
	303
	304	# no text/plain found. Try only "text/":
	305	for part in msg.walk():
	306	cont_type = part.get_content_type()
	307	if cont_type.startswith('text/') and cont_type != 'text/plain':
	308	encoding = part.get_content_charset(fallback_encoding)
	309	result.append(part.get_payload(decode=True).decode(encoding))
	310
	311	if result:
	312	if not include_all_text:
	313	return result[0]
	314	return result
	315
	316	# no "text/" found. Just take first part
	317	while msg.is_multipart():
	318	msg = msg.get_payload(0)
	319
	320	encoding = msg.get_content_charset(fallback_encoding)
	321	if include_all_text:
	322	return [msg.get_payload(decode=True).decode(encoding), ]
	323	return msg.get_payload(decode=True).decode(encoding)
	324
	325
def cyrus_escape(user_or_folder, keep_path=False, regex=False):
    """
    Translate a user or mailbox folder name into its cyrus spelling.

    This is intentionally a small hack and only performs these fixed
    substitutions:

    * ``.`` becomes ``^``
    * ``/`` becomes ``.`` (skipped if `keep_path` is True)
    * the lower-case umlauts become their modified-utf-7 form:
      "u --> ``&APw-``, "o --> ``&APY-``, "a --> ``&AOQ-``
    * every occurrence of ``inbox``, ``INBOX`` or ``Inbox`` is removed

    Python has no builtin modified-utf-7 codec
    (see https://bugs.python.org/issue5305) and pulling in an extra library
    such as https://bitbucket.org/mjs0/imapclient/ would be overkill, since
    the input of this function is controlled by the callers.

    :param str user_or_folder: name of the user or folder string to escape
    :param bool keep_path: do not replace '/' with '.' so the result can still
                           be used as path name
    :param bool regex: result is used in grep or another regex, so ``^``,
                       ``.``, ``&`` and ``$`` are additionally prefixed with
                       a backslash
    :returns: escaped user or folder string
    :rtype: str

    .. seealso:: :py:func:`cyrus_unescape`
    """
    # All single-character substitutions are done in one pass. This equals
    # replacing them one after another since no replacement text contains a
    # character that is itself a key of the table.
    substitutions = {'.': '^', 'ü': '&APw-', 'ä': '&AOQ-', 'ö': '&APY-'}
    if not keep_path:
        substitutions['/'] = '.'
    escaped = user_or_folder.translate(str.maketrans(substitutions))

    # the three spellings are removed one after another, in this order
    for inbox_spelling in ('inbox', 'INBOX', 'Inbox'):
        escaped = escaped.replace(inbox_spelling, '')

    if not regex:
        return escaped

    # protect regex meta characters with a backslash
    regex_specials = {'^': r'\^', '&': r'\&', '.': r'\.', '$': r'\$'}
    return escaped.translate(str.maketrans(regex_specials))
f49f6323 PD	364
	365
	366	def cyrus_unescape(user_or_folder):
	367	"""
	368	Undo effects of :py:func:`cyrus_escape` (but not all of them).
	369
	370	:param str user_or_folder: name of the user or folder string to unescape
	371	:returns: unescaped user or folder string
	372	:rtype: str
	373	"""
	374	if user_or_folder == '':
	375	return 'inbox'
	376	return user_or_folder.replace('.', '/')\
	377	.replace(r'\^', '.').replace('^', '.')
b36398e7 CH	378
	379
	380	def get_filename(message, failobj=None, do_unwrap=True):
	381	"""
e108b7d4	382	Get filename of a message part, even if it is base64-encoded.
b36398e7 CH	383
b36398e7 CH	384	For attachments with base64-encoded file name, the
2ed7100d CH	385	:py:func:`email.message.Message.get_filename()` does not work. This
	386	function tries that first and if it fails tries to interprete the
	387	Content-Disposition of the message part. If all fails, returns `failobj`.
b36398e7 CH	388
	389	Only for ascii filenames: also unwraps file names if they are line-wrapped.
	390	But note that this may remove too much whitespace from the filename if
7628bc48	391	line-wrapping happened in the same position as the filename's whitespace.
b36398e7 CH	392	To get unwrapped version, set param `do_unwrap` to `False`.
	393
	394	See also: https://en.wikipedia.org/wiki/MIME#Encoded-Word
	395
	396	:param message: message part, e.g. from
	397	:py:meth:`email.message.Message.walk`
4b44f515 CH	398	:type message: :py:class:`email.message.Message` or
4b44f515 CH	399	:py:class:`email.message.EmailMessage`
b36398e7 CH	400	:param failobj: object to return in case of failure (defaults to None)
	401	:param bool do_unwrap: undo line-break inserted by mail-creator; may remove
	402	whitespace from file name; only applies to ascii
	403	file names
	404	:returns: either a string or failobj
	405	"""
	406	# try the old way and unwrap
	407	filename = message.get_filename(failobj)
	408
	409	if isinstance(filename, bytes) and not filename.startswith(b'=?') \
	410	and not filename.endswith(b'?='):
	411	filename = filename.decode('utf8')
	412
	413	if isinstance(filename, str):
	414	if do_unwrap:
	415	return re.sub('[\\r\\n]+', '', filename)
	416	return filename
	417
	418	if 'Content-Disposition' not in message:
	419	return failobj
	420
	421	# try parsing content-disposition. e.g.:
	422	# attachment; filename="2018年度公开课计划表.xlsx" -->
	423	# '=?utf-8?b?YXR0YWNobWVudDsgZmlsZW5hbWU9IjIwMTjlubTluqY=?=\r\n =?utf-8?b?'
	424	# '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?='
	425
	426	# This may be a re-implementation of email.utils.collapse_rfc2231_value()
4b44f515	427	# as mentioned in email.message.EmailMessage.get_param()
b36398e7 CH	428
	429	# The form is: "=?charset?encoding?encoded text?="
	430	SPLIT_REGEX = '\r?\n *' # should be CRNL but some files miss the \r
	431	ENCODED_WORD_REGEX = r'\s=\?([^?]+)\?([^?]+)\?(.)\?=\s*$'
	432	LINE_REGEX = r'attachment\s;\sfilename=(")?(.+)\1\s*$'
	433	decoded = []
	434	for word in re.split(SPLIT_REGEX, message['Content-Disposition']):
	435	match = re.match(ENCODED_WORD_REGEX, word)
	436	if not match:
	437	break
	438	charset, encoding, data = match.groups()
	439	if encoding.lower() == 'b':
	440	temp = b64decode(data)
	441	elif encoding.lower() == 'q':
	442	raise NotImplementedError('use quopri.decodestring, handle _')
	443	else:
	444	raise ValueError('not allowed according to wikipedia: "{}"'
	445	.format(encoding))
	446	decoded.append(temp.decode(charset))
	447	decoded = u''.join(decoded)
	448
	449	match = re.match(LINE_REGEX, decoded)
	450	if match:
	451	return match.groups()[1]
	452	return failobj