[pyi2ncommon] / src / mail_utils.py

# This Python file uses the following encoding: utf-8

# The software in this package is distributed under the GNU General
# Public License version 2 (with a special exception described below).
#
# A copy of GNU General Public License (GPL) is included in this distribution,
# in the file COPYING.GPL.
#
# As a special exception, if other files instantiate templates or use macros
# or inline functions from this file, or you compile this file and link it
# with other works to produce a work based on this file, this file
# does not by itself cause the resulting work to be covered
# by the GNU General Public License.
#
# However the source code for this file must still be made available
# in accordance with section (3) of the GNU General Public License.
#
# This exception does not invalidate any other reasons why a work based
# on this file might be covered by the GNU General Public License.
#
# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>

"""

SUMMARY
------------------------------------------------------
Utilities for dealing with email

.. seealso:: :py:mod:`pyi2ncommon.mail_validator`,
             :py:mod:`pyi2ncommon.imap_mailbox`

Copyright: Intra2net AG


INTERFACE
------------------------------------------------------

"""

from base64 import b64decode
from email.utils import parsedate_to_datetime
from email.parser import BytesParser
from email import policy

from .simple_cnf import SimpleCnf
# outsourced source, import required for compatibility
from .imap_mailbox import ImapMailbox           # pylint: disable=unused-import
from .mail_validator import *                   # pylint: disable=unused-import

# Module-level logger of this module.
# NOTE(review): `logging` is not imported explicitly in this file; presumably
# the name is provided by the star import from .mail_validator -- confirm.
log = logging.getLogger('pyi2ncommon.mail_utils')


def prep_email_header(email_file, value, regex=None, criterion="envelopeto"):
    """
    Replace value in a provided email file.

    :param str email_file: file to use for the replacement (modified in place)
    :param str value: replacement text; for criterion 'received' it is passed
                      as replacement argument to :py:func:`re.sub`, i.e. every
                      match of `regex` in the file is replaced by it
    :param regex: regular expression to use when replacing a header value;
                  mandatory for criterion 'received'
    :type regex: str or None
    :param str criterion: criterion to use for replacement, one
                          of 'envelopeto' or 'received'
    :raises: :py:class:`ValueError` if the choice of criterion is invalid or
             if criterion is 'received' and no regex was given

    For criterion 'envelopeto' this function is reusing arnied wrapper's cnf
    value preparation but for email headers.
    """
    if criterion == "envelopeto":
        logging.debug("Updating test emails' EnvelopeTo header")
        arnied_wrapper.prep_cnf_value(email_file, value, regex=regex)
    elif criterion == "received":
        if regex is None:
            # fail early with a clear message instead of the TypeError that
            # re.sub would raise for a missing pattern
            raise ValueError("A regex is required for header preparation "
                             "criterion 'received'")
        logging.debug("Updating test emails' Received header")
        with open(email_file, "r") as file_handle:
            email_text = file_handle.read()
        # substitute exactly once: running the same substitution a second
        # time corrupts the text whenever `value` itself matches `regex`
        email_text = re.sub(regex, value, email_text)
        with open(email_file, "w") as file_handle:
            file_handle.write(email_text)
    else:
        raise ValueError("Invalid header preparation criterion '%s'"
                         % criterion)


def create_users(usernames, **extra_params):
    """
    Create users for sending / receiving mail.

    The created user settings are complete with spamfilter settings and
    groupware folders. Every user gets a `user_group_member_ref` to group 2;
    this cannot yet be changed.
    NOTE(review): earlier documentation claimed membership in groups 1
    (admins) and 2 (all) but this function only adds the reference to group 2
    -- confirm whether group 1 is added elsewhere.

    :param usernames: Names of users to create; a single name may be given as
                      plain string
    :type usernames: [str] or str
    :raises: :py:class:`ValueError` if a config value is neither a `str` nor a
             `dict`

    All other params are forwarded to user config and override the defaults.
    Values must be strings; a `dict` value is forwarded as `children` of the
    respective variable (presumably a nested config as understood by
    :py:meth:`SimpleCnf.add` -- verify against that class).
    """
    if isinstance(usernames, str):
        usernames = [usernames]
    default_cnf = dict(
        user_disabled="0",
        user_locale="",
        user_password="1234test",
        user_spamfilter_blacklist="",
        user_spamfilter_potential_spam_action="FOLDER",
        user_spamfilter_potential_spam_action_destaddr="",
        user_spamfilter_potential_spam_action_folder="Spamverdacht",
        # TODO: this doesn't handle situations where the child variable should not be defined
        user_spamfilter_potential_spam_threshold="1050",
        user_spamfilter_spam_action="FOLDER",
        user_spamfilter_spam_action_destaddr="",
        user_spamfilter_spam_action_folder="Spam",
        user_spamfilter_spam_deletedays="",
        # TODO: this doesn't handle situations where the child variable should not be defined
        user_spamfilter_spam_threshold="1080",
        user_spamfilter_whitelist="",
        user_groupware_folder_drafts="INBOX/Entwürfe",
        user_groupware_folder_outbox="INBOX/Gesendete Elemente",
        user_groupware_folder_trash="INBOX/Gelöschte Elemente",
    )

    cnf = SimpleCnf()
    for username in usernames:
        # defaults first, then the per-call overrides
        curr_cnf = default_cnf.copy()
        curr_cnf['user_fullname'] = username
        curr_cnf.update(extra_params)
        children = SimpleCnf()
        for key, value in curr_cnf.items():
            if isinstance(value, dict):
                # nested settings; must not fall through to the type check
                # below, which would reject every dict
                children.add(key, children=value)
            elif isinstance(value, str):
                children.add(key, value)
            else:
                raise ValueError('Invalid value type for key "{}": {}'
                                 .format(key, type(value)))
        children.add('user_group_member_ref', "2")
        cnf.add('user', username, children=children, instance=-1)
    # nothing is applied unless all users could be assembled without error
    cnf.apply()


def parse_mail_file(file_name, headers_only=True, attachment_filenames=False,
                    raise_on_defect=False, new_message_type=False):
    """
    Parse given email file (e.g. a banned message).

    This is basically a `email.parser.BytesParser().parse(...)` with given
    `headers_only` and policy selection, that can also handle BSMTP. As an
    extra bonus, you can just request headers plus the names of attached files.

    Removes the SMTP envelope surrounding the email if present, i.e. if the
    file starts with `EHLO`, everything up to and including the `DATA` line is
    skipped. Only left-over might be a line with a '.' at end of non-multipart
    messages if `headers_only` is False. If such a file ends before a `DATA`
    line is found, there is no mail text left and the result is an empty
    message.

    :param str file_name: path to the file that contains the email text
    :param bool headers_only: whether to parse only the email headers; set this
                              to False, e.g. if you want to check for
                              attachments using message.walk()
    :param bool attachment_filenames: if you just want headers and names of
                                      attached files, set `headers_only` and
                                      this to True.
    :param bool raise_on_defect: whether to raise an error if email parser
                                 encounters a defect (like email policy
                                 `strict`) or just add the defect to message's
                                 `defect` attribute
    :param bool new_message_type: whether to return the older
                                  :py:class:`email.message.Message` (policy
                                  `compat32`, our default), or the newer
                                  :py:class:`email.message.EmailMessage` type
                                  (policy `default`). Big difference!
    :returns: either msg or 2-tuple `(msg, filenames)` if requested per arg
              `attachment_filenames`
    :rtype: :py:class:`email.message.Message` or
             (:py:class:`email.message.Message`, (str)) or
             one of these two with :py:class:`email.message.EmailMessage`
    """
    mail_policy = policy.default if new_message_type else policy.compat32
    if raise_on_defect:
        # Do not use `mail_policy + policy.strict` here: that copies all
        # instance attributes of the EmailPolicy `strict` (including its
        # `header_factory`), which `compat32` does not know and rejects.
        # clone() with just the one flag works for both policy families.
        mail_policy = mail_policy.clone(raise_on_defect=True)

    with open(file_name, 'rb') as read_handle:
        line = read_handle.readline()
        if line.startswith(b'EHLO'):
            # there is a smtp header. skip to its end; readline() returns
            # b'' at end of file, so stop there instead of looping forever
            while line and line.strip() != b'DATA':
                line = read_handle.readline()
            # the rest is the email plus a trailing '.' (ignored by parser if
            # multipart)
        else:
            read_handle.seek(0)  # forget we read the first line already
        # remember where the mail starts in case we need to parse it again
        start_pos = read_handle.tell()
        msg = BytesParser(policy=mail_policy).parse(read_handle,
                                                    headersonly=headers_only)

    if not attachment_filenames:
        return msg

    # otherwise need to parse complete message to get attachment file names
    if headers_only:
        with open(file_name, 'rb') as read_handle:
            read_handle.seek(start_pos)
            full_msg = BytesParser(policy=mail_policy).parse(read_handle,
                                                             headersonly=False)
    else:
        full_msg = msg
    filenames = [get_filename(part) for part in full_msg.walk()]
    return msg, tuple(filename for filename in filenames
                      if filename is not None)


def parse_mail_date(message):
    """
    Convert the 'Date' header of a message into a datetime.

    Thin wrapper around :py:func:`email.utils.parsedate_to_datetime` that
    copes with a missing or empty header.

    Not needed for the newer :py:class:`email.message.EmailMessage`, whose
    `Date` header is already parsed into a
    :py:class:`email.headerregistry.DateHeader`.

    :param message: Email message
    :type message: :py:class:`email.message.Message`
    :returns: datetime taken from the "Date" header or None if the header is
              missing or empty
    :rtype: :py:class:`datetime.datetime` or None
    """
    header_value = message.get('Date', '')
    return parsedate_to_datetime(header_value) if header_value else None


def get_user_mail_files(user, mailbox='INBOX'):
    """
    Yield the file names of all mails in one folder of one user.

    Looks at the local cyrus file system below `/datastore/imap-mails/user`,
    not at an imap server. Only directory entries whose name starts with
    digits followed by a dot are considered mails.

    :param str user: Name of user whose mailbox is analyzed
    :param str mailbox: name of mailbox to use, INBOX (default) for base
                        folder; a leading "INBOX" path element is dropped and
                        each remaining element is converted using
                        :py:func:`cyrus_escape`
    :returns: nothing; but yields full path to messages on disc
    """
    user_dir = os.path.join('/datastore', 'imap-mails', 'user', user)

    # "INBOX/sub/dir" refers to "sub/dir" below the user's base folder
    path_elements = mailbox.split('/')
    if path_elements[0].upper() == 'INBOX':
        path_elements = path_elements[1:]
    escaped_elements = (cyrus_escape(element) for element in path_elements)
    mail_dir = os.path.join(user_dir, *escaped_elements)

    for entry in os.listdir(mail_dir):
        if re.match(r'\d+\.', entry):
            yield os.path.join(mail_dir, entry)


def get_user_mail(user, mailbox='INBOX', **kwargs):
    """
    Yield all mails in one folder of one user, parsed.

    :param str user: see :py:func:`get_user_mail_files`
    :param str mailbox: see :py:func:`get_user_mail_files`
    :param dict kwargs: all other args are forwarded to
                        :py:func:`parse_mail_file`
    :returns: nothing; but yields 2-tuples (path, email_msg): the full path
              to the message on disc and whatever
              :py:func:`parse_mail_file` returns for that file
    """
    yield from (
        (mail_path, parse_mail_file(mail_path, **kwargs))
        for mail_path in get_user_mail_files(user, mailbox)
    )


def get_message_text(filename, fallback_encoding='iso8859-1',
                     include_all_text=False):
    """
    Return the text of an email message file as string(s).

    Meant to be used together with get_user_mail, e.g. ::

        for filename, msg in get_user_mail(user):
            # rough filtering based on headers
            if msg['Subject'] != 'Expected Subject':
                continue
            # get message text for closer inspection
            text = get_message_text(filename)
            if 'Expected Text' not in text:
                continue
            ...

    The message is parsed completely. Preference order of the parts used:

    1. parts of type text/plain
    2. other parts of type "text/*"
    3. if there is no text part at all: the first leaf part of the message

    Each part is decoded with the charset given in the mail or, if there is
    none, with `fallback_encoding`.

    Without `include_all_text` only the first part in this order is returned;
    with it, all text/plain parts followed by all other "text/*" parts.

    :param str filename: complete path of message file in filesystem
    :param str fallback_encoding: Encoding of email text if none is specified
                                  in mail.
    :param bool include_all_text: include all "text/*" parts in returned text
    :returns: text(s) of message
    :rtype: [str] if include_all_text else str
    """
    def decode_part(part):
        """Return payload of given message part as str."""
        charset = part.get_content_charset(fallback_encoding)
        return part.get_payload(decode=True).decode(charset)

    message = parse_mail_file(filename, headers_only=False)

    # first choice: text/plain
    texts = [decode_part(part) for part in message.walk()
             if part.get_content_type() == 'text/plain']
    if texts and not include_all_text:
        return texts[0]

    # second choice: any other kind of text
    for part in message.walk():
        content_type = part.get_content_type()
        if content_type != 'text/plain' and content_type.startswith('text/'):
            texts.append(decode_part(part))
    if texts:
        return texts if include_all_text else texts[0]

    # last resort: descend to the very first non-multipart part
    first_part = message
    while first_part.is_multipart():
        first_part = first_part.get_payload(0)
    text = decode_part(first_part)
    return [text] if include_all_text else text


def cyrus_escape(user_or_folder, keep_path=False, regex=False):
    """
    Translate a user or mailbox folder name into cyrus notation.

    This is a hack consisting of a fixed list of text replacements that are
    applied one after the other:

    * . --> ^
    * ü --> &APw-  ,  ä --> &AOQ-  ,  ö --> &APY-
      (this is modified utf-7; extend the list if more is needed)
    * inbox, INBOX, Inbox -->   (the empty string)
    * / --> .  (unless keep_path is True)
    * if regex is True: ^, &, . and $ get a backslash prepended

    A general modified utf-7-encoder/decoder would be nicer but python has
    none builtin (see https://bugs.python.org/issue5305) and an extra lib like
    https://bitbucket.org/mjs0/imapclient/ would be overkill. The input to
    this function is under our control, so this amount of umlaut support is
    considered sufficient.

    :param str user_or_folder: name of the user or folder string to escape
    :param bool keep_path: do not replace '/' with '.' so can still use result
                           as path name
    :param bool regex: result is used in grep or other regex, so ^, &, . and $
                       are escaped again with a backslash
    :returns: escaped user or folder string
    :rtype: str

    .. seealso:: :py:func:`cyrus_unescape`
    """
    # the order of this list matters since later replacements see the
    # output of earlier ones
    substitutions = [
        ('.', '^'),
        ('ü', '&APw-'),
        ('ä', '&AOQ-'),
        ('ö', '&APY-'),
        ('inbox', ''),
        ('INBOX', ''),
        ('Inbox', ''),
    ]
    if not keep_path:
        substitutions.append(('/', '.'))
    if regex:
        substitutions.extend([
            ('^', r'\^'),
            ('&', r'\&'),
            ('.', r'\.'),
            ('$', r'\$'),
        ])

    escaped = user_or_folder
    for plain, replacement in substitutions:
        escaped = escaped.replace(plain, replacement)
    return escaped


def cyrus_unescape(user_or_folder):
    """
    Revert (some of) the replacements done by :py:func:`cyrus_escape`.

    The empty string becomes 'inbox'; otherwise '.' is turned into '/' and
    both a backslash-escaped and a plain '^' are turned into '.'. The umlaut
    encoding is not reverted.

    :param str user_or_folder: name of the user or folder string to unescape
    :returns: unescaped user or folder string
    :rtype: str
    """
    if user_or_folder == '':
        return 'inbox'

    # order matters: path separators first, then the (escaped) carets
    unescaped = user_or_folder
    for escaped_text, plain_text in (('.', '/'), (r'\^', '.'), ('^', '.')):
        unescaped = unescaped.replace(escaped_text, plain_text)
    return unescaped


def get_filename(message, failobj=None, do_unwrap=True):
    """
    Get filename of a message part, even if it is encoded.

    For attachments where the complete Content-Disposition header is wrapped
    into MIME encoded-words, :py:func:`email.message.Message.get_filename()`
    does not work. This function tries that first and if it fails tries to
    interprete the Content-Disposition of the message part, decoding both
    base64 ("B") and quoted-printable ("Q") encoded-words. If all fails,
    returns `failobj`.

    Only for file names found by the regular `get_filename()`: also unwraps
    file names if they are line-wrapped. But note that this may remove too
    much from the filename if line-wrapping happened in the same position as
    the filename's whitespace. To get the original (still wrapped) version,
    set param `do_unwrap` to `False`.

    See also: https://en.wikipedia.org/wiki/MIME#Encoded-Word

    :param message: message part, e.g. from
                    :py:meth:`email.message.Message.walk`
    :type message: :py:class:`email.message.Message` or
                   :py:class:`email.message.EmailMessage`
    :param failobj: object to return in case of failure (defaults to None)
    :param bool do_unwrap: undo line-break inserted by mail-creator; may remove
                           whitespace from file name; only applies to file
                           names returned by the regular `get_filename()`
    :returns: either a string or failobj
    :raises: :py:class:`ValueError` if an encoded-word specifies an encoding
             other than "B" or "Q"
    """
    # local import: only needed for the rarely used Q-encoding below
    import quopri

    # try the old way and unwrap
    filename = message.get_filename(failobj)

    if isinstance(filename, bytes) and not filename.startswith(b'=?') \
            and not filename.endswith(b'?='):
        filename = filename.decode('utf8')

    if isinstance(filename, str):
        if do_unwrap:
            return re.sub('[\\r\\n]+', '', filename)
        return filename

    if 'Content-Disposition' not in message:
        return failobj

    # try parsing content-disposition. e.g.:
    # attachment; filename="2018年度公开课计划表.xlsx"   -->
    # '=?utf-8?b?YXR0YWNobWVudDsgZmlsZW5hbWU9IjIwMTjlubTluqY=?=\r\n =?utf-8?b?'
    # '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?='

    # This may be a re-implementation of email.utils.collapse_rfc2231_value()
    # as mentioned in email.message.EmailMessage.get_param()

    # The form is: "=?charset?encoding?encoded text?="
    SPLIT_REGEX = '\r?\n *'    # should be CRNL but some files miss the \r
    ENCODED_WORD_REGEX = r'\s*=\?([^?]+)\?([^?]+)\?(.*)\?=\s*$'
    LINE_REGEX = r'attachment\s*;\s*filename=(")?(.+)\1\s*$'
    decoded = []
    for word in re.split(SPLIT_REGEX, message['Content-Disposition']):
        match = re.match(ENCODED_WORD_REGEX, word)
        if not match:
            # not (or no longer) an encoded word; use what we have so far
            break
        charset, encoding, data = match.groups()
        if encoding.lower() == 'b':
            temp = b64decode(data)
        elif encoding.lower() == 'q':
            # header=True makes quopri translate '_' to a space as required
            # for Q-encoded header words
            temp = quopri.decodestring(data.encode('ascii'), header=True)
        else:
            raise ValueError('not allowed according to wikipedia: "{}"'
                             .format(encoding))
        decoded.append(temp.decode(charset))
    decoded = u''.join(decoded)

    # the decoded text should look like: attachment; filename="name.ext"
    match = re.match(LINE_REGEX, decoded)
    if match:
        return match.groups()[1]
    return failobj
Commit	Line	Data
f49f6323	1	# This Python file uses the following encoding: utf-8
11cbb815 PD	2
	3	# The software in this package is distributed under the GNU General
	4	# Public License version 2 (with a special exception described below).
	5	#
	6	# A copy of GNU General Public License (GPL) is included in this distribution,
	7	# in the file COPYING.GPL.
	8	#
	9	# As a special exception, if other files instantiate templates or use macros
	10	# or inline functions from this file, or you compile this file and link it
	11	# with other works to produce a work based on this file, this file
	12	# does not by itself cause the resulting work to be covered
	13	# by the GNU General Public License.
	14	#
	15	# However the source code for this file must still be made available
	16	# in accordance with section (3) of the GNU General Public License.
	17	#
	18	# This exception does not invalidate any other reasons why a work based
	19	# on this file might be covered by the GNU General Public License.
	20	#
	21	# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
	22
f49f6323 PD	23	"""
	24
	25	SUMMARY
	26	------------------------------------------------------
2ed7100d CH	27	Utilities for dealing with email
	28
	29	.. seealso:: :py:mod:`pyi2ncommon.mail_validator`,
	30	:py:mod:`pyi2ncommon.imap_mailbox`
f49f6323 PD	31
	32	Copyright: Intra2net AG
	33
	34
	35	INTERFACE
	36	------------------------------------------------------
	37
	38	"""
	39
b36398e7	40	from base64 import b64decode
67177844	41	from email.utils import parsedate_to_datetime
1d21262c	42	from email.parser import BytesParser
4b44f515	43	from email import policy
f49f6323	44
998bc6bb	45	from .simple_cnf import SimpleCnf
67177844 CH	46	# outsourced source, import required for compatiblity
	47	from .imap_mailbox import ImapMailbox # pylint: disable=unused-import
	48	from .mail_validator import * # pylint: disable=unused-import
f49f6323	49
67177844	50	log = logging.getLogger('pyi2ncommon.mail_utils')
f49f6323 PD	51
	52
	53	def prep_email_header(email_file, value, regex=None, criterion="envelopeto"):
	54	"""
	55	Replace value in a provided email file.
	56
	57	:param str email_file: file to use for the replacement
	58	:param str value: value to replace the first matched group with
	59	:param regex: regular expression to use when replacing a header value
	60	:type regex: str or None
	61	:param str criterion: criterion to use for replacement, one
	62	of 'envelopeto' or 'received'
	63	:raises: :py:class:`ValueError` if the choice of criterion is invalid
	64
	65	In some cases this function is reusing arnied wrapper's cnf value
	66	preparation but for email headers.
	67	"""
	68	if criterion == "envelopeto":
	69	logging.debug("Updating test emails' EnvelopeTo header")
	70	arnied_wrapper.prep_cnf_value(email_file, value, regex=regex)
	71	elif criterion == "received":
	72	logging.debug("Updating test emails' Received header")
e108b7d4 CH	73	with open(email_file, "r") as file_handle:
e108b7d4 CH	74	email_text = file_handle.read()
f49f6323 PD	75	email_text = re.sub(regex, value, email_text)
f49f6323 PD	76	email_text = re.sub(regex, value, email_text)
e108b7d4 CH	77	with open(email_file, "w") as file_handle:
e108b7d4 CH	78	file_handle.write(email_text)
f49f6323	79	else:
e108b7d4 CH	80	raise ValueError("Invalid header preparation criterion '%s'"
e108b7d4 CH	81	% criterion)
f49f6323 PD	82
f49f6323 PD	83
998bc6bb	84	def create_users(usernames, **extra_params):
f49f6323	85	"""
998bc6bb	86	Create users for sending / receiving mail.
f49f6323	87
998bc6bb CH	88	The created user settings are complete with spamfilter settings and
	89	groupare folders. User is per default member in groups 1 (admins) and
	90	2 (all). This cannot yet be changed.
f49f6323	91
998bc6bb CH	92	:param usernames: Names of users to create
998bc6bb CH	93	:type usernames: [str]
f49f6323	94
998bc6bb CH	95	All other params are forwarded to user config
	96	"""
	97	if isinstance(usernames, str):
	98	usernames = [usernames,]
	99	default_cnf = dict(
	100	user_disabled="0",
	101	user_locale="",
	102	user_password="1234test",
	103	user_spamfilter_blacklist="",
	104	user_spamfilter_potential_spam_action="FOLDER",
	105	user_spamfilter_potential_spam_action_destaddr="",
	106	user_spamfilter_potential_spam_action_folder="Spamverdacht",
	107	# TODO: this doesn't handle situations where the child variable should not be defined
	108	user_spamfilter_potential_spam_threshold="1050",
	109	user_spamfilter_spam_action="FOLDER",
	110	user_spamfilter_spam_action_destaddr="",
	111	user_spamfilter_spam_action_folder="Spam",
	112	user_spamfilter_spam_deletedays="",
	113	# TODO: this doesn't handle situations where the child variable should not be defined
	114	user_spamfilter_spam_threshold="1080",
	115	user_spamfilter_whitelist="",
	116	user_groupware_folder_drafts="INBOX/Entwürfe",
	117	user_groupware_folder_outbox="INBOX/Gesendete Elemente",
	118	user_groupware_folder_trash="INBOX/Gelöschte Elemente",
	119	)
	120
	121	cnf = SimpleCnf()
f49f6323	122	for username in usernames:
998bc6bb CH	123	curr_cnf = default_cnf.copy()
	124	curr_cnf['user_fullname'] = username
	125	curr_cnf.update(extra_params)
	126	children = SimpleCnf()
	127	for key, value in curr_cnf.items():
	128	if isinstance(value, dict):
	129	children.add(key, children=value)
	130	if not isinstance(value, str):
	131	raise ValueError('Invalid value type for key "{}": {}'
	132	.format(key, type(value)))
	133	children.add(key, value)
	134	children.add('user_group_member_ref', "2")
	135	cnf.add('user', username, children=children, instance=-1)
	136	cnf.apply()
f49f6323 PD	137
f49f6323 PD	138
4b44f515 CH	139	def parse_mail_file(file_name, headers_only=True, attachment_filenames=False,
4b44f515 CH	140	raise_on_defect=False, new_message_type=False):
f49f6323 PD	141	"""
	142	Parse given email file (e.g. a banned message).
	143
1d21262c	144	This is basically a `email.parser.BytesParser().parse(...)` with given
4b44f515 CH	145	`headers_only` and policy selection, that can also handle BSMTP. As an
4b44f515 CH	146	extra bonus, you can just request headers plus the names of attached files.
f49f6323 PD	147
f49f6323 PD	148	Removes the SMTP envelope surrounding the email if present. Only left-over
e108b7d4 CH	149	might be a line with a '.' at end of non-multipart messages if
e108b7d4 CH	150	`headers_only` is False.
b359b15c	151
4b44f515	152	:param str file_name: path to the file that contains the email text
b359b15c CH	153	:param bool headers_only: whether to parse only the email headers; set this
	154	to False, e.g. if you want to check for
	155	attachments using message.walk()
	156	:param bool attachment_filenames: if you just want headers and names of
	157	attached files, set `headers_only` and
	158	this to True.
4b44f515 CH	159	:param bool raise_on_defect: whether to raise an error if email parser
	160	encounters a defect (email policy `strict`) or
	161	just add the defect to message's `defect`
	162	attribute
	163	:param bool new_message_type: whether to return the older
	164	:py:class:`email.message.Message` (policy
	165	`compat32`, our default), or the newer
	166	:py:class:`email.message.EmailMessage` type
	167	(policy `default`). Big difference!
b359b15c CH	168	:returns: either msg or 2-tuple `(msg, filenames)` if requested per arg
	169	`attachment_filenames`
	170	:rtype: :py:class:`email.message.Message` or
4b44f515 CH	171	(:py:class:`email.message.Message`, (str)) or
4b44f515 CH	172	one of these two with :py:class:`email.message.EmailMessage`
f49f6323	173	"""
b359b15c CH	174	msg = None
b359b15c CH	175	start_pos = 0
4b44f515 CH	176
	177	if new_message_type:
	178	mail_policy = policy.default
	179	else:
	180	mail_policy = policy.compat32
	181	if raise_on_defect:
	182	mail_policy += policy.strict
	183
1d21262c	184	with open(file_name, 'rb') as read_handle:
f49f6323	185	line = read_handle.readline()
1d21262c	186	if line.startswith(b'EHLO'):
f49f6323	187	# there is a smtp header. skip to its end
1d21262c	188	while line.strip() != b'DATA':
f49f6323 PD	189	line = read_handle.readline()
	190	# the rest is the email plus a trailing '.' (ignored by parser if
	191	# multipart)
	192	else:
	193	read_handle.seek(0) # forget we read the first line already
b359b15c	194	start_pos = read_handle.tell()
4b44f515 CH	195	msg = BytesParser(policy=mail_policy).parse(read_handle,
4b44f515 CH	196	headersonly=headers_only)
b359b15c CH	197
	198	if not attachment_filenames:
	199	return msg
	200
	201	# otherwise need to parse complete message to get attachment file names
	202	if headers_only:
1d21262c	203	with open(file_name, 'rb') as read_handle:
b359b15c	204	read_handle.seek(start_pos)
4b44f515 CH	205	full_msg = BytesParser(policy=mail_policy).parse(read_handle,
4b44f515 CH	206	headersonly=False)
b359b15c CH	207	else:
	208	full_msg = msg
	209	filenames = [get_filename(part) for part in full_msg.walk()]
	210	return msg, tuple(filename for filename in filenames
	211	if filename is not None)
f49f6323 PD	212
f49f6323 PD	213
58414aec CH	214	def parse_mail_date(message):
	215	"""
	216	Parse the 'Date' header of the given message.
	217
	218	Shortcut for :py:func:`email.utils.parsedate_to_datetime`.
	219
	220	This is no longer necessary for newer
	221	:py:class:`email.message.EmailMessage` since the `Date` Header is
	222	automatically parsed to a :py:class:`email.headerregistry.DateHeader`.
	223
	224	:param message: Email message
	225	:type message: :py:class:`email.message.Message`
	226	:returns: datetime from Email "Date" header or None if header not present
	227	:rtype: :py:class:`datetime.datetime` or None
	228	"""
	229	date_str = message.get('Date', '')
	230	if not date_str:
	231	return None
	232	return parsedate_to_datetime(date_str)
	233
	234
f44055b0 CH	235	def get_user_mail_files(user, mailbox='INBOX'):
f44055b0 CH	236	"""
2ed7100d CH	237	Iterate over mails in given folder of given user; yields file names.
	238
	239	Works on local cyrus file system, not on imap server.
f44055b0	240
2ed7100d CH	241	:param str user: Name of user whose mailbox is analyzed
	242	:param str mailbox: name of mailbox to use, INBOX (default) for base
	243	folder; name is modified using :py:func:`cyrus_escape`
f44055b0 CH	244	:returns: nothing; but yields full path to messages on disc
	245	"""
	246	# base folder of user mail
	247	folder = os.path.join('/datastore', 'imap-mails', 'user', user)
	248
2ed7100d	249	# adapt paths like "INBOX/sub/dir" to "sub/dir"
f44055b0 CH	250	subdirs = mailbox.split('/')
	251	if subdirs[0].upper() == 'INBOX':
	252	subdirs = subdirs[1:]
	253	folder = os.path.join(folder,
	254	*(cyrus_escape(subdir) for subdir in subdirs))
	255
	256	for filename in os.listdir(folder):
	257	if not re.match(r'\d+\.', filename):
	258	continue
	259	full_path = os.path.join(folder, filename)
	260	yield full_path
	261
	262
f49f6323 PD	263	def get_user_mail(user, mailbox='INBOX', **kwargs):
f49f6323 PD	264	"""
e108b7d4	265	Iterate over mails in given folder of given user; yields parsed mails.
f49f6323	266
2ed7100d CH	267	:param str user: see :py:func:`get_user_mail_files`
2ed7100d CH	268	:param str mailbox: see :py:func:`get_user_mail_files`
f49f6323 PD	269	:param dict kwargs: all other args are forwarded to
	270	:py:func:`parse_mail_file`
	271	:returns: nothing; but yields 2-tuples (path, email_msg) where first is the
	272	full path to the message on disc, and the latter is the outcome
	273	of :py:func:`parse_mail_file` for that file
	274	"""
f44055b0 CH	275	for full_path in get_user_mail_files(user, mailbox):
f44055b0 CH	276	yield full_path, parse_mail_file(full_path, **kwargs)
f49f6323 PD	277
f49f6323 PD	278
def get_message_text(filename, fallback_encoding='iso8859-1',
                     include_all_text=False):
    """
    Return the text of an email message file as string(s).

    Meant to complement get_user_mail, e.g. ::

        for filename, msg in get_user_mail(user):
            # rough filtering based on headers
            if msg['Subject'] != 'Expected Subject':
                continue
            # get message text for closer inspection
            text = get_message_text(filename)
            if 'Expected Text' not in text:
                continue
            ...

    The message is parsed completely (not only its headers). Parts of type
    text/plain are preferred; if there are none, other "text/*" parts are
    used; if there are none of those either, the first non-multipart part of
    the message is used. Each part is decoded with the charset declared in
    the part, or with `fallback_encoding` if the part declares none.

    If `include_all_text` is True, all "text/*" parts are returned as a list,
    with the text/plain parts coming first.

    :param str filename: complete path of message file in filesystem
    :param str fallback_encoding: Encoding of email text if none is specified
                                  in mail.
    :param bool include_all_text: include all "text/*" parts in returned text
    :returns: text(s) of message
    :rtype: [str] if include_all_text else str
    """
    def decoded_text(part):
        """Decode the payload of one message part to a string."""
        charset = part.get_content_charset(fallback_encoding)
        return part.get_payload(decode=True).decode(charset)

    message = parse_mail_file(filename, headers_only=False)

    # first pass: plain text parts only
    texts = [decoded_text(part) for part in message.walk()
             if part.get_content_type() == 'text/plain']
    if texts and not include_all_text:
        return texts[0]

    # second pass: every other kind of "text/*" (appended after text/plain)
    for part in message.walk():
        content_type = part.get_content_type()
        if content_type == 'text/plain':
            continue
        if content_type.startswith('text/'):
            texts.append(decoded_text(part))

    if texts:
        return texts if include_all_text else texts[0]

    # no text at all: descend to the very first leaf part and use that
    first_part = message
    while first_part.is_multipart():
        first_part = first_part.get_payload(0)

    text = decoded_text(first_part)
    return [text, ] if include_all_text else text
	341
	342
def cyrus_escape(user_or_folder, keep_path=False, regex=False):
    """
    Convert names of users or mailbox folders to cyrus format.

    This is a hack that only performs these hard-coded replacements, in
    this order:

    * ``.`` --> ``^``
    * ``ü`` --> ``&APw-`` , ``ä`` --> ``&AOQ-`` , ``ö`` --> ``&APY-``
      (if more are needed: this is modified utf-7)
    * ``inbox``, ``INBOX`` and ``Inbox`` --> (the empty string)
    * ``/`` --> ``.`` (except if keep_path is True)

    A general modified utf-7 encoder/decoder would be preferable but python
    has none builtin (see https://bugs.python.org/issue5305) and an extra lib
    like https://bitbucket.org/mjs0/imapclient/ would be overkill. After all,
    the input to this function is controlled via params.

    :param str user_or_folder: name of the user or folder string to escape
    :param bool keep_path: do not replace '/' with '.' so can still use result
                           as path name
    :param bool regex: result is used in grep or other regex, so ^, &, . and
                       $ are escaped again with a backslash
    :returns: escaped user or folder string
    :rtype: str

    .. seealso:: :py:func:`cyrus_unescape`
    """
    # order matters: dots must become carets before slashes become dots
    substitutions = [
        ('.', '^'),
        ('ü', '&APw-'),
        ('ä', '&AOQ-'),
        ('ö', '&APY-'),
        ('inbox', ''),
        ('INBOX', ''),
        ('Inbox', ''),
    ]
    if not keep_path:
        substitutions.append(('/', '.'))

    escaped = user_or_folder
    for old, new in substitutions:
        escaped = escaped.replace(old, new)

    if not regex:
        return escaped

    # protect characters that have a special meaning in regular expressions
    for special in '^&.$':
        escaped = escaped.replace(special, '\\' + special)
    return escaped
f49f6323 PD	381
	382
	383	def cyrus_unescape(user_or_folder):
	384	"""
	385	Undo effects of :py:func:`cyrus_escape` (but not all of them).
	386
	387	:param str user_or_folder: name of the user or folder string to unescape
	388	:returns: unescaped user or folder string
	389	:rtype: str
	390	"""
	391	if user_or_folder == '':
	392	return 'inbox'
	393	return user_or_folder.replace('.', '/')\
	394	.replace(r'\^', '.').replace('^', '.')
b36398e7 CH	395
	396
	397	def get_filename(message, failobj=None, do_unwrap=True):
	398	"""
e108b7d4	399	Get filename of a message part, even if it is base64-encoded.
b36398e7 CH	400
b36398e7 CH	401	For attachments with base64-encoded file name, the
2ed7100d CH	402	:py:func:`email.message.Message.get_filename()` does not work. This
	403	function tries that first and if it fails tries to interprete the
	404	Content-Disposition of the message part. If all fails, returns `failobj`.
b36398e7 CH	405
	406	Only for ascii filenames: also unwraps file names if they are line-wrapped.
	407	But note that this may remove too much whitespace from the filename if
7628bc48	408	line-wrapping happened in the same position as the filename's whitespace.
b36398e7 CH	409	To get unwrapped version, set param `do_unwrap` to `False`.
	410
	411	See also: https://en.wikipedia.org/wiki/MIME#Encoded-Word
	412
	413	:param message: message part, e.g. from
	414	:py:meth:`email.message.Message.walk`
4b44f515 CH	415	:type message: :py:class:`email.message.Message` or
4b44f515 CH	416	:py:class:`email.message.EmailMessage`
b36398e7 CH	417	:param failobj: object to return in case of failure (defaults to None)
	418	:param bool do_unwrap: undo line-break inserted by mail-creator; may remove
	419	whitespace from file name; only applies to ascii
	420	file names
	421	:returns: either a string or failobj
	422	"""
	423	# try the old way and unwrap
	424	filename = message.get_filename(failobj)
	425
	426	if isinstance(filename, bytes) and not filename.startswith(b'=?') \
	427	and not filename.endswith(b'?='):
	428	filename = filename.decode('utf8')
	429
	430	if isinstance(filename, str):
	431	if do_unwrap:
	432	return re.sub('[\\r\\n]+', '', filename)
	433	return filename
	434
	435	if 'Content-Disposition' not in message:
	436	return failobj
	437
	438	# try parsing content-disposition. e.g.:
	439	# attachment; filename="2018年度公开课计划表.xlsx" -->
	440	# '=?utf-8?b?YXR0YWNobWVudDsgZmlsZW5hbWU9IjIwMTjlubTluqY=?=\r\n =?utf-8?b?'
	441	# '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?='
	442
	443	# This may be a re-implementation of email.utils.collapse_rfc2231_value()
4b44f515	444	# as mentioned in email.message.EmailMessage.get_param()
b36398e7 CH	445
	446	# The form is: "=?charset?encoding?encoded text?="
	447	SPLIT_REGEX = '\r?\n *' # should be CRNL but some files miss the \r
	448	ENCODED_WORD_REGEX = r'\s=\?([^?]+)\?([^?]+)\?(.)\?=\s*$'
	449	LINE_REGEX = r'attachment\s;\sfilename=(")?(.+)\1\s*$'
	450	decoded = []
	451	for word in re.split(SPLIT_REGEX, message['Content-Disposition']):
	452	match = re.match(ENCODED_WORD_REGEX, word)
	453	if not match:
	454	break
	455	charset, encoding, data = match.groups()
	456	if encoding.lower() == 'b':
	457	temp = b64decode(data)
	458	elif encoding.lower() == 'q':
	459	raise NotImplementedError('use quopri.decodestring, handle _')
	460	else:
	461	raise ValueError('not allowed according to wikipedia: "{}"'
	462	.format(encoding))
	463	decoded.append(temp.decode(charset))
	464	decoded = u''.join(decoded)
	465
	466	match = re.match(LINE_REGEX, decoded)
	467	if match:
	468	return match.groups()[1]
	469	return failobj