developer.intra2net.com Git - pyi2ncommon/blob - src/mail_utils.py

   1 # This Python file uses the following encoding: utf-8
   2
   3 # The software in this package is distributed under the GNU General
   4 # Public License version 2 (with a special exception described below).
   5 #
   6 # A copy of GNU General Public License (GPL) is included in this distribution,
   7 # in the file COPYING.GPL.
   8 #
   9 # As a special exception, if other files instantiate templates or use macros
  10 # or inline functions from this file, or you compile this file and link it
  11 # with other works to produce a work based on this file, this file
  12 # does not by itself cause the resulting work to be covered
  13 # by the GNU General Public License.
  14 #
  15 # However the source code for this file must still be made available
  16 # in accordance with section (3) of the GNU General Public License.
  17 #
  18 # This exception does not invalidate any other reasons why a work based
  19 # on this file might be covered by the GNU General Public License.
  20 #
  21 # Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
  22
  23 """
  24
  25 SUMMARY
  26 ------------------------------------------------------
  27 Utilities for dealing with email
  28
  29 .. seealso:: :py:mod:`pyi2ncommon.mail_validator`,
  30              :py:mod:`pyi2ncommon.imap_mailbox`
  31
  32 Copyright: Intra2net AG
  33
  34
  35 INTERFACE
  36 ------------------------------------------------------
  37
  38 """
  39
  40 from base64 import b64decode
  41 from email.utils import parsedate_to_datetime
  42 from email.parser import BytesParser
  43 from email import policy
  44
  45 from .simple_cnf import SimpleCnf
  46 # outsourced source, import required for compatibility
  47 from .imap_mailbox import ImapMailbox           # pylint: disable=unused-import
  48 from .mail_validator import *                   # pylint: disable=unused-import
  49
# Module-level logger registered under this module's dotted package name.
log = logging.getLogger('pyi2ncommon.mail_utils')
  51
  52
  53 def prep_email_header(email_file, value, regex=None, criterion="envelopeto"):
  54     """
  55     Replace value in a provided email file.
  56
  57     :param str email_file: file to use for the replacement
  58     :param str value: value to replace the first matched group with
  59     :param regex: regular expression to use when replacing a header value
  60     :type regex: str or None
  61     :param str criterion: criterion to use for replacement, one
  62                           of 'envelopeto' or 'received'
  63     :raises: :py:class:`ValueError` if the choice of criterion is invalid
  64
  65     In some cases this function is reusing arnied wrapper's cnf value
  66     preparation but for email headers.
  67     """
  68     if criterion == "envelopeto":
  69         logging.debug("Updating test emails' EnvelopeTo header")
  70         arnied_wrapper.prep_cnf_value(email_file, value, regex=regex)
  71     elif criterion == "received":
  72         logging.debug("Updating test emails' Received header")
  73         with open(email_file, "r") as file_handle:
  74             email_text = file_handle.read()
  75             email_text = re.sub(regex, value, email_text)
  76             email_text = re.sub(regex, value, email_text)
  77         with open(email_file, "w") as file_handle:
  78             file_handle.write(email_text)
  79     else:
  80         raise ValueError("Invalid header preparation criterion '%s'"
  81                          % criterion)
  82
  83
  84 def create_users(usernames, **extra_params):
  85     """
  86     Create users for sending / receiving mail.
  87
  88     The created user settings are complete with spamfilter settings and
  89     groupare folders. User is per default member in groups 1 (admins) and
  90     2 (all). This cannot yet be changed.
  91
  92     :param usernames: Names of users to create
  93     :type usernames: [str]
  94
  95     All other params are forwarded to user config
  96     """
  97     if isinstance(usernames, str):
  98         usernames = [usernames,]
  99     default_cnf = dict(
 100         user_disabled="0",
 101         user_locale="",
 102         user_password="1234test",
 103         user_spamfilter_blacklist="",
 104         user_spamfilter_potential_spam_action="FOLDER",
 105         user_spamfilter_potential_spam_action_destaddr="",
 106         user_spamfilter_potential_spam_action_folder="Spamverdacht",
 107         # TODO: this doesn't handle situations where the child variable should not be defined
 108         user_spamfilter_potential_spam_threshold="1050",
 109         user_spamfilter_spam_action="FOLDER",
 110         user_spamfilter_spam_action_destaddr="",
 111         user_spamfilter_spam_action_folder="Spam",
 112         user_spamfilter_spam_deletedays="",
 113         # TODO: this doesn't handle situations where the child variable should not be defined
 114         user_spamfilter_spam_threshold="1080",
 115         user_spamfilter_whitelist="",
 116         user_groupware_folder_drafts="INBOX/Entwürfe",
 117         user_groupware_folder_outbox="INBOX/Gesendete Elemente",
 118         user_groupware_folder_trash="INBOX/Gelöschte Elemente",
 119     )
 120
 121     cnf = SimpleCnf()
 122     for username in usernames:
 123         curr_cnf = default_cnf.copy()
 124         curr_cnf['user_fullname'] = username
 125         curr_cnf.update(extra_params)
 126         children = SimpleCnf()
 127         for key, value in curr_cnf.items():
 128             if isinstance(value, dict):
 129                 children.add(key, children=value)
 130             if not isinstance(value, str):
 131                 raise ValueError('Invalid value type for key "{}": {}'
 132                                  .format(key, type(value)))
 133             children.add(key, value)
 134         children.add('user_group_member_ref', "2")
 135         cnf.add('user', username, children=children, instance=-1)
 136     cnf.apply()
 137
 138
 139 def parse_mail_file(file_name, headers_only=True, attachment_filenames=False,
 140                     raise_on_defect=False, new_message_type=False):
 141     """
 142     Parse given email file (e.g. a banned message).
 143
 144     This is basically a `email.parser.BytesParser().parse(...)` with given
 145     `headers_only` and policy selection, that can also handle BSMTP. As an
 146     extra bonus, you can just request headers plus the names of attached files.
 147
 148     Removes the SMTP envelope surrounding the email if present. Only left-over
 149     might be a line with a '.' at end of non-multipart messages if
 150      `headers_only` is False.
 151
 152     :param str file_name: path to the file that contains the email text
 153     :param bool headers_only: whether to parse only the email headers; set this
 154                               to False, e.g. if you want to check for
 155                               attachments using message.walk()
 156     :param bool attachment_filenames: if you just want headers and names of
 157                                       attached files, set `headers_only` and
 158                                       this to True.
 159     :param bool raise_on_defect: whether to raise an error if email parser
 160                                  encounters a defect (email policy `strict`) or
 161                                  just add the defect to message's `defect`
 162                                  attribute
 163     :param bool new_message_type: whether to return the older
 164                                   :py:class:`email.message.Message` (policy
 165                                   `compat32`, our default), or the newer
 166                                   :py:class:`email.message.EmailMessage` type
 167                                   (policy `default`). Big difference!
 168     :returns: either msg or 2-tuple `(msg, filenames)` if requested per arg
 169               `attachment_filenames`
 170     :rtype: :py:class:`email.message.Message` or
 171              (:py:class:`email.message.Message`, (str)) or
 172              one of these two with :py:class:`email.message.EmailMessage`
 173     """
 174     msg = None
 175     start_pos = 0
 176
 177     if new_message_type:
 178         mail_policy = policy.default
 179     else:
 180         mail_policy = policy.compat32
 181     if raise_on_defect:
 182         mail_policy += policy.strict
 183
 184     with open(file_name, 'rb') as read_handle:
 185         line = read_handle.readline()
 186         if line.startswith(b'EHLO'):
 187             # there is a smtp header. skip to its end
 188             while line.strip() != b'DATA':
 189                 line = read_handle.readline()
 190             # the rest is the email plus a trailing '.' (ignored by parser if
 191             # multipart)
 192         else:
 193             read_handle.seek(0)  # forget we read the first line already
 194         start_pos = read_handle.tell()
 195         msg = BytesParser(policy=mail_policy).parse(read_handle,
 196                                                     headersonly=headers_only)
 197
 198     if not attachment_filenames:
 199         return msg
 200
 201     # otherwise need to parse complete message to get attachment file names
 202     if headers_only:
 203         with open(file_name, 'rb') as read_handle:
 204             read_handle.seek(start_pos)
 205             full_msg = BytesParser(policy=mail_policy).parse(read_handle,
 206                                                              headersonly=False)
 207     else:
 208         full_msg = msg
 209     filenames = [get_filename(part) for part in full_msg.walk()]
 210     return msg, tuple(filename for filename in filenames
 211                       if filename is not None)
 212
 213
 214 def parse_mail_date(message):
 215     """
 216     Parse the 'Date' header of the given message.
 217
 218     Shortcut for :py:func:`email.utils.parsedate_to_datetime`.
 219
 220     This is no longer necessary for newer
 221     :py:class:`email.message.EmailMessage` since the `Date` Header is
 222     automatically parsed to a :py:class:`email.headerregistry.DateHeader`.
 223
 224     :param message: Email message
 225     :type message: :py:class:`email.message.Message`
 226     :returns: datetime from Email "Date" header or None if header not present
 227     :rtype: :py:class:`datetime.datetime` or None
 228     """
 229     date_str = message.get('Date', '')
 230     if not date_str:
 231         return None
 232     return parsedate_to_datetime(date_str)
 233
 234
 235 def get_user_mail_files(user, mailbox='INBOX'):
 236     """
 237     Iterate over mails in given folder of given user; yields file names.
 238
 239     Works on local cyrus file system, not on imap server.
 240
 241     :param str user: Name of user whose mailbox is analyzed
 242     :param str mailbox: name of mailbox to use, INBOX (default) for base
 243                         folder; name is modified using :py:func:`cyrus_escape`
 244     :returns: nothing; but yields full path to messages on disc
 245     """
 246     # base folder of user mail
 247     folder = os.path.join('/datastore', 'imap-mails', 'user', user)
 248
 249     # adapt paths like "INBOX/sub/dir" to "sub/dir"
 250     subdirs = mailbox.split('/')
 251     if subdirs[0].upper() == 'INBOX':
 252         subdirs = subdirs[1:]
 253     folder = os.path.join(folder,
 254                           *(cyrus_escape(subdir) for subdir in subdirs))
 255
 256     for filename in os.listdir(folder):
 257         if not re.match(r'\d+\.', filename):
 258             continue
 259         full_path = os.path.join(folder, filename)
 260         yield full_path
 261
 262
 263 def get_user_mail(user, mailbox='INBOX', **kwargs):
 264     """
 265     Iterate over mails in given folder of given user; yields parsed mails.
 266
 267     :param str user: see :py:func:`get_user_mail_files`
 268     :param str mailbox: see :py:func:`get_user_mail_files`
 269     :param dict kwargs: all other args are forwarded to
 270                         :py:func:`parse_mail_file`
 271     :returns: nothing; but yields 2-tuples (path, email_msg) where first is the
 272               full path to the message on disc, and the latter is the outcome
 273               of :py:func:`parse_mail_file` for that file
 274     """
 275     for full_path in get_user_mail_files(user, mailbox):
 276         yield full_path, parse_mail_file(full_path, **kwargs)
 277
 278
 279 def get_message_text(filename, fallback_encoding='iso8859-1',
 280                      include_all_text=False):
 281     """
 282     Extract message text as string from email message.
 283
 284     Intended as complementary addition to get_user_mail, e.g. ::
 285
 286         for filename, msg in get_user_mail(user):
 287             # rough filtering based on headers
 288             if msg['Subject'] != 'Expected Subject':
 289                 continue
 290             # get message text for closer inspection
 291             text = get_message_text(filename)
 292             if 'Expected Text' not in text:
 293                 continue
 294             ...
 295
 296     Finds the first part in message that is of type text/plain and decodes it
 297     using encoding specified in mail or otherwise fallback encoding. If none
 298     found takes first part of type "text/*", or otherwise just the first part.
 299
 300     If include_all_text is True, all text/* parts are included, with text/plain
 301     being the first.
 302
 303     :param str filename: complete path of message file in filesystem
 304     :param str fallback_encoding: Encoding of email text if none is specified
 305                                   in mail.
 306     :param bool include_all_text: include all "text/*" parts in returned text
 307     :returns: text(s) of message
 308     :rtype: [str] if include_all_text else str
 309     """
 310     result = []
 311     msg = parse_mail_file(filename, headers_only=False)
 312     for part in msg.walk():
 313         if part.get_content_type() != 'text/plain':
 314             continue
 315         encoding = part.get_content_charset(fallback_encoding)
 316         result.append(part.get_payload(decode=True).decode(encoding))
 317
 318     if result and not include_all_text:
 319         return result[0]
 320
 321     # no text/plain found. Try only "text/":
 322     for part in msg.walk():
 323         cont_type = part.get_content_type()
 324         if cont_type.startswith('text/') and cont_type != 'text/plain':
 325             encoding = part.get_content_charset(fallback_encoding)
 326             result.append(part.get_payload(decode=True).decode(encoding))
 327
 328     if result:
 329         if not include_all_text:
 330             return result[0]
 331         return result
 332
 333     # no "text/" found. Just take first part
 334     while msg.is_multipart():
 335         msg = msg.get_payload(0)
 336
 337     encoding = msg.get_content_charset(fallback_encoding)
 338     if include_all_text:
 339         return [msg.get_payload(decode=True).decode(encoding), ]
 340     return msg.get_payload(decode=True).decode(encoding)
 341
 342
 343 def cyrus_escape(user_or_folder, keep_path=False, regex=False):
 344     """
 345     Convert names of users or mailbox folders to cyrus format.
 346
 347     quite a hack, just does the following hard-coded replacements:
 348
 349     * . --> ^
 350     * / --> .  (except if keep_path is True)
 351     * "u --> &APw-  ,  "o --> &APY-  ,  "a --> &AOQ-
 352       (if need more: this is modified utf-7)
 353     * inbox -->   (the empty string)
 354
 355     Would like to use a general modified utf-7-encoder/decoder but python has
 356     none builtin (see https://bugs.python.org/issue5305) and an extra lib like
 357     https://bitbucket.org/mjs0/imapclient/ would be overkill. After all, we
 358     control the input to this function via params and this is enough umlaut-
 359     testing I think...
 360
 361     :param str user_or_folder: name of the user or folder string to escape
 362     :param bool keep_path: do not replace '/' with '.' so can still use result
 363                            as path name
 364     :param bool regex: result is used in grep or other regex, so ^, . and & are
 365                        escaped again with a backslash
 366     :returns: escaped user or folder string
 367     :rtype: str
 368
 369     .. seealso:: :py:func:`cyrus_unescape`
 370     """
 371     temp = user_or_folder.replace('.', '^') \
 372         .replace('ü', '&APw-').replace('ä', '&AOQ-') \
 373         .replace('ö', '&APY-') \
 374         .replace('inbox', '').replace('INBOX', '').replace('Inbox', '')
 375     if not keep_path:
 376         temp = temp.replace('/', '.')
 377     if regex:
 378         return temp.replace('^', r'\^').replace('&', r'\&') \
 379                    .replace('.', r'\.').replace('$', r'\$')
 380     return temp
 381
 382
 383 def cyrus_unescape(user_or_folder):
 384     """
 385     Undo effects of :py:func:`cyrus_escape` (but not all of them).
 386
 387     :param str user_or_folder: name of the user or folder string to unescape
 388     :returns: unescaped user or folder string
 389     :rtype: str
 390     """
 391     if user_or_folder == '':
 392         return 'inbox'
 393     return user_or_folder.replace('.', '/')\
 394         .replace(r'\^', '.').replace('^', '.')
 395
 396
 397 def get_filename(message, failobj=None, do_unwrap=True):
 398     """
 399     Get filename of a message part, even if it is base64-encoded.
 400
 401     For attachments with base64-encoded file name, the
 402     :py:func:`email.message.Message.get_filename()` does not work. This
 403     function tries that first and if it fails tries to interprete the
 404     Content-Disposition of the message part. If all fails, returns `failobj`.
 405
 406     Only for ascii filenames: also unwraps file names if they are line-wrapped.
 407     But note that this may remove too much whitespace from the filename if
 408     line-wrapping happened in the same position as the filename's whitespace.
 409     To get unwrapped version, set param `do_unwrap` to `False`.
 410
 411     See also: https://en.wikipedia.org/wiki/MIME#Encoded-Word
 412
 413     :param message: message part, e.g. from
 414                     :py:meth:`email.message.Message.walk`
 415     :type message: :py:class:`email.message.Message` or
 416                    :py:class:`email.message.EmailMessage`
 417     :param failobj: object to return in case of failure (defaults to None)
 418     :param bool do_unwrap: undo line-break inserted by mail-creator; may remove
 419                            whitespace from file name; only applies to ascii
 420                            file names
 421     :returns: either a string or failobj
 422     """
 423     # try the old way and unwrap
 424     filename = message.get_filename(failobj)
 425
 426     if isinstance(filename, bytes) and not filename.startswith(b'=?') \
 427             and not filename.endswith(b'?='):
 428         filename = filename.decode('utf8')
 429
 430     if isinstance(filename, str):
 431         if do_unwrap:
 432             return re.sub('[\\r\\n]+', '', filename)
 433         return filename
 434
 435     if 'Content-Disposition' not in message:
 436         return failobj
 437
 438     # try parsing content-disposition. e.g.:
 439     # attachment; filename="2018年度公开课计划表.xlsx"   -->
 440     # '=?utf-8?b?YXR0YWNobWVudDsgZmlsZW5hbWU9IjIwMTjlubTluqY=?=\r\n =?utf-8?b?'
 441     # '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?='
 442
 443     # This may be a re-implementation of email.utils.collapse_rfc2231_value()
 444     # as mentioned in email.message.EmailMessage.get_param()
 445
 446     # The form is: "=?charset?encoding?encoded text?="
 447     SPLIT_REGEX = '\r?\n *'    # should be CRNL but some files miss the \r
 448     ENCODED_WORD_REGEX = r'\s*=\?([^?]+)\?([^?]+)\?(.*)\?=\s*$'
 449     LINE_REGEX = r'attachment\s*;\s*filename=(")?(.+)\1\s*$'
 450     decoded = []
 451     for word in re.split(SPLIT_REGEX, message['Content-Disposition']):
 452         match = re.match(ENCODED_WORD_REGEX, word)
 453         if not match:
 454             break
 455         charset, encoding, data = match.groups()
 456         if encoding.lower() == 'b':
 457             temp = b64decode(data)
 458         elif encoding.lower() == 'q':
 459             raise NotImplementedError('use quopri.decodestring, handle _')
 460         else:
 461             raise ValueError('not allowed according to wikipedia: "{}"'
 462                              .format(encoding))
 463         decoded.append(temp.decode(charset))
 464     decoded = u''.join(decoded)
 465
 466     match = re.match(LINE_REGEX, decoded)
 467     if match:
 468         return match.groups()[1]
 469     return failobj