developer.intra2net.com Git - pyi2ncommon/blob - src/mail_utils.py

   1 # This Python file uses the following encoding: utf-8
   2
   3 # The software in this package is distributed under the GNU General
   4 # Public License version 2 (with a special exception described below).
   5 #
   6 # A copy of GNU General Public License (GPL) is included in this distribution,
   7 # in the file COPYING.GPL.
   8 #
   9 # As a special exception, if other files instantiate templates or use macros
  10 # or inline functions from this file, or you compile this file and link it
  11 # with other works to produce a work based on this file, this file
  12 # does not by itself cause the resulting work to be covered
  13 # by the GNU General Public License.
  14 #
  15 # However the source code for this file must still be made available
  16 # in accordance with section (3) of the GNU General Public License.
  17 #
  18 # This exception does not invalidate any other reasons why a work based
  19 # on this file might be covered by the GNU General Public License.
  20 #
  21 # Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
  22
  23 """
  24
  25 SUMMARY
  26 ------------------------------------------------------
  27 Utilities for dealing with email
  28
  29 .. seealso:: :py:mod:`pyi2ncommon.mail_validator`,
  30              :py:mod:`pyi2ncommon.imap_mailbox`
  31
  32 Copyright: Intra2net AG
  33
  34
  35 INTERFACE
  36 ------------------------------------------------------
  37
  38 """
  39
import logging
import os
import quopri
import re
from base64 import b64decode
from email import policy
from email.parser import BytesParser
from email.utils import parsedate_to_datetime
  44
  45 # outsourced source, import required for compatibility
  46 from .imap_mailbox import ImapMailbox           # pylint: disable=unused-import
  47 from .mail_validator import *                   # pylint: disable=unused-import
  48 from .sysmisc import replace_file_regex
  49
# Logger named after this module. NOTE(review): the functions visible in this
# file emit their messages through the root `logging` module, not through
# `log` -- confirm whether that is intended.
log = logging.getLogger('pyi2ncommon.mail_utils')
  51
  52
  53 def prep_email_header(email_file, value, regex=None, criterion="envelopeto"):
  54     """
  55     Replace value in a provided email file.
  56
  57     :param str email_file: file to use for the replacement
  58     :param str value: value to replace the first matched group with
  59     :param regex: regular expression to use when replacing a header value
  60     :type regex: str or None
  61     :param str criterion: criterion to use for replacement, one
  62                           of 'envelopeto' or 'received'
  63     :raises: :py:class:`ValueError` if the choice of criterion is invalid
  64
  65     ..todo:: In some cases this function is reusing arnied wrapper's cnf
  66              value preparation but for email headers.
  67     """
  68     if criterion == "envelopeto":
  69         logging.debug("Updating test emails' EnvelopeTo header")
  70         replace_file_regex(email_file, value, regex=regex)
  71     elif criterion == "received":
  72         logging.debug("Updating test emails' Received header")
  73         with open(email_file, "r") as file_handle:
  74             email_text = file_handle.read()
  75             email_text = re.sub(regex, value, email_text)
  76             email_text = re.sub(regex, value, email_text)
  77         with open(email_file, "w") as file_handle:
  78             file_handle.write(email_text)
  79     else:
  80         raise ValueError("Invalid header preparation criterion '%s'"
  81                          % criterion)
  82
  83
  84 def parse_mail_file(file_name, headers_only=True, attachment_filenames=False,
  85                     raise_on_defect=False, new_message_type=False):
  86     """
  87     Parse given email file (e.g. a banned message).
  88
  89     This is basically a `email.parser.BytesParser().parse(...)` with given
  90     `headers_only` and policy selection, that can also handle BSMTP. As an
  91     extra bonus, you can just request headers plus the names of attached files.
  92
  93     Removes the SMTP envelope surrounding the email if present. Only left-over
  94     might be a line with a '.' at end of non-multipart messages if
  95     `headers_only` is False.
  96
  97     :param str file_name: path to the file that contains the email text
  98     :param bool headers_only: whether to parse only the email headers; set this
  99                               to False, e.g. if you want to check for
 100                               attachments using message.walk()
 101     :param bool attachment_filenames: if you just want headers and names of
 102                                       attached files, set `headers_only` and
 103                                       this to True.
 104     :param bool raise_on_defect: whether to raise an error if email parser
 105                                  encounters a defect (email policy `strict`) or
 106                                  just add the defect to message's `defect`
 107                                  attribute
 108     :param bool new_message_type: whether to return the older
 109                                   :py:class:`email.message.Message` (policy
 110                                   `compat32`, our default), or the newer
 111                                   :py:class:`email.message.EmailMessage` type
 112                                   (policy `default`). Big difference!
 113     :returns: either msg or 2-tuple `(msg, filenames)` if requested per arg
 114               `attachment_filenames`
 115     :rtype: :py:class:`email.message.Message` or
 116              (:py:class:`email.message.Message`, (str)) or
 117              one of these two with :py:class:`email.message.EmailMessage`
 118     """
 119     msg = None
 120     start_pos = 0
 121
 122     if new_message_type:
 123         mail_policy = policy.default
 124     else:
 125         mail_policy = policy.compat32
 126     if raise_on_defect:
 127         mail_policy += policy.strict
 128
 129     with open(file_name, 'rb') as read_handle:
 130         line = read_handle.readline()
 131         if line.startswith(b'EHLO'):
 132             # there is a smtp header. skip to its end
 133             while line.strip() != b'DATA':
 134                 line = read_handle.readline()
 135             # the rest is the email plus a trailing '.' (ignored by parser if
 136             # multipart)
 137         else:
 138             read_handle.seek(0)  # forget we read the first line already
 139         start_pos = read_handle.tell()
 140         msg = BytesParser(policy=mail_policy).parse(read_handle,
 141                                                     headersonly=headers_only)
 142
 143     if not attachment_filenames:
 144         return msg
 145
 146     # otherwise need to parse complete message to get attachment file names
 147     if headers_only:
 148         with open(file_name, 'rb') as read_handle:
 149             read_handle.seek(start_pos)
 150             full_msg = BytesParser(policy=mail_policy).parse(read_handle,
 151                                                              headersonly=False)
 152     else:
 153         full_msg = msg
 154     filenames = [get_filename(part) for part in full_msg.walk()]
 155     return msg, tuple(filename for filename in filenames
 156                       if filename is not None)
 157
 158
 159 def parse_mail_date(message):
 160     """
 161     Parse the 'Date' header of the given message.
 162
 163     Shortcut for :py:func:`email.utils.parsedate_to_datetime`.
 164
 165     This is no longer necessary for newer
 166     :py:class:`email.message.EmailMessage` since the `Date` Header is
 167     automatically parsed to a :py:class:`email.headerregistry.DateHeader`.
 168
 169     :param message: Email message
 170     :type message: :py:class:`email.message.Message`
 171     :returns: datetime from Email "Date" header or None if header not present
 172     :rtype: :py:class:`datetime.datetime` or None
 173     """
 174     date_str = message.get('Date', '')
 175     if not date_str:
 176         return None
 177     return parsedate_to_datetime(date_str)
 178
 179
 180 def get_user_mail_files(user, mailbox='INBOX'):
 181     """
 182     Iterate over mails in given folder of given user; yields file names.
 183
 184     Works on local cyrus file system, not on imap server.
 185
 186     :param str user: Name of user whose mailbox is analyzed
 187     :param str mailbox: name of mailbox to use, INBOX (default) for base
 188                         folder; name is modified using :py:func:`cyrus_escape`
 189     :returns: nothing; but yields full path to messages on disc
 190     """
 191     # base folder of user mail
 192     folder = os.path.join('/datastore', 'imap-mails', 'user', user)
 193
 194     # adapt paths like "INBOX/sub/dir" to "sub/dir"
 195     subdirs = mailbox.split('/')
 196     if subdirs[0].upper() == 'INBOX':
 197         subdirs = subdirs[1:]
 198     folder = os.path.join(folder,
 199                           *(cyrus_escape(subdir) for subdir in subdirs))
 200
 201     for filename in os.listdir(folder):
 202         if not re.match(r'\d+\.', filename):
 203             continue
 204         full_path = os.path.join(folder, filename)
 205         yield full_path
 206
 207
 208 def get_user_mail(user, mailbox='INBOX', **kwargs):
 209     """
 210     Iterate over mails in given folder of given user; yields parsed mails.
 211
 212     :param str user: see :py:func:`get_user_mail_files`
 213     :param str mailbox: see :py:func:`get_user_mail_files`
 214     :param dict kwargs: all other args are forwarded to
 215                         :py:func:`parse_mail_file`
 216     :returns: nothing; but yields 2-tuples (path, email_msg) where first is the
 217               full path to the message on disc, and the latter is the outcome
 218               of :py:func:`parse_mail_file` for that file
 219     """
 220     for full_path in get_user_mail_files(user, mailbox):
 221         yield full_path, parse_mail_file(full_path, **kwargs)
 222
 223
 224 def get_message_text(filename, fallback_encoding='iso8859-1',
 225                      include_all_text=False):
 226     """
 227     Extract message text as string from email message.
 228
 229     Intended as complementary addition to get_user_mail, e.g. ::
 230
 231         for filename, msg in get_user_mail(user):
 232             # rough filtering based on headers
 233             if msg['Subject'] != 'Expected Subject':
 234                 continue
 235             # get message text for closer inspection
 236             text = get_message_text(filename)
 237             if 'Expected Text' not in text:
 238                 continue
 239             ...
 240
 241     Finds the first part in message that is of type `text/plain` and decodes it
 242     using encoding specified in mail or otherwise fallback encoding. If none
 243     found takes first part of type `text/*`, or otherwise just the first part.
 244
 245     If include_all_text is True, all `text/*` parts are included, with `text/plain`
 246     being the first.
 247
 248     :param str filename: complete path of message file in filesystem
 249     :param str fallback_encoding: Encoding of email text if none is specified
 250                                   in mail.
 251     :param bool include_all_text: include all `text/*` parts in returned text
 252     :returns: text(s) of message
 253     :rtype: [str] if include_all_text else str
 254     """
 255     result = []
 256     msg = parse_mail_file(filename, headers_only=False)
 257     for part in msg.walk():
 258         if part.get_content_type() != 'text/plain':
 259             continue
 260         encoding = part.get_content_charset(fallback_encoding)
 261         result.append(part.get_payload(decode=True).decode(encoding))
 262
 263     if result and not include_all_text:
 264         return result[0]
 265
 266     # no text/plain found. Try only "text/":
 267     for part in msg.walk():
 268         cont_type = part.get_content_type()
 269         if cont_type.startswith('text/') and cont_type != 'text/plain':
 270             encoding = part.get_content_charset(fallback_encoding)
 271             result.append(part.get_payload(decode=True).decode(encoding))
 272
 273     if result:
 274         if not include_all_text:
 275             return result[0]
 276         return result
 277
 278     # no "text/" found. Just take first part
 279     while msg.is_multipart():
 280         msg = msg.get_payload(0)
 281
 282     encoding = msg.get_content_charset(fallback_encoding)
 283     if include_all_text:
 284         return [msg.get_payload(decode=True).decode(encoding), ]
 285     return msg.get_payload(decode=True).decode(encoding)
 286
 287
 288 def cyrus_escape(user_or_folder, keep_path=False, regex=False):
 289     """
 290     Convert names of users or mailbox folders to cyrus format.
 291
 292     quite a hack, just does the following hard-coded replacements:
 293
 294     * . --> ^
 295     * / --> .  (except if keep_path is True)
 296     * "u --> &APw-  ,  "o --> &APY-  ,  "a --> &AOQ-
 297       (if need more: this is modified utf-7)
 298     * inbox -->   (the empty string)
 299
 300     Would like to use a general modified utf-7-encoder/decoder but python has
 301     none builtin (see https://bugs.python.org/issue5305) and an extra lib like
 302     https://bitbucket.org/mjs0/imapclient/ would be overkill. After all, we
 303     control the input to this function via params and this is enough umlaut-
 304     testing I think...
 305
 306     :param str user_or_folder: name of the user or folder string to escape
 307     :param bool keep_path: do not replace '/' with '.' so can still use result
 308                            as path name
 309     :param bool regex: result is used in grep or other regex, so ^, . and & are
 310                        escaped again with a backslash
 311     :returns: escaped user or folder string
 312     :rtype: str
 313
 314     .. seealso:: :py:func:`cyrus_unescape`
 315     """
 316     temp = user_or_folder.replace('.', '^') \
 317         .replace('ü', '&APw-').replace('ä', '&AOQ-') \
 318         .replace('ö', '&APY-') \
 319         .replace('inbox', '').replace('INBOX', '').replace('Inbox', '')
 320     if not keep_path:
 321         temp = temp.replace('/', '.')
 322     if regex:
 323         return temp.replace('^', r'\^').replace('&', r'\&') \
 324                    .replace('.', r'\.').replace('$', r'\$')
 325     return temp
 326
 327
 328 def cyrus_unescape(user_or_folder):
 329     """
 330     Undo effects of :py:func:`cyrus_escape` (but not all of them).
 331
 332     :param str user_or_folder: name of the user or folder string to unescape
 333     :returns: unescaped user or folder string
 334     :rtype: str
 335     """
 336     if user_or_folder == '':
 337         return 'inbox'
 338     return user_or_folder.replace('.', '/')\
 339         .replace(r'\^', '.').replace('^', '.')
 340
 341
 342 def get_filename(message, failobj=None, do_unwrap=True):
 343     """
 344     Get filename of a message part, even if it is base64-encoded.
 345
 346     For attachments with base64-encoded file name, the
 347     :py:func:`email.message.Message.get_filename()` does not work. This
 348     function tries that first and if it fails tries to interprete the
 349     Content-Disposition of the message part. If all fails, returns `failobj`.
 350
 351     Only for ascii filenames: also unwraps file names if they are line-wrapped.
 352     But note that this may remove too much whitespace from the filename if
 353     line-wrapping happened in the same position as the filename's whitespace.
 354     To get unwrapped version, set param `do_unwrap` to `False`.
 355
 356     See also: https://en.wikipedia.org/wiki/MIME#Encoded-Word
 357
 358     :param message: message part, e.g. from
 359                     :py:meth:`email.message.Message.walk`
 360     :type message: :py:class:`email.message.Message` or
 361                    :py:class:`email.message.EmailMessage`
 362     :param failobj: object to return in case of failure (defaults to None)
 363     :param bool do_unwrap: undo line-break inserted by mail-creator; may remove
 364                            whitespace from file name; only applies to ascii
 365                            file names
 366     :returns: either a string or failobj
 367     """
 368     # try the old way and unwrap
 369     filename = message.get_filename(failobj)
 370
 371     if isinstance(filename, bytes) and not filename.startswith(b'=?') \
 372             and not filename.endswith(b'?='):
 373         filename = filename.decode('utf8')
 374
 375     if isinstance(filename, str):
 376         if do_unwrap:
 377             return re.sub('[\\r\\n]+', '', filename)
 378         return filename
 379
 380     if 'Content-Disposition' not in message:
 381         return failobj
 382
 383     # try parsing content-disposition. e.g.:
 384     # attachment; filename="2018年度公开课计划表.xlsx"   -->
 385     # '=?utf-8?b?YXR0YWNobWVudDsgZmlsZW5hbWU9IjIwMTjlubTluqY=?=\r\n =?utf-8?b?'
 386     # '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?='
 387
 388     # This may be a re-implementation of email.utils.collapse_rfc2231_value()
 389     # as mentioned in email.message.EmailMessage.get_param()
 390
 391     # The form is: "=?charset?encoding?encoded text?="
 392     SPLIT_REGEX = '\r?\n *'    # should be CRNL but some files miss the \r
 393     ENCODED_WORD_REGEX = r'\s*=\?([^?]+)\?([^?]+)\?(.*)\?=\s*$'
 394     LINE_REGEX = r'attachment\s*;\s*filename=(")?(.+)\1\s*$'
 395     decoded = []
 396     for word in re.split(SPLIT_REGEX, message['Content-Disposition']):
 397         match = re.match(ENCODED_WORD_REGEX, word)
 398         if not match:
 399             break
 400         charset, encoding, data = match.groups()
 401         if encoding.lower() == 'b':
 402             temp = b64decode(data)
 403         elif encoding.lower() == 'q':
 404             raise NotImplementedError('use quopri.decodestring, handle _')
 405         else:
 406             raise ValueError('not allowed according to wikipedia: "{}"'
 407                              .format(encoding))
 408         decoded.append(temp.decode(charset))
 409     decoded = u''.join(decoded)
 410
 411     match = re.match(LINE_REGEX, decoded)
 412     if match:
 413         return match.groups()[1]
 414     return failobj