Change creation of mail users to dynamic
[pyi2ncommon] / src / mail_utils.py
CommitLineData
f49f6323 1# This Python file uses the following encoding: utf-8
11cbb815
PD
2
3# The software in this package is distributed under the GNU General
4# Public License version 2 (with a special exception described below).
5#
6# A copy of GNU General Public License (GPL) is included in this distribution,
7# in the file COPYING.GPL.
8#
9# As a special exception, if other files instantiate templates or use macros
10# or inline functions from this file, or you compile this file and link it
11# with other works to produce a work based on this file, this file
12# does not by itself cause the resulting work to be covered
13# by the GNU General Public License.
14#
15# However the source code for this file must still be made available
16# in accordance with section (3) of the GNU General Public License.
17#
18# This exception does not invalidate any other reasons why a work based
19# on this file might be covered by the GNU General Public License.
20#
21# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
22
f49f6323
PD
23"""
24
25SUMMARY
26------------------------------------------------------
2ed7100d
CH
27Utilities for dealing with email
28
29.. seealso:: :py:mod:`pyi2ncommon.mail_validator`,
30 :py:mod:`pyi2ncommon.imap_mailbox`
f49f6323
PD
31
32Copyright: Intra2net AG
33
34
35INTERFACE
36------------------------------------------------------
37
38"""
39
b36398e7 40from base64 import b64decode
67177844 41from email.utils import parsedate_to_datetime
1d21262c 42from email.parser import BytesParser
4b44f515 43from email import policy
f49f6323 44
998bc6bb 45from .simple_cnf import SimpleCnf
67177844
CH
46# outsourced source, import required for compatibility
47from .imap_mailbox import ImapMailbox # pylint: disable=unused-import
48from .mail_validator import * # pylint: disable=unused-import
f49f6323 49
# Module-level logger registered under the name 'pyi2ncommon.mail_utils'.
log = logging.getLogger('pyi2ncommon.mail_utils')
f49f6323
PD
51
52
def prep_email_header(email_file, value, regex=None, criterion="envelopeto"):
    """
    Replace value in a provided email file.

    :param str email_file: file to use for the replacement
    :param str value: replacement text; for criterion 'received' it replaces
                      every match of `regex` in the file (and may use
                      :py:func:`re.sub` back-references)
    :param regex: regular expression to use when replacing a header value;
                  required for criterion 'received'
    :type regex: str or None
    :param str criterion: criterion to use for replacement, one
                          of 'envelopeto' or 'received'
    :raises: :py:class:`ValueError` if the choice of criterion is invalid

    For criterion 'envelopeto' the work is delegated to
    `arnied_wrapper.prep_cnf_value`, i.e. this reuses the arnied wrapper's
    cnf value preparation for email headers. For criterion 'received' the
    file is read, substituted and written back in place.
    """
    if criterion == "envelopeto":
        logging.debug("Updating test emails' EnvelopeTo header")
        arnied_wrapper.prep_cnf_value(email_file, value, regex=regex)
    elif criterion == "received":
        logging.debug("Updating test emails' Received header")
        with open(email_file, "r") as file_handle:
            email_text = file_handle.read()
        # Substitute exactly once: a second pass would modify the text again
        # whenever the replacement itself matches the regex.
        email_text = re.sub(regex, value, email_text)
        with open(email_file, "w") as file_handle:
            file_handle.write(email_text)
    else:
        raise ValueError("Invalid header preparation criterion '%s'"
                         % criterion)
f49f6323
PD
82
83
def create_users(usernames, **extra_params):
    """
    Create users for sending / receiving mail.

    The created user settings are complete with spamfilter settings and
    groupware folders. Each user gets a `user_group_member_ref` entry with
    value "2"; this cannot yet be changed.
    NOTE(review): the previous documentation stated membership in groups
    1 (admins) and 2 (all), but only the reference to group 2 is added
    explicitly here -- confirm where group 1 comes from.

    :param usernames: Names of users to create; a single string is treated
                      as one user name
    :type usernames: [str] or str
    :raises: :py:class:`ValueError` if a setting value is neither a `str`
             nor a `dict`

    All other params are forwarded to user config: they are merged over the
    defaults below. String values are added as plain settings, dict values
    are handed to :py:meth:`SimpleCnf.add` as `children` of the setting.
    """
    if isinstance(usernames, str):
        usernames = [usernames, ]
    default_cnf = dict(
        user_disabled="0",
        user_locale="",
        user_password="1234test",
        user_spamfilter_blacklist="",
        user_spamfilter_potential_spam_action="FOLDER",
        user_spamfilter_potential_spam_action_destaddr="",
        user_spamfilter_potential_spam_action_folder="Spamverdacht",
        # TODO: this doesn't handle situations where the child variable
        # should not be defined
        user_spamfilter_potential_spam_threshold="1050",
        user_spamfilter_spam_action="FOLDER",
        user_spamfilter_spam_action_destaddr="",
        user_spamfilter_spam_action_folder="Spam",
        user_spamfilter_spam_deletedays="",
        # TODO: this doesn't handle situations where the child variable
        # should not be defined
        user_spamfilter_spam_threshold="1080",
        user_spamfilter_whitelist="",
        user_groupware_folder_drafts="INBOX/Entwürfe",
        user_groupware_folder_outbox="INBOX/Gesendete Elemente",
        user_groupware_folder_trash="INBOX/Gelöschte Elemente",
    )

    cnf = SimpleCnf()
    for username in usernames:
        curr_cnf = default_cnf.copy()
        curr_cnf['user_fullname'] = username
        curr_cnf.update(extra_params)     # caller's values override defaults
        children = SimpleCnf()
        for key, value in curr_cnf.items():
            # exactly one of the branches applies; previously a dict value
            # was added and then rejected by the string check right after
            if isinstance(value, dict):
                children.add(key, children=value)
            elif isinstance(value, str):
                children.add(key, value)
            else:
                raise ValueError('Invalid value type for key "{}": {}'
                                 .format(key, type(value)))
        children.add('user_group_member_ref', "2")
        cnf.add('user', username, children=children, instance=-1)
    cnf.apply()
f49f6323
PD
137
138
4b44f515
CH
def parse_mail_file(file_name, headers_only=True, attachment_filenames=False,
                    raise_on_defect=False, new_message_type=False):
    """
    Parse given email file (e.g. a banned message).

    This is basically a `email.parser.BytesParser().parse(...)` with given
    `headers_only` and policy selection, that can also handle BSMTP. As an
    extra bonus, you can just request headers plus the names of attached files.

    Removes the SMTP envelope surrounding the email if present, i.e. if the
    file starts with `EHLO`, everything up to and including the `DATA` line is
    skipped. If no `DATA` line follows, the file is parsed from its start.
    Only left-over might be a line with a '.' at end of non-multipart messages
    if `headers_only` is False.

    :param str file_name: path to the file that contains the email text
    :param bool headers_only: whether to parse only the email headers; set this
                              to False, e.g. if you want to check for
                              attachments using message.walk()
    :param bool attachment_filenames: if you just want headers and names of
                                      attached files, set `headers_only` and
                                      this to True.
    :param bool raise_on_defect: whether to raise an error if email parser
                                 encounters a defect or just add the defect to
                                 message's `defects` attribute
    :param bool new_message_type: whether to return the older
                                  :py:class:`email.message.Message` (policy
                                  `compat32`, our default), or the newer
                                  :py:class:`email.message.EmailMessage` type
                                  (policy `default`). Big difference!
    :returns: either msg or 2-tuple `(msg, filenames)` if requested per arg
              `attachment_filenames`
    :rtype: :py:class:`email.message.Message` or
            (:py:class:`email.message.Message`, (str)) or
            one of these two with :py:class:`email.message.EmailMessage`
    """
    if new_message_type:
        mail_policy = policy.default
    else:
        mail_policy = policy.compat32
    if raise_on_defect:
        # Do not use "mail_policy + policy.strict": policy.strict carries
        # EmailPolicy-only attributes (header_factory), which makes the
        # addition fail with a TypeError for compat32.
        mail_policy = mail_policy.clone(raise_on_defect=True)

    with open(file_name, 'rb') as read_handle:
        if read_handle.readline().startswith(b'EHLO'):
            # there is a smtp header. skip to its end; the rest is the email
            # plus a trailing '.' (ignored by parser if multipart)
            for line in iter(read_handle.readline, b''):
                if line.strip() == b'DATA':
                    break
            else:
                # reached end of file without DATA line: do not loop forever
                # but treat the whole file as the message
                read_handle.seek(0)
        else:
            read_handle.seek(0)  # forget we read the first line already
        start_pos = read_handle.tell()
        msg = BytesParser(policy=mail_policy).parse(read_handle,
                                                    headersonly=headers_only)

    if not attachment_filenames:
        return msg

    # otherwise need to parse complete message to get attachment file names
    if headers_only:
        with open(file_name, 'rb') as read_handle:
            read_handle.seek(start_pos)
            full_msg = BytesParser(policy=mail_policy).parse(read_handle,
                                                             headersonly=False)
    else:
        full_msg = msg
    filenames = [get_filename(part) for part in full_msg.walk()]
    return msg, tuple(filename for filename in filenames
                      if filename is not None)
f49f6323
PD
212
213
58414aec
CH
def parse_mail_date(message):
    """
    Return the 'Date' header of the given message as a datetime.

    Thin convenience wrapper around
    :py:func:`email.utils.parsedate_to_datetime`.

    Not needed for the newer :py:class:`email.message.EmailMessage`, whose
    `Date` header is already parsed into a
    :py:class:`email.headerregistry.DateHeader`.

    :param message: Email message
    :type message: :py:class:`email.message.Message`
    :returns: datetime from Email "Date" header or None if the header is
              missing or empty
    :rtype: :py:class:`datetime.datetime` or None
    """
    raw_date = message.get('Date', '')
    return parsedate_to_datetime(raw_date) if raw_date else None
233
234
f44055b0
CH
def get_user_mail_files(user, mailbox='INBOX'):
    """
    Yield the file names of all mails in a mailbox folder of a user.

    Works on local cyrus file system, not on imap server.

    :param str user: Name of user whose mailbox is analyzed
    :param str mailbox: name of mailbox to use, INBOX (default) for base
                        folder; each path component is converted using
                        :py:func:`cyrus_escape`
    :returns: nothing; but yields full path to messages on disc
    """
    # paths like "INBOX/sub/dir" refer to "sub/dir" below the user's base dir
    components = mailbox.split('/')
    if components[0].upper() == 'INBOX':
        components = components[1:]
    escaped = [cyrus_escape(component) for component in components]

    mail_dir = os.path.join('/datastore', 'imap-mails', 'user', user, *escaped)

    # only entries named like "<number>.<something>" are yielded
    for entry in os.listdir(mail_dir):
        if re.match(r'\d+\.', entry):
            yield os.path.join(mail_dir, entry)
261
262
f49f6323
PD
def get_user_mail(user, mailbox='INBOX', **kwargs):
    """
    Yield the parsed mails found in a mailbox folder of a user.

    :param str user: see :py:func:`get_user_mail_files`
    :param str mailbox: see :py:func:`get_user_mail_files`
    :param dict kwargs: all other args are forwarded to
                        :py:func:`parse_mail_file`
    :returns: nothing; but yields 2-tuples (path, email_msg) where first is the
              full path to the message on disc, and the latter is the outcome
              of :py:func:`parse_mail_file` for that file
    """
    for mail_path in get_user_mail_files(user, mailbox):
        parsed = parse_mail_file(mail_path, **kwargs)
        yield mail_path, parsed
f49f6323
PD
277
278
f4dec410
CH
def get_message_text(filename, fallback_encoding='iso8859-1',
                     include_all_text=False):
    """
    Return the text of an email message file as string(s).

    Meant to complement :py:func:`get_user_mail`, e.g. ::

        for filename, msg in get_user_mail(user):
            # rough filtering based on headers
            if msg['Subject'] != 'Expected Subject':
                continue
            # get message text for closer inspection
            text = get_message_text(filename)
            if 'Expected Text' not in text:
                continue
            ...

    Looks for the first part of type text/plain and decodes it with the
    charset given in the mail, or with the fallback encoding if there is none.
    Without any text/plain part the first part of type "text/*" is used, and
    without any such part simply the first part of the message.

    With `include_all_text` all "text/*" parts are returned, the text/plain
    ones first.

    :param str filename: complete path of message file in filesystem
    :param str fallback_encoding: Encoding of email text if none is specified
                                  in mail.
    :param bool include_all_text: include all "text/*" parts in returned text
    :returns: text(s) of message
    :rtype: [str] if include_all_text else str
    """
    def decode_part(part):
        """Decode payload of a part using its charset or the fallback."""
        charset = part.get_content_charset(fallback_encoding)
        return part.get_payload(decode=True).decode(charset)

    message = parse_mail_file(filename, headers_only=False)

    # first choice: text/plain
    texts = [decode_part(part) for part in message.walk()
             if part.get_content_type() == 'text/plain']
    if texts and not include_all_text:
        return texts[0]

    # second choice (or addition, for include_all_text): any other "text/*"
    for part in message.walk():
        content_type = part.get_content_type()
        if content_type == 'text/plain':
            continue
        if content_type.startswith('text/'):
            texts.append(decode_part(part))

    if texts:
        return texts if include_all_text else texts[0]

    # last resort: descend to the very first non-multipart part
    first_part = message
    while first_part.is_multipart():
        first_part = first_part.get_payload(0)

    first_text = decode_part(first_part)
    return [first_text, ] if include_all_text else first_text
341
342
f49f6323
PD
def cyrus_escape(user_or_folder, keep_path=False, regex=False):
    """
    Convert names of users or mailbox folders to cyrus format.

    This is a hack that only applies these hard-coded replacements, in order:

    * . --> ^
    * "u --> &APw- , "a --> &AOQ- , "o --> &APY-
      (if need more: this is modified utf-7)
    * inbox / INBOX / Inbox --> (the empty string)
    * / --> . (except if keep_path is True)

    A general modified utf-7 encoder/decoder would be nicer but python has
    none builtin (see https://bugs.python.org/issue5305) and an extra lib like
    https://bitbucket.org/mjs0/imapclient/ would be overkill. The input to
    this function is controlled via params, so this much umlaut handling is
    considered enough.

    :param str user_or_folder: name of the user or folder string to escape
    :param bool keep_path: do not replace '/' with '.' so can still use result
                           as path name
    :param bool regex: result is used in grep or other regex, so ^, &, . and $
                       are escaped again with a backslash
    :returns: escaped user or folder string
    :rtype: str

    .. seealso:: :py:func:`cyrus_unescape`
    """
    # the order matters: replacements are applied one after the other
    replacements = [
        ('.', '^'),
        ('ü', '&APw-'),
        ('ä', '&AOQ-'),
        ('ö', '&APY-'),
        ('inbox', ''),
        ('INBOX', ''),
        ('Inbox', ''),
    ]
    if not keep_path:
        replacements.append(('/', '.'))
    if regex:
        replacements += [
            ('^', r'\^'),
            ('&', r'\&'),
            ('.', r'\.'),
            ('$', r'\$'),
        ]

    escaped = user_or_folder
    for old, new in replacements:
        escaped = escaped.replace(old, new)
    return escaped
f49f6323
PD
381
382
def cyrus_unescape(user_or_folder):
    """
    Revert (some of) the replacements done by :py:func:`cyrus_escape`.

    The empty string becomes 'inbox'; otherwise '.' becomes '/' and both
    the regex-escaped and the plain '^' become '.'.

    :param str user_or_folder: name of the user or folder string to unescape
    :returns: unescaped user or folder string
    :rtype: str
    """
    if user_or_folder == '':
        return 'inbox'

    unescaped = user_or_folder
    # order matters: '.' must be handled before '^' is turned into '.'
    for old, new in (('.', '/'), (r'\^', '.'), ('^', '.')):
        unescaped = unescaped.replace(old, new)
    return unescaped
b36398e7
CH
395
396
def get_filename(message, failobj=None, do_unwrap=True):
    """
    Get filename of a message part, even if it is base64-encoded.

    For attachments whose complete Content-Disposition header is an encoded
    word, :py:func:`email.message.Message.get_filename()` does not work. This
    function tries that first and if it fails tries to interpret the
    Content-Disposition of the message part. If all fails, returns `failobj`.

    Only for ascii filenames: also unwraps file names if they are line-wrapped.
    But note that this may remove too much whitespace from the filename if
    line-wrapping happened in the same position as the filename's whitespace.
    To get the original, still wrapped version, set param `do_unwrap` to
    `False`.

    See also: https://en.wikipedia.org/wiki/MIME#Encoded-Word

    :param message: message part, e.g. from
                    :py:meth:`email.message.Message.walk`
    :type message: :py:class:`email.message.Message` or
                   :py:class:`email.message.EmailMessage`
    :param failobj: object to return in case of failure (defaults to None)
    :param bool do_unwrap: undo line-break inserted by mail-creator; may remove
                           whitespace from file name; only applies to ascii
                           file names
    :returns: either a string or failobj
    :raises: :py:class:`ValueError` if an encoded word uses an encoding other
             than "B" (base64) or "Q" (quoted-printable)
    """
    # local import: only needed for "Q"-encoded headers
    import quopri

    # try the old way and unwrap
    filename = message.get_filename(failobj)

    if isinstance(filename, bytes) and not filename.startswith(b'=?') \
            and not filename.endswith(b'?='):
        filename = filename.decode('utf8')

    if isinstance(filename, str):
        if do_unwrap:
            return re.sub('[\\r\\n]+', '', filename)
        return filename

    if 'Content-Disposition' not in message:
        return failobj

    # try parsing content-disposition. e.g.:
    # attachment; filename="2018年度公开课计划表.xlsx"    -->
    # '=?utf-8?b?YXR0YWNobWVudDsgZmlsZW5hbWU9IjIwMTjlubTluqY=?=\r\n =?utf-8?b?'
    # '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?='

    # This may be a re-implementation of email.utils.collapse_rfc2231_value()
    # as mentioned in email.message.EmailMessage.get_param()

    # The form is: "=?charset?encoding?encoded text?="
    SPLIT_REGEX = '\r?\n *'  # should be CRNL but some files miss the \r
    ENCODED_WORD_REGEX = r'\s*=\?([^?]+)\?([^?]+)\?(.*)\?=\s*$'
    # Either a quoted file name (group 1) or an unquoted one (group 2). A
    # single optional quote with a back-reference does not work here since a
    # back-reference to a group that did not participate never matches.
    LINE_REGEX = \
        r'attachment\s*;\s*filename=(?:"(.+)"|([^";\s][^";]*?))\s*$'
    decoded = []
    for word in re.split(SPLIT_REGEX, message['Content-Disposition']):
        match = re.match(ENCODED_WORD_REGEX, word)
        if not match:
            break
        charset, encoding, data = match.groups()
        encoding_key = encoding.lower()
        if encoding_key == 'b':
            temp = b64decode(data)
        elif encoding_key == 'q':
            # header=True makes "_" decode to a space as required for
            # encoded words
            temp = quopri.decodestring(data.encode('ascii'), header=True)
        else:
            raise ValueError('not allowed according to wikipedia: "{}"'
                             .format(encoding))
        decoded.append(temp.decode(charset))
    decoded = u''.join(decoded)

    match = re.match(LINE_REGEX, decoded)
    if match:
        quoted_name, bare_name = match.groups()
        return quoted_name if quoted_name is not None else bare_name
    return failobj
469 return failobj