Commit | Line | Data |
---|---|---|
f49f6323 | 1 | # This Python file uses the following encoding: utf-8 |
11cbb815 PD |
2 | |
3 | # The software in this package is distributed under the GNU General | |
4 | # Public License version 2 (with a special exception described below). | |
5 | # | |
6 | # A copy of GNU General Public License (GPL) is included in this distribution, | |
7 | # in the file COPYING.GPL. | |
8 | # | |
9 | # As a special exception, if other files instantiate templates or use macros | |
10 | # or inline functions from this file, or you compile this file and link it | |
11 | # with other works to produce a work based on this file, this file | |
12 | # does not by itself cause the resulting work to be covered | |
13 | # by the GNU General Public License. | |
14 | # | |
15 | # However the source code for this file must still be made available | |
16 | # in accordance with section (3) of the GNU General Public License. | |
17 | # | |
18 | # This exception does not invalidate any other reasons why a work based | |
19 | # on this file might be covered by the GNU General Public License. | |
20 | # | |
21 | # Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com> | |
22 | ||
f49f6323 PD |
23 | """ |
24 | ||
25 | SUMMARY | |
26 | ------------------------------------------------------ | |
2ed7100d CH |
27 | Utilities for dealing with email |
28 | ||
29 | .. seealso:: :py:mod:`pyi2ncommon.mail_validator`, | |
30 | :py:mod:`pyi2ncommon.imap_mailbox` | |
f49f6323 PD |
31 | |
32 | Copyright: Intra2net AG | |
33 | ||
34 | ||
35 | INTERFACE | |
36 | ------------------------------------------------------ | |
37 | ||
38 | """ | |
39 | ||
b36398e7 | 40 | from base64 import b64decode |
67177844 | 41 | from email.utils import parsedate_to_datetime |
1d21262c | 42 | from email.parser import BytesParser |
4b44f515 | 43 | from email import policy |
f49f6323 | 44 | |
998bc6bb | 45 | from .simple_cnf import SimpleCnf |
67177844 CH |
46 | # code moved out to separate modules; these imports are kept for backward compatibility |
47 | from .imap_mailbox import ImapMailbox # pylint: disable=unused-import | |
48 | from .mail_validator import * # pylint: disable=unused-import | |
f49f6323 | 49 | |
# Module-level logger for this module.
# NOTE(review): `logging` is not among the explicit imports visible above;
# presumably it arrives through the star import from `.mail_validator` --
# confirm, or import it explicitly.
log = logging.getLogger('pyi2ncommon.mail_utils')
f49f6323 PD |
51 | |
52 | ||
def prep_email_header(email_file, value, regex=None, criterion="envelopeto"):
    """
    Replace value in a provided email file.

    For criterion 'envelopeto' the work is delegated to
    `arnied_wrapper.prep_cnf_value(email_file, value, regex=regex)`.

    For criterion 'received' the file is read completely, every match of
    `regex` is replaced by `value` (single pass of :py:func:`re.sub`) and the
    result is written back to the same file.

    :param str email_file: file to use for the replacement
    :param str value: replacement text
    :param regex: regular expression to use when replacing a header value;
                  required for criterion 'received'
    :type regex: str or None
    :param str criterion: criterion to use for replacement, one
                          of 'envelopeto' or 'received'
    :raises: :py:class:`ValueError` if the choice of criterion is invalid
    :raises: :py:class:`TypeError` if criterion is 'received' but no regex
             was given
    """
    if criterion == "envelopeto":
        logging.debug("Updating test emails' EnvelopeTo header")
        arnied_wrapper.prep_cnf_value(email_file, value, regex=regex)
    elif criterion == "received":
        # fail before touching the file; re.sub would raise the same error
        # type, but only after the file was already opened and read
        if regex is None:
            raise TypeError("A regex is required for header preparation "
                            "criterion 'received'")
        logging.debug("Updating test emails' Received header")
        with open(email_file, "r") as file_handle:
            email_text = file_handle.read()
        # substitute exactly once: a second pass would modify the text again
        # whenever the replacement itself matches the regex
        email_text = re.sub(regex, value, email_text)
        with open(email_file, "w") as file_handle:
            file_handle.write(email_text)
    else:
        raise ValueError("Invalid header preparation criterion '%s'"
                         % criterion)
f49f6323 PD |
82 | |
83 | ||
def create_users(usernames, **extra_params):
    """
    Create users for sending / receiving mail.

    The created user settings are complete with spamfilter settings and
    groupware folders. Every user gets a `user_group_member_ref` child with
    value "2"; this cannot yet be changed.
    NOTE(review): earlier documentation described the users as members of
    groups 1 (admins) and 2 (all), but only the reference "2" is added
    here -- confirm which is intended.

    :param usernames: Names of users to create; a single name may be given
                      as plain string
    :type usernames: [str] or str
    :raises: :py:class:`ValueError` if a config value is neither a str nor
             a dict

    All other params are forwarded to user config: they override the
    defaults below. String values are added as plain child variables, dict
    values are handed to `SimpleCnf.add` as `children` of that variable.
    """
    if isinstance(usernames, str):
        usernames = [usernames, ]
    default_cnf = dict(
        user_disabled="0",
        user_locale="",
        user_password="1234test",
        user_spamfilter_blacklist="",
        user_spamfilter_potential_spam_action="FOLDER",
        user_spamfilter_potential_spam_action_destaddr="",
        user_spamfilter_potential_spam_action_folder="Spamverdacht",
        # TODO: this doesn't handle situations where the child variable should not be defined
        user_spamfilter_potential_spam_threshold="1050",
        user_spamfilter_spam_action="FOLDER",
        user_spamfilter_spam_action_destaddr="",
        user_spamfilter_spam_action_folder="Spam",
        user_spamfilter_spam_deletedays="",
        # TODO: this doesn't handle situations where the child variable should not be defined
        user_spamfilter_spam_threshold="1080",
        user_spamfilter_whitelist="",
        user_groupware_folder_drafts="INBOX/Entwürfe",
        user_groupware_folder_outbox="INBOX/Gesendete Elemente",
        user_groupware_folder_trash="INBOX/Gelöschte Elemente",
    )

    cnf = SimpleCnf()
    for username in usernames:
        curr_cnf = default_cnf.copy()
        curr_cnf['user_fullname'] = username
        curr_cnf.update(extra_params)
        children = SimpleCnf()
        for key, value in curr_cnf.items():
            if isinstance(value, dict):
                # nested settings; must not fall through to the str check
                # below (that made every dict value raise ValueError)
                children.add(key, children=value)
            elif isinstance(value, str):
                children.add(key, value)
            else:
                raise ValueError('Invalid value type for key "{}": {}'
                                 .format(key, type(value)))
        children.add('user_group_member_ref', "2")
        cnf.add('user', username, children=children, instance=-1)
    cnf.apply()
f49f6323 PD |
137 | |
138 | ||
4b44f515 CH |
def parse_mail_file(file_name, headers_only=True, attachment_filenames=False,
                    raise_on_defect=False, new_message_type=False):
    """
    Parse given email file (e.g. a banned message).

    This is basically a `email.parser.BytesParser().parse(...)` with given
    `headers_only` and policy selection, that can also handle BSMTP. As an
    extra bonus, you can just request headers plus the names of attached files.

    Removes the SMTP envelope surrounding the email if present: if the first
    line starts with `EHLO`, everything up to and including the `DATA` line
    is skipped. Only left-over might be a line with a '.' at end of
    non-multipart messages if `headers_only` is False.

    :param str file_name: path to the file that contains the email text
    :param bool headers_only: whether to parse only the email headers; set this
                              to False, e.g. if you want to check for
                              attachments using message.walk()
    :param bool attachment_filenames: if you just want headers and names of
                                      attached files, set `headers_only` and
                                      this to True.
    :param bool raise_on_defect: whether to raise an error if email parser
                                 encounters a defect (policy attribute
                                 `raise_on_defect`) or just add the defect to
                                 message's `defects` attribute
    :param bool new_message_type: whether to return the older
                                  :py:class:`email.message.Message` (policy
                                  `compat32`, our default), or the newer
                                  :py:class:`email.message.EmailMessage` type
                                  (policy `default`). Big difference!
    :returns: either msg or 2-tuple `(msg, filenames)` if requested per arg
              `attachment_filenames`
    :rtype: :py:class:`email.message.Message` or
            (:py:class:`email.message.Message`, (str)) or
            one of these two with :py:class:`email.message.EmailMessage`
    :raises: :py:class:`ValueError` if the file starts with an SMTP envelope
             that has no `DATA` line
    """
    mail_policy = policy.default if new_message_type else policy.compat32
    if raise_on_defect:
        mail_policy = mail_policy.clone(raise_on_defect=True)

    with open(file_name, 'rb') as read_handle:
        line = read_handle.readline()
        if line.startswith(b'EHLO'):
            # there is a smtp header. skip to its end
            while line.strip() != b'DATA':
                line = read_handle.readline()
                if not line:
                    # end of file: readline() would return b'' forever, so
                    # bail out instead of looping endlessly
                    raise ValueError('SMTP envelope without DATA line in "{}"'
                                     .format(file_name))
            # the rest is the email plus a trailing '.' (ignored by parser if
            # multipart)
        else:
            read_handle.seek(0)  # forget we read the first line already
        # remember where the actual mail starts for a possible second parse
        start_pos = read_handle.tell()
        msg = BytesParser(policy=mail_policy).parse(read_handle,
                                                    headersonly=headers_only)

    if not attachment_filenames:
        return msg

    # otherwise need to parse complete message to get attachment file names
    if headers_only:
        with open(file_name, 'rb') as read_handle:
            read_handle.seek(start_pos)
            full_msg = BytesParser(policy=mail_policy).parse(read_handle,
                                                             headersonly=False)
    else:
        full_msg = msg
    filenames = (get_filename(part) for part in full_msg.walk())
    return msg, tuple(filename for filename in filenames
                      if filename is not None)
f49f6323 PD |
212 | |
213 | ||
58414aec CH |
def parse_mail_date(message):
    """
    Return the 'Date' header of the given message as datetime.

    Thin wrapper around :py:func:`email.utils.parsedate_to_datetime`.

    Not needed for the newer :py:class:`email.message.EmailMessage`, since
    there the `Date` header is automatically parsed to a
    :py:class:`email.headerregistry.DateHeader`.

    :param message: Email message
    :type message: :py:class:`email.message.Message`
    :returns: datetime from Email "Date" header or None if the header is
              missing or empty
    :rtype: :py:class:`datetime.datetime` or None
    """
    raw_date = message.get('Date', '')
    return parsedate_to_datetime(raw_date) if raw_date else None
233 | ||
234 | ||
f44055b0 CH |
def get_user_mail_files(user, mailbox='INBOX'):
    """
    Yield the file names of all mails in given folder of given user.

    Works on local cyrus file system, not on imap server: the directory
    `/datastore/imap-mails/user/<user>/<escaped mailbox path>` is listed and
    every entry whose name starts with digits followed by a dot is yielded.

    :param str user: Name of user whose mailbox is analyzed
    :param str mailbox: name of mailbox to use, INBOX (default) for base
                        folder; each path component is modified using
                        :py:func:`cyrus_escape`
    :returns: nothing; but yields full path to messages on disc
    """
    user_dir = os.path.join('/datastore', 'imap-mails', 'user', user)

    # a leading "INBOX" component denotes the base folder itself, so paths
    # like "INBOX/sub/dir" become "sub/dir"
    components = mailbox.split('/')
    if components[0].upper() == 'INBOX':
        del components[0]
    escaped = [cyrus_escape(component) for component in components]
    mail_dir = os.path.join(user_dir, *escaped)

    for entry in os.listdir(mail_dir):
        if re.match(r'\d+\.', entry):
            yield os.path.join(mail_dir, entry)
261 | ||
262 | ||
f49f6323 PD |
def get_user_mail(user, mailbox='INBOX', **kwargs):
    """
    Yield parsed mails from given folder of given user.

    Combines :py:func:`get_user_mail_files` (to find the files) with
    :py:func:`parse_mail_file` (to parse each of them).

    :param str user: see :py:func:`get_user_mail_files`
    :param str mailbox: see :py:func:`get_user_mail_files`
    :param dict kwargs: all other args are forwarded to
                        :py:func:`parse_mail_file`
    :returns: nothing; but yields 2-tuples (path, email_msg) where first is the
              full path to the message on disc, and the latter is the outcome
              of :py:func:`parse_mail_file` for that file
    """
    for mail_path in get_user_mail_files(user, mailbox):
        parsed = parse_mail_file(mail_path, **kwargs)
        yield mail_path, parsed
f49f6323 PD |
277 | |
278 | ||
f4dec410 CH |
def get_message_text(filename, fallback_encoding='iso8859-1',
                     include_all_text=False):
    """
    Return the text of an email message file as string.

    Meant to complement get_user_mail, e.g. ::

        for filename, msg in get_user_mail(user):
            # rough filtering based on headers
            if msg['Subject'] != 'Expected Subject':
                continue
            # get message text for closer inspection
            text = get_message_text(filename)
            if 'Expected Text' not in text:
                continue
            ...

    Search order: parts of type text/plain first, then any other "text/*"
    part, and if there is no text part at all, the first (innermost) part of
    the message. Each part is decoded with the charset given in the mail, or
    with `fallback_encoding` if the mail specifies none.

    With `include_all_text` all text/* parts are returned as list, the
    text/plain ones first.

    :param str filename: complete path of message file in filesystem
    :param str fallback_encoding: Encoding of email text if none is specified
                                  in mail.
    :param bool include_all_text: include all "text/*" parts in returned text
    :returns: text(s) of message
    :rtype: [str] if include_all_text else str
    """
    def decode_part(part):
        """Decode the payload of one message part to str."""
        charset = part.get_content_charset(fallback_encoding)
        return part.get_payload(decode=True).decode(charset)

    message = parse_mail_file(filename, headers_only=False)

    # first choice: text/plain
    texts = [decode_part(part) for part in message.walk()
             if part.get_content_type() == 'text/plain']
    if texts and not include_all_text:
        return texts[0]

    # second choice (or addition, if all text is requested): other "text/*"
    for part in message.walk():
        content_type = part.get_content_type()
        if content_type == 'text/plain':
            continue
        if content_type.startswith('text/'):
            texts.append(decode_part(part))

    if texts:
        return texts if include_all_text else texts[0]

    # no text at all: descend to the very first non-multipart part
    first_part = message
    while first_part.is_multipart():
        first_part = first_part.get_payload(0)

    text = decode_part(first_part)
    return [text, ] if include_all_text else text
341 | ||
342 | ||
f49f6323 PD |
343 | def cyrus_escape(user_or_folder, keep_path=False, regex=False): |
344 | """ | |
e108b7d4 | 345 | Convert names of users or mailbox folders to cyrus format. |
f49f6323 PD |
346 | |
347 | quite a hack, just does the following hard-coded replacements: | |
348 | ||
349 | * . --> ^ | |
350 | * / --> . (except if keep_path is True) | |
351 | * "u --> &APw- , "o --> &APY- , "a --> &AOQ- | |
352 | (if need more: this is modified utf-7) | |
353 | * inbox --> (the empty string) | |
354 | ||
355 | Would like to use a general modified utf-7-encoder/decoder but python has | |
7628bc48 | 356 | none builtin (see https://bugs.python.org/issue5305) and an extra lib like |
f49f6323 PD |
357 | https://bitbucket.org/mjs0/imapclient/ would be overkill. After all, we |
358 | control the input to this function via params and this is enough umlaut- | |
359 | testing I think... | |
360 | ||
361 | :param str user_or_folder: name of the user or folder string to escape | |
362 | :param bool keep_path: do not replace '/' with '.' so can still use result | |
363 | as path name | |
364 | :param bool regex: result is used in grep or other regex, so ^, . and & are | |
365 | escaped again with a backslash | |
366 | :returns: escaped user or folder string | |
367 | :rtype: str | |
368 | ||
369 | .. seealso:: :py:func:`cyrus_unescape` | |
370 | """ | |
371 | temp = user_or_folder.replace('.', '^') \ | |
372 | .replace('ü', '&APw-').replace('ä', '&AOQ-') \ | |
373 | .replace('ö', '&APY-') \ | |
374 | .replace('inbox', '').replace('INBOX', '').replace('Inbox', '') | |
375 | if not keep_path: | |
376 | temp = temp.replace('/', '.') | |
377 | if regex: | |
378 | return temp.replace('^', r'\^').replace('&', r'\&') \ | |
379 | .replace('.', r'\.').replace('$', r'\$') | |
2ed7100d | 380 | return temp |
f49f6323 PD |
381 | |
382 | ||
def cyrus_unescape(user_or_folder):
    """
    Revert (some of) the replacements done by :py:func:`cyrus_escape`.

    The empty string becomes 'inbox'; otherwise '.' is turned into '/' and
    afterwards both the regex-escaped and the plain caret are turned
    into '.'.

    :param str user_or_folder: name of the user or folder string to unescape
    :returns: unescaped user or folder string
    :rtype: str
    """
    if user_or_folder == '':
        return 'inbox'
    unescaped = user_or_folder.replace('.', '/')
    # handle the backslash-escaped caret first, then the plain one
    for caret in (r'\^', '^'):
        unescaped = unescaped.replace(caret, '.')
    return unescaped
b36398e7 CH |
395 | |
396 | ||
def get_filename(message, failobj=None, do_unwrap=True):
    """
    Get filename of a message part, even if it is base64-encoded.

    For attachments whose complete Content-Disposition header is an
    encoded-word, :py:meth:`email.message.Message.get_filename` does not
    work. This function tries that first and if it fails tries to interpret
    the Content-Disposition of the message part. If all fails, returns
    `failobj`.

    Only for file names found by the standard method: also unwraps file names
    if they are line-wrapped. But note that this may remove too much
    whitespace from the filename if line-wrapping happened in the same
    position as the filename's whitespace. To get the original (still
    wrapped) version, set param `do_unwrap` to `False`.

    See also: https://en.wikipedia.org/wiki/MIME#Encoded-Word

    :param message: message part, e.g. from
                    :py:meth:`email.message.Message.walk`
    :type message: :py:class:`email.message.Message` or
                   :py:class:`email.message.EmailMessage`
    :param failobj: object to return in case of failure (defaults to None)
    :param bool do_unwrap: undo line-break inserted by mail-creator; may remove
                           whitespace from file name; only applies to file
                           names found by the standard method
    :returns: either a string or failobj
    :raises: :py:class:`ValueError` if an encoded-word uses an encoding other
             than "b" or "q"
    """
    from binascii import a2b_qp    # only needed for Q-encoded words

    # try the old way and unwrap. Do not hand failobj to the stdlib method:
    # a string failobj would be indistinguishable from a found file name and
    # the Content-Disposition fallback below would never run
    filename = message.get_filename()

    if isinstance(filename, bytes) and not filename.startswith(b'=?') \
            and not filename.endswith(b'?='):
        filename = filename.decode('utf8')

    if isinstance(filename, str):
        if do_unwrap:
            return re.sub('[\\r\\n]+', '', filename)
        return filename

    if 'Content-Disposition' not in message:
        return failobj

    # try parsing content-disposition. e.g.:
    # attachment; filename="2018年度公开课计划表.xlsx"   -->
    # '=?utf-8?b?YXR0YWNobWVudDsgZmlsZW5hbWU9IjIwMTjlubTluqY=?=\r\n =?utf-8?b?'
    # '5YWs5byA6K++6K6h5YiS6KGoLnhsc3gi?='

    # This may be a re-implementation of email.utils.collapse_rfc2231_value()
    # as mentioned in email.message.EmailMessage.get_param()

    # The form is: "=?charset?encoding?encoded text?="
    SPLIT_REGEX = '\r?\n *'    # should be CRNL but some files miss the \r
    ENCODED_WORD_REGEX = r'\s*=\?([^?]+)\?([^?]+)\?(.*)\?=\s*$'
    # the quote group always participates (possibly empty): a back-reference
    # to a group that did not match never matches, which used to make
    # unquoted file names unparsable
    LINE_REGEX = r'attachment\s*;\s*filename=("?)(.+?)\1\s*$'

    # with policy compat32, headers containing raw 8-bit data are returned as
    # email.header.Header objects which re.split() cannot handle
    disposition = str(message['Content-Disposition'])

    decoded = []
    for word in re.split(SPLIT_REGEX, disposition):
        match = re.match(ENCODED_WORD_REGEX, word)
        if not match:
            break
        charset, encoding, data = match.groups()
        encoding = encoding.lower()
        if encoding == 'b':
            temp = b64decode(data)
        elif encoding == 'q':
            # header=True: underscores are decoded as spaces
            temp = a2b_qp(data, header=True)
        else:
            raise ValueError('not allowed according to wikipedia: "{}"'
                             .format(encoding))
        decoded.append(temp.decode(charset))
    decoded = u''.join(decoded)

    match = re.match(LINE_REGEX, decoded)
    if match:
        return match.group(2)
    return failobj
469 | return failobj |