Commit | Line | Data |
---|---|---|
aaf62c14 CH |
1 | #!/usr/bin/env python3 |
2 | ||
3 | """ A more long-term testing of pingcheck | |
4 | ||
5 | Runs pingcheck parsing its log output; goes online and offline repeatedly to see if | |
6 | pinging stays correct | |
7 | ||
8 | see also: connd/stresstest_pingcheck.py | |
9 | ||
10 | Christian Herdtweck, Intra2net, January 2015 | |
11 | (c) Intra2net AG 2015 | |
12 | """ | |
13 | ||
14 | # Version History | |
15 | # 15/01/15 Christian Herdtweck: started creation | |
3bd8a585 | 16 | # 21/1/15 ch: running 3rd long-term test, seems to work nicely now |
aaf62c14 CH |
17 | |
18 | import subprocess | |
3daedf04 | 19 | import sys |
aaf62c14 CH |
20 | import select |
21 | from re import match as regexp | |
22 | from time import sleep | |
23 | from datetime import timedelta, datetime as dt | |
24 | from warnings import warn | |
25 | from random import randint | |
3daedf04 CH |
26 | import logging |
27 | from os.path import expanduser, dirname, isdir | |
28 | from os import mkdir, EX_OK | |
3bd8a585 CH |
29 | from connd_state import ConndState |
30 | #import signal | |
4c50ece1 CH |
31 | import pickle |
32 | from strftimedelta import strftimedelta | |
3daedf04 CH |
33 | |
34 | logger_base = logging.getLogger('long_term_test') | |
aaf62c14 CH |
35 | |
36 | ######################################################################## | |
37 | ### CONSTANTS | |
38 | ######################################################################## | |
39 | ||
3daedf04 | 40 | version = '1.1' |
c3adbb6d | 41 | default_pingcheck_bin = '/usr/intranator/bin/pingcheck' |
aaf62c14 | 42 | default_tell_connd_bin = '/usr/intranator/bin/tell-connd' |
3daedf04 | 43 | default_output_prefix = './long_term_test/{0:%d%m%y_%H%M}' |
aaf62c14 | 44 | |
3daedf04 | 45 | # for parsing log messages |
aaf62c14 CH |
46 | LEVEL_DEBUG = 'dbug' |
47 | LEVEL_INFO = 'info' | |
48 | LEVEL_WARN = 'warn' | |
49 | LEVEL_NOTICE = 'note' | |
50 | LEVEL_ERROR = 'err ' | |
51 | ||
52 | text_to_level = {} | |
53 | text_to_level['INFO'] = LEVEL_INFO | |
54 | text_to_level['DBUG'] = LEVEL_DEBUG | |
55 | text_to_level['WARN'] = LEVEL_WARN | |
56 | text_to_level['NOTE'] = LEVEL_NOTICE | |
57 | text_to_level['ERR'] = LEVEL_ERROR | |
58 | ||
3daedf04 | 59 | # duration in which to remain online/offline |
aaf62c14 CH |
60 | MIN_DURATION_SAME_STATE = 10 # [seconds] |
61 | MAX_DURATION_SAME_STATE = 50 # [seconds] | |
62 | ||
63 | ######################################################################## | |
3daedf04 | 64 | #### for config file |
aaf62c14 CH |
65 | ######################################################################## |
66 | ||
3daedf04 CH |
67 | PING_INTERVAL = 3 # [seconds] |
68 | config_file_pattern = 'long_term_test_{0:%d%m%y_%H%M%S}.conf' | |
69 | conf_start = """# config file for pingcheck testing | |
aaf62c14 | 70 | |
3daedf04 CH |
71 | default-source-network-interface=eth1 |
72 | nameserver=127.0.0.1 | |
73 | hosts-down-limit={hosts_down_limit:d} | |
74 | ping-fail-limit=25 | |
75 | status-notifier-cmd=echo host status is ${status} | |
76 | link-up-interval=2 | |
3bd8a585 | 77 | ping-reply-timeout=3 |
3daedf04 CH |
78 | """ |
79 | conf_host = """ | |
80 | [host] | |
81 | name={0} | |
82 | interval={1:d} | |
83 | ping-protocol=ICMPv4 | |
84 | port=80 | |
85 | source-network-interface=default | |
86 | """ | |
87 | host_list = (\ | |
88 | 'intranator.m.i2n', \ | |
89 | 'intraroute.m.i2n', \ | |
90 | 'websync.m.i2n', \ | |
91 | 'aqua.m.i2n', \ | |
92 | 'intratdsl.m.i2n', \ | |
93 | 'beholder.m.i2n', \ | |
94 | 'eagle.m.i2n', \ | |
95 | 'osdist.m.i2n', \ | |
96 | 'tokyo.m.i2n', \ | |
97 | 'blizzard.m.i2n', \ | |
98 | 'phonehelp.m.i2n', \ | |
99 | 'virt-dmz.m.i2n', \ | |
100 | 'virt01.m.i2n', \ | |
101 | 'virt02.m.i2n', \ | |
102 | 'tornado.n.i2n', \ | |
103 | 'auth.m.i2n', \ | |
104 | 'forum.intra2net.com', \ | |
105 | 'update.intranator.com', \ | |
106 | 'demo-intranator.intra2net.com', \ | |
107 | 'intra2net.com', \ | |
4013dbf2 | 108 | #'partner.intra2net.com', has same IP as intra2net.com --> trouble in Tester._host_for_ip \ |
3daedf04 CH |
109 | 'demo-securitygateway.intra2net.com', \ |
110 | 'demo-networksecurity.intra2net.com', \ | |
111 | ) | |
112 | ||
113 | def create_conf_file(file_prefix): | |
114 | """ create config file using constants above """ | |
115 | ||
116 | file_name = file_prefix + '_config.conf' | |
117 | ||
118 | n_hosts = len(host_list) | |
119 | ||
120 | with open(file_name, 'wt') as file_handle: | |
121 | file_handle.write(conf_start.format(hosts_down_limit=int(n_hosts * 0.7), status='{status}')) | |
122 | for host in host_list: | |
123 | file_handle.write(conf_host.format(host, PING_INTERVAL)) | |
124 | ||
3bd8a585 | 125 | return file_name, n_hosts |
3daedf04 | 126 | #end: create_conf_file |
aaf62c14 | 127 | |
aaf62c14 | 128 | |
3daedf04 | 129 | ######################################################################## |
3bd8a585 CH |
130 | ### Exceptions |
131 | ######################################################################## | |
132 | ||
133 | def flags_to_str(flags): | |
134 | text_parts = [] | |
135 | if flags & select.POLLIN: | |
136 | text_parts.append('IN') | |
137 | if flags & select.POLLOUT: | |
138 | text_parts.append('OUT') | |
139 | if flags & select.POLLPRI: | |
140 | text_parts.append('PRI') | |
141 | if flags & select.POLLERR: | |
142 | text_parts.append('ERR') | |
143 | if flags & select.POLLHUP: | |
144 | text_parts.append('HUP') | |
145 | if flags & select.POLLNVAL: | |
146 | text_parts.append('INVALID') | |
147 | return ','.join(text_parts) | |
148 | ||
149 | class PollFlagsException(Exception): | |
150 | def __init__(self, flags, proc_name): | |
151 | super(PollFlagsException, self).__init__('Unexpected flags from polling {0}: {1}'.format(\ | |
152 | proc_name, flags_to_str(flags))) | |
153 | ||
154 | class PollHupException(Exception): | |
155 | def __init__(self, proc_name): | |
156 | super(PollHupException, self).__init__(\ | |
157 | 'Received HUP from polling, {0} was probably killed!'.format(proc_name)) | |
158 | ||
159 | class TooManyWarnings(Exception): | |
160 | def __init__(self): | |
161 | super(TooManyWarnings, self).__init__('Too many warnings / failures!') | |
162 | ||
163 | ######################################################################## | |
3daedf04 CH |
164 | ### PARSER |
165 | ######################################################################## | |
aaf62c14 CH |
166 | |
167 | class ProcOutputParser: | |
3daedf04 CH |
168 | """ parser for output from pingcheck process via pipes |
169 | ||
170 | uses select and polling | |
171 | should never silently swallow log output, main function parse should yield all lines | |
172 | or emit errors | |
173 | """ | |
aaf62c14 CH |
174 | |
175 | _out = None | |
176 | _err = None | |
177 | _poller = None | |
3bd8a585 | 178 | _flags = select.POLLIN | select.POLLPRI | select.POLLHUP # | select.POLLERR |
aaf62c14 CH |
179 | _regexp = '(.+)\s+(INFO|ERR|WARN|DBUG|NOTE)\s+pingcheck\[\d+\]: (.+)\s*' |
180 | _out_fileno = None | |
181 | _err_fileno = None | |
182 | ||
183 | SRC_OUT = 'stdout' | |
184 | SRC_ERR = 'stderr' | |
185 | SRC_UNK = 'unknown' | |
186 | ||
187 | def __init__(self, pipe_out, pipe_err): | |
188 | self._out = pipe_out | |
189 | self._err = pipe_err | |
190 | self._poller = select.poll() | |
191 | self._poller.register(pipe_out, self._flags) | |
192 | self._poller.register(pipe_err, self._flags) | |
193 | self._out_fileno = pipe_out.fileno() | |
194 | self._err_fileno = pipe_err.fileno() | |
195 | ||
196 | def _read(self): | |
197 | while True: | |
198 | result = self._poller.poll(10) | |
199 | for file_no, flags in result: | |
200 | if not (flags & self._flags): | |
3bd8a585 CH |
201 | raise PollFlagsException(flags, 'pingcheck') |
202 | ||
203 | if flags & select.POLLHUP: | |
204 | raise PollHupException('pingcheck') | |
aaf62c14 CH |
205 | |
206 | if file_no == self._out_fileno: | |
207 | src = self.SRC_OUT | |
208 | line = self._out.readline() | |
209 | elif file_no == self._err_fileno: | |
210 | src = self.SRC_ERR | |
211 | line = self._err.readline() | |
212 | else: | |
213 | src = self.SRC_UNK | |
214 | line = '' | |
215 | ||
216 | if not line: | |
217 | continue | |
3daedf04 | 218 | # if reach here, we have a new line of text |
aaf62c14 | 219 | |
3daedf04 | 220 | yield src, line.strip() |
aaf62c14 CH |
221 | #end: ProcOutputParser._read |
222 | ||
aaf62c14 | 223 | def parse(self): |
3daedf04 CH |
224 | for src, line in self._read(): |
225 | try: | |
3bd8a585 CH |
226 | timestamp, level, message = regexp(self._regexp, line).groups() |
227 | yield src, (timestamp, text_to_level[level], message) | |
3daedf04 CH |
228 | except: |
229 | yield src, line | |
aaf62c14 CH |
230 | |
231 | def __iter__(self): | |
232 | return self.parse() | |
233 | ||
aaf62c14 CH |
234 | #end: class ProcOutputParser |
235 | ||
3daedf04 CH |
236 | ######################################################################## |
237 | ### TESTER | |
238 | ######################################################################## | |
239 | ||
aaf62c14 CH |
240 | class Tester: |
241 | """ """ | |
242 | ||
243 | _line_iter = None | |
244 | _last_request = None | |
245 | _last_reply = None | |
246 | _n_requests = None | |
247 | _n_replies = None | |
4013dbf2 | 248 | _host_for_ip = None |
aaf62c14 CH |
249 | _next_state_change = None |
250 | _tell_connd_bin = None | |
251 | _prev_state_changes = None | |
252 | _fails = None | |
3daedf04 CH |
253 | _log = None |
254 | _log_ping = None | |
255 | max_fails = None | |
aaf62c14 | 256 | |
3daedf04 | 257 | def __init__(self, line_iter, tell_connd_bin, max_fails): |
aaf62c14 CH |
258 | self._line_iter = line_iter |
259 | self._last_request = {} | |
260 | self._last_reply = {} | |
261 | self._n_requests = {} | |
262 | self._n_replies = {} | |
4013dbf2 | 263 | self._host_for_ip = {} |
aaf62c14 CH |
264 | self._tell_connd_bin = tell_connd_bin |
265 | self._prev_state_changes = [] | |
266 | self._fails = [] | |
86ab1571 | 267 | self.max_fails = max_fails + 2 # for _test_the_tester |
3daedf04 CH |
268 | |
269 | self._log = logger_base.getChild('Tester') | |
270 | self._log_ping = {} | |
271 | self._log_ping[ProcOutputParser.SRC_OUT] = logger_base.getChild('pingOut') | |
272 | self._log_ping[ProcOutputParser.SRC_ERR] = logger_base.getChild('pingErr') | |
aaf62c14 CH |
273 | |
274 | ||
275 | def test(self): | |
3daedf04 CH |
276 | # preparations |
277 | current_state = self._is_online() | |
278 | while current_state is None: | |
279 | sleep(1) | |
280 | current_state = self._is_online() | |
281 | self._log.info('Start with state online = {0}'.format(current_state)) | |
3bd8a585 CH |
282 | self._prev_state_changes.append((current_state, dt.now())) |
283 | self._schedule_next_state_change() | |
3daedf04 | 284 | |
86ab1571 CH |
285 | self._test_the_tester() |
286 | ||
3daedf04 CH |
287 | # start looping over output from observed process |
288 | for src, text_info in self._line_iter: | |
86ab1571 CH |
289 | |
290 | # first of all: write to pinger log so nothing is lost | |
291 | if isinstance(text_info, str): # ProcOutputParser could not parse | |
3daedf04 CH |
292 | msg = 'unparsed: {0}'.format(text_info) |
293 | if text_info.startswith("RECV BUF ("): | |
4013dbf2 | 294 | self._log_ping[src].debug(msg) |
3daedf04 | 295 | else: |
4013dbf2 CH |
296 | self._log_ping[src].warning(msg) |
297 | continue # no need to parse message | |
3daedf04 CH |
298 | else: |
299 | text_time, level, message = text_info | |
3bd8a585 | 300 | msg = '{0} {1}'.format(level, message) |
aaf62c14 | 301 | |
4013dbf2 CH |
302 | if level in (LEVEL_WARN, LEVEL_ERROR, LEVEL_NOTICE): |
303 | # most common warning/error is only logged at debug level | |
304 | if message.endswith('could not be resolved.'): | |
305 | self._log_ping[src].debug(msg) | |
306 | continue | |
307 | elif message.endswith('fail sending ping data. Network is unreachable'): | |
308 | self._log_ping[src].debug(msg) | |
309 | continue | |
310 | elif message.endswith('fail sending ping data. Operation not permitted'): | |
311 | self._log_ping[src].debug(msg) | |
312 | continue | |
313 | else: # unexpected errors and warnings from pingcheck are logged at info level | |
314 | self._log_ping[src].info(msg) | |
315 | # no continue --> try analyze | |
316 | else: # normal messages are logged at debug level | |
3bd8a585 | 317 | self._log_ping[src].debug(msg) |
4013dbf2 | 318 | # no continue --> try analyze |
3daedf04 CH |
319 | |
320 | # now deal with message contents | |
aaf62c14 CH |
321 | if self._handle_request(message): |
322 | continue | |
323 | if self._handle_reply(message): | |
324 | continue | |
325 | if self._handle_change_state(): | |
326 | continue | |
3daedf04 CH |
327 | if self._handle_new_ip(message): |
328 | continue | |
329 | #end: for all lines from pingcheck output | |
aaf62c14 CH |
330 | #end: Tester.test |
331 | ||
86ab1571 CH |
332 | def _test_the_tester(self): |
333 | self._warn("this is a test warning without exception information") | |
334 | try: | |
335 | raise NotImplementedError('Test Exception') | |
336 | except: | |
337 | self._warn("this is a test warning including exception information", exception=True) | |
338 | self._log.info('you should have seen 2 failures above -- only be alarmed if not!') | |
339 | ||
aaf62c14 CH |
340 | |
341 | def _handle_new_ip(self, message): | |
a85f210b | 342 | # "Have IP 11.22.33.44 [61963s] for host.domain.com" |
4013dbf2 | 343 | matches = regexp('Have IP (\d+\.\d+\.\d+\.\d+) \[\d+s\] for (.+)', message) |
aaf62c14 | 344 | if matches: |
4013dbf2 CH |
345 | ip,host_name = matches.groups() |
346 | else: | |
347 | matches = regexp('Host is already an IP: (\d+\.\d+\.\d+\.\d+)', message) | |
348 | if matches: | |
349 | host_name = ip = matches.groups()[0] | |
350 | else: | |
351 | return False | |
352 | # now have host_name and ip | |
353 | ||
354 | self._log.info('remember new IP {0} for {1}'.format(ip, host_name)) | |
355 | self._host_for_ip[ip] = host_name | |
356 | self._last_request[host_name] = dt.now() | |
357 | self._last_reply[host_name] = dt.now() | |
358 | self._n_requests[host_name] = 0 | |
359 | self._n_replies[host_name] = 0 | |
360 | return True | |
aaf62c14 | 361 | |
3daedf04 | 362 | |
aaf62c14 CH |
363 | def _handle_request(self, message): |
364 | matches = regexp('(\d+\.\d+\.\d+\.\d+): sending ping', message) | |
365 | if matches is not None: | |
366 | ip = matches.groups()[0] | |
4013dbf2 CH |
367 | host_name = self._host_for_ip[ip] |
368 | self._log.info('pinging host {0} ({1})'.format(host_name, ip)) | |
369 | self._last_request[host_name] = dt.now() | |
370 | self._n_requests[host_name] += 1 | |
aaf62c14 CH |
371 | return True |
372 | return False | |
373 | ||
3daedf04 | 374 | |
aaf62c14 CH |
375 | def _handle_reply(self, message): |
376 | matches = regexp('(\d+\.\d+\.\d+\.\d+): Received reply', message) | |
377 | if matches is not None: | |
378 | ip = matches.groups()[0] | |
4013dbf2 CH |
379 | host_name = self._host_for_ip[ip] |
380 | self._log.info('received reply from {0} ({1})'.format(host_name, ip)) | |
381 | self._last_reply[host_name] = dt.now() | |
382 | self._n_replies[host_name] += 1 | |
aaf62c14 CH |
383 | return True |
384 | return False | |
385 | ||
aaf62c14 | 386 | |
3daedf04 | 387 | def _handle_change_state(self): |
86ab1571 CH |
388 | if dt.now() < self._next_state_change: |
389 | return False | |
390 | ||
391 | is_online = self._is_online() | |
392 | if is_online is None: | |
393 | return False # try again later | |
394 | ||
395 | expect_online = self._prev_state_changes[-1][0] | |
396 | if expect_online != is_online: | |
397 | self._warn('Expect to be online={0} since {1} but are online={2}!'.format(\ | |
398 | expect_online, self._prev_state_changes[-1][1], is_online)) | |
399 | ||
400 | if is_online: | |
401 | self._ensure_all_pinged() | |
402 | new_val = False | |
403 | else: | |
404 | new_val = True | |
405 | if self._set_online(new_val): | |
406 | self._schedule_next_state_change() | |
407 | return True | |
408 | # if call to tell_connd fails, try again later | |
aaf62c14 CH |
409 | #end: _handle_change_state |
410 | ||
411 | ||
3bd8a585 | 412 | def _schedule_next_state_change(self): |
3daedf04 CH |
413 | n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE) |
414 | self._next_state_change = dt.now() + timedelta(seconds=n_secs) | |
415 | self._log.info('schedule next state change for {0:%H:%M:%S} (in {1:d}s)'.format(\ | |
416 | self._next_state_change, n_secs)) | |
417 | #end: Tester._schedule_next_state_change | |
418 | ||
419 | ||
aaf62c14 | 420 | def _is_online(self): |
3bd8a585 CH |
421 | """ determine state using ConndState """ |
422 | state = ConndState.get_state(self._tell_connd_bin) | |
423 | if not isinstance(state, ConndState): | |
424 | err_code, output_lines = state | |
425 | self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(\ | |
426 | err_code, output_lines[0])) | |
aaf62c14 CH |
427 | return None |
428 | ||
3bd8a585 | 429 | return len(state.connections) > 0 |
aaf62c14 CH |
430 | #end: Tester._is_online |
431 | ||
432 | ||
aaf62c14 | 433 | def _set_online(self, new_val): |
3bd8a585 CH |
434 | """ run tell-connd --online/offline P1 |
435 | ||
436 | returns true if afterwards are in requested state | |
437 | """ | |
438 | if new_val == self._is_online(): | |
439 | self._warn('Want to set online state to current state!') | |
440 | return True | |
aaf62c14 | 441 | if new_val: |
3daedf04 | 442 | self._log.info('going online') |
aaf62c14 CH |
443 | args = [self._tell_connd_bin, '--online', 'P1'] |
444 | else: | |
3daedf04 | 445 | self._log.info('going offline') |
aaf62c14 | 446 | args = [self._tell_connd_bin, '--offline'] |
3bd8a585 | 447 | |
aaf62c14 CH |
448 | try: |
449 | return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1) | |
450 | if return_code == 0: | |
3bd8a585 | 451 | self._prev_state_changes.append((new_val, dt.now())) |
aaf62c14 CH |
452 | return True |
453 | else: | |
3bd8a585 CH |
454 | self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(\ |
455 | return_code)) | |
aaf62c14 CH |
456 | return False |
457 | except subprocess.TimeoutExpired as te: | |
3daedf04 | 458 | self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output)) |
aaf62c14 CH |
459 | return False |
460 | except: | |
86ab1571 | 461 | self._warn('Trouble calling tell-connd', exception=True) |
aaf62c14 CH |
462 | return False |
463 | #end: Tester._set_online | |
464 | ||
465 | ||
466 | def _ensure_all_pinged(self): | |
467 | """ called before going offline; check if all hosts were successfully pinged """ | |
468 | # determine when we last went online | |
469 | time_went_online = None | |
470 | time_go_offline = dt.now() | |
471 | ||
472 | for went_online, change_time in reversed(self._prev_state_changes): | |
473 | if went_online: | |
474 | time_went_online = change_time | |
475 | break | |
476 | if time_went_online is None: | |
477 | # never been online --> nothing to check yet | |
478 | return | |
479 | ||
3bd8a585 CH |
480 | self._log.info('checking whether all hosts were pinged between ' |
481 | + '{0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\ | |
482 | time_went_online, time_go_offline)) | |
4013dbf2 | 483 | for host_name, reply_time in self._last_reply.items(): |
aaf62c14 | 484 | if reply_time < time_went_online: |
3bd8a585 CH |
485 | self._log.warning(\ |
486 | 'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\ | |
4013dbf2 CH |
487 | host_name, time_went_online, time_go_offline)) |
488 | self._fails.append((host_name, time_went_online, time_go_offline)) | |
3daedf04 CH |
489 | self._log.info('remembered as failure number {0} / {1}'.format(\ |
490 | len(self._fails), self.max_fails)) | |
491 | ||
492 | # check if there were too many fails; if so, stop | |
493 | if len(self._fails) > self.max_fails: | |
494 | self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!")) | |
3bd8a585 | 495 | raise TooManyWarnings() |
3daedf04 CH |
496 | #end: if not pinged recently |
497 | #end: for all known ips | |
aaf62c14 CH |
498 | #end: Tester._ensure_all_pinged |
499 | ||
500 | ||
86ab1571 | 501 | def _warn(self, message, exception=False): |
3daedf04 | 502 | |
86ab1571 CH |
503 | if exception: |
504 | self._log.exception(message, exc_info=True) | |
505 | else: | |
506 | self._log.warning(message) | |
3daedf04 CH |
507 | self._fails.append(("WARN", dt.now(), message)) |
508 | self._log.info('remembered as failure number {0} / {1}'.format(\ | |
509 | len(self._fails), self.max_fails)) | |
aaf62c14 | 510 | |
3daedf04 CH |
511 | # check if there were too many fails; if so, stop |
512 | if len(self._fails) > self.max_fails: | |
513 | self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!")) | |
3bd8a585 | 514 | raise TooManyWarnings() |
3daedf04 | 515 | #end: Tester._warn |
aaf62c14 | 516 | |
3daedf04 | 517 | |
3daedf04 | 518 | def print_failures(self, log): |
aaf62c14 | 519 | if len(self._fails) == 0: |
86ab1571 CH |
520 | log.info('No warnings nor failures since {0:%d.%m.%y, %H:%M} :-)'.format(\ |
521 | self._prev_state_changes[0][1])) | |
aaf62c14 | 522 | else: |
3daedf04 | 523 | log.info('Recorded the following failures and warnings:') |
4013dbf2 CH |
524 | for host_name, time_went_online, time_go_offline in self._fails: |
525 | if host_name == 'WARN': | |
3daedf04 | 526 | log.info('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline)) |
aaf62c14 | 527 | else: |
3bd8a585 | 528 | log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3:.2f}s)'\ |
4013dbf2 | 529 | .format(host_name, time_went_online, time_go_offline, \ |
3bd8a585 | 530 | (time_go_offline-time_went_online).total_seconds())) |
86ab1571 CH |
531 | |
532 | #end: Tester.print_failures | |
533 | ||
534 | def save_state_and_history(self, save_file): | |
535 | """ save all other data for later inspection using show_saved_data """ | |
536 | try: | |
537 | with open(save_file, 'wb') as file_handle: | |
538 | self._log.info('saving state and history to {0}'.format(save_file)) | |
539 | pickle.dump( (self._last_request, self._last_reply, self._n_requests, self._n_replies, \ | |
540 | self._host_for_ip, self._tell_connd_bin, self._prev_state_changes, \ | |
541 | self._fails, self.max_fails, save_file, dt.now()), \ | |
542 | file_handle) | |
543 | return True | |
544 | except Exception as e: | |
545 | self._log.exception("Failed to save!", exc_info=True) | |
546 | return False | |
547 | #end: Tester.save_state_and_history | |
548 | ||
549 | ||
550 | @staticmethod | |
551 | def show_saved_data(save_file): | |
552 | """ print info on data saved by save_state_and_history """ | |
553 | ||
554 | # load | |
555 | save_date = None | |
556 | with open(save_file, 'rb') as file_handle: | |
557 | print('loading state and history from {0}'.format(save_file)) | |
558 | last_request, last_reply, n_requests, n_replies, host_for_ip, tell_connd_bin, \ | |
559 | state_changes, fails, max_fails, save_file_again, save_date \ | |
560 | = pickle.load(file_handle) | |
561 | if save_date is None: | |
562 | print('Failed to read state and history from {0}'.format(save_file)) | |
563 | return 1 | |
564 | ||
565 | # simple stats | |
566 | first_change = state_changes[0][1] | |
567 | last_change = state_changes[-1][1] | |
568 | print('Performed {0} state changes between {1:%d.%m,%H:%M} and {2:%d.%m,%H:%M}' \ | |
569 | .format(len(state_changes), first_change, last_change), end='') | |
570 | print(' ({0} --> one every {1} on average)'.format( \ | |
571 | strftimedelta( last_change - first_change ), \ | |
572 | strftimedelta((last_change - first_change) / len(state_changes)) )) | |
573 | print('Used {0} IPs for {1} hosts'.format(len(host_for_ip), len(last_request))) | |
574 | print('Overall, recorded {0} warnings/failures (max={1})'.format(len(fails), max_fails)) | |
575 | ||
576 | # sort fails into warnings and fail per host and fail per time | |
577 | warnings = [] | |
578 | n_test_warnings = 0 | |
579 | fail_per_host = {} | |
580 | fail_per_time = {} | |
581 | n_fails = 0 | |
582 | for host, time_online, time_offline in fails: | |
583 | if host == 'WARN': | |
584 | # time_online = time, time_offline=message | |
585 | if 'test warning' in time_offline: | |
586 | n_test_warnings += 1 | |
587 | else: | |
588 | warnings.append((time_online, time_offline)) | |
589 | else: | |
590 | n_fails += 1 | |
591 | try: | |
592 | fail_per_host[host].append(time_online) | |
593 | except KeyError: | |
594 | fail_per_host[host] = [time_online,] | |
595 | duration = time_offline - time_online | |
596 | try: | |
597 | fail_per_time[time_online, duration].append(host) | |
598 | except KeyError: | |
599 | fail_per_time[time_online, duration] = [host, ] | |
600 | ||
601 | # stats per host | |
602 | for host, n_req in n_requests.items(): | |
603 | print(host + ' (', end='') | |
604 | print(','.join( ip for ip, host_for_ip in host_for_ip.items() if host==host_for_ip ), end='') | |
605 | print('): \treplied to {1} of {0} pings ({2:.1f}%); '.format(\ | |
606 | n_req, n_replies[host], 100. * n_replies[host]/n_req), end='') | |
607 | if host in fail_per_host: | |
608 | print('failed to ping at ', end='') | |
609 | print(','.join( '{0:%H:%M},'.format(fail_time) for fail_time in fail_per_host[host] )) | |
610 | else: | |
611 | print('replied in each test interval') | |
612 | ||
613 | # stats per time | |
614 | if fail_per_time: | |
615 | print("Ping failures per time (overall {0} fails in {1} intervals):".format(\ | |
616 | n_fails, len(fail_per_time))) | |
617 | for (start, duration), host_list in fail_per_time.items(): | |
618 | print('{0:%d.%m,%H:%M} + {1}: '.format(start, strftimedelta(duration)), end='') | |
619 | print(','.join(host_list)) | |
620 | ||
621 | # warnings | |
622 | if warnings: | |
623 | if n_test_warnings: | |
624 | print("Warnings (not showing {0} test warnings):".format(n_test_warnings)) | |
625 | else: | |
626 | print("Warnings:") | |
627 | for time, message in warnings: | |
628 | print("{0:%d.%m,%H:%M}: {1}".format(time, message)) | |
629 | elif n_test_warnings: | |
630 | print('Only warnings were {0} for testing'.format(n_test_warnings)) | |
631 | ||
632 | # done | |
633 | return 0 | |
634 | #end: static Tester.show_saved_data | |
635 | ||
aaf62c14 CH |
636 | #end: class Tester |
637 | ||
638 | ######################################################################## | |
639 | ### MAIN | |
640 | ######################################################################## | |
641 | ||
de9d4d4c CH |
642 | def test_version(pingcheck_binary): |
643 | """ run pingcheck --version; we want > 0.1 for proper IP parsing | |
4013dbf2 | 644 | |
de9d4d4c CH |
645 | no try-catch here, also intended as test-run for pingcheck binary |
646 | """ | |
647 | ||
648 | output = subprocess.check_output([pingcheck_binary, '--version'], universal_newlines=True) | |
649 | match = regexp('\s*pingcheck version (\d+\.\d+)\s*', output) | |
650 | if match is None: | |
651 | raise ValueError('Could not determine pingcheck version from output of {0} --version!'\ | |
652 | .format(pingcheck_binary)) | |
653 | version = float(match.groups()[0]) | |
654 | if version < 0.2: | |
655 | raise ValueError('Pingcheck version is {0} but require version > 0.1!'.format(version)) | |
4013dbf2 CH |
656 | |
657 | #end: test_version | |
658 | ||
659 | ||
aaf62c14 CH |
660 | def parse_args(): |
661 | """ parse command line arguments and set defaults """ | |
662 | from argparse import ArgumentParser | |
663 | parser = ArgumentParser(description="Long-term testing of pingcheck: goes online and offline " \ | |
664 | + "using tell-connd and checks log for correct output", \ | |
665 | epilog="(c) Christian Herdtweck, Intra2net AG, 2015") | |
666 | ||
4c50ece1 CH |
667 | # option to show saved data |
668 | parser.add_argument('--show-saved-data', '-s', metavar='SAVE_FILE.pickle', \ | |
669 | default='', type=str, \ | |
670 | help='Do not run test but instead print info saved by earlier test run') | |
671 | ||
aaf62c14 | 672 | # binaries |
3bd8a585 | 673 | parser.add_argument('--pingcheck-binary', '-p', metavar='PATH', \ |
aaf62c14 CH |
674 | default=default_pingcheck_bin, \ |
675 | help='path to pingcheck binary') | |
3bd8a585 | 676 | parser.add_argument('--tell-connd-binary', '-t', metavar='PATH', \ |
aaf62c14 CH |
677 | default=default_tell_connd_bin, \ |
678 | help='path to tell-connd binary') | |
679 | ||
3daedf04 | 680 | # output |
3bd8a585 | 681 | parser.add_argument('--output-prefix', '-o', metavar='PATH', \ |
3daedf04 CH |
682 | default=default_output_prefix, \ |
683 | help='path and/or file prefix to config and log files; ~=home, 0=prog start datetime; ' \ | |
684 | + 'default="%(default)s"') | |
3bd8a585 | 685 | parser.add_argument('--log-to-console', '-c', metavar='0|1', \ |
3daedf04 CH |
686 | choices=(0, 1), type=int, default=1, \ |
687 | help='print all output to console? default=%(default)s') | |
3bd8a585 | 688 | parser.add_argument('--log-level-console', metavar='LEVEL', \ |
3daedf04 | 689 | type=int, default=logging.INFO, \ |
3bd8a585 CH |
690 | help='log level for console output; ' \ |
691 | + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\ | |
3daedf04 | 692 | logging.DEBUG, logging.INFO, logging.WARN)) |
3bd8a585 | 693 | parser.add_argument('--log-to-file', '-f', metavar='0|1', \ |
3daedf04 CH |
694 | choices=(0, 1), type=int, default=1, \ |
695 | help='print all output to log file? default=%(default)s') | |
3bd8a585 | 696 | parser.add_argument('--log-level-file', metavar='LEVEL', \ |
3daedf04 | 697 | type=int, default=logging.DEBUG, \ |
3bd8a585 CH |
698 | help='log level for console output; ' \ |
699 | + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\ | |
3daedf04 | 700 | logging.DEBUG, logging.INFO, logging.WARN)) |
3bd8a585 | 701 | parser.add_argument('--log-file-suffix', '-l', metavar='SUFF.IX', \ |
3daedf04 CH |
702 | type=str, default='.log', \ |
703 | help='2nd part of log file name after --output_prefix; default="%(default)s"') | |
3bd8a585 CH |
704 | parser.add_argument('--pinger-log-level', metavar='LEVEL', \ |
705 | type=str, default='INFO', \ | |
706 | help='log level given to pingchecker, DEBUG might quickly fill log; default=%(default)s') | |
3daedf04 | 707 | |
3bd8a585 CH |
708 | |
709 | parser.add_argument('--max-fails-per-host', '-m', metavar='N', type=int, default=10, \ | |
710 | help='max number of failures/warnings per host before test will end; default: %(default)s') | |
aaf62c14 | 711 | parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version)) |
3daedf04 CH |
712 | |
713 | # parse | |
714 | opts = parser.parse_args() | |
715 | opts.output_prefix = expanduser(opts.output_prefix).format(dt.now()) | |
716 | return opts | |
aaf62c14 CH |
717 | #end: parse_args |
718 | ||
719 | ||
3daedf04 CH |
720 | def setup_logging(opts): |
721 | """ create main logger and handlers for base logger """ | |
722 | ||
723 | # create logger for main | |
724 | log = logger_base.getChild('main') | |
725 | ||
726 | # remove all existing handlers | |
727 | for old_handler in logger_base.handlers: | |
728 | print('removing handler from base') | |
729 | logger_base.removeHandler(old_handler) | |
730 | for old_handler in log.handlers: | |
731 | print('removing handler from log') | |
732 | log.removeHandler(old_handler) | |
733 | ||
734 | logger_base.setLevel(logging.DEBUG) | |
735 | ||
736 | # create handlers and format for base logger | |
737 | log_file = None | |
86ab1571 CH |
738 | formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s', \ |
739 | datefmt='%d.%m,%H:%M:%S') | |
3daedf04 CH |
740 | if opts.log_to_console: |
741 | handler = logging.StreamHandler(sys.stdout) | |
742 | handler.setLevel(opts.log_level_console) | |
3daedf04 CH |
743 | handler.setFormatter(formatter) |
744 | logger_base.addHandler(handler) | |
745 | ||
746 | if opts.log_to_file: | |
747 | log_file = opts.output_prefix + opts.log_file_suffix | |
748 | handler = logging.FileHandler(log_file) | |
749 | handler.setLevel(opts.log_level_file) | |
3daedf04 CH |
750 | handler.setFormatter(formatter) |
751 | logger_base.addHandler(handler) | |
752 | ||
753 | # log info | |
754 | log.info('Log for {0} in {1} created on {2}'.format(__name__, __file__, dt.now())) | |
755 | if opts.log_to_console: | |
756 | log.info('logging to console at level {0}'.format(opts.log_level_console)) | |
757 | else: | |
758 | log.info('not logging to console') | |
759 | if opts.log_to_file: | |
760 | log.info('logging to file {1} at level {0}'.format(opts.log_level_file, log_file)) | |
761 | else: | |
762 | log.info('not logging to file') | |
763 | ||
764 | # return log_file only if makes sense to talk about it in main | |
765 | if opts.log_to_console: | |
766 | return log, log_file | |
767 | else: | |
768 | return log, None | |
769 | #end: setup_logging | |
770 | ||
771 | ||
4c50ece1 | 772 | def handle_sigchild(signum, curr_frame, log, cleanup_args): |
3bd8a585 CH |
773 | log.error('Received SIGCHLD') |
774 | log.info('with args {0} and {1}'.format(signum, curr_frame)) | |
775 | ||
4c50ece1 | 776 | cleanup(*cleanup_args) |
3bd8a585 CH |
777 | |
778 | ||
4c50ece1 | 779 | def cleanup(log, pinger, tester, line_iter, save_file): |
3bd8a585 CH |
780 | """ the stuff from finally clause in main loop; also called from handle_sigchild """ |
781 | ||
782 | return_code = 255 | |
783 | ||
784 | log.info('-' * 72) | |
785 | if pinger is None: | |
786 | log.warning('Error before pinger was created') | |
787 | return_code = 254 | |
788 | else: | |
789 | log.info("Cleaning up") | |
790 | ||
3bd8a585 CH |
791 | try: |
792 | log.info('telling pinger to terminate') | |
793 | pinger.terminate() | |
794 | ||
d1df5eda CH |
795 | for line_idx, line in enumerate(line_iter): |
796 | log.info('left-over output: {0}'.format(line)) | |
797 | if line_idx > 100: | |
798 | log.warning('ignore further left-over output') | |
799 | break | |
800 | ||
801 | #return_code = pinger.wait(3) can block! | |
3bd8a585 CH |
802 | outs, errs = pinger.communicate(timeout=3) |
803 | for line in outs.splitlines(): | |
804 | log.info('left-over output: {0}'.format(line)) | |
805 | for line in errs.splitlines(): | |
806 | log.info('left-over output: {0}'.format(line)) | |
807 | ||
808 | return_code = pinger.returncode # may still be None | |
809 | log.info('pinger terminated with return code {0}'.format(return_code)) | |
810 | except subprocess.TimeoutExpired as te: | |
811 | log.warning('Timeout expired waiting for pinger to terminate: {0}'.format(te)) | |
812 | return_code = 253 | |
813 | except: | |
86ab1571 | 814 | log.exception('trouble terminating pinger', exc_info=True) |
3bd8a585 CH |
815 | |
816 | if return_code is None: | |
d1df5eda | 817 | for line_idx, line in enumerate(line_iter): |
3bd8a585 | 818 | log.info('left-over output: {0}'.format(line)) |
d1df5eda CH |
819 | if line_idx > 100: |
820 | log.warning('ignore further left-over output') | |
821 | break | |
3bd8a585 CH |
822 | |
823 | # kill it | |
824 | try: | |
825 | log.info('killing pinger the hard way') | |
826 | pinger.kill() | |
827 | outs, errs = pinger.communicate(timeout=3) | |
828 | for line in outs.splitlines(): | |
829 | log.info('left-over output: {0}'.format(line)) | |
830 | for line in errs.splitlines(): | |
831 | log.info('left-over output: {0}'.format(line)) | |
832 | ||
833 | except: | |
86ab1571 | 834 | log.exception('trouble killing pinger', exc_info=True) |
3bd8a585 CH |
835 | |
836 | if tester is not None: | |
86ab1571 | 837 | tester.save_state_and_history(save_file) |
3bd8a585 CH |
838 | tester.print_failures(log) |
839 | #end: if have pinger | |
840 | ||
841 | # reset original signal handler | |
842 | #signal.signal(signal.SIGCHLD, signal.SIG_DFL) | |
843 | ||
844 | return return_code | |
845 | #end: cleanup | |
846 | ||
847 | ||
aaf62c14 | 848 | def main(): |
3daedf04 | 849 | """ parses args, create a Pinger and Parser and Tester and runs them; cleans up |
aaf62c14 CH |
850 | """ |
851 | ||
852 | opts = parse_args() | |
853 | ||
4c50ece1 CH |
854 | if opts.show_saved_data: |
855 | return Tester.show_saved_data(opts.show_saved_data) | |
856 | ||
3daedf04 CH |
857 | base_dir = dirname(opts.output_prefix + "_test") |
858 | if not isdir(base_dir): | |
859 | mkdir(base_dir) | |
860 | ||
861 | log, log_file = setup_logging(opts) | |
862 | ||
aaf62c14 | 863 | pinger = None |
3bd8a585 | 864 | return_code = 255 |
aaf62c14 | 865 | tester = None |
3bd8a585 | 866 | line_iter = None |
aaf62c14 | 867 | try: |
3bd8a585 | 868 | config_file, n_hosts = create_conf_file(opts.output_prefix) |
4c50ece1 | 869 | save_file = opts.output_prefix + '_save.pickle' |
3daedf04 CH |
870 | log.info('created config file {0}'.format(config_file)) |
871 | ||
de9d4d4c CH |
872 | # test version |
873 | test_version(opts.pingcheck_binary) | |
874 | ||
3bd8a585 | 875 | #signal.signal(signal.SIGCHLD, \ |
4c50ece1 CH |
876 | # lambda signum, curr_frame: handle_sigchild(signum, curr_frame, log, \ |
877 | # (log, pinger, tester, line_iter, save_file) ) | |
3bd8a585 | 878 | |
3daedf04 | 879 | # pingcheck subprocess |
3bd8a585 | 880 | pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', opts.pinger_log_level, \ |
3daedf04 CH |
881 | '--log-output', 'TERMINAL', '--config-file', config_file], \ |
882 | stdout=subprocess.PIPE, stderr=subprocess.PIPE, \ | |
883 | universal_newlines=True) | |
884 | log.info('started pingcheck with PID {0}'.format(pinger.pid)) | |
885 | ||
aaf62c14 CH |
886 | parser = ProcOutputParser(pinger.stdout, pinger.stderr) |
887 | line_iter = iter(parser) | |
3bd8a585 CH |
888 | tester = Tester(line_iter, opts.tell_connd_binary, max(10, opts.max_fails_per_host*n_hosts)) |
889 | ||
aaf62c14 CH |
890 | tester.test() |
891 | except KeyboardInterrupt: | |
3daedf04 | 892 | log.info('stopped by user') |
aaf62c14 | 893 | except: |
86ab1571 | 894 | log.exception('Caught an exception in main loop', exc_info=True) |
3bd8a585 | 895 | log.warning('Re-raise') |
aaf62c14 CH |
896 | raise |
897 | finally: | |
86ab1571 | 898 | return_code = cleanup(log, pinger, tester, line_iter, save_file) |
3daedf04 | 899 | |
3bd8a585 CH |
900 | log.info('Will stop now') |
901 | if log_file is not None: | |
902 | log.info('might find more log output in {0}'.format(log_file)) | |
903 | log.info('-' * 72) | |
aaf62c14 CH |
904 | #end: global clean-up |
905 | ||
aaf62c14 CH |
906 | return return_code |
907 | ||
908 | #end: function main | |
909 | ||
910 | ||
911 | if __name__ == '__main__': | |
912 | main() | |
3daedf04 | 913 | logging.shutdown() |
aaf62c14 | 914 |