added log output target UNDEFINED; return if no hosts defined
[pingcheck] / test / long_term_test.py
CommitLineData
aaf62c14
CH
1#!/usr/bin/env python3
2
3""" A more long-term testing of pingcheck
4
5Runs pingcheck parsing its log output; goes online and offline repeatedly to see if
6 pinging stays correct
7
8see also: connd/stresstest_pingcheck.py
9
10Christian Herdtweck, Intra2net, January 2015
11(c) Intra2net AG 2015
12"""
13
14# Version History
15# 15/01/15 Christian Herdtweck: started creation
3bd8a585 16# 21/1/15 ch: running 3rd long-term test, seems to work nicely now
aaf62c14
CH
17
18import subprocess
3daedf04 19import sys
aaf62c14
CH
20import select
21from re import match as regexp
22from time import sleep
23from datetime import timedelta, datetime as dt
24from warnings import warn
25from random import randint
3daedf04
CH
26import logging
27from os.path import expanduser, dirname, isdir
28from os import mkdir, EX_OK
3bd8a585
CH
29from connd_state import ConndState
30#import signal
4c50ece1
CH
31import pickle
32from strftimedelta import strftimedelta
3daedf04
CH
33
34logger_base = logging.getLogger('long_term_test')
aaf62c14
CH
35
36########################################################################
37### CONSTANTS
38########################################################################
39
3daedf04 40version = '1.1'
c3adbb6d 41default_pingcheck_bin = '/usr/intranator/bin/pingcheck'
aaf62c14 42default_tell_connd_bin = '/usr/intranator/bin/tell-connd'
3daedf04 43default_output_prefix = './long_term_test/{0:%d%m%y_%H%M}'
aaf62c14 44
3daedf04 45# for parsing log messages
aaf62c14
CH
46LEVEL_DEBUG = 'dbug'
47LEVEL_INFO = 'info'
48LEVEL_WARN = 'warn'
49LEVEL_NOTICE = 'note'
50LEVEL_ERROR = 'err '
51
52text_to_level = {}
53text_to_level['INFO'] = LEVEL_INFO
54text_to_level['DBUG'] = LEVEL_DEBUG
55text_to_level['WARN'] = LEVEL_WARN
56text_to_level['NOTE'] = LEVEL_NOTICE
57text_to_level['ERR'] = LEVEL_ERROR
58
3daedf04 59# duration in which to remain online/offline
aaf62c14
CH
60MIN_DURATION_SAME_STATE = 10 # [seconds]
61MAX_DURATION_SAME_STATE = 50 # [seconds]
62
63########################################################################
3daedf04 64#### for config file
aaf62c14
CH
65########################################################################
66
3daedf04
CH
67PING_INTERVAL = 3 # [seconds]
68config_file_pattern = 'long_term_test_{0:%d%m%y_%H%M%S}.conf'
69conf_start = """# config file for pingcheck testing
aaf62c14 70
3daedf04
CH
71default-source-network-interface=eth1
72nameserver=127.0.0.1
73hosts-down-limit={hosts_down_limit:d}
74ping-fail-limit=25
75status-notifier-cmd=echo host status is ${status}
76link-up-interval=2
3bd8a585 77ping-reply-timeout=3
3daedf04
CH
78"""
79conf_host = """
80[host]
81name={0}
82interval={1:d}
83ping-protocol=ICMPv4
84port=80
85source-network-interface=default
86"""
87host_list = (\
88 'intranator.m.i2n', \
89 'intraroute.m.i2n', \
90 'websync.m.i2n', \
91 'aqua.m.i2n', \
92 'intratdsl.m.i2n', \
93 'beholder.m.i2n', \
94 'eagle.m.i2n', \
95 'osdist.m.i2n', \
96 'tokyo.m.i2n', \
97 'blizzard.m.i2n', \
98 'phonehelp.m.i2n', \
99 'virt-dmz.m.i2n', \
100 'virt01.m.i2n', \
101 'virt02.m.i2n', \
102 'tornado.n.i2n', \
103 'auth.m.i2n', \
104 'forum.intra2net.com', \
105 'update.intranator.com', \
106 'demo-intranator.intra2net.com', \
107 'intra2net.com', \
4013dbf2 108 #'partner.intra2net.com', has same IP as intra2net.com --> trouble in Tester._host_for_ip \
3daedf04
CH
109 'demo-securitygateway.intra2net.com', \
110 'demo-networksecurity.intra2net.com', \
111 )
112
113def create_conf_file(file_prefix):
114 """ create config file using constants above """
115
116 file_name = file_prefix + '_config.conf'
117
118 n_hosts = len(host_list)
119
120 with open(file_name, 'wt') as file_handle:
121 file_handle.write(conf_start.format(hosts_down_limit=int(n_hosts * 0.7), status='{status}'))
122 for host in host_list:
123 file_handle.write(conf_host.format(host, PING_INTERVAL))
124
3bd8a585 125 return file_name, n_hosts
3daedf04 126#end: create_conf_file
aaf62c14 127
aaf62c14 128
3daedf04 129########################################################################
3bd8a585
CH
130### Exceptions
131########################################################################
132
133def flags_to_str(flags):
134 text_parts = []
135 if flags & select.POLLIN:
136 text_parts.append('IN')
137 if flags & select.POLLOUT:
138 text_parts.append('OUT')
139 if flags & select.POLLPRI:
140 text_parts.append('PRI')
141 if flags & select.POLLERR:
142 text_parts.append('ERR')
143 if flags & select.POLLHUP:
144 text_parts.append('HUP')
145 if flags & select.POLLNVAL:
146 text_parts.append('INVALID')
147 return ','.join(text_parts)
148
149class PollFlagsException(Exception):
150 def __init__(self, flags, proc_name):
151 super(PollFlagsException, self).__init__('Unexpected flags from polling {0}: {1}'.format(\
152 proc_name, flags_to_str(flags)))
153
154class PollHupException(Exception):
155 def __init__(self, proc_name):
156 super(PollHupException, self).__init__(\
157 'Received HUP from polling, {0} was probably killed!'.format(proc_name))
158
159class TooManyWarnings(Exception):
160 def __init__(self):
161 super(TooManyWarnings, self).__init__('Too many warnings / failures!')
162
163########################################################################
3daedf04
CH
164### PARSER
165########################################################################
aaf62c14
CH
166
167class ProcOutputParser:
3daedf04
CH
168 """ parser for output from pingcheck process via pipes
169
170 uses select and polling
171 should never silently swallow log output, main function parse should yield all lines
172 or emit errors
173 """
aaf62c14
CH
174
175 _out = None
176 _err = None
177 _poller = None
3bd8a585 178 _flags = select.POLLIN | select.POLLPRI | select.POLLHUP # | select.POLLERR
aaf62c14
CH
179 _regexp = '(.+)\s+(INFO|ERR|WARN|DBUG|NOTE)\s+pingcheck\[\d+\]: (.+)\s*'
180 _out_fileno = None
181 _err_fileno = None
182
183 SRC_OUT = 'stdout'
184 SRC_ERR = 'stderr'
185 SRC_UNK = 'unknown'
186
187 def __init__(self, pipe_out, pipe_err):
188 self._out = pipe_out
189 self._err = pipe_err
190 self._poller = select.poll()
191 self._poller.register(pipe_out, self._flags)
192 self._poller.register(pipe_err, self._flags)
193 self._out_fileno = pipe_out.fileno()
194 self._err_fileno = pipe_err.fileno()
195
196 def _read(self):
197 while True:
198 result = self._poller.poll(10)
199 for file_no, flags in result:
200 if not (flags & self._flags):
3bd8a585
CH
201 raise PollFlagsException(flags, 'pingcheck')
202
203 if flags & select.POLLHUP:
204 raise PollHupException('pingcheck')
aaf62c14
CH
205
206 if file_no == self._out_fileno:
207 src = self.SRC_OUT
208 line = self._out.readline()
209 elif file_no == self._err_fileno:
210 src = self.SRC_ERR
211 line = self._err.readline()
212 else:
213 src = self.SRC_UNK
214 line = ''
215
216 if not line:
217 continue
3daedf04 218 # if reach here, we have a new line of text
aaf62c14 219
3daedf04 220 yield src, line.strip()
aaf62c14
CH
221 #end: ProcOutputParser._read
222
aaf62c14 223 def parse(self):
3daedf04
CH
224 for src, line in self._read():
225 try:
3bd8a585
CH
226 timestamp, level, message = regexp(self._regexp, line).groups()
227 yield src, (timestamp, text_to_level[level], message)
3daedf04
CH
228 except:
229 yield src, line
aaf62c14
CH
230
231 def __iter__(self):
232 return self.parse()
233
aaf62c14
CH
234#end: class ProcOutputParser
235
3daedf04
CH
236########################################################################
237### TESTER
238########################################################################
239
aaf62c14
CH
240class Tester:
241 """ """
242
243 _line_iter = None
244 _last_request = None
245 _last_reply = None
246 _n_requests = None
247 _n_replies = None
4013dbf2 248 _host_for_ip = None
aaf62c14
CH
249 _next_state_change = None
250 _tell_connd_bin = None
251 _prev_state_changes = None
252 _fails = None
3daedf04
CH
253 _log = None
254 _log_ping = None
255 max_fails = None
aaf62c14 256
3daedf04 257 def __init__(self, line_iter, tell_connd_bin, max_fails):
aaf62c14
CH
258 self._line_iter = line_iter
259 self._last_request = {}
260 self._last_reply = {}
261 self._n_requests = {}
262 self._n_replies = {}
4013dbf2 263 self._host_for_ip = {}
aaf62c14
CH
264 self._tell_connd_bin = tell_connd_bin
265 self._prev_state_changes = []
266 self._fails = []
86ab1571 267 self.max_fails = max_fails + 2 # for _test_the_tester
3daedf04
CH
268
269 self._log = logger_base.getChild('Tester')
270 self._log_ping = {}
271 self._log_ping[ProcOutputParser.SRC_OUT] = logger_base.getChild('pingOut')
272 self._log_ping[ProcOutputParser.SRC_ERR] = logger_base.getChild('pingErr')
aaf62c14
CH
273
274
275 def test(self):
3daedf04
CH
276 # preparations
277 current_state = self._is_online()
278 while current_state is None:
279 sleep(1)
280 current_state = self._is_online()
281 self._log.info('Start with state online = {0}'.format(current_state))
3bd8a585
CH
282 self._prev_state_changes.append((current_state, dt.now()))
283 self._schedule_next_state_change()
3daedf04 284
86ab1571
CH
285 self._test_the_tester()
286
3daedf04
CH
287 # start looping over output from observed process
288 for src, text_info in self._line_iter:
86ab1571
CH
289
290 # first of all: write to pinger log so nothing is lost
291 if isinstance(text_info, str): # ProcOutputParser could not parse
3daedf04
CH
292 msg = 'unparsed: {0}'.format(text_info)
293 if text_info.startswith("RECV BUF ("):
4013dbf2 294 self._log_ping[src].debug(msg)
3daedf04 295 else:
4013dbf2
CH
296 self._log_ping[src].warning(msg)
297 continue # no need to parse message
3daedf04
CH
298 else:
299 text_time, level, message = text_info
3bd8a585 300 msg = '{0} {1}'.format(level, message)
aaf62c14 301
4013dbf2
CH
302 if level in (LEVEL_WARN, LEVEL_ERROR, LEVEL_NOTICE):
303 # most common warning/error is only logged at debug level
304 if message.endswith('could not be resolved.'):
305 self._log_ping[src].debug(msg)
306 continue
307 elif message.endswith('fail sending ping data. Network is unreachable'):
308 self._log_ping[src].debug(msg)
309 continue
310 elif message.endswith('fail sending ping data. Operation not permitted'):
311 self._log_ping[src].debug(msg)
312 continue
313 else: # unexpected errors and warnings from pingcheck are logged at info level
314 self._log_ping[src].info(msg)
315 # no continue --> try analyze
316 else: # normal messages are logged at debug level
3bd8a585 317 self._log_ping[src].debug(msg)
4013dbf2 318 # no continue --> try analyze
3daedf04
CH
319
320 # now deal with message contents
aaf62c14
CH
321 if self._handle_request(message):
322 continue
323 if self._handle_reply(message):
324 continue
325 if self._handle_change_state():
326 continue
3daedf04
CH
327 if self._handle_new_ip(message):
328 continue
329 #end: for all lines from pingcheck output
aaf62c14
CH
330 #end: Tester.test
331
86ab1571
CH
332 def _test_the_tester(self):
333 self._warn("this is a test warning without exception information")
334 try:
335 raise NotImplementedError('Test Exception')
336 except:
337 self._warn("this is a test warning including exception information", exception=True)
338 self._log.info('you should have seen 2 failures above -- only be alarmed if not!')
339
aaf62c14
CH
340
341 def _handle_new_ip(self, message):
a85f210b 342 # "Have IP 11.22.33.44 [61963s] for host.domain.com"
4013dbf2 343 matches = regexp('Have IP (\d+\.\d+\.\d+\.\d+) \[\d+s\] for (.+)', message)
aaf62c14 344 if matches:
4013dbf2
CH
345 ip,host_name = matches.groups()
346 else:
347 matches = regexp('Host is already an IP: (\d+\.\d+\.\d+\.\d+)', message)
348 if matches:
349 host_name = ip = matches.groups()[0]
350 else:
351 return False
352 # now have host_name and ip
353
354 self._log.info('remember new IP {0} for {1}'.format(ip, host_name))
355 self._host_for_ip[ip] = host_name
356 self._last_request[host_name] = dt.now()
357 self._last_reply[host_name] = dt.now()
358 self._n_requests[host_name] = 0
359 self._n_replies[host_name] = 0
360 return True
aaf62c14 361
3daedf04 362
aaf62c14
CH
363 def _handle_request(self, message):
364 matches = regexp('(\d+\.\d+\.\d+\.\d+): sending ping', message)
365 if matches is not None:
366 ip = matches.groups()[0]
4013dbf2
CH
367 host_name = self._host_for_ip[ip]
368 self._log.info('pinging host {0} ({1})'.format(host_name, ip))
369 self._last_request[host_name] = dt.now()
370 self._n_requests[host_name] += 1
aaf62c14
CH
371 return True
372 return False
373
3daedf04 374
aaf62c14
CH
375 def _handle_reply(self, message):
376 matches = regexp('(\d+\.\d+\.\d+\.\d+): Received reply', message)
377 if matches is not None:
378 ip = matches.groups()[0]
4013dbf2
CH
379 host_name = self._host_for_ip[ip]
380 self._log.info('received reply from {0} ({1})'.format(host_name, ip))
381 self._last_reply[host_name] = dt.now()
382 self._n_replies[host_name] += 1
aaf62c14
CH
383 return True
384 return False
385
aaf62c14 386
3daedf04 387 def _handle_change_state(self):
86ab1571
CH
388 if dt.now() < self._next_state_change:
389 return False
390
391 is_online = self._is_online()
392 if is_online is None:
393 return False # try again later
394
395 expect_online = self._prev_state_changes[-1][0]
396 if expect_online != is_online:
397 self._warn('Expect to be online={0} since {1} but are online={2}!'.format(\
398 expect_online, self._prev_state_changes[-1][1], is_online))
399
400 if is_online:
401 self._ensure_all_pinged()
402 new_val = False
403 else:
404 new_val = True
405 if self._set_online(new_val):
406 self._schedule_next_state_change()
407 return True
408 # if call to tell_connd fails, try again later
aaf62c14
CH
409 #end: _handle_change_state
410
411
3bd8a585 412 def _schedule_next_state_change(self):
3daedf04
CH
413 n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)
414 self._next_state_change = dt.now() + timedelta(seconds=n_secs)
415 self._log.info('schedule next state change for {0:%H:%M:%S} (in {1:d}s)'.format(\
416 self._next_state_change, n_secs))
417 #end: Tester._schedule_next_state_change
418
419
aaf62c14 420 def _is_online(self):
3bd8a585
CH
421 """ determine state using ConndState """
422 state = ConndState.get_state(self._tell_connd_bin)
423 if not isinstance(state, ConndState):
424 err_code, output_lines = state
425 self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(\
426 err_code, output_lines[0]))
aaf62c14
CH
427 return None
428
3bd8a585 429 return len(state.connections) > 0
aaf62c14
CH
430 #end: Tester._is_online
431
432
aaf62c14 433 def _set_online(self, new_val):
3bd8a585
CH
434 """ run tell-connd --online/offline P1
435
436 returns true if afterwards are in requested state
437 """
438 if new_val == self._is_online():
439 self._warn('Want to set online state to current state!')
440 return True
aaf62c14 441 if new_val:
3daedf04 442 self._log.info('going online')
aaf62c14
CH
443 args = [self._tell_connd_bin, '--online', 'P1']
444 else:
3daedf04 445 self._log.info('going offline')
aaf62c14 446 args = [self._tell_connd_bin, '--offline']
3bd8a585 447
aaf62c14
CH
448 try:
449 return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1)
450 if return_code == 0:
3bd8a585 451 self._prev_state_changes.append((new_val, dt.now()))
aaf62c14
CH
452 return True
453 else:
3bd8a585
CH
454 self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(\
455 return_code))
aaf62c14
CH
456 return False
457 except subprocess.TimeoutExpired as te:
3daedf04 458 self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output))
aaf62c14
CH
459 return False
460 except:
86ab1571 461 self._warn('Trouble calling tell-connd', exception=True)
aaf62c14
CH
462 return False
463 #end: Tester._set_online
464
465
466 def _ensure_all_pinged(self):
467 """ called before going offline; check if all hosts were successfully pinged """
468 # determine when we last went online
469 time_went_online = None
470 time_go_offline = dt.now()
471
472 for went_online, change_time in reversed(self._prev_state_changes):
473 if went_online:
474 time_went_online = change_time
475 break
476 if time_went_online is None:
477 # never been online --> nothing to check yet
478 return
479
3bd8a585
CH
480 self._log.info('checking whether all hosts were pinged between '
481 + '{0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
482 time_went_online, time_go_offline))
4013dbf2 483 for host_name, reply_time in self._last_reply.items():
aaf62c14 484 if reply_time < time_went_online:
3bd8a585
CH
485 self._log.warning(\
486 'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
4013dbf2
CH
487 host_name, time_went_online, time_go_offline))
488 self._fails.append((host_name, time_went_online, time_go_offline))
3daedf04
CH
489 self._log.info('remembered as failure number {0} / {1}'.format(\
490 len(self._fails), self.max_fails))
491
492 # check if there were too many fails; if so, stop
493 if len(self._fails) > self.max_fails:
494 self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
3bd8a585 495 raise TooManyWarnings()
3daedf04
CH
496 #end: if not pinged recently
497 #end: for all known ips
aaf62c14
CH
498 #end: Tester._ensure_all_pinged
499
500
86ab1571 501 def _warn(self, message, exception=False):
3daedf04 502
86ab1571
CH
503 if exception:
504 self._log.exception(message, exc_info=True)
505 else:
506 self._log.warning(message)
3daedf04
CH
507 self._fails.append(("WARN", dt.now(), message))
508 self._log.info('remembered as failure number {0} / {1}'.format(\
509 len(self._fails), self.max_fails))
aaf62c14 510
3daedf04
CH
511 # check if there were too many fails; if so, stop
512 if len(self._fails) > self.max_fails:
513 self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
3bd8a585 514 raise TooManyWarnings()
3daedf04 515 #end: Tester._warn
aaf62c14 516
3daedf04 517
3daedf04 518 def print_failures(self, log):
aaf62c14 519 if len(self._fails) == 0:
86ab1571
CH
520 log.info('No warnings nor failures since {0:%d.%m.%y, %H:%M} :-)'.format(\
521 self._prev_state_changes[0][1]))
aaf62c14 522 else:
3daedf04 523 log.info('Recorded the following failures and warnings:')
4013dbf2
CH
524 for host_name, time_went_online, time_go_offline in self._fails:
525 if host_name == 'WARN':
3daedf04 526 log.info('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline))
aaf62c14 527 else:
3bd8a585 528 log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3:.2f}s)'\
4013dbf2 529 .format(host_name, time_went_online, time_go_offline, \
3bd8a585 530 (time_go_offline-time_went_online).total_seconds()))
86ab1571
CH
531
532 #end: Tester.print_failures
533
534 def save_state_and_history(self, save_file):
535 """ save all other data for later inspection using show_saved_data """
536 try:
537 with open(save_file, 'wb') as file_handle:
538 self._log.info('saving state and history to {0}'.format(save_file))
539 pickle.dump( (self._last_request, self._last_reply, self._n_requests, self._n_replies, \
540 self._host_for_ip, self._tell_connd_bin, self._prev_state_changes, \
541 self._fails, self.max_fails, save_file, dt.now()), \
542 file_handle)
543 return True
544 except Exception as e:
545 self._log.exception("Failed to save!", exc_info=True)
546 return False
547 #end: Tester.save_state_and_history
548
549
550 @staticmethod
551 def show_saved_data(save_file):
552 """ print info on data saved by save_state_and_history """
553
554 # load
555 save_date = None
556 with open(save_file, 'rb') as file_handle:
557 print('loading state and history from {0}'.format(save_file))
558 last_request, last_reply, n_requests, n_replies, host_for_ip, tell_connd_bin, \
559 state_changes, fails, max_fails, save_file_again, save_date \
560 = pickle.load(file_handle)
561 if save_date is None:
562 print('Failed to read state and history from {0}'.format(save_file))
563 return 1
564
565 # simple stats
566 first_change = state_changes[0][1]
567 last_change = state_changes[-1][1]
568 print('Performed {0} state changes between {1:%d.%m,%H:%M} and {2:%d.%m,%H:%M}' \
569 .format(len(state_changes), first_change, last_change), end='')
570 print(' ({0} --> one every {1} on average)'.format( \
571 strftimedelta( last_change - first_change ), \
572 strftimedelta((last_change - first_change) / len(state_changes)) ))
573 print('Used {0} IPs for {1} hosts'.format(len(host_for_ip), len(last_request)))
574 print('Overall, recorded {0} warnings/failures (max={1})'.format(len(fails), max_fails))
575
576 # sort fails into warnings and fail per host and fail per time
577 warnings = []
578 n_test_warnings = 0
579 fail_per_host = {}
580 fail_per_time = {}
581 n_fails = 0
582 for host, time_online, time_offline in fails:
583 if host == 'WARN':
584 # time_online = time, time_offline=message
585 if 'test warning' in time_offline:
586 n_test_warnings += 1
587 else:
588 warnings.append((time_online, time_offline))
589 else:
590 n_fails += 1
591 try:
592 fail_per_host[host].append(time_online)
593 except KeyError:
594 fail_per_host[host] = [time_online,]
595 duration = time_offline - time_online
596 try:
597 fail_per_time[time_online, duration].append(host)
598 except KeyError:
599 fail_per_time[time_online, duration] = [host, ]
600
601 # stats per host
602 for host, n_req in n_requests.items():
603 print(host + ' (', end='')
604 print(','.join( ip for ip, host_for_ip in host_for_ip.items() if host==host_for_ip ), end='')
605 print('): \treplied to {1} of {0} pings ({2:.1f}%); '.format(\
606 n_req, n_replies[host], 100. * n_replies[host]/n_req), end='')
607 if host in fail_per_host:
608 print('failed to ping at ', end='')
609 print(','.join( '{0:%H:%M},'.format(fail_time) for fail_time in fail_per_host[host] ))
610 else:
611 print('replied in each test interval')
612
613 # stats per time
614 if fail_per_time:
615 print("Ping failures per time (overall {0} fails in {1} intervals):".format(\
616 n_fails, len(fail_per_time)))
617 for (start, duration), host_list in fail_per_time.items():
618 print('{0:%d.%m,%H:%M} + {1}: '.format(start, strftimedelta(duration)), end='')
619 print(','.join(host_list))
620
621 # warnings
622 if warnings:
623 if n_test_warnings:
624 print("Warnings (not showing {0} test warnings):".format(n_test_warnings))
625 else:
626 print("Warnings:")
627 for time, message in warnings:
628 print("{0:%d.%m,%H:%M}: {1}".format(time, message))
629 elif n_test_warnings:
630 print('Only warnings were {0} for testing'.format(n_test_warnings))
631
632 # done
633 return 0
634 #end: static Tester.show_saved_data
635
aaf62c14
CH
636#end: class Tester
637
638########################################################################
639### MAIN
640########################################################################
641
de9d4d4c
CH
642def test_version(pingcheck_binary):
643 """ run pingcheck --version; we want > 0.1 for proper IP parsing
4013dbf2 644
de9d4d4c
CH
645 no try-catch here, also intended as test-run for pingcheck binary
646 """
647
648 output = subprocess.check_output([pingcheck_binary, '--version'], universal_newlines=True)
649 match = regexp('\s*pingcheck version (\d+\.\d+)\s*', output)
650 if match is None:
651 raise ValueError('Could not determine pingcheck version from output of {0} --version!'\
652 .format(pingcheck_binary))
653 version = float(match.groups()[0])
654 if version < 0.2:
655 raise ValueError('Pingcheck version is {0} but require version > 0.1!'.format(version))
4013dbf2
CH
656
657#end: test_version
658
659
aaf62c14
CH
660def parse_args():
661 """ parse command line arguments and set defaults """
662 from argparse import ArgumentParser
663 parser = ArgumentParser(description="Long-term testing of pingcheck: goes online and offline " \
664 + "using tell-connd and checks log for correct output", \
665 epilog="(c) Christian Herdtweck, Intra2net AG, 2015")
666
4c50ece1
CH
667 # option to show saved data
668 parser.add_argument('--show-saved-data', '-s', metavar='SAVE_FILE.pickle', \
669 default='', type=str, \
670 help='Do not run test but instead print info saved by earlier test run')
671
aaf62c14 672 # binaries
3bd8a585 673 parser.add_argument('--pingcheck-binary', '-p', metavar='PATH', \
aaf62c14
CH
674 default=default_pingcheck_bin, \
675 help='path to pingcheck binary')
3bd8a585 676 parser.add_argument('--tell-connd-binary', '-t', metavar='PATH', \
aaf62c14
CH
677 default=default_tell_connd_bin, \
678 help='path to tell-connd binary')
679
3daedf04 680 # output
3bd8a585 681 parser.add_argument('--output-prefix', '-o', metavar='PATH', \
3daedf04
CH
682 default=default_output_prefix, \
683 help='path and/or file prefix to config and log files; ~=home, 0=prog start datetime; ' \
684 + 'default="%(default)s"')
3bd8a585 685 parser.add_argument('--log-to-console', '-c', metavar='0|1', \
3daedf04
CH
686 choices=(0, 1), type=int, default=1, \
687 help='print all output to console? default=%(default)s')
3bd8a585 688 parser.add_argument('--log-level-console', metavar='LEVEL', \
3daedf04 689 type=int, default=logging.INFO, \
3bd8a585
CH
690 help='log level for console output; ' \
691 + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
3daedf04 692 logging.DEBUG, logging.INFO, logging.WARN))
3bd8a585 693 parser.add_argument('--log-to-file', '-f', metavar='0|1', \
3daedf04
CH
694 choices=(0, 1), type=int, default=1, \
695 help='print all output to log file? default=%(default)s')
3bd8a585 696 parser.add_argument('--log-level-file', metavar='LEVEL', \
3daedf04 697 type=int, default=logging.DEBUG, \
3bd8a585
CH
698 help='log level for console output; ' \
699 + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
3daedf04 700 logging.DEBUG, logging.INFO, logging.WARN))
3bd8a585 701 parser.add_argument('--log-file-suffix', '-l', metavar='SUFF.IX', \
3daedf04
CH
702 type=str, default='.log', \
703 help='2nd part of log file name after --output_prefix; default="%(default)s"')
3bd8a585
CH
704 parser.add_argument('--pinger-log-level', metavar='LEVEL', \
705 type=str, default='INFO', \
706 help='log level given to pingchecker, DEBUG might quickly fill log; default=%(default)s')
3daedf04 707
3bd8a585
CH
708
709 parser.add_argument('--max-fails-per-host', '-m', metavar='N', type=int, default=10, \
710 help='max number of failures/warnings per host before test will end; default: %(default)s')
aaf62c14 711 parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version))
3daedf04
CH
712
713 # parse
714 opts = parser.parse_args()
715 opts.output_prefix = expanduser(opts.output_prefix).format(dt.now())
716 return opts
aaf62c14
CH
717#end: parse_args
718
719
3daedf04
CH
720def setup_logging(opts):
721 """ create main logger and handlers for base logger """
722
723 # create logger for main
724 log = logger_base.getChild('main')
725
726 # remove all existing handlers
727 for old_handler in logger_base.handlers:
728 print('removing handler from base')
729 logger_base.removeHandler(old_handler)
730 for old_handler in log.handlers:
731 print('removing handler from log')
732 log.removeHandler(old_handler)
733
734 logger_base.setLevel(logging.DEBUG)
735
736 # create handlers and format for base logger
737 log_file = None
86ab1571
CH
738 formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s', \
739 datefmt='%d.%m,%H:%M:%S')
3daedf04
CH
740 if opts.log_to_console:
741 handler = logging.StreamHandler(sys.stdout)
742 handler.setLevel(opts.log_level_console)
3daedf04
CH
743 handler.setFormatter(formatter)
744 logger_base.addHandler(handler)
745
746 if opts.log_to_file:
747 log_file = opts.output_prefix + opts.log_file_suffix
748 handler = logging.FileHandler(log_file)
749 handler.setLevel(opts.log_level_file)
3daedf04
CH
750 handler.setFormatter(formatter)
751 logger_base.addHandler(handler)
752
753 # log info
754 log.info('Log for {0} in {1} created on {2}'.format(__name__, __file__, dt.now()))
755 if opts.log_to_console:
756 log.info('logging to console at level {0}'.format(opts.log_level_console))
757 else:
758 log.info('not logging to console')
759 if opts.log_to_file:
760 log.info('logging to file {1} at level {0}'.format(opts.log_level_file, log_file))
761 else:
762 log.info('not logging to file')
763
764 # return log_file only if makes sense to talk about it in main
765 if opts.log_to_console:
766 return log, log_file
767 else:
768 return log, None
769#end: setup_logging
770
771
4c50ece1 772def handle_sigchild(signum, curr_frame, log, cleanup_args):
3bd8a585
CH
773 log.error('Received SIGCHLD')
774 log.info('with args {0} and {1}'.format(signum, curr_frame))
775
4c50ece1 776 cleanup(*cleanup_args)
3bd8a585
CH
777
778
4c50ece1 779def cleanup(log, pinger, tester, line_iter, save_file):
3bd8a585
CH
780 """ the stuff from finally clause in main loop; also called from handle_sigchild """
781
782 return_code = 255
783
784 log.info('-' * 72)
785 if pinger is None:
786 log.warning('Error before pinger was created')
787 return_code = 254
788 else:
789 log.info("Cleaning up")
790
3bd8a585
CH
791 try:
792 log.info('telling pinger to terminate')
793 pinger.terminate()
794
d1df5eda
CH
795 for line_idx, line in enumerate(line_iter):
796 log.info('left-over output: {0}'.format(line))
797 if line_idx > 100:
798 log.warning('ignore further left-over output')
799 break
800
801 #return_code = pinger.wait(3) can block!
3bd8a585
CH
802 outs, errs = pinger.communicate(timeout=3)
803 for line in outs.splitlines():
804 log.info('left-over output: {0}'.format(line))
805 for line in errs.splitlines():
806 log.info('left-over output: {0}'.format(line))
807
808 return_code = pinger.returncode # may still be None
809 log.info('pinger terminated with return code {0}'.format(return_code))
810 except subprocess.TimeoutExpired as te:
811 log.warning('Timeout expired waiting for pinger to terminate: {0}'.format(te))
812 return_code = 253
813 except:
86ab1571 814 log.exception('trouble terminating pinger', exc_info=True)
3bd8a585
CH
815
816 if return_code is None:
d1df5eda 817 for line_idx, line in enumerate(line_iter):
3bd8a585 818 log.info('left-over output: {0}'.format(line))
d1df5eda
CH
819 if line_idx > 100:
820 log.warning('ignore further left-over output')
821 break
3bd8a585
CH
822
823 # kill it
824 try:
825 log.info('killing pinger the hard way')
826 pinger.kill()
827 outs, errs = pinger.communicate(timeout=3)
828 for line in outs.splitlines():
829 log.info('left-over output: {0}'.format(line))
830 for line in errs.splitlines():
831 log.info('left-over output: {0}'.format(line))
832
833 except:
86ab1571 834 log.exception('trouble killing pinger', exc_info=True)
3bd8a585
CH
835
836 if tester is not None:
86ab1571 837 tester.save_state_and_history(save_file)
3bd8a585
CH
838 tester.print_failures(log)
839 #end: if have pinger
840
841 # reset original signal handler
842 #signal.signal(signal.SIGCHLD, signal.SIG_DFL)
843
844 return return_code
845#end: cleanup
846
847
aaf62c14 848def main():
3daedf04 849 """ parses args, create a Pinger and Parser and Tester and runs them; cleans up
aaf62c14
CH
850 """
851
852 opts = parse_args()
853
4c50ece1
CH
854 if opts.show_saved_data:
855 return Tester.show_saved_data(opts.show_saved_data)
856
3daedf04
CH
857 base_dir = dirname(opts.output_prefix + "_test")
858 if not isdir(base_dir):
859 mkdir(base_dir)
860
861 log, log_file = setup_logging(opts)
862
aaf62c14 863 pinger = None
3bd8a585 864 return_code = 255
aaf62c14 865 tester = None
3bd8a585 866 line_iter = None
aaf62c14 867 try:
3bd8a585 868 config_file, n_hosts = create_conf_file(opts.output_prefix)
4c50ece1 869 save_file = opts.output_prefix + '_save.pickle'
3daedf04
CH
870 log.info('created config file {0}'.format(config_file))
871
de9d4d4c
CH
872 # test version
873 test_version(opts.pingcheck_binary)
874
3bd8a585 875 #signal.signal(signal.SIGCHLD, \
4c50ece1
CH
876 # lambda signum, curr_frame: handle_sigchild(signum, curr_frame, log, \
877 # (log, pinger, tester, line_iter, save_file) )
3bd8a585 878
3daedf04 879 # pingcheck subprocess
3bd8a585 880 pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', opts.pinger_log_level, \
3daedf04
CH
881 '--log-output', 'TERMINAL', '--config-file', config_file], \
882 stdout=subprocess.PIPE, stderr=subprocess.PIPE, \
883 universal_newlines=True)
884 log.info('started pingcheck with PID {0}'.format(pinger.pid))
885
aaf62c14
CH
886 parser = ProcOutputParser(pinger.stdout, pinger.stderr)
887 line_iter = iter(parser)
3bd8a585
CH
888 tester = Tester(line_iter, opts.tell_connd_binary, max(10, opts.max_fails_per_host*n_hosts))
889
aaf62c14
CH
890 tester.test()
891 except KeyboardInterrupt:
3daedf04 892 log.info('stopped by user')
aaf62c14 893 except:
86ab1571 894 log.exception('Caught an exception in main loop', exc_info=True)
3bd8a585 895 log.warning('Re-raise')
aaf62c14
CH
896 raise
897 finally:
86ab1571 898 return_code = cleanup(log, pinger, tester, line_iter, save_file)
3daedf04 899
3bd8a585
CH
900 log.info('Will stop now')
901 if log_file is not None:
902 log.info('might find more log output in {0}'.format(log_file))
903 log.info('-' * 72)
aaf62c14
CH
904 #end: global clean-up
905
aaf62c14
CH
906 return return_code
907
908#end: function main
909
910
911if __name__ == '__main__':
912 main()
3daedf04 913 logging.shutdown()
aaf62c14 914