added log output target UNDEFINED; return if no hosts defined
[pingcheck] / test / long_term_test.py
1 #!/usr/bin/env python3
2
3 """ A more long-term testing of pingcheck
4
5 Runs pingcheck parsing its log output; goes online and offline repeatedly to see if
6   pinging stays correct
7
8 see also: connd/stresstest_pingcheck.py
9
10 Christian Herdtweck, Intra2net, January 2015
11 (c) Intra2net AG 2015
12 """
13
14 # Version History
15 # 15/01/15 Christian Herdtweck: started creation
16 # 21/1/15 ch: running 3rd long-term test, seems to work nicely now
17
18 import subprocess
19 import sys
20 import select
21 from re import match as regexp
22 from time import sleep
23 from datetime import timedelta, datetime as dt
24 from warnings import warn
25 from random import randint
26 import logging
27 from os.path import expanduser, dirname, isdir
28 from os import mkdir, EX_OK
29 from connd_state import ConndState
30 #import signal
31 import pickle
32 from strftimedelta import strftimedelta
33
34 logger_base = logging.getLogger('long_term_test')
35
36 ########################################################################
37 ### CONSTANTS
38 ########################################################################
39
40 version = '1.1'
41 default_pingcheck_bin  = '/usr/intranator/bin/pingcheck'
42 default_tell_connd_bin = '/usr/intranator/bin/tell-connd'
43 default_output_prefix = './long_term_test/{0:%d%m%y_%H%M}'
44
45 # for parsing log messages
46 LEVEL_DEBUG  = 'dbug'
47 LEVEL_INFO   = 'info'
48 LEVEL_WARN   = 'warn'
49 LEVEL_NOTICE = 'note'
50 LEVEL_ERROR  = 'err '
51
52 text_to_level = {}
53 text_to_level['INFO'] = LEVEL_INFO
54 text_to_level['DBUG'] = LEVEL_DEBUG
55 text_to_level['WARN'] = LEVEL_WARN
56 text_to_level['NOTE'] = LEVEL_NOTICE
57 text_to_level['ERR']  = LEVEL_ERROR
58
59 # duration in which to remain online/offline
60 MIN_DURATION_SAME_STATE = 10   # [seconds]
61 MAX_DURATION_SAME_STATE = 50   # [seconds]
62
63 ########################################################################
64 #### for config file
65 ########################################################################
66
67 PING_INTERVAL = 3    # [seconds]
68 config_file_pattern = 'long_term_test_{0:%d%m%y_%H%M%S}.conf'
69 conf_start = """# config file for pingcheck testing
70
71 default-source-network-interface=eth1
72 nameserver=127.0.0.1
73 hosts-down-limit={hosts_down_limit:d}
74 ping-fail-limit=25
75 status-notifier-cmd=echo host status is ${status}
76 link-up-interval=2
77 ping-reply-timeout=3
78 """
79 conf_host = """
80 [host]
81 name={0}
82 interval={1:d}
83 ping-protocol=ICMPv4
84 port=80
85 source-network-interface=default
86 """
87 host_list = (\
88         'intranator.m.i2n', \
89         'intraroute.m.i2n', \
90         'websync.m.i2n', \
91         'aqua.m.i2n', \
92         'intratdsl.m.i2n', \
93         'beholder.m.i2n', \
94         'eagle.m.i2n', \
95         'osdist.m.i2n', \
96         'tokyo.m.i2n', \
97         'blizzard.m.i2n', \
98         'phonehelp.m.i2n', \
99         'virt-dmz.m.i2n', \
100         'virt01.m.i2n', \
101         'virt02.m.i2n', \
102         'tornado.n.i2n', \
103         'auth.m.i2n', \
104         'forum.intra2net.com', \
105         'update.intranator.com', \
106         'demo-intranator.intra2net.com', \
107         'intra2net.com', \
108         #'partner.intra2net.com', has same IP as intra2net.com --> trouble in Tester._host_for_ip \
109         'demo-securitygateway.intra2net.com', \
110         'demo-networksecurity.intra2net.com', \
111     )
112
113 def create_conf_file(file_prefix):
114     """ create config file using constants above """
115
116     file_name = file_prefix + '_config.conf'
117
118     n_hosts = len(host_list)
119
120     with open(file_name, 'wt') as file_handle:
121         file_handle.write(conf_start.format(hosts_down_limit=int(n_hosts * 0.7), status='{status}'))
122         for host in host_list:
123             file_handle.write(conf_host.format(host, PING_INTERVAL))
124
125     return file_name, n_hosts
126 #end: create_conf_file
127
128
129 ########################################################################
130 ### Exceptions
131 ########################################################################
132
133 def flags_to_str(flags):
134     text_parts = []
135     if flags & select.POLLIN:
136         text_parts.append('IN')
137     if flags & select.POLLOUT:
138         text_parts.append('OUT')
139     if flags & select.POLLPRI:
140         text_parts.append('PRI')
141     if flags & select.POLLERR:
142         text_parts.append('ERR')
143     if flags & select.POLLHUP:
144         text_parts.append('HUP')
145     if flags & select.POLLNVAL:
146         text_parts.append('INVALID')
147     return ','.join(text_parts)
148
149 class PollFlagsException(Exception):
150     def __init__(self, flags, proc_name):
151         super(PollFlagsException, self).__init__('Unexpected flags from polling {0}: {1}'.format(\
152                 proc_name, flags_to_str(flags)))
153
154 class PollHupException(Exception):
155     def __init__(self, proc_name):
156         super(PollHupException, self).__init__(\
157                 'Received HUP from polling, {0} was probably killed!'.format(proc_name))
158
159 class TooManyWarnings(Exception):
160     def __init__(self):
161         super(TooManyWarnings, self).__init__('Too many warnings / failures!')
162
163 ########################################################################
164 ### PARSER
165 ########################################################################
166
167 class ProcOutputParser:
168     """ parser for output from pingcheck process via pipes
169
170     uses select and polling
171     should never silently swallow log output, main function parse should yield all lines
172       or emit errors
173     """
174
175     _out = None
176     _err = None
177     _poller = None
178     _flags = select.POLLIN | select.POLLPRI | select.POLLHUP # | select.POLLERR
179     _regexp = '(.+)\s+(INFO|ERR|WARN|DBUG|NOTE)\s+pingcheck\[\d+\]: (.+)\s*'
180     _out_fileno = None
181     _err_fileno = None
182
183     SRC_OUT = 'stdout'
184     SRC_ERR = 'stderr'
185     SRC_UNK = 'unknown'
186
187     def __init__(self, pipe_out, pipe_err):
188         self._out = pipe_out
189         self._err = pipe_err
190         self._poller = select.poll()
191         self._poller.register(pipe_out, self._flags)
192         self._poller.register(pipe_err, self._flags)
193         self._out_fileno = pipe_out.fileno()
194         self._err_fileno = pipe_err.fileno()
195
196     def _read(self):
197         while True:
198             result = self._poller.poll(10)
199             for file_no, flags in result:
200                 if not (flags & self._flags):
201                     raise PollFlagsException(flags, 'pingcheck')
202
203                 if flags & select.POLLHUP:
204                     raise PollHupException('pingcheck')
205
206                 if file_no == self._out_fileno:
207                     src = self.SRC_OUT
208                     line = self._out.readline()
209                 elif file_no == self._err_fileno:
210                     src = self.SRC_ERR
211                     line = self._err.readline()
212                 else:
213                     src = self.SRC_UNK
214                     line = ''
215
216                 if not line:
217                     continue
218                 # if reach here, we have a new line of text
219
220                 yield src, line.strip()
221     #end: ProcOutputParser._read
222
223     def parse(self):
224         for src, line in self._read():
225             try:
226                 timestamp, level, message = regexp(self._regexp, line).groups()
227                 yield src, (timestamp, text_to_level[level], message)
228             except:
229                 yield src, line
230
231     def __iter__(self):
232         return self.parse()
233
234 #end: class ProcOutputParser
235
236 ########################################################################
237 ### TESTER
238 ########################################################################
239
240 class Tester:
241     """ """
242
243     _line_iter = None
244     _last_request = None
245     _last_reply = None
246     _n_requests = None
247     _n_replies = None
248     _host_for_ip = None
249     _next_state_change = None
250     _tell_connd_bin = None
251     _prev_state_changes = None
252     _fails = None
253     _log = None
254     _log_ping = None
255     max_fails = None
256
257     def __init__(self, line_iter, tell_connd_bin, max_fails):
258         self._line_iter = line_iter
259         self._last_request = {}
260         self._last_reply = {}
261         self._n_requests = {}
262         self._n_replies = {}
263         self._host_for_ip = {}
264         self._tell_connd_bin = tell_connd_bin
265         self._prev_state_changes = []
266         self._fails = []
267         self.max_fails = max_fails + 2   # for _test_the_tester
268
269         self._log = logger_base.getChild('Tester')
270         self._log_ping = {}
271         self._log_ping[ProcOutputParser.SRC_OUT] = logger_base.getChild('pingOut')
272         self._log_ping[ProcOutputParser.SRC_ERR] = logger_base.getChild('pingErr')
273
274
275     def test(self):
276         # preparations
277         current_state = self._is_online()
278         while current_state is None:
279             sleep(1)
280             current_state = self._is_online()
281         self._log.info('Start with state online = {0}'.format(current_state))
282         self._prev_state_changes.append((current_state, dt.now()))
283         self._schedule_next_state_change()
284
285         self._test_the_tester()
286
287         # start looping over output from observed process
288         for src, text_info in self._line_iter:
289
290             # first of all: write to pinger log so nothing is lost
291             if isinstance(text_info, str):       # ProcOutputParser could not parse
292                 msg = 'unparsed: {0}'.format(text_info)
293                 if text_info.startswith("RECV BUF ("):
294                     self._log_ping[src].debug(msg)
295                 else:
296                     self._log_ping[src].warning(msg)
297                 continue   # no need to parse message
298             else:
299                 text_time, level, message = text_info
300                 msg = '{0} {1}'.format(level, message)
301
302                 if level in (LEVEL_WARN, LEVEL_ERROR, LEVEL_NOTICE):
303                     # most common warning/error is only logged at debug level
304                     if message.endswith('could not be resolved.'):
305                         self._log_ping[src].debug(msg)
306                         continue
307                     elif message.endswith('fail sending ping data. Network is unreachable'):
308                         self._log_ping[src].debug(msg)
309                         continue
310                     elif message.endswith('fail sending ping data. Operation not permitted'):
311                         self._log_ping[src].debug(msg)
312                         continue
313                     else:   # unexpected errors and warnings from pingcheck are logged at info level
314                         self._log_ping[src].info(msg)
315                         # no continue --> try analyze
316                 else:   # normal messages are logged at debug level
317                     self._log_ping[src].debug(msg)
318                     # no continue --> try analyze
319
320             # now deal with message contents
321             if self._handle_request(message):
322                 continue
323             if self._handle_reply(message):
324                 continue
325             if self._handle_change_state():
326                 continue
327             if self._handle_new_ip(message):
328                 continue
329         #end: for all lines from pingcheck output
330     #end: Tester.test
331
332     def _test_the_tester(self):
333         self._warn("this is a test warning without exception information")
334         try:
335             raise NotImplementedError('Test Exception')
336         except:
337             self._warn("this is a test warning including exception information", exception=True)
338         self._log.info('you should have seen 2 failures above -- only be alarmed if not!')
339
340
341     def _handle_new_ip(self, message):
342         # "Have IP 11.22.33.44 [61963s] for host.domain.com"
343         matches = regexp('Have IP (\d+\.\d+\.\d+\.\d+) \[\d+s\] for (.+)', message)
344         if matches:
345             ip,host_name = matches.groups()
346         else:
347             matches = regexp('Host is already an IP: (\d+\.\d+\.\d+\.\d+)', message)
348             if matches:
349                 host_name = ip = matches.groups()[0]
350             else:
351                 return False
352         # now have host_name and ip
353
354         self._log.info('remember new IP {0} for {1}'.format(ip, host_name))
355         self._host_for_ip[ip] = host_name
356         self._last_request[host_name] = dt.now()
357         self._last_reply[host_name] = dt.now()
358         self._n_requests[host_name] = 0
359         self._n_replies[host_name] = 0
360         return True
361
362
363     def _handle_request(self, message):
364         matches = regexp('(\d+\.\d+\.\d+\.\d+): sending ping', message)
365         if matches is not None:
366             ip = matches.groups()[0]
367             host_name = self._host_for_ip[ip]
368             self._log.info('pinging host {0} ({1})'.format(host_name, ip))
369             self._last_request[host_name] = dt.now()
370             self._n_requests[host_name] += 1
371             return True
372         return False
373
374
375     def _handle_reply(self, message):
376         matches = regexp('(\d+\.\d+\.\d+\.\d+): Received reply', message)
377         if matches is not None:
378             ip = matches.groups()[0]
379             host_name = self._host_for_ip[ip]
380             self._log.info('received reply from {0} ({1})'.format(host_name, ip))
381             self._last_reply[host_name] = dt.now()
382             self._n_replies[host_name] += 1
383             return True
384         return False
385
386
387     def _handle_change_state(self):
388         if dt.now() < self._next_state_change:
389             return False
390
391         is_online = self._is_online()
392         if is_online is None:
393             return False   # try again later
394
395         expect_online = self._prev_state_changes[-1][0]
396         if expect_online != is_online:
397             self._warn('Expect to be online={0} since {1} but are online={2}!'.format(\
398                     expect_online, self._prev_state_changes[-1][1], is_online))
399
400         if is_online:
401             self._ensure_all_pinged()
402             new_val = False
403         else:
404             new_val = True
405         if self._set_online(new_val):
406             self._schedule_next_state_change()
407             return True
408         # if call to tell_connd fails, try again later
409     #end: _handle_change_state
410
411
412     def _schedule_next_state_change(self):
413         n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)
414         self._next_state_change = dt.now() + timedelta(seconds=n_secs)
415         self._log.info('schedule next state change for {0:%H:%M:%S} (in {1:d}s)'.format(\
416                 self._next_state_change, n_secs))
417     #end: Tester._schedule_next_state_change
418
419
420     def _is_online(self):
421         """ determine state using ConndState """
422         state = ConndState.get_state(self._tell_connd_bin)
423         if not isinstance(state, ConndState):
424             err_code, output_lines = state
425             self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(\
426                         err_code, output_lines[0]))
427             return None
428
429         return len(state.connections) > 0
430     #end: Tester._is_online
431
432
433     def _set_online(self, new_val):
434         """ run tell-connd --online/offline P1
435
436         returns true if afterwards are in requested state
437         """
438         if new_val == self._is_online():
439             self._warn('Want to set online state to current state!')
440             return True
441         if new_val:
442             self._log.info('going online')
443             args = [self._tell_connd_bin, '--online', 'P1']
444         else:
445             self._log.info('going offline')
446             args = [self._tell_connd_bin, '--offline']
447
448         try:
449             return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1)
450             if return_code == 0:
451                 self._prev_state_changes.append((new_val, dt.now()))
452                 return True
453             else:
454                 self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(\
455                                 return_code))
456                 return False
457         except subprocess.TimeoutExpired as te:
458             self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output))
459             return False
460         except:
461             self._warn('Trouble calling tell-connd', exception=True)
462             return False
463     #end: Tester._set_online
464
465
466     def _ensure_all_pinged(self):
467         """ called before going offline; check if all hosts were successfully pinged """
468         # determine when we last went online
469         time_went_online = None
470         time_go_offline = dt.now()
471
472         for went_online, change_time in reversed(self._prev_state_changes):
473             if went_online:
474                 time_went_online = change_time
475                 break
476         if time_went_online is None:
477             # never been online --> nothing to check yet
478             return
479
480         self._log.info('checking whether all hosts were pinged between '
481                      + '{0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
482                         time_went_online, time_go_offline))
483         for host_name, reply_time in self._last_reply.items():
484             if reply_time < time_went_online:
485                 self._log.warning(\
486                         'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
487                         host_name, time_went_online, time_go_offline))
488                 self._fails.append((host_name, time_went_online, time_go_offline))
489                 self._log.info('remembered as failure number {0} / {1}'.format(\
490                         len(self._fails), self.max_fails))
491
492                 # check if there were too many fails; if so, stop
493                 if len(self._fails) > self.max_fails:
494                     self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
495                     raise TooManyWarnings()
496             #end: if not pinged recently
497         #end: for all known ips
498     #end: Tester._ensure_all_pinged
499
500
501     def _warn(self, message, exception=False):
502
503         if exception:
504             self._log.exception(message, exc_info=True)
505         else:
506             self._log.warning(message)
507         self._fails.append(("WARN", dt.now(), message))
508         self._log.info('remembered as failure number {0} / {1}'.format(\
509                 len(self._fails), self.max_fails))
510
511         # check if there were too many fails; if so, stop
512         if len(self._fails) > self.max_fails:
513             self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
514             raise TooManyWarnings()
515     #end: Tester._warn
516
517
518     def print_failures(self, log):
519         if len(self._fails) == 0:
520             log.info('No warnings nor failures since {0:%d.%m.%y, %H:%M} :-)'.format(\
521                     self._prev_state_changes[0][1]))
522         else:
523             log.info('Recorded the following failures and warnings:')
524             for host_name, time_went_online, time_go_offline in self._fails:
525                 if host_name == 'WARN':
526                     log.info('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline))
527                 else:
528                     log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3:.2f}s)'\
529                             .format(host_name, time_went_online, time_go_offline, \
530                                     (time_go_offline-time_went_online).total_seconds()))
531
532     #end: Tester.print_failures
533
534     def save_state_and_history(self, save_file):
535         """ save all other data for later inspection using show_saved_data """
536         try:
537             with open(save_file, 'wb') as file_handle:
538                 self._log.info('saving state and history to {0}'.format(save_file))
539                 pickle.dump( (self._last_request, self._last_reply, self._n_requests, self._n_replies, \
540                               self._host_for_ip, self._tell_connd_bin, self._prev_state_changes, \
541                               self._fails, self.max_fails, save_file, dt.now()), \
542                             file_handle)
543             return True
544         except Exception as e:
545             self._log.exception("Failed to save!", exc_info=True)
546         return False
547     #end: Tester.save_state_and_history
548
549
550     @staticmethod
551     def show_saved_data(save_file):
552         """ print info on data saved by save_state_and_history """
553
554         # load
555         save_date = None
556         with open(save_file, 'rb') as file_handle:
557             print('loading state and history from {0}'.format(save_file))
558             last_request, last_reply, n_requests, n_replies, host_for_ip, tell_connd_bin, \
559                     state_changes, fails, max_fails, save_file_again, save_date \
560                         = pickle.load(file_handle)
561         if save_date is None:
562             print('Failed to read state and history from {0}'.format(save_file))
563             return 1
564
565         # simple stats
566         first_change = state_changes[0][1]
567         last_change = state_changes[-1][1]
568         print('Performed {0} state changes between {1:%d.%m,%H:%M} and {2:%d.%m,%H:%M}' \
569                 .format(len(state_changes), first_change, last_change), end='')
570         print(' ({0} --> one every {1} on average)'.format( \
571                         strftimedelta( last_change - first_change                      ), \
572                         strftimedelta((last_change - first_change) / len(state_changes)) ))
573         print('Used {0} IPs for {1} hosts'.format(len(host_for_ip), len(last_request)))
574         print('Overall, recorded {0} warnings/failures (max={1})'.format(len(fails), max_fails))
575
576         # sort fails into warnings and fail per host and fail per time
577         warnings = []
578         n_test_warnings = 0
579         fail_per_host = {}
580         fail_per_time = {}
581         n_fails = 0
582         for host, time_online, time_offline in fails:
583             if host == 'WARN':
584                 # time_online = time, time_offline=message
585                 if 'test warning' in time_offline:
586                     n_test_warnings += 1
587                 else:
588                     warnings.append((time_online, time_offline))
589             else:
590                 n_fails += 1
591                 try:
592                     fail_per_host[host].append(time_online)
593                 except KeyError:
594                     fail_per_host[host] = [time_online,]
595                 duration = time_offline - time_online
596                 try:
597                     fail_per_time[time_online, duration].append(host)
598                 except KeyError:
599                     fail_per_time[time_online, duration] = [host, ]
600
601         # stats per host
602         for host, n_req in n_requests.items():
603             print(host + ' (', end='')
604             print(','.join( ip for ip, host_for_ip in host_for_ip.items() if host==host_for_ip ), end='')
605             print('): \treplied to {1} of {0} pings ({2:.1f}%); '.format(\
606                     n_req, n_replies[host], 100. * n_replies[host]/n_req), end='')
607             if host in fail_per_host:
608                 print('failed to ping at ', end='')
609                 print(','.join( '{0:%H:%M},'.format(fail_time) for fail_time in fail_per_host[host] ))
610             else:
611                 print('replied in each test interval')
612
613         # stats per time
614         if fail_per_time:
615             print("Ping failures per time (overall {0} fails in {1} intervals):".format(\
616                     n_fails, len(fail_per_time)))
617             for (start, duration), host_list in fail_per_time.items():
618                 print('{0:%d.%m,%H:%M} + {1}: '.format(start, strftimedelta(duration)), end='')
619                 print(','.join(host_list))
620
621         # warnings
622         if warnings:
623             if n_test_warnings:
624                 print("Warnings (not showing {0} test warnings):".format(n_test_warnings))
625             else:
626                 print("Warnings:")
627             for time, message in warnings:
628                 print("{0:%d.%m,%H:%M}: {1}".format(time, message))
629         elif n_test_warnings:
630             print('Only warnings were {0} for testing'.format(n_test_warnings))
631
632         # done
633         return 0
634     #end: static Tester.show_saved_data
635
636 #end: class Tester
637
638 ########################################################################
639 ### MAIN
640 ########################################################################
641
642 def test_version(pingcheck_binary):
643     """ run pingcheck --version; we want > 0.1 for proper IP parsing
644
645     no try-catch here, also intended as test-run for pingcheck binary
646     """
647
648     output = subprocess.check_output([pingcheck_binary, '--version'], universal_newlines=True)
649     match = regexp('\s*pingcheck version (\d+\.\d+)\s*', output)
650     if match is None:
651         raise ValueError('Could not determine pingcheck version from output of {0} --version!'\
652                 .format(pingcheck_binary))
653     version = float(match.groups()[0])
654     if version < 0.2:
655         raise ValueError('Pingcheck version is {0} but require version > 0.1!'.format(version))
656
657 #end: test_version
658
659
660 def parse_args():
661     """ parse command line arguments and set defaults """
662     from argparse import ArgumentParser
663     parser = ArgumentParser(description="Long-term testing of pingcheck: goes online and offline " \
664                                       + "using tell-connd and checks log for correct output", \
665                             epilog="(c) Christian Herdtweck, Intra2net AG, 2015")
666
667     # option to show saved data
668     parser.add_argument('--show-saved-data', '-s', metavar='SAVE_FILE.pickle', \
669             default='', type=str, \
670             help='Do not run test but instead print info saved by earlier test run')
671
672     # binaries
673     parser.add_argument('--pingcheck-binary', '-p', metavar='PATH', \
674             default=default_pingcheck_bin, \
675             help='path to pingcheck binary')
676     parser.add_argument('--tell-connd-binary', '-t', metavar='PATH', \
677             default=default_tell_connd_bin, \
678             help='path to tell-connd binary')
679
680     # output
681     parser.add_argument('--output-prefix', '-o', metavar='PATH', \
682             default=default_output_prefix, \
683             help='path and/or file prefix to config and log files; ~=home, 0=prog start datetime; ' \
684                 + 'default="%(default)s"')
685     parser.add_argument('--log-to-console', '-c', metavar='0|1', \
686             choices=(0, 1), type=int, default=1, \
687             help='print all output to console? default=%(default)s')
688     parser.add_argument('--log-level-console', metavar='LEVEL', \
689             type=int, default=logging.INFO, \
690             help='log level for console output; ' \
691                + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
692                 logging.DEBUG, logging.INFO, logging.WARN))
693     parser.add_argument('--log-to-file', '-f', metavar='0|1', \
694             choices=(0, 1), type=int, default=1, \
695             help='print all output to log file? default=%(default)s')
696     parser.add_argument('--log-level-file', metavar='LEVEL', \
697             type=int, default=logging.DEBUG, \
698             help='log level for console output; ' \
699                + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
700                 logging.DEBUG, logging.INFO, logging.WARN))
701     parser.add_argument('--log-file-suffix', '-l', metavar='SUFF.IX', \
702             type=str, default='.log', \
703             help='2nd part of log file name after --output_prefix; default="%(default)s"')
704     parser.add_argument('--pinger-log-level', metavar='LEVEL', \
705             type=str, default='INFO', \
706             help='log level given to pingchecker, DEBUG might quickly fill log; default=%(default)s')
707
708
709     parser.add_argument('--max-fails-per-host', '-m', metavar='N', type=int, default=10, \
710             help='max number of failures/warnings per host before test will end; default: %(default)s')
711     parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version))
712
713     # parse
714     opts = parser.parse_args()
715     opts.output_prefix = expanduser(opts.output_prefix).format(dt.now())
716     return opts
717 #end: parse_args
718
719
720 def setup_logging(opts):
721     """ create main logger and handlers for base logger """
722
723     # create logger for main
724     log = logger_base.getChild('main')
725
726     # remove all existing handlers
727     for old_handler in logger_base.handlers:
728         print('removing handler from base')
729         logger_base.removeHandler(old_handler)
730     for old_handler in log.handlers:
731         print('removing handler from log')
732         log.removeHandler(old_handler)
733
734     logger_base.setLevel(logging.DEBUG)
735
736     # create handlers and format for base logger
737     log_file = None
738     formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s', \
739                                   datefmt='%d.%m,%H:%M:%S')
740     if opts.log_to_console:
741         handler = logging.StreamHandler(sys.stdout)
742         handler.setLevel(opts.log_level_console)
743         handler.setFormatter(formatter)
744         logger_base.addHandler(handler)
745
746     if opts.log_to_file:
747         log_file = opts.output_prefix + opts.log_file_suffix
748         handler = logging.FileHandler(log_file)
749         handler.setLevel(opts.log_level_file)
750         handler.setFormatter(formatter)
751         logger_base.addHandler(handler)
752
753     # log info
754     log.info('Log for {0} in {1} created on {2}'.format(__name__, __file__, dt.now()))
755     if opts.log_to_console:
756         log.info('logging to console at level {0}'.format(opts.log_level_console))
757     else:
758         log.info('not logging to console')
759     if opts.log_to_file:
760         log.info('logging to file {1} at level {0}'.format(opts.log_level_file, log_file))
761     else:
762         log.info('not logging to file')
763
764     # return log_file only if makes sense to talk about it in main
765     if opts.log_to_console:
766         return log, log_file
767     else:
768         return log, None
769 #end: setup_logging
770
771
772 def handle_sigchild(signum, curr_frame, log, cleanup_args):
773     log.error('Received SIGCHLD')
774     log.info('with args {0} and {1}'.format(signum, curr_frame))
775
776     cleanup(*cleanup_args)
777
778
779 def cleanup(log, pinger, tester, line_iter, save_file):
780     """ the stuff from finally clause in main loop; also called from handle_sigchild """
781
782     return_code = 255
783
784     log.info('-' * 72)
785     if pinger is None:
786         log.warning('Error before pinger was created')
787         return_code = 254
788     else:
789         log.info("Cleaning up")
790
791         try:
792             log.info('telling pinger to terminate')
793             pinger.terminate()
794
795             for line_idx, line in enumerate(line_iter):
796                 log.info('left-over output: {0}'.format(line))
797                 if line_idx > 100:
798                     log.warning('ignore further left-over output')
799                     break
800
801             #return_code = pinger.wait(3)  can block!
802             outs, errs = pinger.communicate(timeout=3)
803             for line in outs.splitlines():
804                 log.info('left-over output: {0}'.format(line))
805             for line in errs.splitlines():
806                 log.info('left-over output: {0}'.format(line))
807
808             return_code = pinger.returncode   # may still be None
809             log.info('pinger terminated with return code {0}'.format(return_code))
810         except subprocess.TimeoutExpired as te:
811             log.warning('Timeout expired waiting for pinger to terminate: {0}'.format(te))
812             return_code = 253
813         except:
814             log.exception('trouble terminating pinger', exc_info=True)
815
816         if return_code is None:
817             for line_idx, line in enumerate(line_iter):
818                 log.info('left-over output: {0}'.format(line))
819                 if line_idx > 100:
820                     log.warning('ignore further left-over output')
821                     break
822
823             # kill it
824             try:
825                 log.info('killing pinger the hard way')
826                 pinger.kill()
827                 outs, errs = pinger.communicate(timeout=3)
828                 for line in outs.splitlines():
829                     log.info('left-over output: {0}'.format(line))
830                 for line in errs.splitlines():
831                     log.info('left-over output: {0}'.format(line))
832
833             except:
834                 log.exception('trouble killing pinger', exc_info=True)
835
836         if tester is not None:
837             tester.save_state_and_history(save_file)
838             tester.print_failures(log)
839     #end: if have pinger
840
841     # reset original signal handler
842     #signal.signal(signal.SIGCHLD, signal.SIG_DFL)
843
844     return return_code
845 #end: cleanup
846
847
848 def main():
849     """ parses args, create a Pinger and Parser and Tester and runs them; cleans up
850     """
851
852     opts = parse_args()
853
854     if opts.show_saved_data:
855         return Tester.show_saved_data(opts.show_saved_data)
856
857     base_dir = dirname(opts.output_prefix + "_test")
858     if not isdir(base_dir):
859         mkdir(base_dir)
860
861     log, log_file = setup_logging(opts)
862
863     pinger = None
864     return_code = 255
865     tester = None
866     line_iter = None
867     try:
868         config_file, n_hosts = create_conf_file(opts.output_prefix)
869         save_file = opts.output_prefix + '_save.pickle'
870         log.info('created config file {0}'.format(config_file))
871
872         # test version
873         test_version(opts.pingcheck_binary)
874
875         #signal.signal(signal.SIGCHLD, \
876         #        lambda signum, curr_frame: handle_sigchild(signum, curr_frame, log, \
877         #           (log, pinger, tester, line_iter, save_file) )
878
879         # pingcheck subprocess
880         pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', opts.pinger_log_level, \
881                                      '--log-output', 'TERMINAL', '--config-file', config_file], \
882                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE, \
883                                    universal_newlines=True)
884         log.info('started pingcheck with PID {0}'.format(pinger.pid))
885
886         parser = ProcOutputParser(pinger.stdout, pinger.stderr)
887         line_iter = iter(parser)
888         tester = Tester(line_iter, opts.tell_connd_binary, max(10, opts.max_fails_per_host*n_hosts))
889
890         tester.test()
891     except KeyboardInterrupt:
892         log.info('stopped by user')
893     except:
894         log.exception('Caught an exception in main loop', exc_info=True)
895         log.warning('Re-raise')
896         raise
897     finally:
898         return_code = cleanup(log, pinger, tester, line_iter, save_file)
899
900         log.info('Will stop now')
901         if log_file is not None:
902             log.info('might find more log output in {0}'.format(log_file))
903         log.info('-' * 72)
904     #end: global clean-up
905
906     return return_code
907
908 #end: function main
909
910
911 if __name__ == '__main__':
912     main()
913     logging.shutdown()
914