moved tests
[pingcheck] / test / long_term_test.py
diff --git a/test/long_term_test.py b/test/long_term_test.py
deleted file mode 100755 (executable)
index 01b1d92..0000000
+++ /dev/null
@@ -1,914 +0,0 @@
-#!/usr/bin/env python3
-
-""" A more long-term testing of pingcheck
-
-Runs pingcheck parsing its log output; goes online and offline repeatedly to see if
-  pinging stays correct
-
-see also: connd/stresstest_pingcheck.py
-
-Christian Herdtweck, Intra2net, January 2015
-(c) Intra2net AG 2015
-"""
-
-# Version History
-# 15/01/15 Christian Herdtweck: started creation
-# 21/1/15 ch: running 3rd long-term test, seems to work nicely now
-
-import subprocess
-import sys
-import select
-from re import match as regexp
-from time import sleep
-from datetime import timedelta, datetime as dt
-from warnings import warn
-from random import randint
-import logging
-from os.path import expanduser, dirname, isdir
-from os import mkdir, EX_OK
-from connd_state import ConndState
-#import signal
-import pickle
-from strftimedelta import strftimedelta
-
-logger_base = logging.getLogger('long_term_test')
-
-########################################################################
-### CONSTANTS
-########################################################################
-
-version = '1.1'
-default_pingcheck_bin  = '/usr/intranator/bin/pingcheck'
-default_tell_connd_bin = '/usr/intranator/bin/tell-connd'
-default_output_prefix = './long_term_test/{0:%d%m%y_%H%M}'
-
-# for parsing log messages
-LEVEL_DEBUG  = 'dbug'
-LEVEL_INFO   = 'info'
-LEVEL_WARN   = 'warn'
-LEVEL_NOTICE = 'note'
-LEVEL_ERROR  = 'err '
-
-text_to_level = {}
-text_to_level['INFO'] = LEVEL_INFO
-text_to_level['DBUG'] = LEVEL_DEBUG
-text_to_level['WARN'] = LEVEL_WARN
-text_to_level['NOTE'] = LEVEL_NOTICE
-text_to_level['ERR']  = LEVEL_ERROR
-
-# duration in which to remain online/offline
-MIN_DURATION_SAME_STATE = 10   # [seconds]
-MAX_DURATION_SAME_STATE = 50   # [seconds]
-
-########################################################################
-#### for config file
-########################################################################
-
-PING_INTERVAL = 3    # [seconds]
-config_file_pattern = 'long_term_test_{0:%d%m%y_%H%M%S}.conf'
-conf_start = """# config file for pingcheck testing
-
-default-source-network-interface=eth1
-nameserver=127.0.0.1
-hosts-down-limit={hosts_down_limit:d}
-ping-fail-limit=25
-status-notifier-cmd=echo host status is ${status}
-link-up-interval=2
-ping-reply-timeout=3
-"""
-conf_host = """
-[host]
-name={0}
-interval={1:d}
-ping-protocol=ICMPv4
-port=80
-source-network-interface=default
-"""
-host_list = (\
-        'intranator.m.i2n', \
-        'intraroute.m.i2n', \
-        'websync.m.i2n', \
-        'aqua.m.i2n', \
-        'intratdsl.m.i2n', \
-        'beholder.m.i2n', \
-        'eagle.m.i2n', \
-        'osdist.m.i2n', \
-        'tokyo.m.i2n', \
-        'blizzard.m.i2n', \
-        'phonehelp.m.i2n', \
-        'virt-dmz.m.i2n', \
-        'virt01.m.i2n', \
-        'virt02.m.i2n', \
-        'tornado.n.i2n', \
-        'auth.m.i2n', \
-        'forum.intra2net.com', \
-        'update.intranator.com', \
-        'demo-intranator.intra2net.com', \
-        'intra2net.com', \
-        #'partner.intra2net.com', has same IP as intra2net.com --> trouble in Tester._host_for_ip \
-        'demo-securitygateway.intra2net.com', \
-        'demo-networksecurity.intra2net.com', \
-    )
-
-def create_conf_file(file_prefix):
-    """ create config file using constants above """
-
-    file_name = file_prefix + '_config.conf'
-
-    n_hosts = len(host_list)
-
-    with open(file_name, 'wt') as file_handle:
-        file_handle.write(conf_start.format(hosts_down_limit=int(n_hosts * 0.7), status='{status}'))
-        for host in host_list:
-            file_handle.write(conf_host.format(host, PING_INTERVAL))
-
-    return file_name, n_hosts
-#end: create_conf_file
-
-
-########################################################################
-### Exceptions
-########################################################################
-
-def flags_to_str(flags):
-    text_parts = []
-    if flags & select.POLLIN:
-        text_parts.append('IN')
-    if flags & select.POLLOUT:
-        text_parts.append('OUT')
-    if flags & select.POLLPRI:
-        text_parts.append('PRI')
-    if flags & select.POLLERR:
-        text_parts.append('ERR')
-    if flags & select.POLLHUP:
-        text_parts.append('HUP')
-    if flags & select.POLLNVAL:
-        text_parts.append('INVALID')
-    return ','.join(text_parts)
-
-class PollFlagsException(Exception):
-    def __init__(self, flags, proc_name):
-        super(PollFlagsException, self).__init__('Unexpected flags from polling {0}: {1}'.format(\
-                proc_name, flags_to_str(flags)))
-
-class PollHupException(Exception):
-    def __init__(self, proc_name):
-        super(PollHupException, self).__init__(\
-                'Received HUP from polling, {0} was probably killed!'.format(proc_name))
-
-class TooManyWarnings(Exception):
-    def __init__(self):
-        super(TooManyWarnings, self).__init__('Too many warnings / failures!')
-
-########################################################################
-### PARSER
-########################################################################
-
-class ProcOutputParser:
-    """ parser for output from pingcheck process via pipes
-
-    uses select and polling
-    should never silently swallow log output, main function parse should yield all lines
-      or emit errors
-    """
-
-    _out = None
-    _err = None
-    _poller = None
-    _flags = select.POLLIN | select.POLLPRI | select.POLLHUP # | select.POLLERR
-    _regexp = '(.+)\s+(INFO|ERR|WARN|DBUG|NOTE)\s+pingcheck\[\d+\]: (.+)\s*'
-    _out_fileno = None
-    _err_fileno = None
-
-    SRC_OUT = 'stdout'
-    SRC_ERR = 'stderr'
-    SRC_UNK = 'unknown'
-
-    def __init__(self, pipe_out, pipe_err):
-        self._out = pipe_out
-        self._err = pipe_err
-        self._poller = select.poll()
-        self._poller.register(pipe_out, self._flags)
-        self._poller.register(pipe_err, self._flags)
-        self._out_fileno = pipe_out.fileno()
-        self._err_fileno = pipe_err.fileno()
-
-    def _read(self):
-        while True:
-            result = self._poller.poll(10)
-            for file_no, flags in result:
-                if not (flags & self._flags):
-                    raise PollFlagsException(flags, 'pingcheck')
-
-                if flags & select.POLLHUP:
-                    raise PollHupException('pingcheck')
-
-                if file_no == self._out_fileno:
-                    src = self.SRC_OUT
-                    line = self._out.readline()
-                elif file_no == self._err_fileno:
-                    src = self.SRC_ERR
-                    line = self._err.readline()
-                else:
-                    src = self.SRC_UNK
-                    line = ''
-
-                if not line:
-                    continue
-                # if reach here, we have a new line of text
-
-                yield src, line.strip()
-    #end: ProcOutputParser._read
-
-    def parse(self):
-        for src, line in self._read():
-            try:
-                timestamp, level, message = regexp(self._regexp, line).groups()
-                yield src, (timestamp, text_to_level[level], message)
-            except:
-                yield src, line
-
-    def __iter__(self):
-        return self.parse()
-
-#end: class ProcOutputParser
-
-########################################################################
-### TESTER
-########################################################################
-
-class Tester:
-    """ """
-
-    _line_iter = None
-    _last_request = None
-    _last_reply = None
-    _n_requests = None
-    _n_replies = None
-    _host_for_ip = None
-    _next_state_change = None
-    _tell_connd_bin = None
-    _prev_state_changes = None
-    _fails = None
-    _log = None
-    _log_ping = None
-    max_fails = None
-
-    def __init__(self, line_iter, tell_connd_bin, max_fails):
-        self._line_iter = line_iter
-        self._last_request = {}
-        self._last_reply = {}
-        self._n_requests = {}
-        self._n_replies = {}
-        self._host_for_ip = {}
-        self._tell_connd_bin = tell_connd_bin
-        self._prev_state_changes = []
-        self._fails = []
-        self.max_fails = max_fails + 2   # for _test_the_tester
-
-        self._log = logger_base.getChild('Tester')
-        self._log_ping = {}
-        self._log_ping[ProcOutputParser.SRC_OUT] = logger_base.getChild('pingOut')
-        self._log_ping[ProcOutputParser.SRC_ERR] = logger_base.getChild('pingErr')
-
-
-    def test(self):
-        # preparations
-        current_state = self._is_online()
-        while current_state is None:
-            sleep(1)
-            current_state = self._is_online()
-        self._log.info('Start with state online = {0}'.format(current_state))
-        self._prev_state_changes.append((current_state, dt.now()))
-        self._schedule_next_state_change()
-
-        self._test_the_tester()
-
-        # start looping over output from observed process
-        for src, text_info in self._line_iter:
-
-            # first of all: write to pinger log so nothing is lost
-            if isinstance(text_info, str):       # ProcOutputParser could not parse
-                msg = 'unparsed: {0}'.format(text_info)
-                if text_info.startswith("RECV BUF ("):
-                    self._log_ping[src].debug(msg)
-                else:
-                    self._log_ping[src].warning(msg)
-                continue   # no need to parse message
-            else:
-                text_time, level, message = text_info
-                msg = '{0} {1}'.format(level, message)
-
-                if level in (LEVEL_WARN, LEVEL_ERROR, LEVEL_NOTICE):
-                    # most common warning/error is only logged at debug level
-                    if message.endswith('could not be resolved.'):
-                        self._log_ping[src].debug(msg)
-                        continue
-                    elif message.endswith('fail sending ping data. Network is unreachable'):
-                        self._log_ping[src].debug(msg)
-                        continue
-                    elif message.endswith('fail sending ping data. Operation not permitted'):
-                        self._log_ping[src].debug(msg)
-                        continue
-                    else:   # unexpected errors and warnings from pingcheck are logged at info level
-                        self._log_ping[src].info(msg)
-                        # no continue --> try analyze
-                else:   # normal messages are logged at debug level
-                    self._log_ping[src].debug(msg)
-                    # no continue --> try analyze
-
-            # now deal with message contents
-            if self._handle_request(message):
-                continue
-            if self._handle_reply(message):
-                continue
-            if self._handle_change_state():
-                continue
-            if self._handle_new_ip(message):
-                continue
-        #end: for all lines from pingcheck output
-    #end: Tester.test
-
-    def _test_the_tester(self):
-        self._warn("this is a test warning without exception information")
-        try:
-            raise NotImplementedError('Test Exception')
-        except:
-            self._warn("this is a test warning including exception information", exception=True)
-        self._log.info('you should have seen 2 failures above -- only be alarmed if not!')
-
-
-    def _handle_new_ip(self, message):
-        # "Have IP 11.22.33.44 [61963s] for host.domain.com"
-        matches = regexp('Have IP (\d+\.\d+\.\d+\.\d+) \[\d+s\] for (.+)', message)
-        if matches:
-            ip,host_name = matches.groups()
-        else:
-            matches = regexp('Host is already an IP: (\d+\.\d+\.\d+\.\d+)', message)
-            if matches:
-                host_name = ip = matches.groups()[0]
-            else:
-                return False
-        # now have host_name and ip
-
-        self._log.info('remember new IP {0} for {1}'.format(ip, host_name))
-        self._host_for_ip[ip] = host_name
-        self._last_request[host_name] = dt.now()
-        self._last_reply[host_name] = dt.now()
-        self._n_requests[host_name] = 0
-        self._n_replies[host_name] = 0
-        return True
-
-
-    def _handle_request(self, message):
-        matches = regexp('(\d+\.\d+\.\d+\.\d+): sending ping', message)
-        if matches is not None:
-            ip = matches.groups()[0]
-            host_name = self._host_for_ip[ip]
-            self._log.info('pinging host {0} ({1})'.format(host_name, ip))
-            self._last_request[host_name] = dt.now()
-            self._n_requests[host_name] += 1
-            return True
-        return False
-
-
-    def _handle_reply(self, message):
-        matches = regexp('(\d+\.\d+\.\d+\.\d+): Received reply', message)
-        if matches is not None:
-            ip = matches.groups()[0]
-            host_name = self._host_for_ip[ip]
-            self._log.info('received reply from {0} ({1})'.format(host_name, ip))
-            self._last_reply[host_name] = dt.now()
-            self._n_replies[host_name] += 1
-            return True
-        return False
-
-
-    def _handle_change_state(self):
-        if dt.now() < self._next_state_change:
-            return False
-
-        is_online = self._is_online()
-        if is_online is None:
-            return False   # try again later
-
-        expect_online = self._prev_state_changes[-1][0]
-        if expect_online != is_online:
-            self._warn('Expect to be online={0} since {1} but are online={2}!'.format(\
-                    expect_online, self._prev_state_changes[-1][1], is_online))
-
-        if is_online:
-            self._ensure_all_pinged()
-            new_val = False
-        else:
-            new_val = True
-        if self._set_online(new_val):
-            self._schedule_next_state_change()
-            return True
-        # if call to tell_connd fails, try again later
-    #end: _handle_change_state
-
-
-    def _schedule_next_state_change(self):
-        n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)
-        self._next_state_change = dt.now() + timedelta(seconds=n_secs)
-        self._log.info('schedule next state change for {0:%H:%M:%S} (in {1:d}s)'.format(\
-                self._next_state_change, n_secs))
-    #end: Tester._schedule_next_state_change
-
-
-    def _is_online(self):
-        """ determine state using ConndState """
-        state = ConndState.get_state(self._tell_connd_bin)
-        if not isinstance(state, ConndState):
-            err_code, output_lines = state
-            self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(\
-                        err_code, output_lines[0]))
-            return None
-
-        return len(state.connections) > 0
-    #end: Tester._is_online
-
-
-    def _set_online(self, new_val):
-        """ run tell-connd --online/offline P1
-
-        returns true if afterwards are in requested state
-        """
-        if new_val == self._is_online():
-            self._warn('Want to set online state to current state!')
-            return True
-        if new_val:
-            self._log.info('going online')
-            args = [self._tell_connd_bin, '--online', 'P1']
-        else:
-            self._log.info('going offline')
-            args = [self._tell_connd_bin, '--offline']
-
-        try:
-            return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1)
-            if return_code == 0:
-                self._prev_state_changes.append((new_val, dt.now()))
-                return True
-            else:
-                self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(\
-                                return_code))
-                return False
-        except subprocess.TimeoutExpired as te:
-            self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output))
-            return False
-        except:
-            self._warn('Trouble calling tell-connd', exception=True)
-            return False
-    #end: Tester._set_online
-
-
-    def _ensure_all_pinged(self):
-        """ called before going offline; check if all hosts were successfully pinged """
-        # determine when we last went online
-        time_went_online = None
-        time_go_offline = dt.now()
-
-        for went_online, change_time in reversed(self._prev_state_changes):
-            if went_online:
-                time_went_online = change_time
-                break
-        if time_went_online is None:
-            # never been online --> nothing to check yet
-            return
-
-        self._log.info('checking whether all hosts were pinged between '
-                     + '{0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
-                        time_went_online, time_go_offline))
-        for host_name, reply_time in self._last_reply.items():
-            if reply_time < time_went_online:
-                self._log.warning(\
-                        'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
-                        host_name, time_went_online, time_go_offline))
-                self._fails.append((host_name, time_went_online, time_go_offline))
-                self._log.info('remembered as failure number {0} / {1}'.format(\
-                        len(self._fails), self.max_fails))
-
-                # check if there were too many fails; if so, stop
-                if len(self._fails) > self.max_fails:
-                    self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
-                    raise TooManyWarnings()
-            #end: if not pinged recently
-        #end: for all known ips
-    #end: Tester._ensure_all_pinged
-
-
-    def _warn(self, message, exception=False):
-
-        if exception:
-            self._log.exception(message, exc_info=True)
-        else:
-            self._log.warning(message)
-        self._fails.append(("WARN", dt.now(), message))
-        self._log.info('remembered as failure number {0} / {1}'.format(\
-                len(self._fails), self.max_fails))
-
-        # check if there were too many fails; if so, stop
-        if len(self._fails) > self.max_fails:
-            self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
-            raise TooManyWarnings()
-    #end: Tester._warn
-
-
-    def print_failures(self, log):
-        if len(self._fails) == 0:
-            log.info('No warnings nor failures since {0:%d.%m.%y, %H:%M} :-)'.format(\
-                    self._prev_state_changes[0][1]))
-        else:
-            log.info('Recorded the following failures and warnings:')
-            for host_name, time_went_online, time_go_offline in self._fails:
-                if host_name == 'WARN':
-                    log.info('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline))
-                else:
-                    log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3:.2f}s)'\
-                            .format(host_name, time_went_online, time_go_offline, \
-                                    (time_go_offline-time_went_online).total_seconds()))
-
-    #end: Tester.print_failures
-
-    def save_state_and_history(self, save_file):
-        """ save all other data for later inspection using show_saved_data """
-        try:
-            with open(save_file, 'wb') as file_handle:
-                self._log.info('saving state and history to {0}'.format(save_file))
-                pickle.dump( (self._last_request, self._last_reply, self._n_requests, self._n_replies, \
-                              self._host_for_ip, self._tell_connd_bin, self._prev_state_changes, \
-                              self._fails, self.max_fails, save_file, dt.now()), \
-                            file_handle)
-            return True
-        except Exception as e:
-            self._log.exception("Failed to save!", exc_info=True)
-        return False
-    #end: Tester.save_state_and_history
-
-
-    @staticmethod
-    def show_saved_data(save_file):
-        """ print info on data saved by save_state_and_history """
-
-        # load
-        save_date = None
-        with open(save_file, 'rb') as file_handle:
-            print('loading state and history from {0}'.format(save_file))
-            last_request, last_reply, n_requests, n_replies, host_for_ip, tell_connd_bin, \
-                    state_changes, fails, max_fails, save_file_again, save_date \
-                        = pickle.load(file_handle)
-        if save_date is None:
-            print('Failed to read state and history from {0}'.format(save_file))
-            return 1
-
-        # simple stats
-        first_change = state_changes[0][1]
-        last_change = state_changes[-1][1]
-        print('Performed {0} state changes between {1:%d.%m,%H:%M} and {2:%d.%m,%H:%M}' \
-                .format(len(state_changes), first_change, last_change), end='')
-        print(' ({0} --> one every {1} on average)'.format( \
-                        strftimedelta( last_change - first_change                      ), \
-                        strftimedelta((last_change - first_change) / len(state_changes)) ))
-        print('Used {0} IPs for {1} hosts'.format(len(host_for_ip), len(last_request)))
-        print('Overall, recorded {0} warnings/failures (max={1})'.format(len(fails), max_fails))
-
-        # sort fails into warnings and fail per host and fail per time
-        warnings = []
-        n_test_warnings = 0
-        fail_per_host = {}
-        fail_per_time = {}
-        n_fails = 0
-        for host, time_online, time_offline in fails:
-            if host == 'WARN':
-                # time_online = time, time_offline=message
-                if 'test warning' in time_offline:
-                    n_test_warnings += 1
-                else:
-                    warnings.append((time_online, time_offline))
-            else:
-                n_fails += 1
-                try:
-                    fail_per_host[host].append(time_online)
-                except KeyError:
-                    fail_per_host[host] = [time_online,]
-                duration = time_offline - time_online
-                try:
-                    fail_per_time[time_online, duration].append(host)
-                except KeyError:
-                    fail_per_time[time_online, duration] = [host, ]
-
-        # stats per host
-        for host, n_req in n_requests.items():
-            print(host + ' (', end='')
-            print(','.join( ip for ip, host_for_ip in host_for_ip.items() if host==host_for_ip ), end='')
-            print('): \treplied to {1} of {0} pings ({2:.1f}%); '.format(\
-                    n_req, n_replies[host], 100. * n_replies[host]/n_req), end='')
-            if host in fail_per_host:
-                print('failed to ping at ', end='')
-                print(','.join( '{0:%H:%M},'.format(fail_time) for fail_time in fail_per_host[host] ))
-            else:
-                print('replied in each test interval')
-
-        # stats per time
-        if fail_per_time:
-            print("Ping failures per time (overall {0} fails in {1} intervals):".format(\
-                    n_fails, len(fail_per_time)))
-            for (start, duration), host_list in fail_per_time.items():
-                print('{0:%d.%m,%H:%M} + {1}: '.format(start, strftimedelta(duration)), end='')
-                print(','.join(host_list))
-
-        # warnings
-        if warnings:
-            if n_test_warnings:
-                print("Warnings (not showing {0} test warnings):".format(n_test_warnings))
-            else:
-                print("Warnings:")
-            for time, message in warnings:
-                print("{0:%d.%m,%H:%M}: {1}".format(time, message))
-        elif n_test_warnings:
-            print('Only warnings were {0} for testing'.format(n_test_warnings))
-
-        # done
-        return 0
-    #end: static Tester.show_saved_data
-
-#end: class Tester
-
-########################################################################
-### MAIN
-########################################################################
-
-def test_version(pingcheck_binary):
-    """ run pingcheck --version; we want > 0.1 for proper IP parsing
-
-    no try-catch here, also intended as test-run for pingcheck binary
-    """
-
-    output = subprocess.check_output([pingcheck_binary, '--version'], universal_newlines=True)
-    match = regexp('\s*pingcheck version (\d+\.\d+)\s*', output)
-    if match is None:
-        raise ValueError('Could not determine pingcheck version from output of {0} --version!'\
-                .format(pingcheck_binary))
-    version = float(match.groups()[0])
-    if version < 0.2:
-        raise ValueError('Pingcheck version is {0} but require version > 0.1!'.format(version))
-
-#end: test_version
-
-
-def parse_args():
-    """ parse command line arguments and set defaults """
-    from argparse import ArgumentParser
-    parser = ArgumentParser(description="Long-term testing of pingcheck: goes online and offline " \
-                                      + "using tell-connd and checks log for correct output", \
-                            epilog="(c) Christian Herdtweck, Intra2net AG, 2015")
-
-    # option to show saved data
-    parser.add_argument('--show-saved-data', '-s', metavar='SAVE_FILE.pickle', \
-            default='', type=str, \
-            help='Do not run test but instead print info saved by earlier test run')
-
-    # binaries
-    parser.add_argument('--pingcheck-binary', '-p', metavar='PATH', \
-            default=default_pingcheck_bin, \
-            help='path to pingcheck binary')
-    parser.add_argument('--tell-connd-binary', '-t', metavar='PATH', \
-            default=default_tell_connd_bin, \
-            help='path to tell-connd binary')
-
-    # output
-    parser.add_argument('--output-prefix', '-o', metavar='PATH', \
-            default=default_output_prefix, \
-            help='path and/or file prefix to config and log files; ~=home, 0=prog start datetime; ' \
-                + 'default="%(default)s"')
-    parser.add_argument('--log-to-console', '-c', metavar='0|1', \
-            choices=(0, 1), type=int, default=1, \
-            help='print all output to console? default=%(default)s')
-    parser.add_argument('--log-level-console', metavar='LEVEL', \
-            type=int, default=logging.INFO, \
-            help='log level for console output; ' \
-               + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
-                logging.DEBUG, logging.INFO, logging.WARN))
-    parser.add_argument('--log-to-file', '-f', metavar='0|1', \
-            choices=(0, 1), type=int, default=1, \
-            help='print all output to log file? default=%(default)s')
-    parser.add_argument('--log-level-file', metavar='LEVEL', \
-            type=int, default=logging.DEBUG, \
-            help='log level for console output; ' \
-               + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
-                logging.DEBUG, logging.INFO, logging.WARN))
-    parser.add_argument('--log-file-suffix', '-l', metavar='SUFF.IX', \
-            type=str, default='.log', \
-            help='2nd part of log file name after --output_prefix; default="%(default)s"')
-    parser.add_argument('--pinger-log-level', metavar='LEVEL', \
-            type=str, default='INFO', \
-            help='log level given to pingchecker, DEBUG might quickly fill log; default=%(default)s')
-
-
-    parser.add_argument('--max-fails-per-host', '-m', metavar='N', type=int, default=10, \
-            help='max number of failures/warnings per host before test will end; default: %(default)s')
-    parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version))
-
-    # parse
-    opts = parser.parse_args()
-    opts.output_prefix = expanduser(opts.output_prefix).format(dt.now())
-    return opts
-#end: parse_args
-
-
-def setup_logging(opts):
-    """ create main logger and handlers for base logger """
-
-    # create logger for main
-    log = logger_base.getChild('main')
-
-    # remove all existing handlers
-    for old_handler in logger_base.handlers:
-        print('removing handler from base')
-        logger_base.removeHandler(old_handler)
-    for old_handler in log.handlers:
-        print('removing handler from log')
-        log.removeHandler(old_handler)
-
-    logger_base.setLevel(logging.DEBUG)
-
-    # create handlers and format for base logger
-    log_file = None
-    formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s', \
-                                  datefmt='%d.%m,%H:%M:%S')
-    if opts.log_to_console:
-        handler = logging.StreamHandler(sys.stdout)
-        handler.setLevel(opts.log_level_console)
-        handler.setFormatter(formatter)
-        logger_base.addHandler(handler)
-
-    if opts.log_to_file:
-        log_file = opts.output_prefix + opts.log_file_suffix
-        handler = logging.FileHandler(log_file)
-        handler.setLevel(opts.log_level_file)
-        handler.setFormatter(formatter)
-        logger_base.addHandler(handler)
-
-    # log info
-    log.info('Log for {0} in {1} created on {2}'.format(__name__, __file__, dt.now()))
-    if opts.log_to_console:
-        log.info('logging to console at level {0}'.format(opts.log_level_console))
-    else:
-        log.info('not logging to console')
-    if opts.log_to_file:
-        log.info('logging to file {1} at level {0}'.format(opts.log_level_file, log_file))
-    else:
-        log.info('not logging to file')
-
-    # return log_file only if makes sense to talk about it in main
-    if opts.log_to_console:
-        return log, log_file
-    else:
-        return log, None
-#end: setup_logging
-
-
-def handle_sigchild(signum, curr_frame, log, cleanup_args):
-    log.error('Received SIGCHLD')
-    log.info('with args {0} and {1}'.format(signum, curr_frame))
-
-    cleanup(*cleanup_args)
-
-
-def cleanup(log, pinger, tester, line_iter, save_file):
-    """ the stuff from finally clause in main loop; also called from handle_sigchild """
-
-    return_code = 255
-
-    log.info('-' * 72)
-    if pinger is None:
-        log.warning('Error before pinger was created')
-        return_code = 254
-    else:
-        log.info("Cleaning up")
-
-        try:
-            log.info('telling pinger to terminate')
-            pinger.terminate()
-
-            for line_idx, line in enumerate(line_iter):
-                log.info('left-over output: {0}'.format(line))
-                if line_idx > 100:
-                    log.warning('ignore further left-over output')
-                    break
-
-            #return_code = pinger.wait(3)  can block!
-            outs, errs = pinger.communicate(timeout=3)
-            for line in outs.splitlines():
-                log.info('left-over output: {0}'.format(line))
-            for line in errs.splitlines():
-                log.info('left-over output: {0}'.format(line))
-
-            return_code = pinger.returncode   # may still be None
-            log.info('pinger terminated with return code {0}'.format(return_code))
-        except subprocess.TimeoutExpired as te:
-            log.warning('Timeout expired waiting for pinger to terminate: {0}'.format(te))
-            return_code = 253
-        except:
-            log.exception('trouble terminating pinger', exc_info=True)
-
-        if return_code is None:
-            for line_idx, line in enumerate(line_iter):
-                log.info('left-over output: {0}'.format(line))
-                if line_idx > 100:
-                    log.warning('ignore further left-over output')
-                    break
-
-            # kill it
-            try:
-                log.info('killing pinger the hard way')
-                pinger.kill()
-                outs, errs = pinger.communicate(timeout=3)
-                for line in outs.splitlines():
-                    log.info('left-over output: {0}'.format(line))
-                for line in errs.splitlines():
-                    log.info('left-over output: {0}'.format(line))
-
-            except:
-                log.exception('trouble killing pinger', exc_info=True)
-
-        if tester is not None:
-            tester.save_state_and_history(save_file)
-            tester.print_failures(log)
-    #end: if have pinger
-
-    # reset original signal handler
-    #signal.signal(signal.SIGCHLD, signal.SIG_DFL)
-
-    return return_code
-#end: cleanup
-
-
-def main():
-    """ parses args, create a Pinger and Parser and Tester and runs them; cleans up
-    """
-
-    opts = parse_args()
-
-    if opts.show_saved_data:
-        return Tester.show_saved_data(opts.show_saved_data)
-
-    base_dir = dirname(opts.output_prefix + "_test")
-    if not isdir(base_dir):
-        mkdir(base_dir)
-
-    log, log_file = setup_logging(opts)
-
-    pinger = None
-    return_code = 255
-    tester = None
-    line_iter = None
-    try:
-        config_file, n_hosts = create_conf_file(opts.output_prefix)
-        save_file = opts.output_prefix + '_save.pickle'
-        log.info('created config file {0}'.format(config_file))
-
-        # test version
-        test_version(opts.pingcheck_binary)
-
-        #signal.signal(signal.SIGCHLD, \
-        #        lambda signum, curr_frame: handle_sigchild(signum, curr_frame, log, \
-        #           (log, pinger, tester, line_iter, save_file) )
-
-        # pingcheck subprocess
-        pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', opts.pinger_log_level, \
-                                     '--log-output', 'TERMINAL', '--config-file', config_file], \
-                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE, \
-                                   universal_newlines=True)
-        log.info('started pingcheck with PID {0}'.format(pinger.pid))
-
-        parser = ProcOutputParser(pinger.stdout, pinger.stderr)
-        line_iter = iter(parser)
-        tester = Tester(line_iter, opts.tell_connd_binary, max(10, opts.max_fails_per_host*n_hosts))
-
-        tester.test()
-    except KeyboardInterrupt:
-        log.info('stopped by user')
-    except:
-        log.exception('Caught an exception in main loop', exc_info=True)
-        log.warning('Re-raise')
-        raise
-    finally:
-        return_code = cleanup(log, pinger, tester, line_iter, save_file)
-
-        log.info('Will stop now')
-        if log_file is not None:
-            log.info('might find more log output in {0}'.format(log_file))
-        log.info('-' * 72)
-    #end: global clean-up
-
-    return return_code
-
-#end: function main
-
-
-if __name__ == '__main__':
-    main()
-    logging.shutdown()
-