use logging for all output, improve understanding of connd, improved process terminat...
authorChristian Herdtweck <christian.herdtweck@intra2net.com>
Fri, 16 Jan 2015 17:01:13 +0000 (18:01 +0100)
committerChristian Herdtweck <christian.herdtweck@intra2net.com>
Fri, 16 Jan 2015 17:01:13 +0000 (18:01 +0100)
test/connd_state.py [new file with mode: 0755]
test/long_term_test.py

diff --git a/test/connd_state.py b/test/connd_state.py
new file mode 100755 (executable)
index 0000000..0d0b903
--- /dev/null
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+
+""" Representation for connd state as returned by tell-connd --status
+
+Christian Herdtweck, Intra2net, January 2015
+(c) Intra2net AG 2015
+"""
+
+# Version History
+# 16/01/15 Christian Herdtweck: started creation
+
+import subprocess
+from re import match as regexp
+from os import EX_OK
+
+tell_connd_binary = '/usr/intranator/bin/tell-connd'
+timeout = 1
+
+class ConndState:
+    """ representation of connd's status as returned by tell-connd --status """
+
+    online_mode = None
+    default_provider = None
+    subsys_online = None
+    subsys_offline = None
+    subsys_disabled = None
+    connections = None
+    actions = None
+    online_ips = None
+    connected_vpns = None
+
+    def __str__(self):
+        return '[ConndState: {0} (default {1}), {2} conn\'s, {3} ips, {4} vpns ]'.format(\
+                self.online_mode, self.default_provider, len(self.connections), \
+                len(self.online_ips), len(self.connected_vpns))
+
+
+    @staticmethod
+    def run_tell_connd():
+        """ run tell-connd --status, return output iterator """
+        try:
+            output = subprocess.check_output([tell_connd_binary, '--status'], \
+                    stderr=subprocess.STDOUT, universal_newlines=True, shell=False, timeout=timeout)
+            return output.splitlines(), EX_OK
+        except subprocess.CalledProcessError as cpe: # non-zero return status
+            warn('tell-connd exited with status {0}'.format(cpe.returncode))
+            return cpe.output.splitlines(), cpe.returncode
+        except subprocess.TimeoutExpired as te:
+            warn('tell-connd timed out after {0}s. Returning -1'.format(te.timeout))
+            return te.output.splitlines(), -1
+    #end: ConndState.run_tell_connd
+
+
+    @staticmethod
+    def get_state():
+
+        state = ConndState()
+
+        all_lines, err_code = ConndState.run_tell_connd()
+        output = iter(all_lines)
+
+        # first section
+        line = next(output).strip()
+        state.online_mode = regexp('online mode\s*:\s*(.+)$', line).groups()[0]
+        line = next(output).strip()
+        state.default_provider = regexp('default provider\s*:\s*(.*)$', \
+                                        line).groups()[0]
+        if len(state.default_provider) == 0:
+            state.default_provider = None
+        line = next(output).strip()
+        assert( len(line) == 0 )
+
+        # subsys
+        line = next(output).strip()
+        assert( line == 'subsys' )
+        line = next(output).strip()
+        state.subsys_online   = regexp(  'online\s*:\s*(.*)$', line).groups()[0].split()
+        line = next(output).strip()
+        state.subsys_offline  = regexp( 'offline\s*:\s*(.*)$', line).groups()[0].split()
+        line = next(output).strip()
+        state.subsys_disabled = regexp('disabled\s*:\s*(.*)$', line).groups()[0].split()
+        line = next(output).strip()
+        assert( len(line) == 0 )
+
+        # connection map
+        state.connections = []
+        line = next(output).strip()
+        assert( line == 'connection map:' )
+        expect_new = True
+        for line in output:
+            line = line.strip()
+            if len(line) == 0:
+                continue
+            if expect_new:
+                if line == 'end of connection map':
+                    break
+                conn_name, conn_info = regexp('\[\s*(.+)\s*\]\s*:\s*\(\s*(.*)\s*\)', line).groups()
+                expect_new = False
+            else:
+                conn_actions = regexp('actions\s*:\s*\[\s*(.+)\s*\]', line)
+                state.connections.append( (conn_name, conn_info, conn_actions) )
+                expect_new = True
+        #end: for lines
+        assert( expect_new )
+        line = next(output).strip()
+        assert( len(line) == 0 )
+
+        # actions
+        line = next(output).strip()
+        state.actions = regexp('actions\s*:\s*(.*)', line).groups()[0].split()
+        if len(state.actions) == 1 and state.actions[0].strip() == '-':
+            state.actions = []
+        line = next(output).strip()
+        assert( len(line) == 0 )
+
+        # online IPs
+        line = next(output).strip()
+        state.online_ips = regexp('list of online ips\s*:\s*(.*)', \
+                                  line).groups()[0].split()
+        if len(state.online_ips) == 1 and state.online_ips[0].strip() == 'NONE':
+            state.online_ips = []
+        line = next(output).strip()
+        assert( len(line) == 0 )
+
+        # VPNs
+        state.connected_vpns = []
+        line = next(output).strip()
+        assert( line == 'vpns connected:' )
+        for line in output:
+            line = line.strip()
+            if len(line) == 0:
+                continue
+            elif line == 'end of list of connected vpns':
+                break
+            else:
+                state.connected_vpns.append(line)
+        #end: for lines
+
+        # done
+        line = next(output).strip()
+        assert( len(line) == 0 )
+        line = next(output).strip()
+        assert( line == 'Done.' )
+
+        return state
+    #end: ConndState.get_state
+
+#end: class ConndState
+
+
+def test():
+    state = ConndState.get_state()
+    print(state)
+    print(state.subsys_online)
+    print(state.subsys_offline)
+    print(state.subsys_disabled)
+
+def main():
+    """ Main function, called when running file as script; runs test()
+    """
+    test()
+#end: function main
+
+
+if __name__ == '__main__':
+    main()
+
+# (created using vim -- the world's best text editor)
+
index 06415fe..cac9a44 100755 (executable)
@@ -15,24 +15,30 @@ Christian Herdtweck, Intra2net, January 2015
 # 15/01/15 Christian Herdtweck: started creation
 
 import subprocess
-from traceback import print_exc
+import sys
 import select
 from re import match as regexp
 from time import sleep
 from datetime import timedelta, datetime as dt
 from warnings import warn
 from random import randint
+import logging
+from os.path import expanduser, dirname, isdir
+from os import mkdir, EX_OK
+
+logger_base = logging.getLogger('long_term_test')
 
 ########################################################################
 ### CONSTANTS
 ########################################################################
 
-version = '1.0'
+version = '1.1'
 #default_pingcheck_bin  = '/usr/intranator/bin/pingcheck'
 default_pingcheck_bin  = '/root/git/pingcheck/build/src/pingcheck'
 default_tell_connd_bin = '/usr/intranator/bin/tell-connd'
-default_config_file = 'test.conf'
+default_output_prefix = './long_term_test/{0:%d%m%y_%H%M}'
 
+# for parsing log messages
 LEVEL_DEBUG  = 'dbug'
 LEVEL_INFO   = 'info'
 LEVEL_WARN   = 'warn'
@@ -46,36 +52,86 @@ text_to_level['WARN'] = LEVEL_WARN
 text_to_level['NOTE'] = LEVEL_NOTICE
 text_to_level['ERR']  = LEVEL_ERROR
 
+# duration in which to remain online/offline
 MIN_DURATION_SAME_STATE = 10   # [seconds]
 MAX_DURATION_SAME_STATE = 50   # [seconds]
 
 ########################################################################
-### TESTER
+#### for config file
 ########################################################################
 
-class ParserError(Exception):
-    def __init__(self, line):
-        super(ParserError, self).__init__(\
-                'ParserError: line "{0}" did not match!'.format(line))
-
-class ParserError_NotUnique(Exception):
-    def __init__(self, line):
-        super(ParserError_NotUnique, self).__init__(\
-                'ParserError: no unique match for line "{0}"!'.format(line))
-
-class PingProc(subprocess.Popen):
-   """ wrapper around pingcheck binary """
+PING_INTERVAL = 3    # [seconds]
+config_file_pattern = 'long_term_test_{0:%d%m%y_%H%M%S}.conf'
+conf_start = """# config file for pingcheck testing
 
-   def __init__(self, binary, conf_file):
-      super(PingProc, self).__init__(\
-              [binary, '--log-level', 'INFO', '--log-output', 'TERMINAL', '--config-file', conf_file], \
-              stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
-      print('started pingcheck with PID {0}'.format(self.pid))
+default-source-network-interface=eth1
+nameserver=127.0.0.1
+hosts-down-limit={hosts_down_limit:d}
+ping-fail-limit=25
+status-notifier-cmd=echo host status is ${status}
+link-up-interval=2
+"""
+conf_host = """
+[host]
+name={0}
+interval={1:d}
+ping-protocol=ICMPv4
+port=80
+source-network-interface=default
+"""
+host_list = (\
+        'intranator.m.i2n', \
+        'intraroute.m.i2n', \
+        'websync.m.i2n', \
+        'aqua.m.i2n', \
+        'intratdsl.m.i2n', \
+        'beholder.m.i2n', \
+        'eagle.m.i2n', \
+        'osdist.m.i2n', \
+        'tokyo.m.i2n', \
+        'blizzard.m.i2n', \
+        'phonehelp.m.i2n', \
+        'virt-dmz.m.i2n', \
+        'virt01.m.i2n', \
+        'virt02.m.i2n', \
+        'tornado.n.i2n', \
+        'auth.m.i2n', \
+        'forum.intra2net.com', \
+        'update.intranator.com', \
+        'demo-intranator.intra2net.com', \
+        'intra2net.com', \
+        'partner.intra2net.com', \
+        'demo-securitygateway.intra2net.com', \
+        'demo-networksecurity.intra2net.com', \
+    )
+
+def create_conf_file(file_prefix):
+    """ create config file using constants above """
+
+    file_name = file_prefix + '_config.conf'
+
+    n_hosts = len(host_list)
+
+    with open(file_name, 'wt') as file_handle:
+        file_handle.write(conf_start.format(hosts_down_limit=int(n_hosts * 0.7), status='{status}'))
+        for host in host_list:
+            file_handle.write(conf_host.format(host, PING_INTERVAL))
+
+    return file_name
+#end: create_conf_file
 
-#end: PingProc
 
+########################################################################
+### PARSER
+########################################################################
 
 class ProcOutputParser:
+    """ parser for output from pingcheck process via pipes
+
+    uses select and polling
+    should never silently swallow log output, main function parse should yield all lines
+      or emit errors
+    """
 
     _out = None
     _err = None
@@ -118,29 +174,17 @@ class ProcOutputParser:
 
                 if not line:
                     continue
+                # if reach here, we have a new line of text
 
-                yield src, dt.now(), line.strip()
+                yield src, line.strip()
     #end: ProcOutputParser._read
 
-    def _parse_line(self, line):
-        match = regexp(self._regexp, line)
-        if match is None:
-            raise ParserError(line)
-        text_time, text_level, message = match.groups()
-        try:
-            return text_time, text_to_level[text_level], message
-        except KeyError:
-            warn('unknown level: {0}'.format(text_level))
-            return text_time, text_level, message
-
     def parse(self):
-        for src, timestamp, line in self._read():
-            if src == self.SRC_ERR:
-                yield src, timestamp, self._parse_line(line)
-            elif src == self.SRC_OUT and regexp("RECV BUF \(\s*\d+\):[\d \|]+", line):
-                pass
-            else:
-                warn('Unexpected line and/or source: {0}, "{1}"'.format(src, line))
+        for src, line in self._read():
+            try:
+                yield src, regexp(self._regexp, line).groups()
+            except:
+                yield src, line
 
     def __iter__(self):
         return self.parse()
@@ -164,6 +208,10 @@ class ProcOutputParser:
 
 #end: class ProcOutputParser
 
+########################################################################
+### TESTER
+########################################################################
+
 class Tester:
     """ """
 
@@ -176,8 +224,11 @@ class Tester:
     _tell_connd_bin = None
     _prev_state_changes = None
     _fails = None
+    _log = None
+    _log_ping = None
+    max_fails = None
 
-    def __init__(self, line_iter, tell_connd_bin):
+    def __init__(self, line_iter, tell_connd_bin, max_fails):
         self._line_iter = line_iter
         self._last_request = {}
         self._last_reply = {}
@@ -186,32 +237,60 @@ class Tester:
         self._tell_connd_bin = tell_connd_bin
         self._prev_state_changes = []
         self._fails = []
+        self.max_fails = max_fails
+
+        self._log = logger_base.getChild('Tester')
+        self._log_ping = {}
+        self._log_ping[ProcOutputParser.SRC_OUT] = logger_base.getChild('pingOut')
+        self._log_ping[ProcOutputParser.SRC_ERR] = logger_base.getChild('pingErr')
 
 
     def test(self):
-        self._handle_change_state()
+        # preparations
+        current_state = self._is_online()
+        while current_state is None:
+            sleep(1)
+            current_state = self._is_online()
+        self._log.info('Start with state online = {0}'.format(current_state))
+
+        # start looping over output from observed process
+        for src, text_info in self._line_iter:
+            if isinstance(text_info, str):
+                # ProcOutputParser could not parse
+                msg = 'unparsed: {0}'.format(text_info)
+                if text_info.startswith("RECV BUF ("):
+                    level = LEVEL_DEBUG
+                else:
+                    level = LEVEL_WARN
+            else:
+                text_time, level, message = text_info
+                # first of all: write to text log so nothing is lost
+                msg = '{0} {1} (orig time {2})'.format(level, message, text_time)
 
-        for src, timestamp, (text_time, level, message) in self._line_iter:
-            # always print warnings and errors
             if level in (LEVEL_WARN, LEVEL_ERROR):
+                # most common warning/error is only logged at debug level
                 if message.endswith('could not be resolved.'):
+                    self._log_ping[src].debug(msg)
                     continue
                 if message.endswith('fail sending ping data. Network is unreachable'):
+                    self._log_ping[src].debug(msg)
                     continue
-                else:
-                    self._print('{0}: {1}'.format(level, message))
+                else:   # unexpected errors and warnings from pingcheck are logged at info level
+                    self._log_ping[src].info(msg)
                     continue
-            if self._handle_new_ip(message):
-                continue
+            else:   # normal messages are logged at debug level
+                self._log_ping[src].debug(msg)
+
+            # now deal with message contents
             if self._handle_request(message):
                 continue
             if self._handle_reply(message):
                 continue
             if self._handle_change_state():
                 continue
-
-            #self._print('ignoring message "{0}"'.format(message))
-        #end: for all lines from log
+            if self._handle_new_ip(message):
+                continue
+        #end: for all lines from pingcheck output
     #end: Tester.test
 
 
@@ -221,7 +300,7 @@ class Tester:
             matches = regexp('Host is already an IP: (\d+\.\d+\.\d+\.\d+)', message)
         if matches:
             ip = matches.groups()[0]
-            self._print('remember new IP {0}'.format(ip))
+            self._log.info('remember new IP {0}'.format(ip))
             self._last_request[ip] = dt.now()
             self._last_reply[ip] = dt.now()
             self._n_requests[ip] = 0
@@ -229,45 +308,31 @@ class Tester:
             return True
         return False
 
+
     def _handle_request(self, message):
         matches = regexp('(\d+\.\d+\.\d+\.\d+): sending ping', message)
         if matches is not None:
             ip = matches.groups()[0]
-            self._print('pinging IP {0}'.format(ip))
+            self._log.info('pinging IP {0}'.format(ip))
             self._last_request[ip] = dt.now()
             self._n_requests[ip] += 1
             return True
         return False
 
+
     def _handle_reply(self, message):
         matches = regexp('(\d+\.\d+\.\d+\.\d+): Received reply', message)
         if matches is not None:
             ip = matches.groups()[0]
-            self._print('received reply from {0}'.format(ip))
+            self._log.info('received reply from {0}'.format(ip))
             self._last_reply[ip] = dt.now()
             self._n_replies[ip] += 1
             return True
         return False
 
-    def _handle_change_state(self):
-        if self._next_state_change is None:
-            # called at start of test()
-            n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)
-            self._next_state_change = dt.now() + timedelta(seconds=n_secs)
-            current_state = self._is_online()
-            if current_state is not None:
-                self._print('Start with state online = {0}'.format(current_state))
-                self._prev_state_changes.append((current_state, dt.now()))
-            self._print('schedule next state change for {0:%H:%M:%S} (in {1}s)'.format(\
-                    self._next_state_change, n_secs))
-
-        elif dt.now() > self._next_state_change:
-
-            # check if there were too many fails; if so, stop
-            if len(self._fails) > 100:
-                self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
-                raise ValueError("Too many failures/warnings!")
 
+    def _handle_change_state(self):
+        if dt.now() > self._next_state_change:
             is_online = self._is_online()
             if is_online is None:
                 return False   # try again later
@@ -277,16 +342,22 @@ class Tester:
             else:
                 new_val = True
             if self._set_online(new_val):
-                self._prev_state_changes.append((new_val, dt.now()))
-                self._next_state_change = dt.now() + timedelta(\
-                        seconds=randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE))
-                self._print('schedule next state change for {0:%H:%M:%S}'.format(self._next_state_change))
+                self._schedule_next_state_change(is_online)
                 return True
             # if call to tell_connd fails, try again later
         return False
     #end: _handle_change_state
 
 
+    def _schedule_next_state_change(self, current_state):
+        self._prev_state_changes.append((current_state, dt.now()))
+        n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)
+        self._next_state_change = dt.now() + timedelta(seconds=n_secs)
+        self._log.info('schedule next state change for {0:%H:%M:%S} (in {1:d}s)'.format(\
+                self._next_state_change, n_secs))
+    #end: Tester._schedule_next_state_change
+
+
     def _is_online(self):
         """ determine state from tell-connd --status
 
@@ -297,18 +368,14 @@ class Tester:
             output = subprocess.check_output([self._tell_connd_bin, '--status'], \
                                              universal_newlines=True)
         except subprocess.CalledProcessError as cpe: # non-zero exit status
-            text = 'Non-zero exit status asking tell-connd for its --status: {0}'.format(cpe.returncode)
-            warn(text)
-            self._fails.append(("WARN", dt.now(), text))
+            self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(cpe.returncode))
             return None
         except subprocess.TimeoutExpired as te:
-            text = 'Timeout expired asking tell-connd for its --status. Output was {0}'.format(te.output)
-            warn(text)
-            self._fails.append(("WARN", dt.now(), text))
+            self._warn('Timeout expired asking tell-connd for its --status. Output was {0}'.format(te.output))
             return None
         except:
-            self._print('Trouble asking tell-connd for its --status:')
-            print_exc()
+            self._info('Trouble asking tell-connd for its --status:')
+            self._warn(sys.exc_info())
             return None
 
         line_iter = output.splitlines()
@@ -318,23 +385,19 @@ class Tester:
             if matches:
                 matches = matches.groups()
                 if len(matches) != 1:
-                    raise ValueError("Could not interpret online mode from \"{0}\"".format(line))
+                    self._warn('Could not interpret online mode from \"{0}\"'.format(line))
                 else:
                     online_mode = matches[0]
                     break
         if online_mode is None:
-            text = 'Could not determine online mode of connd! Is connd running?!'
-            warn(text)
-            self._fails.append(("WARN", dt.now(), text))
+            self._warn('Could not determine online mode of connd! Is connd running?!')
             return None
         elif online_mode in ('always online', ):
             return True
         elif online_mode in ('offline', 'always offline', ):
             return False
         else:
-            text = 'Unknown online mode: {0}'.format(online_mode)
-            warn(text)
-            self._fails.append(("WARN", dt.now(), text))
+            self._warn('Unknown online mode: {0}'.format(online_mode))
             return None
     #end: Tester._is_online
 
@@ -343,26 +406,24 @@ class Tester:
     def _set_online(self, new_val):
         """ run tell-connd --online/offline P1 """
         if new_val:
-            self._print('going online')
+            self._log.info('going online')
             args = [self._tell_connd_bin, '--online', 'P1']
         else:
-            self._print('going offline')
+            self._log.info('going offline')
             args = [self._tell_connd_bin, '--offline']
         try:
             return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1)
             if return_code == 0:
                 return True
             else:
-                self._print('tell-connd returned {0} --> online state probably unchanged!'.format(return_code))
+                self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(return_code))
                 return False
         except subprocess.TimeoutExpired as te:
-            text = 'Timeout expired running {0}. Output was {1}'.format(args, te.output)
-            warn(text)
-            self._fails.append(("WARN", dt.now(), text))
+            self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output))
             return False
         except:
-            self._print('Trouble calling tell-connd:')
-            print_exc()
+            self._log.info('Trouble calling tell-connd:')
+            self._warn(sys.exc_info())
             return False
     #end: Tester._set_online
 
@@ -381,37 +442,61 @@ class Tester:
             # never been online --> nothing to check yet
             return
 
-        self._print('checking whether all hosts were pinged between {0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
+        self._log.info('checking whether all hosts were pinged between {0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
                 time_went_online, time_go_offline))
         for ip, reply_time in self._last_reply.items():
             if reply_time < time_went_online:
-                text = 'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
-                        ip, time_went_online, time_go_offline)
-                warn(text)
+                self._log.warn('Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
+                        ip, time_went_online, time_go_offline))
                 self._fails.append((ip, time_went_online, time_go_offline))
-
+                self._log.info('remembered as failure number {0} / {1}'.format(\
+                        len(self._fails), self.max_fails))
+
+                # check if there were too many fails; if so, stop
+                if len(self._fails) > self.max_fails:
+                    self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
+                    raise ValueError("Too many failures/warnings!")
+            #end: if not pinged recently
+        #end: for all known ips
     #end: Tester._ensure_all_pinged
 
 
-    def _print(self, message):
-        now = dt.now()
-        if self._next_state_change is not None:
-            print('{0:%H:%M:%S};next{1:+5.1f}s;{2} fails: {3}'.format(now, \
-                    (now - self._next_state_change).total_seconds(), len(self._fails), message))
+    def _warn(self, message):
+
+        if isinstance(message, str):
+            self._log.warning(message)
         else:
-            print('{0:%H:%M:%S};next-?????;{1} fails: {2}'.format(now, len(self._fails), message))
+            self._log.formatException(sys.exc_info())
+        self._fails.append(("WARN", dt.now(), message))
+        self._log.info('remembered as failure number {0} / {1}'.format(\
+                len(self._fails), self.max_fails))
 
+        # check if there were too many fails; if so, stop
+        if len(self._fails) > self.max_fails:
+            self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
+            raise ValueError("Too many failures/warnings!")
+    #end: Tester._warn
 
-    def print_failures(self):
+
+    #def _print(self, message):
+    #    now = dt.now()
+    #    if self._next_state_change is not None:
+    #        print('{0:%H:%M:%S};next{1:+5.1f}s;{2} fails: {3}'.format(now, \
+    #                (now - self._next_state_change).total_seconds(), len(self._fails), message))
+    #    else:
+    #        print('{0:%H:%M:%S};next-?????;{1} fails: {2}'.format(now, len(self._fails), message))
+
+
+    def print_failures(self, log):
         if len(self._fails) == 0:
-            print('No fails! :-)')
+            log.info('No fails! :-)')
         else:
-            print('Recorded the following failures and warnings:')
+            log.info('Recorded the following failures and warnings:')
             for ip, time_went_online, time_go_offline in self._fails:
                 if ip == 'WARN':
-                    print('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline))
+                    log.info('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline))
                 else:
-                    print('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3}s)'.format(\
+                    log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3}s)'.format(\
                             ip, time_went_online, time_go_offline, \
                             (time_go_offline-time_went_online).total_seconds()))
 #end: class Tester
@@ -435,67 +520,169 @@ def parse_args():
             default=default_tell_connd_bin, \
             help='path to tell-connd binary')
 
-    parser.add_argument('--config_file', '-c', metavar='PATH', \
-            default=default_config_file, \
-            help='path to config file for pingcheck')
-
-    # default
+    # output
+    parser.add_argument('--output_prefix', '-o', metavar='PATH', \
+            default=default_output_prefix, \
+            help='path and/or file prefix to config and log files; ~=home, 0=prog start datetime; ' \
+                + 'default="%(default)s"')
+    parser.add_argument('--log_to_console', '-c', metavar='0|1', \
+            choices=(0, 1), type=int, default=1, \
+            help='print all output to console? default=%(default)s')
+    parser.add_argument('--log_level_console', metavar='LEVEL', \
+            type=int, default=logging.INFO, \
+            help='log level for console output; debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
+                logging.DEBUG, logging.INFO, logging.WARN))
+    parser.add_argument('--log_to_file', '-f', metavar='0|1', \
+            choices=(0, 1), type=int, default=1, \
+            help='print all output to log file? default=%(default)s')
+    parser.add_argument('--log_level_file', metavar='LEVEL', \
+            type=int, default=logging.DEBUG, \
+            help='log level for console output; debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
+                logging.DEBUG, logging.INFO, logging.WARN))
+    parser.add_argument('--log_file_suffix', '-l', metavar='SUFF.IX', \
+            type=str, default='.log', \
+            help='2nd part of log file name after --output_prefix; default="%(default)s"')
+
+    parser.add_argument('--max_fails', '-m', metavar='N', type=int, default=100, \
+            help='max number of failures/warnings before test will end; default: %(default)s')
     parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version))
-    return parser.parse_args()
+
+    # parse
+    opts = parser.parse_args()
+    opts.output_prefix = expanduser(opts.output_prefix).format(dt.now())
+    return opts
 #end: parse_args
 
 
+def setup_logging(opts):
+    """ create main logger and handlers for base logger """
+
+    # create logger for main
+    log = logger_base.getChild('main')
+
+    # remove all existing handlers
+    for old_handler in logger_base.handlers:
+        print('removing handler from base')
+        logger_base.removeHandler(old_handler)
+    for old_handler in log.handlers:
+        print('removing handler from log')
+        log.removeHandler(old_handler)
+
+    logger_base.setLevel(logging.DEBUG)
+
+    # create handlers and format for base logger
+    log_file = None
+    if opts.log_to_console:
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setLevel(opts.log_level_console)
+        formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s', \
+                                      datefmt='%d.%m,%H:%M:%S')
+        handler.setFormatter(formatter)
+        logger_base.addHandler(handler)
+
+    if opts.log_to_file:
+        log_file = opts.output_prefix + opts.log_file_suffix
+        handler = logging.FileHandler(log_file)
+        handler.setLevel(opts.log_level_file)
+        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s', \
+                                      datefmt='%d.%m,%H:%M:%S')
+        handler.setFormatter(formatter)
+        logger_base.addHandler(handler)
+
+    # log info
+    log.info('Log for {0} in {1} created on {2}'.format(__name__, __file__, dt.now()))
+    if opts.log_to_console:
+        log.info('logging to console at level {0}'.format(opts.log_level_console))
+    else:
+        log.info('not logging to console')
+    if opts.log_to_file:
+        log.info('logging to file {1} at level {0}'.format(opts.log_level_file, log_file))
+    else:
+        log.info('not logging to file')
+
+    # return log_file only if makes sense to talk about it in main
+    if opts.log_to_console:
+        return log, log_file
+    else:
+        return log, None
+#end: setup_logging
+
+
 def main():
-    """ Main function, called when running file as script; currently raises a NotImplementedError 
+    """ parses args, create a Pinger and Parser and Tester and runs them; cleans up
     """
 
     opts = parse_args()
 
+    base_dir = dirname(opts.output_prefix + "_test")
+    if not isdir(base_dir):
+        mkdir(base_dir)
+
+    log, log_file = setup_logging(opts)
+
     pinger = None
     return_code = None
     tester = None
     try:
-        pinger = PingProc(opts.pingcheck_binary, opts.config_file)
+        config_file = create_conf_file(opts.output_prefix)
+        log.info('created config file {0}'.format(config_file))
+
+        # pingcheck subprocess
+        pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', 'DEBUG', \
+                                     '--log-output', 'TERMINAL', '--config-file', config_file], \
+                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE, \
+                                   universal_newlines=True)
+        log.info('started pingcheck with PID {0}'.format(pinger.pid))
+
         parser = ProcOutputParser(pinger.stdout, pinger.stderr)
         line_iter = iter(parser)
-        tester = Tester(line_iter, opts.tell_connd_binary)
+        tester = Tester(line_iter, opts.tell_connd_binary, opts.max_fails)
         tester.test()
     except KeyboardInterrupt:
-        print('stopped by user')
+        log.info('stopped by user')
     except:
-        if tester is not None:
-            tester.print_failures()
         raise
     finally:
         if pinger is None:
-            print('failed to create pinger')
+            log.warn('Error before pinger was created')
             return_code = 254
         else:
+            log.info("Testing ended")
+            if tester is not None:
+                tester.print_failures(log)
+
             try:
-                print('telling pinger to terminate')
+                log.info('telling pinger to terminate')
                 pinger.terminate()
-                return_code = pinger.wait(3)
-                print('terminated with return code {0}'.format(return_code))
+                #return_code = pinger.wait(3)  may block!
+                outs, errs = pinger.communicate(timeout=3)
+                return_code = pinger.returncode   # may still be None
+                log.info('terminated with return code {0}'.format(return_code))
             except subprocess.TimeoutExpired as te:
-                print('Timeout expired waiting for pinger to terminate: {0}'.format(te))
+                log.warn('Timeout expired waiting for pinger to terminate: {0}'.format(te))
                 return_code = 255  # a.k.a. -1
             except:
-                print('trouble terminating pinger:')
-                print_exc()
+                log.warn('trouble terminating pinger:')
+                log.formatException(sys.exc_info())
 
             if return_code is None:
                 # kill it
                 try:
+                    log.info('killing pinger the hard way')
                     pinger.kill()
-                    outs, errs = pinger.communicate()
+                    outs, errs = pinger.communicate(timeout=3)
                 except:
-                    print('trouble killing pinger:')
-                    print_exc()
+                    log.warn('trouble killing pinger:')
+                    log.formatException(sys.exc_info())
         #end: if have pinger
     #end: global clean-up
 
     if tester is not None:
-        tester.print_failures()
+        tester.print_failures(log)
+
+    log.info('Will stop now')
+    if log_file is not None:
+        log.info('might find more log output in {0}'.format(log_file))
 
     return return_code
 
@@ -504,6 +691,7 @@ def main():
 
 if __name__ == '__main__':
     main()
+    logging.shutdown()
 
 # (created using vim -- the world's best text editor)