done with long_term_test.py
authorChristian Herdtweck <christian.herdtweck@intra2net.com>
Wed, 21 Jan 2015 10:43:35 +0000 (11:43 +0100)
committerChristian Herdtweck <christian.herdtweck@intra2net.com>
Wed, 21 Jan 2015 10:43:35 +0000 (11:43 +0100)
* set timeout for ping reply -- very important variable!
* try to get more output when pingcheck process is stopped/dying
* added and commented out sigchld / child signal handler
* different logic how state changes are set
* use ConndState
* lots of other changes...

test/long_term_test.py

index cac9a44..817e7bd 100755 (executable)
@@ -13,6 +13,7 @@ Christian Herdtweck, Intra2net, January 2015
 
 # Version History
 # 15/01/15 Christian Herdtweck: started creation
+# 21/1/15 ch: running 3rd long-term test, seems to work nicely now
 
 import subprocess
 import sys
@@ -25,6 +26,8 @@ from random import randint
 import logging
 from os.path import expanduser, dirname, isdir
 from os import mkdir, EX_OK
+from connd_state import ConndState
+#import signal
 
 logger_base = logging.getLogger('long_term_test')
 
@@ -70,6 +73,7 @@ hosts-down-limit={hosts_down_limit:d}
 ping-fail-limit=25
 status-notifier-cmd=echo host status is ${status}
 link-up-interval=2
+ping-reply-timeout=3
 """
 conf_host = """
 [host]
@@ -117,11 +121,45 @@ def create_conf_file(file_prefix):
         for host in host_list:
             file_handle.write(conf_host.format(host, PING_INTERVAL))
 
-    return file_name
+    return file_name, n_hosts
 #end: create_conf_file
 
 
 ########################################################################
+### Exceptions
+########################################################################
+
+def flags_to_str(flags):
+    text_parts = []
+    if flags & select.POLLIN:
+        text_parts.append('IN')
+    if flags & select.POLLOUT:
+        text_parts.append('OUT')
+    if flags & select.POLLPRI:
+        text_parts.append('PRI')
+    if flags & select.POLLERR:
+        text_parts.append('ERR')
+    if flags & select.POLLHUP:
+        text_parts.append('HUP')
+    if flags & select.POLLNVAL:
+        text_parts.append('INVALID')
+    return ','.join(text_parts)
+
+class PollFlagsException(Exception):
+    def __init__(self, flags, proc_name):
+        super(PollFlagsException, self).__init__('Unexpected flags from polling {0}: {1}'.format(\
+                proc_name, flags_to_str(flags)))
+
+class PollHupException(Exception):
+    def __init__(self, proc_name):
+        super(PollHupException, self).__init__(\
+                'Received HUP from polling, {0} was probably killed!'.format(proc_name))
+
+class TooManyWarnings(Exception):
+    def __init__(self):
+        super(TooManyWarnings, self).__init__('Too many warnings / failures!')
+
+########################################################################
 ### PARSER
 ########################################################################
 
@@ -136,7 +174,7 @@ class ProcOutputParser:
     _out = None
     _err = None
     _poller = None
-    _flags = select.POLLIN | select.POLLPRI
+    _flags = select.POLLIN | select.POLLPRI | select.POLLHUP # | select.POLLERR
     _regexp = '(.+)\s+(INFO|ERR|WARN|DBUG|NOTE)\s+pingcheck\[\d+\]: (.+)\s*'
     _out_fileno = None
     _err_fileno = None
@@ -159,8 +197,10 @@ class ProcOutputParser:
             result = self._poller.poll(10)
             for file_no, flags in result:
                 if not (flags & self._flags):
-                    raise NotImplementedError('unexpected flags from polling: {0} --> {1}!'.format(\
-                            flags, self.flags_to_str(flags)))
+                    raise PollFlagsException(flags, 'pingcheck')
+
+                if flags & select.POLLHUP:
+                    raise PollHupException('pingcheck')
 
                 if file_no == self._out_fileno:
                     src = self.SRC_OUT
@@ -182,30 +222,14 @@ class ProcOutputParser:
     def parse(self):
         for src, line in self._read():
             try:
-                yield src, regexp(self._regexp, line).groups()
+                timestamp, level, message = regexp(self._regexp, line).groups()
+                yield src, (timestamp, text_to_level[level], message)
             except:
                 yield src, line
 
     def __iter__(self):
         return self.parse()
 
-    @staticmethod
-    def flags_to_str(flags):
-        text_parts = []
-        if flags & select.POLLIN:
-            text_parts.append('IN')
-        if flags & select.POLLOUT:
-            text_parts.append('OUT')
-        if flags & select.POLLPRI:
-            text_parts.append('PRI')
-        if flags & select.POLLERR:
-            text_parts.append('ERR')
-        if flags & select.POLLHUP:
-            text_parts.append('HUP')
-        if flags & select.POLLNVAL:
-            text_parts.append('INVALID')
-        return ','.join(text_parts)
-
 #end: class ProcOutputParser
 
 ########################################################################
@@ -252,6 +276,8 @@ class Tester:
             sleep(1)
             current_state = self._is_online()
         self._log.info('Start with state online = {0}'.format(current_state))
+        self._prev_state_changes.append((current_state, dt.now()))
+        self._schedule_next_state_change()
 
         # start looping over output from observed process
         for src, text_info in self._line_iter:
@@ -265,14 +291,18 @@ class Tester:
             else:
                 text_time, level, message = text_info
                 # first of all: write to text log so nothing is lost
-                msg = '{0} {1} (orig time {2})'.format(level, message, text_time)
+                #msg = '{0} {1} (orig time {2})'.format(level, message, text_time)
+                msg = '{0} {1}'.format(level, message)
 
-            if level in (LEVEL_WARN, LEVEL_ERROR):
+            if level in (LEVEL_WARN, LEVEL_ERROR, LEVEL_NOTICE):
                 # most common warning/error is only logged at debug level
                 if message.endswith('could not be resolved.'):
                     self._log_ping[src].debug(msg)
                     continue
-                if message.endswith('fail sending ping data. Network is unreachable'):
+                elif message.endswith('fail sending ping data. Network is unreachable'):
+                    self._log_ping[src].debug(msg)
+                    continue
+                elif message.endswith('fail sending ping data. Operation not permitted'):
                     self._log_ping[src].debug(msg)
                     continue
                 else:   # unexpected errors and warnings from pingcheck are logged at info level
@@ -342,15 +372,14 @@ class Tester:
             else:
                 new_val = True
             if self._set_online(new_val):
-                self._schedule_next_state_change(is_online)
+                self._schedule_next_state_change()
                 return True
             # if call to tell_connd fails, try again later
         return False
     #end: _handle_change_state
 
 
-    def _schedule_next_state_change(self, current_state):
-        self._prev_state_changes.append((current_state, dt.now()))
+    def _schedule_next_state_change(self):
         n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)
         self._next_state_change = dt.now() + timedelta(seconds=n_secs)
         self._log.info('schedule next state change for {0:%H:%M:%S} (in {1:d}s)'.format(\
@@ -359,71 +388,47 @@ class Tester:
 
 
     def _is_online(self):
-        """ determine state from tell-connd --status
-
-        to get more info, see State.get_real_state in connd/unittests/stresstest_pingcheck.py
-        """
-        output = ''
-        try:
-            output = subprocess.check_output([self._tell_connd_bin, '--status'], \
-                                             universal_newlines=True)
-        except subprocess.CalledProcessError as cpe: # non-zero exit status
-            self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(cpe.returncode))
-            return None
-        except subprocess.TimeoutExpired as te:
-            self._warn('Timeout expired asking tell-connd for its --status. Output was {0}'.format(te.output))
-            return None
-        except:
-            self._info('Trouble asking tell-connd for its --status:')
-            self._warn(sys.exc_info())
+        """ determine state using ConndState """
+        state = ConndState.get_state(self._tell_connd_bin)
+        if not isinstance(state, ConndState):
+            err_code, output_lines = state
+            self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(\
+                        err_code, output_lines[0]))
             return None
 
-        line_iter = output.splitlines()
-        online_mode = None
-        for line in line_iter:
-            matches = regexp('online mode\s*:\s*(.+)$', line.strip())
-            if matches:
-                matches = matches.groups()
-                if len(matches) != 1:
-                    self._warn('Could not interpret online mode from \"{0}\"'.format(line))
-                else:
-                    online_mode = matches[0]
-                    break
-        if online_mode is None:
-            self._warn('Could not determine online mode of connd! Is connd running?!')
-            return None
-        elif online_mode in ('always online', ):
-            return True
-        elif online_mode in ('offline', 'always offline', ):
-            return False
-        else:
-            self._warn('Unknown online mode: {0}'.format(online_mode))
-            return None
+        return len(state.connections) > 0
     #end: Tester._is_online
 
 
-
     def _set_online(self, new_val):
-        """ run tell-connd --online/offline P1 """
+        """ run tell-connd --online/offline P1
+
+        returns true if afterwards are in requested state
+        """
+        if new_val == self._is_online():
+            self._warn('Want to set online state to current state!')
+            return True
         if new_val:
             self._log.info('going online')
             args = [self._tell_connd_bin, '--online', 'P1']
         else:
             self._log.info('going offline')
             args = [self._tell_connd_bin, '--offline']
+
         try:
             return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1)
             if return_code == 0:
+                self._prev_state_changes.append((new_val, dt.now()))
                 return True
             else:
-                self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(return_code))
+                self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(\
+                                return_code))
                 return False
         except subprocess.TimeoutExpired as te:
             self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output))
             return False
         except:
-            self._log.info('Trouble calling tell-connd:')
-            self._warn(sys.exc_info())
+            self._warn('Trouble calling tell-connd', exc_info=sys.exc_info())
             return False
     #end: Tester._set_online
 
@@ -442,11 +447,13 @@ class Tester:
             # never been online --> nothing to check yet
             return
 
-        self._log.info('checking whether all hosts were pinged between {0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
-                time_went_online, time_go_offline))
+        self._log.info('checking whether all hosts were pinged between '
+                     + '{0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
+                        time_went_online, time_go_offline))
         for ip, reply_time in self._last_reply.items():
             if reply_time < time_went_online:
-                self._log.warn('Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
+                self._log.warning(\
+                        'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
                         ip, time_went_online, time_go_offline))
                 self._fails.append((ip, time_went_online, time_go_offline))
                 self._log.info('remembered as failure number {0} / {1}'.format(\
@@ -455,18 +462,15 @@ class Tester:
                 # check if there were too many fails; if so, stop
                 if len(self._fails) > self.max_fails:
                     self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
-                    raise ValueError("Too many failures/warnings!")
+                    raise TooManyWarnings()
             #end: if not pinged recently
         #end: for all known ips
     #end: Tester._ensure_all_pinged
 
 
-    def _warn(self, message):
+    def _warn(self, message, **kwargs):
 
-        if isinstance(message, str):
-            self._log.warning(message)
-        else:
-            self._log.formatException(sys.exc_info())
+        self._log.warning(message, **kwargs)
         self._fails.append(("WARN", dt.now(), message))
         self._log.info('remembered as failure number {0} / {1}'.format(\
                 len(self._fails), self.max_fails))
@@ -474,7 +478,7 @@ class Tester:
         # check if there were too many fails; if so, stop
         if len(self._fails) > self.max_fails:
             self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
-            raise ValueError("Too many failures/warnings!")
+            raise TooManyWarnings()
     #end: Tester._warn
 
 
@@ -496,9 +500,9 @@ class Tester:
                 if ip == 'WARN':
                     log.info('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline))
                 else:
-                    log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3}s)'.format(\
-                            ip, time_went_online, time_go_offline, \
-                            (time_go_offline-time_went_online).total_seconds()))
+                    log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3:.2f}s)'\
+                            .format(ip, time_went_online, time_go_offline, \
+                                    (time_go_offline-time_went_online).total_seconds()))
 #end: class Tester
 
 ########################################################################
@@ -513,38 +517,44 @@ def parse_args():
                             epilog="(c) Christian Herdtweck, Intra2net AG, 2015")
 
     # binaries
-    parser.add_argument('--pingcheck_binary', '-p', metavar='PATH', \
+    parser.add_argument('--pingcheck-binary', '-p', metavar='PATH', \
             default=default_pingcheck_bin, \
             help='path to pingcheck binary')
-    parser.add_argument('--tell_connd_binary', '-t', metavar='PATH', \
+    parser.add_argument('--tell-connd-binary', '-t', metavar='PATH', \
             default=default_tell_connd_bin, \
             help='path to tell-connd binary')
 
     # output
-    parser.add_argument('--output_prefix', '-o', metavar='PATH', \
+    parser.add_argument('--output-prefix', '-o', metavar='PATH', \
             default=default_output_prefix, \
             help='path and/or file prefix to config and log files; ~=home, 0=prog start datetime; ' \
                 + 'default="%(default)s"')
-    parser.add_argument('--log_to_console', '-c', metavar='0|1', \
+    parser.add_argument('--log-to-console', '-c', metavar='0|1', \
             choices=(0, 1), type=int, default=1, \
             help='print all output to console? default=%(default)s')
-    parser.add_argument('--log_level_console', metavar='LEVEL', \
+    parser.add_argument('--log-level-console', metavar='LEVEL', \
             type=int, default=logging.INFO, \
-            help='log level for console output; debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
+            help='log level for console output; ' \
+               + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
                 logging.DEBUG, logging.INFO, logging.WARN))
-    parser.add_argument('--log_to_file', '-f', metavar='0|1', \
+    parser.add_argument('--log-to-file', '-f', metavar='0|1', \
             choices=(0, 1), type=int, default=1, \
             help='print all output to log file? default=%(default)s')
-    parser.add_argument('--log_level_file', metavar='LEVEL', \
+    parser.add_argument('--log-level-file', metavar='LEVEL', \
             type=int, default=logging.DEBUG, \
-            help='log level for console output; debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
+            help='log level for console output; ' \
+               + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
                 logging.DEBUG, logging.INFO, logging.WARN))
-    parser.add_argument('--log_file_suffix', '-l', metavar='SUFF.IX', \
+    parser.add_argument('--log-file-suffix', '-l', metavar='SUFF.IX', \
             type=str, default='.log', \
             help='2nd part of log file name after --output_prefix; default="%(default)s"')
+    parser.add_argument('--pinger-log-level', metavar='LEVEL', \
+            type=str, default='INFO', \
+            help='log level given to pingchecker, DEBUG might quickly fill log; default=%(default)s')
 
-    parser.add_argument('--max_fails', '-m', metavar='N', type=int, default=100, \
-            help='max number of failures/warnings before test will end; default: %(default)s')
+
+    parser.add_argument('--max-fails-per-host', '-m', metavar='N', type=int, default=10, \
+            help='max number of failures/warnings per host before test will end; default: %(default)s')
     parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version))
 
     # parse
@@ -608,6 +618,76 @@ def setup_logging(opts):
 #end: setup_logging
 
 
+def handle_sigchild(log, pinger, tester, signum, curr_frame):
+    print("\n!received sigchild!\n")
+    log.error('Received SIGCHLD')
+    log.info('with args {0} and {1}'.format(signum, curr_frame))
+
+    cleanup(log, pinger, tester)
+
+
+def cleanup(log, pinger, tester, line_iter):
+    """ the stuff from finally clause in main loop; also called from handle_sigchild """
+
+    return_code = 255
+
+    log.info('-' * 72)
+    if pinger is None:
+        log.warning('Error before pinger was created')
+        return_code = 254
+    else:
+        log.info("Cleaning up")
+
+        for line in line_iter:
+            log.info('left-over output: {0}'.format(line))
+
+        try:
+            log.info('telling pinger to terminate')
+            pinger.terminate()
+
+            #return_code = pinger.wait(3)  may block!
+            outs, errs = pinger.communicate(timeout=3)
+            for line in outs.splitlines():
+                log.info('left-over output: {0}'.format(line))
+            for line in errs.splitlines():
+                log.info('left-over output: {0}'.format(line))
+
+            return_code = pinger.returncode   # may still be None
+            log.info('pinger terminated with return code {0}'.format(return_code))
+        except subprocess.TimeoutExpired as te:
+            log.warning('Timeout expired waiting for pinger to terminate: {0}'.format(te))
+            return_code = 253
+        except:
+            log.exception('trouble terminating pinger', exc_info=sys.exc_info())
+
+        if return_code is None:
+            for line in line_iter:
+                log.info('left-over output: {0}'.format(line))
+
+            # kill it
+            try:
+                log.info('killing pinger the hard way')
+                pinger.kill()
+                outs, errs = pinger.communicate(timeout=3)
+                for line in outs.splitlines():
+                    log.info('left-over output: {0}'.format(line))
+                for line in errs.splitlines():
+                    log.info('left-over output: {0}'.format(line))
+
+            except:
+                log.exception('trouble killing pinger', exc_info=sys.exc_info())
+
+        if tester is not None:
+            tester.print_failures(log)
+    #end: if have pinger
+
+    # reset original signal handler
+    #signal.signal(signal.SIGCHLD, signal.SIG_DFL)
+
+    return return_code
+#end: cleanup
+
+
 def main():
     """ parses args, create a Pinger and Parser and Tester and runs them; cleans up
     """
@@ -621,14 +701,18 @@ def main():
     log, log_file = setup_logging(opts)
 
     pinger = None
-    return_code = None
+    return_code = 255
     tester = None
+    line_iter = None
     try:
-        config_file = create_conf_file(opts.output_prefix)
+        config_file, n_hosts = create_conf_file(opts.output_prefix)
         log.info('created config file {0}'.format(config_file))
 
+        #signal.signal(signal.SIGCHLD, \
+        #        lambda signum, curr_frame: handle_sigchild(log, pinger, tester, signum, curr_frame))
+
         # pingcheck subprocess
-        pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', 'DEBUG', \
+        pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', opts.pinger_log_level, \
                                      '--log-output', 'TERMINAL', '--config-file', config_file], \
                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE, \
                                    universal_newlines=True)
@@ -636,54 +720,24 @@ def main():
 
         parser = ProcOutputParser(pinger.stdout, pinger.stderr)
         line_iter = iter(parser)
-        tester = Tester(line_iter, opts.tell_connd_binary, opts.max_fails)
+        tester = Tester(line_iter, opts.tell_connd_binary, max(10, opts.max_fails_per_host*n_hosts))
+
         tester.test()
     except KeyboardInterrupt:
         log.info('stopped by user')
     except:
+        log.exception('Caught an exception in main loop', exc_info=sys.exc_info())
+        log.warning('Re-raise')
         raise
     finally:
-        if pinger is None:
-            log.warn('Error before pinger was created')
-            return_code = 254
-        else:
-            log.info("Testing ended")
-            if tester is not None:
-                tester.print_failures(log)
+        return_code = cleanup(log, pinger, tester, line_iter)
 
-            try:
-                log.info('telling pinger to terminate')
-                pinger.terminate()
-                #return_code = pinger.wait(3)  may block!
-                outs, errs = pinger.communicate(timeout=3)
-                return_code = pinger.returncode   # may still be None
-                log.info('terminated with return code {0}'.format(return_code))
-            except subprocess.TimeoutExpired as te:
-                log.warn('Timeout expired waiting for pinger to terminate: {0}'.format(te))
-                return_code = 255  # a.k.a. -1
-            except:
-                log.warn('trouble terminating pinger:')
-                log.formatException(sys.exc_info())
-
-            if return_code is None:
-                # kill it
-                try:
-                    log.info('killing pinger the hard way')
-                    pinger.kill()
-                    outs, errs = pinger.communicate(timeout=3)
-                except:
-                    log.warn('trouble killing pinger:')
-                    log.formatException(sys.exc_info())
-        #end: if have pinger
+        log.info('Will stop now')
+        if log_file is not None:
+            log.info('might find more log output in {0}'.format(log_file))
+        log.info('-' * 72)
     #end: global clean-up
 
-    if tester is not None:
-        tester.print_failures(log)
-
-    log.info('Will stop now')
-    if log_file is not None:
-        log.info('might find more log output in {0}'.format(log_file))
-
     return return_code
 
 #end: function main