added a test python script that goes online/offline and checks if pingers pinged...
authorChristian Herdtweck <christian.herdtweck@intra2net.com>
Thu, 15 Jan 2015 17:37:20 +0000 (18:37 +0100)
committerChristian Herdtweck <christian.herdtweck@intra2net.com>
Thu, 15 Jan 2015 17:37:20 +0000 (18:37 +0100)
test/long_term_test.py [new file with mode: 0755]

diff --git a/test/long_term_test.py b/test/long_term_test.py
new file mode 100755 (executable)
index 0000000..06415fe
--- /dev/null
@@ -0,0 +1,509 @@
+#!/usr/bin/env python3
+
+""" A more long-term testing of pingcheck
+
+Runs pingcheck parsing its log output; goes online and offline repeatedly to see if
+  pinging stays correct
+
+see also: connd/stresstest_pingcheck.py
+
+Christian Herdtweck, Intra2net, January 2015
+(c) Intra2net AG 2015
+"""
+
+# Version History
+# 15/01/15 Christian Herdtweck: started creation
+
+import subprocess
+from traceback import print_exc
+import select
+from re import match as regexp
+from time import sleep
+from datetime import timedelta, datetime as dt
+from warnings import warn
+from random import randint
+
+########################################################################
+### CONSTANTS
+########################################################################
+
+version = '1.0'
+#default_pingcheck_bin  = '/usr/intranator/bin/pingcheck'
+default_pingcheck_bin  = '/root/git/pingcheck/build/src/pingcheck'
+default_tell_connd_bin = '/usr/intranator/bin/tell-connd'
+default_config_file = 'test.conf'
+
+LEVEL_DEBUG  = 'dbug'
+LEVEL_INFO   = 'info'
+LEVEL_WARN   = 'warn'
+LEVEL_NOTICE = 'note'
+LEVEL_ERROR  = 'err '
+
+text_to_level = {}
+text_to_level['INFO'] = LEVEL_INFO
+text_to_level['DBUG'] = LEVEL_DEBUG
+text_to_level['WARN'] = LEVEL_WARN
+text_to_level['NOTE'] = LEVEL_NOTICE
+text_to_level['ERR']  = LEVEL_ERROR
+
+MIN_DURATION_SAME_STATE = 10   # [seconds]
+MAX_DURATION_SAME_STATE = 50   # [seconds]
+
+########################################################################
+### TESTER
+########################################################################
+
+class ParserError(Exception):
+    def __init__(self, line):
+        super(ParserError, self).__init__(\
+                'ParserError: line "{0}" did not match!'.format(line))
+
+class ParserError_NotUnique(Exception):
+    def __init__(self, line):
+        super(ParserError_NotUnique, self).__init__(\
+                'ParserError: no unique match for line "{0}"!'.format(line))
+
+class PingProc(subprocess.Popen):
+   """ wrapper around pingcheck binary """
+
+   def __init__(self, binary, conf_file):
+      super(PingProc, self).__init__(\
+              [binary, '--log-level', 'INFO', '--log-output', 'TERMINAL', '--config-file', conf_file], \
+              stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
+      print('started pingcheck with PID {0}'.format(self.pid))
+
+#end: PingProc
+
+
+class ProcOutputParser:
+
+    _out = None
+    _err = None
+    _poller = None
+    _flags = select.POLLIN | select.POLLPRI
+    _regexp = '(.+)\s+(INFO|ERR|WARN|DBUG|NOTE)\s+pingcheck\[\d+\]: (.+)\s*'
+    _out_fileno = None
+    _err_fileno = None
+
+    SRC_OUT = 'stdout'
+    SRC_ERR = 'stderr'
+    SRC_UNK = 'unknown'
+
+    def __init__(self, pipe_out, pipe_err):
+        self._out = pipe_out
+        self._err = pipe_err
+        self._poller = select.poll()
+        self._poller.register(pipe_out, self._flags)
+        self._poller.register(pipe_err, self._flags)
+        self._out_fileno = pipe_out.fileno()
+        self._err_fileno = pipe_err.fileno()
+
+    def _read(self):
+        while True:
+            result = self._poller.poll(10)
+            for file_no, flags in result:
+                if not (flags & self._flags):
+                    raise NotImplementedError('unexpected flags from polling: {0} --> {1}!'.format(\
+                            flags, self.flags_to_str(flags)))
+
+                if file_no == self._out_fileno:
+                    src = self.SRC_OUT
+                    line = self._out.readline()
+                elif file_no == self._err_fileno:
+                    src = self.SRC_ERR
+                    line = self._err.readline()
+                else:
+                    src = self.SRC_UNK
+                    line = ''
+
+                if not line:
+                    continue
+
+                yield src, dt.now(), line.strip()
+    #end: ProcOutputParser._read
+
+    def _parse_line(self, line):
+        match = regexp(self._regexp, line)
+        if match is None:
+            raise ParserError(line)
+        text_time, text_level, message = match.groups()
+        try:
+            return text_time, text_to_level[text_level], message
+        except KeyError:
+            warn('unknown level: {0}'.format(text_level))
+            return text_time, text_level, message
+
+    def parse(self):
+        for src, timestamp, line in self._read():
+            if src == self.SRC_ERR:
+                yield src, timestamp, self._parse_line(line)
+            elif src == self.SRC_OUT and regexp("RECV BUF \(\s*\d+\):[\d \|]+", line):
+                pass
+            else:
+                warn('Unexpected line and/or source: {0}, "{1}"'.format(src, line))
+
+    def __iter__(self):
+        return self.parse()
+
+    @staticmethod
+    def flags_to_str(flags):
+        text_parts = []
+        if flags & select.POLLIN:
+            text_parts.append('IN')
+        if flags & select.POLLOUT:
+            text_parts.append('OUT')
+        if flags & select.POLLPRI:
+            text_parts.append('PRI')
+        if flags & select.POLLERR:
+            text_parts.append('ERR')
+        if flags & select.POLLHUP:
+            text_parts.append('HUP')
+        if flags & select.POLLNVAL:
+            text_parts.append('INVALID')
+        return ','.join(text_parts)
+
+#end: class ProcOutputParser
+
+class Tester:
+    """ """
+
+    _line_iter = None
+    _last_request = None
+    _last_reply = None
+    _n_requests = None
+    _n_replies = None
+    _next_state_change = None
+    _tell_connd_bin = None
+    _prev_state_changes = None
+    _fails = None
+
+    def __init__(self, line_iter, tell_connd_bin):
+        self._line_iter = line_iter
+        self._last_request = {}
+        self._last_reply = {}
+        self._n_requests = {}
+        self._n_replies = {}
+        self._tell_connd_bin = tell_connd_bin
+        self._prev_state_changes = []
+        self._fails = []
+
+
+    def test(self):
+        self._handle_change_state()
+
+        for src, timestamp, (text_time, level, message) in self._line_iter:
+            # always print warnings and errors
+            if level in (LEVEL_WARN, LEVEL_ERROR):
+                if message.endswith('could not be resolved.'):
+                    continue
+                if message.endswith('fail sending ping data. Network is unreachable'):
+                    continue
+                else:
+                    self._print('{0}: {1}'.format(level, message))
+                    continue
+            if self._handle_new_ip(message):
+                continue
+            if self._handle_request(message):
+                continue
+            if self._handle_reply(message):
+                continue
+            if self._handle_change_state():
+                continue
+
+            #self._print('ignoring message "{0}"'.format(message))
+        #end: for all lines from log
+    #end: Tester.test
+
+
+    def _handle_new_ip(self, message):
+        matches = regexp('- (\d+\.\d+\.\d+\.\d+) \[\d+s\]', message)
+        if matches is None:
+            matches = regexp('Host is already an IP: (\d+\.\d+\.\d+\.\d+)', message)
+        if matches:
+            ip = matches.groups()[0]
+            self._print('remember new IP {0}'.format(ip))
+            self._last_request[ip] = dt.now()
+            self._last_reply[ip] = dt.now()
+            self._n_requests[ip] = 0
+            self._n_replies[ip] = 0
+            return True
+        return False
+
+    def _handle_request(self, message):
+        matches = regexp('(\d+\.\d+\.\d+\.\d+): sending ping', message)
+        if matches is not None:
+            ip = matches.groups()[0]
+            self._print('pinging IP {0}'.format(ip))
+            self._last_request[ip] = dt.now()
+            self._n_requests[ip] += 1
+            return True
+        return False
+
+    def _handle_reply(self, message):
+        matches = regexp('(\d+\.\d+\.\d+\.\d+): Received reply', message)
+        if matches is not None:
+            ip = matches.groups()[0]
+            self._print('received reply from {0}'.format(ip))
+            self._last_reply[ip] = dt.now()
+            self._n_replies[ip] += 1
+            return True
+        return False
+
+    def _handle_change_state(self):
+        if self._next_state_change is None:
+            # called at start of test()
+            n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)
+            self._next_state_change = dt.now() + timedelta(seconds=n_secs)
+            current_state = self._is_online()
+            if current_state is not None:
+                self._print('Start with state online = {0}'.format(current_state))
+                self._prev_state_changes.append((current_state, dt.now()))
+            self._print('schedule next state change for {0:%H:%M:%S} (in {1}s)'.format(\
+                    self._next_state_change, n_secs))
+
+        elif dt.now() > self._next_state_change:
+
+            # check if there were too many fails; if so, stop
+            if len(self._fails) > 100:
+                self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
+                raise ValueError("Too many failures/warnings!")
+
+            is_online = self._is_online()
+            if is_online is None:
+                return False   # try again later
+            elif is_online:
+                self._ensure_all_pinged()
+                new_val = False
+            else:
+                new_val = True
+            if self._set_online(new_val):
+                self._prev_state_changes.append((new_val, dt.now()))
+                self._next_state_change = dt.now() + timedelta(\
+                        seconds=randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE))
+                self._print('schedule next state change for {0:%H:%M:%S}'.format(self._next_state_change))
+                return True
+            # if call to tell_connd fails, try again later
+        return False
+    #end: _handle_change_state
+
+
+    def _is_online(self):
+        """ determine state from tell-connd --status
+
+        to get more info, see State.get_real_state in connd/unittests/stresstest_pingcheck.py
+        """
+        output = ''
+        try:
+            output = subprocess.check_output([self._tell_connd_bin, '--status'], \
+                                             universal_newlines=True)
+        except subprocess.CalledProcessError as cpe: # non-zero exit status
+            text = 'Non-zero exit status asking tell-connd for its --status: {0}'.format(cpe.returncode)
+            warn(text)
+            self._fails.append(("WARN", dt.now(), text))
+            return None
+        except subprocess.TimeoutExpired as te:
+            text = 'Timeout expired asking tell-connd for its --status. Output was {0}'.format(te.output)
+            warn(text)
+            self._fails.append(("WARN", dt.now(), text))
+            return None
+        except:
+            self._print('Trouble asking tell-connd for its --status:')
+            print_exc()
+            return None
+
+        line_iter = output.splitlines()
+        online_mode = None
+        for line in line_iter:
+            matches = regexp('online mode\s*:\s*(.+)$', line.strip())
+            if matches:
+                matches = matches.groups()
+                if len(matches) != 1:
+                    raise ValueError("Could not interpret online mode from \"{0}\"".format(line))
+                else:
+                    online_mode = matches[0]
+                    break
+        if online_mode is None:
+            text = 'Could not determine online mode of connd! Is connd running?!'
+            warn(text)
+            self._fails.append(("WARN", dt.now(), text))
+            return None
+        elif online_mode in ('always online', ):
+            return True
+        elif online_mode in ('offline', 'always offline', ):
+            return False
+        else:
+            text = 'Unknown online mode: {0}'.format(online_mode)
+            warn(text)
+            self._fails.append(("WARN", dt.now(), text))
+            return None
+    #end: Tester._is_online
+
+
+
+    def _set_online(self, new_val):
+        """ run tell-connd --online/offline P1 """
+        if new_val:
+            self._print('going online')
+            args = [self._tell_connd_bin, '--online', 'P1']
+        else:
+            self._print('going offline')
+            args = [self._tell_connd_bin, '--offline']
+        try:
+            return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1)
+            if return_code == 0:
+                return True
+            else:
+                self._print('tell-connd returned {0} --> online state probably unchanged!'.format(return_code))
+                return False
+        except subprocess.TimeoutExpired as te:
+            text = 'Timeout expired running {0}. Output was {1}'.format(args, te.output)
+            warn(text)
+            self._fails.append(("WARN", dt.now(), text))
+            return False
+        except:
+            self._print('Trouble calling tell-connd:')
+            print_exc()
+            return False
+    #end: Tester._set_online
+
+
+    def _ensure_all_pinged(self):
+        """ called before going offline; check if all hosts were successfully pinged """
+        # determine when we last went online
+        time_went_online = None
+        time_go_offline = dt.now()
+
+        for went_online, change_time in reversed(self._prev_state_changes):
+            if went_online:
+                time_went_online = change_time
+                break
+        if time_went_online is None:
+            # never been online --> nothing to check yet
+            return
+
+        self._print('checking whether all hosts were pinged between {0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
+                time_went_online, time_go_offline))
+        for ip, reply_time in self._last_reply.items():
+            if reply_time < time_went_online:
+                text = 'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
+                        ip, time_went_online, time_go_offline)
+                warn(text)
+                self._fails.append((ip, time_went_online, time_go_offline))
+
+    #end: Tester._ensure_all_pinged
+
+
+    def _print(self, message):
+        now = dt.now()
+        if self._next_state_change is not None:
+            print('{0:%H:%M:%S};next{1:+5.1f}s;{2} fails: {3}'.format(now, \
+                    (now - self._next_state_change).total_seconds(), len(self._fails), message))
+        else:
+            print('{0:%H:%M:%S};next-?????;{1} fails: {2}'.format(now, len(self._fails), message))
+
+
+    def print_failures(self):
+        if len(self._fails) == 0:
+            print('No fails! :-)')
+        else:
+            print('Recorded the following failures and warnings:')
+            for ip, time_went_online, time_go_offline in self._fails:
+                if ip == 'WARN':
+                    print('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline))
+                else:
+                    print('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3}s)'.format(\
+                            ip, time_went_online, time_go_offline, \
+                            (time_go_offline-time_went_online).total_seconds()))
+#end: class Tester
+
+########################################################################
+### MAIN
+########################################################################
+
+def parse_args():
+    """ parse command line arguments and set defaults """
+    from argparse import ArgumentParser
+    parser = ArgumentParser(description="Long-term testing of pingcheck: goes online and offline " \
+                                      + "using tell-connd and checks log for correct output", \
+                            epilog="(c) Christian Herdtweck, Intra2net AG, 2015")
+
+    # binaries
+    parser.add_argument('--pingcheck_binary', '-p', metavar='PATH', \
+            default=default_pingcheck_bin, \
+            help='path to pingcheck binary')
+    parser.add_argument('--tell_connd_binary', '-t', metavar='PATH', \
+            default=default_tell_connd_bin, \
+            help='path to tell-connd binary')
+
+    parser.add_argument('--config_file', '-c', metavar='PATH', \
+            default=default_config_file, \
+            help='path to config file for pingcheck')
+
+    # default
+    parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version))
+    return parser.parse_args()
+#end: parse_args
+
+
+def main():
+    """ Main function, called when running file as script; currently raises a NotImplementedError 
+    """
+
+    opts = parse_args()
+
+    pinger = None
+    return_code = None
+    tester = None
+    try:
+        pinger = PingProc(opts.pingcheck_binary, opts.config_file)
+        parser = ProcOutputParser(pinger.stdout, pinger.stderr)
+        line_iter = iter(parser)
+        tester = Tester(line_iter, opts.tell_connd_binary)
+        tester.test()
+    except KeyboardInterrupt:
+        print('stopped by user')
+    except:
+        if tester is not None:
+            tester.print_failures()
+        raise
+    finally:
+        if pinger is None:
+            print('failed to create pinger')
+            return_code = 254
+        else:
+            try:
+                print('telling pinger to terminate')
+                pinger.terminate()
+                return_code = pinger.wait(3)
+                print('terminated with return code {0}'.format(return_code))
+            except subprocess.TimeoutExpired as te:
+                print('Timeout expired waiting for pinger to terminate: {0}'.format(te))
+                return_code = 255  # a.k.a. -1
+            except:
+                print('trouble terminating pinger:')
+                print_exc()
+
+            if return_code is None:
+                # kill it
+                try:
+                    pinger.kill()
+                    outs, errs = pinger.communicate()
+                except:
+                    print('trouble killing pinger:')
+                    print_exc()
+        #end: if have pinger
+    #end: global clean-up
+
+    if tester is not None:
+        tester.print_failures()
+
+    return return_code
+
+#end: function main
+
+
+if __name__ == '__main__':
+    main()
+
+# (created using vim -- the world's best text editor)
+