--- /dev/null
+#!/usr/bin/env python3
+
+""" A more long-term testing of pingcheck
+
+Runs pingcheck parsing its log output; goes online and offline repeatedly to see if
+ pinging stays correct
+
+see also: connd/stresstest_pingcheck.py
+
+Christian Herdtweck, Intra2net, January 2015
+(c) Intra2net AG 2015
+"""
+
+# Version History
+# 15/01/15 Christian Herdtweck: started creation
+
+import subprocess
+from traceback import print_exc
+import select
+from re import match as regexp
+from time import sleep
+from datetime import timedelta, datetime as dt
+from warnings import warn
+from random import randint
+
+########################################################################
+### CONSTANTS
+########################################################################
+
+version = '1.0'
+#default_pingcheck_bin = '/usr/intranator/bin/pingcheck'
+default_pingcheck_bin = '/root/git/pingcheck/build/src/pingcheck'
+default_tell_connd_bin = '/usr/intranator/bin/tell-connd'
+default_config_file = 'test.conf'
+
+LEVEL_DEBUG = 'dbug'
+LEVEL_INFO = 'info'
+LEVEL_WARN = 'warn'
+LEVEL_NOTICE = 'note'
+LEVEL_ERROR = 'err '
+
+text_to_level = {}
+text_to_level['INFO'] = LEVEL_INFO
+text_to_level['DBUG'] = LEVEL_DEBUG
+text_to_level['WARN'] = LEVEL_WARN
+text_to_level['NOTE'] = LEVEL_NOTICE
+text_to_level['ERR'] = LEVEL_ERROR
+
+MIN_DURATION_SAME_STATE = 10 # [seconds]
+MAX_DURATION_SAME_STATE = 50 # [seconds]
+
+########################################################################
+### TESTER
+########################################################################
+
+class ParserError(Exception):
+ def __init__(self, line):
+ super(ParserError, self).__init__(\
+ 'ParserError: line "{0}" did not match!'.format(line))
+
+class ParserError_NotUnique(Exception):
+ def __init__(self, line):
+ super(ParserError_NotUnique, self).__init__(\
+ 'ParserError: no unique match for line "{0}"!'.format(line))
+
+class PingProc(subprocess.Popen):
+ """ wrapper around pingcheck binary """
+
+ def __init__(self, binary, conf_file):
+ super(PingProc, self).__init__(\
+ [binary, '--log-level', 'INFO', '--log-output', 'TERMINAL', '--config-file', conf_file], \
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
+ print('started pingcheck with PID {0}'.format(self.pid))
+
+#end: PingProc
+
+
+class ProcOutputParser:
+
+ _out = None
+ _err = None
+ _poller = None
+ _flags = select.POLLIN | select.POLLPRI
+ _regexp = '(.+)\s+(INFO|ERR|WARN|DBUG|NOTE)\s+pingcheck\[\d+\]: (.+)\s*'
+ _out_fileno = None
+ _err_fileno = None
+
+ SRC_OUT = 'stdout'
+ SRC_ERR = 'stderr'
+ SRC_UNK = 'unknown'
+
+ def __init__(self, pipe_out, pipe_err):
+ self._out = pipe_out
+ self._err = pipe_err
+ self._poller = select.poll()
+ self._poller.register(pipe_out, self._flags)
+ self._poller.register(pipe_err, self._flags)
+ self._out_fileno = pipe_out.fileno()
+ self._err_fileno = pipe_err.fileno()
+
+ def _read(self):
+ while True:
+ result = self._poller.poll(10)
+ for file_no, flags in result:
+ if not (flags & self._flags):
+ raise NotImplementedError('unexpected flags from polling: {0} --> {1}!'.format(\
+ flags, self.flags_to_str(flags)))
+
+ if file_no == self._out_fileno:
+ src = self.SRC_OUT
+ line = self._out.readline()
+ elif file_no == self._err_fileno:
+ src = self.SRC_ERR
+ line = self._err.readline()
+ else:
+ src = self.SRC_UNK
+ line = ''
+
+ if not line:
+ continue
+
+ yield src, dt.now(), line.strip()
+ #end: ProcOutputParser._read
+
+ def _parse_line(self, line):
+ match = regexp(self._regexp, line)
+ if match is None:
+ raise ParserError(line)
+ text_time, text_level, message = match.groups()
+ try:
+ return text_time, text_to_level[text_level], message
+ except KeyError:
+ warn('unknown level: {0}'.format(text_level))
+ return text_time, text_level, message
+
+ def parse(self):
+ for src, timestamp, line in self._read():
+ if src == self.SRC_ERR:
+ yield src, timestamp, self._parse_line(line)
+ elif src == self.SRC_OUT and regexp("RECV BUF \(\s*\d+\):[\d \|]+", line):
+ pass
+ else:
+ warn('Unexpected line and/or source: {0}, "{1}"'.format(src, line))
+
+ def __iter__(self):
+ return self.parse()
+
+ @staticmethod
+ def flags_to_str(flags):
+ text_parts = []
+ if flags & select.POLLIN:
+ text_parts.append('IN')
+ if flags & select.POLLOUT:
+ text_parts.append('OUT')
+ if flags & select.POLLPRI:
+ text_parts.append('PRI')
+ if flags & select.POLLERR:
+ text_parts.append('ERR')
+ if flags & select.POLLHUP:
+ text_parts.append('HUP')
+ if flags & select.POLLNVAL:
+ text_parts.append('INVALID')
+ return ','.join(text_parts)
+
+#end: class ProcOutputParser
+
+class Tester:
+ """ """
+
+ _line_iter = None
+ _last_request = None
+ _last_reply = None
+ _n_requests = None
+ _n_replies = None
+ _next_state_change = None
+ _tell_connd_bin = None
+ _prev_state_changes = None
+ _fails = None
+
+ def __init__(self, line_iter, tell_connd_bin):
+ self._line_iter = line_iter
+ self._last_request = {}
+ self._last_reply = {}
+ self._n_requests = {}
+ self._n_replies = {}
+ self._tell_connd_bin = tell_connd_bin
+ self._prev_state_changes = []
+ self._fails = []
+
+
+ def test(self):
+ self._handle_change_state()
+
+ for src, timestamp, (text_time, level, message) in self._line_iter:
+ # always print warnings and errors
+ if level in (LEVEL_WARN, LEVEL_ERROR):
+ if message.endswith('could not be resolved.'):
+ continue
+ if message.endswith('fail sending ping data. Network is unreachable'):
+ continue
+ else:
+ self._print('{0}: {1}'.format(level, message))
+ continue
+ if self._handle_new_ip(message):
+ continue
+ if self._handle_request(message):
+ continue
+ if self._handle_reply(message):
+ continue
+ if self._handle_change_state():
+ continue
+
+ #self._print('ignoring message "{0}"'.format(message))
+ #end: for all lines from log
+ #end: Tester.test
+
+
+ def _handle_new_ip(self, message):
+ matches = regexp('- (\d+\.\d+\.\d+\.\d+) \[\d+s\]', message)
+ if matches is None:
+ matches = regexp('Host is already an IP: (\d+\.\d+\.\d+\.\d+)', message)
+ if matches:
+ ip = matches.groups()[0]
+ self._print('remember new IP {0}'.format(ip))
+ self._last_request[ip] = dt.now()
+ self._last_reply[ip] = dt.now()
+ self._n_requests[ip] = 0
+ self._n_replies[ip] = 0
+ return True
+ return False
+
+ def _handle_request(self, message):
+ matches = regexp('(\d+\.\d+\.\d+\.\d+): sending ping', message)
+ if matches is not None:
+ ip = matches.groups()[0]
+ self._print('pinging IP {0}'.format(ip))
+ self._last_request[ip] = dt.now()
+ self._n_requests[ip] += 1
+ return True
+ return False
+
+ def _handle_reply(self, message):
+ matches = regexp('(\d+\.\d+\.\d+\.\d+): Received reply', message)
+ if matches is not None:
+ ip = matches.groups()[0]
+ self._print('received reply from {0}'.format(ip))
+ self._last_reply[ip] = dt.now()
+ self._n_replies[ip] += 1
+ return True
+ return False
+
+ def _handle_change_state(self):
+ if self._next_state_change is None:
+ # called at start of test()
+ n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)
+ self._next_state_change = dt.now() + timedelta(seconds=n_secs)
+ current_state = self._is_online()
+ if current_state is not None:
+ self._print('Start with state online = {0}'.format(current_state))
+ self._prev_state_changes.append((current_state, dt.now()))
+ self._print('schedule next state change for {0:%H:%M:%S} (in {1}s)'.format(\
+ self._next_state_change, n_secs))
+
+ elif dt.now() > self._next_state_change:
+
+ # check if there were too many fails; if so, stop
+ if len(self._fails) > 100:
+ self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
+ raise ValueError("Too many failures/warnings!")
+
+ is_online = self._is_online()
+ if is_online is None:
+ return False # try again later
+ elif is_online:
+ self._ensure_all_pinged()
+ new_val = False
+ else:
+ new_val = True
+ if self._set_online(new_val):
+ self._prev_state_changes.append((new_val, dt.now()))
+ self._next_state_change = dt.now() + timedelta(\
+ seconds=randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE))
+ self._print('schedule next state change for {0:%H:%M:%S}'.format(self._next_state_change))
+ return True
+ # if call to tell_connd fails, try again later
+ return False
+ #end: _handle_change_state
+
+
+ def _is_online(self):
+ """ determine state from tell-connd --status
+
+ to get more info, see State.get_real_state in connd/unittests/stresstest_pingcheck.py
+ """
+ output = ''
+ try:
+ output = subprocess.check_output([self._tell_connd_bin, '--status'], \
+ universal_newlines=True)
+ except subprocess.CalledProcessError as cpe: # non-zero exit status
+ text = 'Non-zero exit status asking tell-connd for its --status: {0}'.format(cpe.returncode)
+ warn(text)
+ self._fails.append(("WARN", dt.now(), text))
+ return None
+ except subprocess.TimeoutExpired as te:
+ text = 'Timeout expired asking tell-connd for its --status. Output was {0}'.format(te.output)
+ warn(text)
+ self._fails.append(("WARN", dt.now(), text))
+ return None
+ except:
+ self._print('Trouble asking tell-connd for its --status:')
+ print_exc()
+ return None
+
+ line_iter = output.splitlines()
+ online_mode = None
+ for line in line_iter:
+ matches = regexp('online mode\s*:\s*(.+)$', line.strip())
+ if matches:
+ matches = matches.groups()
+ if len(matches) != 1:
+ raise ValueError("Could not interpret online mode from \"{0}\"".format(line))
+ else:
+ online_mode = matches[0]
+ break
+ if online_mode is None:
+ text = 'Could not determine online mode of connd! Is connd running?!'
+ warn(text)
+ self._fails.append(("WARN", dt.now(), text))
+ return None
+ elif online_mode in ('always online', ):
+ return True
+ elif online_mode in ('offline', 'always offline', ):
+ return False
+ else:
+ text = 'Unknown online mode: {0}'.format(online_mode)
+ warn(text)
+ self._fails.append(("WARN", dt.now(), text))
+ return None
+ #end: Tester._is_online
+
+
+
+ def _set_online(self, new_val):
+ """ run tell-connd --online/offline P1 """
+ if new_val:
+ self._print('going online')
+ args = [self._tell_connd_bin, '--online', 'P1']
+ else:
+ self._print('going offline')
+ args = [self._tell_connd_bin, '--offline']
+ try:
+ return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1)
+ if return_code == 0:
+ return True
+ else:
+ self._print('tell-connd returned {0} --> online state probably unchanged!'.format(return_code))
+ return False
+ except subprocess.TimeoutExpired as te:
+ text = 'Timeout expired running {0}. Output was {1}'.format(args, te.output)
+ warn(text)
+ self._fails.append(("WARN", dt.now(), text))
+ return False
+ except:
+ self._print('Trouble calling tell-connd:')
+ print_exc()
+ return False
+ #end: Tester._set_online
+
+
+ def _ensure_all_pinged(self):
+ """ called before going offline; check if all hosts were successfully pinged """
+ # determine when we last went online
+ time_went_online = None
+ time_go_offline = dt.now()
+
+ for went_online, change_time in reversed(self._prev_state_changes):
+ if went_online:
+ time_went_online = change_time
+ break
+ if time_went_online is None:
+ # never been online --> nothing to check yet
+ return
+
+ self._print('checking whether all hosts were pinged between {0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
+ time_went_online, time_go_offline))
+ for ip, reply_time in self._last_reply.items():
+ if reply_time < time_went_online:
+ text = 'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
+ ip, time_went_online, time_go_offline)
+ warn(text)
+ self._fails.append((ip, time_went_online, time_go_offline))
+
+ #end: Tester._ensure_all_pinged
+
+
+ def _print(self, message):
+ now = dt.now()
+ if self._next_state_change is not None:
+ print('{0:%H:%M:%S};next{1:+5.1f}s;{2} fails: {3}'.format(now, \
+ (now - self._next_state_change).total_seconds(), len(self._fails), message))
+ else:
+ print('{0:%H:%M:%S};next-?????;{1} fails: {2}'.format(now, len(self._fails), message))
+
+
+ def print_failures(self):
+ if len(self._fails) == 0:
+ print('No fails! :-)')
+ else:
+ print('Recorded the following failures and warnings:')
+ for ip, time_went_online, time_go_offline in self._fails:
+ if ip == 'WARN':
+ print('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline))
+ else:
+ print('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3}s)'.format(\
+ ip, time_went_online, time_go_offline, \
+ (time_go_offline-time_went_online).total_seconds()))
+#end: class Tester
+
+########################################################################
+### MAIN
+########################################################################
+
+def parse_args():
+ """ parse command line arguments and set defaults """
+ from argparse import ArgumentParser
+ parser = ArgumentParser(description="Long-term testing of pingcheck: goes online and offline " \
+ + "using tell-connd and checks log for correct output", \
+ epilog="(c) Christian Herdtweck, Intra2net AG, 2015")
+
+ # binaries
+ parser.add_argument('--pingcheck_binary', '-p', metavar='PATH', \
+ default=default_pingcheck_bin, \
+ help='path to pingcheck binary')
+ parser.add_argument('--tell_connd_binary', '-t', metavar='PATH', \
+ default=default_tell_connd_bin, \
+ help='path to tell-connd binary')
+
+ parser.add_argument('--config_file', '-c', metavar='PATH', \
+ default=default_config_file, \
+ help='path to config file for pingcheck')
+
+ # default
+ parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version))
+ return parser.parse_args()
+#end: parse_args
+
+
+def main():
+ """ Main function, called when running file as script; currently raises a NotImplementedError
+ """
+
+ opts = parse_args()
+
+ pinger = None
+ return_code = None
+ tester = None
+ try:
+ pinger = PingProc(opts.pingcheck_binary, opts.config_file)
+ parser = ProcOutputParser(pinger.stdout, pinger.stderr)
+ line_iter = iter(parser)
+ tester = Tester(line_iter, opts.tell_connd_binary)
+ tester.test()
+ except KeyboardInterrupt:
+ print('stopped by user')
+ except:
+ if tester is not None:
+ tester.print_failures()
+ raise
+ finally:
+ if pinger is None:
+ print('failed to create pinger')
+ return_code = 254
+ else:
+ try:
+ print('telling pinger to terminate')
+ pinger.terminate()
+ return_code = pinger.wait(3)
+ print('terminated with return code {0}'.format(return_code))
+ except subprocess.TimeoutExpired as te:
+ print('Timeout expired waiting for pinger to terminate: {0}'.format(te))
+ return_code = 255 # a.k.a. -1
+ except:
+ print('trouble terminating pinger:')
+ print_exc()
+
+ if return_code is None:
+ # kill it
+ try:
+ pinger.kill()
+ outs, errs = pinger.communicate()
+ except:
+ print('trouble killing pinger:')
+ print_exc()
+ #end: if have pinger
+ #end: global clean-up
+
+ if tester is not None:
+ tester.print_failures()
+
+ return return_code
+
+#end: function main
+
+
+if __name__ == '__main__':
+ main()
+
+# (created using vim -- the world's best text editor)
+