From aaf62c142ebba43efa5ad0e6e5e8e605352898e4 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 15 Jan 2015 18:37:20 +0100 Subject: [PATCH] added a test python script that goes online/offline and checks if pingers pinged enough --- test/long_term_test.py | 509 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 509 insertions(+), 0 deletions(-) create mode 100755 test/long_term_test.py diff --git a/test/long_term_test.py b/test/long_term_test.py new file mode 100755 index 0000000..06415fe --- /dev/null +++ b/test/long_term_test.py @@ -0,0 +1,509 @@ +#!/usr/bin/env python3 + +""" A more long-term testing of pingcheck + +Runs pingcheck parsing its log output; goes online and offline repeatedly to see if + pinging stays correct + +see also: connd/stresstest_pingcheck.py + +Christian Herdtweck, Intra2net, January 2015 +(c) Intra2net AG 2015 +""" + +# Version History +# 15/01/15 Christian Herdtweck: started creation + +import subprocess +from traceback import print_exc +import select +from re import match as regexp +from time import sleep +from datetime import timedelta, datetime as dt +from warnings import warn +from random import randint + +######################################################################## +### CONSTANTS +######################################################################## + +version = '1.0' +#default_pingcheck_bin = '/usr/intranator/bin/pingcheck' +default_pingcheck_bin = '/root/git/pingcheck/build/src/pingcheck' +default_tell_connd_bin = '/usr/intranator/bin/tell-connd' +default_config_file = 'test.conf' + +LEVEL_DEBUG = 'dbug' +LEVEL_INFO = 'info' +LEVEL_WARN = 'warn' +LEVEL_NOTICE = 'note' +LEVEL_ERROR = 'err ' + +text_to_level = {} +text_to_level['INFO'] = LEVEL_INFO +text_to_level['DBUG'] = LEVEL_DEBUG +text_to_level['WARN'] = LEVEL_WARN +text_to_level['NOTE'] = LEVEL_NOTICE +text_to_level['ERR'] = LEVEL_ERROR + +MIN_DURATION_SAME_STATE = 10 # [seconds] +MAX_DURATION_SAME_STATE = 50 # [seconds] + +######################################################################## +### TESTER +######################################################################## + +class ParserError(Exception): + def __init__(self, line): + super(ParserError, self).__init__(\ + 'ParserError: line "{0}" did not match!'.format(line)) + +class ParserError_NotUnique(Exception): + def __init__(self, line): + super(ParserError_NotUnique, self).__init__(\ + 'ParserError: no unique match for line "{0}"!'.format(line)) + +class PingProc(subprocess.Popen): + """ wrapper around pingcheck binary """ + + def __init__(self, binary, conf_file): + super(PingProc, self).__init__(\ + [binary, '--log-level', 'INFO', '--log-output', 'TERMINAL', '--config-file', conf_file], \ + stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + print('started pingcheck with PID {0}'.format(self.pid)) + +#end: PingProc + + +class ProcOutputParser: + + _out = None + _err = None + _poller = None + _flags = select.POLLIN | select.POLLPRI + _regexp = '(.+)\s+(INFO|ERR|WARN|DBUG|NOTE)\s+pingcheck\[\d+\]: (.+)\s*' + _out_fileno = None + _err_fileno = None + + SRC_OUT = 'stdout' + SRC_ERR = 'stderr' + SRC_UNK = 'unknown' + + def __init__(self, pipe_out, pipe_err): + self._out = pipe_out + self._err = pipe_err + self._poller = select.poll() + self._poller.register(pipe_out, self._flags) + self._poller.register(pipe_err, self._flags) + self._out_fileno = pipe_out.fileno() + self._err_fileno = pipe_err.fileno() + + def _read(self): + while True: + result = self._poller.poll(10) + for file_no, flags in result: + if not (flags & self._flags): + raise NotImplementedError('unexpected flags from polling: {0} --> {1}!'.format(\ + flags, self.flags_to_str(flags))) + + if file_no == self._out_fileno: + src = self.SRC_OUT + line = self._out.readline() + elif file_no == self._err_fileno: + src = self.SRC_ERR + line = self._err.readline() + else: + src = self.SRC_UNK + line = '' + + if not line: + continue + + yield src, dt.now(), line.strip() + #end: ProcOutputParser._read + + def _parse_line(self, line): + match = regexp(self._regexp, line) + if match is None: + raise ParserError(line) + text_time, text_level, message = match.groups() + try: + return text_time, text_to_level[text_level], message + except KeyError: + warn('unknown level: {0}'.format(text_level)) + return text_time, text_level, message + + def parse(self): + for src, timestamp, line in self._read(): + if src == self.SRC_ERR: + yield src, timestamp, self._parse_line(line) + elif src == self.SRC_OUT and regexp("RECV BUF \(\s*\d+\):[\d \|]+", line): + pass + else: + warn('Unexpected line and/or source: {0}, "{1}"'.format(src, line)) + + def __iter__(self): + return self.parse() + + @staticmethod + def flags_to_str(flags): + text_parts = [] + if flags & select.POLLIN: + text_parts.append('IN') + if flags & select.POLLOUT: + text_parts.append('OUT') + if flags & select.POLLPRI: + text_parts.append('PRI') + if flags & select.POLLERR: + text_parts.append('ERR') + if flags & select.POLLHUP: + text_parts.append('HUP') + if flags & select.POLLNVAL: + text_parts.append('INVALID') + return ','.join(text_parts) + +#end: class ProcOutputParser + +class Tester: + """ """ + + _line_iter = None + _last_request = None + _last_reply = None + _n_requests = None + _n_replies = None + _next_state_change = None + _tell_connd_bin = None + _prev_state_changes = None + _fails = None + + def __init__(self, line_iter, tell_connd_bin): + self._line_iter = line_iter + self._last_request = {} + self._last_reply = {} + self._n_requests = {} + self._n_replies = {} + self._tell_connd_bin = tell_connd_bin + self._prev_state_changes = [] + self._fails = [] + + + def test(self): + self._handle_change_state() + + for src, timestamp, (text_time, level, message) in self._line_iter: + # always print warnings and errors + if level in (LEVEL_WARN, LEVEL_ERROR): + if message.endswith('could not be resolved.'): + continue + if message.endswith('fail sending ping data. Network is unreachable'): + continue + else: + self._print('{0}: {1}'.format(level, message)) + continue + if self._handle_new_ip(message): + continue + if self._handle_request(message): + continue + if self._handle_reply(message): + continue + if self._handle_change_state(): + continue + + #self._print('ignoring message "{0}"'.format(message)) + #end: for all lines from log + #end: Tester.test + + + def _handle_new_ip(self, message): + matches = regexp('- (\d+\.\d+\.\d+\.\d+) \[\d+s\]', message) + if matches is None: + matches = regexp('Host is already an IP: (\d+\.\d+\.\d+\.\d+)', message) + if matches: + ip = matches.groups()[0] + self._print('remember new IP {0}'.format(ip)) + self._last_request[ip] = dt.now() + self._last_reply[ip] = dt.now() + self._n_requests[ip] = 0 + self._n_replies[ip] = 0 + return True + return False + + def _handle_request(self, message): + matches = regexp('(\d+\.\d+\.\d+\.\d+): sending ping', message) + if matches is not None: + ip = matches.groups()[0] + self._print('pinging IP {0}'.format(ip)) + self._last_request[ip] = dt.now() + self._n_requests[ip] += 1 + return True + return False + + def _handle_reply(self, message): + matches = regexp('(\d+\.\d+\.\d+\.\d+): Received reply', message) + if matches is not None: + ip = matches.groups()[0] + self._print('received reply from {0}'.format(ip)) + self._last_reply[ip] = dt.now() + self._n_replies[ip] += 1 + return True + return False + + def _handle_change_state(self): + if self._next_state_change is None: + # called at start of test() + n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE) + self._next_state_change = dt.now() + timedelta(seconds=n_secs) + current_state = self._is_online() + if current_state is not None: + self._print('Start with state online = {0}'.format(current_state)) + self._prev_state_changes.append((current_state, dt.now())) + self._print('schedule next state change for {0:%H:%M:%S} (in {1}s)'.format(\ + self._next_state_change, n_secs)) + + elif dt.now() > self._next_state_change: + + # check if there were too many fails; if so, stop + if len(self._fails) > 100: + self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!")) + raise ValueError("Too many failures/warnings!") + + is_online = self._is_online() + if is_online is None: + return False # try again later + elif is_online: + self._ensure_all_pinged() + new_val = False + else: + new_val = True + if self._set_online(new_val): + self._prev_state_changes.append((new_val, dt.now())) + self._next_state_change = dt.now() + timedelta(\ + seconds=randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)) + self._print('schedule next state change for {0:%H:%M:%S}'.format(self._next_state_change)) + return True + # if call to tell_connd fails, try again later + return False + #end: _handle_change_state + + + def _is_online(self): + """ determine state from tell-connd --status + + to get more info, see State.get_real_state in connd/unittests/stresstest_pingcheck.py + """ + output = '' + try: + output = subprocess.check_output([self._tell_connd_bin, '--status'], \ + universal_newlines=True) + except subprocess.CalledProcessError as cpe: # non-zero exit status + text = 'Non-zero exit status asking tell-connd for its --status: {0}'.format(cpe.returncode) + warn(text) + self._fails.append(("WARN", dt.now(), text)) + return None + except subprocess.TimeoutExpired as te: + text = 'Timeout expired asking tell-connd for its --status. Output was {0}'.format(te.output) + warn(text) + self._fails.append(("WARN", dt.now(), text)) + return None + except: + self._print('Trouble asking tell-connd for its --status:') + print_exc() + return None + + line_iter = output.splitlines() + online_mode = None + for line in line_iter: + matches = regexp('online mode\s*:\s*(.+)$', line.strip()) + if matches: + matches = matches.groups() + if len(matches) != 1: + raise ValueError("Could not interpret online mode from \"{0}\"".format(line)) + else: + online_mode = matches[0] + break + if online_mode is None: + text = 'Could not determine online mode of connd! Is connd running?!' + warn(text) + self._fails.append(("WARN", dt.now(), text)) + return None + elif online_mode in ('always online', ): + return True + elif online_mode in ('offline', 'always offline', ): + return False + else: + text = 'Unknown online mode: {0}'.format(online_mode) + warn(text) + self._fails.append(("WARN", dt.now(), text)) + return None + #end: Tester._is_online + + + + def _set_online(self, new_val): + """ run tell-connd --online/offline P1 """ + if new_val: + self._print('going online') + args = [self._tell_connd_bin, '--online', 'P1'] + else: + self._print('going offline') + args = [self._tell_connd_bin, '--offline'] + try: + return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1) + if return_code == 0: + return True + else: + self._print('tell-connd returned {0} --> online state probably unchanged!'.format(return_code)) + return False + except subprocess.TimeoutExpired as te: + text = 'Timeout expired running {0}. Output was {1}'.format(args, te.output) + warn(text) + self._fails.append(("WARN", dt.now(), text)) + return False + except: + self._print('Trouble calling tell-connd:') + print_exc() + return False + #end: Tester._set_online + + + def _ensure_all_pinged(self): + """ called before going offline; check if all hosts were successfully pinged """ + # determine when we last went online + time_went_online = None + time_go_offline = dt.now() + + for went_online, change_time in reversed(self._prev_state_changes): + if went_online: + time_went_online = change_time + break + if time_went_online is None: + # never been online --> nothing to check yet + return + + self._print('checking whether all hosts were pinged between {0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\ + time_went_online, time_go_offline)) + for ip, reply_time in self._last_reply.items(): + if reply_time < time_went_online: + text = 'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\ + ip, time_went_online, time_go_offline) + warn(text) + self._fails.append((ip, time_went_online, time_go_offline)) + + #end: Tester._ensure_all_pinged + + + def _print(self, message): + now = dt.now() + if self._next_state_change is not None: + print('{0:%H:%M:%S};next{1:+5.1f}s;{2} fails: {3}'.format(now, \ + (now - self._next_state_change).total_seconds(), len(self._fails), message)) + else: + print('{0:%H:%M:%S};next-?????;{1} fails: {2}'.format(now, len(self._fails), message)) + + + def print_failures(self): + if len(self._fails) == 0: + print('No fails! :-)') + else: + print('Recorded the following failures and warnings:') + for ip, time_went_online, time_go_offline in self._fails: + if ip == 'WARN': + print('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline)) + else: + print('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3}s)'.format(\ + ip, time_went_online, time_go_offline, \ + (time_go_offline-time_went_online).total_seconds())) +#end: class Tester + +######################################################################## +### MAIN +######################################################################## + +def parse_args(): + """ parse command line arguments and set defaults """ + from argparse import ArgumentParser + parser = ArgumentParser(description="Long-term testing of pingcheck: goes online and offline " \ + + "using tell-connd and checks log for correct output", \ + epilog="(c) Christian Herdtweck, Intra2net AG, 2015") + + # binaries + parser.add_argument('--pingcheck_binary', '-p', metavar='PATH', \ + default=default_pingcheck_bin, \ + help='path to pingcheck binary') + parser.add_argument('--tell_connd_binary', '-t', metavar='PATH', \ + default=default_tell_connd_bin, \ + help='path to tell-connd binary') + + parser.add_argument('--config_file', '-c', metavar='PATH', \ + default=default_config_file, \ + help='path to config file for pingcheck') + + # default + parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version)) + return parser.parse_args() +#end: parse_args + + +def main(): + """ Main function, called when running file as script; currently raises a NotImplementedError + """ + + opts = parse_args() + + pinger = None + return_code = None + tester = None + try: + pinger = PingProc(opts.pingcheck_binary, opts.config_file) + parser = ProcOutputParser(pinger.stdout, pinger.stderr) + line_iter = iter(parser) + tester = Tester(line_iter, opts.tell_connd_binary) + tester.test() + except KeyboardInterrupt: + print('stopped by user') + except: + if tester is not None: + tester.print_failures() + raise + finally: + if pinger is None: + print('failed to create pinger') + return_code = 254 + else: + try: + print('telling pinger to terminate') + pinger.terminate() + return_code = pinger.wait(3) + print('terminated with return code {0}'.format(return_code)) + except subprocess.TimeoutExpired as te: + print('Timeout expired waiting for pinger to terminate: {0}'.format(te)) + return_code = 255 # a.k.a. -1 + except: + print('trouble terminating pinger:') + print_exc() + + if return_code is None: + # kill it + try: + pinger.kill() + outs, errs = pinger.communicate() + except: + print('trouble killing pinger:') + print_exc() + #end: if have pinger + #end: global clean-up + + if tester is not None: + tester.print_failures() + + return return_code + +#end: function main + + +if __name__ == '__main__': + main() + +# (created using vim -- the world's best text editor) + -- 1.7.1