From 3bd8a5859ba9f318f21a696e517f62d077b2b565 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Wed, 21 Jan 2015 11:43:35 +0100 Subject: [PATCH] done with long_term_test.py * set timeout for ping reply -- very important variable! * try to get more output when pingcheck process is stopped/dying * added and commented out sigchld / child signal handler * different logic how state changes are set * use ConndState * lots of other changes... --- test/long_term_test.py | 330 ++++++++++++++++++++++++++++-------------------- 1 files changed, 192 insertions(+), 138 deletions(-) diff --git a/test/long_term_test.py b/test/long_term_test.py index cac9a44..817e7bd 100755 --- a/test/long_term_test.py +++ b/test/long_term_test.py @@ -13,6 +13,7 @@ Christian Herdtweck, Intra2net, January 2015 # Version History # 15/01/15 Christian Herdtweck: started creation +# 21/1/15 ch: running 3rd long-term test, seems to work nicely now import subprocess import sys @@ -25,6 +26,8 @@ from random import randint import logging from os.path import expanduser, dirname, isdir from os import mkdir, EX_OK +from connd_state import ConndState +#import signal logger_base = logging.getLogger('long_term_test') @@ -70,6 +73,7 @@ hosts-down-limit={hosts_down_limit:d} ping-fail-limit=25 status-notifier-cmd=echo host status is ${status} link-up-interval=2 +ping-reply-timeout=3 """ conf_host = """ [host] @@ -117,11 +121,45 @@ def create_conf_file(file_prefix): for host in host_list: file_handle.write(conf_host.format(host, PING_INTERVAL)) - return file_name + return file_name, n_hosts #end: create_conf_file ######################################################################## +### Exceptions +######################################################################## + +def flags_to_str(flags): + text_parts = [] + if flags & select.POLLIN: + text_parts.append('IN') + if flags & select.POLLOUT: + text_parts.append('OUT') + if flags & select.POLLPRI: + text_parts.append('PRI') + if flags & select.POLLERR: + text_parts.append('ERR') + if flags & select.POLLHUP: + text_parts.append('HUP') + if flags & select.POLLNVAL: + text_parts.append('INVALID') + return ','.join(text_parts) + +class PollFlagsException(Exception): + def __init__(self, flags, proc_name): + super(PollFlagsException, self).__init__('Unexpected flags from polling {0}: {1}'.format(\ + proc_name, flags_to_str(flags))) + +class PollHupException(Exception): + def __init__(self, proc_name): + super(PollHupException, self).__init__(\ + 'Received HUP from polling, {0} was probably killed!'.format(proc_name)) + +class TooManyWarnings(Exception): + def __init__(self): + super(TooManyWarnings, self).__init__('Too many warnings / failures!') + +######################################################################## ### PARSER ######################################################################## @@ -136,7 +174,7 @@ class ProcOutputParser: _out = None _err = None _poller = None - _flags = select.POLLIN | select.POLLPRI + _flags = select.POLLIN | select.POLLPRI | select.POLLHUP # | select.POLLERR _regexp = '(.+)\s+(INFO|ERR|WARN|DBUG|NOTE)\s+pingcheck\[\d+\]: (.+)\s*' _out_fileno = None _err_fileno = None @@ -159,8 +197,10 @@ class ProcOutputParser: result = self._poller.poll(10) for file_no, flags in result: if not (flags & self._flags): - raise NotImplementedError('unexpected flags from polling: {0} --> {1}!'.format(\ - flags, self.flags_to_str(flags))) + raise PollFlagsException(flags, 'pingcheck') + + if flags & select.POLLHUP: + raise PollHupException('pingcheck') if file_no == self._out_fileno: src = self.SRC_OUT @@ -182,30 +222,14 @@ class ProcOutputParser: def parse(self): for src, line in self._read(): try: - yield src, regexp(self._regexp, line).groups() + timestamp, level, message = regexp(self._regexp, line).groups() + yield src, (timestamp, text_to_level[level], message) except: yield src, line def __iter__(self): return self.parse() - @staticmethod - def flags_to_str(flags): - text_parts = [] - if flags & select.POLLIN: - text_parts.append('IN') - if flags & select.POLLOUT: - text_parts.append('OUT') - if flags & select.POLLPRI: - text_parts.append('PRI') - if flags & select.POLLERR: - text_parts.append('ERR') - if flags & select.POLLHUP: - text_parts.append('HUP') - if flags & select.POLLNVAL: - text_parts.append('INVALID') - return ','.join(text_parts) - #end: class ProcOutputParser ######################################################################## @@ -252,6 +276,8 @@ class Tester: sleep(1) current_state = self._is_online() self._log.info('Start with state online = {0}'.format(current_state)) + self._prev_state_changes.append((current_state, dt.now())) + self._schedule_next_state_change() # start looping over output from observed process for src, text_info in self._line_iter: @@ -265,14 +291,18 @@ class Tester: else: text_time, level, message = text_info # first of all: write to text log so nothing is lost - msg = '{0} {1} (orig time {2})'.format(level, message, text_time) + #msg = '{0} {1} (orig time {2})'.format(level, message, text_time) + msg = '{0} {1}'.format(level, message) - if level in (LEVEL_WARN, LEVEL_ERROR): + if level in (LEVEL_WARN, LEVEL_ERROR, LEVEL_NOTICE): # most common warning/error is only logged at debug level if message.endswith('could not be resolved.'): self._log_ping[src].debug(msg) continue - if message.endswith('fail sending ping data. Network is unreachable'): + elif message.endswith('fail sending ping data. Network is unreachable'): + self._log_ping[src].debug(msg) + continue + elif message.endswith('fail sending ping data. Operation not permitted'): self._log_ping[src].debug(msg) continue else: # unexpected errors and warnings from pingcheck are logged at info level @@ -342,15 +372,14 @@ class Tester: else: new_val = True if self._set_online(new_val): - self._schedule_next_state_change(is_online) + self._schedule_next_state_change() return True # if call to tell_connd fails, try again later return False #end: _handle_change_state - def _schedule_next_state_change(self, current_state): - self._prev_state_changes.append((current_state, dt.now())) + def _schedule_next_state_change(self): n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE) self._next_state_change = dt.now() + timedelta(seconds=n_secs) self._log.info('schedule next state change for {0:%H:%M:%S} (in {1:d}s)'.format(\ @@ -359,71 +388,47 @@ class Tester: def _is_online(self): - """ determine state from tell-connd --status - - to get more info, see State.get_real_state in connd/unittests/stresstest_pingcheck.py - """ - output = '' - try: - output = subprocess.check_output([self._tell_connd_bin, '--status'], \ - universal_newlines=True) - except subprocess.CalledProcessError as cpe: # non-zero exit status - self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(cpe.returncode)) - return None - except subprocess.TimeoutExpired as te: - self._warn('Timeout expired asking tell-connd for its --status. Output was {0}'.format(te.output)) - return None - except: - self._info('Trouble asking tell-connd for its --status:') - self._warn(sys.exc_info()) + """ determine state using ConndState """ + state = ConndState.get_state(self._tell_connd_bin) + if not isinstance(state, ConndState): + err_code, output_lines = state + self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(\ + err_code, output_lines[0])) return None - line_iter = output.splitlines() - online_mode = None - for line in line_iter: - matches = regexp('online mode\s*:\s*(.+)$', line.strip()) - if matches: - matches = matches.groups() - if len(matches) != 1: - self._warn('Could not interpret online mode from \"{0}\"'.format(line)) - else: - online_mode = matches[0] - break - if online_mode is None: - self._warn('Could not determine online mode of connd! Is connd running?!') - return None - elif online_mode in ('always online', ): - return True - elif online_mode in ('offline', 'always offline', ): - return False - else: - self._warn('Unknown online mode: {0}'.format(online_mode)) - return None + return len(state.connections) > 0 #end: Tester._is_online - def _set_online(self, new_val): - """ run tell-connd --online/offline P1 """ + """ run tell-connd --online/offline P1 + + returns true if afterwards are in requested state + """ + if new_val == self._is_online(): + self._warn('Want to set online state to current state!') + return True if new_val: self._log.info('going online') args = [self._tell_connd_bin, '--online', 'P1'] else: self._log.info('going offline') args = [self._tell_connd_bin, '--offline'] + try: return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1) if return_code == 0: + self._prev_state_changes.append((new_val, dt.now())) return True else: - self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(return_code)) + self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(\ + return_code)) return False except subprocess.TimeoutExpired as te: self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output)) return False except: - self._log.info('Trouble calling tell-connd:') - self._warn(sys.exc_info()) + self._warn('Trouble calling tell-connd', exc_info=sys.exc_info()) return False #end: Tester._set_online @@ -442,11 +447,13 @@ class Tester: # never been online --> nothing to check yet return - self._log.info('checking whether all hosts were pinged between {0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\ - time_went_online, time_go_offline)) + self._log.info('checking whether all hosts were pinged between ' + + '{0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\ + time_went_online, time_go_offline)) for ip, reply_time in self._last_reply.items(): if reply_time < time_went_online: - self._log.warn('Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\ + self._log.warning(\ + 'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\ ip, time_went_online, time_go_offline)) self._fails.append((ip, time_went_online, time_go_offline)) self._log.info('remembered as failure number {0} / {1}'.format(\ @@ -455,18 +462,15 @@ class Tester: # check if there were too many fails; if so, stop if len(self._fails) > self.max_fails: self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!")) - raise ValueError("Too many failures/warnings!") + raise TooManyWarnings() #end: if not pinged recently #end: for all known ips #end: Tester._ensure_all_pinged - def _warn(self, message): + def _warn(self, message, **kwargs): - if isinstance(message, str): - self._log.warning(message) - else: - self._log.formatException(sys.exc_info()) + self._log.warning(message, **kwargs) self._fails.append(("WARN", dt.now(), message)) self._log.info('remembered as failure number {0} / {1}'.format(\ len(self._fails), self.max_fails)) @@ -474,7 +478,7 @@ class Tester: # check if there were too many fails; if so, stop if len(self._fails) > self.max_fails: self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!")) - raise ValueError("Too many failures/warnings!") + raise TooManyWarnings() #end: Tester._warn @@ -496,9 +500,9 @@ class Tester: if ip == 'WARN': log.info('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline)) else: - log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3}s)'.format(\ - ip, time_went_online, time_go_offline, \ - (time_go_offline-time_went_online).total_seconds())) + log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3:.2f}s)'\ + .format(ip, time_went_online, time_go_offline, \ + (time_go_offline-time_went_online).total_seconds())) #end: class Tester ######################################################################## @@ -513,38 +517,44 @@ def parse_args(): epilog="(c) Christian Herdtweck, Intra2net AG, 2015") # binaries - parser.add_argument('--pingcheck_binary', '-p', metavar='PATH', \ + parser.add_argument('--pingcheck-binary', '-p', metavar='PATH', \ default=default_pingcheck_bin, \ help='path to pingcheck binary') - parser.add_argument('--tell_connd_binary', '-t', metavar='PATH', \ + parser.add_argument('--tell-connd-binary', '-t', metavar='PATH', \ default=default_tell_connd_bin, \ help='path to tell-connd binary') # output - parser.add_argument('--output_prefix', '-o', metavar='PATH', \ + parser.add_argument('--output-prefix', '-o', metavar='PATH', \ default=default_output_prefix, \ help='path and/or file prefix to config and log files; ~=home, 0=prog start datetime; ' \ + 'default="%(default)s"') - parser.add_argument('--log_to_console', '-c', metavar='0|1', \ + parser.add_argument('--log-to-console', '-c', metavar='0|1', \ choices=(0, 1), type=int, default=1, \ help='print all output to console? default=%(default)s') - parser.add_argument('--log_level_console', metavar='LEVEL', \ + parser.add_argument('--log-level-console', metavar='LEVEL', \ type=int, default=logging.INFO, \ - help='log level for console output; debug={0}, info={1}, warn={2}; default: %(default)s'.format(\ + help='log level for console output; ' \ + + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\ logging.DEBUG, logging.INFO, logging.WARN)) - parser.add_argument('--log_to_file', '-f', metavar='0|1', \ + parser.add_argument('--log-to-file', '-f', metavar='0|1', \ choices=(0, 1), type=int, default=1, \ help='print all output to log file? default=%(default)s') - parser.add_argument('--log_level_file', metavar='LEVEL', \ + parser.add_argument('--log-level-file', metavar='LEVEL', \ type=int, default=logging.DEBUG, \ - help='log level for console output; debug={0}, info={1}, warn={2}; default: %(default)s'.format(\ + help='log level for console output; ' \ + + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\ logging.DEBUG, logging.INFO, logging.WARN)) - parser.add_argument('--log_file_suffix', '-l', metavar='SUFF.IX', \ + parser.add_argument('--log-file-suffix', '-l', metavar='SUFF.IX', \ type=str, default='.log', \ help='2nd part of log file name after --output_prefix; default="%(default)s"') + parser.add_argument('--pinger-log-level', metavar='LEVEL', \ + type=str, default='INFO', \ + help='log level given to pingchecker, DEBUG might quickly fill log; default=%(default)s') - parser.add_argument('--max_fails', '-m', metavar='N', type=int, default=100, \ - help='max number of failures/warnings before test will end; default: %(default)s') + + parser.add_argument('--max-fails-per-host', '-m', metavar='N', type=int, default=10, \ + help='max number of failures/warnings per host before test will end; default: %(default)s') parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version)) # parse @@ -608,6 +618,76 @@ def setup_logging(opts): #end: setup_logging +def handle_sigchild(log, pinger, tester, signum, curr_frame): + print("\n!received sigchild!\n") + log.error('Received SIGCHLD') + log.info('with args {0} and {1}'.format(signum, curr_frame)) + + cleanup(log, pinger, tester) + + +def cleanup(log, pinger, tester, line_iter): + """ the stuff from finally clause in main loop; also called from handle_sigchild """ + + return_code = 255 + + log.info('-' * 72) + if pinger is None: + log.warning('Error before pinger was created') + return_code = 254 + else: + log.info("Cleaning up") + + for line in line_iter: + log.info('left-over output: {0}'.format(line)) + + try: + log.info('telling pinger to terminate') + pinger.terminate() + + #return_code = pinger.wait(3) may block! + outs, errs = pinger.communicate(timeout=3) + for line in outs.splitlines(): + log.info('left-over output: {0}'.format(line)) + for line in errs.splitlines(): + log.info('left-over output: {0}'.format(line)) + + return_code = pinger.returncode # may still be None + log.info('pinger terminated with return code {0}'.format(return_code)) + except subprocess.TimeoutExpired as te: + log.warning('Timeout expired waiting for pinger to terminate: {0}'.format(te)) + return_code = 253 + except: + log.exception('trouble terminating pinger', exc_info=sys.exc_info()) + + if return_code is None: + for line in line_iter: + log.info('left-over output: {0}'.format(line)) + + # kill it + try: + log.info('killing pinger the hard way') + pinger.kill() + outs, errs = pinger.communicate(timeout=3) + for line in outs.splitlines(): + log.info('left-over output: {0}'.format(line)) + for line in errs.splitlines(): + log.info('left-over output: {0}'.format(line)) + + except: + log.exception('trouble killing pinger', exc_info=sys.exc_info()) + + if tester is not None: + tester.print_failures(log) + #end: if have pinger + + # reset original signal handler + #signal.signal(signal.SIGCHLD, signal.SIG_DFL) + + return return_code +#end: cleanup + + def main(): """ parses args, create a Pinger and Parser and Tester and runs them; cleans up """ @@ -621,14 +701,18 @@ def main(): log, log_file = setup_logging(opts) pinger = None - return_code = None + return_code = 255 tester = None + line_iter = None try: - config_file = create_conf_file(opts.output_prefix) + config_file, n_hosts = create_conf_file(opts.output_prefix) log.info('created config file {0}'.format(config_file)) + #signal.signal(signal.SIGCHLD, \ + # lambda signum, curr_frame: handle_sigchild(log, pinger, tester, signum, curr_frame)) + # pingcheck subprocess - pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', 'DEBUG', \ + pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', opts.pinger_log_level, \ '--log-output', 'TERMINAL', '--config-file', config_file], \ stdout=subprocess.PIPE, stderr=subprocess.PIPE, \ universal_newlines=True) @@ -636,54 +720,24 @@ def main(): parser = ProcOutputParser(pinger.stdout, pinger.stderr) line_iter = iter(parser) - tester = Tester(line_iter, opts.tell_connd_binary, opts.max_fails) + tester = Tester(line_iter, opts.tell_connd_binary, max(10, opts.max_fails_per_host*n_hosts)) + tester.test() except KeyboardInterrupt: log.info('stopped by user') except: + log.exception('Caught an exception in main loop', exc_info=sys.exc_info()) + log.warning('Re-raise') raise finally: - if pinger is None: - log.warn('Error before pinger was created') - return_code = 254 - else: - log.info("Testing ended") - if tester is not None: - tester.print_failures(log) + return_code = cleanup(log, pinger, tester, line_iter) - try: - log.info('telling pinger to terminate') - pinger.terminate() - #return_code = pinger.wait(3) may block! - outs, errs = pinger.communicate(timeout=3) - return_code = pinger.returncode # may still be None - log.info('terminated with return code {0}'.format(return_code)) - except subprocess.TimeoutExpired as te: - log.warn('Timeout expired waiting for pinger to terminate: {0}'.format(te)) - return_code = 255 # a.k.a. -1 - except: - log.warn('trouble terminating pinger:') - log.formatException(sys.exc_info()) - - if return_code is None: - # kill it - try: - log.info('killing pinger the hard way') - pinger.kill() - outs, errs = pinger.communicate(timeout=3) - except: - log.warn('trouble killing pinger:') - log.formatException(sys.exc_info()) - #end: if have pinger + log.info('Will stop now') + if log_file is not None: + log.info('might find more log output in {0}'.format(log_file)) + log.info('-' * 72) #end: global clean-up - if tester is not None: - tester.print_failures(log) - - log.info('Will stop now') - if log_file is not None: - log.info('might find more log output in {0}'.format(log_file)) - return return_code #end: function main -- 1.7.1