From: Christian Herdtweck Date: Thu, 22 Jan 2015 16:20:39 +0000 (+0100) Subject: added a little failsave test into tester; simplified logging of exceptions; recognize... X-Git-Url: http://developer.intra2net.com/git/?a=commitdiff_plain;h=86ab1571f4f37cd74647ef4e9e2d5878ce9198c0;p=pingcheck added a little failsave test into tester; simplified logging of exceptions; recognize unexpected online states and warn; removed pre-logging Tester._print --- diff --git a/test/long_term_test.py b/test/long_term_test.py index c72db48..989a1ed 100755 --- a/test/long_term_test.py +++ b/test/long_term_test.py @@ -262,7 +262,7 @@ class Tester: self._tell_connd_bin = tell_connd_bin self._prev_state_changes = [] self._fails = [] - self.max_fails = max_fails + self.max_fails = max_fails + 2 # for _test_the_tester self._log = logger_base.getChild('Tester') self._log_ping = {} @@ -280,10 +280,13 @@ class Tester: self._prev_state_changes.append((current_state, dt.now())) self._schedule_next_state_change() + self._test_the_tester() + # start looping over output from observed process for src, text_info in self._line_iter: - if isinstance(text_info, str): - # ProcOutputParser could not parse + + # first of all: write to pinger log so nothing is lost + if isinstance(text_info, str): # ProcOutputParser could not parse msg = 'unparsed: {0}'.format(text_info) if text_info.startswith("RECV BUF ("): self._log_ping[src].debug(msg) @@ -292,8 +295,6 @@ class Tester: continue # no need to parse message else: text_time, level, message = text_info - # first of all: write to text log so nothing is lost - #msg = '{0} {1} (orig time {2})'.format(level, message, text_time) msg = '{0} {1}'.format(level, message) if level in (LEVEL_WARN, LEVEL_ERROR, LEVEL_NOTICE): @@ -326,6 +327,14 @@ class Tester: #end: for all lines from pingcheck output #end: Tester.test + def _test_the_tester(self): + self._warn("this is a test warning without exception information") + try: + raise NotImplementedError('Test Exception') + except: + self._warn("this is a test warning including exception information", exception=True) + self._log.info('you should have seen 2 failures above -- only be alarmed if not!') + def _handle_new_ip(self, message): # "Have IP 172.16.1.1 [61963s] for aqua.m.i2n" @@ -374,20 +383,27 @@ class Tester: def _handle_change_state(self): - if dt.now() > self._next_state_change: - is_online = self._is_online() - if is_online is None: - return False # try again later - elif is_online: - self._ensure_all_pinged() - new_val = False - else: - new_val = True - if self._set_online(new_val): - self._schedule_next_state_change() - return True - # if call to tell_connd fails, try again later - return False + if dt.now() < self._next_state_change: + return False + + is_online = self._is_online() + if is_online is None: + return False # try again later + + expect_online = self._prev_state_changes[-1][0] + if expect_online != is_online: + self._warn('Expect to be online={0} since {1} but are online={2}!'.format(\ + expect_online, self._prev_state_changes[-1][1], is_online)) + + if is_online: + self._ensure_all_pinged() + new_val = False + else: + new_val = True + if self._set_online(new_val): + self._schedule_next_state_change() + return True + # if call to tell_connd fails, try again later #end: _handle_change_state @@ -440,7 +456,7 @@ class Tester: self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output)) return False except: - self._warn('Trouble calling tell-connd', exc_info=sys.exc_info()) + self._warn('Trouble calling tell-connd', exception=True) return False #end: Tester._set_online @@ -480,9 +496,12 @@ class Tester: #end: Tester._ensure_all_pinged - def _warn(self, message, **kwargs): + def _warn(self, message, exception=False): - self._log.warning(message, **kwargs) + if exception: + self._log.exception(message, exc_info=True) + else: + self._log.warning(message) self._fails.append(("WARN", dt.now(), message)) self._log.info('remembered as failure number {0} / {1}'.format(\ len(self._fails), self.max_fails)) @@ -494,18 +513,10 @@ class Tester: #end: Tester._warn - #def _print(self, message): - # now = dt.now() - # if self._next_state_change is not None: - # print('{0:%H:%M:%S};next{1:+5.1f}s;{2} fails: {3}'.format(now, \ - # (now - self._next_state_change).total_seconds(), len(self._fails), message)) - # else: - # print('{0:%H:%M:%S};next-?????;{1} fails: {2}'.format(now, len(self._fails), message)) - - def print_failures(self, log): if len(self._fails) == 0: - log.info('No fails! :-)') + log.info('No warnings nor failures since {0:%d.%m.%y, %H:%M} :-)'.format(\ + self._prev_state_changes[0][1])) else: log.info('Recorded the following failures and warnings:') for host_name, time_went_online, time_go_offline in self._fails: @@ -515,6 +526,111 @@ class Tester: log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3:.2f}s)'\ .format(host_name, time_went_online, time_go_offline, \ (time_go_offline-time_went_online).total_seconds())) + + #end: Tester.print_failures + + def save_state_and_history(self, save_file): + """ save all other data for later inspection using show_saved_data """ + try: + with open(save_file, 'wb') as file_handle: + self._log.info('saving state and history to {0}'.format(save_file)) + pickle.dump( (self._last_request, self._last_reply, self._n_requests, self._n_replies, \ + self._host_for_ip, self._tell_connd_bin, self._prev_state_changes, \ + self._fails, self.max_fails, save_file, dt.now()), \ + file_handle) + return True + except Exception as e: + self._log.exception("Failed to save!", exc_info=True) + return False + #end: Tester.save_state_and_history + + + @staticmethod + def show_saved_data(save_file): + """ print info on data saved by save_state_and_history """ + + # load + save_date = None + with open(save_file, 'rb') as file_handle: + print('loading state and history from {0}'.format(save_file)) + last_request, last_reply, n_requests, n_replies, host_for_ip, tell_connd_bin, \ + state_changes, fails, max_fails, save_file_again, save_date \ + = pickle.load(file_handle) + if save_date is None: + print('Failed to read state and history from {0}'.format(save_file)) + return 1 + + # simple stats + first_change = state_changes[0][1] + last_change = state_changes[-1][1] + print('Performed {0} state changes between {1:%d.%m,%H:%M} and {2:%d.%m,%H:%M}' \ + .format(len(state_changes), first_change, last_change), end='') + print(' ({0} --> one every {1} on average)'.format( \ + strftimedelta( last_change - first_change ), \ + strftimedelta((last_change - first_change) / len(state_changes)) )) + print('Used {0} IPs for {1} hosts'.format(len(host_for_ip), len(last_request))) + print('Overall, recorded {0} warnings/failures (max={1})'.format(len(fails), max_fails)) + + # sort fails into warnings and fail per host and fail per time + warnings = [] + n_test_warnings = 0 + fail_per_host = {} + fail_per_time = {} + n_fails = 0 + for host, time_online, time_offline in fails: + if host == 'WARN': + # time_online = time, time_offline=message + if 'test warning' in time_offline: + n_test_warnings += 1 + else: + warnings.append((time_online, time_offline)) + else: + n_fails += 1 + try: + fail_per_host[host].append(time_online) + except KeyError: + fail_per_host[host] = [time_online,] + duration = time_offline - time_online + try: + fail_per_time[time_online, duration].append(host) + except KeyError: + fail_per_time[time_online, duration] = [host, ] + + # stats per host + for host, n_req in n_requests.items(): + print(host + ' (', end='') + print(','.join( ip for ip, host_for_ip in host_for_ip.items() if host==host_for_ip ), end='') + print('): \treplied to {1} of {0} pings ({2:.1f}%); '.format(\ + n_req, n_replies[host], 100. * n_replies[host]/n_req), end='') + if host in fail_per_host: + print('failed to ping at ', end='') + print(','.join( '{0:%H:%M},'.format(fail_time) for fail_time in fail_per_host[host] )) + else: + print('replied in each test interval') + + # stats per time + if fail_per_time: + print("Ping failures per time (overall {0} fails in {1} intervals):".format(\ + n_fails, len(fail_per_time))) + for (start, duration), host_list in fail_per_time.items(): + print('{0:%d.%m,%H:%M} + {1}: '.format(start, strftimedelta(duration)), end='') + print(','.join(host_list)) + + # warnings + if warnings: + if n_test_warnings: + print("Warnings (not showing {0} test warnings):".format(n_test_warnings)) + else: + print("Warnings:") + for time, message in warnings: + print("{0:%d.%m,%H:%M}: {1}".format(time, message)) + elif n_test_warnings: + print('Only warnings were {0} for testing'.format(n_test_warnings)) + + # done + return 0 + #end: static Tester.show_saved_data + #end: class Tester ######################################################################## @@ -605,11 +721,11 @@ def setup_logging(opts): # create handlers and format for base logger log_file = None + formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s', \ + datefmt='%d.%m,%H:%M:%S') if opts.log_to_console: handler = logging.StreamHandler(sys.stdout) handler.setLevel(opts.log_level_console) - formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s', \ - datefmt='%d.%m,%H:%M:%S') handler.setFormatter(formatter) logger_base.addHandler(handler) @@ -617,8 +733,6 @@ def setup_logging(opts): log_file = opts.output_prefix + opts.log_file_suffix handler = logging.FileHandler(log_file) handler.setLevel(opts.log_level_file) - formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s', \ - datefmt='%d.%m,%H:%M:%S') handler.setFormatter(formatter) logger_base.addHandler(handler) @@ -684,7 +798,7 @@ def cleanup(log, pinger, tester, line_iter): log.warning('Timeout expired waiting for pinger to terminate: {0}'.format(te)) return_code = 253 except: - log.exception('trouble terminating pinger', exc_info=sys.exc_info()) + log.exception('trouble terminating pinger', exc_info=True) if return_code is None: for line_idx, line in enumerate(line_iter): @@ -704,9 +818,10 @@ def cleanup(log, pinger, tester, line_iter): log.info('left-over output: {0}'.format(line)) except: - log.exception('trouble killing pinger', exc_info=sys.exc_info()) + log.exception('trouble killing pinger', exc_info=True) if tester is not None: + tester.save_state_and_history(save_file) tester.print_failures(log) #end: if have pinger @@ -758,11 +873,11 @@ def main(): except KeyboardInterrupt: log.info('stopped by user') except: - log.exception('Caught an exception in main loop', exc_info=sys.exc_info()) + log.exception('Caught an exception in main loop', exc_info=True) log.warning('Re-raise') raise finally: - return_code = cleanup(log, pinger, tester, line_iter) + return_code = cleanup(log, pinger, tester, line_iter, save_file) log.info('Will stop now') if log_file is not None: