self._tell_connd_bin = tell_connd_bin
self._prev_state_changes = []
self._fails = []
- self.max_fails = max_fails
+ self.max_fails = max_fails + 2 # for _test_the_tester
self._log = logger_base.getChild('Tester')
self._log_ping = {}
self._prev_state_changes.append((current_state, dt.now()))
self._schedule_next_state_change()
+ self._test_the_tester()
+
# start looping over output from observed process
for src, text_info in self._line_iter:
- if isinstance(text_info, str):
- # ProcOutputParser could not parse
+
+ # first of all: write to pinger log so nothing is lost
+ if isinstance(text_info, str): # ProcOutputParser could not parse
msg = 'unparsed: {0}'.format(text_info)
if text_info.startswith("RECV BUF ("):
self._log_ping[src].debug(msg)
continue # no need to parse message
else:
text_time, level, message = text_info
- # first of all: write to text log so nothing is lost
- #msg = '{0} {1} (orig time {2})'.format(level, message, text_time)
msg = '{0} {1}'.format(level, message)
if level in (LEVEL_WARN, LEVEL_ERROR, LEVEL_NOTICE):
#end: for all lines from pingcheck output
#end: Tester.test
+ def _test_the_tester(self):
+ self._warn("this is a test warning without exception information")
+ try:
+ raise NotImplementedError('Test Exception')
+ except:
+ self._warn("this is a test warning including exception information", exception=True)
+ self._log.info('you should have seen 2 failures above -- only be alarmed if not!')
+
def _handle_new_ip(self, message):
# "Have IP 172.16.1.1 [61963s] for aqua.m.i2n"
def _handle_change_state(self):
- if dt.now() > self._next_state_change:
- is_online = self._is_online()
- if is_online is None:
- return False # try again later
- elif is_online:
- self._ensure_all_pinged()
- new_val = False
- else:
- new_val = True
- if self._set_online(new_val):
- self._schedule_next_state_change()
- return True
- # if call to tell_connd fails, try again later
- return False
+ if dt.now() < self._next_state_change:
+ return False
+
+ is_online = self._is_online()
+ if is_online is None:
+ return False # try again later
+
+ expect_online = self._prev_state_changes[-1][0]
+ if expect_online != is_online:
+ self._warn('Expect to be online={0} since {1} but are online={2}!'.format(\
+ expect_online, self._prev_state_changes[-1][1], is_online))
+
+ if is_online:
+ self._ensure_all_pinged()
+ new_val = False
+ else:
+ new_val = True
+ if self._set_online(new_val):
+ self._schedule_next_state_change()
+ return True
+ # if call to tell_connd fails, try again later
#end: _handle_change_state
self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output))
return False
except:
- self._warn('Trouble calling tell-connd', exc_info=sys.exc_info())
+ self._warn('Trouble calling tell-connd', exception=True)
return False
#end: Tester._set_online
#end: Tester._ensure_all_pinged
- def _warn(self, message, **kwargs):
+ def _warn(self, message, exception=False):
- self._log.warning(message, **kwargs)
+ if exception:
+ self._log.exception(message, exc_info=True)
+ else:
+ self._log.warning(message)
self._fails.append(("WARN", dt.now(), message))
self._log.info('remembered as failure number {0} / {1}'.format(\
len(self._fails), self.max_fails))
#end: Tester._warn
- #def _print(self, message):
- # now = dt.now()
- # if self._next_state_change is not None:
- # print('{0:%H:%M:%S};next{1:+5.1f}s;{2} fails: {3}'.format(now, \
- # (now - self._next_state_change).total_seconds(), len(self._fails), message))
- # else:
- # print('{0:%H:%M:%S};next-?????;{1} fails: {2}'.format(now, len(self._fails), message))
-
-
def print_failures(self, log):
if len(self._fails) == 0:
- log.info('No fails! :-)')
+ log.info('No warnings nor failures since {0:%d.%m.%y, %H:%M} :-)'.format(\
+ self._prev_state_changes[0][1]))
else:
log.info('Recorded the following failures and warnings:')
for host_name, time_went_online, time_go_offline in self._fails:
log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3:.2f}s)'\
.format(host_name, time_went_online, time_go_offline, \
(time_go_offline-time_went_online).total_seconds()))
+
+ #end: Tester.print_failures
+
+ def save_state_and_history(self, save_file):
+ """ save all other data for later inspection using show_saved_data """
+ try:
+ with open(save_file, 'wb') as file_handle:
+ self._log.info('saving state and history to {0}'.format(save_file))
+ pickle.dump( (self._last_request, self._last_reply, self._n_requests, self._n_replies, \
+ self._host_for_ip, self._tell_connd_bin, self._prev_state_changes, \
+ self._fails, self.max_fails, save_file, dt.now()), \
+ file_handle)
+ return True
+ except Exception as e:
+ self._log.exception("Failed to save!", exc_info=True)
+ return False
+ #end: Tester.save_state_and_history
+
+
+ @staticmethod
+ def show_saved_data(save_file):
+ """ print info on data saved by save_state_and_history """
+
+ # load
+ save_date = None
+ with open(save_file, 'rb') as file_handle:
+ print('loading state and history from {0}'.format(save_file))
+ last_request, last_reply, n_requests, n_replies, host_for_ip, tell_connd_bin, \
+ state_changes, fails, max_fails, save_file_again, save_date \
+ = pickle.load(file_handle)
+ if save_date is None:
+ print('Failed to read state and history from {0}'.format(save_file))
+ return 1
+
+ # simple stats
+ first_change = state_changes[0][1]
+ last_change = state_changes[-1][1]
+ print('Performed {0} state changes between {1:%d.%m,%H:%M} and {2:%d.%m,%H:%M}' \
+ .format(len(state_changes), first_change, last_change), end='')
+ print(' ({0} --> one every {1} on average)'.format( \
+ strftimedelta( last_change - first_change ), \
+ strftimedelta((last_change - first_change) / len(state_changes)) ))
+ print('Used {0} IPs for {1} hosts'.format(len(host_for_ip), len(last_request)))
+ print('Overall, recorded {0} warnings/failures (max={1})'.format(len(fails), max_fails))
+
+ # sort fails into warnings and fail per host and fail per time
+ warnings = []
+ n_test_warnings = 0
+ fail_per_host = {}
+ fail_per_time = {}
+ n_fails = 0
+ for host, time_online, time_offline in fails:
+ if host == 'WARN':
+ # time_online = time, time_offline=message
+ if 'test warning' in time_offline:
+ n_test_warnings += 1
+ else:
+ warnings.append((time_online, time_offline))
+ else:
+ n_fails += 1
+ try:
+ fail_per_host[host].append(time_online)
+ except KeyError:
+ fail_per_host[host] = [time_online,]
+ duration = time_offline - time_online
+ try:
+ fail_per_time[time_online, duration].append(host)
+ except KeyError:
+ fail_per_time[time_online, duration] = [host, ]
+
+ # stats per host
+ for host, n_req in n_requests.items():
+ print(host + ' (', end='')
+ print(','.join( ip for ip, host_for_ip in host_for_ip.items() if host==host_for_ip ), end='')
+ print('): \treplied to {1} of {0} pings ({2:.1f}%); '.format(\
+ n_req, n_replies[host], 100. * n_replies[host]/n_req), end='')
+ if host in fail_per_host:
+ print('failed to ping at ', end='')
+ print(','.join( '{0:%H:%M},'.format(fail_time) for fail_time in fail_per_host[host] ))
+ else:
+ print('replied in each test interval')
+
+ # stats per time
+ if fail_per_time:
+ print("Ping failures per time (overall {0} fails in {1} intervals):".format(\
+ n_fails, len(fail_per_time)))
+ for (start, duration), host_list in fail_per_time.items():
+ print('{0:%d.%m,%H:%M} + {1}: '.format(start, strftimedelta(duration)), end='')
+ print(','.join(host_list))
+
+ # warnings
+ if warnings:
+ if n_test_warnings:
+ print("Warnings (not showing {0} test warnings):".format(n_test_warnings))
+ else:
+ print("Warnings:")
+ for time, message in warnings:
+ print("{0:%d.%m,%H:%M}: {1}".format(time, message))
+ elif n_test_warnings:
+ print('Only warnings were {0} for testing'.format(n_test_warnings))
+
+ # done
+ return 0
+ #end: static Tester.show_saved_data
+
#end: class Tester
########################################################################
# create handlers and format for base logger
log_file = None
+ formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s', \
+ datefmt='%d.%m,%H:%M:%S')
if opts.log_to_console:
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(opts.log_level_console)
- formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s', \
- datefmt='%d.%m,%H:%M:%S')
handler.setFormatter(formatter)
logger_base.addHandler(handler)
log_file = opts.output_prefix + opts.log_file_suffix
handler = logging.FileHandler(log_file)
handler.setLevel(opts.log_level_file)
- formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s', \
- datefmt='%d.%m,%H:%M:%S')
handler.setFormatter(formatter)
logger_base.addHandler(handler)
log.warning('Timeout expired waiting for pinger to terminate: {0}'.format(te))
return_code = 253
except:
- log.exception('trouble terminating pinger', exc_info=sys.exc_info())
+ log.exception('trouble terminating pinger', exc_info=True)
if return_code is None:
for line_idx, line in enumerate(line_iter):
log.info('left-over output: {0}'.format(line))
except:
- log.exception('trouble killing pinger', exc_info=sys.exc_info())
+ log.exception('trouble killing pinger', exc_info=True)
if tester is not None:
+ tester.save_state_and_history(save_file)
tester.print_failures(log)
#end: if have pinger
except KeyboardInterrupt:
log.info('stopped by user')
except:
- log.exception('Caught an exception in main loop', exc_info=sys.exc_info())
+ log.exception('Caught an exception in main loop', exc_info=True)
log.warning('Re-raise')
raise
finally:
- return_code = cleanup(log, pinger, tester, line_iter)
+ return_code = cleanup(log, pinger, tester, line_iter, save_file)
log.info('Will stop now')
if log_file is not None: