added a little failsave test into tester; simplified logging of exceptions; recognize...

author Christian Herdtweck <christian.herdtweck@intra2net.com>

Thu, 22 Jan 2015 16:20:39 +0000 (17:20 +0100)

committer Christian Herdtweck <christian.herdtweck@intra2net.com>

Thu, 22 Jan 2015 16:20:39 +0000 (17:20 +0100)
author Christian Herdtweck <christian.herdtweck@intra2net.com>
Thu, 22 Jan 2015 16:20:39 +0000 (17:20 +0100)
committer Christian Herdtweck <christian.herdtweck@intra2net.com>
Thu, 22 Jan 2015 16:20:39 +0000 (17:20 +0100)
diff --git a/test/long_term_test.py b/test/long_term_test.py

index c72db48..989a1ed 100755 (executable)
--- a/test/long_term_test.py
+++ b/test/long_term_test.py
@@ -262,7 +262,7 @@ class Tester:
         self._tell_connd_bin = tell_connd_bin
         self._prev_state_changes = []
         self._fails = []
-        self.max_fails = max_fails
+        self.max_fails = max_fails + 2   # for _test_the_tester
 
         self._log = logger_base.getChild('Tester')
         self._log_ping = {}
@@ -280,10 +280,13 @@ class Tester:
         self._prev_state_changes.append((current_state, dt.now()))
         self._schedule_next_state_change()
 
+        self._test_the_tester()
+
         # start looping over output from observed process
         for src, text_info in self._line_iter:
-            if isinstance(text_info, str):
-                # ProcOutputParser could not parse
+
+            # first of all: write to pinger log so nothing is lost
+            if isinstance(text_info, str):       # ProcOutputParser could not parse
                 msg = 'unparsed: {0}'.format(text_info)
                 if text_info.startswith("RECV BUF ("):
                     self._log_ping[src].debug(msg)
@@ -292,8 +295,6 @@ class Tester:
                 continue   # no need to parse message
             else:
                 text_time, level, message = text_info
-                # first of all: write to text log so nothing is lost
-                #msg = '{0} {1} (orig time {2})'.format(level, message, text_time)
                 msg = '{0} {1}'.format(level, message)
 
                 if level in (LEVEL_WARN, LEVEL_ERROR, LEVEL_NOTICE):
@@ -326,6 +327,14 @@ class Tester:
         #end: for all lines from pingcheck output
     #end: Tester.test
 
+    def _test_the_tester(self):
+        self._warn("this is a test warning without exception information")
+        try:
+            raise NotImplementedError('Test Exception')
+        except:
+            self._warn("this is a test warning including exception information", exception=True)
+        self._log.info('you should have seen 2 failures above -- only be alarmed if not!')
+
 
     def _handle_new_ip(self, message):
         # "Have IP 172.16.1.1 [61963s] for aqua.m.i2n"
@@ -374,20 +383,27 @@ class Tester:
 
 
     def _handle_change_state(self):
-        if dt.now() > self._next_state_change:
-            is_online = self._is_online()
-            if is_online is None:
-                return False   # try again later
-            elif is_online:
-                self._ensure_all_pinged()
-                new_val = False
-            else:
-                new_val = True
-            if self._set_online(new_val):
-                self._schedule_next_state_change()
-                return True
-            # if call to tell_connd fails, try again later
-        return False
+        if dt.now() < self._next_state_change:
+            return False
+
+        is_online = self._is_online()
+        if is_online is None:
+            return False   # try again later
+
+        expect_online = self._prev_state_changes[-1][0]
+        if expect_online != is_online:
+            self._warn('Expect to be online={0} since {1} but are online={2}!'.format(\
+                    expect_online, self._prev_state_changes[-1][1], is_online))
+
+        if is_online:
+            self._ensure_all_pinged()
+            new_val = False
+        else:
+            new_val = True
+        if self._set_online(new_val):
+            self._schedule_next_state_change()
+            return True
+        # if call to tell_connd fails, try again later
     #end: _handle_change_state
 
 
@@ -440,7 +456,7 @@ class Tester:
             self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output))
             return False
         except:
-            self._warn('Trouble calling tell-connd', exc_info=sys.exc_info())
+            self._warn('Trouble calling tell-connd', exception=True)
             return False
     #end: Tester._set_online
 
@@ -480,9 +496,12 @@ class Tester:
     #end: Tester._ensure_all_pinged
 
 
-    def _warn(self, message, **kwargs):
+    def _warn(self, message, exception=False):
 
-        self._log.warning(message, **kwargs)
+        if exception:
+            self._log.exception(message, exc_info=True)
+        else:
+            self._log.warning(message)
         self._fails.append(("WARN", dt.now(), message))
         self._log.info('remembered as failure number {0} / {1}'.format(\
                 len(self._fails), self.max_fails))
@@ -494,18 +513,10 @@ class Tester:
     #end: Tester._warn
 
 
-    #def _print(self, message):
-    #    now = dt.now()
-    #    if self._next_state_change is not None:
-    #        print('{0:%H:%M:%S};next{1:+5.1f}s;{2} fails: {3}'.format(now, \
-    #                (now - self._next_state_change).total_seconds(), len(self._fails), message))
-    #    else:
-    #        print('{0:%H:%M:%S};next-?????;{1} fails: {2}'.format(now, len(self._fails), message))
-
-
     def print_failures(self, log):
         if len(self._fails) == 0:
-            log.info('No fails! :-)')
+            log.info('No warnings nor failures since {0:%d.%m.%y, %H:%M} :-)'.format(\
+                    self._prev_state_changes[0][1]))
         else:
             log.info('Recorded the following failures and warnings:')
             for host_name, time_went_online, time_go_offline in self._fails:
@@ -515,6 +526,111 @@ class Tester:
                     log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3:.2f}s)'\
                             .format(host_name, time_went_online, time_go_offline, \
                                     (time_go_offline-time_went_online).total_seconds()))
+
+    #end: Tester.print_failures
+
+    def save_state_and_history(self, save_file):
+        """ save all other data for later inspection using show_saved_data """
+        try:
+            with open(save_file, 'wb') as file_handle:
+                self._log.info('saving state and history to {0}'.format(save_file))
+                pickle.dump( (self._last_request, self._last_reply, self._n_requests, self._n_replies, \
+                              self._host_for_ip, self._tell_connd_bin, self._prev_state_changes, \
+                              self._fails, self.max_fails, save_file, dt.now()), \
+                            file_handle)
+            return True
+        except Exception as e:
+            self._log.exception("Failed to save!", exc_info=True)
+        return False
+    #end: Tester.save_state_and_history
+
+
+    @staticmethod
+    def show_saved_data(save_file):
+        """ print info on data saved by save_state_and_history """
+
+        # load
+        save_date = None
+        with open(save_file, 'rb') as file_handle:
+            print('loading state and history from {0}'.format(save_file))
+            last_request, last_reply, n_requests, n_replies, host_for_ip, tell_connd_bin, \
+                    state_changes, fails, max_fails, save_file_again, save_date \
+                        = pickle.load(file_handle)
+        if save_date is None:
+            print('Failed to read state and history from {0}'.format(save_file))
+            return 1
+
+        # simple stats
+        first_change = state_changes[0][1]
+        last_change = state_changes[-1][1]
+        print('Performed {0} state changes between {1:%d.%m,%H:%M} and {2:%d.%m,%H:%M}' \
+                .format(len(state_changes), first_change, last_change), end='')
+        print(' ({0} --> one every {1} on average)'.format( \
+                        strftimedelta( last_change - first_change                      ), \
+                        strftimedelta((last_change - first_change) / len(state_changes)) ))
+        print('Used {0} IPs for {1} hosts'.format(len(host_for_ip), len(last_request)))
+        print('Overall, recorded {0} warnings/failures (max={1})'.format(len(fails), max_fails))
+
+        # sort fails into warnings and fail per host and fail per time
+        warnings = []
+        n_test_warnings = 0
+        fail_per_host = {}
+        fail_per_time = {}
+        n_fails = 0
+        for host, time_online, time_offline in fails:
+            if host == 'WARN':
+                # time_online = time, time_offline=message
+                if 'test warning' in time_offline:
+                    n_test_warnings += 1
+                else:
+                    warnings.append((time_online, time_offline))
+            else:
+                n_fails += 1
+                try:
+                    fail_per_host[host].append(time_online)
+                except KeyError:
+                    fail_per_host[host] = [time_online,]
+                duration = time_offline - time_online
+                try:
+                    fail_per_time[time_online, duration].append(host)
+                except KeyError:
+                    fail_per_time[time_online, duration] = [host, ]
+
+        # stats per host
+        for host, n_req in n_requests.items():
+            print(host + ' (', end='')
+            print(','.join( ip for ip, host_for_ip in host_for_ip.items() if host==host_for_ip ), end='')
+            print('): \treplied to {1} of {0} pings ({2:.1f}%); '.format(\
+                    n_req, n_replies[host], 100. * n_replies[host]/n_req), end='')
+            if host in fail_per_host:
+                print('failed to ping at ', end='')
+                print(','.join( '{0:%H:%M},'.format(fail_time) for fail_time in fail_per_host[host] ))
+            else:
+                print('replied in each test interval')
+
+        # stats per time
+        if fail_per_time:
+            print("Ping failures per time (overall {0} fails in {1} intervals):".format(\
+                    n_fails, len(fail_per_time)))
+            for (start, duration), host_list in fail_per_time.items():
+                print('{0:%d.%m,%H:%M} + {1}: '.format(start, strftimedelta(duration)), end='')
+                print(','.join(host_list))
+
+        # warnings
+        if warnings:
+            if n_test_warnings:
+                print("Warnings (not showing {0} test warnings):".format(n_test_warnings))
+            else:
+                print("Warnings:")
+            for time, message in warnings:
+                print("{0:%d.%m,%H:%M}: {1}".format(time, message))
+        elif n_test_warnings:
+            print('Only warnings were {0} for testing'.format(n_test_warnings))
+
+        # done
+        return 0
+    #end: static Tester.show_saved_data
+
 #end: class Tester
 
 ########################################################################
@@ -605,11 +721,11 @@ def setup_logging(opts):
 
     # create handlers and format for base logger
     log_file = None
+    formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s', \
+                                  datefmt='%d.%m,%H:%M:%S')
     if opts.log_to_console:
         handler = logging.StreamHandler(sys.stdout)
         handler.setLevel(opts.log_level_console)
-        formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s', \
-                                      datefmt='%d.%m,%H:%M:%S')
         handler.setFormatter(formatter)
         logger_base.addHandler(handler)
 
@@ -617,8 +733,6 @@ def setup_logging(opts):
         log_file = opts.output_prefix + opts.log_file_suffix
         handler = logging.FileHandler(log_file)
         handler.setLevel(opts.log_level_file)
-        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s', \
-                                      datefmt='%d.%m,%H:%M:%S')
         handler.setFormatter(formatter)
         logger_base.addHandler(handler)
 
@@ -684,7 +798,7 @@ def cleanup(log, pinger, tester, line_iter):
             log.warning('Timeout expired waiting for pinger to terminate: {0}'.format(te))
             return_code = 253
         except:
-            log.exception('trouble terminating pinger', exc_info=sys.exc_info())
+            log.exception('trouble terminating pinger', exc_info=True)
 
         if return_code is None:
             for line_idx, line in enumerate(line_iter):
@@ -704,9 +818,10 @@ def cleanup(log, pinger, tester, line_iter):
                     log.info('left-over output: {0}'.format(line))
 
             except:
-                log.exception('trouble killing pinger', exc_info=sys.exc_info())
+                log.exception('trouble killing pinger', exc_info=True)
 
         if tester is not None:
+            tester.save_state_and_history(save_file)
             tester.print_failures(log)
     #end: if have pinger
 
@@ -758,11 +873,11 @@ def main():
     except KeyboardInterrupt:
         log.info('stopped by user')
     except:
-        log.exception('Caught an exception in main loop', exc_info=sys.exc_info())
+        log.exception('Caught an exception in main loop', exc_info=True)
         log.warning('Re-raise')
         raise
     finally:
-        return_code = cleanup(log, pinger, tester, line_iter)
+        return_code = cleanup(log, pinger, tester, line_iter, save_file)
 
         log.info('Will stop now')
         if log_file is not None:
author	Christian Herdtweck <christian.herdtweck@intra2net.com>
	Thu, 22 Jan 2015 16:20:39 +0000 (17:20 +0100)
committer	Christian Herdtweck <christian.herdtweck@intra2net.com>
	Thu, 22 Jan 2015 16:20:39 +0000 (17:20 +0100)