# Version History
# 15/01/15 Christian Herdtweck: started creation
+# 21/1/15 ch: running 3rd long-term test, seems to work nicely now
import subprocess
import sys
import logging
from os.path import expanduser, dirname, isdir
from os import mkdir, EX_OK
+from connd_state import ConndState
+#import signal
logger_base = logging.getLogger('long_term_test')
ping-fail-limit=25
status-notifier-cmd=echo host status is ${status}
link-up-interval=2
+ping-reply-timeout=3
"""
conf_host = """
[host]
for host in host_list:
file_handle.write(conf_host.format(host, PING_INTERVAL))
- return file_name
+ return file_name, n_hosts
#end: create_conf_file
########################################################################
+### Exceptions
+########################################################################
+
+def flags_to_str(flags):
+ text_parts = []
+ if flags & select.POLLIN:
+ text_parts.append('IN')
+ if flags & select.POLLOUT:
+ text_parts.append('OUT')
+ if flags & select.POLLPRI:
+ text_parts.append('PRI')
+ if flags & select.POLLERR:
+ text_parts.append('ERR')
+ if flags & select.POLLHUP:
+ text_parts.append('HUP')
+ if flags & select.POLLNVAL:
+ text_parts.append('INVALID')
+ return ','.join(text_parts)
+
+class PollFlagsException(Exception):
+ def __init__(self, flags, proc_name):
+ super(PollFlagsException, self).__init__('Unexpected flags from polling {0}: {1}'.format(\
+ proc_name, flags_to_str(flags)))
+
+class PollHupException(Exception):
+ def __init__(self, proc_name):
+ super(PollHupException, self).__init__(\
+ 'Received HUP from polling, {0} was probably killed!'.format(proc_name))
+
+class TooManyWarnings(Exception):
+ def __init__(self):
+ super(TooManyWarnings, self).__init__('Too many warnings / failures!')
+
+########################################################################
### PARSER
########################################################################
_out = None
_err = None
_poller = None
- _flags = select.POLLIN | select.POLLPRI
+ _flags = select.POLLIN | select.POLLPRI | select.POLLHUP # | select.POLLERR
_regexp = '(.+)\s+(INFO|ERR|WARN|DBUG|NOTE)\s+pingcheck\[\d+\]: (.+)\s*'
_out_fileno = None
_err_fileno = None
result = self._poller.poll(10)
for file_no, flags in result:
if not (flags & self._flags):
- raise NotImplementedError('unexpected flags from polling: {0} --> {1}!'.format(\
- flags, self.flags_to_str(flags)))
+ raise PollFlagsException(flags, 'pingcheck')
+
+ if flags & select.POLLHUP:
+ raise PollHupException('pingcheck')
if file_no == self._out_fileno:
src = self.SRC_OUT
def parse(self):
for src, line in self._read():
try:
- yield src, regexp(self._regexp, line).groups()
+ timestamp, level, message = regexp(self._regexp, line).groups()
+ yield src, (timestamp, text_to_level[level], message)
except:
yield src, line
def __iter__(self):
return self.parse()
- @staticmethod
- def flags_to_str(flags):
- text_parts = []
- if flags & select.POLLIN:
- text_parts.append('IN')
- if flags & select.POLLOUT:
- text_parts.append('OUT')
- if flags & select.POLLPRI:
- text_parts.append('PRI')
- if flags & select.POLLERR:
- text_parts.append('ERR')
- if flags & select.POLLHUP:
- text_parts.append('HUP')
- if flags & select.POLLNVAL:
- text_parts.append('INVALID')
- return ','.join(text_parts)
-
#end: class ProcOutputParser
########################################################################
sleep(1)
current_state = self._is_online()
self._log.info('Start with state online = {0}'.format(current_state))
+ self._prev_state_changes.append((current_state, dt.now()))
+ self._schedule_next_state_change()
# start looping over output from observed process
for src, text_info in self._line_iter:
else:
text_time, level, message = text_info
# first of all: write to text log so nothing is lost
- msg = '{0} {1} (orig time {2})'.format(level, message, text_time)
+ #msg = '{0} {1} (orig time {2})'.format(level, message, text_time)
+ msg = '{0} {1}'.format(level, message)
- if level in (LEVEL_WARN, LEVEL_ERROR):
+ if level in (LEVEL_WARN, LEVEL_ERROR, LEVEL_NOTICE):
# most common warning/error is only logged at debug level
if message.endswith('could not be resolved.'):
self._log_ping[src].debug(msg)
continue
- if message.endswith('fail sending ping data. Network is unreachable'):
+ elif message.endswith('fail sending ping data. Network is unreachable'):
+ self._log_ping[src].debug(msg)
+ continue
+ elif message.endswith('fail sending ping data. Operation not permitted'):
self._log_ping[src].debug(msg)
continue
else: # unexpected errors and warnings from pingcheck are logged at info level
else:
new_val = True
if self._set_online(new_val):
- self._schedule_next_state_change(is_online)
+ self._schedule_next_state_change()
return True
# if call to tell_connd fails, try again later
return False
#end: _handle_change_state
- def _schedule_next_state_change(self, current_state):
- self._prev_state_changes.append((current_state, dt.now()))
+ def _schedule_next_state_change(self):
n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)
self._next_state_change = dt.now() + timedelta(seconds=n_secs)
self._log.info('schedule next state change for {0:%H:%M:%S} (in {1:d}s)'.format(\
def _is_online(self):
- """ determine state from tell-connd --status
-
- to get more info, see State.get_real_state in connd/unittests/stresstest_pingcheck.py
- """
- output = ''
- try:
- output = subprocess.check_output([self._tell_connd_bin, '--status'], \
- universal_newlines=True)
- except subprocess.CalledProcessError as cpe: # non-zero exit status
- self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(cpe.returncode))
- return None
- except subprocess.TimeoutExpired as te:
- self._warn('Timeout expired asking tell-connd for its --status. Output was {0}'.format(te.output))
- return None
- except:
- self._info('Trouble asking tell-connd for its --status:')
- self._warn(sys.exc_info())
+ """ determine state using ConndState """
+ state = ConndState.get_state(self._tell_connd_bin)
+ if not isinstance(state, ConndState):
+ err_code, output_lines = state
+ self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(\
+ err_code, output_lines[0]))
return None
- line_iter = output.splitlines()
- online_mode = None
- for line in line_iter:
- matches = regexp('online mode\s*:\s*(.+)$', line.strip())
- if matches:
- matches = matches.groups()
- if len(matches) != 1:
- self._warn('Could not interpret online mode from \"{0}\"'.format(line))
- else:
- online_mode = matches[0]
- break
- if online_mode is None:
- self._warn('Could not determine online mode of connd! Is connd running?!')
- return None
- elif online_mode in ('always online', ):
- return True
- elif online_mode in ('offline', 'always offline', ):
- return False
- else:
- self._warn('Unknown online mode: {0}'.format(online_mode))
- return None
+ return len(state.connections) > 0
#end: Tester._is_online
-
def _set_online(self, new_val):
- """ run tell-connd --online/offline P1 """
+ """ run tell-connd --online/offline P1
+
+ returns true if afterwards are in requested state
+ """
+ if new_val == self._is_online():
+ self._warn('Want to set online state to current state!')
+ return True
if new_val:
self._log.info('going online')
args = [self._tell_connd_bin, '--online', 'P1']
else:
self._log.info('going offline')
args = [self._tell_connd_bin, '--offline']
+
try:
return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1)
if return_code == 0:
+ self._prev_state_changes.append((new_val, dt.now()))
return True
else:
- self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(return_code))
+ self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(\
+ return_code))
return False
except subprocess.TimeoutExpired as te:
self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output))
return False
except:
- self._log.info('Trouble calling tell-connd:')
- self._warn(sys.exc_info())
+ self._warn('Trouble calling tell-connd', exc_info=sys.exc_info())
return False
#end: Tester._set_online
# never been online --> nothing to check yet
return
- self._log.info('checking whether all hosts were pinged between {0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
- time_went_online, time_go_offline))
+ self._log.info('checking whether all hosts were pinged between '
+ + '{0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
+ time_went_online, time_go_offline))
for ip, reply_time in self._last_reply.items():
if reply_time < time_went_online:
- self._log.warn('Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
+ self._log.warning(\
+ 'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
ip, time_went_online, time_go_offline))
self._fails.append((ip, time_went_online, time_go_offline))
self._log.info('remembered as failure number {0} / {1}'.format(\
# check if there were too many fails; if so, stop
if len(self._fails) > self.max_fails:
self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
- raise ValueError("Too many failures/warnings!")
+ raise TooManyWarnings()
#end: if not pinged recently
#end: for all known ips
#end: Tester._ensure_all_pinged
- def _warn(self, message):
+ def _warn(self, message, **kwargs):
- if isinstance(message, str):
- self._log.warning(message)
- else:
- self._log.formatException(sys.exc_info())
+ self._log.warning(message, **kwargs)
self._fails.append(("WARN", dt.now(), message))
self._log.info('remembered as failure number {0} / {1}'.format(\
len(self._fails), self.max_fails))
# check if there were too many fails; if so, stop
if len(self._fails) > self.max_fails:
self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
- raise ValueError("Too many failures/warnings!")
+ raise TooManyWarnings()
#end: Tester._warn
if ip == 'WARN':
log.info('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline))
else:
- log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3}s)'.format(\
- ip, time_went_online, time_go_offline, \
- (time_go_offline-time_went_online).total_seconds()))
+ log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3:.2f}s)'\
+ .format(ip, time_went_online, time_go_offline, \
+ (time_go_offline-time_went_online).total_seconds()))
#end: class Tester
########################################################################
epilog="(c) Christian Herdtweck, Intra2net AG, 2015")
# binaries
- parser.add_argument('--pingcheck_binary', '-p', metavar='PATH', \
+ parser.add_argument('--pingcheck-binary', '-p', metavar='PATH', \
default=default_pingcheck_bin, \
help='path to pingcheck binary')
- parser.add_argument('--tell_connd_binary', '-t', metavar='PATH', \
+ parser.add_argument('--tell-connd-binary', '-t', metavar='PATH', \
default=default_tell_connd_bin, \
help='path to tell-connd binary')
# output
- parser.add_argument('--output_prefix', '-o', metavar='PATH', \
+ parser.add_argument('--output-prefix', '-o', metavar='PATH', \
default=default_output_prefix, \
help='path and/or file prefix to config and log files; ~=home, 0=prog start datetime; ' \
+ 'default="%(default)s"')
- parser.add_argument('--log_to_console', '-c', metavar='0|1', \
+ parser.add_argument('--log-to-console', '-c', metavar='0|1', \
choices=(0, 1), type=int, default=1, \
help='print all output to console? default=%(default)s')
- parser.add_argument('--log_level_console', metavar='LEVEL', \
+ parser.add_argument('--log-level-console', metavar='LEVEL', \
type=int, default=logging.INFO, \
- help='log level for console output; debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
+ help='log level for console output; ' \
+ + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
logging.DEBUG, logging.INFO, logging.WARN))
- parser.add_argument('--log_to_file', '-f', metavar='0|1', \
+ parser.add_argument('--log-to-file', '-f', metavar='0|1', \
choices=(0, 1), type=int, default=1, \
help='print all output to log file? default=%(default)s')
- parser.add_argument('--log_level_file', metavar='LEVEL', \
+ parser.add_argument('--log-level-file', metavar='LEVEL', \
type=int, default=logging.DEBUG, \
- help='log level for console output; debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
+ help='log level for console output; ' \
+ + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
logging.DEBUG, logging.INFO, logging.WARN))
- parser.add_argument('--log_file_suffix', '-l', metavar='SUFF.IX', \
+ parser.add_argument('--log-file-suffix', '-l', metavar='SUFF.IX', \
type=str, default='.log', \
help='2nd part of log file name after --output_prefix; default="%(default)s"')
+ parser.add_argument('--pinger-log-level', metavar='LEVEL', \
+ type=str, default='INFO', \
+ help='log level given to pingchecker, DEBUG might quickly fill log; default=%(default)s')
- parser.add_argument('--max_fails', '-m', metavar='N', type=int, default=100, \
- help='max number of failures/warnings before test will end; default: %(default)s')
+
+ parser.add_argument('--max-fails-per-host', '-m', metavar='N', type=int, default=10, \
+ help='max number of failures/warnings per host before test will end; default: %(default)s')
parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version))
# parse
#end: setup_logging
+def handle_sigchild(log, pinger, tester, signum, curr_frame):
+ print("\n!received sigchild!\n")
+ log.error('Received SIGCHLD')
+ log.info('with args {0} and {1}'.format(signum, curr_frame))
+
+ cleanup(log, pinger, tester)
+
+
+def cleanup(log, pinger, tester, line_iter):
+ """ the stuff from finally clause in main loop; also called from handle_sigchild """
+
+ return_code = 255
+
+ log.info('-' * 72)
+ if pinger is None:
+ log.warning('Error before pinger was created')
+ return_code = 254
+ else:
+ log.info("Cleaning up")
+
+ for line in line_iter:
+ log.info('left-over output: {0}'.format(line))
+
+ try:
+ log.info('telling pinger to terminate')
+ pinger.terminate()
+
+ #return_code = pinger.wait(3) may block!
+ outs, errs = pinger.communicate(timeout=3)
+ for line in outs.splitlines():
+ log.info('left-over output: {0}'.format(line))
+ for line in errs.splitlines():
+ log.info('left-over output: {0}'.format(line))
+
+ return_code = pinger.returncode # may still be None
+ log.info('pinger terminated with return code {0}'.format(return_code))
+ except subprocess.TimeoutExpired as te:
+ log.warning('Timeout expired waiting for pinger to terminate: {0}'.format(te))
+ return_code = 253
+ except:
+ log.exception('trouble terminating pinger', exc_info=sys.exc_info())
+
+ if return_code is None:
+ for line in line_iter:
+ log.info('left-over output: {0}'.format(line))
+
+ # kill it
+ try:
+ log.info('killing pinger the hard way')
+ pinger.kill()
+ outs, errs = pinger.communicate(timeout=3)
+ for line in outs.splitlines():
+ log.info('left-over output: {0}'.format(line))
+ for line in errs.splitlines():
+ log.info('left-over output: {0}'.format(line))
+
+ except:
+ log.exception('trouble killing pinger', exc_info=sys.exc_info())
+
+ if tester is not None:
+ tester.print_failures(log)
+ #end: if have pinger
+
+ # reset original signal handler
+ #signal.signal(signal.SIGCHLD, signal.SIG_DFL)
+
+ return return_code
+#end: cleanup
+
+
def main():
""" parses args, create a Pinger and Parser and Tester and runs them; cleans up
"""
log, log_file = setup_logging(opts)
pinger = None
- return_code = None
+ return_code = 255
tester = None
+ line_iter = None
try:
- config_file = create_conf_file(opts.output_prefix)
+ config_file, n_hosts = create_conf_file(opts.output_prefix)
log.info('created config file {0}'.format(config_file))
+ #signal.signal(signal.SIGCHLD, \
+ # lambda signum, curr_frame: handle_sigchild(log, pinger, tester, signum, curr_frame))
+
# pingcheck subprocess
- pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', 'DEBUG', \
+ pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', opts.pinger_log_level, \
'--log-output', 'TERMINAL', '--config-file', config_file], \
stdout=subprocess.PIPE, stderr=subprocess.PIPE, \
universal_newlines=True)
parser = ProcOutputParser(pinger.stdout, pinger.stderr)
line_iter = iter(parser)
- tester = Tester(line_iter, opts.tell_connd_binary, opts.max_fails)
+ tester = Tester(line_iter, opts.tell_connd_binary, max(10, opts.max_fails_per_host*n_hosts))
+
tester.test()
except KeyboardInterrupt:
log.info('stopped by user')
except:
+ log.exception('Caught an exception in main loop', exc_info=sys.exc_info())
+ log.warning('Re-raise')
raise
finally:
- if pinger is None:
- log.warn('Error before pinger was created')
- return_code = 254
- else:
- log.info("Testing ended")
- if tester is not None:
- tester.print_failures(log)
+ return_code = cleanup(log, pinger, tester, line_iter)
- try:
- log.info('telling pinger to terminate')
- pinger.terminate()
- #return_code = pinger.wait(3) may block!
- outs, errs = pinger.communicate(timeout=3)
- return_code = pinger.returncode # may still be None
- log.info('terminated with return code {0}'.format(return_code))
- except subprocess.TimeoutExpired as te:
- log.warn('Timeout expired waiting for pinger to terminate: {0}'.format(te))
- return_code = 255 # a.k.a. -1
- except:
- log.warn('trouble terminating pinger:')
- log.formatException(sys.exc_info())
-
- if return_code is None:
- # kill it
- try:
- log.info('killing pinger the hard way')
- pinger.kill()
- outs, errs = pinger.communicate(timeout=3)
- except:
- log.warn('trouble killing pinger:')
- log.formatException(sys.exc_info())
- #end: if have pinger
+ log.info('Will stop now')
+ if log_file is not None:
+ log.info('might find more log output in {0}'.format(log_file))
+ log.info('-' * 72)
#end: global clean-up
- if tester is not None:
- tester.print_failures(log)
-
- log.info('Will stop now')
- if log_file is not None:
- log.info('might find more log output in {0}'.format(log_file))
-
return return_code
#end: function main