# 15/01/15 Christian Herdtweck: started creation
import subprocess
-from traceback import print_exc
+import sys
import select
from re import match as regexp
from time import sleep
from datetime import timedelta, datetime as dt
from warnings import warn
from random import randint
+import logging
+from os.path import expanduser, dirname, isdir
+from os import mkdir, EX_OK
+
+logger_base = logging.getLogger('long_term_test')
########################################################################
### CONSTANTS
########################################################################
-version = '1.0'
+version = '1.1'
#default_pingcheck_bin = '/usr/intranator/bin/pingcheck'
default_pingcheck_bin = '/root/git/pingcheck/build/src/pingcheck'
default_tell_connd_bin = '/usr/intranator/bin/tell-connd'
-default_config_file = 'test.conf'
+default_output_prefix = './long_term_test/{0:%d%m%y_%H%M}'
+# for parsing log messages
LEVEL_DEBUG = 'dbug'
LEVEL_INFO = 'info'
LEVEL_WARN = 'warn'
text_to_level['NOTE'] = LEVEL_NOTICE
text_to_level['ERR'] = LEVEL_ERROR
+# duration in which to remain online/offline
MIN_DURATION_SAME_STATE = 10 # [seconds]
MAX_DURATION_SAME_STATE = 50 # [seconds]
########################################################################
-### TESTER
+#### for config file
########################################################################
-class ParserError(Exception):
- def __init__(self, line):
- super(ParserError, self).__init__(\
- 'ParserError: line "{0}" did not match!'.format(line))
-
-class ParserError_NotUnique(Exception):
- def __init__(self, line):
- super(ParserError_NotUnique, self).__init__(\
- 'ParserError: no unique match for line "{0}"!'.format(line))
-
-class PingProc(subprocess.Popen):
- """ wrapper around pingcheck binary """
+PING_INTERVAL = 3 # [seconds]
+config_file_pattern = 'long_term_test_{0:%d%m%y_%H%M%S}.conf'
+conf_start = """# config file for pingcheck testing
- def __init__(self, binary, conf_file):
- super(PingProc, self).__init__(\
- [binary, '--log-level', 'INFO', '--log-output', 'TERMINAL', '--config-file', conf_file], \
- stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
- print('started pingcheck with PID {0}'.format(self.pid))
+default-source-network-interface=eth1
+nameserver=127.0.0.1
+hosts-down-limit={hosts_down_limit:d}
+ping-fail-limit=25
+status-notifier-cmd=echo host status is ${status}
+link-up-interval=2
+"""
+conf_host = """
+[host]
+name={0}
+interval={1:d}
+ping-protocol=ICMPv4
+port=80
+source-network-interface=default
+"""
+host_list = (\
+ 'intranator.m.i2n', \
+ 'intraroute.m.i2n', \
+ 'websync.m.i2n', \
+ 'aqua.m.i2n', \
+ 'intratdsl.m.i2n', \
+ 'beholder.m.i2n', \
+ 'eagle.m.i2n', \
+ 'osdist.m.i2n', \
+ 'tokyo.m.i2n', \
+ 'blizzard.m.i2n', \
+ 'phonehelp.m.i2n', \
+ 'virt-dmz.m.i2n', \
+ 'virt01.m.i2n', \
+ 'virt02.m.i2n', \
+ 'tornado.n.i2n', \
+ 'auth.m.i2n', \
+ 'forum.intra2net.com', \
+ 'update.intranator.com', \
+ 'demo-intranator.intra2net.com', \
+ 'intra2net.com', \
+ 'partner.intra2net.com', \
+ 'demo-securitygateway.intra2net.com', \
+ 'demo-networksecurity.intra2net.com', \
+ )
+
+def create_conf_file(file_prefix):
+ """ create config file using constants above """
+
+ file_name = file_prefix + '_config.conf'
+
+ n_hosts = len(host_list)
+
+ with open(file_name, 'wt') as file_handle:
+ file_handle.write(conf_start.format(hosts_down_limit=int(n_hosts * 0.7), status='{status}'))
+ for host in host_list:
+ file_handle.write(conf_host.format(host, PING_INTERVAL))
+
+ return file_name
+#end: create_conf_file
-#end: PingProc
+########################################################################
+### PARSER
+########################################################################
class ProcOutputParser:
+ """ parser for output from pingcheck process via pipes
+
+ uses select and polling
+ should never silently swallow log output, main function parse should yield all lines
+ or emit errors
+ """
_out = None
_err = None
if not line:
continue
+ # if reach here, we have a new line of text
- yield src, dt.now(), line.strip()
+ yield src, line.strip()
#end: ProcOutputParser._read
- def _parse_line(self, line):
- match = regexp(self._regexp, line)
- if match is None:
- raise ParserError(line)
- text_time, text_level, message = match.groups()
- try:
- return text_time, text_to_level[text_level], message
- except KeyError:
- warn('unknown level: {0}'.format(text_level))
- return text_time, text_level, message
-
def parse(self):
- for src, timestamp, line in self._read():
- if src == self.SRC_ERR:
- yield src, timestamp, self._parse_line(line)
- elif src == self.SRC_OUT and regexp("RECV BUF \(\s*\d+\):[\d \|]+", line):
- pass
- else:
- warn('Unexpected line and/or source: {0}, "{1}"'.format(src, line))
+ for src, line in self._read():
+ try:
+ yield src, regexp(self._regexp, line).groups()
+ except:
+ yield src, line
def __iter__(self):
return self.parse()
#end: class ProcOutputParser
+########################################################################
+### TESTER
+########################################################################
+
class Tester:
""" """
_tell_connd_bin = None
_prev_state_changes = None
_fails = None
+ _log = None
+ _log_ping = None
+ max_fails = None
- def __init__(self, line_iter, tell_connd_bin):
+ def __init__(self, line_iter, tell_connd_bin, max_fails):
self._line_iter = line_iter
self._last_request = {}
self._last_reply = {}
self._tell_connd_bin = tell_connd_bin
self._prev_state_changes = []
self._fails = []
+ self.max_fails = max_fails
+
+ self._log = logger_base.getChild('Tester')
+ self._log_ping = {}
+ self._log_ping[ProcOutputParser.SRC_OUT] = logger_base.getChild('pingOut')
+ self._log_ping[ProcOutputParser.SRC_ERR] = logger_base.getChild('pingErr')
def test(self):
- self._handle_change_state()
+ # preparations
+ current_state = self._is_online()
+ while current_state is None:
+ sleep(1)
+ current_state = self._is_online()
+ self._log.info('Start with state online = {0}'.format(current_state))
+
+ # start looping over output from observed process
+ for src, text_info in self._line_iter:
+ if isinstance(text_info, str):
+ # ProcOutputParser could not parse
+ msg = 'unparsed: {0}'.format(text_info)
+ if text_info.startswith("RECV BUF ("):
+ level = LEVEL_DEBUG
+ else:
+ level = LEVEL_WARN
+ else:
+ text_time, level, message = text_info
+ # first of all: write to text log so nothing is lost
+ msg = '{0} {1} (orig time {2})'.format(level, message, text_time)
- for src, timestamp, (text_time, level, message) in self._line_iter:
- # always print warnings and errors
if level in (LEVEL_WARN, LEVEL_ERROR):
+ # most common warning/error is only logged at debug level
if message.endswith('could not be resolved.'):
+ self._log_ping[src].debug(msg)
continue
if message.endswith('fail sending ping data. Network is unreachable'):
+ self._log_ping[src].debug(msg)
continue
- else:
- self._print('{0}: {1}'.format(level, message))
+ else: # unexpected errors and warnings from pingcheck are logged at info level
+ self._log_ping[src].info(msg)
continue
- if self._handle_new_ip(message):
- continue
+ else: # normal messages are logged at debug level
+ self._log_ping[src].debug(msg)
+
+ # now deal with message contents
if self._handle_request(message):
continue
if self._handle_reply(message):
continue
if self._handle_change_state():
continue
-
- #self._print('ignoring message "{0}"'.format(message))
- #end: for all lines from log
+ if self._handle_new_ip(message):
+ continue
+ #end: for all lines from pingcheck output
#end: Tester.test
matches = regexp('Host is already an IP: (\d+\.\d+\.\d+\.\d+)', message)
if matches:
ip = matches.groups()[0]
- self._print('remember new IP {0}'.format(ip))
+ self._log.info('remember new IP {0}'.format(ip))
self._last_request[ip] = dt.now()
self._last_reply[ip] = dt.now()
self._n_requests[ip] = 0
return True
return False
+
def _handle_request(self, message):
matches = regexp('(\d+\.\d+\.\d+\.\d+): sending ping', message)
if matches is not None:
ip = matches.groups()[0]
- self._print('pinging IP {0}'.format(ip))
+ self._log.info('pinging IP {0}'.format(ip))
self._last_request[ip] = dt.now()
self._n_requests[ip] += 1
return True
return False
+
def _handle_reply(self, message):
matches = regexp('(\d+\.\d+\.\d+\.\d+): Received reply', message)
if matches is not None:
ip = matches.groups()[0]
- self._print('received reply from {0}'.format(ip))
+ self._log.info('received reply from {0}'.format(ip))
self._last_reply[ip] = dt.now()
self._n_replies[ip] += 1
return True
return False
- def _handle_change_state(self):
- if self._next_state_change is None:
- # called at start of test()
- n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)
- self._next_state_change = dt.now() + timedelta(seconds=n_secs)
- current_state = self._is_online()
- if current_state is not None:
- self._print('Start with state online = {0}'.format(current_state))
- self._prev_state_changes.append((current_state, dt.now()))
- self._print('schedule next state change for {0:%H:%M:%S} (in {1}s)'.format(\
- self._next_state_change, n_secs))
-
- elif dt.now() > self._next_state_change:
-
- # check if there were too many fails; if so, stop
- if len(self._fails) > 100:
- self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
- raise ValueError("Too many failures/warnings!")
+ def _handle_change_state(self):
+ if dt.now() > self._next_state_change:
is_online = self._is_online()
if is_online is None:
return False # try again later
else:
new_val = True
if self._set_online(new_val):
- self._prev_state_changes.append((new_val, dt.now()))
- self._next_state_change = dt.now() + timedelta(\
- seconds=randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE))
- self._print('schedule next state change for {0:%H:%M:%S}'.format(self._next_state_change))
+ self._schedule_next_state_change(is_online)
return True
# if call to tell_connd fails, try again later
return False
#end: _handle_change_state
+ def _schedule_next_state_change(self, current_state):
+ self._prev_state_changes.append((current_state, dt.now()))
+ n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)
+ self._next_state_change = dt.now() + timedelta(seconds=n_secs)
+ self._log.info('schedule next state change for {0:%H:%M:%S} (in {1:d}s)'.format(\
+ self._next_state_change, n_secs))
+ #end: Tester._schedule_next_state_change
+
+
def _is_online(self):
""" determine state from tell-connd --status
output = subprocess.check_output([self._tell_connd_bin, '--status'], \
universal_newlines=True)
except subprocess.CalledProcessError as cpe: # non-zero exit status
- text = 'Non-zero exit status asking tell-connd for its --status: {0}'.format(cpe.returncode)
- warn(text)
- self._fails.append(("WARN", dt.now(), text))
+ self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(cpe.returncode))
return None
except subprocess.TimeoutExpired as te:
- text = 'Timeout expired asking tell-connd for its --status. Output was {0}'.format(te.output)
- warn(text)
- self._fails.append(("WARN", dt.now(), text))
+ self._warn('Timeout expired asking tell-connd for its --status. Output was {0}'.format(te.output))
return None
except:
- self._print('Trouble asking tell-connd for its --status:')
- print_exc()
+ self._info('Trouble asking tell-connd for its --status:')
+ self._warn(sys.exc_info())
return None
line_iter = output.splitlines()
if matches:
matches = matches.groups()
if len(matches) != 1:
- raise ValueError("Could not interpret online mode from \"{0}\"".format(line))
+ self._warn('Could not interpret online mode from \"{0}\"'.format(line))
else:
online_mode = matches[0]
break
if online_mode is None:
- text = 'Could not determine online mode of connd! Is connd running?!'
- warn(text)
- self._fails.append(("WARN", dt.now(), text))
+ self._warn('Could not determine online mode of connd! Is connd running?!')
return None
elif online_mode in ('always online', ):
return True
elif online_mode in ('offline', 'always offline', ):
return False
else:
- text = 'Unknown online mode: {0}'.format(online_mode)
- warn(text)
- self._fails.append(("WARN", dt.now(), text))
+ self._warn('Unknown online mode: {0}'.format(online_mode))
return None
#end: Tester._is_online
def _set_online(self, new_val):
""" run tell-connd --online/offline P1 """
if new_val:
- self._print('going online')
+ self._log.info('going online')
args = [self._tell_connd_bin, '--online', 'P1']
else:
- self._print('going offline')
+ self._log.info('going offline')
args = [self._tell_connd_bin, '--offline']
try:
return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1)
if return_code == 0:
return True
else:
- self._print('tell-connd returned {0} --> online state probably unchanged!'.format(return_code))
+ self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(return_code))
return False
except subprocess.TimeoutExpired as te:
- text = 'Timeout expired running {0}. Output was {1}'.format(args, te.output)
- warn(text)
- self._fails.append(("WARN", dt.now(), text))
+ self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output))
return False
except:
- self._print('Trouble calling tell-connd:')
- print_exc()
+ self._log.info('Trouble calling tell-connd:')
+ self._warn(sys.exc_info())
return False
#end: Tester._set_online
# never been online --> nothing to check yet
return
- self._print('checking whether all hosts were pinged between {0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
+ self._log.info('checking whether all hosts were pinged between {0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
time_went_online, time_go_offline))
for ip, reply_time in self._last_reply.items():
if reply_time < time_went_online:
- text = 'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
- ip, time_went_online, time_go_offline)
- warn(text)
+ self._log.warn('Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
+ ip, time_went_online, time_go_offline))
self._fails.append((ip, time_went_online, time_go_offline))
-
+ self._log.info('remembered as failure number {0} / {1}'.format(\
+ len(self._fails), self.max_fails))
+
+ # check if there were too many fails; if so, stop
+ if len(self._fails) > self.max_fails:
+ self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
+ raise ValueError("Too many failures/warnings!")
+ #end: if not pinged recently
+ #end: for all known ips
#end: Tester._ensure_all_pinged
- def _print(self, message):
- now = dt.now()
- if self._next_state_change is not None:
- print('{0:%H:%M:%S};next{1:+5.1f}s;{2} fails: {3}'.format(now, \
- (now - self._next_state_change).total_seconds(), len(self._fails), message))
+ def _warn(self, message):
+
+ if isinstance(message, str):
+ self._log.warning(message)
else:
- print('{0:%H:%M:%S};next-?????;{1} fails: {2}'.format(now, len(self._fails), message))
+ self._log.formatException(sys.exc_info())
+ self._fails.append(("WARN", dt.now(), message))
+ self._log.info('remembered as failure number {0} / {1}'.format(\
+ len(self._fails), self.max_fails))
+ # check if there were too many fails; if so, stop
+ if len(self._fails) > self.max_fails:
+ self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
+ raise ValueError("Too many failures/warnings!")
+ #end: Tester._warn
- def print_failures(self):
+
+ #def _print(self, message):
+ # now = dt.now()
+ # if self._next_state_change is not None:
+ # print('{0:%H:%M:%S};next{1:+5.1f}s;{2} fails: {3}'.format(now, \
+ # (now - self._next_state_change).total_seconds(), len(self._fails), message))
+ # else:
+ # print('{0:%H:%M:%S};next-?????;{1} fails: {2}'.format(now, len(self._fails), message))
+
+
+ def print_failures(self, log):
if len(self._fails) == 0:
- print('No fails! :-)')
+ log.info('No fails! :-)')
else:
- print('Recorded the following failures and warnings:')
+ log.info('Recorded the following failures and warnings:')
for ip, time_went_online, time_go_offline in self._fails:
if ip == 'WARN':
- print('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline))
+ log.info('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline))
else:
- print('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3}s)'.format(\
+ log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3}s)'.format(\
ip, time_went_online, time_go_offline, \
(time_go_offline-time_went_online).total_seconds()))
#end: class Tester
default=default_tell_connd_bin, \
help='path to tell-connd binary')
- parser.add_argument('--config_file', '-c', metavar='PATH', \
- default=default_config_file, \
- help='path to config file for pingcheck')
-
- # default
+ # output
+ parser.add_argument('--output_prefix', '-o', metavar='PATH', \
+ default=default_output_prefix, \
+ help='path and/or file prefix to config and log files; ~=home, 0=prog start datetime; ' \
+ + 'default="%(default)s"')
+ parser.add_argument('--log_to_console', '-c', metavar='0|1', \
+ choices=(0, 1), type=int, default=1, \
+ help='print all output to console? default=%(default)s')
+ parser.add_argument('--log_level_console', metavar='LEVEL', \
+ type=int, default=logging.INFO, \
+ help='log level for console output; debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
+ logging.DEBUG, logging.INFO, logging.WARN))
+ parser.add_argument('--log_to_file', '-f', metavar='0|1', \
+ choices=(0, 1), type=int, default=1, \
+ help='print all output to log file? default=%(default)s')
+ parser.add_argument('--log_level_file', metavar='LEVEL', \
+ type=int, default=logging.DEBUG, \
+ help='log level for console output; debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
+ logging.DEBUG, logging.INFO, logging.WARN))
+ parser.add_argument('--log_file_suffix', '-l', metavar='SUFF.IX', \
+ type=str, default='.log', \
+ help='2nd part of log file name after --output_prefix; default="%(default)s"')
+
+ parser.add_argument('--max_fails', '-m', metavar='N', type=int, default=100, \
+ help='max number of failures/warnings before test will end; default: %(default)s')
parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version))
- return parser.parse_args()
+
+ # parse
+ opts = parser.parse_args()
+ opts.output_prefix = expanduser(opts.output_prefix).format(dt.now())
+ return opts
#end: parse_args
+def setup_logging(opts):
+ """ create main logger and handlers for base logger """
+
+ # create logger for main
+ log = logger_base.getChild('main')
+
+ # remove all existing handlers
+ for old_handler in logger_base.handlers:
+ print('removing handler from base')
+ logger_base.removeHandler(old_handler)
+ for old_handler in log.handlers:
+ print('removing handler from log')
+ log.removeHandler(old_handler)
+
+ logger_base.setLevel(logging.DEBUG)
+
+ # create handlers and format for base logger
+ log_file = None
+ if opts.log_to_console:
+ handler = logging.StreamHandler(sys.stdout)
+ handler.setLevel(opts.log_level_console)
+ formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s', \
+ datefmt='%d.%m,%H:%M:%S')
+ handler.setFormatter(formatter)
+ logger_base.addHandler(handler)
+
+ if opts.log_to_file:
+ log_file = opts.output_prefix + opts.log_file_suffix
+ handler = logging.FileHandler(log_file)
+ handler.setLevel(opts.log_level_file)
+ formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s', \
+ datefmt='%d.%m,%H:%M:%S')
+ handler.setFormatter(formatter)
+ logger_base.addHandler(handler)
+
+ # log info
+ log.info('Log for {0} in {1} created on {2}'.format(__name__, __file__, dt.now()))
+ if opts.log_to_console:
+ log.info('logging to console at level {0}'.format(opts.log_level_console))
+ else:
+ log.info('not logging to console')
+ if opts.log_to_file:
+ log.info('logging to file {1} at level {0}'.format(opts.log_level_file, log_file))
+ else:
+ log.info('not logging to file')
+
+ # return log_file only if makes sense to talk about it in main
+ if opts.log_to_console:
+ return log, log_file
+ else:
+ return log, None
+#end: setup_logging
+
+
def main():
- """ Main function, called when running file as script; currently raises a NotImplementedError
+ """ parses args, create a Pinger and Parser and Tester and runs them; cleans up
"""
opts = parse_args()
+ base_dir = dirname(opts.output_prefix + "_test")
+ if not isdir(base_dir):
+ mkdir(base_dir)
+
+ log, log_file = setup_logging(opts)
+
pinger = None
return_code = None
tester = None
try:
- pinger = PingProc(opts.pingcheck_binary, opts.config_file)
+ config_file = create_conf_file(opts.output_prefix)
+ log.info('created config file {0}'.format(config_file))
+
+ # pingcheck subprocess
+ pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', 'DEBUG', \
+ '--log-output', 'TERMINAL', '--config-file', config_file], \
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE, \
+ universal_newlines=True)
+ log.info('started pingcheck with PID {0}'.format(pinger.pid))
+
parser = ProcOutputParser(pinger.stdout, pinger.stderr)
line_iter = iter(parser)
- tester = Tester(line_iter, opts.tell_connd_binary)
+ tester = Tester(line_iter, opts.tell_connd_binary, opts.max_fails)
tester.test()
except KeyboardInterrupt:
- print('stopped by user')
+ log.info('stopped by user')
except:
- if tester is not None:
- tester.print_failures()
raise
finally:
if pinger is None:
- print('failed to create pinger')
+ log.warn('Error before pinger was created')
return_code = 254
else:
+ log.info("Testing ended")
+ if tester is not None:
+ tester.print_failures(log)
+
try:
- print('telling pinger to terminate')
+ log.info('telling pinger to terminate')
pinger.terminate()
- return_code = pinger.wait(3)
- print('terminated with return code {0}'.format(return_code))
+ #return_code = pinger.wait(3) may block!
+ outs, errs = pinger.communicate(timeout=3)
+ return_code = pinger.returncode # may still be None
+ log.info('terminated with return code {0}'.format(return_code))
except subprocess.TimeoutExpired as te:
- print('Timeout expired waiting for pinger to terminate: {0}'.format(te))
+ log.warn('Timeout expired waiting for pinger to terminate: {0}'.format(te))
return_code = 255 # a.k.a. -1
except:
- print('trouble terminating pinger:')
- print_exc()
+ log.warn('trouble terminating pinger:')
+ log.formatException(sys.exc_info())
if return_code is None:
# kill it
try:
+ log.info('killing pinger the hard way')
pinger.kill()
- outs, errs = pinger.communicate()
+ outs, errs = pinger.communicate(timeout=3)
except:
- print('trouble killing pinger:')
- print_exc()
+ log.warn('trouble killing pinger:')
+ log.formatException(sys.exc_info())
#end: if have pinger
#end: global clean-up
if tester is not None:
- tester.print_failures()
+ tester.print_failures(log)
+
+ log.info('Will stop now')
+ if log_file is not None:
+ log.info('might find more log output in {0}'.format(log_file))
return return_code
if __name__ == '__main__':
main()
+ logging.shutdown()
# (created using vim -- the world's best text editor)