+++ /dev/null
-#!/usr/bin/env python
-
-""" Representation for connd state as returned by tell-connd --status
-
-Christian Herdtweck, Intra2net, January 2015
-(c) Intra2net AG 2015
-"""
-
-from __future__ import print_function
-import subprocess
-from re import match as regexp
-from os import EX_OK
-
-# constants
-DEFAULT_TELL_CONND_BINARY = '/usr/intranator/bin/tell-connd'
-TIMEOUT = 1
-
-ONLINE_STATE_ALWAYS_ONLINE = 'always online'
-ONLINE_STATE_ALWAYS_OFFLINE = 'always offline'
-ONLINE_STATE_DIAL_ON_COMMAND = 'dial on command'
-ONLINE_STATE_DIAL_ON_DEMAND = 'dial on demand'
-
-SUBSYS_DNS = 'dns'
-SUBSYS_DYNDNS = 'dyndns'
-SUBSYS_MAIL = 'mail'
-SUBSYS_NTP = 'ntp'
-SUBSYS_SOCKS = 'socks'
-SUBSYS_VPN = 'vpn'
-SUBSYS_WEBPROXY = 'webproxy'
-SUBSYS_PINGCHECK = 'pingcheck'
-SUBSYS_IPONLINE = 'iponline'
-ALL_SUBSYS = (SUBSYS_DNS, SUBSYS_DYNDNS, SUBSYS_MAIL, SUBSYS_NTP,
- SUBSYS_SOCKS, SUBSYS_VPN, SUBSYS_WEBPROXY, SUBSYS_PINGCHECK,
- SUBSYS_IPONLINE)
-
-ALL_STATES = (ONLINE_STATE_DIAL_ON_DEMAND, ONLINE_STATE_DIAL_ON_COMMAND,
- ONLINE_STATE_ALWAYS_OFFLINE, ONLINE_STATE_ALWAYS_ONLINE)
-
-
-class ConndState(object):
- """ representation of connd's status as returned by tell-connd --status """
-
- online_mode = None
- default_provider = None
- subsys_online = None
- subsys_offline = None
- subsys_disabled = None
- connections = None
- actions = None
- online_ips = None
- connected_vpns = None
- log_level = None
- log_file = None
-
- def __str__(self):
- return \
- '[ConndState: {0} (default {1}), {2} conn\'s, {3} ips, {4} vpns ]'\
- .format(self.online_mode, self.default_provider,
- len(self.connections), len(self.online_ips),
- len(self.connected_vpns))
-
- def complete_str(self):
- """ return a string representating the complete state """
-
- # general
- parts = [
- 'ConndState: online mode = "{0}" (default provider: {1})\n'
- .format(self.online_mode, self.default_provider), ]
-
- # subsys
- # ' connctns: (repeated here for correct aligning)
- parts.append(' subsys: online: ')
- if self.subsys_online:
- for subsys in self.subsys_online:
- parts.append(subsys + ' ')
- else:
- parts.append('None ')
- parts.append('; offline: ')
- if self.subsys_offline:
- for subsys in self.subsys_offline:
- parts.append(subsys + ' ')
- else:
- parts.append('None ')
- parts.append('; disabled: ')
- if self.subsys_disabled:
- for subsys in self.subsys_disabled:
- parts.append(subsys + ' ')
- else:
- parts.append('None')
- parts.append('\n')
-
- # connections
- parts.append(' conns: ')
- if self.connections:
- name, info, actions = self.connections[0]
- parts.append('{0}: {1}, {2}\n'.format(name, info, actions))
- else:
- parts.append('None\n')
- for name, info, actions in self.connections[1:]:
- # ' connctns: (repeated here for correct aligning)
- parts.append(' {0}: {1}, {2}\n'.format(name, info,
- actions))
-
- # actions
- # ' connctns: (repeated here for correct aligning)
- parts.append(' actions: ')
- if self.actions:
- parts.append(self.actions[0] + '\n')
- else:
- parts.append('None\n')
- for action in self.actions[1:]:
- # ' connctns: (repeated here for correct aligning)
- parts.append(' {0}\n'.format(action))
-
- # online IPs
- # ' connctns: (repeated here for correct aligning)
- parts.append(' IPs: ')
- if self.online_ips:
- parts.append(self.online_ips[0])
- for curr_ip in self.online_ips[1:]:
- parts.append(', {0}'.format(curr_ip))
- else:
- parts.append('None')
- parts.append('\n')
-
- # VPNs
- # ' connctns: (repeated here for correct aligning)
- parts.append(' VPNs: ')
- if self.connected_vpns:
- parts.append(self.connected_vpns[0])
- for vpn in self.connected_vpns[1:]:
- parts.append(', {0}'.format(vpn))
- else:
- parts.append('None')
- parts.append('\n')
-
- # log level and target:
- # ' connctns: (repeated here for correct aligning)
- parts.append(' Log: level {0}'.format(self.log_level))
- if self.log_file:
- parts.append(' to {0}'.format(self.log_file))
- parts.append('\n')
-
- return ''.join(parts)
- # end: ConndState.complete_str
-
- @staticmethod
- def run_tell_connd(tell_connd_binary=DEFAULT_TELL_CONND_BINARY):
- """ run tell-connd --status, return output iterator and return code
-
- catches all it can, so should usually return (output, return_code)
- where output = [line1, line2, ...]
- if return_code != 0, output's first line(s) is error message
- """
- try:
- output = subprocess.check_output(
- [tell_connd_binary, '--status'], stderr=subprocess.STDOUT,
- universal_newlines=True, shell=False, timeout=TIMEOUT)
- return EX_OK, output.splitlines()
- except subprocess.CalledProcessError as cpe: # non-zero return status
- output = [
- 'tell-connd exited with status {0}'.format(cpe.returncode), ]
- output.extend(cpe.output.splitlines())
- return cpe.returncode, output
- # not python-2-compatible:
- # except subprocess.TimeoutExpired as texp:
- # output = [
- # 'tell-connd timed out after {0}s. Returning -1'.format(
- # texp.timeout), ]
- # output.extend(te.output.splitlines())
- # return -1, output
- except Exception as exp:
- output = [str(exp), ]
- return -1, output
- # end: ConndState.run_tell_connd
-
- @staticmethod
- def get_state(tell_connd_binary=DEFAULT_TELL_CONND_BINARY):
- """ get actual state from tell-connd --status
-
- returns (err_code, output_lines) if something goes wrong running
- binary; raises assertion if output from tell-connd does not match
- expected format
- """
-
- state = ConndState()
-
- err_code, all_lines = ConndState.run_tell_connd(tell_connd_binary)
- if err_code != EX_OK:
- return err_code, all_lines
-
- output = iter(all_lines)
-
- # first section
- line = next(output).strip()
- state.online_mode = regexp('online mode\s*:\s*(.+)$', line).groups()[0]
- assert state.online_mode in ALL_STATES, \
- 'unexpected online mode: {0}'.format(state.online_mode)
-
- line = next(output).strip()
- state.default_provider = regexp('default provider\s*:\s*(.*)$',
- line).groups()[0]
- if len(state.default_provider) == 0:
- state.default_provider = None
- line = next(output).strip()
- assert len(line) == 0, 'expected empty line, but got {0}'.format(line)
-
- # subsys
- line = next(output).strip()
- assert line == 'subsys', 'expected subsys but got {0}'.format(line)
- line = next(output).strip()
- state.subsys_online = regexp('online\s*:\s*(.*)$', line)\
- .groups()[0].split()
- for subsys in state.subsys_online:
- assert subsys in ALL_SUBSYS, \
- 'unexpected subsys: {0}'.format(subsys)
- line = next(output).strip()
- state.subsys_offline = regexp('offline\s*:\s*(.*)$', line)\
- .groups()[0].split()
- for subsys in state.subsys_offline:
- assert subsys in ALL_SUBSYS, \
- 'unexpected subsys: {0}'.format(subsys)
- line = next(output).strip()
- state.subsys_disabled = regexp('disabled\s*:\s*(.*)$', line)\
- .groups()[0].split()
- for subsys in state.subsys_disabled:
- assert subsys in ALL_SUBSYS, \
- 'unexpected subsys: {0}'.format(subsys)
- line = next(output).strip()
- assert len(line) == 0, 'expected empty line, but got {0}'.format(line)
-
- # connection map
- state.connections = []
- line = next(output).strip()
- assert line == 'connection map:', \
- 'expected connection map but got {0}'.format(line)
- expect_new = True
- for line in output:
- line = line.strip()
- if len(line) == 0:
- continue
- if expect_new:
- if line == 'end of connection map':
- break
- conn_name, conn_info = regexp(
- '\[\s*(.+)\s*\]\s*:\s*\(\s*(.*)\s*\)', line).groups()
- expect_new = False
- else:
- conn_actions = regexp('actions\s*:\s*\[\s*(.+)\s*\]', line)\
- .groups()
- state.connections.append((conn_name, conn_info, conn_actions))
- expect_new = True
- # end: for lines
- assert expect_new
- line = next(output).strip()
- assert len(line) == 0, 'expected empty line, but got {0}'.format(line)
-
- # actions
- line = next(output).strip()
- state.actions = regexp('actions\s*:\s*(.*)', line).groups()[0].split()
- if len(state.actions) == 1 and state.actions[0].strip() == '-':
- state.actions = []
- line = next(output).strip()
- assert len(line) == 0, 'expected empty line, but got {0}'.format(line)
-
- # online IPs
- line = next(output).strip()
- state.online_ips = regexp('list of online ips\s*:\s*(.*)', line)\
- .groups()[0].split()
- if len(state.online_ips) == 1 \
- and state.online_ips[0].strip() == 'NONE':
- state.online_ips = []
- line = next(output).strip()
- assert len(line) == 0, 'expected empty line, but got {0}'.format(line)
-
- # VPNs
- state.connected_vpns = []
- line = next(output).strip()
- assert line == 'vpns connected:', \
- 'expected vpns connected, got {0}'.format(line)
- for line in output:
- line = line.strip()
- if len(line) == 0:
- continue
- elif line == 'end of list of connected vpns':
- break
- else:
- state.connected_vpns.append(line)
- # end: for lines
- line = next(output).strip()
- assert len(line) == 0, 'expected empty line, but got {0}'.format(line)
-
- # log level
- line = next(output).strip()
- state.log_level, state.log_file = \
- regexp('Logging with level (.+)(?:\s+to\s+(.+))?', line).groups()
-
- # done
- line = next(output).strip()
- assert len(line) == 0, 'expected empty line, but got {0}'.format(line)
- line = next(output).strip()
- assert line == 'Done.', 'expect Done but got {0}'.format(line)
-
- return state
- # end: ConndState.get_state
-
-# end: class ConndState
-
-
-def test():
- """ get state and print it """
- state = ConndState.get_state()
- print(state)
- print(state.complete_str())
-
-
-def main():
- """ Main function, called when running file as script; runs test() """
- test()
-# end: function main
-
-
-if __name__ == '__main__':
- main()
+++ /dev/null
-#!/usr/bin/env python3
-
-""" A more long-term testing of pingcheck
-
-Runs pingcheck parsing its log output; goes online and offline repeatedly to see if
- pinging stays correct
-
-see also: connd/stresstest_pingcheck.py
-
-Christian Herdtweck, Intra2net, January 2015
-(c) Intra2net AG 2015
-"""
-
-# Version History
-# 15/01/15 Christian Herdtweck: started creation
-# 21/1/15 ch: running 3rd long-term test, seems to work nicely now
-
-import subprocess
-import sys
-import select
-from re import match as regexp
-from time import sleep
-from datetime import timedelta, datetime as dt
-from warnings import warn
-from random import randint
-import logging
-from os.path import expanduser, dirname, isdir
-from os import mkdir, EX_OK
-from connd_state import ConndState
-#import signal
-import pickle
-from strftimedelta import strftimedelta
-
-logger_base = logging.getLogger('long_term_test')
-
-########################################################################
-### CONSTANTS
-########################################################################
-
-version = '1.1'
-default_pingcheck_bin = '/usr/intranator/bin/pingcheck'
-default_tell_connd_bin = '/usr/intranator/bin/tell-connd'
-default_output_prefix = './long_term_test/{0:%d%m%y_%H%M}'
-
-# for parsing log messages
-LEVEL_DEBUG = 'dbug'
-LEVEL_INFO = 'info'
-LEVEL_WARN = 'warn'
-LEVEL_NOTICE = 'note'
-LEVEL_ERROR = 'err '
-
-text_to_level = {}
-text_to_level['INFO'] = LEVEL_INFO
-text_to_level['DBUG'] = LEVEL_DEBUG
-text_to_level['WARN'] = LEVEL_WARN
-text_to_level['NOTE'] = LEVEL_NOTICE
-text_to_level['ERR'] = LEVEL_ERROR
-
-# duration in which to remain online/offline
-MIN_DURATION_SAME_STATE = 10 # [seconds]
-MAX_DURATION_SAME_STATE = 50 # [seconds]
-
-########################################################################
-#### for config file
-########################################################################
-
-PING_INTERVAL = 3 # [seconds]
-config_file_pattern = 'long_term_test_{0:%d%m%y_%H%M%S}.conf'
-conf_start = """# config file for pingcheck testing
-
-default-source-network-interface=eth1
-nameserver=127.0.0.1
-hosts-down-limit={hosts_down_limit:d}
-ping-fail-limit=25
-status-notifier-cmd=echo host status is ${status}
-link-up-interval=2
-ping-reply-timeout=3
-"""
-conf_host = """
-[host]
-name={0}
-interval={1:d}
-ping-protocol=ICMPv4
-port=80
-source-network-interface=default
-"""
-host_list = (\
- 'intranator.m.i2n', \
- 'intraroute.m.i2n', \
- 'websync.m.i2n', \
- 'aqua.m.i2n', \
- 'intratdsl.m.i2n', \
- 'beholder.m.i2n', \
- 'eagle.m.i2n', \
- 'osdist.m.i2n', \
- 'tokyo.m.i2n', \
- 'blizzard.m.i2n', \
- 'phonehelp.m.i2n', \
- 'virt-dmz.m.i2n', \
- 'virt01.m.i2n', \
- 'virt02.m.i2n', \
- 'tornado.n.i2n', \
- 'auth.m.i2n', \
- 'forum.intra2net.com', \
- 'update.intranator.com', \
- 'demo-intranator.intra2net.com', \
- 'intra2net.com', \
- #'partner.intra2net.com', has same IP as intra2net.com --> trouble in Tester._host_for_ip \
- 'demo-securitygateway.intra2net.com', \
- 'demo-networksecurity.intra2net.com', \
- )
-
-def create_conf_file(file_prefix):
- """ create config file using constants above """
-
- file_name = file_prefix + '_config.conf'
-
- n_hosts = len(host_list)
-
- with open(file_name, 'wt') as file_handle:
- file_handle.write(conf_start.format(hosts_down_limit=int(n_hosts * 0.7), status='{status}'))
- for host in host_list:
- file_handle.write(conf_host.format(host, PING_INTERVAL))
-
- return file_name, n_hosts
-#end: create_conf_file
-
-
-########################################################################
-### Exceptions
-########################################################################
-
-def flags_to_str(flags):
- text_parts = []
- if flags & select.POLLIN:
- text_parts.append('IN')
- if flags & select.POLLOUT:
- text_parts.append('OUT')
- if flags & select.POLLPRI:
- text_parts.append('PRI')
- if flags & select.POLLERR:
- text_parts.append('ERR')
- if flags & select.POLLHUP:
- text_parts.append('HUP')
- if flags & select.POLLNVAL:
- text_parts.append('INVALID')
- return ','.join(text_parts)
-
-class PollFlagsException(Exception):
- def __init__(self, flags, proc_name):
- super(PollFlagsException, self).__init__('Unexpected flags from polling {0}: {1}'.format(\
- proc_name, flags_to_str(flags)))
-
-class PollHupException(Exception):
- def __init__(self, proc_name):
- super(PollHupException, self).__init__(\
- 'Received HUP from polling, {0} was probably killed!'.format(proc_name))
-
-class TooManyWarnings(Exception):
- def __init__(self):
- super(TooManyWarnings, self).__init__('Too many warnings / failures!')
-
-########################################################################
-### PARSER
-########################################################################
-
-class ProcOutputParser:
- """ parser for output from pingcheck process via pipes
-
- uses select and polling
- should never silently swallow log output, main function parse should yield all lines
- or emit errors
- """
-
- _out = None
- _err = None
- _poller = None
- _flags = select.POLLIN | select.POLLPRI | select.POLLHUP # | select.POLLERR
- _regexp = '(.+)\s+(INFO|ERR|WARN|DBUG|NOTE)\s+pingcheck\[\d+\]: (.+)\s*'
- _out_fileno = None
- _err_fileno = None
-
- SRC_OUT = 'stdout'
- SRC_ERR = 'stderr'
- SRC_UNK = 'unknown'
-
- def __init__(self, pipe_out, pipe_err):
- self._out = pipe_out
- self._err = pipe_err
- self._poller = select.poll()
- self._poller.register(pipe_out, self._flags)
- self._poller.register(pipe_err, self._flags)
- self._out_fileno = pipe_out.fileno()
- self._err_fileno = pipe_err.fileno()
-
- def _read(self):
- while True:
- result = self._poller.poll(10)
- for file_no, flags in result:
- if not (flags & self._flags):
- raise PollFlagsException(flags, 'pingcheck')
-
- if flags & select.POLLHUP:
- raise PollHupException('pingcheck')
-
- if file_no == self._out_fileno:
- src = self.SRC_OUT
- line = self._out.readline()
- elif file_no == self._err_fileno:
- src = self.SRC_ERR
- line = self._err.readline()
- else:
- src = self.SRC_UNK
- line = ''
-
- if not line:
- continue
- # if reach here, we have a new line of text
-
- yield src, line.strip()
- #end: ProcOutputParser._read
-
- def parse(self):
- for src, line in self._read():
- try:
- timestamp, level, message = regexp(self._regexp, line).groups()
- yield src, (timestamp, text_to_level[level], message)
- except:
- yield src, line
-
- def __iter__(self):
- return self.parse()
-
-#end: class ProcOutputParser
-
-########################################################################
-### TESTER
-########################################################################
-
-class Tester:
- """ """
-
- _line_iter = None
- _last_request = None
- _last_reply = None
- _n_requests = None
- _n_replies = None
- _host_for_ip = None
- _next_state_change = None
- _tell_connd_bin = None
- _prev_state_changes = None
- _fails = None
- _log = None
- _log_ping = None
- max_fails = None
-
- def __init__(self, line_iter, tell_connd_bin, max_fails):
- self._line_iter = line_iter
- self._last_request = {}
- self._last_reply = {}
- self._n_requests = {}
- self._n_replies = {}
- self._host_for_ip = {}
- self._tell_connd_bin = tell_connd_bin
- self._prev_state_changes = []
- self._fails = []
- self.max_fails = max_fails + 2 # for _test_the_tester
-
- self._log = logger_base.getChild('Tester')
- self._log_ping = {}
- self._log_ping[ProcOutputParser.SRC_OUT] = logger_base.getChild('pingOut')
- self._log_ping[ProcOutputParser.SRC_ERR] = logger_base.getChild('pingErr')
-
-
- def test(self):
- # preparations
- current_state = self._is_online()
- while current_state is None:
- sleep(1)
- current_state = self._is_online()
- self._log.info('Start with state online = {0}'.format(current_state))
- self._prev_state_changes.append((current_state, dt.now()))
- self._schedule_next_state_change()
-
- self._test_the_tester()
-
- # start looping over output from observed process
- for src, text_info in self._line_iter:
-
- # first of all: write to pinger log so nothing is lost
- if isinstance(text_info, str): # ProcOutputParser could not parse
- msg = 'unparsed: {0}'.format(text_info)
- if text_info.startswith("RECV BUF ("):
- self._log_ping[src].debug(msg)
- else:
- self._log_ping[src].warning(msg)
- continue # no need to parse message
- else:
- text_time, level, message = text_info
- msg = '{0} {1}'.format(level, message)
-
- if level in (LEVEL_WARN, LEVEL_ERROR, LEVEL_NOTICE):
- # most common warning/error is only logged at debug level
- if message.endswith('could not be resolved.'):
- self._log_ping[src].debug(msg)
- continue
- elif message.endswith('fail sending ping data. Network is unreachable'):
- self._log_ping[src].debug(msg)
- continue
- elif message.endswith('fail sending ping data. Operation not permitted'):
- self._log_ping[src].debug(msg)
- continue
- else: # unexpected errors and warnings from pingcheck are logged at info level
- self._log_ping[src].info(msg)
- # no continue --> try analyze
- else: # normal messages are logged at debug level
- self._log_ping[src].debug(msg)
- # no continue --> try analyze
-
- # now deal with message contents
- if self._handle_request(message):
- continue
- if self._handle_reply(message):
- continue
- if self._handle_change_state():
- continue
- if self._handle_new_ip(message):
- continue
- #end: for all lines from pingcheck output
- #end: Tester.test
-
- def _test_the_tester(self):
- self._warn("this is a test warning without exception information")
- try:
- raise NotImplementedError('Test Exception')
- except:
- self._warn("this is a test warning including exception information", exception=True)
- self._log.info('you should have seen 2 failures above -- only be alarmed if not!')
-
-
- def _handle_new_ip(self, message):
- # "Have IP 11.22.33.44 [61963s] for host.domain.com"
- matches = regexp('Have IP (\d+\.\d+\.\d+\.\d+) \[\d+s\] for (.+)', message)
- if matches:
- ip,host_name = matches.groups()
- else:
- matches = regexp('Host is already an IP: (\d+\.\d+\.\d+\.\d+)', message)
- if matches:
- host_name = ip = matches.groups()[0]
- else:
- return False
- # now have host_name and ip
-
- self._log.info('remember new IP {0} for {1}'.format(ip, host_name))
- self._host_for_ip[ip] = host_name
- self._last_request[host_name] = dt.now()
- self._last_reply[host_name] = dt.now()
- self._n_requests[host_name] = 0
- self._n_replies[host_name] = 0
- return True
-
-
- def _handle_request(self, message):
- matches = regexp('(\d+\.\d+\.\d+\.\d+): sending ping', message)
- if matches is not None:
- ip = matches.groups()[0]
- host_name = self._host_for_ip[ip]
- self._log.info('pinging host {0} ({1})'.format(host_name, ip))
- self._last_request[host_name] = dt.now()
- self._n_requests[host_name] += 1
- return True
- return False
-
-
- def _handle_reply(self, message):
- matches = regexp('(\d+\.\d+\.\d+\.\d+): Received reply', message)
- if matches is not None:
- ip = matches.groups()[0]
- host_name = self._host_for_ip[ip]
- self._log.info('received reply from {0} ({1})'.format(host_name, ip))
- self._last_reply[host_name] = dt.now()
- self._n_replies[host_name] += 1
- return True
- return False
-
-
- def _handle_change_state(self):
- if dt.now() < self._next_state_change:
- return False
-
- is_online = self._is_online()
- if is_online is None:
- return False # try again later
-
- expect_online = self._prev_state_changes[-1][0]
- if expect_online != is_online:
- self._warn('Expect to be online={0} since {1} but are online={2}!'.format(\
- expect_online, self._prev_state_changes[-1][1], is_online))
-
- if is_online:
- self._ensure_all_pinged()
- new_val = False
- else:
- new_val = True
- if self._set_online(new_val):
- self._schedule_next_state_change()
- return True
- # if call to tell_connd fails, try again later
- #end: _handle_change_state
-
-
- def _schedule_next_state_change(self):
- n_secs = randint(MIN_DURATION_SAME_STATE, MAX_DURATION_SAME_STATE)
- self._next_state_change = dt.now() + timedelta(seconds=n_secs)
- self._log.info('schedule next state change for {0:%H:%M:%S} (in {1:d}s)'.format(\
- self._next_state_change, n_secs))
- #end: Tester._schedule_next_state_change
-
-
- def _is_online(self):
- """ determine state using ConndState """
- state = ConndState.get_state(self._tell_connd_bin)
- if not isinstance(state, ConndState):
- err_code, output_lines = state
- self._warn('Non-zero exit status asking tell-connd for its --status: {0}'.format(\
- err_code, output_lines[0]))
- return None
-
- return len(state.connections) > 0
- #end: Tester._is_online
-
-
- def _set_online(self, new_val):
- """ run tell-connd --online/offline P1
-
- returns true if afterwards are in requested state
- """
- if new_val == self._is_online():
- self._warn('Want to set online state to current state!')
- return True
- if new_val:
- self._log.info('going online')
- args = [self._tell_connd_bin, '--online', 'P1']
- else:
- self._log.info('going offline')
- args = [self._tell_connd_bin, '--offline']
-
- try:
- return_code = subprocess.call(args, stderr=subprocess.STDOUT, timeout=1)
- if return_code == 0:
- self._prev_state_changes.append((new_val, dt.now()))
- return True
- else:
- self._log.info('tell-connd returned {0} --> online state probably unchanged!'.format(\
- return_code))
- return False
- except subprocess.TimeoutExpired as te:
- self._warn('Timeout expired running {0}. Output was {1}'.format(args, te.output))
- return False
- except:
- self._warn('Trouble calling tell-connd', exception=True)
- return False
- #end: Tester._set_online
-
-
- def _ensure_all_pinged(self):
- """ called before going offline; check if all hosts were successfully pinged """
- # determine when we last went online
- time_went_online = None
- time_go_offline = dt.now()
-
- for went_online, change_time in reversed(self._prev_state_changes):
- if went_online:
- time_went_online = change_time
- break
- if time_went_online is None:
- # never been online --> nothing to check yet
- return
-
- self._log.info('checking whether all hosts were pinged between '
- + '{0:%H:%M:%S} and now ({1:%H:%M:%S})'.format(\
- time_went_online, time_go_offline))
- for host_name, reply_time in self._last_reply.items():
- if reply_time < time_went_online:
- self._log.warning(\
- 'Did not ping host {0} between {1:%H:%M:%S} and now ({2:%H:%M:%S})!'.format(\
- host_name, time_went_online, time_go_offline))
- self._fails.append((host_name, time_went_online, time_go_offline))
- self._log.info('remembered as failure number {0} / {1}'.format(\
- len(self._fails), self.max_fails))
-
- # check if there were too many fails; if so, stop
- if len(self._fails) > self.max_fails:
- self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
- raise TooManyWarnings()
- #end: if not pinged recently
- #end: for all known ips
- #end: Tester._ensure_all_pinged
-
-
- def _warn(self, message, exception=False):
-
- if exception:
- self._log.exception(message, exc_info=True)
- else:
- self._log.warning(message)
- self._fails.append(("WARN", dt.now(), message))
- self._log.info('remembered as failure number {0} / {1}'.format(\
- len(self._fails), self.max_fails))
-
- # check if there were too many fails; if so, stop
- if len(self._fails) > self.max_fails:
- self._fails.append(("WARN", dt.now(), "Too many failurs/warnings!"))
- raise TooManyWarnings()
- #end: Tester._warn
-
-
- def print_failures(self, log):
- if len(self._fails) == 0:
- log.info('No warnings nor failures since {0:%d.%m.%y, %H:%M} :-)'.format(\
- self._prev_state_changes[0][1]))
- else:
- log.info('Recorded the following failures and warnings:')
- for host_name, time_went_online, time_go_offline in self._fails:
- if host_name == 'WARN':
- log.info('Warning at {0:%H:%M:%S}: {1}'.format(time_went_online, time_go_offline))
- else:
- log.info('Failed to ping {0} between {1:%H:%M:%S} and {2:%H:%M:%S} ({3:.2f}s)'\
- .format(host_name, time_went_online, time_go_offline, \
- (time_go_offline-time_went_online).total_seconds()))
-
- #end: Tester.print_failures
-
- def save_state_and_history(self, save_file):
- """ save all other data for later inspection using show_saved_data """
- try:
- with open(save_file, 'wb') as file_handle:
- self._log.info('saving state and history to {0}'.format(save_file))
- pickle.dump( (self._last_request, self._last_reply, self._n_requests, self._n_replies, \
- self._host_for_ip, self._tell_connd_bin, self._prev_state_changes, \
- self._fails, self.max_fails, save_file, dt.now()), \
- file_handle)
- return True
- except Exception as e:
- self._log.exception("Failed to save!", exc_info=True)
- return False
- #end: Tester.save_state_and_history
-
-
- @staticmethod
- def show_saved_data(save_file):
- """ print info on data saved by save_state_and_history """
-
- # load
- save_date = None
- with open(save_file, 'rb') as file_handle:
- print('loading state and history from {0}'.format(save_file))
- last_request, last_reply, n_requests, n_replies, host_for_ip, tell_connd_bin, \
- state_changes, fails, max_fails, save_file_again, save_date \
- = pickle.load(file_handle)
- if save_date is None:
- print('Failed to read state and history from {0}'.format(save_file))
- return 1
-
- # simple stats
- first_change = state_changes[0][1]
- last_change = state_changes[-1][1]
- print('Performed {0} state changes between {1:%d.%m,%H:%M} and {2:%d.%m,%H:%M}' \
- .format(len(state_changes), first_change, last_change), end='')
- print(' ({0} --> one every {1} on average)'.format( \
- strftimedelta( last_change - first_change ), \
- strftimedelta((last_change - first_change) / len(state_changes)) ))
- print('Used {0} IPs for {1} hosts'.format(len(host_for_ip), len(last_request)))
- print('Overall, recorded {0} warnings/failures (max={1})'.format(len(fails), max_fails))
-
- # sort fails into warnings and fail per host and fail per time
- warnings = []
- n_test_warnings = 0
- fail_per_host = {}
- fail_per_time = {}
- n_fails = 0
- for host, time_online, time_offline in fails:
- if host == 'WARN':
- # time_online = time, time_offline=message
- if 'test warning' in time_offline:
- n_test_warnings += 1
- else:
- warnings.append((time_online, time_offline))
- else:
- n_fails += 1
- try:
- fail_per_host[host].append(time_online)
- except KeyError:
- fail_per_host[host] = [time_online,]
- duration = time_offline - time_online
- try:
- fail_per_time[time_online, duration].append(host)
- except KeyError:
- fail_per_time[time_online, duration] = [host, ]
-
- # stats per host
- for host, n_req in n_requests.items():
- print(host + ' (', end='')
- print(','.join( ip for ip, host_for_ip in host_for_ip.items() if host==host_for_ip ), end='')
- print('): \treplied to {1} of {0} pings ({2:.1f}%); '.format(\
- n_req, n_replies[host], 100. * n_replies[host]/n_req), end='')
- if host in fail_per_host:
- print('failed to ping at ', end='')
- print(','.join( '{0:%H:%M},'.format(fail_time) for fail_time in fail_per_host[host] ))
- else:
- print('replied in each test interval')
-
- # stats per time
- if fail_per_time:
- print("Ping failures per time (overall {0} fails in {1} intervals):".format(\
- n_fails, len(fail_per_time)))
- for (start, duration), host_list in fail_per_time.items():
- print('{0:%d.%m,%H:%M} + {1}: '.format(start, strftimedelta(duration)), end='')
- print(','.join(host_list))
-
- # warnings
- if warnings:
- if n_test_warnings:
- print("Warnings (not showing {0} test warnings):".format(n_test_warnings))
- else:
- print("Warnings:")
- for time, message in warnings:
- print("{0:%d.%m,%H:%M}: {1}".format(time, message))
- elif n_test_warnings:
- print('Only warnings were {0} for testing'.format(n_test_warnings))
-
- # done
- return 0
- #end: static Tester.show_saved_data
-
-#end: class Tester
-
-########################################################################
-### MAIN
-########################################################################
-
-def test_version(pingcheck_binary):
- """ run pingcheck --version; we want > 0.1 for proper IP parsing
-
- no try-catch here, also intended as test-run for pingcheck binary
- """
-
- output = subprocess.check_output([pingcheck_binary, '--version'], universal_newlines=True)
- match = regexp('\s*pingcheck version (\d+\.\d+)\s*', output)
- if match is None:
- raise ValueError('Could not determine pingcheck version from output of {0} --version!'\
- .format(pingcheck_binary))
- version = float(match.groups()[0])
- if version < 0.2:
- raise ValueError('Pingcheck version is {0} but require version > 0.1!'.format(version))
-
-#end: test_version
-
-
-def parse_args():
- """ parse command line arguments and set defaults """
- from argparse import ArgumentParser
- parser = ArgumentParser(description="Long-term testing of pingcheck: goes online and offline " \
- + "using tell-connd and checks log for correct output", \
- epilog="(c) Christian Herdtweck, Intra2net AG, 2015")
-
- # option to show saved data
- parser.add_argument('--show-saved-data', '-s', metavar='SAVE_FILE.pickle', \
- default='', type=str, \
- help='Do not run test but instead print info saved by earlier test run')
-
- # binaries
- parser.add_argument('--pingcheck-binary', '-p', metavar='PATH', \
- default=default_pingcheck_bin, \
- help='path to pingcheck binary')
- parser.add_argument('--tell-connd-binary', '-t', metavar='PATH', \
- default=default_tell_connd_bin, \
- help='path to tell-connd binary')
-
- # output
- parser.add_argument('--output-prefix', '-o', metavar='PATH', \
- default=default_output_prefix, \
- help='path and/or file prefix to config and log files; ~=home, 0=prog start datetime; ' \
- + 'default="%(default)s"')
- parser.add_argument('--log-to-console', '-c', metavar='0|1', \
- choices=(0, 1), type=int, default=1, \
- help='print all output to console? default=%(default)s')
- parser.add_argument('--log-level-console', metavar='LEVEL', \
- type=int, default=logging.INFO, \
- help='log level for console output; ' \
- + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
- logging.DEBUG, logging.INFO, logging.WARN))
- parser.add_argument('--log-to-file', '-f', metavar='0|1', \
- choices=(0, 1), type=int, default=1, \
- help='print all output to log file? default=%(default)s')
- parser.add_argument('--log-level-file', metavar='LEVEL', \
- type=int, default=logging.DEBUG, \
- help='log level for console output; ' \
- + 'debug={0}, info={1}, warn={2}; default: %(default)s'.format(\
- logging.DEBUG, logging.INFO, logging.WARN))
- parser.add_argument('--log-file-suffix', '-l', metavar='SUFF.IX', \
- type=str, default='.log', \
- help='2nd part of log file name after --output_prefix; default="%(default)s"')
- parser.add_argument('--pinger-log-level', metavar='LEVEL', \
- type=str, default='INFO', \
- help='log level given to pingchecker, DEBUG might quickly fill log; default=%(default)s')
-
-
- parser.add_argument('--max-fails-per-host', '-m', metavar='N', type=int, default=10, \
- help='max number of failures/warnings per host before test will end; default: %(default)s')
- parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(version))
-
- # parse
- opts = parser.parse_args()
- opts.output_prefix = expanduser(opts.output_prefix).format(dt.now())
- return opts
-#end: parse_args
-
-
-def setup_logging(opts):
- """ create main logger and handlers for base logger """
-
- # create logger for main
- log = logger_base.getChild('main')
-
- # remove all existing handlers
- for old_handler in logger_base.handlers:
- print('removing handler from base')
- logger_base.removeHandler(old_handler)
- for old_handler in log.handlers:
- print('removing handler from log')
- log.removeHandler(old_handler)
-
- logger_base.setLevel(logging.DEBUG)
-
- # create handlers and format for base logger
- log_file = None
- formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s', \
- datefmt='%d.%m,%H:%M:%S')
- if opts.log_to_console:
- handler = logging.StreamHandler(sys.stdout)
- handler.setLevel(opts.log_level_console)
- handler.setFormatter(formatter)
- logger_base.addHandler(handler)
-
- if opts.log_to_file:
- log_file = opts.output_prefix + opts.log_file_suffix
- handler = logging.FileHandler(log_file)
- handler.setLevel(opts.log_level_file)
- handler.setFormatter(formatter)
- logger_base.addHandler(handler)
-
- # log info
- log.info('Log for {0} in {1} created on {2}'.format(__name__, __file__, dt.now()))
- if opts.log_to_console:
- log.info('logging to console at level {0}'.format(opts.log_level_console))
- else:
- log.info('not logging to console')
- if opts.log_to_file:
- log.info('logging to file {1} at level {0}'.format(opts.log_level_file, log_file))
- else:
- log.info('not logging to file')
-
- # return log_file only if makes sense to talk about it in main
- if opts.log_to_console:
- return log, log_file
- else:
- return log, None
-#end: setup_logging
-
-
-def handle_sigchild(signum, curr_frame, log, cleanup_args):
- log.error('Received SIGCHLD')
- log.info('with args {0} and {1}'.format(signum, curr_frame))
-
- cleanup(*cleanup_args)
-
-
-def cleanup(log, pinger, tester, line_iter, save_file):
- """ the stuff from finally clause in main loop; also called from handle_sigchild """
-
- return_code = 255
-
- log.info('-' * 72)
- if pinger is None:
- log.warning('Error before pinger was created')
- return_code = 254
- else:
- log.info("Cleaning up")
-
- try:
- log.info('telling pinger to terminate')
- pinger.terminate()
-
- for line_idx, line in enumerate(line_iter):
- log.info('left-over output: {0}'.format(line))
- if line_idx > 100:
- log.warning('ignore further left-over output')
- break
-
- #return_code = pinger.wait(3) can block!
- outs, errs = pinger.communicate(timeout=3)
- for line in outs.splitlines():
- log.info('left-over output: {0}'.format(line))
- for line in errs.splitlines():
- log.info('left-over output: {0}'.format(line))
-
- return_code = pinger.returncode # may still be None
- log.info('pinger terminated with return code {0}'.format(return_code))
- except subprocess.TimeoutExpired as te:
- log.warning('Timeout expired waiting for pinger to terminate: {0}'.format(te))
- return_code = 253
- except:
- log.exception('trouble terminating pinger', exc_info=True)
-
- if return_code is None:
- for line_idx, line in enumerate(line_iter):
- log.info('left-over output: {0}'.format(line))
- if line_idx > 100:
- log.warning('ignore further left-over output')
- break
-
- # kill it
- try:
- log.info('killing pinger the hard way')
- pinger.kill()
- outs, errs = pinger.communicate(timeout=3)
- for line in outs.splitlines():
- log.info('left-over output: {0}'.format(line))
- for line in errs.splitlines():
- log.info('left-over output: {0}'.format(line))
-
- except:
- log.exception('trouble killing pinger', exc_info=True)
-
- if tester is not None:
- tester.save_state_and_history(save_file)
- tester.print_failures(log)
- #end: if have pinger
-
- # reset original signal handler
- #signal.signal(signal.SIGCHLD, signal.SIG_DFL)
-
- return return_code
-#end: cleanup
-
-
-def main():
- """ parses args, create a Pinger and Parser and Tester and runs them; cleans up
- """
-
- opts = parse_args()
-
- if opts.show_saved_data:
- return Tester.show_saved_data(opts.show_saved_data)
-
- base_dir = dirname(opts.output_prefix + "_test")
- if not isdir(base_dir):
- mkdir(base_dir)
-
- log, log_file = setup_logging(opts)
-
- pinger = None
- return_code = 255
- tester = None
- line_iter = None
- try:
- config_file, n_hosts = create_conf_file(opts.output_prefix)
- save_file = opts.output_prefix + '_save.pickle'
- log.info('created config file {0}'.format(config_file))
-
- # test version
- test_version(opts.pingcheck_binary)
-
- #signal.signal(signal.SIGCHLD, \
- # lambda signum, curr_frame: handle_sigchild(signum, curr_frame, log, \
- # (log, pinger, tester, line_iter, save_file) )
-
- # pingcheck subprocess
- pinger = subprocess.Popen([opts.pingcheck_binary, '--log-level', opts.pinger_log_level, \
- '--log-output', 'TERMINAL', '--config-file', config_file], \
- stdout=subprocess.PIPE, stderr=subprocess.PIPE, \
- universal_newlines=True)
- log.info('started pingcheck with PID {0}'.format(pinger.pid))
-
- parser = ProcOutputParser(pinger.stdout, pinger.stderr)
- line_iter = iter(parser)
- tester = Tester(line_iter, opts.tell_connd_binary, max(10, opts.max_fails_per_host*n_hosts))
-
- tester.test()
- except KeyboardInterrupt:
- log.info('stopped by user')
- except:
- log.exception('Caught an exception in main loop', exc_info=True)
- log.warning('Re-raise')
- raise
- finally:
- return_code = cleanup(log, pinger, tester, line_iter, save_file)
-
- log.info('Will stop now')
- if log_file is not None:
- log.info('might find more log output in {0}'.format(log_file))
- log.info('-' * 72)
- #end: global clean-up
-
- return return_code
-
-#end: function main
-
-
-if __name__ == '__main__':
- main()
- logging.shutdown()
-