From: Christian Herdtweck Date: Tue, 26 May 2015 15:54:06 +0000 (+0200) Subject: congestion detection now working; also add case that if all IPs get timeout despite... X-Git-Url: http://developer.intra2net.com/git/?p=pingcheck;a=commitdiff_plain;h=4d7db1af9260803b1d11fa91974708493a437338 congestion detection now working; also add case that if all IPs get timeout despite higher ping numbers, then declare host down added more detailed failure returns from IcmpPinger --- diff --git a/src/host/hoststatus.cpp b/src/host/hoststatus.cpp index 7449a91..2e1db87 100644 --- a/src/host/hoststatus.cpp +++ b/src/host/hoststatus.cpp @@ -61,7 +61,8 @@ HostStatus::HostStatus( PingCongestionCount( 0 ), ExceededPingFailedLimit( false ), ExceededPingCongestionLimit( false ), - NParallelPingers( n_parallel_pings) + NParallelPingers( n_parallel_pings), + InBurstMode( false ) { BOOST_ASSERT( !HostAddress.empty() ); BOOST_ASSERT( ( 0 <= PingFailLimitPercentage ) @@ -77,6 +78,11 @@ HostStatus::~HostStatus() void HostStatus::set_n_parallel_pings(const int n_parallel_pings) { + if (ExceededPingCongestionLimit) + InBurstMode = true; + else + InBurstMode = true; + if (NParallelPingers != n_parallel_pings) { NParallelPingers = n_parallel_pings; @@ -91,8 +97,8 @@ std::string HostStatus::log_prefix() std::stringstream temp; temp << "Stat(" << HostAddress << "): " << PingsFailedCount << " fail," << PingCongestionCount << " cong/" - << PingsPerformedCount << " pings/" << ResolvedIpCount << "*" - << NParallelPingers << " IPs: "; + << PingsPerformedCount << " pings/" << NParallelPingers << "*" + << ResolvedIpCount << " IPs: "; return temp.str(); } @@ -149,8 +155,11 @@ void HostStatus::update_ping_statistics( const PingStatus &result, BOOST_ASSERT( PingsFailedCount <= PingsPerformedCount ); BOOST_ASSERT( PingCongestionCount <= PingsPerformedCount ); - update_fail_stats( result ); - update_congestion_stats( result, ping_duration_us ); + increase_ping_performed_count(); + + bool failed_because_congested = update_congestion_stats( result, + ping_duration_us ); + update_fail_stats( result, failed_because_congested ); // after we tried all IPs resolved for this host, we can analyze how many // failed @@ -166,12 +175,12 @@ void HostStatus::update_ping_statistics( const PingStatus &result, } -void HostStatus::update_fail_stats( const PingStatus &result) +void HostStatus::update_fail_stats( const PingStatus &result, + const bool failed_because_congested ) { - increase_ping_performed_count(); - if ( result != PingStatus_SuccessReply - && result != PingStatus_SuccessOutdatedIP) + && result != PingStatus_SuccessOutdatedIP + && !failed_because_congested ) { increase_ping_failed_count(); } @@ -180,17 +189,23 @@ void HostStatus::update_fail_stats( const PingStatus &result) } -void HostStatus::update_congestion_stats( const PingStatus &result, +bool HostStatus::update_congestion_stats( const PingStatus &result, const long ping_duration_us ) { + bool is_congested = false; if (ping_duration_us > PingDurationCongestionsThresh) - increase_ping_congestion_count(); + is_congested = true; else if ( result == PingStatus_FailureTimeout ) - increase_ping_congestion_count(); + is_congested = true; // PingStatus_FailureNoIP, PingStatus_SuccessOutdatedIP could also be caused // by congestion, but also by other reasons (e.g. firewall blocking port 53) + if (is_congested) + increase_ping_congestion_count(); + analyze_ping_congestion_count(); + + return is_congested; } @@ -201,11 +216,19 @@ bool HostStatus::tried_all_resolved_ip() const return ( PingsPerformedCount >= ResolvedIpCount*NParallelPingers ); } + +/** @brief called when tried_all_resolved_ip() */ void HostStatus::analyze_ping_statistics() { BOOST_ASSERT( !HostAddress.empty() ); BOOST_ASSERT( PingsPerformedCount >= ResolvedIpCount*NParallelPingers ); + // timeouts are not counted towards failures, only count as congestions + // However, if all pings timed out even in burst mode, then we still declare + // the line down + if (InBurstMode && PingCongestionCount >= PingsPerformedCount) + ExceededPingFailedLimit = true; + // notify if the amount of pings that failed exceed the limit if ( exceeded_ping_failed_limit() ) { @@ -218,7 +241,7 @@ void HostStatus::analyze_ping_statistics() LinkAnalyzer->notify_host_up( HostAddress ); } - // nothing to do about congestion here, congestion is not forwarded to + // nothing else to do about congestion here, congestion is not forwarded to // central LinkAnalyzer } //lint !e1762 @@ -256,23 +279,20 @@ void HostStatus::analyze_ping_failed_count() BOOST_ASSERT( ( 0 <= PingFailLimitPercentage ) && ( PingFailLimitPercentage <= 100 ) ); BOOST_ASSERT( ( 0 <= PingsFailedCount ) && ( PingsFailedCount <= PingsPerformedCount ) ); - int ping_fail_limit_count = ( ResolvedIpCount * PingFailLimitPercentage - * NParallelPingers) / 100; + int limit = ( PingsPerformedCount * PingFailLimitPercentage) / 100; // keep a boolean variable because the PingsFailedCount can be reseted - if ( PingsFailedCount > ping_fail_limit_count ) + if ( PingsFailedCount > limit ) { ExceededPingFailedLimit = true; - GlobalLogger.debug() << log_prefix() << "exceed fail limit=" - << ping_fail_limit_count; + GlobalLogger.debug() << log_prefix() << "exceed fail limit=" << limit; } else { ExceededPingFailedLimit = false; - GlobalLogger.debug() << log_prefix() << "below fail limit=" - << ping_fail_limit_count; + GlobalLogger.debug() << log_prefix() << "below fail limit=" << limit; } } @@ -283,22 +303,21 @@ void HostStatus::analyze_ping_congestion_count() BOOST_ASSERT( ( 0 <= PingCongestionCount ) && ( PingCongestionCount <= PingsPerformedCount ) ); - int ping_congestion_limit_count = ( ResolvedIpCount * NParallelPingers - * PingCongestionLimitPercentage ) / 100; + int limit = ( PingsPerformedCount * PingCongestionLimitPercentage) / 100; // keep a boolean variable because the PingCongestionCount can be reseted - if ( PingCongestionCount > ping_congestion_limit_count ) + if ( PingCongestionCount > limit ) { ExceededPingCongestionLimit = true; GlobalLogger.debug() << log_prefix() << "exceed congestion limit=" - << ping_congestion_limit_count; + << limit; } else { ExceededPingCongestionLimit = false; GlobalLogger.debug() << log_prefix() << "below congestion limit=" - << ping_congestion_limit_count; + << limit; } } diff --git a/src/host/hoststatus.h b/src/host/hoststatus.h index 6e3e3d2..5aa9911 100644 --- a/src/host/hoststatus.h +++ b/src/host/hoststatus.h @@ -55,8 +55,9 @@ public: void set_n_parallel_pings(const int n_parallel_pings); private: - void update_fail_stats( const PingStatus &ping_success ); - void update_congestion_stats( const PingStatus &ping_success, + void update_fail_stats( const PingStatus &ping_success, + const bool failed_because_congested ); + bool update_congestion_stats( const PingStatus &ping_success, const long ping_duration_us ); bool tried_all_resolved_ip() const; void analyze_ping_statistics(); @@ -96,6 +97,9 @@ private: bool ExceededPingCongestionLimit; /// number of pingers that ping the same IP in parallel int NParallelPingers; + /// flag whether we performed a greater number of pings because line seems + /// congested + bool InBurstMode; }; diff --git a/src/host/pingscheduler.cpp b/src/host/pingscheduler.cpp index 5e2707a..72b084e 100644 --- a/src/host/pingscheduler.cpp +++ b/src/host/pingscheduler.cpp @@ -408,6 +408,7 @@ void PingScheduler::update_ping_number() { NPingers.increase(); + GlobalLogger.notice() << LogPrefix << "Line appears congested!"; GlobalLogger.debug() << LogPrefix << "- Increasing ping number to: " << NPingers; } diff --git a/src/host/pingstatus.cpp b/src/host/pingstatus.cpp index 1b255aa..b3ff60b 100644 --- a/src/host/pingstatus.cpp +++ b/src/host/pingstatus.cpp @@ -40,6 +40,7 @@ std::string to_string( const PingStatus &status ) break; case PingStatus_FailureAsyncError: return "PingFailed(AsyncError)"; break; + case PingStatus_SendFailed: return "SendingPingFailed"; break; default: return "PingStatusUnknown"; break; } } diff --git a/src/host/pingstatus.h b/src/host/pingstatus.h index f62dc88..abb9b78 100644 --- a/src/host/pingstatus.h +++ b/src/host/pingstatus.h @@ -33,7 +33,8 @@ enum PingStatus PingStatus_FailureNoIP, PingStatus_SuccessOutdatedIP, PingStatus_FailureAsyncCancel, - PingStatus_FailureAsyncError + PingStatus_FailureAsyncError, + PingStatus_SendFailed }; std::string to_string( const PingStatus &status ); diff --git a/src/icmp/icmppinger.cpp b/src/icmp/icmppinger.cpp index c0415bf..2415c83 100644 --- a/src/icmp/icmppinger.cpp +++ b/src/icmp/icmppinger.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -208,18 +209,47 @@ bool IcmpPinger::send_echo_request( const IcmpPacketItem icmp_packet ) bytes_sent = PacketDistributor->get_socket()->send_to( data, DestinationEndpoint ); if ( bytes_sent != buffer_size( data ) ) { - GlobalLogger.error() << LogPrefix << "fail sending ping data." - << endl; + GlobalLogger.error() << LogPrefix << "fail sending ping data. Only" + << bytes_sent << " of " << buffer_size(data) + << " bytes were sent!" << endl; } + + ReplyReceived = false; + schedule_timeout_echo_reply(); + } + catch ( const boost::system::system_error &boost_err ) + { + boost::system::error_code err_code = boost_err.code(); + GlobalLogger.error() << LogPrefix << "fail sending ping data: " + << boost_err.what() << " (code " << err_code.value() + << ", category " << err_code.category().name() << ")" << endl; + + // do not wait for timeout but fail at once + set_ping_status(PingStatus_SendFailed); + ReplyReceived = true; // flag for handler to leave ping status as is + //handle_timeout( err_code ); + handle_timeout( boost::system::error_code() ); } catch ( const exception &ex ) { - GlobalLogger.error() << LogPrefix << "fail sending ping data. " + GlobalLogger.error() << LogPrefix << "fail sending ping data: " << ex.what() << endl; + + // do not wait for timeout but fail at once + set_ping_status(PingStatus_SendFailed); + ReplyReceived = true; // flag for handler to leave ping status as is + handle_timeout( boost::system::error_code() ); } + catch ( ... ) + { + GlobalLogger.error() << LogPrefix << "fail sending ping data: " + << "Unknown exception" << endl; - ReplyReceived = false; - schedule_timeout_echo_reply(); + // do not wait for timeout but fail at once + set_ping_status(PingStatus_SendFailed); + ReplyReceived = true; // flag for handler to leave ping status as is + handle_timeout( boost::system::error_code() ); + } return (bytes_sent > 0); } @@ -262,6 +292,8 @@ void IcmpPinger::handle_timeout(const boost::system::error_code& error) << " waiting for ICMP echo reply!" << endl; set_ping_status( PingStatus_FailureAsyncError ); } + // could check here for more details if error is forwarded from + // send_echo_request // Still continue with rest of function, so PingStatus is updated and Callback executed // when timer was cancelled @@ -272,6 +304,7 @@ void IcmpPinger::handle_timeout(const boost::system::error_code& error) set_ping_status( PingStatus_FailureTimeout ); } + // otherwise assume that ping status was set already // Call ping-done handler PingDoneCallback( PingerStatus, static_cast(