congestion detection now working; also add case that if all IPs get timeout despite...
author: Christian Herdtweck <christian.herdtweck@intra2net.com>
Tue, 26 May 2015 15:54:06 +0000 (17:54 +0200)
committer: Christian Herdtweck <christian.herdtweck@intra2net.com>
Tue, 26 May 2015 15:54:06 +0000 (17:54 +0200)
added more detailed failure returns from IcmpPinger

src/host/hoststatus.cpp
src/host/hoststatus.h
src/host/pingscheduler.cpp
src/host/pingstatus.cpp
src/host/pingstatus.h
src/icmp/icmppinger.cpp

index 7449a91..2e1db87 100644 (file)
@@ -61,7 +61,8 @@ HostStatus::HostStatus(
     PingCongestionCount( 0 ),
     ExceededPingFailedLimit( false ),
     ExceededPingCongestionLimit( false ),
-    NParallelPingers( n_parallel_pings)
+    NParallelPingers( n_parallel_pings),
+    InBurstMode( false )
 {
     BOOST_ASSERT( !HostAddress.empty() );
     BOOST_ASSERT( ( 0 <= PingFailLimitPercentage )
@@ -77,6 +78,11 @@ HostStatus::~HostStatus()
 
 void HostStatus::set_n_parallel_pings(const int n_parallel_pings)
 {
+    if (ExceededPingCongestionLimit)
+        InBurstMode = true;   // congestion limit exceeded: pinging in burst mode
+    else
+        InBurstMode = false;  // no congestion: leave burst mode again
+
     if (NParallelPingers != n_parallel_pings)
     {
         NParallelPingers = n_parallel_pings;
@@ -91,8 +97,8 @@ std::string HostStatus::log_prefix()
     std::stringstream temp;
     temp << "Stat(" << HostAddress << "): "
         << PingsFailedCount << " fail," << PingCongestionCount << " cong/"
-        << PingsPerformedCount << " pings/" << ResolvedIpCount << "*"
-        << NParallelPingers << " IPs: ";
+        << PingsPerformedCount << " pings/" << NParallelPingers << "*"
+        << ResolvedIpCount << " IPs: ";
     return temp.str();
 }
 
@@ -149,8 +155,11 @@ void HostStatus::update_ping_statistics( const PingStatus &result,
     BOOST_ASSERT( PingsFailedCount <= PingsPerformedCount );
     BOOST_ASSERT( PingCongestionCount <= PingsPerformedCount );
 
-    update_fail_stats( result );
-    update_congestion_stats( result, ping_duration_us );
+    increase_ping_performed_count();
+
+    bool failed_because_congested = update_congestion_stats( result,
+                                                             ping_duration_us );
+    update_fail_stats( result, failed_because_congested );
 
     // after we tried all IPs resolved for this host, we can analyze how many
     // failed
@@ -166,12 +175,12 @@ void HostStatus::update_ping_statistics( const PingStatus &result,
 }
 
 
-void HostStatus::update_fail_stats( const PingStatus &result)
+void HostStatus::update_fail_stats( const PingStatus &result,
+                                    const bool failed_because_congested )
 {
-    increase_ping_performed_count();
-
     if ( result != PingStatus_SuccessReply
-      && result != PingStatus_SuccessOutdatedIP)
+      && result != PingStatus_SuccessOutdatedIP
+      && !failed_because_congested )
     {
         increase_ping_failed_count();
     }
@@ -180,17 +189,23 @@ void HostStatus::update_fail_stats( const PingStatus &result)
 }
 
 
-void HostStatus::update_congestion_stats( const PingStatus &result,
+bool HostStatus::update_congestion_stats( const PingStatus &result,
                                           const long ping_duration_us )
 {
+    bool is_congested = false;
     if (ping_duration_us > PingDurationCongestionsThresh)
-        increase_ping_congestion_count();
+        is_congested = true;
     else if ( result == PingStatus_FailureTimeout )
-        increase_ping_congestion_count();
+        is_congested = true;
     // PingStatus_FailureNoIP, PingStatus_SuccessOutdatedIP could also be caused
     // by congestion, but also by other reasons (e.g. firewall blocking port 53)
 
+    if (is_congested)
+        increase_ping_congestion_count();
+
     analyze_ping_congestion_count();
+
+    return is_congested;
 }
 
 
@@ -201,11 +216,19 @@ bool HostStatus::tried_all_resolved_ip() const
     return ( PingsPerformedCount >= ResolvedIpCount*NParallelPingers );
 }
 
+
+/** @brief called when tried_all_resolved_ip() */
 void HostStatus::analyze_ping_statistics()
 {
     BOOST_ASSERT( !HostAddress.empty() );
     BOOST_ASSERT( PingsPerformedCount >= ResolvedIpCount*NParallelPingers );
 
+    // timeouts are not counted towards failures, only count as congestions
+    // However, if all pings timed out even in burst mode, then we still declare
+    // the line down
+    if (InBurstMode && PingCongestionCount >= PingsPerformedCount)
+        ExceededPingFailedLimit = true;
+
     // notify if the amount of pings that failed exceed the limit
     if ( exceeded_ping_failed_limit() )
     {
@@ -218,7 +241,7 @@ void HostStatus::analyze_ping_statistics()
         LinkAnalyzer->notify_host_up( HostAddress );
     }
 
-    // nothing to do about congestion here, congestion is not forwarded to
+    // nothing else to do about congestion here, congestion is not forwarded to
     // central LinkAnalyzer
 } //lint !e1762
 
@@ -256,23 +279,20 @@ void HostStatus::analyze_ping_failed_count()
     BOOST_ASSERT( ( 0 <= PingFailLimitPercentage ) && ( PingFailLimitPercentage <= 100 ) );
     BOOST_ASSERT( ( 0 <= PingsFailedCount ) && ( PingsFailedCount <= PingsPerformedCount ) );
 
-    int ping_fail_limit_count = ( ResolvedIpCount * PingFailLimitPercentage
-                                                  * NParallelPingers) / 100;
+    int limit = ( PingsPerformedCount * PingFailLimitPercentage) / 100;
 
     // keep a boolean variable because the PingsFailedCount can be reseted
-    if ( PingsFailedCount > ping_fail_limit_count )
+    if ( PingsFailedCount > limit )
     {
         ExceededPingFailedLimit = true;
 
-        GlobalLogger.debug() << log_prefix() << "exceed fail limit="
-                             << ping_fail_limit_count;
+        GlobalLogger.debug() << log_prefix() << "exceed fail limit=" << limit;
     }
     else
     {
         ExceededPingFailedLimit = false;
 
-        GlobalLogger.debug() << log_prefix() << "below fail limit="
-                             << ping_fail_limit_count;
+        GlobalLogger.debug() << log_prefix() << "below fail limit=" << limit;
     }
 }
 
@@ -283,22 +303,21 @@ void HostStatus::analyze_ping_congestion_count()
     BOOST_ASSERT( ( 0 <= PingCongestionCount )
                     && ( PingCongestionCount <= PingsPerformedCount ) );
 
-    int ping_congestion_limit_count = ( ResolvedIpCount * NParallelPingers
-                                        * PingCongestionLimitPercentage ) / 100;
+    int limit = ( PingsPerformedCount * PingCongestionLimitPercentage) / 100;
 
     // keep a boolean variable because the PingCongestionCount can be reseted
-    if ( PingCongestionCount > ping_congestion_limit_count )
+    if ( PingCongestionCount > limit )
     {
         ExceededPingCongestionLimit = true;
 
         GlobalLogger.debug() << log_prefix() << "exceed congestion limit="
-                             << ping_congestion_limit_count;
+                             << limit;
     }
     else
     {
         ExceededPingCongestionLimit = false;
 
         GlobalLogger.debug() << log_prefix() << "below congestion limit="
-                             << ping_congestion_limit_count;
+                             << limit;
     }
 }
index 6e3e3d2..5aa9911 100644 (file)
@@ -55,8 +55,9 @@ public:
     void set_n_parallel_pings(const int n_parallel_pings);
 
 private:
-    void update_fail_stats( const PingStatus &ping_success );
-    void update_congestion_stats( const PingStatus &ping_success,
+    void update_fail_stats( const PingStatus &ping_success,
+                            const bool failed_because_congested );
+    bool update_congestion_stats( const PingStatus &ping_success,
                                   const long ping_duration_us );
     bool tried_all_resolved_ip() const;
     void analyze_ping_statistics();
@@ -96,6 +97,9 @@ private:
     bool ExceededPingCongestionLimit;
     /// number of pingers that ping the same IP in parallel
     int NParallelPingers;
+    /// flag whether we performed a greater number of pings because line seems
+    /// congested
+    bool InBurstMode;
 
 };
 
index 5e2707a..72b084e 100644 (file)
@@ -408,6 +408,7 @@ void PingScheduler::update_ping_number()
     {
         NPingers.increase();
 
+        GlobalLogger.notice() << LogPrefix << "Line appears congested!";
         GlobalLogger.debug() << LogPrefix << "- Increasing ping number to: "
                              << NPingers;
     }
index 1b255aa..b3ff60b 100644 (file)
@@ -40,6 +40,7 @@ std::string to_string( const PingStatus &status )
                                             break;
         case PingStatus_FailureAsyncError: return "PingFailed(AsyncError)";
                                            break;
+        case PingStatus_SendFailed: return "SendingPingFailed"; break;
         default: return "PingStatusUnknown"; break;
     }
 }
index f62dc88..abb9b78 100644 (file)
@@ -33,7 +33,8 @@ enum PingStatus
     PingStatus_FailureNoIP,
     PingStatus_SuccessOutdatedIP,
     PingStatus_FailureAsyncCancel,
-    PingStatus_FailureAsyncError
+    PingStatus_FailureAsyncError,
+    PingStatus_SendFailed
 };
 
 std::string to_string( const PingStatus &status );
index c0415bf..2415c83 100644 (file)
@@ -16,6 +16,7 @@
 #include <boost/uuid/uuid.hpp>
 #include <boost/uuid/uuid_generators.hpp>
 #include <boost/foreach.hpp>
+#include <boost/system/system_error.hpp>
 
 #include <logfunc.hpp>
 
@@ -208,18 +209,47 @@ bool IcmpPinger::send_echo_request( const IcmpPacketItem icmp_packet )
         bytes_sent = PacketDistributor->get_socket()->send_to( data, DestinationEndpoint );
         if ( bytes_sent != buffer_size( data ) )
         {
-            GlobalLogger.error() << LogPrefix << "fail sending ping data."
-                                 << endl;
+            GlobalLogger.error() << LogPrefix << "fail sending ping data. Only "
+                                 << bytes_sent << " of " << buffer_size(data)
+                                 << " bytes were sent!" << endl;
         }
+
+        ReplyReceived = false;
+        schedule_timeout_echo_reply();
+    }
+    catch ( const boost::system::system_error &boost_err )
+    {
+        boost::system::error_code err_code = boost_err.code();
+        GlobalLogger.error() << LogPrefix << "fail sending ping data: "
+                 << boost_err.what() << " (code " << err_code.value()
+                 << ", category " << err_code.category().name() << ")" << endl;
+
+        // do not wait for timeout but fail at once
+        set_ping_status(PingStatus_SendFailed);
+        ReplyReceived = true;   // flag for handler to leave ping status as is
+        //handle_timeout( err_code );
+        handle_timeout( boost::system::error_code() );
     }
     catch ( const exception &ex )
     {
-        GlobalLogger.error() << LogPrefix << "fail sending ping data. "
+        GlobalLogger.error() << LogPrefix << "fail sending ping data: "
                              << ex.what() << endl;
+
+        // do not wait for timeout but fail at once
+        set_ping_status(PingStatus_SendFailed);
+        ReplyReceived = true;   // flag for handler to leave ping status as is
+        handle_timeout( boost::system::error_code() );
     }
+    catch ( ... )
+    {
+        GlobalLogger.error() << LogPrefix << "fail sending ping data: "
+                             << "Unknown exception" << endl;
 
-    ReplyReceived = false;
-    schedule_timeout_echo_reply();
+        // do not wait for timeout but fail at once
+        set_ping_status(PingStatus_SendFailed);
+        ReplyReceived = true;   // flag for handler to leave ping status as is
+        handle_timeout( boost::system::error_code() );
+    }
 
     return (bytes_sent > 0);
 }
@@ -262,6 +292,8 @@ void IcmpPinger::handle_timeout(const boost::system::error_code& error)
                 << " waiting for ICMP echo reply!" << endl;
             set_ping_status( PingStatus_FailureAsyncError );
         }
+        // could check here for more details if error is forwarded from
+        // send_echo_request
 
         // Still continue with rest of function, so PingStatus is updated and Callback executed
         //   when timer was cancelled
@@ -272,6 +304,7 @@ void IcmpPinger::handle_timeout(const boost::system::error_code& error)
 
         set_ping_status( PingStatus_FailureTimeout );
     }
+    // otherwise assume that ping status was set already
 
     // Call ping-done handler
     PingDoneCallback( PingerStatus, static_cast<long>(