added variable for threshold for switching from "all congested" --> "connection failed"
[pingcheck] / src / host / hoststatus.cpp
1 /*
2 The software in this package is distributed under the GNU General
3 Public License version 2 (with a special exception described below).
4
5 A copy of GNU General Public License (GPL) is included in this distribution,
6 in the file COPYING.GPL.
7
8 As a special exception, if other files instantiate templates or use macros
9 or inline functions from this file, or you compile this file and link it
10 with other works to produce a work based on this file, this file
11 does not by itself cause the resulting work to be covered
12 by the GNU General Public License.
13
14 However the source code for this file must still be made available
15 in accordance with section (3) of the GNU General Public License.
16
17 This exception does not invalidate any other reasons why a work based
18 on this file might be covered by the GNU General Public License.
19 */
20 #include "host/hoststatus.h"
21
22 #include <iostream>
23 #include <logfunc.hpp>
24
25 #include "boost_assert_handler.h"
26
27 using namespace std;
28 using I2n::Logger::GlobalLogger;
29
30 //-----------------------------------------------------------------------------
31 // HostStatus
32 //-----------------------------------------------------------------------------
33
34 /**
35  * @param host_address The address of the host it has to analyze.
36  * @param ping_fail_percentage_limit The percentage threshold of pings that can
37  * fail.
38  * @param ping_congestion_limit_percentage The percentage threshold of pings
39  * that can fail due to line congestion
40  * @param ping_duration_congestion_thresh Threshold in micro seconds that marks
41  * the difference between a "normal" and a congested line
42  * @param n_parallel_pings Number of pings that is sent for each IP
43  * @param link_analyzer The object used to notify the status of the host.
44  */
45 HostStatus::HostStatus(
46         const string &host_address,
47         const int ping_fail_limit_percentage,
48         const int ping_congestion_limit_percentage,
49         const int congest_caused_by_fail_limit_percentage,
50         const int ping_duration_congestion_thresh,
51         const int n_parallel_pings,
52         const LinkStatusItem link_analyzer
53 ) :
54     HostAddress( host_address ),
55     LinkAnalyzer( link_analyzer ),
56     PingFailLimitPercentage( ping_fail_limit_percentage ),
57     PingCongestionLimitPercentage( ping_congestion_limit_percentage ),
58     CongestCausedByFailLimitPercentage(congest_caused_by_fail_limit_percentage),
59     PingDurationCongestionsThresh( ping_duration_congestion_thresh*1000000 ),
60     ResolvedIpCount( 0 ),
61     PingsPerformedCount( 0 ),
62     PingsFailedCount( 0 ),
63     PingCongestionCount( 0 ),
64     ExceededPingFailedLimit( false ),
65     ExceededPingCongestionLimit( false ),
66     NParallelPingers( n_parallel_pings),
67     InBurstMode( false )
68 {
69     BOOST_ASSERT( !HostAddress.empty() );
70     BOOST_ASSERT( ( 0 <= PingFailLimitPercentage )
71                     && ( PingFailLimitPercentage <= 100 ) );
72     BOOST_ASSERT( ( 0 <= PingCongestionLimitPercentage )
73                     && ( PingCongestionLimitPercentage <= 100 ) );
74 }
75
76 HostStatus::~HostStatus()
77 {
78 }
79
80
81 void HostStatus::set_n_parallel_pings(const int n_parallel_pings)
82 {
83     if (ExceededPingCongestionLimit)
84         InBurstMode = true;
85     else
86         InBurstMode = true;
87
88     if (NParallelPingers != n_parallel_pings)
89     {
90         NParallelPingers = n_parallel_pings;
91         reset_ping_counters();
92     }
93     GlobalLogger.debug() << log_prefix() << "#pingers set";
94 }
95
96
97 std::string HostStatus::log_prefix()
98 {
99     std::stringstream temp;
100     temp << "Stat(" << HostAddress << "): "
101         << PingsFailedCount << " fail," << PingCongestionCount << " cong/"
102         << PingsPerformedCount << " pings/" << NParallelPingers << "*"
103         << ResolvedIpCount << " IPs: ";
104     return temp.str();
105 }
106
107 /**
108  * @param resolved_ip_count The number of IPs resolved for the host.
109  */
110 void HostStatus::set_resolved_ip_count( const int resolved_ip_count )
111 {
112     BOOST_ASSERT( 0 <= resolved_ip_count );
113
114     if (resolved_ip_count != ResolvedIpCount)
115     {   // assume that the target has changed --> reset counters
116         reset_ping_counters();
117     }
118     ResolvedIpCount = resolved_ip_count;
119
120     GlobalLogger.debug() << log_prefix() << "#IPs set";
121 }
122
123 /**
124  * @return true if the amount of failed pings given to the host exceeded the
125  * limit.
126  */
127 bool HostStatus::exceeded_ping_failed_limit() const
128 {
129     return ExceededPingFailedLimit;
130 }
131
132 /**
133  * @return true if the amount of congested pings given to the host exceeded the
134  * limit.
135  */
136 bool HostStatus::exceeded_ping_congestion_limit() const
137 {
138     return ExceededPingCongestionLimit;
139 }
140
141 /**
142  * Tells the status analyzer how the last ping went
143  *
144  * @param result: status of ping specifying success/failure and reason of fail
145  * @param ping_duration_us duration of ping in micro seconds
146  */
147 void HostStatus::update_ping_statistics( const PingStatus &result,
148                                          const long ping_duration_us )
149 {
150     float ping_duration_ms = static_cast<float>(ping_duration_us) / 1000.0f;
151
152     GlobalLogger.debug() << log_prefix() << "add ping with result "
153         << to_string(result) << " which took " << ping_duration_ms << " ms";
154
155     BOOST_ASSERT( 0 <= ResolvedIpCount );
156     BOOST_ASSERT( 0 <= PingsPerformedCount );
157     BOOST_ASSERT( PingsFailedCount <= PingsPerformedCount );
158     BOOST_ASSERT( PingCongestionCount <= PingsPerformedCount );
159
160     increase_ping_performed_count();
161
162     bool failed_because_congested = update_congestion_stats( result,
163                                                              ping_duration_us );
164     update_fail_stats( result, failed_because_congested );
165
166     // after we tried all IPs resolved for this host, we can analyze how many
167     // failed
168     if ( tried_all_resolved_ip() )
169     {
170         analyze_ping_statistics();
171
172         reset_ping_counters();
173     }
174
175     BOOST_ASSERT( PingsFailedCount <= PingsPerformedCount );
176     BOOST_ASSERT( PingCongestionCount <= PingsPerformedCount );
177 }
178
179
180 void HostStatus::update_fail_stats( const PingStatus &result,
181                                     const bool failed_because_congested )
182 {
183     if ( result != PingStatus_SuccessReply
184       && result != PingStatus_SuccessOutdatedIP
185       && !failed_because_congested )
186     {
187         increase_ping_failed_count();
188     }
189
190     analyze_ping_failed_count();
191 }
192
193
194 bool HostStatus::update_congestion_stats( const PingStatus &result,
195                                           const long ping_duration_us )
196 {
197     bool is_congested = false;
198     if (ping_duration_us > PingDurationCongestionsThresh)
199         is_congested = true;
200     else if ( result == PingStatus_FailureTimeout )
201         is_congested = true;
202     // PingStatus_FailureNoIP, PingStatus_SuccessOutdatedIP could also be caused
203     // by congestion, but also by other reasons (e.g. firewall blocking port 53)
204
205     if (is_congested)
206         increase_ping_congestion_count();
207
208     analyze_ping_congestion_count();
209
210     return is_congested;
211 }
212
213
214 bool HostStatus::tried_all_resolved_ip() const
215 {
216     BOOST_ASSERT( 0 < PingsPerformedCount );
217
218     return ( PingsPerformedCount >= ResolvedIpCount*NParallelPingers );
219 }
220
221
222 /** @brief called when tried_all_resolved_ip() */
223 void HostStatus::analyze_ping_statistics()
224 {
225     BOOST_ASSERT( !HostAddress.empty() );
226     BOOST_ASSERT( PingsPerformedCount >= ResolvedIpCount*NParallelPingers );
227
228     // timeouts are not counted towards failures, only count as congestions
229     // However, if many pings timed out even in burst mode, then we still
230     // declare the line down
231     float limit = static_cast<float>( PingsPerformedCount
232                                     * CongestCausedByFailLimitPercentage)/100.f;
233     if (InBurstMode && PingCongestionCount > limit)
234     {
235         GlobalLogger.info() << log_prefix()
236             << "Assume congestion is actually caused by compromised connection "
237             << "to host because " << PingCongestionCount << " of "
238             << PingsPerformedCount << " burst pings timed out";
239         PingsFailedCount += PingCongestionCount;
240         PingCongestionCount = 0;
241         ExceededPingFailedLimit = true;
242         ExceededPingCongestionLimit = false;
243     }
244
245     // notify if the amount of pings that failed exceed the limit
246     if ( exceeded_ping_failed_limit() )
247     {
248         GlobalLogger.debug() << log_prefix() << "notify down";
249         LinkAnalyzer->notify_host_down( HostAddress );
250     }
251     else if (exceeded_ping_congestion_limit() && !InBurstMode)
252         // only notify up if will not try burst mode next
253         // otherwise will continuously notify up and down if get timeouts
254         GlobalLogger.notice() << log_prefix() << "will not notify up because "
255             << " will go into burst mode next";
256     else
257     {
258         GlobalLogger.debug() << log_prefix() << "notify up";
259         LinkAnalyzer->notify_host_up( HostAddress );
260     }
261
262     // nothing else to do about congestion here, congestion is not forwarded to
263     // central LinkAnalyzer
264 } //lint !e1762
265
266 void HostStatus::reset_ping_counters()
267 {
268     PingsPerformedCount = 0;
269     PingsFailedCount = 0;
270     PingCongestionCount = 0;
271 }
272
273 void HostStatus::increase_ping_performed_count()
274 {
275     ++PingsPerformedCount;
276
277     BOOST_ASSERT( 0 < PingsPerformedCount );
278 }
279
280 void HostStatus::increase_ping_failed_count()
281 {
282     ++PingsFailedCount;
283
284     BOOST_ASSERT( ( 0 <= PingsFailedCount ) && ( PingsFailedCount <= PingsPerformedCount ) );
285 }
286
287 void HostStatus::increase_ping_congestion_count()
288 {
289     ++PingCongestionCount;
290
291     BOOST_ASSERT( ( 0 <= PingCongestionCount )
292                     && ( PingCongestionCount <= PingsPerformedCount ) );
293 }
294
295 void HostStatus::analyze_ping_failed_count()
296 {
297     BOOST_ASSERT( ( 0 <= PingFailLimitPercentage ) && ( PingFailLimitPercentage <= 100 ) );
298     BOOST_ASSERT( ( 0 <= PingsFailedCount ) && ( PingsFailedCount <= PingsPerformedCount ) );
299
300     int limit = ( PingsPerformedCount * PingFailLimitPercentage) / 100;
301
302     // keep a boolean variable because the PingsFailedCount can be reseted
303     if ( PingsFailedCount > limit )
304     {
305         ExceededPingFailedLimit = true;
306
307         GlobalLogger.debug() << log_prefix() << "exceed fail limit=" << limit;
308     }
309     else
310     {
311         ExceededPingFailedLimit = false;
312
313         GlobalLogger.debug() << log_prefix() << "below fail limit=" << limit;
314     }
315 }
316
317 void HostStatus::analyze_ping_congestion_count()
318 {
319     BOOST_ASSERT( ( 0 <= PingCongestionLimitPercentage )
320                     && ( PingCongestionLimitPercentage <= 100 ) );
321     BOOST_ASSERT( ( 0 <= PingCongestionCount )
322                     && ( PingCongestionCount <= PingsPerformedCount ) );
323
324     int limit = ( PingsPerformedCount * PingCongestionLimitPercentage) / 100;
325
326     // keep a boolean variable because the PingCongestionCount can be reseted
327     if ( PingCongestionCount > limit )
328     {
329         ExceededPingCongestionLimit = true;
330
331         GlobalLogger.debug() << log_prefix() << "exceed congestion limit="
332                              << limit;
333     }
334     else
335     {
336         ExceededPingCongestionLimit = false;
337
338         GlobalLogger.debug() << log_prefix() << "below congestion limit="
339                              << limit;
340     }
341 }