made congestion/offline behaviour more stable: do not declare online right after going...
[pingcheck] / src / host / hoststatus.cpp
1 /*
2 The software in this package is distributed under the GNU General
3 Public License version 2 (with a special exception described below).
4
5 A copy of GNU General Public License (GPL) is included in this distribution,
6 in the file COPYING.GPL.
7
8 As a special exception, if other files instantiate templates or use macros
9 or inline functions from this file, or you compile this file and link it
10 with other works to produce a work based on this file, this file
11 does not by itself cause the resulting work to be covered
12 by the GNU General Public License.
13
14 However the source code for this file must still be made available
15 in accordance with section (3) of the GNU General Public License.
16
17 This exception does not invalidate any other reasons why a work based
18 on this file might be covered by the GNU General Public License.
19 */
20 #include "host/hoststatus.h"
21
22 #include <iostream>
23 #include <logfunc.hpp>
24
25 #include "boost_assert_handler.h"
26
27 using namespace std;
28 using I2n::Logger::GlobalLogger;
29
30 //-----------------------------------------------------------------------------
31 // HostStatus
32 //-----------------------------------------------------------------------------
33
34 /**
35  * @param host_address The address of the host it has to analyze.
36  * @param ping_fail_percentage_limit The percentage threshold of pings that can
37  * fail.
38  * @param ping_congestion_limit_percentage The percentage threshold of pings
39  * that can fail due to line congestion
40  * @param ping_duration_congestion_thresh Threshold in micro seconds that marks
41  * the difference between a "normal" and a congested line
42  * @param n_parallel_pings Number of pings that is sent for each IP
43  * @param link_analyzer The object used to notify the status of the host.
44  */
45 HostStatus::HostStatus(
46         const string &host_address,
47         const int ping_fail_limit_percentage,
48         const int ping_congestion_limit_percentage,
49         const int ping_duration_congestion_thresh,
50         const int n_parallel_pings,
51         const LinkStatusItem link_analyzer
52 ) :
53     HostAddress( host_address ),
54     LinkAnalyzer( link_analyzer ),
55     PingFailLimitPercentage( ping_fail_limit_percentage ),
56     PingCongestionLimitPercentage( ping_congestion_limit_percentage ),
57     PingDurationCongestionsThresh( ping_duration_congestion_thresh*1000000 ),
58     ResolvedIpCount( 0 ),
59     PingsPerformedCount( 0 ),
60     PingsFailedCount( 0 ),
61     PingCongestionCount( 0 ),
62     ExceededPingFailedLimit( false ),
63     ExceededPingCongestionLimit( false ),
64     NParallelPingers( n_parallel_pings),
65     InBurstMode( false )
66 {
67     BOOST_ASSERT( !HostAddress.empty() );
68     BOOST_ASSERT( ( 0 <= PingFailLimitPercentage )
69                     && ( PingFailLimitPercentage <= 100 ) );
70     BOOST_ASSERT( ( 0 <= PingCongestionLimitPercentage )
71                     && ( PingCongestionLimitPercentage <= 100 ) );
72 }
73
74 HostStatus::~HostStatus()
75 {
76 }
77
78
79 void HostStatus::set_n_parallel_pings(const int n_parallel_pings)
80 {
81     if (ExceededPingCongestionLimit)
82         InBurstMode = true;
83     else
84         InBurstMode = true;
85
86     if (NParallelPingers != n_parallel_pings)
87     {
88         NParallelPingers = n_parallel_pings;
89         reset_ping_counters();
90     }
91     GlobalLogger.debug() << log_prefix() << "#pingers set";
92 }
93
94
95 std::string HostStatus::log_prefix()
96 {
97     std::stringstream temp;
98     temp << "Stat(" << HostAddress << "): "
99         << PingsFailedCount << " fail," << PingCongestionCount << " cong/"
100         << PingsPerformedCount << " pings/" << NParallelPingers << "*"
101         << ResolvedIpCount << " IPs: ";
102     return temp.str();
103 }
104
105 /**
106  * @param resolved_ip_count The number of IPs resolved for the host.
107  */
108 void HostStatus::set_resolved_ip_count( const int resolved_ip_count )
109 {
110     BOOST_ASSERT( 0 <= resolved_ip_count );
111
112     if (resolved_ip_count != ResolvedIpCount)
113     {   // assume that the target has changed --> reset counters
114         reset_ping_counters();
115     }
116     ResolvedIpCount = resolved_ip_count;
117
118     GlobalLogger.debug() << log_prefix() << "#IPs set";
119 }
120
121 /**
122  * @return true if the amount of failed pings given to the host exceeded the
123  * limit.
124  */
125 bool HostStatus::exceeded_ping_failed_limit() const
126 {
127     return ExceededPingFailedLimit;
128 }
129
130 /**
131  * @return true if the amount of congested pings given to the host exceeded the
132  * limit.
133  */
134 bool HostStatus::exceeded_ping_congestion_limit() const
135 {
136     return ExceededPingCongestionLimit;
137 }
138
139 /**
140  * Tells the status analyzer how the last ping went
141  *
142  * @param result: status of ping specifying success/failure and reason of fail
143  * @param ping_duration_us duration of ping in micro seconds
144  */
145 void HostStatus::update_ping_statistics( const PingStatus &result,
146                                          const long ping_duration_us )
147 {
148     float ping_duration_ms = static_cast<float>(ping_duration_us) / 1000.0f;
149
150     GlobalLogger.debug() << log_prefix() << "add ping with result "
151         << to_string(result) << " which took " << ping_duration_ms << " ms";
152
153     BOOST_ASSERT( 0 <= ResolvedIpCount );
154     BOOST_ASSERT( 0 <= PingsPerformedCount );
155     BOOST_ASSERT( PingsFailedCount <= PingsPerformedCount );
156     BOOST_ASSERT( PingCongestionCount <= PingsPerformedCount );
157
158     increase_ping_performed_count();
159
160     bool failed_because_congested = update_congestion_stats( result,
161                                                              ping_duration_us );
162     update_fail_stats( result, failed_because_congested );
163
164     // after we tried all IPs resolved for this host, we can analyze how many
165     // failed
166     if ( tried_all_resolved_ip() )
167     {
168         analyze_ping_statistics();
169
170         reset_ping_counters();
171     }
172
173     BOOST_ASSERT( PingsFailedCount <= PingsPerformedCount );
174     BOOST_ASSERT( PingCongestionCount <= PingsPerformedCount );
175 }
176
177
178 void HostStatus::update_fail_stats( const PingStatus &result,
179                                     const bool failed_because_congested )
180 {
181     if ( result != PingStatus_SuccessReply
182       && result != PingStatus_SuccessOutdatedIP
183       && !failed_because_congested )
184     {
185         increase_ping_failed_count();
186     }
187
188     analyze_ping_failed_count();
189 }
190
191
192 bool HostStatus::update_congestion_stats( const PingStatus &result,
193                                           const long ping_duration_us )
194 {
195     bool is_congested = false;
196     if (ping_duration_us > PingDurationCongestionsThresh)
197         is_congested = true;
198     else if ( result == PingStatus_FailureTimeout )
199         is_congested = true;
200     // PingStatus_FailureNoIP, PingStatus_SuccessOutdatedIP could also be caused
201     // by congestion, but also by other reasons (e.g. firewall blocking port 53)
202
203     if (is_congested)
204         increase_ping_congestion_count();
205
206     analyze_ping_congestion_count();
207
208     return is_congested;
209 }
210
211
212 bool HostStatus::tried_all_resolved_ip() const
213 {
214     BOOST_ASSERT( 0 < PingsPerformedCount );
215
216     return ( PingsPerformedCount >= ResolvedIpCount*NParallelPingers );
217 }
218
219
220 /** @brief called when tried_all_resolved_ip() */
221 void HostStatus::analyze_ping_statistics()
222 {
223     BOOST_ASSERT( !HostAddress.empty() );
224     BOOST_ASSERT( PingsPerformedCount >= ResolvedIpCount*NParallelPingers );
225
226     // timeouts are not counted towards failures, only count as congestions
227     // However, if all pings timed out even in burst mode, then we still declare
228     // the line down
229     if (InBurstMode && PingCongestionCount >= PingsPerformedCount)
230     {
231         GlobalLogger.notice() << log_prefix() << "All pings timed out despite "
232             << "using more pings per IP --> assume connection is really down";
233         PingsFailedCount += PingCongestionCount;
234         PingCongestionCount = 0;
235         ExceededPingFailedLimit = true;
236         ExceededPingCongestionLimit = false;
237     }
238
239     // notify if the amount of pings that failed exceed the limit
240     if ( exceeded_ping_failed_limit() )
241     {
242         GlobalLogger.debug() << log_prefix() << "notify down";
243         LinkAnalyzer->notify_host_down( HostAddress );
244     }
245     else if (exceeded_ping_congestion_limit() && !InBurstMode)
246         // only notify up if will not try burst mode next
247         // otherwise will continuously notify up and down if get timeouts
248         GlobalLogger.notice() << log_prefix() << "will not notify up because "
249             << " will go into burst mode next";
250     else
251     {
252         GlobalLogger.debug() << log_prefix() << "notify up";
253         LinkAnalyzer->notify_host_up( HostAddress );
254     }
255
256     // nothing else to do about congestion here, congestion is not forwarded to
257     // central LinkAnalyzer
258 } //lint !e1762
259
260 void HostStatus::reset_ping_counters()
261 {
262     PingsPerformedCount = 0;
263     PingsFailedCount = 0;
264     PingCongestionCount = 0;
265 }
266
267 void HostStatus::increase_ping_performed_count()
268 {
269     ++PingsPerformedCount;
270
271     BOOST_ASSERT( 0 < PingsPerformedCount );
272 }
273
274 void HostStatus::increase_ping_failed_count()
275 {
276     ++PingsFailedCount;
277
278     BOOST_ASSERT( ( 0 <= PingsFailedCount ) && ( PingsFailedCount <= PingsPerformedCount ) );
279 }
280
281 void HostStatus::increase_ping_congestion_count()
282 {
283     ++PingCongestionCount;
284
285     BOOST_ASSERT( ( 0 <= PingCongestionCount )
286                     && ( PingCongestionCount <= PingsPerformedCount ) );
287 }
288
289 void HostStatus::analyze_ping_failed_count()
290 {
291     BOOST_ASSERT( ( 0 <= PingFailLimitPercentage ) && ( PingFailLimitPercentage <= 100 ) );
292     BOOST_ASSERT( ( 0 <= PingsFailedCount ) && ( PingsFailedCount <= PingsPerformedCount ) );
293
294     int limit = ( PingsPerformedCount * PingFailLimitPercentage) / 100;
295
296     // keep a boolean variable because the PingsFailedCount can be reseted
297     if ( PingsFailedCount > limit )
298     {
299         ExceededPingFailedLimit = true;
300
301         GlobalLogger.debug() << log_prefix() << "exceed fail limit=" << limit;
302     }
303     else
304     {
305         ExceededPingFailedLimit = false;
306
307         GlobalLogger.debug() << log_prefix() << "below fail limit=" << limit;
308     }
309 }
310
311 void HostStatus::analyze_ping_congestion_count()
312 {
313     BOOST_ASSERT( ( 0 <= PingCongestionLimitPercentage )
314                     && ( PingCongestionLimitPercentage <= 100 ) );
315     BOOST_ASSERT( ( 0 <= PingCongestionCount )
316                     && ( PingCongestionCount <= PingsPerformedCount ) );
317
318     int limit = ( PingsPerformedCount * PingCongestionLimitPercentage) / 100;
319
320     // keep a boolean variable because the PingCongestionCount can be reseted
321     if ( PingCongestionCount > limit )
322     {
323         ExceededPingCongestionLimit = true;
324
325         GlobalLogger.debug() << log_prefix() << "exceed congestion limit="
326                              << limit;
327     }
328     else
329     {
330         ExceededPingCongestionLimit = false;
331
332         GlobalLogger.debug() << log_prefix() << "below congestion limit="
333                              << limit;
334     }
335 }