to avoid going down in congested line scenario, also need longer ping timeout
[pingcheck] / src / host / pingscheduler.cpp
1 /*
2 The software in this package is distributed under the GNU General
3 Public License version 2 (with a special exception described below).
4
5 A copy of GNU General Public License (GPL) is included in this distribution,
6 in the file COPYING.GPL.
7
8 As a special exception, if other files instantiate templates or use macros
9 or inline functions from this file, or you compile this file and link it
10 with other works to produce a work based on this file, this file
11 does not by itself cause the resulting work to be covered
12 by the GNU General Public License.
13
14 However the source code for this file must still be made available
15 in accordance with section (3) of the GNU General Public License.
16
17 This exception does not invalidate any other reasons why a work based
18 on this file might be covered by the GNU General Public License.
19 */
20 #include "host/pingscheduler.h"
21
22 #include <iostream>
23 #include <limits>
24
25 #include <boost/bind.hpp>
26 #include <boost/foreach.hpp>
27
28 #include <logfunc.hpp>
29
30 #include "boost_assert_handler.h"
31 #include "host/pingerfactory.h"
32 #include "dns/dnsmaster.h"
33 #include "icmp/icmppinger.h"
34 #include "link/linkstatus.h"
35
36 using namespace std;
37 using boost::asio::io_service;
38 using boost::bind;
39 using boost::date_time::time_resolution_traits_adapted64_impl;
40 using boost::posix_time::microsec_clock;
41 using boost::posix_time::ptime;
42 using boost::posix_time::seconds;
43 using boost::posix_time::milliseconds;
44 using boost::shared_ptr;
45 using I2n::Logger::GlobalLogger;
46
47
48 //-----------------------------------------------------------------------------
49 // PingScheduler
50 //-----------------------------------------------------------------------------
51
52 /**
53  * @brief Parameterized constructor.
54  *
55  * @param io_serv The one @c io_serv object that controls async processing
56  * @param network_interface The name of the network interface sending the pings.
57  * @param destination_address The remote address to ping.
58  * @param destination_port The remote port to ping.
59  * @param ping_protocol_list A list of protocols to use.
60  * @param ping_interval_in_sec Amount of time between each ping.
61  * @param ping_fail_percentage_limit Maximum amount of pings that can fail.
62  * @param ping_congestion_percentage_limit Amount of pings indication congested
63  * line
64  * @param ping_congestion_duration_thresh Duration in seconds that indicates a
65  * congested line
66  * @param ping_reply_timeout Max amount time to wait for ping to finish
67  * @param link_analyzer The object to monitor the link status.
68  * @param first_delay Delay in seconds from start_pinging to first ping attempt
69  * @param n_parallel_pings: Number of pingers to ping the same IP in parallel
70  */
71 PingScheduler::PingScheduler(
72         const IoServiceItem io_serv,
73         const string &network_interface,
74         const string &destination_address,
75         const uint16_t destination_port,
76         const PingProtocolList &ping_protocol_list,
77         const long ping_interval_in_sec,
78         const int ping_fail_percentage_limit,
79         const int ping_congestion_percentage_limit,
80         const int ping_congestion_duration_thresh,
81         const int ping_reply_timeout,
82         LinkStatusItem link_analyzer,
83         const int first_delay,
84         const int n_parallel_pings,
85         const int parallel_ping_delay,
86         const int ping_timeout_factor
87 ) :
88     IoService( io_serv ),
89     NetworkInterfaceName( network_interface ),
90     DestinationAddress( destination_address ),
91     DestinationPort( destination_port ),
92     Protocols( ping_protocol_list ),
93     ProtocolIter(),
94     PingIntervalInSec( ping_interval_in_sec ),
95     NPingers( n_parallel_pings ),
96     FirstDelay( first_delay ),
97     NextPingTimer( *io_serv ),
98     TimeSentLastPing( microsec_clock::universal_time() ),
99     PingReplyTimeout( ping_reply_timeout ),
100     PingReplyTimeoutOrig( ping_reply_timeout ),
101     HostAnalyzer( destination_address, ping_fail_percentage_limit,
102                   ping_congestion_percentage_limit,
103                   ping_congestion_duration_thresh, n_parallel_pings,
104                   link_analyzer ),
105     Resolver(),
106     Pingers(),
107     NPingersDone( 0 ),
108     ParallelPingDelay( parallel_ping_delay ),
109     DelayedPingTimer( *io_serv ),
110     WantToPing( false ),
111     LogPrefix(),
112     ContinueOnOutdatedIps( false ),
113     PingTimeoutFactor( ping_timeout_factor )
114 {
115     BOOST_ASSERT( !network_interface.empty() );
116     BOOST_ASSERT( !destination_address.empty() );
117     BOOST_ASSERT( ( 0 < destination_port ) &&
118                   ( destination_port < numeric_limits<uint16_t>::max() ) );
119     BOOST_ASSERT( 0 < ping_interval_in_sec );
120     BOOST_ASSERT( (0 <= ping_fail_percentage_limit) &&
121                   ( ping_fail_percentage_limit <= 100) );
122
123     update_log_prefix();
124
125     init_ping_protocol();
126
127     // start resolving already so we are prepared to ping
128     update_dns_resolver();
129
130 }
131
132 /**
133  * @brief Destructor.
134  */
135 PingScheduler::~PingScheduler()
136 {
137 }
138
139 void PingScheduler::stop_pinging()
140 {
141     // stop pinger and resolver
142     GlobalLogger.debug() << LogPrefix << "scheduler: stop pinging";
143     clear_pingers();
144     cancel_resolve(true);
145
146     // now cancel the own timer in case that pinger cancelation called callback
147     GlobalLogger.debug() << LogPrefix << "scheduler: cancel timer";
148     NextPingTimer.cancel();
149 }
150
151 /**
152  * @brief stop all pingers and remove them from Pingers variable which will
153  *   proboably cause their destruction
154  *   
155  * Pingers is empty afterwards
156  */
157 void PingScheduler::clear_pingers()
158 {
159     PingerItem pinger;
160     while(!Pingers.empty())
161     {
162         pinger = Pingers.back();
163         pinger->stop_pinging();
164         Pingers.pop_back();
165     }
166 }
167
168 /**
169  * @brief Start into infinite loop of calls to ping
170  *
171  * Does not start yet but set NextPingTimer (possibly to 0), so action starts
172  *   when io_service is started
173  */
174 void PingScheduler::start_pinging()
175 {
176     if ( FirstDelay > 0 )
177         GlobalLogger.info() << LogPrefix << "Delaying first ping by "
178                                          << FirstDelay << "s";
179     else
180         GlobalLogger.info() << LogPrefix << "Schedule ping as soon as possible";
181
182     (void) NextPingTimer.expires_from_now( seconds( FirstDelay ) );
183     NextPingTimer.async_wait( bind( &PingScheduler::ping, this,
184                                           boost::asio::placeholders::error ) );
185 }
186
187
188 /**
189  * @brief call Ping::ping and schedule a call to ping_done_handler when finished
190  */
191 void PingScheduler::ping(const boost::system::error_code &error)
192 {
193     if ( error )
194     {   // get here, e.g. by NextPingTimer.cancel in stop_pinging
195         if ( error ==  boost::asio::error::operation_aborted )
196             GlobalLogger.error() << LogPrefix << "Timer for ping was cancelled!"
197                                  << " --> Stopping";
198         else
199             GlobalLogger.error() << LogPrefix << "Received error " << error
200                                  << " waiting for ping! Stopping";
201         return;
202     }
203
204     // ping as soon as dns is ready
205     WantToPing = true;
206     ping_when_ready();
207 }
208
209
210 void PingScheduler::ping_when_ready()
211 {
212     if ( !WantToPing )
213     {
214         GlobalLogger.info() << LogPrefix << "waiting for ping request "
215             << "(should take no more than " << PingIntervalInSec << "s)";
216         return;
217     }
218     else if ( Resolver && Resolver->is_resolving() )
219     {
220         GlobalLogger.info() << LogPrefix << "waiting for DNS to finish";
221         return;
222     }
223     else if ( !Resolver )
224     {
225         // should not happen, but check anyway
226         GlobalLogger.warning() << LogPrefix << "Have no resolver!";
227         return;
228     }
229
230     GlobalLogger.info() << LogPrefix << "start ping";
231     WantToPing = false;
232
233     // try to get an up-to-date IP (ContinueOnOutdatedIps may only be set
234     //   because a CNAME was out of date -- IPs may still be current)
235     HostAddress ip = Resolver->get_next_ip();
236
237     if ( !ip.is_valid() )
238     {   // this can happen in 2 cases: if ContinueOnOutdatedIps==true
239         // or when ip went out of date between resolve and now
240         // --> try to use outdated IP
241         GlobalLogger.info() << LogPrefix << "Checking for outdated IPs";
242         bool check_up_to_date = false;
243         ip = Resolver->get_next_ip(check_up_to_date);
244     }
245     if ( !ip.is_valid() )
246     {   // Do not even have an outdated IP!
247         // This happens if have no cached IPs and resolve failed
248         GlobalLogger.info() << LogPrefix << "Not even outdated IP to ping "
249             << "-- treat like a failed ping.";
250
251         // skip the ping and directly call ping_done_handler the appropriate
252         // number of times
253         for (int count=0; count<NPingers; ++count)
254             ping_done_handler(PingStatus_FailureNoIP, 0);
255     }
256     else
257     {
258         // create new pingers
259         for (int count=0; count<NPingers; ++count)
260             Pingers.push_back(
261                     PingerFactory::createPinger(*ProtocolIter, IoService,
262                                                 NetworkInterfaceName,
263                                                 PingReplyTimeout) );
264
265         // remember when pinging started
266         TimeSentLastPing = microsec_clock::universal_time();
267
268         // get actual IP
269         boost::asio::ip::address actual_ip = ip.get_ip();
270         GlobalLogger.info() << LogPrefix << "pinging IP " << actual_ip
271             << " with TTL " << ip.get_ttl().get_updated_value() << "s";
272         delayed_ping(boost::system::error_code(), actual_ip, 0);
273     }
274 }
275
276 void PingScheduler::delayed_ping( const boost::system::error_code &error,
277                                   const boost::asio::ip::address ip,
278                                   const int pinger_index )
279 {
280     if (error)
281     {
282         GlobalLogger.info() << LogPrefix << "delayed ping received an error: "
283                                          << error;
284         return;
285     }
286
287     GlobalLogger.debug() << LogPrefix << "starting delayed ping index "
288                           << pinger_index;
289     Pingers[pinger_index]->ping(ip,
290                                 DestinationPort,
291                                 boost::bind(&PingScheduler::ping_done_handler,
292                                                                 this, _1, _2) );
293     if (pinger_index >= NPingers-1)
294         GlobalLogger.debug() << LogPrefix << "started all delayed pings";
295     else
296     {
297         DelayedPingTimer.expires_from_now( milliseconds(ParallelPingDelay) );
298         DelayedPingTimer.async_wait( bind( &PingScheduler::delayed_ping,
299                   this, boost::asio::placeholders::error, ip, pinger_index+1) );
300     }
301 }
302
303
304 //------------------------------------------------------------------------------
305 // Post Processing of Ping result and Preparation for next ping
306 //------------------------------------------------------------------------------
307
308 /**
309  * @brief called when Ping::ping is done; calls functions to update
310  *   statistics, ping interval and elapsed time;
311  *   schedules a call to ping, thereby closing the loop
312  */
313 void PingScheduler::ping_done_handler( const PingStatus &result,
314                                        const long ping_duration_us )
315 {
316     PingStatus edited_result = result;
317     if (result == PingStatus_SuccessReply && ContinueOnOutdatedIps)
318     {
319         edited_result = PingStatus_SuccessOutdatedIP;
320
321         // reset ContinueOnOutdatedIps
322         ContinueOnOutdatedIps = false;
323         update_log_prefix();
324     }
325
326     ++NPingersDone;
327     GlobalLogger.info() << LogPrefix << "Ping " << NPingersDone << " of "
328         << NPingers << " done with result " << to_string(edited_result);
329
330     // post-processing
331     // can call update_ping_interval only after update_ping_statistics!
332     HostAnalyzer.update_ping_statistics( edited_result, ping_duration_us );
333
334     // prepare next ping only after all pingers are done
335     if (NPingersDone == NPingers)
336     {
337         // stop and destruct all pingers
338         clear_pingers();
339
340         GlobalLogger.debug() << LogPrefix
341             << "--------------------------------------------------------------";
342
343         // update variables for next ping: number of pings, delay, protocol
344         update_ping_protocol();
345         update_ping_interval();
346         update_ping_number();
347
348         prepare_next_ping();
349     }
350 }
351
352
353 void PingScheduler::prepare_next_ping()
354 {
355     // start DNS resolve if necessary
356     update_dns_resolver();
357
358     NPingersDone = 0;
359
360     // schedule next ping
361     int seconds_since_last_ping = (microsec_clock::universal_time()
362                                             - TimeSentLastPing).total_seconds();
363     if ( seconds_since_last_ping > PingIntervalInSec )
364     {
365         GlobalLogger.info() << "We are late for next ping!";
366         seconds_since_last_ping = PingIntervalInSec;
367         (void) NextPingTimer.expires_from_now( seconds(0) );
368     }
369     else
370         (void) NextPingTimer.expires_from_now( seconds( PingIntervalInSec
371                                                   - seconds_since_last_ping ) );
372     NextPingTimer.async_wait( bind( &PingScheduler::ping, this,
373                                          boost::asio::placeholders::error ) );
374 }
375
376 void PingScheduler::update_ping_interval()
377 {
378     // have to ping more often?
379     if ( HostAnalyzer.exceeded_ping_failed_limit() )
380     {
381         PingIntervalInSec.speed_up();
382
383         GlobalLogger.debug() << LogPrefix << "- Speeding up ping interval to: "
384                              << PingIntervalInSec << "s";
385         PingReplyTimeout = PingReplyTimeoutOrig * PingTimeoutFactor;
386         GlobalLogger.debug() << LogPrefix << "- Increase ping timeout to "
387                              << PingReplyTimeout << "s";
388     }
389     else
390     {
391         PingIntervalInSec.back_to_original();
392
393         GlobalLogger.debug() << LogPrefix << "- Stick to the original ping "
394                              << "interval: " << PingIntervalInSec << "s";
395         PingReplyTimeout = PingReplyTimeoutOrig;
396         GlobalLogger.debug() << LogPrefix << "- Reset ping timeout to "
397                              << PingReplyTimeout << "s";
398     }
399 }
400
401 /** in case of congested line, increase number of pings
402  *
403  * CAUTION! Only call this after clear_pingers !!!
404  * */
405 void PingScheduler::update_ping_number()
406 {
407     // make sure we do not loose track of pingers here
408     if ( NPingersDone != NPingers  || !Pingers.empty() )
409     {
410         GlobalLogger.warning() << LogPrefix << "Should only change number of "
411             << "pingers when all are finished and deleted! Have " << NPingers
412             << " pingers, " << NPingersDone << " of which are done and "
413             << Pingers.size() << " in listDone! Will not change NPingers.";
414         return;
415     }
416
417     if ( HostAnalyzer.exceeded_ping_congestion_limit() )
418     {
419         NPingers.increase();
420
421         GlobalLogger.notice() << LogPrefix << "Line appears congested!";
422         GlobalLogger.debug() << LogPrefix << "- Increasing ping number to: "
423                              << NPingers;
424     }
425     else
426     {
427         NPingers.back_to_original();
428
429         GlobalLogger.debug() << LogPrefix << "- Stick to the original ping "
430                              << "number: " << NPingers;
431     }
432
433     HostAnalyzer.set_n_parallel_pings(NPingers);
434 }
435
436 //------------------------------------------------------------------------------
437 // Ping Protocol Rotation
438 //------------------------------------------------------------------------------
439
440 void PingScheduler::init_ping_protocol()
441 {
442     ProtocolIter = Protocols.end();
443     get_next_ping_protocol();
444 }
445
446 void PingScheduler::update_ping_protocol()
447 {
448     if ( can_change_ping_protocol() )
449     {
450         get_next_ping_protocol();
451     }
452 }
453
454 void PingScheduler::get_next_ping_protocol()
455 {
456     ++ProtocolIter;
457     if (ProtocolIter == Protocols.end())
458         ProtocolIter = Protocols.begin();
459 }
460
461 bool PingScheduler::can_change_ping_protocol() const
462 {
463     // TODO can_change_ping_protocol() and get_next_ping_protocol() may be
464     // implemented in a Algorithm class that can be exchanged in this class to
465     // provide an algorithm neutral class
466     return true;
467 }
468
469 //------------------------------------------------------------------------------
470 // DNS host name resolution
471 //------------------------------------------------------------------------------
472
473 // show "!" after host name if running on outdated IPs
474 void PingScheduler::update_log_prefix()
475 {
476     std::stringstream temp;
477     temp << "Sched(" << DestinationAddress;
478     if (ContinueOnOutdatedIps)
479         temp << "!";
480     temp << "): ";
481     LogPrefix = temp.str();
482 }
483
484 void PingScheduler::update_dns_resolver()
485 {
486     if (Resolver && Resolver->is_resolving())
487         cancel_resolve(false);
488
489     if (ContinueOnOutdatedIps)
490     {
491         ContinueOnOutdatedIps = false;
492         update_log_prefix();
493     }
494
495     // DNS master caches created resolvers and resolved IPs, so this will
496     // probably just return an existing resolver with already resolved IPs for
497     // requested protocol ( ICMP/TCP is ignored, only IPv4/v6 is important)
498     Resolver = DnsMaster::get_instance()->get_resolver_for(DestinationAddress,
499                                                            *ProtocolIter);
500
501     // get number of up-to-date IPs
502     // TODO should check here, if they will be up to date in PingIntervalInSec
503     bool check_up_to_date = true;
504     int ip_count = Resolver->get_resolved_ip_count(check_up_to_date);
505     if (ip_count > 0)
506     {
507         GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
508             << ip_count << " (IPs may be outdated=" << !check_up_to_date << ")";
509         HostAnalyzer.set_resolved_ip_count( ip_count );
510
511         if (Resolver->is_resolving())
512             GlobalLogger.warning() << LogPrefix << "have up to date IPs but "
513                 << "resolver seems to be resolving all the same... "
514                 << "Start pinging anyway!";
515         ping_when_ready();
516     }
517     else
518     {
519         GlobalLogger.info() << LogPrefix
520                             << "No up-to-date IPs --> start resolve";
521         start_resolving_ping_address();
522         // set resolved_ip_count will be called in resolve callback
523     }
524 }
525
526 void PingScheduler::start_resolving_ping_address()
527 {
528     Resolver->async_resolve( boost::bind(&PingScheduler::dns_resolve_callback,
529                                           this, _1, _2) );
530 }
531
532 void PingScheduler::dns_resolve_callback(const bool was_success,
533                                          const int recursion_count)
534 {
535     GlobalLogger.info() << LogPrefix << "dns resolution finished "
536                         << "with success = " << was_success << " "
537                         << "after " << recursion_count << " recursions";
538
539     if ( was_success )
540     {
541         // trust that a successfull DNS resolve means we have an IP with TTL>0
542         int ip_count = Resolver->get_resolved_ip_count(!ContinueOnOutdatedIps);
543         if (ip_count == 0)
544         {
545             GlobalLogger.warning() << LogPrefix
546                 << "Should not have reached this case: resolve was "
547                 << "successfull but still have no IPs (up-to-date="
548                 << !ContinueOnOutdatedIps << ")!";
549             if (DnsMaster::get_instance()->get_resolved_ip_ttl_threshold() > 0)
550                 GlobalLogger.warning() << LogPrefix << "This probably happened "
551                     << "because you specified a TTL threshold > 0 but resolving"
552                     << " had no effect on TTLs since external cache is only "
553                     << "updated when TTL=0 is reached.";
554         }
555         else
556         {
557             GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
558                 << ip_count << " (IPs may be outdated="
559                 << ContinueOnOutdatedIps << ") --> could ping now";
560         }
561         HostAnalyzer.set_resolved_ip_count( ip_count );
562         ping_when_ready();
563     }
564     else
565     {   // host name resolution failed; try again bypassing first outdated CNAME
566         // or using cached IP
567         std::string skip_host = Resolver->get_skip_cname();
568
569         if (skip_host.empty())
570         {   // try to continue with cached IPs
571             int ip_count = Resolver->get_resolved_ip_count(false);
572
573             if (ip_count == 0)
574                 GlobalLogger.notice() << LogPrefix << "DNS failed "
575                     << "and have no cached IPs either --> cannot ping";
576                 // ping_when_ready will deal with this case
577             else
578             {
579                 ContinueOnOutdatedIps = true;
580                 update_log_prefix();
581
582                 GlobalLogger.notice() << LogPrefix << "DNS failed, "
583                     << "try anyway with cached data";
584             }
585
586             GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
587                 << ip_count << " (IPs may be outdated=" << true << ")";
588             HostAnalyzer.set_resolved_ip_count( ip_count );
589
590             ping_when_ready();
591         }
592         else
593         {   // have CNAME to continue
594             ContinueOnOutdatedIps = true;
595             update_log_prefix();
596             GlobalLogger.notice() << LogPrefix << "DNS failed, "
597                 << "try again skipping a CNAME and resolving "
598                 << skip_host << " directly";
599
600             cancel_resolve(false);
601
602             // now create new resolver
603             Resolver = DnsMaster::get_instance()
604                                    ->get_resolver_for(skip_host, *ProtocolIter);
605             start_resolving_ping_address();
606         }
607     }
608 }
609
610 /**
611  * cancel resolver if force_cancel or if it is not resolving DestinationAddress
612  *
613  * Resolvers have a life on their own: they are cached by DnsMaster so never go
614  *   out of scope and even after calling callbacks, there might still be a
615  *   longterm timer active to re-try resolving.
616  * We want to cancel that long-term timer only if the Resolver is not for our
617  *   real, original DestinationAddress but a CNAME, which can happen when trying
618  *   to skip cnames and working on out-dated IPs
619  */
620 void PingScheduler::cancel_resolve(const bool force_cancel)
621 {
622     if (force_cancel)
623     {
624         GlobalLogger.info() << "Cancelling resolver (forced)";
625         Resolver->cancel_resolve();
626     }
627     else if ( Resolver->get_hostname() == DestinationAddress )
628         GlobalLogger.info() << LogPrefix
629                             << "Leave original resolver active in background";
630     else
631     {
632         GlobalLogger.info() << LogPrefix << "Cancel resolver for "
633             << Resolver->get_hostname() << " since is not the original "
634             << DestinationAddress;
635         Resolver->cancel_resolve();
636     }
637 }
638