if sending several pings in parallel, delay them in scheduler
[pingcheck] / src / host / pingscheduler.cpp
1 /*
2 The software in this package is distributed under the GNU General
3 Public License version 2 (with a special exception described below).
4
5 A copy of GNU General Public License (GPL) is included in this distribution,
6 in the file COPYING.GPL.
7
8 As a special exception, if other files instantiate templates or use macros
9 or inline functions from this file, or you compile this file and link it
10 with other works to produce a work based on this file, this file
11 does not by itself cause the resulting work to be covered
12 by the GNU General Public License.
13
14 However the source code for this file must still be made available
15 in accordance with section (3) of the GNU General Public License.
16
17 This exception does not invalidate any other reasons why a work based
18 on this file might be covered by the GNU General Public License.
19 */
20 #include "host/pingscheduler.h"
21
22 #include <iostream>
23 #include <limits>
24
25 #include <boost/bind.hpp>
26 #include <boost/foreach.hpp>
27
28 #include <logfunc.hpp>
29
30 #include "boost_assert_handler.h"
31 #include "host/pingerfactory.h"
32 #include "dns/dnsmaster.h"
33 #include "icmp/icmppinger.h"
34 #include "link/linkstatus.h"
35
36 using namespace std;
37 using boost::asio::io_service;
38 using boost::bind;
39 using boost::date_time::time_resolution_traits_adapted64_impl;
40 using boost::posix_time::microsec_clock;
41 using boost::posix_time::ptime;
42 using boost::posix_time::seconds;
43 using boost::posix_time::milliseconds;
44 using boost::shared_ptr;
45 using I2n::Logger::GlobalLogger;
46
47 //-----------------------------------------------------------------------------
48 // PingScheduler
49 //-----------------------------------------------------------------------------
50
51 /**
52  * @brief Parameterized constructor.
53  *
54  * @param io_serv The one @c io_serv object that controls async processing
55  * @param network_interface The name of the network interface sending the pings.
56  * @param destination_address The remote address to ping.
57  * @param destination_port The remote port to ping.
58  * @param ping_protocol_list A list of protocols to use.
59  * @param ping_interval_in_sec Amount of time between each ping.
60  * @param ping_fail_percentage_limit Maximum amount of pings that can fail.
61  * @param ping_reply_timeout Max amount time to wait for ping to finish
62  * @param link_analyzer The object to monitor the link status.
63  * @param first_delay Delay in seconds from start_pinging to first ping attempt
64  * @param n_parallel_pings: Number of pingers to ping the same IP in parallel
65  */
66 PingScheduler::PingScheduler(
67         const IoServiceItem io_serv,
68         const string &network_interface,
69         const string &destination_address,
70         const uint16_t destination_port,
71         const PingProtocolList &ping_protocol_list,
72         const long ping_interval_in_sec,
73         const int ping_fail_percentage_limit,
74         const int ping_reply_timeout,
75         LinkStatusItem link_analyzer,
76         const int first_delay,
77         const int n_parallel_pings,
78         const int parallel_ping_delay
79 ) :
80     IoService( io_serv ),
81     NetworkInterfaceName( network_interface ),
82     DestinationAddress( destination_address ),
83     DestinationPort( destination_port ),
84     Protocols( ping_protocol_list ),
85     ProtocolIter(),
86     PingIntervalInSec( ping_interval_in_sec ),
87     FirstDelay( first_delay ),
88     NextPingTimer( *io_serv ),
89     TimeSentLastPing( microsec_clock::universal_time() ),
90     PingReplyTimeout( ping_reply_timeout ),
91     HostAnalyzer( destination_address, ping_fail_percentage_limit,
92                   n_parallel_pings, link_analyzer ),
93     Resolver(),
94     Pingers(),
95     NPingers( n_parallel_pings ),
96     NPingersDone( 0 ),
97     ParallelPingDelay( parallel_ping_delay ),
98     DelayedPingTimer( *io_serv ),
99     WantToPing( false ),
100     LogPrefix(),
101     ContinueOnOutdatedIps( false )
102 {
103     BOOST_ASSERT( !network_interface.empty() );
104     BOOST_ASSERT( !destination_address.empty() );
105     BOOST_ASSERT( ( 0 < destination_port ) &&
106                   ( destination_port < numeric_limits<uint16_t>::max() ) );
107     BOOST_ASSERT( 0 < ping_interval_in_sec );
108     BOOST_ASSERT( (0 <= ping_fail_percentage_limit) &&
109                   ( ping_fail_percentage_limit <= 100) );
110
111     update_log_prefix();
112
113     init_ping_protocol();
114 }
115
116 /**
117  * @brief Destructor.
118  */
119 PingScheduler::~PingScheduler()
120 {
121 }
122
123 void PingScheduler::stop_pinging()
124 {
125     // stop pinger and resolver
126     GlobalLogger.debug() << LogPrefix << "scheduler: stop pinging";
127     clear_pingers();
128     cancel_resolve(true);
129
130     // now cancel the own timer in case that pinger cancelation called callback
131     GlobalLogger.debug() << LogPrefix << "scheduler: cancel timer";
132     NextPingTimer.cancel();
133 }
134
135 /**
136  * @brief stop all pingers and remove them from Pingers variable which will
137  *   proboably cause their destruction
138  *   
139  * Pingers is empty afterwards
140  */
141 void PingScheduler::clear_pingers()
142 {
143     PingerItem pinger;
144     while(!Pingers.empty())
145     {
146         pinger = Pingers.back();
147         pinger->stop_pinging();
148         Pingers.pop_back();
149     }
150 }
151
152 /**
153  * @brief Start into infinite loop of calls to ping
154  *
155  * Does not start yet but set NextPingTimer (possibly to 0), so action starts
156  *   when io_service is started
157  */
158 void PingScheduler::start_pinging()
159 {
160     if ( FirstDelay > 0 )
161         GlobalLogger.info() << LogPrefix << "Delaying first ping by "
162                                          << FirstDelay << "s";
163     else
164         GlobalLogger.info() << LogPrefix << "Schedule ping as soon as possible";
165
166     (void) NextPingTimer.expires_from_now( seconds( FirstDelay ) );
167     NextPingTimer.async_wait( bind( &PingScheduler::ping, this,
168                                           boost::asio::placeholders::error ) );
169 }
170
171
172 /**
173  * @brief call Ping::ping and schedule a call to ping_done_handler when finished
174  */
175 void PingScheduler::ping(const boost::system::error_code &error)
176 {
177     if ( error )
178     {   // get here, e.g. by NextPingTimer.cancel in stop_pinging
179         if ( error ==  boost::asio::error::operation_aborted )
180             GlobalLogger.error() << LogPrefix << "Timer for ping was cancelled!"
181                                  << " --> Stopping";
182         else
183             GlobalLogger.error() << LogPrefix << "Received error " << error
184                                  << " waiting for ping! Stopping";
185         return;
186     }
187
188     // ping as soon as dns is ready
189     WantToPing = true;
190     ping_when_ready();
191 }
192
193
194 void PingScheduler::ping_when_ready()
195 {
196     if ( !WantToPing )
197     {
198         GlobalLogger.info() << LogPrefix << "waiting for ping request "
199             << "(should take no more than " << PingIntervalInSec << "s)";
200         return;
201     }
202     else if ( Resolver && Resolver->is_resolving() )
203     {
204         GlobalLogger.info() << LogPrefix << "waiting for DNS to finish";
205         return;
206     }
207     else if ( !Resolver )
208         // should not happen, but check anyway
209         GlobalLogger.warning() << LogPrefix << "Have no resolver!";
210
211     GlobalLogger.info() << LogPrefix << "start ping";
212     WantToPing = false;
213
214     // try to get an up-to-date IP (ContinueOnOutdatedIps may only be set
215     //   because a CNAME was out of date -- IPs may still be current)
216     HostAddress ip = Resolver->get_next_ip();
217
218     if ( !ip.is_valid() )
219     {   // this can happen in 2 cases: if ContinueOnOutdatedIps==true
220         // or when ip went out of date between resolve and now
221         // --> try to use outdated IP
222         GlobalLogger.info() << LogPrefix << "Checking for outdated IPs";
223         bool check_up_to_date = false;
224         ip = Resolver->get_next_ip(check_up_to_date);
225     }
226     if ( !ip.is_valid() )
227     {   // Do not even have an outdated IP!
228         // This happens if have no cached IPs and resolve failed
229         GlobalLogger.info() << LogPrefix << "Not even outdated IP to ping "
230             << "-- treat like a failed ping.";
231
232         // skip the ping and directly call ping_done_handler
233         HostAnalyzer.set_resolved_ip_count(1);   // must have been 0 --> failed
234              // ping would create failed assumption (nPings > nIPs)
235         ping_done_handler(PingStatus_FailureNoIP, 0);
236         HostAnalyzer.set_resolved_ip_count(0);   // set back
237     }
238     else
239     {
240         boost::asio::ip::address actual_ip = ip.get_ip();
241         GlobalLogger.info() << LogPrefix << "pinging IP " << actual_ip
242             << " with TTL " << ip.get_ttl().get_updated_value() << "s";
243         delayed_ping(boost::system::error_code(), actual_ip, 0);
244         NPingersDone = 0;
245         TimeSentLastPing = microsec_clock::universal_time();
246     }
247 }
248
249 void PingScheduler::delayed_ping( const boost::system::error_code &error,
250                                   const boost::asio::ip::address &ip,
251                                   const int pinger_index )
252 {
253     if (error)
254     {
255         GlobalLogger.info() << LogPrefix << "delayed ping received an error: "
256                                          << error;
257         return;
258     }
259     if (pinger_index == NPingers)
260     {
261         GlobalLogger.debug() << LogPrefix << "started all delayed pings";
262         return;
263     }
264
265     GlobalLogger.debug() << LogPrefix << "starting delayed ping index "
266                           << pinger_index;
267     Pingers[pinger_index]->ping(ip,
268                                 DestinationPort,
269                                 boost::bind(&PingScheduler::ping_done_handler,
270                                                                 this, _1, _2) );
271     DelayedPingTimer.expires_from_now( milliseconds(ParallelPingDelay) );
272     DelayedPingTimer.async_wait( bind( &PingScheduler::delayed_ping,
273                   this, boost::asio::placeholders::error, ip, pinger_index+1) );
274 }
275
276
277 //------------------------------------------------------------------------------
278 // Post Processing of Ping result and Preparation for next ping
279 //------------------------------------------------------------------------------
280
281 /**
282  * @brief called when Ping::ping is done; calls functions to update
283  *   statistics, ping interval and elapsed time;
284  *   schedules a call to ping, thereby closing the loop
285  */
286 void PingScheduler::ping_done_handler( const PingStatus &result,
287                                        const long ping_duration_us )
288 {
289     PingStatus edited_result = result;
290     if (result == PingStatus_SuccessReply && ContinueOnOutdatedIps)
291     {
292         edited_result = PingStatus_SuccessOutdatedIP;
293
294         // reset ContinueOnOutdatedIps
295         ContinueOnOutdatedIps = false;
296         update_log_prefix();
297     }
298
299     ++NPingersDone;
300     GlobalLogger.info() << LogPrefix << "Ping " << NPingersDone << " of "
301         << NPingers << " done with result " << to_string(edited_result);
302
303     // post-processing
304     // can call update_ping_interval only after update_ping_statistics!
305     HostAnalyzer.update_ping_statistics( edited_result, ping_duration_us );
306
307     // prepare next ping only after all pingers are done
308     if (NPingersDone == NPingers)
309         prepare_next_ping();
310 }
311
312
313 void PingScheduler::prepare_next_ping()
314 {
315     update_ping_interval();
316
317     // get next protocol, possibly start resolving IPs
318     update_ping_protocol();
319
320     // schedule next ping
321     int seconds_since_last_ping = (microsec_clock::universal_time()
322                                             - TimeSentLastPing).total_seconds();
323     if ( seconds_since_last_ping > PingIntervalInSec )
324     {
325         GlobalLogger.info() << "We are late for next ping!";
326         seconds_since_last_ping = PingIntervalInSec;
327         (void) NextPingTimer.expires_from_now( seconds(0) );
328     }
329     else
330         (void) NextPingTimer.expires_from_now( seconds( PingIntervalInSec
331                                                   - seconds_since_last_ping ) );
332     NextPingTimer.async_wait( bind( &PingScheduler::ping, this,
333                                          boost::asio::placeholders::error ) );
334 }
335
336 void PingScheduler::update_ping_interval()
337 {
338     // have to ping more often?
339     if ( HostAnalyzer.exceeded_ping_failed_limit() )
340     {
341         PingIntervalInSec.speed_up();
342
343         GlobalLogger.debug() << LogPrefix << "- Speeding up ping interval to: "
344                              << PingIntervalInSec << "s";
345     }
346     else
347     {
348         PingIntervalInSec.back_to_original();
349
350         GlobalLogger.debug() << LogPrefix << "- Stick to the original ping "
351                              << "interval: " << PingIntervalInSec << "s";
352     }
353 }
354
355 //------------------------------------------------------------------------------
356 // Ping Protocol Rotation
357 //------------------------------------------------------------------------------
358
359 void PingScheduler::init_ping_protocol()
360 {
361     ProtocolIter = Protocols.end();
362     get_next_ping_protocol();
363 }
364
365 void PingScheduler::update_ping_protocol()
366 {
367     if ( can_change_ping_protocol() )
368     {
369         get_next_ping_protocol();
370     }
371 }
372
373 void PingScheduler::get_next_ping_protocol()
374 {
375     // stop and destruct all pingers
376     clear_pingers();
377     GlobalLogger.debug() << LogPrefix
378         << "------------------------------------------------------------------";
379
380     // get next protocol
381     ++ProtocolIter;
382     if (ProtocolIter == Protocols.end())
383         ProtocolIter = Protocols.begin();
384     PingProtocol ping_protocol = *ProtocolIter;
385     // --> ProtocolIter still points to currently used protocol which is
386     //     required in dns_resolve_callback
387
388     // create new pingers
389     for (int count=0; count<NPingers; ++count)
390         Pingers.push_back( PingerFactory::createPinger(ping_protocol, IoService,
391                                       NetworkInterfaceName, PingReplyTimeout) );
392
393     update_dns_resolver( ping_protocol );
394 }
395
396 bool PingScheduler::can_change_ping_protocol() const
397 {
398     // TODO can_change_ping_protocol() and get_next_ping_protocol() may be
399     // implemented in a Algorithm class that can be exchanged in this class to
400     // provide an algorithm neutral class
401     return true;
402 }
403
404 //------------------------------------------------------------------------------
405 // DNS host name resolution
406 //------------------------------------------------------------------------------
407
408 // show "!" after host name if running on outdated IPs
409 void PingScheduler::update_log_prefix()
410 {
411     std::stringstream temp;
412     temp << "Sched(" << DestinationAddress;
413     if (ContinueOnOutdatedIps)
414         temp << "!";
415     temp << "): ";
416     LogPrefix = temp.str();
417 }
418
419 void PingScheduler::update_dns_resolver( PingProtocol current_protocol )
420 {
421     if (Resolver && Resolver->is_resolving())
422         cancel_resolve(false);
423
424     if (ContinueOnOutdatedIps)
425     {
426         ContinueOnOutdatedIps = false;
427         update_log_prefix();
428     }
429
430     // DNS master caches created resolvers and resolved IPs, so this will
431     // probably just return an existing resolver with already resolved IPs for
432     // requested protocol ( ICMP/TCP is ignored, only IPv4/v6 is important)
433     Resolver = DnsMaster::get_instance()->get_resolver_for(DestinationAddress,
434                                                            current_protocol);
435
436     // get number of up-to-date IPs
437     // TODO should check here, if they will be up to date in PingIntervalInSec
438     bool check_up_to_date = true;
439     int ip_count = Resolver->get_resolved_ip_count(check_up_to_date);
440     if (ip_count > 0)
441     {
442         GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
443             << ip_count << " (IPs may be outdated=" << !check_up_to_date << ")";
444         HostAnalyzer.set_resolved_ip_count( ip_count );
445
446         if (Resolver->is_resolving())
447             GlobalLogger.warning() << LogPrefix << "have up to date IPs but "
448                 << "resolver seems to be resolving all the same... "
449                 << "Start pinging anyway!";
450         ping_when_ready();
451     }
452     else
453     {
454         GlobalLogger.info() << LogPrefix
455                             << "No up-to-date IPs --> start resolve";
456         start_resolving_ping_address();
457         // set resolved_ip_count will be called in resolve callback
458     }
459 }
460
461 void PingScheduler::start_resolving_ping_address()
462 {
463     Resolver->async_resolve( boost::bind(&PingScheduler::dns_resolve_callback,
464                                           this, _1, _2) );
465 }
466
467 void PingScheduler::dns_resolve_callback(const bool was_success,
468                                          const int recursion_count)
469 {
470     GlobalLogger.info() << LogPrefix << "dns resolution finished "
471                         << "with success = " << was_success << " "
472                         << "after " << recursion_count << " recursions";
473
474     if ( was_success )
475     {
476         // trust that a successfull DNS resolve means we have an IP with TTL>0
477         int ip_count = Resolver->get_resolved_ip_count(!ContinueOnOutdatedIps);
478         if (ip_count == 0)
479         {   // this will create trouble in HostAnalyzer
480             GlobalLogger.warning() << LogPrefix
481                 << "Should not have reached this case: resolve was "
482                 << "successfull but still have no IPs (up-to-date="
483                 << !ContinueOnOutdatedIps << ")!";
484             if (DnsMaster::get_instance()->get_resolved_ip_ttl_threshold() > 0)
485                 GlobalLogger.warning() << LogPrefix << "This probably happened "
486                     << "because you specified a TTL threshold > 0 but resolving"
487                     << " had no effect on TTLs since external cache is only "
488                     << "updated when TTL=0 is reached.";
489         }
490         else
491         {
492             GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
493                 << ip_count << " (IPs may be outdated="
494                 << ContinueOnOutdatedIps << ") --> could ping now";
495             HostAnalyzer.set_resolved_ip_count( ip_count );
496         }
497         ping_when_ready();
498     }
499     else
500     {   // host name resolution failed; try again bypassing first outdated CNAME
501         // or using cached IP
502         std::string skip_host = Resolver->get_skip_cname();
503
504         if (skip_host.empty())
505         {   // try to continue with cached IPs
506             int ip_count = Resolver->get_resolved_ip_count(false);
507
508             if (ip_count == 0)
509                 GlobalLogger.notice() << LogPrefix << "DNS failed "
510                     << "and have no cached IPs either --> cannot ping";
511                 // ping_when_ready will deal with this case
512             else
513             {
514                 ContinueOnOutdatedIps = true;
515                 update_log_prefix();
516
517                 GlobalLogger.notice() << LogPrefix << "DNS failed, "
518                     << "try anyway with cached data";
519             }
520
521             GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
522                 << ip_count << " (IPs may be outdated=" << true << ")";
523             HostAnalyzer.set_resolved_ip_count( ip_count );
524
525             ping_when_ready();
526         }
527         else
528         {   // have CNAME to continue
529             ContinueOnOutdatedIps = true;
530             update_log_prefix();
531             GlobalLogger.notice() << LogPrefix << "DNS failed, "
532                 << "try again skipping a CNAME and resolving "
533                 << skip_host << " directly";
534
535             cancel_resolve(false);
536
537             // now create new resolver
538             Resolver = DnsMaster::get_instance()
539                                    ->get_resolver_for(skip_host, *ProtocolIter);
540             start_resolving_ping_address();
541         }
542     }
543 }
544
545 /**
546  * cancel resolver if force_cancel or if it is not resolving DestinationAddress
547  *
548  * Resolvers have a life on their own: they are cached by DnsMaster so never go
549  *   out of scope and even after calling callbacks, there might still be a
550  *   longterm timer active to re-try resolving.
551  * We want to cancel that long-term timer only if the Resolver is not for our
552  *   real, original DestinationAddress but a CNAME, which can happen when trying
553  *   to skip cnames and working on out-dated IPs
554  */
555 void PingScheduler::cancel_resolve(const bool force_cancel)
556 {
557     if (force_cancel)
558     {
559         GlobalLogger.info() << "Cancelling resolver (forced)";
560         Resolver->cancel_resolve();
561     }
562     else if ( Resolver->get_hostname() == DestinationAddress )
563         GlobalLogger.info() << LogPrefix
564                             << "Leave original resolver active in background";
565     else
566     {
567         GlobalLogger.info() << LogPrefix << "Cancel resolver for "
568             << Resolver->get_hostname() << " since is not the original "
569             << DestinationAddress;
570         Resolver->cancel_resolve();
571     }
572 }
573