464373a36b4627efe575cabfada8d55996a6a52f
[pingcheck] / src / host / pingscheduler.cpp
1 /*
2 The software in this package is distributed under the GNU General
3 Public License version 2 (with a special exception described below).
4
5 A copy of GNU General Public License (GPL) is included in this distribution,
6 in the file COPYING.GPL.
7
8 As a special exception, if other files instantiate templates or use macros
9 or inline functions from this file, or you compile this file and link it
10 with other works to produce a work based on this file, this file
11 does not by itself cause the resulting work to be covered
12 by the GNU General Public License.
13
14 However the source code for this file must still be made available
15 in accordance with section (3) of the GNU General Public License.
16
17 This exception does not invalidate any other reasons why a work based
18 on this file might be covered by the GNU General Public License.
19 */
20 #include "host/pingscheduler.h"
21
22 #include <iostream>
23 #include <limits>
24
25 #include <boost/bind.hpp>
26 #include <boost/foreach.hpp>
27
28 #include <logfunc.hpp>
29
30 #include "boost_assert_handler.h"
31 #include "host/pingerfactory.h"
32 #include "dns/dnsmaster.h"
33 #include "icmp/icmppinger.h"
34 #include "link/linkstatus.h"
35
36 using namespace std;
37 using boost::asio::io_service;
38 using boost::bind;
39 using boost::date_time::time_resolution_traits_adapted64_impl;
40 using boost::posix_time::microsec_clock;
41 using boost::posix_time::ptime;
42 using boost::posix_time::seconds;
43 using boost::posix_time::milliseconds;
44 using boost::shared_ptr;
45 using I2n::Logger::GlobalLogger;
46
47 //-----------------------------------------------------------------------------
48 // PingScheduler
49 //-----------------------------------------------------------------------------
50
51 /**
52  * @brief Parameterized constructor.
53  *
54  * @param io_serv The one @c io_serv object that controls async processing
55  * @param network_interface The name of the network interface sending the pings.
56  * @param destination_address The remote address to ping.
57  * @param destination_port The remote port to ping.
58  * @param ping_protocol_list A list of protocols to use.
59  * @param ping_interval_in_sec Amount of time between each ping.
60  * @param ping_fail_percentage_limit Maximum amount of pings that can fail.
61  * @param ping_congestion_percentage_limit Amount of pings indication congested
62  * line
63  * @param ping_congestion_duration_thresh Duration in seconds that indicates a
64  * congested line
65  * @param ping_reply_timeout Max amount time to wait for ping to finish
66  * @param link_analyzer The object to monitor the link status.
67  * @param first_delay Delay in seconds from start_pinging to first ping attempt
68  * @param n_parallel_pings: Number of pingers to ping the same IP in parallel
69  */
70 PingScheduler::PingScheduler(
71         const IoServiceItem io_serv,
72         const string &network_interface,
73         const string &destination_address,
74         const uint16_t destination_port,
75         const PingProtocolList &ping_protocol_list,
76         const long ping_interval_in_sec,
77         const int ping_fail_percentage_limit,
78         const int ping_congestion_percentage_limit,
79         const int ping_congestion_duration_thresh,
80         const int ping_reply_timeout,
81         LinkStatusItem link_analyzer,
82         const int first_delay,
83         const int n_parallel_pings,
84         const int parallel_ping_delay
85 ) :
86     IoService( io_serv ),
87     NetworkInterfaceName( network_interface ),
88     DestinationAddress( destination_address ),
89     DestinationPort( destination_port ),
90     Protocols( ping_protocol_list ),
91     ProtocolIter(),
92     PingIntervalInSec( ping_interval_in_sec ),
93     FirstDelay( first_delay ),
94     NextPingTimer( *io_serv ),
95     TimeSentLastPing( microsec_clock::universal_time() ),
96     PingReplyTimeout( ping_reply_timeout ),
97     HostAnalyzer( destination_address, ping_fail_percentage_limit,
98                   ping_congestion_percentage_limit,
99                   ping_congestion_duration_thresh, n_parallel_pings,
100                   link_analyzer ),
101     Resolver(),
102     Pingers(),
103     NPingers( n_parallel_pings ),
104     NPingersDone( 0 ),
105     ParallelPingDelay( parallel_ping_delay ),
106     DelayedPingTimer( *io_serv ),
107     WantToPing( false ),
108     LogPrefix(),
109     ContinueOnOutdatedIps( false )
110 {
111     BOOST_ASSERT( !network_interface.empty() );
112     BOOST_ASSERT( !destination_address.empty() );
113     BOOST_ASSERT( ( 0 < destination_port ) &&
114                   ( destination_port < numeric_limits<uint16_t>::max() ) );
115     BOOST_ASSERT( 0 < ping_interval_in_sec );
116     BOOST_ASSERT( (0 <= ping_fail_percentage_limit) &&
117                   ( ping_fail_percentage_limit <= 100) );
118
119     update_log_prefix();
120
121     init_ping_protocol();
122 }
123
124 /**
125  * @brief Destructor.
126  */
127 PingScheduler::~PingScheduler()
128 {
129 }
130
131 void PingScheduler::stop_pinging()
132 {
133     // stop pinger and resolver
134     GlobalLogger.debug() << LogPrefix << "scheduler: stop pinging";
135     clear_pingers();
136     cancel_resolve(true);
137
138     // now cancel the own timer in case that pinger cancelation called callback
139     GlobalLogger.debug() << LogPrefix << "scheduler: cancel timer";
140     NextPingTimer.cancel();
141 }
142
143 /**
144  * @brief stop all pingers and remove them from Pingers variable which will
145  *   proboably cause their destruction
146  *   
147  * Pingers is empty afterwards
148  */
149 void PingScheduler::clear_pingers()
150 {
151     PingerItem pinger;
152     while(!Pingers.empty())
153     {
154         pinger = Pingers.back();
155         pinger->stop_pinging();
156         Pingers.pop_back();
157     }
158 }
159
160 /**
161  * @brief Start into infinite loop of calls to ping
162  *
163  * Does not start yet but set NextPingTimer (possibly to 0), so action starts
164  *   when io_service is started
165  */
166 void PingScheduler::start_pinging()
167 {
168     if ( FirstDelay > 0 )
169         GlobalLogger.info() << LogPrefix << "Delaying first ping by "
170                                          << FirstDelay << "s";
171     else
172         GlobalLogger.info() << LogPrefix << "Schedule ping as soon as possible";
173
174     (void) NextPingTimer.expires_from_now( seconds( FirstDelay ) );
175     NextPingTimer.async_wait( bind( &PingScheduler::ping, this,
176                                           boost::asio::placeholders::error ) );
177 }
178
179
180 /**
181  * @brief call Ping::ping and schedule a call to ping_done_handler when finished
182  */
183 void PingScheduler::ping(const boost::system::error_code &error)
184 {
185     if ( error )
186     {   // get here, e.g. by NextPingTimer.cancel in stop_pinging
187         if ( error ==  boost::asio::error::operation_aborted )
188             GlobalLogger.error() << LogPrefix << "Timer for ping was cancelled!"
189                                  << " --> Stopping";
190         else
191             GlobalLogger.error() << LogPrefix << "Received error " << error
192                                  << " waiting for ping! Stopping";
193         return;
194     }
195
196     // ping as soon as dns is ready
197     WantToPing = true;
198     ping_when_ready();
199 }
200
201
202 void PingScheduler::ping_when_ready()
203 {
204     if ( !WantToPing )
205     {
206         GlobalLogger.info() << LogPrefix << "waiting for ping request "
207             << "(should take no more than " << PingIntervalInSec << "s)";
208         return;
209     }
210     else if ( Resolver && Resolver->is_resolving() )
211     {
212         GlobalLogger.info() << LogPrefix << "waiting for DNS to finish";
213         return;
214     }
215     else if ( !Resolver )
216         // should not happen, but check anyway
217         GlobalLogger.warning() << LogPrefix << "Have no resolver!";
218
219     GlobalLogger.info() << LogPrefix << "start ping";
220     WantToPing = false;
221
222     // try to get an up-to-date IP (ContinueOnOutdatedIps may only be set
223     //   because a CNAME was out of date -- IPs may still be current)
224     HostAddress ip = Resolver->get_next_ip();
225
226     if ( !ip.is_valid() )
227     {   // this can happen in 2 cases: if ContinueOnOutdatedIps==true
228         // or when ip went out of date between resolve and now
229         // --> try to use outdated IP
230         GlobalLogger.info() << LogPrefix << "Checking for outdated IPs";
231         bool check_up_to_date = false;
232         ip = Resolver->get_next_ip(check_up_to_date);
233     }
234     if ( !ip.is_valid() )
235     {   // Do not even have an outdated IP!
236         // This happens if have no cached IPs and resolve failed
237         GlobalLogger.info() << LogPrefix << "Not even outdated IP to ping "
238             << "-- treat like a failed ping.";
239
240         // skip the ping and directly call ping_done_handler
241         HostAnalyzer.set_resolved_ip_count(1);   // must have been 0 --> failed
242              // ping would create failed assumption (nPings > nIPs)
243         ping_done_handler(PingStatus_FailureNoIP, 0);
244         HostAnalyzer.set_resolved_ip_count(0);   // set back
245     }
246     else
247     {
248         boost::asio::ip::address actual_ip = ip.get_ip();
249         GlobalLogger.info() << LogPrefix << "pinging IP " << actual_ip
250             << " with TTL " << ip.get_ttl().get_updated_value() << "s";
251         delayed_ping(boost::system::error_code(), actual_ip, 0);
252         NPingersDone = 0;
253         TimeSentLastPing = microsec_clock::universal_time();
254     }
255 }
256
257 void PingScheduler::delayed_ping( const boost::system::error_code &error,
258                                   const boost::asio::ip::address &ip,
259                                   const int pinger_index )
260 {
261     if (error)
262     {
263         GlobalLogger.info() << LogPrefix << "delayed ping received an error: "
264                                          << error;
265         return;
266     }
267     if (pinger_index == NPingers)
268     {
269         GlobalLogger.debug() << LogPrefix << "started all delayed pings";
270         return;
271     }
272
273     GlobalLogger.debug() << LogPrefix << "starting delayed ping index "
274                           << pinger_index;
275     Pingers[pinger_index]->ping(ip,
276                                 DestinationPort,
277                                 boost::bind(&PingScheduler::ping_done_handler,
278                                                                 this, _1, _2) );
279     DelayedPingTimer.expires_from_now( milliseconds(ParallelPingDelay) );
280     DelayedPingTimer.async_wait( bind( &PingScheduler::delayed_ping,
281                   this, boost::asio::placeholders::error, ip, pinger_index+1) );
282 }
283
284
285 //------------------------------------------------------------------------------
286 // Post Processing of Ping result and Preparation for next ping
287 //------------------------------------------------------------------------------
288
289 /**
290  * @brief called when Ping::ping is done; calls functions to update
291  *   statistics, ping interval and elapsed time;
292  *   schedules a call to ping, thereby closing the loop
293  */
294 void PingScheduler::ping_done_handler( const PingStatus &result,
295                                        const long ping_duration_us )
296 {
297     PingStatus edited_result = result;
298     if (result == PingStatus_SuccessReply && ContinueOnOutdatedIps)
299     {
300         edited_result = PingStatus_SuccessOutdatedIP;
301
302         // reset ContinueOnOutdatedIps
303         ContinueOnOutdatedIps = false;
304         update_log_prefix();
305     }
306
307     ++NPingersDone;
308     GlobalLogger.info() << LogPrefix << "Ping " << NPingersDone << " of "
309         << NPingers << " done with result " << to_string(edited_result);
310
311     // post-processing
312     // can call update_ping_interval only after update_ping_statistics!
313     HostAnalyzer.update_ping_statistics( edited_result, ping_duration_us );
314
315     // prepare next ping only after all pingers are done
316     if (NPingersDone == NPingers)
317         prepare_next_ping();
318 }
319
320
321 void PingScheduler::prepare_next_ping()
322 {
323     update_ping_interval();
324
325     // get next protocol, possibly start resolving IPs
326     update_ping_protocol();
327
328     // schedule next ping
329     int seconds_since_last_ping = (microsec_clock::universal_time()
330                                             - TimeSentLastPing).total_seconds();
331     if ( seconds_since_last_ping > PingIntervalInSec )
332     {
333         GlobalLogger.info() << "We are late for next ping!";
334         seconds_since_last_ping = PingIntervalInSec;
335         (void) NextPingTimer.expires_from_now( seconds(0) );
336     }
337     else
338         (void) NextPingTimer.expires_from_now( seconds( PingIntervalInSec
339                                                   - seconds_since_last_ping ) );
340     NextPingTimer.async_wait( bind( &PingScheduler::ping, this,
341                                          boost::asio::placeholders::error ) );
342 }
343
344 void PingScheduler::update_ping_interval()
345 {
346     // have to ping more often?
347     if ( HostAnalyzer.exceeded_ping_failed_limit() )
348     {
349         PingIntervalInSec.speed_up();
350
351         GlobalLogger.debug() << LogPrefix << "- Speeding up ping interval to: "
352                              << PingIntervalInSec << "s";
353     }
354     else
355     {
356         PingIntervalInSec.back_to_original();
357
358         GlobalLogger.debug() << LogPrefix << "- Stick to the original ping "
359                              << "interval: " << PingIntervalInSec << "s";
360     }
361 }
362
363 //------------------------------------------------------------------------------
364 // Ping Protocol Rotation
365 //------------------------------------------------------------------------------
366
367 void PingScheduler::init_ping_protocol()
368 {
369     ProtocolIter = Protocols.end();
370     get_next_ping_protocol();
371 }
372
373 void PingScheduler::update_ping_protocol()
374 {
375     if ( can_change_ping_protocol() )
376     {
377         get_next_ping_protocol();
378     }
379 }
380
381 void PingScheduler::get_next_ping_protocol()
382 {
383     // stop and destruct all pingers
384     clear_pingers();
385     GlobalLogger.debug() << LogPrefix
386         << "------------------------------------------------------------------";
387
388     // get next protocol
389     ++ProtocolIter;
390     if (ProtocolIter == Protocols.end())
391         ProtocolIter = Protocols.begin();
392     PingProtocol ping_protocol = *ProtocolIter;
393     // --> ProtocolIter still points to currently used protocol which is
394     //     required in dns_resolve_callback
395
396     // create new pingers
397     for (int count=0; count<NPingers; ++count)
398         Pingers.push_back( PingerFactory::createPinger(ping_protocol, IoService,
399                                       NetworkInterfaceName, PingReplyTimeout) );
400
401     update_dns_resolver( ping_protocol );
402 }
403
404 bool PingScheduler::can_change_ping_protocol() const
405 {
406     // TODO can_change_ping_protocol() and get_next_ping_protocol() may be
407     // implemented in a Algorithm class that can be exchanged in this class to
408     // provide an algorithm neutral class
409     return true;
410 }
411
412 //------------------------------------------------------------------------------
413 // DNS host name resolution
414 //------------------------------------------------------------------------------
415
416 // show "!" after host name if running on outdated IPs
417 void PingScheduler::update_log_prefix()
418 {
419     std::stringstream temp;
420     temp << "Sched(" << DestinationAddress;
421     if (ContinueOnOutdatedIps)
422         temp << "!";
423     temp << "): ";
424     LogPrefix = temp.str();
425 }
426
427 void PingScheduler::update_dns_resolver( PingProtocol current_protocol )
428 {
429     if (Resolver && Resolver->is_resolving())
430         cancel_resolve(false);
431
432     if (ContinueOnOutdatedIps)
433     {
434         ContinueOnOutdatedIps = false;
435         update_log_prefix();
436     }
437
438     // DNS master caches created resolvers and resolved IPs, so this will
439     // probably just return an existing resolver with already resolved IPs for
440     // requested protocol ( ICMP/TCP is ignored, only IPv4/v6 is important)
441     Resolver = DnsMaster::get_instance()->get_resolver_for(DestinationAddress,
442                                                            current_protocol);
443
444     // get number of up-to-date IPs
445     // TODO should check here, if they will be up to date in PingIntervalInSec
446     bool check_up_to_date = true;
447     int ip_count = Resolver->get_resolved_ip_count(check_up_to_date);
448     if (ip_count > 0)
449     {
450         GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
451             << ip_count << " (IPs may be outdated=" << !check_up_to_date << ")";
452         HostAnalyzer.set_resolved_ip_count( ip_count );
453
454         if (Resolver->is_resolving())
455             GlobalLogger.warning() << LogPrefix << "have up to date IPs but "
456                 << "resolver seems to be resolving all the same... "
457                 << "Start pinging anyway!";
458         ping_when_ready();
459     }
460     else
461     {
462         GlobalLogger.info() << LogPrefix
463                             << "No up-to-date IPs --> start resolve";
464         start_resolving_ping_address();
465         // set resolved_ip_count will be called in resolve callback
466     }
467 }
468
469 void PingScheduler::start_resolving_ping_address()
470 {
471     Resolver->async_resolve( boost::bind(&PingScheduler::dns_resolve_callback,
472                                           this, _1, _2) );
473 }
474
475 void PingScheduler::dns_resolve_callback(const bool was_success,
476                                          const int recursion_count)
477 {
478     GlobalLogger.info() << LogPrefix << "dns resolution finished "
479                         << "with success = " << was_success << " "
480                         << "after " << recursion_count << " recursions";
481
482     if ( was_success )
483     {
484         // trust that a successfull DNS resolve means we have an IP with TTL>0
485         int ip_count = Resolver->get_resolved_ip_count(!ContinueOnOutdatedIps);
486         if (ip_count == 0)
487         {   // this will create trouble in HostAnalyzer
488             GlobalLogger.warning() << LogPrefix
489                 << "Should not have reached this case: resolve was "
490                 << "successfull but still have no IPs (up-to-date="
491                 << !ContinueOnOutdatedIps << ")!";
492             if (DnsMaster::get_instance()->get_resolved_ip_ttl_threshold() > 0)
493                 GlobalLogger.warning() << LogPrefix << "This probably happened "
494                     << "because you specified a TTL threshold > 0 but resolving"
495                     << " had no effect on TTLs since external cache is only "
496                     << "updated when TTL=0 is reached.";
497         }
498         else
499         {
500             GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
501                 << ip_count << " (IPs may be outdated="
502                 << ContinueOnOutdatedIps << ") --> could ping now";
503             HostAnalyzer.set_resolved_ip_count( ip_count );
504         }
505         ping_when_ready();
506     }
507     else
508     {   // host name resolution failed; try again bypassing first outdated CNAME
509         // or using cached IP
510         std::string skip_host = Resolver->get_skip_cname();
511
512         if (skip_host.empty())
513         {   // try to continue with cached IPs
514             int ip_count = Resolver->get_resolved_ip_count(false);
515
516             if (ip_count == 0)
517                 GlobalLogger.notice() << LogPrefix << "DNS failed "
518                     << "and have no cached IPs either --> cannot ping";
519                 // ping_when_ready will deal with this case
520             else
521             {
522                 ContinueOnOutdatedIps = true;
523                 update_log_prefix();
524
525                 GlobalLogger.notice() << LogPrefix << "DNS failed, "
526                     << "try anyway with cached data";
527             }
528
529             GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
530                 << ip_count << " (IPs may be outdated=" << true << ")";
531             HostAnalyzer.set_resolved_ip_count( ip_count );
532
533             ping_when_ready();
534         }
535         else
536         {   // have CNAME to continue
537             ContinueOnOutdatedIps = true;
538             update_log_prefix();
539             GlobalLogger.notice() << LogPrefix << "DNS failed, "
540                 << "try again skipping a CNAME and resolving "
541                 << skip_host << " directly";
542
543             cancel_resolve(false);
544
545             // now create new resolver
546             Resolver = DnsMaster::get_instance()
547                                    ->get_resolver_for(skip_host, *ProtocolIter);
548             start_resolving_ping_address();
549         }
550     }
551 }
552
553 /**
554  * cancel resolver if force_cancel or if it is not resolving DestinationAddress
555  *
556  * Resolvers have a life on their own: they are cached by DnsMaster so never go
557  *   out of scope and even after calling callbacks, there might still be a
558  *   longterm timer active to re-try resolving.
559  * We want to cancel that long-term timer only if the Resolver is not for our
560  *   real, original DestinationAddress but a CNAME, which can happen when trying
561  *   to skip cnames and working on out-dated IPs
562  */
563 void PingScheduler::cancel_resolve(const bool force_cancel)
564 {
565     if (force_cancel)
566     {
567         GlobalLogger.info() << "Cancelling resolver (forced)";
568         Resolver->cancel_resolve();
569     }
570     else if ( Resolver->get_hostname() == DestinationAddress )
571         GlobalLogger.info() << LogPrefix
572                             << "Leave original resolver active in background";
573     else
574     {
575         GlobalLogger.info() << LogPrefix << "Cancel resolver for "
576             << Resolver->get_hostname() << " since is not the original "
577             << DestinationAddress;
578         Resolver->cancel_resolve();
579     }
580 }
581