d259f2e42e44e3a47cdabd5252491bd5698dd413
[pingcheck] / src / host / pingscheduler.cpp
1 /*
2 The software in this package is distributed under the GNU General
3 Public License version 2 (with a special exception described below).
4
5 A copy of GNU General Public License (GPL) is included in this distribution,
6 in the file COPYING.GPL.
7
8 As a special exception, if other files instantiate templates or use macros
9 or inline functions from this file, or you compile this file and link it
10 with other works to produce a work based on this file, this file
11 does not by itself cause the resulting work to be covered
12 by the GNU General Public License.
13
14 However the source code for this file must still be made available
15 in accordance with section (3) of the GNU General Public License.
16
17 This exception does not invalidate any other reasons why a work based
18 on this file might be covered by the GNU General Public License.
19 */
20 #include "host/pingscheduler.h"
21
22 #include <iostream>
23 #include <limits>
24
25 #include <boost/bind.hpp>
26 #include <boost/foreach.hpp>
27
28 #include <logfunc.hpp>
29
30 #include "boost_assert_handler.h"
31 #include "host/pingerfactory.h"
32 #include "dns/dnsmaster.h"
33 #include "icmp/icmppinger.h"
34 #include "link/linkstatus.h"
35
36 using namespace std;
37 using boost::asio::io_service;
38 using boost::bind;
39 using boost::date_time::time_resolution_traits_adapted64_impl;
40 using boost::posix_time::microsec_clock;
41 using boost::posix_time::ptime;
42 using boost::posix_time::seconds;
43 using boost::posix_time::milliseconds;
44 using boost::shared_ptr;
45 using I2n::Logger::GlobalLogger;
46
47
48 //-----------------------------------------------------------------------------
49 // PingScheduler
50 //-----------------------------------------------------------------------------
51
52 /**
53  * @brief Parameterized constructor.
54  *
55  * @param io_serv The one @c io_serv object that controls async processing
56  * @param network_interface The name of the network interface sending the pings.
57  * @param destination_address The remote address to ping.
58  * @param destination_port The remote port to ping.
59  * @param ping_protocol_list A list of protocols to use.
60  * @param ping_interval_in_sec Amount of time between each ping.
61  * @param ping_fail_percentage_limit Maximum amount of pings that can fail.
62  * @param ping_congestion_percentage_limit Amount of pings indication congested
63  * line
64  * @param ping_congestion_duration_thresh Duration in seconds that indicates a
65  * congested line
66  * @param ping_reply_timeout Max amount time to wait for ping to finish
67  * @param link_analyzer The object to monitor the link status.
68  * @param first_delay Delay in seconds from start_pinging to first ping attempt
69  * @param n_parallel_pings: Number of pingers to ping the same IP in parallel
70  */
71 PingScheduler::PingScheduler(
72         const IoServiceItem io_serv,
73         const string &network_interface,
74         const string &destination_address,
75         const uint16_t destination_port,
76         const PingProtocolList &ping_protocol_list,
77         const long ping_interval_in_sec,
78         const int ping_fail_percentage_limit,
79         const int ping_congestion_percentage_limit,
80         const int congest_caused_by_fail_percentage_limit,
81         const int ping_congestion_duration_thresh,
82         const int ping_reply_timeout,
83         LinkStatusItem link_analyzer,
84         const int first_delay,
85         const int n_parallel_pings,
86         const int parallel_ping_delay,
87         const int ping_timeout_factor
88 ) :
89     IoService( io_serv ),
90     NetworkInterfaceName( network_interface ),
91     DestinationAddress( destination_address ),
92     DestinationPort( destination_port ),
93     Protocols( ping_protocol_list ),
94     ProtocolIter(),
95     PingIntervalInSec( ping_interval_in_sec ),
96     NPingers( n_parallel_pings ),
97     FirstDelay( first_delay ),
98     NextPingTimer( *io_serv ),
99     TimeSentLastPing( microsec_clock::universal_time() ),
100     PingReplyTimeout( ping_reply_timeout ),
101     PingReplyTimeoutOrig( ping_reply_timeout ),
102     HostAnalyzer( destination_address, ping_fail_percentage_limit,
103                   ping_congestion_percentage_limit,
104                   congest_caused_by_fail_percentage_limit,
105                   ping_congestion_duration_thresh, n_parallel_pings,
106                   link_analyzer ),
107     Resolver(),
108     Pingers(),
109     NPingersDone( 0 ),
110     ParallelPingDelay( parallel_ping_delay ),
111     DelayedPingTimer( *io_serv ),
112     WantToPing( false ),
113     LogPrefix(),
114     ContinueOnOutdatedIps( false ),
115     PingTimeoutFactor( ping_timeout_factor )
116 {
117     BOOST_ASSERT( !network_interface.empty() );
118     BOOST_ASSERT( !destination_address.empty() );
119     BOOST_ASSERT( ( 0 < destination_port ) &&
120                   ( destination_port < numeric_limits<uint16_t>::max() ) );
121     BOOST_ASSERT( 0 < ping_interval_in_sec );
122     BOOST_ASSERT( (0 <= ping_fail_percentage_limit) &&
123                   ( ping_fail_percentage_limit <= 100) );
124
125     update_log_prefix();
126
127     init_ping_protocol();
128
129     // start resolving already so we are prepared to ping
130     update_dns_resolver();
131
132 }
133
134 /**
135  * @brief Destructor.
136  */
137 PingScheduler::~PingScheduler()
138 {
139 }
140
141 void PingScheduler::stop_pinging()
142 {
143     // stop pinger and resolver
144     GlobalLogger.debug() << LogPrefix << "scheduler: stop pinging";
145     clear_pingers();
146     cancel_resolve(true);
147
148     // now cancel the own timer in case that pinger cancelation called callback
149     GlobalLogger.debug() << LogPrefix << "scheduler: cancel timer";
150     NextPingTimer.cancel();
151 }
152
153 /**
154  * @brief stop all pingers and remove them from Pingers variable which will
155  *   proboably cause their destruction
156  *
157  * Pingers is empty afterwards
158  */
159 void PingScheduler::clear_pingers()
160 {
161     PingerItem pinger;
162     while(!Pingers.empty())
163     {
164         pinger = Pingers.back();
165         pinger->stop_pinging();
166         Pingers.pop_back();
167     }
168 }
169
170 /**
171  * @brief Start into infinite loop of calls to ping
172  *
173  * Does not start yet but set NextPingTimer (possibly to 0), so action starts
174  *   when io_service is started
175  */
176 void PingScheduler::start_pinging()
177 {
178     if ( FirstDelay > 0 )
179         GlobalLogger.info() << LogPrefix << "Delaying first ping by "
180                                          << FirstDelay << "s";
181     else
182         GlobalLogger.info() << LogPrefix << "Schedule ping as soon as possible";
183
184     (void) NextPingTimer.expires_from_now( seconds( FirstDelay ) );
185     NextPingTimer.async_wait( bind( &PingScheduler::ping, this,
186                                           boost::asio::placeholders::error ) );
187 }
188
189
190 /**
191  * @brief call Ping::ping and schedule a call to ping_done_handler when finished
192  */
193 void PingScheduler::ping(const boost::system::error_code &error)
194 {
195     if ( error )
196     {   // get here, e.g. by NextPingTimer.cancel in stop_pinging
197         if ( error ==  boost::asio::error::operation_aborted )
198             GlobalLogger.error() << LogPrefix << "Timer for ping was cancelled!"
199                                  << " --> Stopping";
200         else
201             GlobalLogger.error() << LogPrefix << "Received error " << error
202                                  << " waiting for ping! Stopping";
203         return;
204     }
205
206     // ping as soon as dns is ready
207     WantToPing = true;
208     ping_when_ready();
209 }
210
211
212 void PingScheduler::ping_when_ready()
213 {
214     if ( !WantToPing )
215     {
216         GlobalLogger.info() << LogPrefix << "waiting for ping request "
217             << "(should take no more than " << PingIntervalInSec << "s)";
218         return;
219     }
220     else if ( Resolver && Resolver->is_resolving() )
221     {
222         GlobalLogger.info() << LogPrefix << "waiting for DNS to finish";
223         return;
224     }
225     else if ( !Resolver )
226     {
227         // should not happen, but check anyway
228         GlobalLogger.warning() << LogPrefix << "Have no resolver!";
229         return;
230     }
231
232     GlobalLogger.info() << LogPrefix << "start ping";
233     WantToPing = false;
234
235     // try to get an up-to-date IP (ContinueOnOutdatedIps may only be set
236     //   because a CNAME was out of date -- IPs may still be current)
237     HostAddress ip = Resolver->get_next_ip();
238
239     if ( !ip.is_valid() )
240     {   // this can happen in 2 cases: if ContinueOnOutdatedIps==true
241         // or when ip went out of date between resolve and now
242         // --> try to use outdated IP
243         GlobalLogger.info() << LogPrefix << "Checking for outdated IPs";
244         bool check_up_to_date = false;
245         ip = Resolver->get_next_ip(check_up_to_date);
246     }
247     if ( !ip.is_valid() )
248     {   // Do not even have an outdated IP!
249         // This happens if have no cached IPs and resolve failed
250         GlobalLogger.info() << LogPrefix << "Not even outdated IP to ping "
251             << "-- treat like a failed ping.";
252
253         // skip the ping and directly call ping_done_handler the appropriate
254         // number of times
255         for (int count=0; count<NPingers; ++count)
256             ping_done_handler(PingStatus_FailureNoIP, 0);
257     }
258     else
259     {
260         // create new pingers
261         for (int count=0; count<NPingers; ++count)
262             Pingers.push_back(
263                     PingerFactory::createPinger(*ProtocolIter, IoService,
264                                                 NetworkInterfaceName,
265                                                 PingReplyTimeout) );
266
267         // remember when pinging started
268         TimeSentLastPing = microsec_clock::universal_time();
269
270         // get actual IP
271         boost::asio::ip::address actual_ip = ip.get_ip();
272         GlobalLogger.info() << LogPrefix << "pinging IP " << actual_ip
273             << " with TTL " << ip.get_ttl().get_updated_value() << "s";
274         delayed_ping(boost::system::error_code(), actual_ip, 0);
275     }
276 }
277
278 void PingScheduler::delayed_ping( const boost::system::error_code &error,
279                                   const boost::asio::ip::address ip,
280                                   const int pinger_index )
281 {
282     if (error)
283     {
284         GlobalLogger.info() << LogPrefix << "delayed ping received an error: "
285                                          << error;
286         return;
287     }
288
289     GlobalLogger.debug() << LogPrefix << "starting delayed ping index "
290                           << pinger_index;
291     Pingers[pinger_index]->ping(ip,
292                                 DestinationPort,
293                                 boost::bind(&PingScheduler::ping_done_handler,
294                                                                 this, _1, _2) );
295     if (pinger_index >= NPingers-1)
296         GlobalLogger.debug() << LogPrefix << "started all delayed pings";
297     else
298     {
299         DelayedPingTimer.expires_from_now( milliseconds(ParallelPingDelay) );
300         DelayedPingTimer.async_wait( bind( &PingScheduler::delayed_ping,
301                   this, boost::asio::placeholders::error, ip, pinger_index+1) );
302     }
303 }
304
305
306 //------------------------------------------------------------------------------
307 // Post Processing of Ping result and Preparation for next ping
308 //------------------------------------------------------------------------------
309
310 /**
311  * @brief called when Ping::ping is done; calls functions to update
312  *   statistics, ping interval and elapsed time;
313  *   schedules a call to ping, thereby closing the loop
314  */
315 void PingScheduler::ping_done_handler( const PingStatus &result,
316                                        const long ping_duration_us )
317 {
318     PingStatus edited_result = result;
319     if (result == PingStatus_SuccessReply && ContinueOnOutdatedIps)
320     {
321         edited_result = PingStatus_SuccessOutdatedIP;
322
323         // reset ContinueOnOutdatedIps
324         ContinueOnOutdatedIps = false;
325         update_log_prefix();
326     }
327
328     ++NPingersDone;
329     GlobalLogger.info() << LogPrefix << "Ping " << NPingersDone << " of "
330         << NPingers << " done with result " << to_string(edited_result);
331
332     // post-processing
333     HostAnalyzer.update_ping_statistics( edited_result, ping_duration_us );
334
335     // prepare next ping only after all pingers are done
336     if (NPingersDone == NPingers)
337     {
338         // stop and destruct all pingers
339         clear_pingers();
340
341         GlobalLogger.debug() << LogPrefix
342             << "--------------------------------------------------------------";
343
344         // update variables for next ping: number of pings, delay, protocol
345         // do this only after call to update_ping_statistics!
346         update_ping_protocol();
347         update_ping_interval();
348         update_ping_number();
349
350         prepare_next_ping();
351     }
352 }
353
354
355 void PingScheduler::prepare_next_ping()
356 {
357     // start DNS resolve if necessary
358     update_dns_resolver();
359
360     NPingersDone = 0;
361
362     // schedule next ping
363     int seconds_since_last_ping = (microsec_clock::universal_time()
364                                             - TimeSentLastPing).total_seconds();
365     if ( seconds_since_last_ping > PingIntervalInSec )
366     {
367         GlobalLogger.info() << "We are late for next ping!";
368         seconds_since_last_ping = PingIntervalInSec;
369         (void) NextPingTimer.expires_from_now( seconds(0) );
370     }
371     else
372         (void) NextPingTimer.expires_from_now( seconds( PingIntervalInSec
373                                                   - seconds_since_last_ping ) );
374     NextPingTimer.async_wait( bind( &PingScheduler::ping, this,
375                                          boost::asio::placeholders::error ) );
376 }
377
378 void PingScheduler::update_ping_interval()
379 {
380     // have to ping more often?
381     if ( HostAnalyzer.exceeded_ping_failed_limit() )
382     {
383         PingIntervalInSec.speed_up();
384
385         GlobalLogger.debug() << LogPrefix << "- Speeding up ping interval to: "
386                              << PingIntervalInSec << "s";
387     }
388     else
389     {
390         PingIntervalInSec.back_to_original();
391
392         GlobalLogger.debug() << LogPrefix << "- Stick to the original ping "
393                              << "interval: " << PingIntervalInSec << "s";
394     }
395 }
396
397 /** in case of congested line, increase number of pings
398  *
399  * also increases ping timeout if line is congested
400  *
401  * CAUTION! Only call this after clear_pingers !!!
402  * */
403 void PingScheduler::update_ping_number()
404 {
405     // make sure we do not loose track of pingers here
406     if ( NPingersDone != NPingers  || !Pingers.empty() )
407     {
408         GlobalLogger.warning() << LogPrefix << "Should only change number of "
409             << "pingers when all are finished and deleted! Have " << NPingers
410             << " pingers, " << NPingersDone << " of which are done and "
411             << Pingers.size() << " in listDone! Will not change NPingers.";
412         return;
413     }
414
415     if ( HostAnalyzer.exceeded_ping_congestion_limit() )
416     {
417         NPingers.increase();
418
419         GlobalLogger.notice() << LogPrefix << "No reply from host, "
420             << "switching to burst ping mode with longer timeouts";
421         GlobalLogger.debug() << LogPrefix << "- Increasing ping number to: "
422                              << NPingers;
423
424         PingReplyTimeout = PingReplyTimeoutOrig * PingTimeoutFactor;
425         GlobalLogger.debug() << LogPrefix << "- Increase ping timeout to "
426                              << PingReplyTimeout << "s";
427     }
428     else
429     {
430         NPingers.back_to_original();
431
432         GlobalLogger.debug() << LogPrefix << "- Stick to the original ping "
433                              << "number: " << NPingers;
434
435         PingReplyTimeout = PingReplyTimeoutOrig;
436         GlobalLogger.debug() << LogPrefix << "- Reset ping timeout to "
437                              << PingReplyTimeout << "s";
438     }
439
440     // tell host analyzer so it expects the correct number of ping results
441     HostAnalyzer.set_n_parallel_pings(NPingers);
442 }
443
444 //------------------------------------------------------------------------------
445 // Ping Protocol Rotation
446 //------------------------------------------------------------------------------
447
448 void PingScheduler::init_ping_protocol()
449 {
450     ProtocolIter = Protocols.end();
451     get_next_ping_protocol();
452 }
453
454 void PingScheduler::update_ping_protocol()
455 {
456     if ( can_change_ping_protocol() )
457     {
458         get_next_ping_protocol();
459     }
460 }
461
462 void PingScheduler::get_next_ping_protocol()
463 {
464     ++ProtocolIter;
465     if (ProtocolIter == Protocols.end())
466         ProtocolIter = Protocols.begin();
467 }
468
469 bool PingScheduler::can_change_ping_protocol() const
470 {
471     // TODO can_change_ping_protocol() and get_next_ping_protocol() may be
472     // implemented in a Algorithm class that can be exchanged in this class to
473     // provide an algorithm neutral class
474     return true;
475 }
476
477 //------------------------------------------------------------------------------
478 // DNS host name resolution
479 //------------------------------------------------------------------------------
480
481 // show "!" after host name if running on outdated IPs
482 void PingScheduler::update_log_prefix()
483 {
484     std::stringstream temp;
485     temp << "Sched(" << DestinationAddress;
486     if (ContinueOnOutdatedIps)
487         temp << "!";
488     temp << "): ";
489     LogPrefix = temp.str();
490 }
491
492 void PingScheduler::update_dns_resolver()
493 {
494     if (Resolver && Resolver->is_resolving())
495         cancel_resolve(false);
496
497     if (ContinueOnOutdatedIps)
498     {
499         ContinueOnOutdatedIps = false;
500         update_log_prefix();
501     }
502
503     // DNS master caches created resolvers and resolved IPs, so this will
504     // probably just return an existing resolver with already resolved IPs for
505     // requested protocol ( ICMP/TCP is ignored, only IPv4/v6 is important)
506     Resolver = DnsMaster::get_instance()->get_resolver_for(DestinationAddress,
507                                                            *ProtocolIter);
508
509     // get number of up-to-date IPs
510     // TODO should check here, if they will be up to date in PingIntervalInSec
511     bool check_up_to_date = true;
512     int ip_count = Resolver->get_resolved_ip_count(check_up_to_date);
513     if (ip_count > 0)
514     {
515         GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
516             << ip_count << " (IPs may be outdated=" << !check_up_to_date << ")";
517         HostAnalyzer.set_resolved_ip_count( ip_count );
518
519         if (Resolver->is_resolving())
520             GlobalLogger.warning() << LogPrefix << "have up to date IPs but "
521                 << "resolver seems to be resolving all the same... "
522                 << "Start pinging anyway!";
523         ping_when_ready();
524     }
525     else
526     {
527         GlobalLogger.info() << LogPrefix
528                             << "No up-to-date IPs --> start resolve";
529         start_resolving_ping_address();
530         // set resolved_ip_count will be called in resolve callback
531     }
532 }
533
534 void PingScheduler::start_resolving_ping_address()
535 {
536     Resolver->async_resolve( boost::bind(&PingScheduler::dns_resolve_callback,
537                                           this, _1, _2) );
538 }
539
540 void PingScheduler::dns_resolve_callback(const bool was_success,
541                                          const int recursion_count)
542 {
543     GlobalLogger.info() << LogPrefix << "dns resolution finished "
544                         << "with success = " << was_success << " "
545                         << "after " << recursion_count << " recursions";
546
547     if ( was_success )
548     {
549         // trust that a successfull DNS resolve means we have an IP with TTL>0
550         int ip_count = Resolver->get_resolved_ip_count(!ContinueOnOutdatedIps);
551         if (ip_count == 0)
552         {
553             GlobalLogger.warning() << LogPrefix
554                 << "Should not have reached this case: resolve was "
555                 << "successfull but still have no IPs (up-to-date="
556                 << !ContinueOnOutdatedIps << ")!";
557             if (DnsMaster::get_instance()->get_resolved_ip_ttl_threshold() > 0)
558                 GlobalLogger.warning() << LogPrefix << "This probably happened "
559                     << "because you specified a TTL threshold > 0 but resolving"
560                     << " had no effect on TTLs since external cache is only "
561                     << "updated when TTL=0 is reached.";
562         }
563         else
564         {
565             GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
566                 << ip_count << " (IPs may be outdated="
567                 << ContinueOnOutdatedIps << ") --> could ping now";
568         }
569         HostAnalyzer.set_resolved_ip_count( ip_count );
570         ping_when_ready();
571     }
572     else
573     {   // host name resolution failed; try again bypassing first outdated CNAME
574         // or using cached IP
575         std::string skip_host = Resolver->get_skip_cname();
576
577         if (skip_host.empty())
578         {   // try to continue with cached IPs
579             int ip_count = Resolver->get_resolved_ip_count(false);
580
581             if (ip_count == 0)
582                 GlobalLogger.notice() << LogPrefix << "DNS failed "
583                     << "and have no cached IPs either --> cannot ping";
584                 // ping_when_ready will deal with this case
585             else
586             {
587                 ContinueOnOutdatedIps = true;
588                 update_log_prefix();
589
590                 GlobalLogger.notice() << LogPrefix << "DNS failed, "
591                     << "try anyway with cached data";
592             }
593
594             GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
595                 << ip_count << " (IPs may be outdated=" << true << ")";
596             HostAnalyzer.set_resolved_ip_count( ip_count );
597
598             ping_when_ready();
599         }
600         else
601         {   // have CNAME to continue
602             ContinueOnOutdatedIps = true;
603             update_log_prefix();
604             GlobalLogger.notice() << LogPrefix << "DNS failed, "
605                 << "try again skipping a CNAME and resolving "
606                 << skip_host << " directly";
607
608             cancel_resolve(false);
609
610             // now create new resolver
611             Resolver = DnsMaster::get_instance()
612                                    ->get_resolver_for(skip_host, *ProtocolIter);
613             start_resolving_ping_address();
614         }
615     }
616 }
617
618 /**
619  * cancel resolver if force_cancel or if it is not resolving DestinationAddress
620  *
621  * Resolvers have a life on their own: they are cached by DnsMaster so never go
622  *   out of scope and even after calling callbacks, there might still be a
623  *   longterm timer active to re-try resolving.
624  * We want to cancel that long-term timer only if the Resolver is not for our
625  *   real, original DestinationAddress but a CNAME, which can happen when trying
626  *   to skip cnames and working on out-dated IPs
627  */
628 void PingScheduler::cancel_resolve(const bool force_cancel)
629 {
630     if (force_cancel)
631     {
632         GlobalLogger.info() << "Cancelling resolver (forced)";
633         Resolver->cancel_resolve();
634     }
635     else if ( Resolver->get_hostname() == DestinationAddress )
636         GlobalLogger.info() << LogPrefix
637                             << "Leave original resolver active in background";
638     else
639     {
640         GlobalLogger.info() << LogPrefix << "Cancel resolver for "
641             << Resolver->get_hostname() << " since is not the original "
642             << DestinationAddress;
643         Resolver->cancel_resolve();
644     }
645 }
646