Update pingcheck to work with cmake 3.28
[pingcheck] / src / host / pingscheduler.cpp
... / ...
CommitLineData
1/*
2The software in this package is distributed under the GNU General
3Public License version 2 (with a special exception described below).
4
5A copy of GNU General Public License (GPL) is included in this distribution,
6in the file COPYING.GPL.
7
8As a special exception, if other files instantiate templates or use macros
9or inline functions from this file, or you compile this file and link it
10with other works to produce a work based on this file, this file
11does not by itself cause the resulting work to be covered
12by the GNU General Public License.
13
14However the source code for this file must still be made available
15in accordance with section (3) of the GNU General Public License.
16
17This exception does not invalidate any other reasons why a work based
18on this file might be covered by the GNU General Public License.
19*/
20#include "host/pingscheduler.h"
21
22#include <iostream>
23#include <limits>
24
25#include <boost/bind.hpp>
26#include <boost/foreach.hpp>
27
28#include <logfunc.hpp>
29
30#include "boost_assert_handler.h"
31#include "host/pingerfactory.h"
32#include "dns/dnsmaster.h"
33#include "icmp/icmppinger.h"
34#include "link/linkstatus.h"
35
36using namespace std;
37using boost::asio::io_service;
38using boost::bind;
39using boost::date_time::time_resolution_traits_adapted64_impl;
40using boost::posix_time::microsec_clock;
41using boost::posix_time::ptime;
42using boost::posix_time::seconds;
43using boost::posix_time::milliseconds;
44using boost::shared_ptr;
45using I2n::Logger::GlobalLogger;
46
47
48//-----------------------------------------------------------------------------
49// PingScheduler
50//-----------------------------------------------------------------------------
51
52/**
53 * @brief Parameterized constructor.
54 *
55 * @param io_serv The one @c io_serv object that controls async processing
56 * @param network_interface The name of the network interface sending the pings.
57 * @param destination_address The remote address to ping.
58 * @param destination_port The remote port to ping.
59 * @param ping_protocol_list A list of protocols to use.
60 * @param ping_interval_in_sec Amount of time between each ping.
61 * @param ping_fail_percentage_limit Maximum amount of pings that can fail.
62 * @param ping_congestion_percentage_limit Amount of pings indication congested
63 * line
64 * @param ping_congestion_duration_thresh Duration in seconds that indicates a
65 * congested line
66 * @param ping_reply_timeout Max amount time to wait for ping to finish
67 * @param link_analyzer The object to monitor the link status.
68 * @param first_delay Delay in seconds from start_pinging to first ping attempt
69 * @param n_parallel_pings: Number of pingers to ping the same IP in parallel
70 */
71PingScheduler::PingScheduler(
72 const IoServiceItem io_serv,
73 const string &network_interface,
74 const string &destination_address,
75 const uint16_t destination_port,
76 const PingProtocolList &ping_protocol_list,
77 const long ping_interval_in_sec,
78 const int ping_fail_percentage_limit,
79 const int ping_congestion_percentage_limit,
80 const int congest_caused_by_fail_percentage_limit,
81 const int ping_congestion_duration_thresh,
82 const int ping_reply_timeout,
83 LinkStatusItem link_analyzer,
84 const int first_delay,
85 const int n_parallel_pings,
86 const int parallel_ping_delay,
87 const int ping_timeout_factor
88) :
89 IoService( io_serv ),
90 NetworkInterfaceName( network_interface ),
91 DestinationAddress( destination_address ),
92 DestinationPort( destination_port ),
93 Protocols( ping_protocol_list ),
94 ProtocolIter(),
95 PingIntervalInSec( ping_interval_in_sec ),
96 NPingers( n_parallel_pings ),
97 FirstDelay( first_delay ),
98 NextPingTimer( *io_serv ),
99 TimeSentLastPing( microsec_clock::universal_time() ),
100 PingReplyTimeout( ping_reply_timeout ),
101 PingReplyTimeoutOrig( ping_reply_timeout ),
102 HostAnalyzer( destination_address, ping_fail_percentage_limit,
103 ping_congestion_percentage_limit,
104 congest_caused_by_fail_percentage_limit,
105 ping_congestion_duration_thresh, n_parallel_pings,
106 link_analyzer ),
107 Resolver(),
108 Pingers(),
109 NPingersDone( 0 ),
110 ParallelPingDelay( parallel_ping_delay ),
111 DelayedPingTimer( *io_serv ),
112 WantToPing( false ),
113 LogPrefix(),
114 ContinueOnOutdatedIps( false ),
115 PingTimeoutFactor( ping_timeout_factor )
116{
117 BOOST_ASSERT( !network_interface.empty() );
118 BOOST_ASSERT( !destination_address.empty() );
119 BOOST_ASSERT( ( 0 < destination_port ) &&
120 ( destination_port < numeric_limits<uint16_t>::max() ) );
121 BOOST_ASSERT( 0 < ping_interval_in_sec );
122 BOOST_ASSERT( (0 <= ping_fail_percentage_limit) &&
123 ( ping_fail_percentage_limit <= 100) );
124
125 update_log_prefix();
126
127 init_ping_protocol();
128
129 // start resolving already so we are prepared to ping
130 update_dns_resolver();
131
132}
133
134/**
135 * @brief Destructor.
136 */
137PingScheduler::~PingScheduler()
138{
139}
140
141void PingScheduler::stop_pinging()
142{
143 // stop pinger and resolver
144 GlobalLogger.debug() << LogPrefix << "scheduler: stop pinging";
145 clear_pingers();
146 cancel_resolve(true);
147
148 // now cancel the own timer in case that pinger cancelation called callback
149 GlobalLogger.debug() << LogPrefix << "scheduler: cancel timer";
150 NextPingTimer.cancel();
151}
152
153/**
154 * @brief stop all pingers and remove them from Pingers variable which will
155 * proboably cause their destruction
156 *
157 * Pingers is empty afterwards
158 */
159void PingScheduler::clear_pingers()
160{
161 PingerItem pinger;
162 while(!Pingers.empty())
163 {
164 pinger = Pingers.back();
165 pinger->stop_pinging();
166 Pingers.pop_back();
167 }
168}
169
170/**
171 * @brief Start into infinite loop of calls to ping
172 *
173 * Does not start yet but set NextPingTimer (possibly to 0), so action starts
174 * when io_service is started
175 */
176void PingScheduler::start_pinging()
177{
178 if ( FirstDelay > 0 )
179 GlobalLogger.info() << LogPrefix << "Delaying first ping by "
180 << FirstDelay << "s";
181 else
182 GlobalLogger.info() << LogPrefix << "Schedule ping as soon as possible";
183
184 (void) NextPingTimer.expires_from_now( seconds( FirstDelay ) );
185 NextPingTimer.async_wait( bind( &PingScheduler::ping, this,
186 boost::asio::placeholders::error ) );
187}
188
189
190/**
191 * @brief call Ping::ping and schedule a call to ping_done_handler when finished
192 */
193void PingScheduler::ping(const boost::system::error_code &error)
194{
195 if ( error )
196 { // get here, e.g. by NextPingTimer.cancel in stop_pinging
197 if ( error == boost::asio::error::operation_aborted )
198 GlobalLogger.error() << LogPrefix << "Timer for ping was cancelled!"
199 << " --> Stopping";
200 else
201 GlobalLogger.error() << LogPrefix << "Received error " << error
202 << " waiting for ping! Stopping";
203 return;
204 }
205
206 // ping as soon as dns is ready
207 WantToPing = true;
208 ping_when_ready();
209}
210
211
212void PingScheduler::ping_when_ready()
213{
214 if ( !WantToPing )
215 {
216 GlobalLogger.info() << LogPrefix << "waiting for ping request "
217 << "(should take no more than " << PingIntervalInSec << "s)";
218 return;
219 }
220 else if ( Resolver && Resolver->is_resolving() )
221 {
222 GlobalLogger.info() << LogPrefix << "waiting for DNS to finish";
223 return;
224 }
225 else if ( !Resolver )
226 {
227 // should not happen, but check anyway
228 GlobalLogger.warning() << LogPrefix << "Have no resolver!";
229 return;
230 }
231
232 GlobalLogger.info() << LogPrefix << "start ping";
233 WantToPing = false;
234
235 // try to get an up-to-date IP (ContinueOnOutdatedIps may only be set
236 // because a CNAME was out of date -- IPs may still be current)
237 HostAddress ip = Resolver->get_next_ip();
238
239 if ( !ip.is_valid() )
240 { // this can happen in 2 cases: if ContinueOnOutdatedIps==true
241 // or when ip went out of date between resolve and now
242 // --> try to use outdated IP
243 GlobalLogger.info() << LogPrefix << "Checking for outdated IPs";
244 bool check_up_to_date = false;
245 ip = Resolver->get_next_ip(check_up_to_date);
246 }
247 if ( !ip.is_valid() )
248 { // Do not even have an outdated IP!
249 // This happens if have no cached IPs and resolve failed
250 GlobalLogger.info() << LogPrefix << "Not even outdated IP to ping "
251 << "-- treat like a failed ping.";
252
253 // skip the ping and directly call ping_done_handler the appropriate
254 // number of times
255 for (int count=0; count<NPingers; ++count)
256 ping_done_handler(PingStatus_FailureNoIP, 0);
257 }
258 else
259 {
260 // create new pingers
261 for (int count=0; count<NPingers; ++count)
262 Pingers.push_back(
263 PingerFactory::createPinger(*ProtocolIter, IoService,
264 NetworkInterfaceName,
265 PingReplyTimeout) );
266
267 // remember when pinging started
268 TimeSentLastPing = microsec_clock::universal_time();
269
270 // get actual IP
271 boost::asio::ip::address actual_ip = ip.get_ip();
272 GlobalLogger.info() << LogPrefix << "pinging IP " << actual_ip
273 << " with TTL " << ip.get_ttl().get_updated_value() << "s";
274 delayed_ping(boost::system::error_code(), actual_ip, 0);
275 }
276}
277
278void PingScheduler::delayed_ping( const boost::system::error_code &error,
279 const boost::asio::ip::address ip,
280 const int pinger_index )
281{
282 if (error)
283 {
284 GlobalLogger.info() << LogPrefix << "delayed ping received an error: "
285 << error;
286 return;
287 }
288
289 GlobalLogger.debug() << LogPrefix << "starting delayed ping index "
290 << pinger_index;
291 Pingers[pinger_index]->ping(ip,
292 DestinationPort,
293 boost::bind(&PingScheduler::ping_done_handler,
294 this, _1, _2) );
295 if (pinger_index >= NPingers-1)
296 GlobalLogger.debug() << LogPrefix << "started all delayed pings";
297 else
298 {
299 DelayedPingTimer.expires_from_now( milliseconds(ParallelPingDelay) );
300 DelayedPingTimer.async_wait( bind( &PingScheduler::delayed_ping,
301 this, boost::asio::placeholders::error, ip, pinger_index+1) );
302 }
303}
304
305
306//------------------------------------------------------------------------------
307// Post Processing of Ping result and Preparation for next ping
308//------------------------------------------------------------------------------
309
310/**
311 * @brief called when Ping::ping is done; calls functions to update
312 * statistics, ping interval and elapsed time;
313 * schedules a call to ping, thereby closing the loop
314 */
315void PingScheduler::ping_done_handler( const PingStatus &result,
316 const long ping_duration_us )
317{
318 PingStatus edited_result = result;
319 if (result == PingStatus_SuccessReply && ContinueOnOutdatedIps)
320 {
321 edited_result = PingStatus_SuccessOutdatedIP;
322
323 // reset ContinueOnOutdatedIps
324 ContinueOnOutdatedIps = false;
325 update_log_prefix();
326 }
327
328 ++NPingersDone;
329 GlobalLogger.info() << LogPrefix << "Ping " << NPingersDone << " of "
330 << NPingers << " done with result " << to_string(edited_result);
331
332 // post-processing
333 HostAnalyzer.update_ping_statistics( edited_result, ping_duration_us );
334
335 // prepare next ping only after all pingers are done
336 if (NPingersDone == NPingers)
337 {
338 // stop and destruct all pingers
339 clear_pingers();
340
341 GlobalLogger.debug() << LogPrefix
342 << "--------------------------------------------------------------";
343
344 // update variables for next ping: number of pings, delay, protocol
345 // do this only after call to update_ping_statistics!
346 update_ping_protocol();
347 update_ping_interval();
348 update_ping_number();
349
350 prepare_next_ping();
351 }
352}
353
354
355void PingScheduler::prepare_next_ping()
356{
357 // start DNS resolve if necessary
358 update_dns_resolver();
359
360 NPingersDone = 0;
361
362 // schedule next ping
363 int seconds_since_last_ping = (microsec_clock::universal_time()
364 - TimeSentLastPing).total_seconds();
365 if ( seconds_since_last_ping > PingIntervalInSec )
366 {
367 GlobalLogger.info() << "We are late for next ping!";
368 seconds_since_last_ping = PingIntervalInSec;
369 (void) NextPingTimer.expires_from_now( seconds(0) );
370 }
371 else
372 (void) NextPingTimer.expires_from_now( seconds( PingIntervalInSec
373 - seconds_since_last_ping ) );
374 NextPingTimer.async_wait( bind( &PingScheduler::ping, this,
375 boost::asio::placeholders::error ) );
376}
377
378void PingScheduler::update_ping_interval()
379{
380 // have to ping more often?
381 if ( HostAnalyzer.exceeded_ping_failed_limit() )
382 {
383 PingIntervalInSec.speed_up();
384
385 GlobalLogger.debug() << LogPrefix << "- Speeding up ping interval to: "
386 << PingIntervalInSec << "s";
387 }
388 else
389 {
390 PingIntervalInSec.back_to_original();
391
392 GlobalLogger.debug() << LogPrefix << "- Stick to the original ping "
393 << "interval: " << PingIntervalInSec << "s";
394 }
395}
396
397/** in case of congested line, increase number of pings
398 *
399 * also increases ping timeout if line is congested
400 *
401 * CAUTION! Only call this after clear_pingers !!!
402 * */
403void PingScheduler::update_ping_number()
404{
405 // make sure we do not loose track of pingers here
406 if ( NPingersDone != NPingers || !Pingers.empty() )
407 {
408 GlobalLogger.warning() << LogPrefix << "Should only change number of "
409 << "pingers when all are finished and deleted! Have " << NPingers
410 << " pingers, " << NPingersDone << " of which are done and "
411 << Pingers.size() << " in listDone! Will not change NPingers.";
412 return;
413 }
414
415 if ( HostAnalyzer.exceeded_ping_congestion_limit() )
416 {
417 NPingers.increase();
418
419 GlobalLogger.notice() << LogPrefix << "No reply from host, "
420 << "switching to burst ping mode with longer timeouts ("
421 << DnsMaster::get_cname_chain_str(DestinationAddress) << ")";
422 GlobalLogger.debug() << LogPrefix << "- Increasing ping number to: "
423 << NPingers;
424
425 PingReplyTimeout = PingReplyTimeoutOrig * PingTimeoutFactor;
426 GlobalLogger.debug() << LogPrefix << "- Increase ping timeout to "
427 << PingReplyTimeout << "s";
428 }
429 else
430 {
431 NPingers.back_to_original();
432
433 GlobalLogger.debug() << LogPrefix << "- Stick to the original ping "
434 << "number: " << NPingers;
435
436 PingReplyTimeout = PingReplyTimeoutOrig;
437 GlobalLogger.debug() << LogPrefix << "- Reset ping timeout to "
438 << PingReplyTimeout << "s";
439 }
440
441 // tell host analyzer so it expects the correct number of ping results
442 HostAnalyzer.set_n_parallel_pings(NPingers);
443}
444
445//------------------------------------------------------------------------------
446// Ping Protocol Rotation
447//------------------------------------------------------------------------------
448
449void PingScheduler::init_ping_protocol()
450{
451 ProtocolIter = Protocols.end();
452 get_next_ping_protocol();
453}
454
455void PingScheduler::update_ping_protocol()
456{
457 if ( can_change_ping_protocol() )
458 {
459 get_next_ping_protocol();
460 }
461}
462
463void PingScheduler::get_next_ping_protocol()
464{
465 ++ProtocolIter;
466 if (ProtocolIter == Protocols.end())
467 ProtocolIter = Protocols.begin();
468}
469
470bool PingScheduler::can_change_ping_protocol() const
471{
472 // TODO can_change_ping_protocol() and get_next_ping_protocol() may be
473 // implemented in a Algorithm class that can be exchanged in this class to
474 // provide an algorithm neutral class
475 return true;
476}
477
478//------------------------------------------------------------------------------
479// DNS host name resolution
480//------------------------------------------------------------------------------
481
482// show "!" after host name if running on outdated IPs
483void PingScheduler::update_log_prefix()
484{
485 std::stringstream temp;
486 temp << "Sched(" << DestinationAddress;
487 if (ContinueOnOutdatedIps)
488 temp << "!";
489 temp << "): ";
490 LogPrefix = temp.str();
491}
492
493void PingScheduler::update_dns_resolver()
494{
495 if (Resolver && Resolver->is_resolving())
496 cancel_resolve(false);
497
498 if (ContinueOnOutdatedIps)
499 {
500 ContinueOnOutdatedIps = false;
501 update_log_prefix();
502 }
503
504 // DNS master caches created resolvers and resolved IPs, so this will
505 // probably just return an existing resolver with already resolved IPs for
506 // requested protocol ( ICMP/TCP is ignored, only IPv4/v6 is important)
507 Resolver = DnsMaster::get_instance()->get_resolver_for(DestinationAddress,
508 *ProtocolIter);
509
510 // get number of up-to-date IPs
511 // TODO should check here, if they will be up to date in PingIntervalInSec
512 bool check_up_to_date = true;
513 int ip_count = Resolver->get_resolved_ip_count(check_up_to_date);
514 if (ip_count > 0)
515 {
516 GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
517 << ip_count << " (IPs may be outdated=" << !check_up_to_date << ")";
518 HostAnalyzer.set_resolved_ip_count( ip_count );
519
520 if (Resolver->is_resolving())
521 GlobalLogger.warning() << LogPrefix << "have up to date IPs but "
522 << "resolver seems to be resolving all the same... "
523 << "Start pinging anyway!";
524 ping_when_ready();
525 }
526 else
527 {
528 GlobalLogger.info() << LogPrefix
529 << "No up-to-date IPs --> start resolve";
530 start_resolving_ping_address();
531 // set resolved_ip_count will be called in resolve callback
532 }
533}
534
535void PingScheduler::start_resolving_ping_address()
536{
537 Resolver->async_resolve( boost::bind(&PingScheduler::dns_resolve_callback,
538 this, _1, _2) );
539}
540
541void PingScheduler::dns_resolve_callback(const bool was_success,
542 const int recursion_count)
543{
544 GlobalLogger.info() << LogPrefix << "dns resolution finished "
545 << "with success = " << was_success << " "
546 << "after " << recursion_count << " recursions";
547
548 if ( was_success )
549 {
550 // trust that a successfull DNS resolve means we have an IP with TTL>0
551 int ip_count = Resolver->get_resolved_ip_count(!ContinueOnOutdatedIps);
552 if (ip_count == 0)
553 {
554 GlobalLogger.warning() << LogPrefix
555 << "Should not have reached this case: resolve was "
556 << "successfull but still have no IPs (up-to-date="
557 << !ContinueOnOutdatedIps << ")!";
558 if (DnsMaster::get_instance()->get_resolved_ip_ttl_threshold() > 0)
559 GlobalLogger.warning() << LogPrefix << "This probably happened "
560 << "because you specified a TTL threshold > 0 but resolving"
561 << " had no effect on TTLs since external cache is only "
562 << "updated when TTL=0 is reached.";
563 }
564 else
565 {
566 GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
567 << ip_count << " (IPs may be outdated="
568 << ContinueOnOutdatedIps << ") --> could ping now";
569 }
570 HostAnalyzer.set_resolved_ip_count( ip_count );
571 ping_when_ready();
572 }
573 else
574 { // host name resolution failed; try again bypassing first outdated CNAME
575 // or using cached IP
576 std::string skip_host = Resolver->get_skip_cname();
577
578 if (skip_host.empty())
579 { // try to continue with cached IPs
580 int ip_count = Resolver->get_resolved_ip_count(false);
581
582 if (ip_count == 0)
583 GlobalLogger.notice() << LogPrefix << "DNS failed "
584 << "and have no cached IPs either --> cannot ping";
585 // ping_when_ready will deal with this case
586 else
587 {
588 ContinueOnOutdatedIps = true;
589 update_log_prefix();
590
591 GlobalLogger.notice() << LogPrefix << "DNS failed, "
592 << "try anyway with cached data";
593 }
594
595 GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
596 << ip_count << " (IPs may be outdated=" << true << ")";
597 HostAnalyzer.set_resolved_ip_count( ip_count );
598
599 ping_when_ready();
600 }
601 else
602 { // have CNAME to continue
603 ContinueOnOutdatedIps = true;
604 update_log_prefix();
605 GlobalLogger.notice() << LogPrefix << "DNS failed, "
606 << "try again skipping a CNAME and resolving "
607 << skip_host << " directly";
608
609 cancel_resolve(false);
610
611 // now create new resolver
612 Resolver = DnsMaster::get_instance()
613 ->get_resolver_for(skip_host, *ProtocolIter);
614 start_resolving_ping_address();
615 }
616 }
617}
618
619/**
620 * cancel resolver if force_cancel or if it is not resolving DestinationAddress
621 *
622 * Resolvers have a life on their own: they are cached by DnsMaster so never go
623 * out of scope and even after calling callbacks, there might still be a
624 * longterm timer active to re-try resolving.
625 * We want to cancel that long-term timer only if the Resolver is not for our
626 * real, original DestinationAddress but a CNAME, which can happen when trying
627 * to skip cnames and working on out-dated IPs
628 */
629void PingScheduler::cancel_resolve(const bool force_cancel)
630{
631 if (force_cancel)
632 {
633 GlobalLogger.info() << "Cancelling resolver (forced)";
634 Resolver->cancel_resolve();
635 }
636 else if ( Resolver->get_hostname() == DestinationAddress )
637 GlobalLogger.info() << LogPrefix
638 << "Leave original resolver active in background";
639 else
640 {
641 GlobalLogger.info() << LogPrefix << "Cancel resolver for "
642 << Resolver->get_hostname() << " since is not the original "
643 << DestinationAddress;
644 Resolver->cancel_resolve();
645 }
646}
647