tested new DNS with internal server, make more robust against caching; works nicely now
authorChristian Herdtweck <christian.herdtweck@intra2net.com>
Thu, 30 Apr 2015 13:28:51 +0000 (15:28 +0200)
committerChristian Herdtweck <christian.herdtweck@intra2net.com>
Mon, 4 May 2015 14:57:59 +0000 (16:57 +0200)
src/config/configuration.cpp
src/dns/dnscache.cpp
src/dns/dnsresolver.cpp
src/dns/dnsresolver.h
src/dns/ippseudoresolver.h
src/dns/resolverbase.h
src/host/pingscheduler.cpp
src/icmp/icmppinger.cpp

index 5843563..4efd663 100644 (file)
@@ -231,7 +231,7 @@ int Configuration::get_resolved_ip_ttl_threshold() const
 }
 void Configuration::set_resolved_ip_ttl_threshold( const int resolved_ip_ttl_threshold )
 {
-    BOOST_ASSERT(resolved_ip_ttl_threshold > 0 );
+    BOOST_ASSERT(resolved_ip_ttl_threshold >= 0 );
     ResolvedIpTtlThreshold = resolved_ip_ttl_threshold;
 }
 
index 3a507e6..2796a07 100644 (file)
@@ -277,7 +277,8 @@ HostAddressVec DnsCache::get_ips(const std::string &hostname,
                          << " --> " << result.size() << "-list";
     BOOST_FOREACH( const HostAddress &addr, result )
         GlobalLogger.debug() << "DnsCache:    " << addr.get_ip().to_string()
-                             << " (TTL " << addr.get_ttl().get_value() << "s)";
+                             << " (TTL " << addr.get_ttl().get_updated_value()
+                             << "s)";
     return result;
 }
 
@@ -290,7 +291,8 @@ Cname DnsCache::get_cname(const std::string &hostname,
     std::string key = key_for_hostname(hostname);
     Cname result_obj = CnameCache[key];
     GlobalLogger.debug() << "DnsCache: request CNAME for " << key
-                         << " --> \"" << result_obj.Host << "\"";
+                         << " --> \"" << result_obj.Host << "\" (TTL "
+                         << result_obj.Ttl.get_updated_value() << "s)";
     if (result_obj.Host.empty())
         return result_obj;
 
@@ -334,7 +336,8 @@ HostAddressVec DnsCache::get_ips_recursive(const std::string &hostname,
         }
         else
         {
-            min_cname_ttl = min(min_cname_ttl, current_cname.Ttl.get_value());
+            min_cname_ttl = min(min_cname_ttl,
+                                current_cname.Ttl.get_updated_value());
             result = get_ips(current_host, check_up_to_date);
         }
     }
@@ -350,7 +353,7 @@ HostAddressVec DnsCache::get_ips_recursive(const std::string &hostname,
 
         BOOST_FOREACH( HostAddress &addr, result )
         {
-            if (addr.get_ttl().get_value() > min_cname_ttl)
+            if (addr.get_ttl().get_updated_value() > min_cname_ttl)
             {
                 GlobalLogger.debug() << "DnsCache: using shorter CNAME TTL";
                 addr.set_ttl(cname_ttl);
index 07eeac3..085eda0 100644 (file)
 
 using I2n::Logger::GlobalLogger;
 using boost::posix_time::seconds;
-using boost::posix_time::minutes;
 
 namespace Config
 {
-    const int ResolveTimeoutSeconds = 3;
-    const int PauseBeforeRetrySeconds = 10;
-    const int StaleDataLongtermMinutes = 15;
+    const int ResolveTimeoutSeconds = 2;
+    const int PauseBeforeRetrySeconds = 1;
+    const int StaleDataLongtermSeconds = 5*60;
     const int DNS_PORT = 53;
 }
 
@@ -73,7 +72,7 @@ DnsResolver::DnsResolver(IoServiceItem &io_serv,
     , RandomIdGenerator()
     , RequestId( 0 )
     , OperationCancelled( false )
-    , LongermTimerIsActive( false )
+    , LongtermTimerIsActive( false )
 {
     std::stringstream temp;
     temp << "Dns(" << ResolverBase::Hostname << "): ";
@@ -126,7 +125,7 @@ void DnsResolver::do_resolve()
     ResolveTimeoutTimer.cancel();
     PauseBeforeRetryTimer.cancel();
     StaleDataLongtermTimer.cancel();
-    LongermTimerIsActive = false;
+    LongtermTimerIsActive = false;
 
     // create DNS request
     boost::net::dns::message dns_message( ResolverBase::Hostname, Protocol );
@@ -190,7 +189,7 @@ void DnsResolver::handle_dns_result(const boost::system::error_code &error,
     if (error)
     {
         GlobalLogger.info() << LogPrefix << "DNS resolve resulted in error "
-                            << error << " --> try again after a little while";
+                            << error << " --> request retry";
         schedule_retry();
         return;
     }
@@ -247,7 +246,7 @@ void DnsResolver::handle_dns_result(const boost::system::error_code &error,
     else
     {   // no answers --> cannot proceed
         GlobalLogger.warning() << LogPrefix << "No IP nor CNAME received! "
-                               << "--> schedule retry";
+                               << "--> request retry";
         schedule_retry();
     }
 }
@@ -358,13 +357,13 @@ void DnsResolver::handle_unavailable()
 {
     // schedule new attempt in quite a while
     StaleDataLongtermTimer.expires_from_now(
-                                     minutes(Config::StaleDataLongtermMinutes));
+                                     seconds(Config::StaleDataLongtermSeconds));
     StaleDataLongtermTimer.async_wait(
             boost::bind( &DnsResolver::wait_timer_timeout_handler,
                          this, boost::asio::placeholders::error
             )
     );
-    LongermTimerIsActive = true;
+    LongtermTimerIsActive = true;
 
     // for now, admit failure
     bool was_success = false;
@@ -540,7 +539,7 @@ void DnsResolver::stop_trying(bool was_success)
     if (was_success)
     {
         StaleDataLongtermTimer.cancel();
-        LongermTimerIsActive = false;
+        LongtermTimerIsActive = false;
     }
 
     // clean up
@@ -571,7 +570,7 @@ bool DnsResolver::is_resolving() const
  */
 bool DnsResolver::is_waiting_to_resolve() const
 {
-    return IsResolving || LongermTimerIsActive;
+    return IsResolving || LongtermTimerIsActive;
 }
 
 
@@ -584,10 +583,10 @@ bool DnsResolver::is_waiting_to_resolve() const
  */
 void DnsResolver::cancel_resolve()
 {
-    if ( !IsResolving )
+    if ( !IsResolving && !LongtermTimerIsActive)
     {
-        GlobalLogger.info() << LogPrefix
-               << "Cancel called on non-resolving resolver -- ignore";
+        GlobalLogger.info() << LogPrefix << "Cancel called on non-resolving, "
+                                         << "non-waiting resolver -- ignore";
         return;
     }
     else if (OperationCancelled)
@@ -596,18 +595,23 @@ void DnsResolver::cancel_resolve()
                << "Cancel called on cancelled resolver -- ignore";
         return;
     }
+    GlobalLogger.info() << LogPrefix << "Cancel resolver";
 
     // set before finalize_resolve so can check in finalize_resolve that ID is
     //   always 0; ID is not used any more since handle_dns_result stops if
     //   OperationCancelled is true
     RequestId = 0;
 
-    bool was_success = false;
-    int cname_count = 1;
-    finalize_resolve(was_success, cname_count);
+    if ( IsResolving )
+    {
+        bool was_success = false;
+        int cname_count = 1;
+        finalize_resolve(was_success, cname_count);
+    }
 
     // also cancel the long-term timer
     StaleDataLongtermTimer.cancel();
+    LongtermTimerIsActive = false;
 
     // set after finalize_resolve, so can check in finalize_resolve that 
     // OperationCancelled is never true
@@ -620,15 +624,15 @@ void DnsResolver::handle_resolve_timeout(const boost::system::error_code &error)
 {
     if ( error ==  boost::asio::error::operation_aborted )   // cancelled
     {
-        GlobalLogger.info() << LogPrefix
-                            << "Resolve timeout timer was cancelled!";
+        GlobalLogger.debug() << LogPrefix
+                             << "Resolve timeout timer was cancelled!";
         return;
     }
     else if (error)
     {
         GlobalLogger.warning() << LogPrefix
                                << "resolve timeout handler received error "
-                               << error << " --> retry";
+                               << error << " --> request retry";
         schedule_retry();
     }
     else if ( OperationCancelled )
@@ -679,8 +683,8 @@ void DnsResolver::wait_timer_timeout_handler(
     if ( error ==  boost::asio::error::operation_aborted )   // cancelled
     {   // assume that our code cancelled this timer, so callbacks will be
         // taken care of!
-        GlobalLogger.info() << LogPrefix
-                            << "Resolve wait timer was cancelled! ";
+        GlobalLogger.debug() << LogPrefix
+                             << "Resolve wait timer was cancelled! ";
     }
     else if (error)
     {   // not sure what to do here, but callers waiting forever for a callback
@@ -700,6 +704,7 @@ void DnsResolver::wait_timer_timeout_handler(
     else
     {
         GlobalLogger.info() << LogPrefix << "Done waiting --> re-try resolve";
+        IsResolving = false;  // will be set to true immediately in do_resolve
         do_resolve();
     }
 }
@@ -713,7 +718,7 @@ HostAddress DnsResolver::get_next_ip(bool check_up_to_date)
 {
     // get cached data
     // (do not use arg check_up_to_date here in order to give NextIpIndex
-    //  a chance to stay above number of outdate IPs)
+    //  a chance to stay above number of outdated IPs)
     HostAddressVec cached_data = ResolverBase::get_cached_ips_recursively();
 
     // if no results cached, return default-constructed HostAddress (0.0.0.0)
@@ -724,7 +729,7 @@ HostAddress DnsResolver::get_next_ip(bool check_up_to_date)
         return return_candidate;
     }
 
-    int n_iter = 0;
+    std::size_t n_iter = 0;
     std::size_t n_ips = cached_data.size();
     uint32_t ttl_thresh = static_cast<uint32_t>( DnsMaster::get_instance()
                                             ->get_resolved_ip_ttl_threshold() );
@@ -750,27 +755,39 @@ HostAddress DnsResolver::get_next_ip(bool check_up_to_date)
         }
         else
         {   // there are candidates left to consider
+            GlobalLogger.debug() << LogPrefix << "Check IP candidate at index "
+                                 << NextIpIndex;
             return_candidate = cached_data[NextIpIndex++];
             if (!check_up_to_date)
+            {
+                GlobalLogger.debug() << LogPrefix << "not checking ttl, accept";
                 return return_candidate;
-            else if (cached_data[NextIpIndex].get_ttl().get_updated_value()
+            }
+            else if (return_candidate.get_ttl().get_updated_value()
                      > ttl_thresh)
+            {
+                GlobalLogger.debug() << LogPrefix << "is up to date, accept";
                 return return_candidate;
+            }
             else
+            {
+                GlobalLogger.debug() << LogPrefix << "is out of date ("
+                    << return_candidate.get_ttl().get_updated_value()
+                    << "s <= " << ttl_thresh << "s), continue";
                 ++n_iter;
+            }
         }
     }
 }
 
 bool DnsResolver::have_up_to_date_ip()
 {
-    return get_resolved_ip_count() > 0;
+    return get_resolved_ip_count(true) > 0;
 }
 
-int DnsResolver::get_resolved_ip_count()
+int DnsResolver::get_resolved_ip_count(const bool check_up_to_date)
 {
-    // run with empty hostname (--> uses internal var Hostname)
-    // and check_up_to_date = true
-    return ResolverBase::get_cached_ips_recursively("", true).size();
+    // run with empty hostname --> uses internal var Hostname
+    return ResolverBase::get_cached_ips_recursively("",check_up_to_date).size();
 }
 
index 5caf9c7..0918fd0 100644 (file)
@@ -63,7 +63,7 @@ private:
 public:
     HostAddress get_next_ip(const bool check_up_to_date=true);
     bool have_up_to_date_ip();
-    int get_resolved_ip_count();
+    int get_resolved_ip_count(const bool check_up_to_date=true);
     void cancel_resolve();
     bool is_resolving() const;
     bool is_waiting_to_resolve() const;
@@ -109,7 +109,7 @@ private:
     boost::uuids::random_generator RandomIdGenerator;
     uint16_t RequestId;
     bool OperationCancelled;
-    bool LongermTimerIsActive;
+    bool LongtermTimerIsActive;
 };
 
 #endif
index 9a33a38..eeac609 100644 (file)
@@ -66,7 +66,8 @@ public:
     HostAddress get_next_ip(const bool check_up_to_date=true)
                                        { return IpAddress; }
     bool have_up_to_date_ip()          { return true;      }
-    int get_resolved_ip_count()        { return 1;         }
+    int get_resolved_ip_count(const bool check_up_to_date=true)
+                                       { return 1;         }
     bool is_resolving() const          { return false;     }
     bool is_waiting_to_resolve() const { return false;     }
     void cancel_resolve() {}
index ad2a70c..8262124 100644 (file)
@@ -58,7 +58,7 @@ public:
 
     virtual HostAddress get_next_ip(const bool check_up_to_date=true) = 0;
     virtual bool have_up_to_date_ip() = 0;
-    virtual int get_resolved_ip_count() = 0;
+    virtual int get_resolved_ip_count(const bool check_up_to_date=true) = 0;
 
     std::string get_hostname() const;
 
index b38271a..15f10fa 100644 (file)
@@ -170,12 +170,12 @@ void PingScheduler::ping_when_ready()
 {
     if ( !WantToPing )
     {
-        GlobalLogger.info() << LogPrefix << "not pinging (not requested to)";
+        GlobalLogger.info() << LogPrefix << "waiting for ping request";
         return;
     }
     else if ( Resolver && Resolver->is_resolving() )
     {
-        GlobalLogger.info() << LogPrefix << "not pinging (DNS not finished)";
+        GlobalLogger.info() << LogPrefix << "waiting for DNS to finish";
         return;
     }
     else if ( !Resolver )
@@ -185,26 +185,44 @@ void PingScheduler::ping_when_ready()
     GlobalLogger.info() << LogPrefix << "start ping";
     WantToPing = false;
 
-    // try to get an up-to-date IP
+    // try to get an up-to-date IP (ContinueOnOutdatedIps may only be set
+    //   because a CNAME was out of date -- IPs may still be current)
     HostAddress ip = Resolver->get_next_ip();
 
-    if ( !ip.is_valid() && ContinueOnOutdatedIps)
-    {   // we failed to resolve --> try to use outdated IP
+    if ( !ip.is_valid() )
+    {   // this can happen in 2 cases: if ContinueOnOutdatedIps==true
+        // or when ip went out of date between resolve and now
+        // --> try to use outdated IP
         GlobalLogger.info() << LogPrefix << "Checking for outdated IPs";
         bool check_up_to_date = false;
         ip = Resolver->get_next_ip(check_up_to_date);
     }
-    if ( ip.is_valid() )
+    if ( !ip.is_valid() )
+    {   // still no valid IP --> should not happen
+        GlobalLogger.error() << LogPrefix << "Not even outdated IP to ping "
+             << "-- this should not have happened!! Treat like a failed ping.";
+        if (DnsMaster::get_instance()->get_resolved_ip_ttl_threshold() > 0)
+            GlobalLogger.warning() << LogPrefix << "This probably happened "
+                << "because you specified a TTL threshold > 0 but resolving"
+                << " had no effect on TTLs since external cache is only "
+                << "updated when TTL=0 is reached.";
+
+        // skip the ping and directly call ping_done_handler
+        ping_done_handler(false);
+    }
+    else
+    {
+        uint32_t ttl = ip.get_ttl().get_updated_value();
+        boost::posix_time::ptime now =
+                                  boost::posix_time::second_clock::local_time();
+        std::string expiry =
+                        boost::posix_time::to_simple_string(now + seconds(ttl));
+
+        GlobalLogger.info() << LogPrefix << "pinging IP " << ip.get_ip()
+            << " with TTL " << ttl << "s (until " << expiry << ")";
         Ping->ping( ip.get_ip(),
                     DestinationPort,
                     boost::bind(&PingScheduler::ping_done_handler, this, _1) );
-    else
-    {   // should not happen
-        GlobalLogger.error() << LogPrefix << "No IP to ping "
-                             << "-- this should not have happened!!";
-        WantToPing = true;
-        if ( !Resolver->is_resolving() )
-            start_resolving_ping_address();
     }
 }
 
@@ -295,6 +313,8 @@ void PingScheduler::update_ping_protocol()
 
 void PingScheduler::get_next_ping_protocol()
 {
+    GlobalLogger.debug() << LogPrefix
+        << "------------------------------------------------------------------";
     ++ProtocolIter;
     if (ProtocolIter == Protocols.end())
         ProtocolIter = Protocols.begin();
@@ -351,9 +371,16 @@ void PingScheduler::update_dns_resolver( PingProtocol current_protocol )
     // requested protocol ( ICMP/TCP is ignored, only IPv4/v6 is important)
     Resolver = DnsMaster::get_instance()->get_resolver_for(DestinationAddress,
                                                            current_protocol);
-    // start resolving if no ips available
-    if ( Resolver->have_up_to_date_ip() )
+
+    // get number of up-to-date IPs
+    bool check_up_to_date = true;
+    int ip_count = Resolver->get_resolved_ip_count(check_up_to_date);
+    if (ip_count > 0)
     {
+        GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
+            << ip_count << " where IPs may be outdated: " << false;
+        HostAnalyzer.set_resolved_ip_count( ip_count );
+
         if (Resolver->is_resolving())
             GlobalLogger.warning() << LogPrefix << "have up to date IPs but "
                 << "resolver seems to be resolving all the same... "
@@ -362,6 +389,7 @@ void PingScheduler::update_dns_resolver( PingProtocol current_protocol )
     }
     else
         start_resolving_ping_address();
+        // set resolved_ip_count in resolve callback
 }
 
 void PingScheduler::start_resolving_ping_address()
@@ -379,7 +407,27 @@ void PingScheduler::dns_resolve_callback(const bool was_success,
 
     if ( was_success )
     {
-        HostAnalyzer.set_resolved_ip_count( Resolver->get_resolved_ip_count());
+        // trust that a successfull DNS resolve means we have an IP with TTL>0
+        int ip_count = Resolver->get_resolved_ip_count(!ContinueOnOutdatedIps);
+        if (ip_count == 0)
+        {   // this will create trouble in HostAnalyzer
+            GlobalLogger.warning() << LogPrefix
+                << "Should not have reached this case: resolve was "
+                << "successfull but still have no IPs (up-to-date="
+                << !ContinueOnOutdatedIps << ")!";
+            if (DnsMaster::get_instance()->get_resolved_ip_ttl_threshold() > 0)
+                GlobalLogger.warning() << LogPrefix << "This probably happened "
+                    << "because you specified a TTL threshold > 0 but resolving"
+                    << " had no effect on TTLs since external cache is only "
+                    << "updated when TTL=0 is reached.";
+        }
+        else
+        {
+            GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
+                << ip_count << " where IPs may be outdated: "
+                << ContinueOnOutdatedIps << " --> could ping now";
+            HostAnalyzer.set_resolved_ip_count( ip_count );
+        }
         ping_when_ready();
     }
     else
@@ -394,7 +442,12 @@ void PingScheduler::dns_resolve_callback(const bool was_success,
         {   // continue with IP
             GlobalLogger.notice() << LogPrefix << "DNS failed, "
                 << "try anyway with cached data";
-            HostAnalyzer.set_resolved_ip_count(0);
+
+            int ip_count = Resolver->get_resolved_ip_count(false);
+            GlobalLogger.info() << LogPrefix << "Set resolved_ip_count to "
+                << ip_count << " where IPs may be outdated: " << true;
+            HostAnalyzer.set_resolved_ip_count( ip_count );
+
             ping_when_ready();
         }
         else
index e6d51b6..c9ee1b2 100644 (file)
@@ -594,7 +594,10 @@ void IcmpPacketDistributor::clean_up_all()
 
 void IcmpPacketDistributor::clean_up()
 {
-    if (PingerList.size() > 0)
+    if (PingerList.size() == 0)
+        GlobalLogger.notice() << "All IcmpPingers have de-registered :-)"
+                              << std::endl;
+    else
         GlobalLogger.warning() << "There were still " << PingerList.size()
             << " pingers registered in IcmpPacketDistributor!" << std::endl;
     PingerList.clear();