enh(MongoDB): Replica set: More robust retry of failed MongoDB commands.

This commit is contained in:
Matej Kenda
2025-12-04 11:34:15 +01:00
parent f4369464c2
commit c4f0eb14c1
11 changed files with 102 additions and 58 deletions

View File

@@ -7,7 +7,7 @@
//
// Definition of the ReadPreference class.
//
// Copyright (c) 2012-2025, Applied Informatics Software Engineering GmbH.
// Copyright (c) 2025, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0

View File

@@ -7,7 +7,7 @@
//
// Definition of the ReplicaSet class.
//
// Copyright (c) 2012-2025, Applied Informatics Software Engineering GmbH.
// Copyright (c) 2025, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0
@@ -23,6 +23,7 @@
#include "Poco/MongoDB/ReadPreference.h"
#include "Poco/MongoDB/TopologyDescription.h"
#include "Poco/Net/SocketAddress.h"
#include "Poco/Logger.h"
#include "Poco/Timespan.h"
#include <vector>
#include <string>
@@ -97,11 +98,20 @@ public:
Poco::Timespan heartbeatFrequency{10, 0};
/// Topology monitoring interval (default: 10 seconds)
std::size_t serverReconnectRetries { 10 };
/// Number of connection retries to a server/replica set when no server is temporarily available (default: 10)
std::chrono::seconds serverReconnectDelay { 1 };
/// Delay between reconnect attempts to a server/replica set when no server is temporarily available (default: 1 second)
bool enableMonitoring{true};
/// Enable background topology monitoring (default: true)
Connection::SocketFactory* socketFactory{nullptr};
/// Optional socket factory for SSL/TLS connections
Logger::Ptr logger;
/// Optional logger to write important information about replica set activity
};
explicit ReplicaSet(const Config& config);
@@ -145,6 +155,9 @@ public:
/// Returns a connection to a secondary server.
/// Returns null if no secondary is available.
[[nodiscard]] Config configuration() const;
/// Returns a copy of the replica set configuration.
[[nodiscard]] TopologyDescription topology() const;
/// Returns a copy of the current topology description.
@@ -157,6 +170,9 @@ public:
void stopMonitoring();
/// Stops the background monitoring thread.
void setLogger(Logger::Ptr logger);
/// Sets the logger to log important replica set activity.
void setReadPreference(const ReadPreference& pref);
/// Sets the default read preference.
@@ -173,7 +189,7 @@ private:
void discover();
/// Performs initial topology discovery from seed servers.
void monitor();
void monitor() noexcept;
/// Background monitoring thread function.
Connection::Ptr selectServer(const ReadPreference& readPref);
@@ -182,10 +198,10 @@ private:
Connection::Ptr createConnection(const Net::SocketAddress& address);
/// Creates a new connection to the specified address.
void updateTopologyFromHello(const Net::SocketAddress& address);
void updateTopologyFromHello(const Net::SocketAddress& address) noexcept;
/// Queries a server with 'hello' command and updates topology.
void updateTopologyFromAllServers();
void updateTopologyFromAllServers() noexcept;
/// Queries all known servers and updates topology.
void parseURI(const std::string& uri);

View File

@@ -7,7 +7,7 @@
//
// Definition of the ReplicaSetConnection class.
//
// Copyright (c) 2012-2025, Applied Informatics Software Engineering GmbH.
// Copyright (c) 2025, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0
@@ -121,6 +121,9 @@ private:
void markServerFailed();
/// Marks the current server as failed in the topology.
void logInfo(const std::string& message);
void logDebug(const std::string& message);
ReplicaSet& _replicaSet;
ReadPreference _readPreference;
Connection::Ptr _connection;

View File

@@ -7,7 +7,7 @@
//
// Definition of the ReplicaSetPoolableConnectionFactory class.
//
// Copyright (c) 2012-2025, Applied Informatics Software Engineering GmbH.
// Copyright (c) 2025, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0
@@ -65,7 +65,7 @@ public:
// Check if the connection is still valid and matches the read preference.
// This ensures that if a server changes role (e.g., primary becomes secondary),
// the cached connection is invalidated and a new one is created.
return pObject->isConnected() && pObject->matchesReadPreference();
return pObject != nullptr && pObject->isConnected() && pObject->matchesReadPreference();
}
void activateObject(MongoDB::ReplicaSetConnection::Ptr pObject)
@@ -75,7 +75,7 @@ public:
try {
pObject->reconnect();
}
catch (Poco::Exception& e)
catch (const Poco::Exception& e)
{
// Ignore connect error. pObject->isConnected() can be used to determine if the connection is valid.
}

View File

@@ -7,7 +7,7 @@
//
// Definition of the ServerDescription class.
//
// Copyright (c) 2012-2025, Applied Informatics Software Engineering GmbH.
// Copyright (c) 2025, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0

View File

@@ -7,7 +7,7 @@
//
// Definition of the TopologyDescription class.
//
// Copyright (c) 2012-2025, Applied Informatics Software Engineering GmbH.
// Copyright (c) 2025, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0

View File

@@ -5,7 +5,7 @@
// Package: MongoDB
// Module: ReadPreference
//
// Copyright (c) 2012-2025, Applied Informatics Software Engineering GmbH.
// Copyright (c) 2025, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0

View File

@@ -5,7 +5,7 @@
// Package: MongoDB
// Module: ReplicaSet
//
// Copyright (c) 2012-2025, Applied Informatics Software Engineering GmbH.
// Copyright (c) 2025, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0
@@ -146,6 +146,12 @@ TopologyDescription ReplicaSet::topology() const
}
ReplicaSet::Config ReplicaSet::configuration() const
{
	// The copy is made while _mutex is held, so the caller receives
	// the configuration by value rather than a reference to shared state.
	std::lock_guard<std::mutex> guard(_mutex);
	return _config;
}
void ReplicaSet::refreshTopology()
{
updateTopologyFromAllServers();
@@ -183,6 +189,13 @@ void ReplicaSet::stopMonitoring()
}
void ReplicaSet::setLogger(Logger::Ptr logger)
{
	// _config is updated under _mutex, the same mutex configuration()
	// takes before copying it.
	std::lock_guard<std::mutex> guard(_mutex);
	_config.logger = logger;
}
void ReplicaSet::setReadPreference(const ReadPreference& pref)
{
std::lock_guard<std::mutex> lock(_mutex);
@@ -238,7 +251,7 @@ void ReplicaSet::discover()
}
void ReplicaSet::monitor()
void ReplicaSet::monitor() noexcept
{
while (!_stopMonitoring.load())
{
@@ -328,7 +341,7 @@ Connection::Ptr ReplicaSet::createConnection(const Net::SocketAddress& address)
}
void ReplicaSet::updateTopologyFromHello(const Net::SocketAddress& address)
void ReplicaSet::updateTopologyFromHello(const Net::SocketAddress& address) noexcept
{
Connection::Ptr conn = new Connection();
@@ -397,7 +410,7 @@ void ReplicaSet::updateTopologyFromHello(const Net::SocketAddress& address)
}
void ReplicaSet::updateTopologyFromAllServers()
void ReplicaSet::updateTopologyFromAllServers() noexcept
{
std::vector<ServerDescription> servers;

View File

@@ -5,7 +5,7 @@
// Package: MongoDB
// Module: ReplicaSetConnection
//
// Copyright (c) 2012-2025, Applied Informatics Software Engineering GmbH.
// Copyright (c) 2025, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0
@@ -17,8 +17,9 @@
#include "Poco/Net/NetException.h"
#include "Poco/Exception.h"
#include <set>
#include <thread>
using namespace std::string_literals;
using namespace std::literals;
namespace Poco {
@@ -41,6 +42,9 @@ enum class ErrorCode
SocketException = 9001
};
// Lower bound on the number of attempts executeWithRetry() makes for a MongoDB command.
// It applies when the topology reports fewer servers than this value.
static constexpr std::size_t lowExecuteRetryThreshold { 5 };
ReplicaSetConnection::ReplicaSetConnection(ReplicaSet& replicaSet, const ReadPreference& readPref):
_replicaSet(replicaSet),
@@ -55,7 +59,6 @@ ReplicaSetConnection::~ReplicaSetConnection() = default;
void ReplicaSetConnection::sendRequest(OpMsgMessage& request, OpMsgMessage& response)
{
executeWithRetry([&]() {
ensureConnection();
_connection->sendRequest(request, response);
// Check if response contains a retriable error
@@ -170,9 +173,12 @@ void ReplicaSetConnection::executeWithRetry(std::function<void()> operation)
std::exception_ptr lastException;
std::set<Net::SocketAddress> triedServers;
// Retry with different servers until we've tried all available servers
TopologyDescription topology = _replicaSet.topology();
const std::size_t maxAttempts = topology.serverCount();
// Retry with different servers until we've tried all available servers with a minimum
// retry threshold to cover situations when single server topology or complete replica set
// is not available temporarily.
auto topology = _replicaSet.topology();
const auto rsConfig = _replicaSet.configuration();
const std::size_t maxAttempts = std::max(topology.serverCount(), lowExecuteRetryThreshold);
std::size_t attempt = 0;
while (attempt < maxAttempts)
@@ -182,30 +188,18 @@ void ReplicaSetConnection::executeWithRetry(std::function<void()> operation)
ensureConnection();
triedServers.insert(_connection->address());
operation();
if (attempt > 0)
logDebug(Poco::format("Operation succeeded after %Lu retries."s, attempt));
return; // Success
}
catch (const Poco::Net::NetException& e)
catch (const std::exception& e)
{
if (!isRetriableError(e))
{
throw; // Non-retriable network error
}
lastException = std::current_exception();
}
catch (const Poco::TimeoutException& e)
{
if (!isRetriableError(e))
{
throw; // Non-retriable timeout
}
lastException = std::current_exception();
}
catch (const Poco::IOException& e)
{
if (!isRetriableError(e))
{
throw; // Non-retriable I/O error
throw;
}
// Retriable error.
lastException = std::current_exception();
}
catch (...)
@@ -221,17 +215,26 @@ void ReplicaSetConnection::executeWithRetry(std::function<void()> operation)
// Get new connection, avoiding servers we've already tried
bool foundNewServer = false;
for (std::size_t i = 0; i < 10 && !foundNewServer; ++i) // Try up to 10 times
for (std::size_t i = 0; i < rsConfig.serverReconnectRetries && !foundNewServer; ++i) // Try several times to connect
{
Connection::Ptr newConn = _replicaSet.getConnection(_readPreference);
if (newConn.isNull())
{
break; // No servers available
// No servers available at this moment. Wait briefly and retry.
std::this_thread::sleep_for(rsConfig.serverReconnectDelay);
triedServers.clear();
_replicaSet.refreshTopology();
topology = _replicaSet.topology();
if (!topology.servers().empty())
logInfo(Poco::format("Refreshed topology. Number of servers: %Lu"s, topology.servers().size()));
continue;
}
Net::SocketAddress addr = newConn->address();
if (triedServers.find(addr) == triedServers.end())
{
logDebug(Poco::format("Connection reconnected to server: %s"s, addr.toString()));
_connection = newConn;
foundNewServer = true;
}
@@ -261,30 +264,21 @@ void ReplicaSetConnection::executeWithRetry(std::function<void()> operation)
bool ReplicaSetConnection::isRetriableError(const std::exception& e)
{
// Network exceptions are generally retriable
if (dynamic_cast<const Poco::Net::NetException*>(&e))
if (dynamic_cast<const Poco::Net::NetException*>(&e) != nullptr)
{
return true;
}
// Timeout exceptions are retriable
if (dynamic_cast<const Poco::TimeoutException*>(&e))
if (dynamic_cast<const Poco::TimeoutException*>(&e) != nullptr)
{
return true;
}
// I/O exceptions might be retriable
const Poco::IOException* ioEx = dynamic_cast<const Poco::IOException*>(&e);
if (ioEx)
// I/O exceptions are retriable
if (dynamic_cast<const Poco::IOException*>(&e) != nullptr)
{
const auto& msg = ioEx->message();
// Check for specific retriable error messages
if (msg.find("not master"s) != std::string::npos ||
msg.find("NotMaster"s) != std::string::npos ||
msg.find("Connection"s) != std::string::npos ||
msg.find("connection"s) != std::string::npos)
{
return true;
}
return true;
}
return false;
@@ -349,4 +343,22 @@ void ReplicaSetConnection::markServerFailed()
}
void ReplicaSetConnection::logInfo(const std::string& message)
{
	// Works on a copy of the replica set configuration; nothing is
	// logged when no logger has been configured.
	auto config = _replicaSet.configuration();
	if (!config.logger.isNull())
	{
		config.logger->information("MongoDB replica set: "s + message);
	}
}
void ReplicaSetConnection::logDebug(const std::string& message)
{
	// Works on a copy of the replica set configuration; nothing is
	// logged when no logger has been configured.
	auto config = _replicaSet.configuration();
	if (!config.logger.isNull())
	{
		config.logger->debug("MongoDB replica set: "s + message);
	}
}
} } // namespace Poco::MongoDB

View File

@@ -5,7 +5,7 @@
// Package: MongoDB
// Module: ServerDescription
//
// Copyright (c) 2012-2025, Applied Informatics Software Engineering GmbH.
// Copyright (c) 2025, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0

View File

@@ -5,7 +5,7 @@
// Package: MongoDB
// Module: TopologyDescription
//
// Copyright (c) 2012-2025, Applied Informatics Software Engineering GmbH.
// Copyright (c) 2025, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0