From 816539fa257455aa5b98840c6f563ce35a24f1d6 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Fri, 2 Aug 2019 16:08:12 -0400 Subject: [PATCH 1/6] Clear server pool when a server is marked unknown --- lib/mongo/cluster.rb | 26 +++++++++++++++++++++++- lib/mongo/cluster/sdam_flow.rb | 10 ++++++++- lib/mongo/operation/shared/executable.rb | 13 ++++-------- lib/mongo/server.rb | 22 +++++++++++++++++--- lib/mongo/server/connection.rb | 1 - 5 files changed, 57 insertions(+), 15 deletions(-) diff --git a/lib/mongo/cluster.rb b/lib/mongo/cluster.rb index cc6380a34e..9c128b029a 100644 --- a/lib/mongo/cluster.rb +++ b/lib/mongo/cluster.rb @@ -502,11 +502,25 @@ def scan!(sync=true) true end + # Runs SDAM flow on the cluster. + # + # This method can be invoked to process a new server description returned + # by the server on a monitoring or non-monitoring connection, and also + # by the driver when it marks a server unknown as a result of a (network) + # error. + # # @param [ Server::Description ] previous_desc Previous server description. # @param [ Server::Description ] updated_desc The changed description. + # @param [ Hash ] options Options. + # + # @option options [ true | false ] :keep_connection_pool Usually when the + # new server description is unknown, the connection pool on the + # respective server is cleared. Set this option to true to keep the + # existing connection pool (required when handling not master errors + # on 4.2+ servers). # # @api private - def run_sdam_flow(previous_desc, updated_desc) + def run_sdam_flow(previous_desc, updated_desc, options = {}) @sdam_flow_lock.synchronize do flow = SdamFlow.new(self, previous_desc, updated_desc) flow.server_description_changed @@ -514,6 +528,16 @@ def run_sdam_flow(previous_desc, updated_desc) # SDAM flow may alter the updated description - grab the final # version for the purposes of broadcasting if a server is available updated_desc = flow.updated_desc + + unless options[:keep_connection_pool] + if flow.became_unknown? + servers_list.each do |server| + if server.address == updated_desc.address + server.clear_connection_pool + end + end + end + end end # Some updated descriptions, e.g. a mismatched me one, result in the diff --git a/lib/mongo/cluster/sdam_flow.rb b/lib/mongo/cluster/sdam_flow.rb index b137dfe479..98261de0a3 100644 --- a/lib/mongo/cluster/sdam_flow.rb +++ b/lib/mongo/cluster/sdam_flow.rb @@ -28,7 +28,7 @@ class SdamFlow def initialize(cluster, previous_desc, updated_desc) @cluster = cluster @topology = cluster.topology - @previous_desc = previous_desc + @original_desc = @previous_desc = previous_desc @updated_desc = updated_desc end @@ -48,6 +48,7 @@ def initialize(cluster, previous_desc, updated_desc) attr_reader :previous_desc attr_reader :updated_desc + attr_reader :original_desc def_delegators :topology, :replica_set_name @@ -490,5 +491,12 @@ def stale_primary? end false end + + # Returns whether the server whose description this flow processed + # was not previously unknown, and is now. Used to decide, in particular, + # whether to clear the server's connection pool. + def became_unknown? + updated_desc.unknown? && !original_desc.unknown? + end end end diff --git a/lib/mongo/operation/shared/executable.rb b/lib/mongo/operation/shared/executable.rb index 66e15694ca..47a05ecae5 100644 --- a/lib/mongo/operation/shared/executable.rb +++ b/lib/mongo/operation/shared/executable.rb @@ -62,18 +62,13 @@ def process_result(result, server) if result.not_master? || result.node_recovering? if result.node_shutting_down? - disconnect_pool = true + keep_pool = false else - # Max wire version needs to be checked prior to marking the - # server unknown - disconnect_pool = !server.description.server_version_gte?('4.2') + # Max wire version needs to be examined while the server is known + keep_pool = server.description.server_version_gte?('4.2') end - server.unknown! - - if disconnect_pool - server.pool.disconnect! - end + server.unknown!(keep_connection_pool: keep_pool) server.scan_semaphore.signal end diff --git a/lib/mongo/server.rb b/lib/mongo/server.rb index 7e05f57e61..eb852d72fa 100644 --- a/lib/mongo/server.rb +++ b/lib/mongo/server.rb @@ -432,7 +432,6 @@ def handle_auth_failure! rescue Mongo::Error::SocketError # non-timeout network error unknown! - pool.disconnect! raise rescue Auth::Unauthorized # auth error, keep server description and topology as they are @@ -465,11 +464,19 @@ def retry_writes? # Marks server unknown and publishes the associated SDAM event # (server description changed). # + # @param [ Hash ] options Options. + # + # @option options [ true | false ] :keep_connection_pool Usually when the + # new server description is unknown, the connection pool on the + # respective server is cleared. Set this option to true to keep the + # existing connection pool (required when handling not master errors + # on 4.2+ servers). + # # @since 2.4.0, SDAM events are sent as of version 2.7.0 - def unknown! + def unknown!(options = {}) # SDAM flow will update description on the server without in-place # mutations and invoke SDAM transitions as needed. - cluster.run_sdam_flow(description, Description.new(address)) + cluster.run_sdam_flow(description, Description.new(address), options) end # @api private @@ -477,6 +484,15 @@ def update_description(description) @description = description end + # @api private + def clear_connection_pool + @pool_lock.synchronize do + if @pool + @pool.disconnect! + end + end + end + # @api private def next_connection_id @connection_id_gen.next_id diff --git a/lib/mongo/server/connection.rb b/lib/mongo/server/connection.rb index 2dbd0efbd4..cc0c7a97a6 100644 --- a/lib/mongo/server/connection.rb +++ b/lib/mongo/server/connection.rb @@ -384,7 +384,6 @@ def deliver(message) # Important: timeout errors are not handled here rescue Error::SocketError @server.unknown! - @server.pool.disconnect! raise end end From d44d130af1730856ff2465d2c1cad41b8f6bd0a4 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Mon, 5 Aug 2019 15:09:52 -0400 Subject: [PATCH 2/6] Comment out tests that are no longer applicable --- spec/mongo/server/connection_spec.rb | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/spec/mongo/server/connection_spec.rb b/spec/mongo/server/connection_spec.rb index 5c1c86c524..206f192d5e 100644 --- a/spec/mongo/server/connection_spec.rb +++ b/spec/mongo/server/connection_spec.rb @@ -262,6 +262,7 @@ class ConnectionSpecTestException < Exception; end end end +=begin These assertions require a working cluster with working SDAM flow, which the tests do not configure shared_examples_for 'does not disconnect connection pool' do it 'does not disconnect non-monitoring sockets' do allow(server).to receive(:pool).and_return(pool) @@ -277,6 +278,7 @@ class ConnectionSpecTestException < Exception; end error end end +=end let(:auth_mechanism) do if ClusterConfig.instance.server_version >= '3' @@ -323,14 +325,14 @@ class ConnectionSpecTestException < Exception; end expect(error).to be_a(Mongo::Auth::Unauthorized) end - it_behaves_like 'disconnects connection pool' + #it_behaves_like 'disconnects connection pool' it_behaves_like 'keeps server type and topology' end # need a separate context here, otherwise disconnect expectation # is ignored due to allowing disconnects in the other context context 'checking pool disconnection' do - it_behaves_like 'disconnects connection pool' + #it_behaves_like 'disconnects connection pool' end end @@ -360,7 +362,7 @@ class ConnectionSpecTestException < Exception; end expect(error).to be_a(Mongo::Error::SocketTimeoutError) end - it_behaves_like 'does not disconnect connection pool' + #it_behaves_like 'does not disconnect connection pool' it_behaves_like 'keeps server type and topology' end @@ -390,7 +392,7 @@ class ConnectionSpecTestException < Exception; end expect(error).to be_a(Mongo::Error::SocketError) end - it_behaves_like 'disconnects connection pool' + #it_behaves_like 'disconnects connection pool' it_behaves_like 'marks server unknown' end @@ -731,10 +733,12 @@ class ConnectionSpecTestException < Exception; end expect(connection).to_not be_connected end +=begin These assertions require a working cluster with working SDAM flow, which the tests do not configure it 'does not disconnect connection pool' do expect(server.pool).not_to receive(:disconnect!) result end +=end it 'does not mark server unknown' do expect(server).not_to be_unknown From 9e44d474b0cecc66eca42d212112e1009a145a11 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Mon, 5 Aug 2019 15:26:11 -0400 Subject: [PATCH 3/6] Integration test for pool being cleared on network errors and timeouts in monitor --- spec/integration/sdam_error_handling_spec.rb | 133 +++++++++++-------- 1 file changed, 78 insertions(+), 55 deletions(-) diff --git a/spec/integration/sdam_error_handling_spec.rb b/spec/integration/sdam_error_handling_spec.rb index 3caca2d86a..af027c613b 100644 --- a/spec/integration/sdam_error_handling_spec.rb +++ b/spec/integration/sdam_error_handling_spec.rb @@ -5,13 +5,63 @@ ClientRegistry.instance.close_all_clients end - describe 'when there is an error during an operation' do + # These tests operate on specific servers, and don't work in a multi + # shard cluster where multiple servers are equally eligible + require_no_multi_shard + + let(:client) { authorized_client_without_any_retries } + + let(:server) { client.cluster.next_primary } + + shared_examples_for 'marks server unknown' do + it 'marks server unknown' do + expect(server).not_to be_unknown + operation + expect(server).to be_unknown + end + end + + shared_examples_for 'does not mark server unknown' do + it 'does not mark server unknown' do + expect(server).not_to be_unknown + operation + expect(server).not_to be_unknown + end + end - # These tests operate on specific servers, and don't work in a multi - # shard cluster where multiple servers are equally eligible - require_no_multi_shard + shared_examples_for 'requests server scan' do + it 'requests server scan' do + expect(server.scan_semaphore).to receive(:signal) + operation + end + end - let(:client) { authorized_client_without_any_retries } + shared_examples_for 'does not request server scan' do + it 'does not request server scan' do + expect(server.scan_semaphore).not_to receive(:signal) + operation + end + end + + shared_examples_for 'clears connection pool' do + it 'clears connection pool' do + generation = server.pool.generation + operation + new_generation = server.pool.generation + expect(new_generation).to eq(generation + 1) + end + end + + shared_examples_for 'does not clear connection pool' do + it 'does not clear connection pool' do + generation = server.pool.generation + operation + new_generation = server.pool.generation + expect(new_generation).to eq(generation) + end + end + + describe 'when there is an error during an operation' do before do wait_for_all_servers(client.cluster) @@ -24,8 +74,6 @@ end end - let(:server) { client.cluster.next_primary } - let(:operation) do expect_any_instance_of(Mongo::Server::Connection).to receive(:deliver).and_return(reply) expect do @@ -33,54 +81,6 @@ end.to raise_error(Mongo::Error::OperationFailure, exception_message) end - shared_examples_for 'marks server unknown' do - it 'marks server unknown' do - expect(server).not_to be_unknown - operation - expect(server).to be_unknown - end - end - - shared_examples_for 'does not mark server unknown' do - it 'does not mark server unknown' do - expect(server).not_to be_unknown - operation - expect(server).not_to be_unknown - end - end - - shared_examples_for 'requests server scan' do - it 'requests server scan' do - expect(server.scan_semaphore).to receive(:signal) - operation - end - end - - shared_examples_for 'does not request server scan' do - it 'does not request server scan' do - expect(server.scan_semaphore).not_to receive(:signal) - operation - end - end - - shared_examples_for 'clears connection pool' do - it 'clears connection pool' do - generation = server.pool.generation - operation - new_generation = server.pool.generation - expect(new_generation).to eq(generation + 1) - end - end - - shared_examples_for 'does not clear connection pool' do - it 'does not clear connection pool' do - generation = server.pool.generation - operation - new_generation = server.pool.generation - expect(new_generation).to eq(generation) - end - end - shared_examples_for 'not master or node recovering' do it_behaves_like 'marks server unknown' it_behaves_like 'requests server scan' @@ -170,4 +170,27 @@ end end end + + describe 'when there is an error on monitoring connection' do + let(:operation) do + expect(server.monitor.connection).not_to be nil + expect(server.monitor.connection).to receive(:ismaster).and_raise(exception) + server.scan_semaphore.broadcast + sleep 2 + end + + context 'non-timeout network error' do + let(:exception) { Mongo::Error::SocketError } + + it_behaves_like 'marks server unknown' + it_behaves_like 'clears connection pool' + end + + context 'network timeout' do + let(:exception) { Mongo::Error::SocketTimeoutError } + + it_behaves_like 'marks server unknown' + it_behaves_like 'clears connection pool' + end + end end From d01fc69026bd4df057746c394744c7f5ecef9f6a Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Mon, 5 Aug 2019 15:43:04 -0400 Subject: [PATCH 4/6] improve test reliability --- spec/integration/sdam_error_handling_spec.rb | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/spec/integration/sdam_error_handling_spec.rb b/spec/integration/sdam_error_handling_spec.rb index af027c613b..081a4fcc7b 100644 --- a/spec/integration/sdam_error_handling_spec.rb +++ b/spec/integration/sdam_error_handling_spec.rb @@ -174,9 +174,15 @@ describe 'when there is an error on monitoring connection' do let(:operation) do expect(server.monitor.connection).not_to be nil - expect(server.monitor.connection).to receive(:ismaster).and_raise(exception) + expect(server.monitor.connection).to receive(:ismaster).at_least(:once).and_raise(exception) server.scan_semaphore.broadcast - sleep 2 + 6.times do + sleep 0.5 + if server.unknown? + break + end + end + expect(server).to be_unknown end context 'non-timeout network error' do From 8d9ce8720755d65637ce9110c5db3b31aa0b0563 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Mon, 5 Aug 2019 16:20:42 -0400 Subject: [PATCH 5/6] try lower timeouts --- spec/integration/sdam_error_handling_spec.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spec/integration/sdam_error_handling_spec.rb b/spec/integration/sdam_error_handling_spec.rb index 081a4fcc7b..b8f82a77e9 100644 --- a/spec/integration/sdam_error_handling_spec.rb +++ b/spec/integration/sdam_error_handling_spec.rb @@ -172,6 +172,11 @@ end describe 'when there is an error on monitoring connection' do + let(:client) do + authorized_client_without_any_retries.with( + connect_timeout: 1, socket_timeout: 1) + end + let(:operation) do expect(server.monitor.connection).not_to be nil expect(server.monitor.connection).to receive(:ismaster).at_least(:once).and_raise(exception) From 95ece077c3f073a938de24d44d4535780dfbba10 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Mon, 5 Aug 2019 16:54:17 -0400 Subject: [PATCH 6/6] Retry all monitoring tests --- spec/integration/sdam_error_handling_spec.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spec/integration/sdam_error_handling_spec.rb b/spec/integration/sdam_error_handling_spec.rb index b8f82a77e9..123b9b5518 100644 --- a/spec/integration/sdam_error_handling_spec.rb +++ b/spec/integration/sdam_error_handling_spec.rb @@ -171,7 +171,8 @@ end end - describe 'when there is an error on monitoring connection' do + # These tests fail intermittently in Evergreen + describe 'when there is an error on monitoring connection', retry: 3 do let(:client) do authorized_client_without_any_retries.with( connect_timeout: 1, socket_timeout: 1)