Skip to content

Commit e14fddd

Browse files
authored
RUBY-1894 Clear connection pools when monitor ismaster times out (#1475)
* Clear server pool when a server is marked unknown * Comment out tests that are no longer applicable * Integration test for pool being cleared on network errors and timeouts in monitor * improve test reliability * try lower timeouts * Retry all monitoring tests
1 parent 675a36a commit e14fddd

File tree

7 files changed

+155
-74
lines changed

7 files changed

+155
-74
lines changed

lib/mongo/cluster.rb

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -502,18 +502,42 @@ def scan!(sync=true)
502502
true
503503
end
504504

505+
# Runs SDAM flow on the cluster.
506+
#
507+
# This method can be invoked to process a new server description returned
508+
# by the server on a monitoring or non-monitoring connection, and also
509+
# by the driver when it marks a server unknown as a result of a (network)
510+
# error.
511+
#
505512
# @param [ Server::Description ] previous_desc Previous server description.
506513
# @param [ Server::Description ] updated_desc The changed description.
514+
# @param [ Hash ] options Options.
515+
#
516+
# @option options [ true | false ] :keep_connection_pool Usually when the
517+
# new server description is unknown, the connection pool on the
518+
# respective server is cleared. Set this option to true to keep the
519+
# existing connection pool (required when handling not master errors
520+
# on 4.2+ servers).
507521
#
508522
# @api private
509-
def run_sdam_flow(previous_desc, updated_desc)
523+
def run_sdam_flow(previous_desc, updated_desc, options = {})
510524
@sdam_flow_lock.synchronize do
511525
flow = SdamFlow.new(self, previous_desc, updated_desc)
512526
flow.server_description_changed
513527

514528
# SDAM flow may alter the updated description - grab the final
515529
# version for the purposes of broadcasting if a server is available
516530
updated_desc = flow.updated_desc
531+
532+
unless options[:keep_connection_pool]
533+
if flow.became_unknown?
534+
servers_list.each do |server|
535+
if server.address == updated_desc.address
536+
server.clear_connection_pool
537+
end
538+
end
539+
end
540+
end
517541
end
518542

519543
# Some updated descriptions, e.g. a mismatched me one, result in the

lib/mongo/cluster/sdam_flow.rb

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class SdamFlow
2828
def initialize(cluster, previous_desc, updated_desc)
2929
@cluster = cluster
3030
@topology = cluster.topology
31-
@previous_desc = previous_desc
31+
@original_desc = @previous_desc = previous_desc
3232
@updated_desc = updated_desc
3333
end
3434

@@ -48,6 +48,7 @@ def initialize(cluster, previous_desc, updated_desc)
4848

4949
attr_reader :previous_desc
5050
attr_reader :updated_desc
51+
attr_reader :original_desc
5152

5253
def_delegators :topology, :replica_set_name
5354

@@ -490,5 +491,12 @@ def stale_primary?
490491
end
491492
false
492493
end
494+
495+
# Returns whether the server whose description this flow processed
496+
# was not previously unknown, and is now. Used to decide, in particular,
497+
# whether to clear the server's connection pool.
498+
def became_unknown?
499+
updated_desc.unknown? && !original_desc.unknown?
500+
end
493501
end
494502
end

lib/mongo/operation/shared/executable.rb

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -62,18 +62,13 @@ def process_result(result, server)
6262

6363
if result.not_master? || result.node_recovering?
6464
if result.node_shutting_down?
65-
disconnect_pool = true
65+
keep_pool = false
6666
else
67-
# Max wire version needs to be checked prior to marking the
68-
# server unknown
69-
disconnect_pool = !server.description.server_version_gte?('4.2')
67+
# Max wire version needs to be examined while the server is known
68+
keep_pool = server.description.server_version_gte?('4.2')
7069
end
7170

72-
server.unknown!
73-
74-
if disconnect_pool
75-
server.pool.disconnect!
76-
end
71+
server.unknown!(keep_connection_pool: keep_pool)
7772

7873
server.scan_semaphore.signal
7974
end

lib/mongo/server.rb

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,6 @@ def handle_auth_failure!
432432
rescue Mongo::Error::SocketError
433433
# non-timeout network error
434434
unknown!
435-
pool.disconnect!
436435
raise
437436
rescue Auth::Unauthorized
438437
# auth error, keep server description and topology as they are
@@ -465,18 +464,35 @@ def retry_writes?
465464
# Marks server unknown and publishes the associated SDAM event
466465
# (server description changed).
467466
#
467+
# @param [ Hash ] options Options.
468+
#
469+
# @option options [ true | false ] :keep_connection_pool Usually when the
470+
# new server description is unknown, the connection pool on the
471+
# respective server is cleared. Set this option to true to keep the
472+
# existing connection pool (required when handling not master errors
473+
# on 4.2+ servers).
474+
#
468475
# @since 2.4.0, SDAM events are sent as of version 2.7.0
469-
def unknown!
476+
def unknown!(options = {})
470477
# SDAM flow will update description on the server without in-place
471478
# mutations and invoke SDAM transitions as needed.
472-
cluster.run_sdam_flow(description, Description.new(address))
479+
cluster.run_sdam_flow(description, Description.new(address), options)
473480
end
474481

475482
# @api private
476483
def update_description(description)
477484
@description = description
478485
end
479486

487+
# @api private
488+
def clear_connection_pool
489+
@pool_lock.synchronize do
490+
if @pool
491+
@pool.disconnect!
492+
end
493+
end
494+
end
495+
480496
# @api private
481497
def next_connection_id
482498
@connection_id_gen.next_id

lib/mongo/server/connection.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -384,7 +384,6 @@ def deliver(message)
384384
# Important: timeout errors are not handled here
385385
rescue Error::SocketError
386386
@server.unknown!
387-
@server.pool.disconnect!
388387
raise
389388
end
390389
end

spec/integration/sdam_error_handling_spec.rb

Lines changed: 90 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,63 @@
55
ClientRegistry.instance.close_all_clients
66
end
77

8-
describe 'when there is an error during an operation' do
8+
# These tests operate on specific servers, and don't work in a multi
9+
# shard cluster where multiple servers are equally eligible
10+
require_no_multi_shard
11+
12+
let(:client) { authorized_client_without_any_retries }
913

10-
# These tests operate on specific servers, and don't work in a multi
11-
# shard cluster where multiple servers are equally eligible
12-
require_no_multi_shard
14+
let(:server) { client.cluster.next_primary }
15+
16+
shared_examples_for 'marks server unknown' do
17+
it 'marks server unknown' do
18+
expect(server).not_to be_unknown
19+
operation
20+
expect(server).to be_unknown
21+
end
22+
end
1323

14-
let(:client) { authorized_client_without_any_retries }
24+
shared_examples_for 'does not mark server unknown' do
25+
it 'does not mark server unknown' do
26+
expect(server).not_to be_unknown
27+
operation
28+
expect(server).not_to be_unknown
29+
end
30+
end
31+
32+
shared_examples_for 'requests server scan' do
33+
it 'requests server scan' do
34+
expect(server.scan_semaphore).to receive(:signal)
35+
operation
36+
end
37+
end
38+
39+
shared_examples_for 'does not request server scan' do
40+
it 'does not request server scan' do
41+
expect(server.scan_semaphore).not_to receive(:signal)
42+
operation
43+
end
44+
end
45+
46+
shared_examples_for 'clears connection pool' do
47+
it 'clears connection pool' do
48+
generation = server.pool.generation
49+
operation
50+
new_generation = server.pool.generation
51+
expect(new_generation).to eq(generation + 1)
52+
end
53+
end
54+
55+
shared_examples_for 'does not clear connection pool' do
56+
it 'does not clear connection pool' do
57+
generation = server.pool.generation
58+
operation
59+
new_generation = server.pool.generation
60+
expect(new_generation).to eq(generation)
61+
end
62+
end
63+
64+
describe 'when there is an error during an operation' do
1565

1666
before do
1767
wait_for_all_servers(client.cluster)
@@ -24,63 +74,13 @@
2474
end
2575
end
2676

27-
let(:server) { client.cluster.next_primary }
28-
2977
let(:operation) do
3078
expect_any_instance_of(Mongo::Server::Connection).to receive(:deliver).and_return(reply)
3179
expect do
3280
client.database.command(ping: 1)
3381
end.to raise_error(Mongo::Error::OperationFailure, exception_message)
3482
end
3583

36-
shared_examples_for 'marks server unknown' do
37-
it 'marks server unknown' do
38-
expect(server).not_to be_unknown
39-
operation
40-
expect(server).to be_unknown
41-
end
42-
end
43-
44-
shared_examples_for 'does not mark server unknown' do
45-
it 'does not mark server unknown' do
46-
expect(server).not_to be_unknown
47-
operation
48-
expect(server).not_to be_unknown
49-
end
50-
end
51-
52-
shared_examples_for 'requests server scan' do
53-
it 'requests server scan' do
54-
expect(server.scan_semaphore).to receive(:signal)
55-
operation
56-
end
57-
end
58-
59-
shared_examples_for 'does not request server scan' do
60-
it 'does not request server scan' do
61-
expect(server.scan_semaphore).not_to receive(:signal)
62-
operation
63-
end
64-
end
65-
66-
shared_examples_for 'clears connection pool' do
67-
it 'clears connection pool' do
68-
generation = server.pool.generation
69-
operation
70-
new_generation = server.pool.generation
71-
expect(new_generation).to eq(generation + 1)
72-
end
73-
end
74-
75-
shared_examples_for 'does not clear connection pool' do
76-
it 'does not clear connection pool' do
77-
generation = server.pool.generation
78-
operation
79-
new_generation = server.pool.generation
80-
expect(new_generation).to eq(generation)
81-
end
82-
end
83-
8484
shared_examples_for 'not master or node recovering' do
8585
it_behaves_like 'marks server unknown'
8686
it_behaves_like 'requests server scan'
@@ -170,4 +170,39 @@
170170
end
171171
end
172172
end
173+
174+
# These tests fail intermittently in Evergreen
175+
describe 'when there is an error on monitoring connection', retry: 3 do
176+
let(:client) do
177+
authorized_client_without_any_retries.with(
178+
connect_timeout: 1, socket_timeout: 1)
179+
end
180+
181+
let(:operation) do
182+
expect(server.monitor.connection).not_to be nil
183+
expect(server.monitor.connection).to receive(:ismaster).at_least(:once).and_raise(exception)
184+
server.scan_semaphore.broadcast
185+
6.times do
186+
sleep 0.5
187+
if server.unknown?
188+
break
189+
end
190+
end
191+
expect(server).to be_unknown
192+
end
193+
194+
context 'non-timeout network error' do
195+
let(:exception) { Mongo::Error::SocketError }
196+
197+
it_behaves_like 'marks server unknown'
198+
it_behaves_like 'clears connection pool'
199+
end
200+
201+
context 'network timeout' do
202+
let(:exception) { Mongo::Error::SocketTimeoutError }
203+
204+
it_behaves_like 'marks server unknown'
205+
it_behaves_like 'clears connection pool'
206+
end
207+
end
173208
end

spec/mongo/server/connection_spec.rb

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ class ConnectionSpecTestException < Exception; end
262262
end
263263
end
264264

265+
=begin These assertions require a working cluster with working SDAM flow, which the tests do not configure
265266
shared_examples_for 'does not disconnect connection pool' do
266267
it 'does not disconnect non-monitoring sockets' do
267268
allow(server).to receive(:pool).and_return(pool)
@@ -277,6 +278,7 @@ class ConnectionSpecTestException < Exception; end
277278
error
278279
end
279280
end
281+
=end
280282

281283
let(:auth_mechanism) do
282284
if ClusterConfig.instance.server_version >= '3'
@@ -323,14 +325,14 @@ class ConnectionSpecTestException < Exception; end
323325
expect(error).to be_a(Mongo::Auth::Unauthorized)
324326
end
325327

326-
it_behaves_like 'disconnects connection pool'
328+
#it_behaves_like 'disconnects connection pool'
327329
it_behaves_like 'keeps server type and topology'
328330
end
329331

330332
# need a separate context here, otherwise disconnect expectation
331333
# is ignored due to allowing disconnects in the other context
332334
context 'checking pool disconnection' do
333-
it_behaves_like 'disconnects connection pool'
335+
#it_behaves_like 'disconnects connection pool'
334336
end
335337
end
336338

@@ -360,7 +362,7 @@ class ConnectionSpecTestException < Exception; end
360362
expect(error).to be_a(Mongo::Error::SocketTimeoutError)
361363
end
362364

363-
it_behaves_like 'does not disconnect connection pool'
365+
#it_behaves_like 'does not disconnect connection pool'
364366
it_behaves_like 'keeps server type and topology'
365367
end
366368

@@ -390,7 +392,7 @@ class ConnectionSpecTestException < Exception; end
390392
expect(error).to be_a(Mongo::Error::SocketError)
391393
end
392394

393-
it_behaves_like 'disconnects connection pool'
395+
#it_behaves_like 'disconnects connection pool'
394396
it_behaves_like 'marks server unknown'
395397
end
396398

@@ -731,10 +733,12 @@ class ConnectionSpecTestException < Exception; end
731733
expect(connection).to_not be_connected
732734
end
733735

736+
=begin These assertions require a working cluster with working SDAM flow, which the tests do not configure
734737
it 'does not disconnect connection pool' do
735738
expect(server.pool).not_to receive(:disconnect!)
736739
result
737740
end
741+
=end
738742

739743
it 'does not mark server unknown' do
740744
expect(server).not_to be_unknown

0 commit comments

Comments
 (0)