From 6a4729790072c357ea29b41612231e963ca98dd6 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Fri, 18 Jun 2021 21:06:55 -0400 Subject: [PATCH 01/18] WIP: EnforceSemiSyncReplicaCount --- go/config/config.go | 2 + go/inst/analysis.go | 41 ++++++------ go/inst/analysis_dao.go | 4 ++ go/logic/topology_recovery.go | 116 +++++++++++++++++++++++++++++++++- 4 files changed, 141 insertions(+), 22 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index 4cae4ebff..b944b654f 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -275,6 +275,7 @@ type Configuration struct { KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" WebMessage string // If provided, will be shown on all web pages below the title bar MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas + EnforceSemiSyncReplicaCount bool } // ToJSONString will marshal this configuration as JSON @@ -446,6 +447,7 @@ func newConfiguration() *Configuration { KVClusterMasterPrefix: "mysql/master", WebMessage: "", MaxConcurrentReplicaOperations: 5, + EnforceSemiSyncReplicaCount: false, } } diff --git a/go/inst/analysis.go b/go/inst/analysis.go index 1d6ab3692..633bd8f10 100644 --- a/go/inst/analysis.go +++ b/go/inst/analysis.go @@ -30,26 +30,27 @@ const ( NoProblem AnalysisCode = "NoProblem" DeadMasterWithoutReplicas = "DeadMasterWithoutReplicas" DeadMaster = "DeadMaster" - DeadMasterAndReplicas = "DeadMasterAndReplicas" - DeadMasterAndSomeReplicas = "DeadMasterAndSomeReplicas" - UnreachableMasterWithLaggingReplicas = "UnreachableMasterWithLaggingReplicas" - UnreachableMaster = "UnreachableMaster" - MasterSingleReplicaNotReplicating = "MasterSingleReplicaNotReplicating" - MasterSingleReplicaDead = "MasterSingleReplicaDead" - AllMasterReplicasNotReplicating = "AllMasterReplicasNotReplicating" - AllMasterReplicasNotReplicatingOrDead = "AllMasterReplicasNotReplicatingOrDead" - LockedSemiSyncMasterHypothesis = "LockedSemiSyncMasterHypothesis" - LockedSemiSyncMaster = "LockedSemiSyncMaster" - MasterWithoutReplicas = "MasterWithoutReplicas" - DeadCoMaster = "DeadCoMaster" - DeadCoMasterAndSomeReplicas = "DeadCoMasterAndSomeReplicas" - UnreachableCoMaster = "UnreachableCoMaster" - AllCoMasterReplicasNotReplicating = "AllCoMasterReplicasNotReplicating" - DeadIntermediateMaster = "DeadIntermediateMaster" - DeadIntermediateMasterWithSingleReplica = "DeadIntermediateMasterWithSingleReplica" - DeadIntermediateMasterWithSingleReplicaFailingToConnect = "DeadIntermediateMasterWithSingleReplicaFailingToConnect" - DeadIntermediateMasterAndSomeReplicas = "DeadIntermediateMasterAndSomeReplicas" - DeadIntermediateMasterAndReplicas = "DeadIntermediateMasterAndReplicas" + DeadMasterAndReplicas = "DeadMasterAndReplicas" + DeadMasterAndSomeReplicas = "DeadMasterAndSomeReplicas" + UnreachableMasterWithLaggingReplicas = "UnreachableMasterWithLaggingReplicas" + UnreachableMaster = "UnreachableMaster" + MasterSingleReplicaNotReplicating = "MasterSingleReplicaNotReplicating" + MasterSingleReplicaDead = "MasterSingleReplicaDead" + AllMasterReplicasNotReplicating = "AllMasterReplicasNotReplicating" + AllMasterReplicasNotReplicatingOrDead = "AllMasterReplicasNotReplicatingOrDead" + LockedSemiSyncMasterHypothesis = "LockedSemiSyncMasterHypothesis" + LockedSemiSyncMaster = "LockedSemiSyncMaster" + MasterWithTooManySemiSyncReplicas = "MasterWithTooManySemiSyncReplicas" + MasterWithoutReplicas = "MasterWithoutReplicas" + DeadCoMaster = 
"DeadCoMaster" + DeadCoMasterAndSomeReplicas = "DeadCoMasterAndSomeReplicas" + UnreachableCoMaster = "UnreachableCoMaster" + AllCoMasterReplicasNotReplicating = "AllCoMasterReplicasNotReplicating" + DeadIntermediateMaster = "DeadIntermediateMaster" + DeadIntermediateMasterWithSingleReplica = "DeadIntermediateMasterWithSingleReplica" + DeadIntermediateMasterWithSingleReplicaFailingToConnect = "DeadIntermediateMasterWithSingleReplicaFailingToConnect" + DeadIntermediateMasterAndSomeReplicas = "DeadIntermediateMasterAndSomeReplicas" + DeadIntermediateMasterAndReplicas = "DeadIntermediateMasterAndReplicas" UnreachableIntermediateMasterWithLaggingReplicas = "UnreachableIntermediateMasterWithLaggingReplicas" UnreachableIntermediateMaster = "UnreachableIntermediateMaster" AllIntermediateMasterReplicasFailingToConnectOrDead = "AllIntermediateMasterReplicasFailingToConnectOrDead" diff --git a/go/inst/analysis_dao.go b/go/inst/analysis_dao.go index 4b00281e4..bef8463e5 100644 --- a/go/inst/analysis_dao.go +++ b/go/inst/analysis_dao.go @@ -531,6 +531,10 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.Description = "Semi sync master seems to be locked, more samplings needed to validate" } // + } else if config.Config.EnforceSemiSyncReplicaCount && a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients > a.SemiSyncMasterWaitForReplicaCount { + a.Analysis = MasterWithTooManySemiSyncReplicas + a.Description = "Semi sync master has more semi sync replicas than configured" + // } else if a.IsMaster && a.LastCheckValid && a.IsReadOnly && a.CountValidReplicatingReplicas > 0 && config.Config.RecoverNonWriteableMaster { a.Analysis = NoWriteableMasterStructureWarning a.Description = "Master with replicas is read_only" diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 39a46bfa5..c46788aba 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1479,14 +1479,124 @@ func checkAndRecoverNonWriteableMaster(analysisEntry inst.ReplicationAnalysis, c // checkAndRecoverLockedSemiSyncMaster func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { - topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, true, true) if topologyRecovery == nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverLockedSemiSyncMaster.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - return false, nil, nil + if !config.Config.EnforceSemiSyncReplicaCount { + return false, nil, nil + } + + replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) + if err != nil { + return false, topologyRecovery, err + } + if len(replicas) == 0 { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no replicas found for %+v.", analysisEntry.AnalyzedInstanceKey)) + return false, topologyRecovery, nil + } + + reversePriorityReplicas := replicas // TODO sort replicas by priority + enableCount := int(analysisEntry.SemiSyncMasterWaitForReplicaCount) - int(analysisEntry.CountSemiSyncReplicasEnabled) + if enableCount <= 0 || enableCount > len(replicas) { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("invalid state for recovery %+v. 
Wait count is lower or equal to semi sync replica count or replica count too low.", analysisEntry.AnalyzedInstanceKey)) + return false, nil, err + } + + // No way back. We're doing this. + + // TODO before actually doing it, do sanity check (downtimed, etc.) + + enabled := 0 + for _, replica := range reversePriorityReplicas { + if replica.IsDowntimed || replica.SemiSyncReplicaEnabled { + continue + } + err := inst.EnableSemiSync(&replica.Key, false, true) + if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot enable semi sync on replica %+v.", replica.Key)) + return true, topologyRecovery, nil + } + err = inst.RestartReplicationQuick(&replica.Key) + if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot restart replication on replica %+v.", replica.Key)) + return true, topologyRecovery, nil + } + enabled++ + if enabled == enableCount { + break + } + } + + resolveRecovery(topologyRecovery, nil) + return true, topologyRecovery, nil +} + +// checkAndRecoverMasterWithTooManySemiSyncReplicas +func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { + topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, true, true) + if topologyRecovery == nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverMasterWithTooManySemiSyncReplicas.", analysisEntry.AnalyzedInstanceKey)) + return false, nil, err + } + + // FIXME is this the right spot for this check? We shouldn't even get here right? + if !config.Config.EnforceSemiSyncReplicaCount { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi sync replica count enforcement not enabled. Bailng out on recovery of %+v.", analysisEntry.AnalyzedInstanceKey)) + return false, topologyRecovery, err + } + + instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey) + if !found || err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey)) + return false, topologyRecovery, err + } + + replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) + if err != nil { + return false, topologyRecovery, err + } + if len(replicas) == 0 { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no replicas found for %+v.", analysisEntry.AnalyzedInstanceKey)) + return false, topologyRecovery, nil + } + + reversePriorityReplicas := replicas // TODO reverse-sort replicas by priority + disableCount := int(analysisEntry.CountSemiSyncReplicasEnabled) - int(analysisEntry.SemiSyncMasterWaitForReplicaCount) + if disableCount <= 0 || disableCount > len(replicas) { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("invalid state for recovery %+v. Wait count is greater or equal to semi sync replica count or replica count too low.", analysisEntry.AnalyzedInstanceKey)) + return false, nil, err + } + + // No way back. We're doing this. 
+ + // TODO do preflight checks + + disabled := 0 + for _, replica := range reversePriorityReplicas { + if !replica.SemiSyncReplicaEnabled { + continue + } + err := inst.EnableSemiSync(&replica.Key, false, false) + if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot disable semi sync on replica %+v.", replica.Key)) + return true, topologyRecovery, nil + } + err = inst.RestartReplicationQuick(&replica.Key) + if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot restart replication on replica %+v.", replica.Key)) + return true, topologyRecovery, nil + } + disabled++ + if disabled == disableCount { + break + } + } + + resolveRecovery(topologyRecovery, instance) + return true, topologyRecovery, nil } // checkAndRecoverGenericProblem is a general-purpose recovery function @@ -1659,6 +1769,8 @@ func getCheckAndRecoverFunction(analysisCode inst.AnalysisCode, analyzedInstance } else { return checkAndRecoverLockedSemiSyncMaster, true } + case inst.MasterWithTooManySemiSyncReplicas: + return checkAndRecoverMasterWithTooManySemiSyncReplicas, true // intermediate master case inst.DeadIntermediateMaster: return checkAndRecoverDeadIntermediateMaster, true From 26f85da9b6c03353eaa08f59494384095fcc78fb Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Mon, 28 Jun 2021 20:53:13 -0400 Subject: [PATCH 02/18] Redo SemiSyncEnforced as priority; sort replicas by priority, then promotion_rule; only handle "exact" case for now --- go/config/config.go | 46 +++++++++++++------------- go/inst/analysis_dao.go | 2 +- go/inst/instance.go | 2 +- go/inst/instance_dao.go | 2 +- go/inst/instance_topology_dao.go | 8 ++--- go/logic/topology_recovery.go | 55 +++++++++++++++++++++++++++++--- 6 files changed, 81 insertions(+), 34 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index 41ab044e7..c8fb179a7 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -62,6 +62,8 @@ const ( SelectTrueQuery = "select 1" ConsulKVsPerCluster = 5 // KVs: "/", "/hostname", "/ipv4", "/ipv6" and "/port" ConsulMaxTransactionOps = 64 + EnforceExactSemiSyncReplicas = "exact" + EnforceEnoughSemiSyncReplicas = "enough" ) var deprecatedConfigurationVariables = []string{ @@ -265,17 +267,17 @@ type Configuration struct { DiscoveryIgnoreReplicaHostnameFilters []string // Regexp filters to apply to prevent auto-discovering new replicas. Usage: unreachable servers due to firewalls, applications which trigger binlog dumps DiscoveryIgnoreMasterHostnameFilters []string // Regexp filters to apply to prevent auto-discovering a master. Usage: pointing your master temporarily to replicate seom data from external host DiscoveryIgnoreHostnameFilters []string // Regexp filters to apply to prevent discovering instances of any kind - ConsulAddress string // Address where Consul HTTP api is found. Example: 127.0.0.1:8500 - ConsulScheme string // Scheme (http or https) for Consul - ConsulAclToken string // ACL token used to write to Consul KV - ConsulCrossDataCenterDistribution bool // should orchestrator automatically auto-deduce all consul DCs and write KVs in all DCs - ConsulKVStoreProvider string // Consul KV store provider (consul or consul-txn), default: "consul" - ConsulMaxKVsPerTransaction int // Maximum number of KV operations to perform in a single Consul Transaction. Requires the "consul-txn" ConsulKVStoreProvider - ZkAddress string // UNSUPPERTED YET. Address where (single or multiple) ZooKeeper servers are found, in `srv1[:port1][,srv2[:port2]...]` format. Default port is 2181. 
Example: srv-a,srv-b:12181,srv-c - KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" - WebMessage string // If provided, will be shown on all web pages below the title bar - MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas - EnforceSemiSyncReplicaCount bool + ConsulAddress string // Address where Consul HTTP api is found. Example: 127.0.0.1:8500 + ConsulScheme string // Scheme (http or https) for Consul + ConsulAclToken string // ACL token used to write to Consul KV + ConsulCrossDataCenterDistribution bool // should orchestrator automatically auto-deduce all consul DCs and write KVs in all DCs + ConsulKVStoreProvider string // Consul KV store provider (consul or consul-txn), default: "consul" + ConsulMaxKVsPerTransaction int // Maximum number of KV operations to perform in a single Consul Transaction. Requires the "consul-txn" ConsulKVStoreProvider + ZkAddress string // UNSUPPERTED YET. Address where (single or multiple) ZooKeeper servers are found, in `srv1[:port1][,srv2[:port2]...]` format. Default port is 2181. Example: srv-a,srv-b:12181,srv-c + KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" + WebMessage string // If provided, will be shown on all web pages below the title bar + MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas + EnforceSemiSyncReplicas string // If empty, semi-sync replicas will not be touched; if "exact", semi-sync replicas will be enabled/disabled to match the wait count; if "enough", semi-sync replicas will be enabled until the wait count is reached } // ToJSONString will marshal this configuration as JSON @@ -437,17 +439,17 @@ func newConfiguration() *Configuration { GraphitePollSeconds: 60, URLPrefix: "", DiscoveryIgnoreReplicaHostnameFilters: []string{}, - ConsulAddress: "", - ConsulScheme: "http", - ConsulAclToken: "", - ConsulCrossDataCenterDistribution: false, - ConsulKVStoreProvider: "consul", - ConsulMaxKVsPerTransaction: ConsulKVsPerCluster, - ZkAddress: "", - KVClusterMasterPrefix: "mysql/master", - WebMessage: "", - MaxConcurrentReplicaOperations: 5, - EnforceSemiSyncReplicaCount: false, + ConsulAddress: "", + ConsulScheme: "http", + ConsulAclToken: "", + ConsulCrossDataCenterDistribution: false, + ConsulKVStoreProvider: "consul", + ConsulMaxKVsPerTransaction: ConsulKVsPerCluster, + ZkAddress: "", + KVClusterMasterPrefix: "mysql/master", + WebMessage: "", + MaxConcurrentReplicaOperations: 5, + EnforceSemiSyncReplicas: "", } } diff --git a/go/inst/analysis_dao.go b/go/inst/analysis_dao.go index bef8463e5..607193d9e 100644 --- a/go/inst/analysis_dao.go +++ b/go/inst/analysis_dao.go @@ -531,7 +531,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.Description = "Semi sync master seems to be locked, more samplings needed to validate" } // - } else if config.Config.EnforceSemiSyncReplicaCount && a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients > a.SemiSyncMasterWaitForReplicaCount { + } else if config.Config.EnforceSemiSyncReplicas == config.EnforceExactSemiSyncReplicas && a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients > a.SemiSyncMasterWaitForReplicaCount { a.Analysis = 
MasterWithTooManySemiSyncReplicas a.Description = "Semi sync master has more semi sync replicas than configured" // diff --git a/go/inst/instance.go b/go/inst/instance.go index 5b0d4e115..3e91e9cb8 100644 --- a/go/inst/instance.go +++ b/go/inst/instance.go @@ -94,7 +94,7 @@ type Instance struct { HasReplicationCredentials bool ReplicationCredentialsAvailable bool SemiSyncAvailable bool // when both semi sync plugins (master & replica) are loaded - SemiSyncEnforced bool + SemiSyncEnforced uint SemiSyncMasterEnabled bool SemiSyncReplicaEnabled bool SemiSyncMasterTimeout uint64 diff --git a/go/inst/instance_dao.go b/go/inst/instance_dao.go index 04e4168d7..379fa63e3 100644 --- a/go/inst/instance_dao.go +++ b/go/inst/instance_dao.go @@ -1209,7 +1209,7 @@ func readInstanceRow(m sqlutils.RowMap) *Instance { instance.DataCenter = m.GetString("data_center") instance.Region = m.GetString("region") instance.PhysicalEnvironment = m.GetString("physical_environment") - instance.SemiSyncEnforced = m.GetBool("semi_sync_enforced") + instance.SemiSyncEnforced = m.GetUint("semi_sync_enforced") instance.SemiSyncAvailable = m.GetBool("semi_sync_available") instance.SemiSyncMasterEnabled = m.GetBool("semi_sync_master_enabled") instance.SemiSyncMasterTimeout = m.GetUint64("semi_sync_master_timeout") diff --git a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go index 1631fe8eb..d3a812028 100644 --- a/go/inst/instance_topology_dao.go +++ b/go/inst/instance_topology_dao.go @@ -456,7 +456,7 @@ func StartReplication(instanceKey *InstanceKey) (*Instance, error) { // some replicas (those that must never be promoted) should never ACK. // Note: We assume that replicas use 'skip-slave-start' so they won't // START SLAVE on their own upon restart. - if instance.SemiSyncEnforced { + if instance.SemiSyncEnforced > 0 { // Send ACK only from promotable instances. sendACK := instance.PromotionRule != MustNotPromoteRule // Always disable master setting, in case we're converting a former master. @@ -550,7 +550,7 @@ func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoor log.Infof("Will start replication on %+v until coordinates: %+v", instanceKey, masterCoordinates) - if instance.SemiSyncEnforced { + if instance.SemiSyncEnforced > 0 { // Send ACK only from promotable instances. sendACK := instance.PromotionRule != MustNotPromoteRule // Always disable master setting, in case we're converting a former master. @@ -1090,7 +1090,7 @@ func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { // If async fallback is disallowed, we're responsible for flipping the master // semi-sync switch ON before accepting writes. The setting is off by default. - if instance.SemiSyncEnforced && !readOnly { + if instance.SemiSyncEnforced > 0 && !readOnly { // Send ACK only from promotable instances. sendACK := instance.PromotionRule != MustNotPromoteRule if err := EnableSemiSync(instanceKey, true, sendACK); err != nil { @@ -1114,7 +1114,7 @@ func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { // If we just went read-only, it's safe to flip the master semi-sync switch // OFF, which is the default value so that replicas can make progress. - if instance.SemiSyncEnforced && readOnly { + if instance.SemiSyncEnforced > 0 && readOnly { // Send ACK only from promotable instances. 
sendACK := instance.PromotionRule != MustNotPromoteRule if err := EnableSemiSync(instanceKey, false, sendACK); err != nil { diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index c46788aba..1d643bb47 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1485,9 +1485,18 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, return false, nil, err } - if !config.Config.EnforceSemiSyncReplicaCount { + if config.Config.EnforceSemiSyncReplicas != config.EnforceExactSemiSyncReplicas && config.Config.EnforceSemiSyncReplicas != config.EnforceEnoughSemiSyncReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync enforcement is not enabled on %+v. Will not issue another RecoverLockedSemiSyncMaster.", analysisEntry.AnalyzedInstanceKey)) return false, nil, nil } + if config.Config.EnforceSemiSyncReplicas == config.EnforceExactSemiSyncReplicas { + return recoverExactSemiSyncReplicas(analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + } + + panic("not supported") + + // Proceed with "enough"-style recovery: + // in priority order, enable semi-sync on replicas until there are enough to match the wait count replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) if err != nil { @@ -1498,6 +1507,10 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, return false, topologyRecovery, nil } + if config.Config.EnforceSemiSyncReplicas == config.EnforceEnoughSemiSyncReplicas { + + } + reversePriorityReplicas := replicas // TODO sort replicas by priority enableCount := int(analysisEntry.SemiSyncMasterWaitForReplicaCount) - int(analysisEntry.CountSemiSyncReplicasEnabled) if enableCount <= 0 || enableCount > len(replicas) { @@ -1542,18 +1555,22 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica return false, nil, err } - // FIXME is this the right spot for this check? We shouldn't even get here right? - if !config.Config.EnforceSemiSyncReplicaCount { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi sync replica count enforcement not enabled. Bailng out on recovery of %+v.", analysisEntry.AnalyzedInstanceKey)) + if config.Config.EnforceSemiSyncReplicas != config.EnforceExactSemiSyncReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi sync replica count enforcement not set to %s. 
Bailing out on recovery of %+v.", config.EnforceExactSemiSyncReplicas, analysisEntry.AnalyzedInstanceKey)) return false, topologyRecovery, err } + return recoverExactSemiSyncReplicas(analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) +} + +func recoverExactSemiSyncReplicas(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey) if !found || err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey)) return false, topologyRecovery, err } + // Read all replicas replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) if err != nil { return false, topologyRecovery, err @@ -1563,6 +1580,34 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica return false, topologyRecovery, nil } + // Filter out downtimed and down replicas + validSemiSyncReplicas := make([]*inst.Instance, 0) + for _, replica := range replicas { + if replica.IsDowntimed || replica.SemiSyncEnforced == 0 { // TODO do i need to check last seen? + continue + } + validSemiSyncReplicas = append(validSemiSyncReplicas, replica) + } + + // Sort replicas by priority & promotion rule + sort.SliceStable(validSemiSyncReplicas, func(i, j int) bool { + if validSemiSyncReplicas[i].SemiSyncEnforced != validSemiSyncReplicas[j].SemiSyncEnforced { + return validSemiSyncReplicas[i].SemiSyncEnforced < validSemiSyncReplicas[j].SemiSyncEnforced + } + if validSemiSyncReplicas[i].PromotionRule != validSemiSyncReplicas[j].PromotionRule { + return validSemiSyncReplicas[i].PromotionRule.BetterThan(validSemiSyncReplicas[j].PromotionRule) + } + return strings.Compare(validSemiSyncReplicas[i].Key.String(), validSemiSyncReplicas[j].Key.String()) < 0 + }) + + //desiredCount := analysisEntry.SemiSyncMasterWaitForReplicaCount + + + for _, replica := range validSemiSyncReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("replica: %s %d %+v %#v\n", replica.Key, replica.SemiSyncEnforced, replica.PromotionRule, replica)) + } + return false, nil, nil + reversePriorityReplicas := replicas // TODO reverse-sort replicas by priority disableCount := int(analysisEntry.CountSemiSyncReplicasEnabled) - int(analysisEntry.SemiSyncMasterWaitForReplicaCount) if disableCount <= 0 || disableCount > len(replicas) { @@ -1579,7 +1624,7 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica if !replica.SemiSyncReplicaEnabled { continue } - err := inst.EnableSemiSync(&replica.Key, false, false) + _, err := inst.SetSemiSyncReplica(&replica.Key, false) if err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot disable semi sync on replica %+v.", replica.Key)) return true, topologyRecovery, nil From d95345a8a703f0e41a2651c0ba16bb938f07525d Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 08:21:24 -0400 Subject: [PATCH 03/18] Progress on recoverExactSemiSyncReplicas; works now, with issues --- go/logic/topology_recovery.go | 131 +++++++++++----------------------- 1 file changed, 41 insertions(+), 90 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 1d643bb47..08c436b3b 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1490,7 +1490,7 @@ func 
checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, return false, nil, nil } if config.Config.EnforceSemiSyncReplicas == config.EnforceExactSemiSyncReplicas { - return recoverExactSemiSyncReplicas(analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) } panic("not supported") @@ -1498,52 +1498,9 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, // Proceed with "enough"-style recovery: // in priority order, enable semi-sync on replicas until there are enough to match the wait count - replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) - if err != nil { - return false, topologyRecovery, err - } - if len(replicas) == 0 { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no replicas found for %+v.", analysisEntry.AnalyzedInstanceKey)) - return false, topologyRecovery, nil - } - - if config.Config.EnforceSemiSyncReplicas == config.EnforceEnoughSemiSyncReplicas { - - } - - reversePriorityReplicas := replicas // TODO sort replicas by priority - enableCount := int(analysisEntry.SemiSyncMasterWaitForReplicaCount) - int(analysisEntry.CountSemiSyncReplicasEnabled) - if enableCount <= 0 || enableCount > len(replicas) { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("invalid state for recovery %+v. Wait count is lower or equal to semi sync replica count or replica count too low.", analysisEntry.AnalyzedInstanceKey)) - return false, nil, err - } - - // No way back. We're doing this. - - // TODO before actually doing it, do sanity check (downtimed, etc.) - - enabled := 0 - for _, replica := range reversePriorityReplicas { - if replica.IsDowntimed || replica.SemiSyncReplicaEnabled { - continue - } - err := inst.EnableSemiSync(&replica.Key, false, true) - if err != nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot enable semi sync on replica %+v.", replica.Key)) - return true, topologyRecovery, nil - } - err = inst.RestartReplicationQuick(&replica.Key) - if err != nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot restart replication on replica %+v.", replica.Key)) - return true, topologyRecovery, nil - } - enabled++ - if enabled == enableCount { - break - } - } + // TODO Jun 29 12:05:14 pheckel-devm-orch-1 orchestrator[564933]: 2021-06-29 12:05:14 INFO checkAndExecuteFailureDetectionProcesses: could not register LockedSemiSyncMaster detection on pheckel-devm-db-g0-1:3306 + // TODO IMPLEMENT THIS - resolveRecovery(topologyRecovery, nil) return true, topologyRecovery, nil } @@ -1560,10 +1517,10 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica return false, topologyRecovery, err } - return recoverExactSemiSyncReplicas(analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) } -func recoverExactSemiSyncReplicas(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecoveryOut 
*TopologyRecovery, err error) { instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey) if !found || err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey)) @@ -1581,63 +1538,57 @@ func recoverExactSemiSyncReplicas(analysisEntry inst.ReplicationAnalysis, candid } // Filter out downtimed and down replicas - validSemiSyncReplicas := make([]*inst.Instance, 0) + desiredSemiSyncReplicaCount := analysisEntry.SemiSyncMasterWaitForReplicaCount + possibleSemiSyncReplicas := make([]*inst.Instance, 0) for _, replica := range replicas { if replica.IsDowntimed || replica.SemiSyncEnforced == 0 { // TODO do i need to check last seen? continue } - validSemiSyncReplicas = append(validSemiSyncReplicas, replica) + possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) + } + if uint(len(possibleSemiSyncReplicas)) < desiredSemiSyncReplicaCount { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) + return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? } - // Sort replicas by priority & promotion rule - sort.SliceStable(validSemiSyncReplicas, func(i, j int) bool { - if validSemiSyncReplicas[i].SemiSyncEnforced != validSemiSyncReplicas[j].SemiSyncEnforced { - return validSemiSyncReplicas[i].SemiSyncEnforced < validSemiSyncReplicas[j].SemiSyncEnforced + // Sort replicas by priority, promotion rule and name + sort.SliceStable(possibleSemiSyncReplicas, func(i, j int) bool { + if possibleSemiSyncReplicas[i].SemiSyncEnforced != possibleSemiSyncReplicas[j].SemiSyncEnforced { + return possibleSemiSyncReplicas[i].SemiSyncEnforced < possibleSemiSyncReplicas[j].SemiSyncEnforced } - if validSemiSyncReplicas[i].PromotionRule != validSemiSyncReplicas[j].PromotionRule { - return validSemiSyncReplicas[i].PromotionRule.BetterThan(validSemiSyncReplicas[j].PromotionRule) + if possibleSemiSyncReplicas[i].PromotionRule != possibleSemiSyncReplicas[j].PromotionRule { + return possibleSemiSyncReplicas[i].PromotionRule.BetterThan(possibleSemiSyncReplicas[j].PromotionRule) } - return strings.Compare(validSemiSyncReplicas[i].Key.String(), validSemiSyncReplicas[j].Key.String()) < 0 + return strings.Compare(possibleSemiSyncReplicas[i].Key.String(), possibleSemiSyncReplicas[j].Key.String()) < 0 }) - //desiredCount := analysisEntry.SemiSyncMasterWaitForReplicaCount - - - for _, replica := range validSemiSyncReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("replica: %s %d %+v %#v\n", replica.Key, replica.SemiSyncEnforced, replica.PromotionRule, replica)) - } - return false, nil, nil - - reversePriorityReplicas := replicas // TODO reverse-sort replicas by priority - disableCount := int(analysisEntry.CountSemiSyncReplicasEnabled) - int(analysisEntry.SemiSyncMasterWaitForReplicaCount) - if disableCount <= 0 || disableCount > len(replicas) { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("invalid state for recovery %+v. 
Wait count is greater or equal to semi sync replica count or replica count too low.", analysisEntry.AnalyzedInstanceKey)) - return false, nil, err + // Figure out which replicas need to be acted upon + actions := make(map[*inst.Instance]bool, 0) + for i, replica := range possibleSemiSyncReplicas { + isSemiSyncEnabled := replica.SemiSyncReplicaEnabled + shouldSemiSyncBeEnabled := uint(i) < desiredSemiSyncReplicaCount + if shouldSemiSyncBeEnabled && !isSemiSyncEnabled { + actions[replica] = true + } else if !shouldSemiSyncBeEnabled && isSemiSyncEnabled { + actions[replica] = false + } } - // No way back. We're doing this. - - // TODO do preflight checks - - disabled := 0 - for _, replica := range reversePriorityReplicas { - if !replica.SemiSyncReplicaEnabled { - continue - } - _, err := inst.SetSemiSyncReplica(&replica.Key, false) + // Take action + for replica, enable := range actions { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("replica %s: setting rpl_semi_sync_slave_enabled=%t (including potential IO thread restart) to match desired state for recovery of %s on %+v", replica.Key.String(), enable, analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) + _, err := inst.SetSemiSyncReplica(&replica.Key, enable) if err != nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot disable semi sync on replica %+v.", replica.Key)) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot change semi sync on replica %+v.", replica.Key)) return true, topologyRecovery, nil } - err = inst.RestartReplicationQuick(&replica.Key) - if err != nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot restart replication on replica %+v.", replica.Key)) - return true, topologyRecovery, nil - } - disabled++ - if disabled == disableCount { - break - } + } + + // Re-read source instance to avoid re-triggering the same condition + // TODO this does not help; it is still re-triggering the same analysis until the next polling interval. WHY? + instance, err = inst.ReadTopologyInstance(&instance.Key) + if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("warning: cannot re-read instance after recovery of %s on instance %+v; recovery might be retriggered, but this is harmless.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) } resolveRecovery(topologyRecovery, instance) From e36ffbf63aaccd7b3abe2ee3fa73f12eb1609eef Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 08:36:42 -0400 Subject: [PATCH 04/18] gofmt --- go/config/config.go | 44 ++++++++++++++++++++++---------------------- go/inst/analysis.go | 42 +++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index c8fb179a7..0235e97a1 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -267,17 +267,17 @@ type Configuration struct { DiscoveryIgnoreReplicaHostnameFilters []string // Regexp filters to apply to prevent auto-discovering new replicas. Usage: unreachable servers due to firewalls, applications which trigger binlog dumps DiscoveryIgnoreMasterHostnameFilters []string // Regexp filters to apply to prevent auto-discovering a master. Usage: pointing your master temporarily to replicate seom data from external host DiscoveryIgnoreHostnameFilters []string // Regexp filters to apply to prevent discovering instances of any kind - ConsulAddress string // Address where Consul HTTP api is found. 
Example: 127.0.0.1:8500 - ConsulScheme string // Scheme (http or https) for Consul - ConsulAclToken string // ACL token used to write to Consul KV - ConsulCrossDataCenterDistribution bool // should orchestrator automatically auto-deduce all consul DCs and write KVs in all DCs - ConsulKVStoreProvider string // Consul KV store provider (consul or consul-txn), default: "consul" - ConsulMaxKVsPerTransaction int // Maximum number of KV operations to perform in a single Consul Transaction. Requires the "consul-txn" ConsulKVStoreProvider - ZkAddress string // UNSUPPERTED YET. Address where (single or multiple) ZooKeeper servers are found, in `srv1[:port1][,srv2[:port2]...]` format. Default port is 2181. Example: srv-a,srv-b:12181,srv-c - KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" - WebMessage string // If provided, will be shown on all web pages below the title bar - MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas - EnforceSemiSyncReplicas string // If empty, semi-sync replicas will not be touched; if "exact", semi-sync replicas will be enabled/disabled to match the wait count; if "enough", semi-sync replicas will be enabled until the wait count is reached + ConsulAddress string // Address where Consul HTTP api is found. Example: 127.0.0.1:8500 + ConsulScheme string // Scheme (http or https) for Consul + ConsulAclToken string // ACL token used to write to Consul KV + ConsulCrossDataCenterDistribution bool // should orchestrator automatically auto-deduce all consul DCs and write KVs in all DCs + ConsulKVStoreProvider string // Consul KV store provider (consul or consul-txn), default: "consul" + ConsulMaxKVsPerTransaction int // Maximum number of KV operations to perform in a single Consul Transaction. Requires the "consul-txn" ConsulKVStoreProvider + ZkAddress string // UNSUPPERTED YET. Address where (single or multiple) ZooKeeper servers are found, in `srv1[:port1][,srv2[:port2]...]` format. Default port is 2181. 
Example: srv-a,srv-b:12181,srv-c + KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" + WebMessage string // If provided, will be shown on all web pages below the title bar + MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas + EnforceSemiSyncReplicas string // If empty, semi-sync replicas will not be touched; if "exact", semi-sync replicas will be enabled/disabled to match the wait count; if "enough", semi-sync replicas will be enabled until the wait count is reached } // ToJSONString will marshal this configuration as JSON @@ -439,17 +439,17 @@ func newConfiguration() *Configuration { GraphitePollSeconds: 60, URLPrefix: "", DiscoveryIgnoreReplicaHostnameFilters: []string{}, - ConsulAddress: "", - ConsulScheme: "http", - ConsulAclToken: "", - ConsulCrossDataCenterDistribution: false, - ConsulKVStoreProvider: "consul", - ConsulMaxKVsPerTransaction: ConsulKVsPerCluster, - ZkAddress: "", - KVClusterMasterPrefix: "mysql/master", - WebMessage: "", - MaxConcurrentReplicaOperations: 5, - EnforceSemiSyncReplicas: "", + ConsulAddress: "", + ConsulScheme: "http", + ConsulAclToken: "", + ConsulCrossDataCenterDistribution: false, + ConsulKVStoreProvider: "consul", + ConsulMaxKVsPerTransaction: ConsulKVsPerCluster, + ZkAddress: "", + KVClusterMasterPrefix: "mysql/master", + WebMessage: "", + MaxConcurrentReplicaOperations: 5, + EnforceSemiSyncReplicas: "", } } diff --git a/go/inst/analysis.go b/go/inst/analysis.go index 633bd8f10..d34d8f1d2 100644 --- a/go/inst/analysis.go +++ b/go/inst/analysis.go @@ -30,27 +30,27 @@ const ( NoProblem AnalysisCode = "NoProblem" DeadMasterWithoutReplicas = "DeadMasterWithoutReplicas" DeadMaster = "DeadMaster" - DeadMasterAndReplicas = "DeadMasterAndReplicas" - DeadMasterAndSomeReplicas = "DeadMasterAndSomeReplicas" - UnreachableMasterWithLaggingReplicas = "UnreachableMasterWithLaggingReplicas" - UnreachableMaster = "UnreachableMaster" - MasterSingleReplicaNotReplicating = "MasterSingleReplicaNotReplicating" - MasterSingleReplicaDead = "MasterSingleReplicaDead" - AllMasterReplicasNotReplicating = "AllMasterReplicasNotReplicating" - AllMasterReplicasNotReplicatingOrDead = "AllMasterReplicasNotReplicatingOrDead" - LockedSemiSyncMasterHypothesis = "LockedSemiSyncMasterHypothesis" - LockedSemiSyncMaster = "LockedSemiSyncMaster" - MasterWithTooManySemiSyncReplicas = "MasterWithTooManySemiSyncReplicas" - MasterWithoutReplicas = "MasterWithoutReplicas" - DeadCoMaster = "DeadCoMaster" - DeadCoMasterAndSomeReplicas = "DeadCoMasterAndSomeReplicas" - UnreachableCoMaster = "UnreachableCoMaster" - AllCoMasterReplicasNotReplicating = "AllCoMasterReplicasNotReplicating" - DeadIntermediateMaster = "DeadIntermediateMaster" - DeadIntermediateMasterWithSingleReplica = "DeadIntermediateMasterWithSingleReplica" - DeadIntermediateMasterWithSingleReplicaFailingToConnect = "DeadIntermediateMasterWithSingleReplicaFailingToConnect" - DeadIntermediateMasterAndSomeReplicas = "DeadIntermediateMasterAndSomeReplicas" - DeadIntermediateMasterAndReplicas = "DeadIntermediateMasterAndReplicas" + DeadMasterAndReplicas = "DeadMasterAndReplicas" + DeadMasterAndSomeReplicas = "DeadMasterAndSomeReplicas" + UnreachableMasterWithLaggingReplicas = "UnreachableMasterWithLaggingReplicas" + UnreachableMaster = "UnreachableMaster" + MasterSingleReplicaNotReplicating = "MasterSingleReplicaNotReplicating" + MasterSingleReplicaDead = "MasterSingleReplicaDead" + AllMasterReplicasNotReplicating = 
"AllMasterReplicasNotReplicating" + AllMasterReplicasNotReplicatingOrDead = "AllMasterReplicasNotReplicatingOrDead" + LockedSemiSyncMasterHypothesis = "LockedSemiSyncMasterHypothesis" + LockedSemiSyncMaster = "LockedSemiSyncMaster" + MasterWithTooManySemiSyncReplicas = "MasterWithTooManySemiSyncReplicas" + MasterWithoutReplicas = "MasterWithoutReplicas" + DeadCoMaster = "DeadCoMaster" + DeadCoMasterAndSomeReplicas = "DeadCoMasterAndSomeReplicas" + UnreachableCoMaster = "UnreachableCoMaster" + AllCoMasterReplicasNotReplicating = "AllCoMasterReplicasNotReplicating" + DeadIntermediateMaster = "DeadIntermediateMaster" + DeadIntermediateMasterWithSingleReplica = "DeadIntermediateMasterWithSingleReplica" + DeadIntermediateMasterWithSingleReplicaFailingToConnect = "DeadIntermediateMasterWithSingleReplicaFailingToConnect" + DeadIntermediateMasterAndSomeReplicas = "DeadIntermediateMasterAndSomeReplicas" + DeadIntermediateMasterAndReplicas = "DeadIntermediateMasterAndReplicas" UnreachableIntermediateMasterWithLaggingReplicas = "UnreachableIntermediateMasterWithLaggingReplicas" UnreachableIntermediateMaster = "UnreachableIntermediateMaster" AllIntermediateMasterReplicasFailingToConnectOrDead = "AllIntermediateMasterReplicasFailingToConnectOrDead" From 7212bf601dc8843da58617f9f82186bb444c75cb Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 12:52:41 -0400 Subject: [PATCH 05/18] Split and rename config variables; introduce ReasonableStaleBinlogCoordinatesSeconds, RecoverLockedSemiSyncMaster and EnforceExactSemiSyncReplicas --- go/config/config.go | 10 +++++++--- go/inst/analysis.go | 7 +++++++ go/inst/analysis_dao.go | 4 ++-- go/logic/topology_recovery.go | 17 +++++------------ 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index 0235e97a1..861451d07 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -265,7 +265,7 @@ type Configuration struct { GraphitePollSeconds int // Graphite writes interval. 0 disables. URLPrefix string // URL prefix to run orchestrator on non-root web path, e.g. /orchestrator to put it behind nginx. DiscoveryIgnoreReplicaHostnameFilters []string // Regexp filters to apply to prevent auto-discovering new replicas. Usage: unreachable servers due to firewalls, applications which trigger binlog dumps - DiscoveryIgnoreMasterHostnameFilters []string // Regexp filters to apply to prevent auto-discovering a master. Usage: pointing your master temporarily to replicate seom data from external host + DiscoveryIgnoreMasterHostnameFilters []string // Regexp filters to apply to prevent auto-discovering a master. Usage: pointing your master temporarily to replicate some data from external host DiscoveryIgnoreHostnameFilters []string // Regexp filters to apply to prevent discovering instances of any kind ConsulAddress string // Address where Consul HTTP api is found. 
Example: 127.0.0.1:8500 ConsulScheme string // Scheme (http or https) for Consul @@ -277,7 +277,9 @@ type Configuration struct { KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" WebMessage string // If provided, will be shown on all web pages below the title bar MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas - EnforceSemiSyncReplicas string // If empty, semi-sync replicas will not be touched; if "exact", semi-sync replicas will be enabled/disabled to match the wait count; if "enough", semi-sync replicas will be enabled until the wait count is reached + EnforceExactSemiSyncReplicas bool // If true, semi-sync replicas will be enabled/disabled to match the wait count in the desired priority order; this applies to LockedSemiSyncMaster and MasterWithTooManySemiSyncReplicas + RecoverLockedSemiSync bool // If true, orchestrator will recover from a LockedSemiSync state by enabling semi-sync on replicas to match the wait count; this behavior can be overridden by EnforceExactSemiSyncReplicas + ReasonableStaleBinlogCoordinatesSeconds uint // Time to evaluate the LockedSemiSyncHypothesis before triggering the LockedSemiSync analysis; falls back to ReasonableReplicationLagSeconds if not set } // ToJSONString will marshal this configuration as JSON @@ -449,7 +451,9 @@ func newConfiguration() *Configuration { KVClusterMasterPrefix: "mysql/master", WebMessage: "", MaxConcurrentReplicaOperations: 5, - EnforceSemiSyncReplicas: "", + EnforceExactSemiSyncReplicas: false, + RecoverLockedSemiSync: false, + ReasonableStaleBinlogCoordinatesSeconds: 0, } } diff --git a/go/inst/analysis.go b/go/inst/analysis.go index d34d8f1d2..a4e525b8b 100644 --- a/go/inst/analysis.go +++ b/go/inst/analysis.go @@ -231,3 +231,10 @@ func (this *ReplicationAnalysis) GetAnalysisInstanceType() AnalysisInstanceType func ValidSecondsFromSeenToLastAttemptedCheck() uint { return config.Config.InstancePollSeconds + config.Config.ReasonableInstanceCheckSeconds } + +func ReasonableStaleBinlogCoordinatesSeconds() uint { + if config.Config.ReasonableStaleBinlogCoordinatesSeconds == 0 { + return uint(config.Config.ReasonableReplicationLagSeconds) + } + return config.Config.ReasonableStaleBinlogCoordinatesSeconds +} diff --git a/go/inst/analysis_dao.go b/go/inst/analysis_dao.go index 607193d9e..ce1343933 100644 --- a/go/inst/analysis_dao.go +++ b/go/inst/analysis_dao.go @@ -55,7 +55,7 @@ func initializeAnalysisDaoPostConfiguration() { func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) ([]ReplicationAnalysis, error) { result := []ReplicationAnalysis{} - args := sqlutils.Args(config.Config.ReasonableReplicationLagSeconds, ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, clusterName) + args := sqlutils.Args(ReasonableStaleBinlogCoordinatesSeconds(), ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, clusterName) analysisQueryReductionClause := `` if config.Config.ReduceReplicationAnalysisCount { @@ -531,7 +531,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.Description = "Semi sync master seems to be locked, more samplings needed to validate" } // - } else if config.Config.EnforceSemiSyncReplicas == config.EnforceExactSemiSyncReplicas && a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients > 
a.SemiSyncMasterWaitForReplicaCount { + } else if config.Config.EnforceExactSemiSyncReplicas && a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients > a.SemiSyncMasterWaitForReplicaCount { a.Analysis = MasterWithTooManySemiSyncReplicas a.Description = "Semi sync master has more semi sync replicas than configured" // diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 08c436b3b..d5064aaaf 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1484,14 +1484,13 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverLockedSemiSyncMaster.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - - if config.Config.EnforceSemiSyncReplicas != config.EnforceExactSemiSyncReplicas && config.Config.EnforceSemiSyncReplicas != config.EnforceEnoughSemiSyncReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync enforcement is not enabled on %+v. Will not issue another RecoverLockedSemiSyncMaster.", analysisEntry.AnalyzedInstanceKey)) - return false, nil, nil - } - if config.Config.EnforceSemiSyncReplicas == config.EnforceExactSemiSyncReplicas { + if config.Config.EnforceExactSemiSyncReplicas { return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) } + if !config.Config.RecoverLockedSemiSync { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no action taken to recover locked semi sync master on %+v. Enable RecoverLockedSemiSync or EnforceExactSemiSyncReplicas change this behavior.", analysisEntry.AnalyzedInstanceKey)) + return false, nil, err + } panic("not supported") @@ -1511,12 +1510,6 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverMasterWithTooManySemiSyncReplicas.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - - if config.Config.EnforceSemiSyncReplicas != config.EnforceExactSemiSyncReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi sync replica count enforcement not set to %s. 
Bailing out on recovery of %+v.", config.EnforceExactSemiSyncReplicas, analysisEntry.AnalyzedInstanceKey)) - return false, topologyRecovery, err - } - return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) } From 6fb8a1da75642a06da8893fbf542646e49c6c66d Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 12:54:05 -0400 Subject: [PATCH 06/18] Rename variable again --- go/config/config.go | 4 ++-- go/logic/topology_recovery.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index 861451d07..9efe4b2db 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -278,7 +278,7 @@ type Configuration struct { WebMessage string // If provided, will be shown on all web pages below the title bar MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas EnforceExactSemiSyncReplicas bool // If true, semi-sync replicas will be enabled/disabled to match the wait count in the desired priority order; this applies to LockedSemiSyncMaster and MasterWithTooManySemiSyncReplicas - RecoverLockedSemiSync bool // If true, orchestrator will recover from a LockedSemiSync state by enabling semi-sync on replicas to match the wait count; this behavior can be overridden by EnforceExactSemiSyncReplicas + RecoverLockedSemiSyncMaster bool // If true, orchestrator will recover from a LockedSemiSync state by enabling semi-sync on replicas to match the wait count; this behavior can be overridden by EnforceExactSemiSyncReplicas ReasonableStaleBinlogCoordinatesSeconds uint // Time to evaluate the LockedSemiSyncHypothesis before triggering the LockedSemiSync analysis; falls back to ReasonableReplicationLagSeconds if not set } @@ -452,7 +452,7 @@ func newConfiguration() *Configuration { WebMessage: "", MaxConcurrentReplicaOperations: 5, EnforceExactSemiSyncReplicas: false, - RecoverLockedSemiSync: false, + RecoverLockedSemiSyncMaster: false, ReasonableStaleBinlogCoordinatesSeconds: 0, } } diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index d5064aaaf..e01396aca 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1487,8 +1487,8 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, if config.Config.EnforceExactSemiSyncReplicas { return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) } - if !config.Config.RecoverLockedSemiSync { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no action taken to recover locked semi sync master on %+v. Enable RecoverLockedSemiSync or EnforceExactSemiSyncReplicas change this behavior.", analysisEntry.AnalyzedInstanceKey)) + if !config.Config.RecoverLockedSemiSyncMaster { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no action taken to recover locked semi sync master on %+v. 
Enable RecoverLockedSemiSyncMaster or EnforceExactSemiSyncReplicas change this behavior.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } From bc3cc8d66b0ce607bfd669032ca7ca0fd6da2912 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 16:49:02 -0400 Subject: [PATCH 07/18] Add logging; add check for replication running and last check valid --- go/config/config.go | 2 -- go/logic/topology_recovery.go | 38 ++++++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index 9efe4b2db..33cd087fc 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -62,8 +62,6 @@ const ( SelectTrueQuery = "select 1" ConsulKVsPerCluster = 5 // KVs: "/", "/hostname", "/ipv4", "/ipv6" and "/port" ConsulMaxTransactionOps = 64 - EnforceExactSemiSyncReplicas = "exact" - EnforceEnoughSemiSyncReplicas = "enough" ) var deprecatedConfigurationVariables = []string{ diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index e01396aca..406ae2569 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1485,7 +1485,7 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, return false, nil, err } if config.Config.EnforceExactSemiSyncReplicas { - return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry) } if !config.Config.RecoverLockedSemiSyncMaster { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no action taken to recover locked semi sync master on %+v. Enable RecoverLockedSemiSyncMaster or EnforceExactSemiSyncReplicas change this behavior.", analysisEntry.AnalyzedInstanceKey)) @@ -1510,10 +1510,10 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. 
Will not issue another RecoverMasterWithTooManySemiSyncReplicas.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry) } -func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) { +func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) { instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey) if !found || err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey)) @@ -1523,6 +1523,7 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn // Read all replicas replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("could not read replica instancesfor %+v: %s", analysisEntry.AnalyzedInstanceKey, err.Error())) return false, topologyRecovery, err } if len(replicas) == 0 { @@ -1533,12 +1534,34 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn // Filter out downtimed and down replicas desiredSemiSyncReplicaCount := analysisEntry.SemiSyncMasterWaitForReplicaCount possibleSemiSyncReplicas := make([]*inst.Instance, 0) + excludedReplicas := make([]*inst.Instance, 0) for _, replica := range replicas { - if replica.IsDowntimed || replica.SemiSyncEnforced == 0 { // TODO do i need to check last seen? 
+ if replica.IsDowntimed || replica.SemiSyncEnforced == 0 || !replica.IsLastCheckValid || !replica.ReplicaRunning() { + excludedReplicas = append(excludedReplicas, replica) + // TODO make this more resilient: re-read instance if its last check was invalid continue } possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) } + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d; we have %d valid possible semi-sync replicas and %d excluded replicas", desiredSemiSyncReplicaCount, len(possibleSemiSyncReplicas), len(excludedReplicas))) + if len(possibleSemiSyncReplicas) > 0 { + AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas:") + for _, replica := range possibleSemiSyncReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: downtimed = %t, semi-sync enforced priority = %d, last check valid = %t, replication runnning = %t", replica.Key.String(), replica.IsDowntimed, replica.SemiSyncEnforced, replica.IsLastCheckValid, replica.ReplicaRunning())) + } + } else { + AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas: (none)") + } + if len(excludedReplicas) > 0 { + AuditTopologyRecovery(topologyRecovery, "excluded replicas:") + for _, replica := range excludedReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: downtimed = %t, semi-sync enforced priority = %d, last check valid = %t, replication runnning = %t", replica.Key.String(), replica.IsDowntimed, replica.SemiSyncEnforced, replica.IsLastCheckValid, replica.ReplicaRunning())) + } + } else { + AuditTopologyRecovery(topologyRecovery, "excluded replicas: (none)") + } + + // Bail out if we cannot succeed if uint(len(possibleSemiSyncReplicas)) < desiredSemiSyncReplicaCount { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? @@ -1566,10 +1589,15 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn actions[replica] = false } } + if len(actions) == 0 { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot determine actions based on possible semi-sync replicas; cannot recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) + return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? 
+ } // Take action + AuditTopologyRecovery(topologyRecovery, "taking actions:") for replica, enable := range actions { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("replica %s: setting rpl_semi_sync_slave_enabled=%t (including potential IO thread restart) to match desired state for recovery of %s on %+v", replica.Key.String(), enable, analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: setting rpl_semi_sync_slave_enabled=%t (including potential IO thread restart)", replica.Key.String(), enable)) _, err := inst.SetSemiSyncReplica(&replica.Key, enable) if err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot change semi sync on replica %+v.", replica.Key)) From ac1b89985db4e82ac6f281fb08bc7897e55c7936 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 20:05:21 -0400 Subject: [PATCH 08/18] Re-order; add logging --- go/logic/topology_recovery.go | 54 +++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 406ae2569..84f55a3cd 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1543,32 +1543,9 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn } possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) } - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d; we have %d valid possible semi-sync replicas and %d excluded replicas", desiredSemiSyncReplicaCount, len(possibleSemiSyncReplicas), len(excludedReplicas))) - if len(possibleSemiSyncReplicas) > 0 { - AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas:") - for _, replica := range possibleSemiSyncReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: downtimed = %t, semi-sync enforced priority = %d, last check valid = %t, replication runnning = %t", replica.Key.String(), replica.IsDowntimed, replica.SemiSyncEnforced, replica.IsLastCheckValid, replica.ReplicaRunning())) - } - } else { - AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas: (none)") - } - if len(excludedReplicas) > 0 { - AuditTopologyRecovery(topologyRecovery, "excluded replicas:") - for _, replica := range excludedReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: downtimed = %t, semi-sync enforced priority = %d, last check valid = %t, replication runnning = %t", replica.Key.String(), replica.IsDowntimed, replica.SemiSyncEnforced, replica.IsLastCheckValid, replica.ReplicaRunning())) - } - } else { - AuditTopologyRecovery(topologyRecovery, "excluded replicas: (none)") - } - - // Bail out if we cannot succeed - if uint(len(possibleSemiSyncReplicas)) < desiredSemiSyncReplicaCount { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) - return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? 
- } // Sort replicas by priority, promotion rule and name - sort.SliceStable(possibleSemiSyncReplicas, func(i, j int) bool { + sort.Slice(possibleSemiSyncReplicas, func(i, j int) bool { if possibleSemiSyncReplicas[i].SemiSyncEnforced != possibleSemiSyncReplicas[j].SemiSyncEnforced { return possibleSemiSyncReplicas[i].SemiSyncEnforced < possibleSemiSyncReplicas[j].SemiSyncEnforced } @@ -1589,6 +1566,31 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn actions[replica] = false } } + + // Summarize what we've determined + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d; we have %d valid possible semi-sync replica(s) and %d excluded replica(s)", desiredSemiSyncReplicaCount, len(possibleSemiSyncReplicas), len(excludedReplicas))) + if len(possibleSemiSyncReplicas) > 0 { + AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas (in priority order):") + for _, replica := range possibleSemiSyncReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncEnforced, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) + } + } else { + AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas: (none)") + } + if len(excludedReplicas) > 0 { + AuditTopologyRecovery(topologyRecovery, "excluded replicas:") + for _, replica := range excludedReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncEnforced, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) + } + } else { + AuditTopologyRecovery(topologyRecovery, "excluded replicas: (none)") + } + + // Bail out if we cannot succeed + if uint(len(possibleSemiSyncReplicas)) < desiredSemiSyncReplicaCount { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) + return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? + } if len(actions) == 0 { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot determine actions based on possible semi-sync replicas; cannot recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? 
@@ -1597,7 +1599,7 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn // Take action AuditTopologyRecovery(topologyRecovery, "taking actions:") for replica, enable := range actions { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: setting rpl_semi_sync_slave_enabled=%t (including potential IO thread restart)", replica.Key.String(), enable)) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: setting rpl_semi_sync_slave_enabled=%t, restarting slave_io thread", replica.Key.String(), enable)) _, err := inst.SetSemiSyncReplica(&replica.Key, enable) if err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot change semi sync on replica %+v.", replica.Key)) @@ -1613,6 +1615,8 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn } resolveRecovery(topologyRecovery, instance) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("recovery complete; success = %t", topologyRecovery.IsSuccessful)) + return true, topologyRecovery, nil } From 7f7045880e78b2b7d87057228b80e77d787e5436 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 20:07:59 -0400 Subject: [PATCH 09/18] Rename field --- go/inst/instance.go | 2 +- go/inst/instance_dao.go | 6 +++--- go/inst/instance_topology_dao.go | 8 ++++---- go/logic/topology_recovery.go | 10 +++++----- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/go/inst/instance.go b/go/inst/instance.go index 3e91e9cb8..521895eb8 100644 --- a/go/inst/instance.go +++ b/go/inst/instance.go @@ -94,7 +94,7 @@ type Instance struct { HasReplicationCredentials bool ReplicationCredentialsAvailable bool SemiSyncAvailable bool // when both semi sync plugins (master & replica) are loaded - SemiSyncEnforced uint + SemiSyncPriority uint SemiSyncMasterEnabled bool SemiSyncReplicaEnabled bool SemiSyncMasterTimeout uint64 diff --git a/go/inst/instance_dao.go b/go/inst/instance_dao.go index 379fa63e3..c3c2cab8a 100644 --- a/go/inst/instance_dao.go +++ b/go/inst/instance_dao.go @@ -776,7 +776,7 @@ func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, bufferWrites bool, waitGroup.Add(1) go func() { defer waitGroup.Done() - err := db.QueryRow(config.Config.DetectSemiSyncEnforcedQuery).Scan(&instance.SemiSyncEnforced) + err := db.QueryRow(config.Config.DetectSemiSyncEnforcedQuery).Scan(&instance.SemiSyncPriority) logReadTopologyInstanceError(instanceKey, "DetectSemiSyncEnforcedQuery", err) }() } @@ -1209,7 +1209,7 @@ func readInstanceRow(m sqlutils.RowMap) *Instance { instance.DataCenter = m.GetString("data_center") instance.Region = m.GetString("region") instance.PhysicalEnvironment = m.GetString("physical_environment") - instance.SemiSyncEnforced = m.GetUint("semi_sync_enforced") + instance.SemiSyncPriority = m.GetUint("semi_sync_enforced") instance.SemiSyncAvailable = m.GetBool("semi_sync_available") instance.SemiSyncMasterEnabled = m.GetBool("semi_sync_master_enabled") instance.SemiSyncMasterTimeout = m.GetUint64("semi_sync_master_timeout") @@ -2610,7 +2610,7 @@ func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bo args = append(args, instance.ReplicationCredentialsAvailable) args = append(args, instance.HasReplicationCredentials) args = append(args, instance.AllowTLS) - args = append(args, instance.SemiSyncEnforced) + args = append(args, instance.SemiSyncPriority) args = append(args, instance.SemiSyncAvailable) args = append(args, instance.SemiSyncMasterEnabled) args = append(args, instance.SemiSyncMasterTimeout) diff --git 
a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go index d3a812028..9c95a50c1 100644 --- a/go/inst/instance_topology_dao.go +++ b/go/inst/instance_topology_dao.go @@ -456,7 +456,7 @@ func StartReplication(instanceKey *InstanceKey) (*Instance, error) { // some replicas (those that must never be promoted) should never ACK. // Note: We assume that replicas use 'skip-slave-start' so they won't // START SLAVE on their own upon restart. - if instance.SemiSyncEnforced > 0 { + if instance.SemiSyncPriority > 0 { // Send ACK only from promotable instances. sendACK := instance.PromotionRule != MustNotPromoteRule // Always disable master setting, in case we're converting a former master. @@ -550,7 +550,7 @@ func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoor log.Infof("Will start replication on %+v until coordinates: %+v", instanceKey, masterCoordinates) - if instance.SemiSyncEnforced > 0 { + if instance.SemiSyncPriority > 0 { // Send ACK only from promotable instances. sendACK := instance.PromotionRule != MustNotPromoteRule // Always disable master setting, in case we're converting a former master. @@ -1090,7 +1090,7 @@ func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { // If async fallback is disallowed, we're responsible for flipping the master // semi-sync switch ON before accepting writes. The setting is off by default. - if instance.SemiSyncEnforced > 0 && !readOnly { + if instance.SemiSyncPriority > 0 && !readOnly { // Send ACK only from promotable instances. sendACK := instance.PromotionRule != MustNotPromoteRule if err := EnableSemiSync(instanceKey, true, sendACK); err != nil { @@ -1114,7 +1114,7 @@ func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { // If we just went read-only, it's safe to flip the master semi-sync switch // OFF, which is the default value so that replicas can make progress. - if instance.SemiSyncEnforced > 0 && readOnly { + if instance.SemiSyncPriority > 0 && readOnly { // Send ACK only from promotable instances. 
sendACK := instance.PromotionRule != MustNotPromoteRule if err := EnableSemiSync(instanceKey, false, sendACK); err != nil { diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 84f55a3cd..401119173 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1536,7 +1536,7 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn possibleSemiSyncReplicas := make([]*inst.Instance, 0) excludedReplicas := make([]*inst.Instance, 0) for _, replica := range replicas { - if replica.IsDowntimed || replica.SemiSyncEnforced == 0 || !replica.IsLastCheckValid || !replica.ReplicaRunning() { + if replica.IsDowntimed || replica.SemiSyncPriority == 0 || !replica.IsLastCheckValid || !replica.ReplicaRunning() { excludedReplicas = append(excludedReplicas, replica) // TODO make this more resilient: re-read instance if its last check was invalid continue @@ -1546,8 +1546,8 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn // Sort replicas by priority, promotion rule and name sort.Slice(possibleSemiSyncReplicas, func(i, j int) bool { - if possibleSemiSyncReplicas[i].SemiSyncEnforced != possibleSemiSyncReplicas[j].SemiSyncEnforced { - return possibleSemiSyncReplicas[i].SemiSyncEnforced < possibleSemiSyncReplicas[j].SemiSyncEnforced + if possibleSemiSyncReplicas[i].SemiSyncPriority != possibleSemiSyncReplicas[j].SemiSyncPriority { + return possibleSemiSyncReplicas[i].SemiSyncPriority < possibleSemiSyncReplicas[j].SemiSyncPriority } if possibleSemiSyncReplicas[i].PromotionRule != possibleSemiSyncReplicas[j].PromotionRule { return possibleSemiSyncReplicas[i].PromotionRule.BetterThan(possibleSemiSyncReplicas[j].PromotionRule) @@ -1572,7 +1572,7 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn if len(possibleSemiSyncReplicas) > 0 { AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas (in priority order):") for _, replica := range possibleSemiSyncReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncEnforced, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) } } else { AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas: (none)") @@ -1580,7 +1580,7 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn if len(excludedReplicas) > 0 { AuditTopologyRecovery(topologyRecovery, "excluded replicas:") for _, replica := range excludedReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncEnforced, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last 
check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) } } else { AuditTopologyRecovery(topologyRecovery, "excluded replicas: (none)") From 7916658ce51c6d9bd850565a9d57b044efd0af71 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 20:22:43 -0400 Subject: [PATCH 10/18] Also handle async replicas --- go/logic/topology_recovery.go | 44 ++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 401119173..9f846a9da 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1534,14 +1534,17 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn // Filter out downtimed and down replicas desiredSemiSyncReplicaCount := analysisEntry.SemiSyncMasterWaitForReplicaCount possibleSemiSyncReplicas := make([]*inst.Instance, 0) + asyncReplicas := make([]*inst.Instance, 0) excludedReplicas := make([]*inst.Instance, 0) for _, replica := range replicas { - if replica.IsDowntimed || replica.SemiSyncPriority == 0 || !replica.IsLastCheckValid || !replica.ReplicaRunning() { + if replica.IsDowntimed || !replica.IsLastCheckValid || !replica.ReplicaRunning() { excludedReplicas = append(excludedReplicas, replica) // TODO make this more resilient: re-read instance if its last check was invalid - continue + } else if replica.SemiSyncPriority == 0 { + asyncReplicas = append(asyncReplicas, replica) + } else { + possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) } - possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) } // Sort replicas by priority, promotion rule and name @@ -1566,25 +1569,17 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn actions[replica] = false } } + for _, replica := range asyncReplicas { + if replica.SemiSyncReplicaEnabled { + actions[replica] = false + } + } // Summarize what we've determined AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d; we have %d valid possible semi-sync replica(s) and %d excluded replica(s)", desiredSemiSyncReplicaCount, len(possibleSemiSyncReplicas), len(excludedReplicas))) - if len(possibleSemiSyncReplicas) > 0 { - AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas (in priority order):") - for _, replica := range possibleSemiSyncReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) - } - } else { - AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas: (none)") - } - if len(excludedReplicas) > 0 { - AuditTopologyRecovery(topologyRecovery, "excluded replicas:") - for _, replica := range excludedReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) - } - } else { - 
AuditTopologyRecovery(topologyRecovery, "excluded replicas: (none)") - } + logReplicas(topologyRecovery, "possible semi-sync replicas (in priority order)", possibleSemiSyncReplicas) + logReplicas(topologyRecovery, "always-async replicas", asyncReplicas) + logReplicas(topologyRecovery, "excluded replicas (downtimed/defunct)", excludedReplicas) // Bail out if we cannot succeed if uint(len(possibleSemiSyncReplicas)) < desiredSemiSyncReplicaCount { @@ -1620,6 +1615,17 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn return true, topologyRecovery, nil } +func logReplicas(topologyRecovery *TopologyRecovery, description string, replicas []*inst.Instance) { + if len(replicas) > 0 { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%s:", description)) + for _, replica := range replicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) + } + } else { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%s: (none)", description)) + } +} + // checkAndRecoverGenericProblem is a general-purpose recovery function func checkAndRecoverGenericProblem(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { return false, nil, nil From e6893912d2843a97ca7025f9c6561d8b0755349a Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 20:47:30 -0400 Subject: [PATCH 11/18] Split out classify function --- go/logic/topology_recovery.go | 59 ++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 9f846a9da..8c8077e79 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1531,34 +1531,11 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn return false, topologyRecovery, nil } - // Filter out downtimed and down replicas - desiredSemiSyncReplicaCount := analysisEntry.SemiSyncMasterWaitForReplicaCount - possibleSemiSyncReplicas := make([]*inst.Instance, 0) - asyncReplicas := make([]*inst.Instance, 0) - excludedReplicas := make([]*inst.Instance, 0) - for _, replica := range replicas { - if replica.IsDowntimed || !replica.IsLastCheckValid || !replica.ReplicaRunning() { - excludedReplicas = append(excludedReplicas, replica) - // TODO make this more resilient: re-read instance if its last check was invalid - } else if replica.SemiSyncPriority == 0 { - asyncReplicas = append(asyncReplicas, replica) - } else { - possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) - } - } - - // Sort replicas by priority, promotion rule and name - sort.Slice(possibleSemiSyncReplicas, func(i, j int) bool { - if possibleSemiSyncReplicas[i].SemiSyncPriority != possibleSemiSyncReplicas[j].SemiSyncPriority { - return possibleSemiSyncReplicas[i].SemiSyncPriority < possibleSemiSyncReplicas[j].SemiSyncPriority - } - if possibleSemiSyncReplicas[i].PromotionRule != possibleSemiSyncReplicas[j].PromotionRule { - return possibleSemiSyncReplicas[i].PromotionRule.BetterThan(possibleSemiSyncReplicas[j].PromotionRule) - } - return strings.Compare(possibleSemiSyncReplicas[i].Key.String(), 
possibleSemiSyncReplicas[j].Key.String()) < 0 - }) + // Classify and prioritize replicas + possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := classifyAndPrioritizeReplicas(replicas) // Figure out which replicas need to be acted upon + desiredSemiSyncReplicaCount := analysisEntry.SemiSyncMasterWaitForReplicaCount actions := make(map[*inst.Instance]bool, 0) for i, replica := range possibleSemiSyncReplicas { isSemiSyncEnabled := replica.SemiSyncReplicaEnabled @@ -1615,6 +1592,36 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn return true, topologyRecovery, nil } +func classifyAndPrioritizeReplicas(replicas []*inst.Instance) (possibleSemiSyncReplicas []*inst.Instance, asyncReplicas []*inst.Instance, excludedReplicas[]*inst.Instance) { + // Filter out downtimed and down replicas + possibleSemiSyncReplicas = make([]*inst.Instance, 0) + asyncReplicas = make([]*inst.Instance, 0) + excludedReplicas = make([]*inst.Instance, 0) + for _, replica := range replicas { + if replica.IsDowntimed || !replica.IsLastCheckValid || !replica.ReplicaRunning() { + excludedReplicas = append(excludedReplicas, replica) + // TODO make this more resilient: re-read instance if its last check was invalid + } else if replica.SemiSyncPriority == 0 { + asyncReplicas = append(asyncReplicas, replica) + } else { + possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) + } + } + + // Sort replicas by priority, promotion rule and name + sort.Slice(possibleSemiSyncReplicas, func(i, j int) bool { + if possibleSemiSyncReplicas[i].SemiSyncPriority != possibleSemiSyncReplicas[j].SemiSyncPriority { + return possibleSemiSyncReplicas[i].SemiSyncPriority < possibleSemiSyncReplicas[j].SemiSyncPriority + } + if possibleSemiSyncReplicas[i].PromotionRule != possibleSemiSyncReplicas[j].PromotionRule { + return possibleSemiSyncReplicas[i].PromotionRule.BetterThan(possibleSemiSyncReplicas[j].PromotionRule) + } + return strings.Compare(possibleSemiSyncReplicas[i].Key.String(), possibleSemiSyncReplicas[j].Key.String()) < 0 + }) + + return +} + func logReplicas(topologyRecovery *TopologyRecovery, description string, replicas []*inst.Instance) { if len(replicas) > 0 { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%s:", description)) From 684b02b3e93f344ac85ff45d846cb8fb2bb183e7 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 21:10:24 -0400 Subject: [PATCH 12/18] Implement RecoverLockedSemiSyncMaster without exact counts (enable-only mode) --- go/logic/topology_recovery.go | 63 ++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 8c8077e79..cc41c7f14 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1485,22 +1485,13 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, return false, nil, err } if config.Config.EnforceExactSemiSyncReplicas { - return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry) + return recoverSemiSyncReplicas(topologyRecovery, analysisEntry, true) } if !config.Config.RecoverLockedSemiSyncMaster { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no action taken to recover locked semi sync master on %+v. 
Enable RecoverLockedSemiSyncMaster or EnforceExactSemiSyncReplicas change this behavior.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - - panic("not supported") - - // Proceed with "enough"-style recovery: - // in priority order, enable semi-sync on replicas until there are enough to match the wait count - - // TODO Jun 29 12:05:14 pheckel-devm-orch-1 orchestrator[564933]: 2021-06-29 12:05:14 INFO checkAndExecuteFailureDetectionProcesses: could not register LockedSemiSyncMaster detection on pheckel-devm-db-g0-1:3306 - // TODO IMPLEMENT THIS - - return true, topologyRecovery, nil + return recoverSemiSyncReplicas(topologyRecovery, analysisEntry, false) } // checkAndRecoverMasterWithTooManySemiSyncReplicas @@ -1510,10 +1501,10 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverMasterWithTooManySemiSyncReplicas.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry) + return recoverSemiSyncReplicas(topologyRecovery, analysisEntry, true) } -func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) { +func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, allowDisable bool) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) { instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey) if !found || err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey)) @@ -1523,11 +1514,11 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn // Read all replicas replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) if err != nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("could not read replica instancesfor %+v: %s", analysisEntry.AnalyzedInstanceKey, err.Error())) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("could not read replica instances for %+v: %s", analysisEntry.AnalyzedInstanceKey, err.Error())) return false, topologyRecovery, err } if len(replicas) == 0 { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no replicas found for %+v.", analysisEntry.AnalyzedInstanceKey)) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no replicas found for %+v; cannot recover", analysisEntry.AnalyzedInstanceKey)) return false, topologyRecovery, nil } @@ -1535,31 +1526,43 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := classifyAndPrioritizeReplicas(replicas) // Figure out which replicas need to be acted upon - desiredSemiSyncReplicaCount := analysisEntry.SemiSyncMasterWaitForReplicaCount - actions := make(map[*inst.Instance]bool, 0) - for i, replica := range possibleSemiSyncReplicas { - isSemiSyncEnabled := replica.SemiSyncReplicaEnabled - shouldSemiSyncBeEnabled := uint(i) < desiredSemiSyncReplicaCount - if shouldSemiSyncBeEnabled && !isSemiSyncEnabled { - actions[replica] = true - } else if !shouldSemiSyncBeEnabled && isSemiSyncEnabled { - actions[replica] = false + actions := make(map[*inst.Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync + if allowDisable { + for 
i, replica := range possibleSemiSyncReplicas { + isSemiSyncEnabled := replica.SemiSyncReplicaEnabled + shouldSemiSyncBeEnabled := uint(i) < analysisEntry.SemiSyncMasterWaitForReplicaCount + if shouldSemiSyncBeEnabled && !isSemiSyncEnabled { + actions[replica] = true + } else if allowDisable && !shouldSemiSyncBeEnabled && isSemiSyncEnabled { + actions[replica] = false + } } - } - for _, replica := range asyncReplicas { - if replica.SemiSyncReplicaEnabled { - actions[replica] = false + for _, replica := range asyncReplicas { + if replica.SemiSyncReplicaEnabled { + actions[replica] = false + } + } + } else { + enabled := uint(0) + for _, replica := range possibleSemiSyncReplicas { + if !replica.SemiSyncReplicaEnabled { + actions[replica] = true + enabled++ + } + if enabled == analysisEntry.SemiSyncMasterWaitForReplicaCount - analysisEntry.SemiSyncMasterClients { + break + } } } // Summarize what we've determined - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d; we have %d valid possible semi-sync replica(s) and %d excluded replica(s)", desiredSemiSyncReplicaCount, len(possibleSemiSyncReplicas), len(excludedReplicas))) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d, currently we have %d semi-sync replica(s)", analysisEntry.SemiSyncMasterWaitForReplicaCount, analysisEntry.SemiSyncMasterClients)) logReplicas(topologyRecovery, "possible semi-sync replicas (in priority order)", possibleSemiSyncReplicas) logReplicas(topologyRecovery, "always-async replicas", asyncReplicas) logReplicas(topologyRecovery, "excluded replicas (downtimed/defunct)", excludedReplicas) // Bail out if we cannot succeed - if uint(len(possibleSemiSyncReplicas)) < desiredSemiSyncReplicaCount { + if uint(len(possibleSemiSyncReplicas)) < analysisEntry.SemiSyncMasterWaitForReplicaCount { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? } From 32835c77a37a4fcbc41aafbb3fc1f6fbd20cd17b Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 21:11:06 -0400 Subject: [PATCH 13/18] gofmt --- go/logic/topology_recovery.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index cc41c7f14..7faad8d16 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1491,7 +1491,7 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no action taken to recover locked semi sync master on %+v. 
Enable RecoverLockedSemiSyncMaster or EnforceExactSemiSyncReplicas change this behavior.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - return recoverSemiSyncReplicas(topologyRecovery, analysisEntry, false) + return recoverSemiSyncReplicas(topologyRecovery, analysisEntry, false) } // checkAndRecoverMasterWithTooManySemiSyncReplicas @@ -1549,7 +1549,7 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i actions[replica] = true enabled++ } - if enabled == analysisEntry.SemiSyncMasterWaitForReplicaCount - analysisEntry.SemiSyncMasterClients { + if enabled == analysisEntry.SemiSyncMasterWaitForReplicaCount-analysisEntry.SemiSyncMasterClients { break } } @@ -1595,7 +1595,7 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i return true, topologyRecovery, nil } -func classifyAndPrioritizeReplicas(replicas []*inst.Instance) (possibleSemiSyncReplicas []*inst.Instance, asyncReplicas []*inst.Instance, excludedReplicas[]*inst.Instance) { +func classifyAndPrioritizeReplicas(replicas []*inst.Instance) (possibleSemiSyncReplicas []*inst.Instance, asyncReplicas []*inst.Instance, excludedReplicas []*inst.Instance) { // Filter out downtimed and down replicas possibleSemiSyncReplicas = make([]*inst.Instance, 0) asyncReplicas = make([]*inst.Instance, 0) From 303c9dc97dc6a4b8726b7403d144e3422661a988 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Wed, 30 Jun 2021 15:59:28 -0400 Subject: [PATCH 14/18] Rename variable --- go/logic/topology_recovery.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 7faad8d16..4f31c6b1e 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1504,7 +1504,7 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica return recoverSemiSyncReplicas(topologyRecovery, analysisEntry, true) } -func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, allowDisable bool) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) { +func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, exactReplicaTopology bool) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) { instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey) if !found || err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey)) @@ -1527,13 +1527,13 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i // Figure out which replicas need to be acted upon actions := make(map[*inst.Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync - if allowDisable { + if exactReplicaTopology { for i, replica := range possibleSemiSyncReplicas { isSemiSyncEnabled := replica.SemiSyncReplicaEnabled shouldSemiSyncBeEnabled := uint(i) < analysisEntry.SemiSyncMasterWaitForReplicaCount if shouldSemiSyncBeEnabled && !isSemiSyncEnabled { actions[replica] = true - } else if allowDisable && !shouldSemiSyncBeEnabled && isSemiSyncEnabled { + } else if exactReplicaTopology && !shouldSemiSyncBeEnabled && isSemiSyncEnabled { actions[replica] = false } } From 1b7ed20e83965196495d1ce8514df74f8d918228 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Thu, 1 Jul 2021 13:58:13 -0400 Subject: [PATCH 15/18] Make setting the semi-sync flag work during failover 
--- go/inst/instance_topology_dao.go | 164 +++++++++++++++++++++++++------ go/logic/topology_recovery.go | 65 +----------- 2 files changed, 136 insertions(+), 93 deletions(-) diff --git a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go index 9c95a50c1..aa1b99d1f 100644 --- a/go/inst/instance_topology_dao.go +++ b/go/inst/instance_topology_dao.go @@ -20,6 +20,7 @@ import ( "context" "database/sql" "fmt" + "sort" "strings" "time" @@ -456,13 +457,9 @@ func StartReplication(instanceKey *InstanceKey) (*Instance, error) { // some replicas (those that must never be promoted) should never ACK. // Note: We assume that replicas use 'skip-slave-start' so they won't // START SLAVE on their own upon restart. - if instance.SemiSyncPriority > 0 { - // Send ACK only from promotable instances. - sendACK := instance.PromotionRule != MustNotPromoteRule - // Always disable master setting, in case we're converting a former master. - if err := EnableSemiSync(instanceKey, false, sendACK); err != nil { - return instance, log.Errore(err) - } + instance, err = MaybeEnableSemiSyncReplica(instance) + if err != nil { + return instance, log.Errore(err) } _, err = ExecInstance(instanceKey, `start slave`) @@ -534,7 +531,7 @@ func WaitForExecBinlogCoordinatesToReach(instanceKey *InstanceKey, coordinates * } } -// StartReplicationUntilMasterCoordinates issuesa START SLAVE UNTIL... statement on given instance +// StartReplicationUntilMasterCoordinates issues a START SLAVE UNTIL... statement on given instance func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoordinates *BinlogCoordinates) (*Instance, error) { instance, err := ReadTopologyInstance(instanceKey) if err != nil { @@ -550,13 +547,9 @@ func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoor log.Infof("Will start replication on %+v until coordinates: %+v", instanceKey, masterCoordinates) - if instance.SemiSyncPriority > 0 { - // Send ACK only from promotable instances. - sendACK := instance.PromotionRule != MustNotPromoteRule - // Always disable master setting, in case we're converting a former master. - if err := EnableSemiSync(instanceKey, false, sendACK); err != nil { - return instance, log.Errore(err) - } + instance, err = MaybeEnableSemiSyncReplica(instance) + if err != nil { + return instance, log.Errore(err) } // MariaDB has a bug: a CHANGE MASTER TO statement does not work properly with prepared statement... :P @@ -584,14 +577,124 @@ func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoor return instance, err } -// EnableSemiSync sets the rpl_semi_sync_(master|replica)_enabled variables -// on a given instance. -func EnableSemiSync(instanceKey *InstanceKey, master, replica bool) error { - log.Infof("instance %+v rpl_semi_sync_master_enabled: %t, rpl_semi_sync_slave_enabled: %t", instanceKey, master, replica) - _, err := ExecInstance(instanceKey, - `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, - master, replica) - return err +// MaybeEnableSemiSyncReplica sets the rpl_semi_sync_(master|replica)_enabled variables on a given instance based on the +// config and state of the world. +func MaybeEnableSemiSyncReplica(instance *Instance) (*Instance, error) { + // Backwards compatible logic: Enable semi-sync if SemiSyncPriority > 0 (formerly SemiSyncEnforced) + // Note that this logic NEVER enables semi-sync if the promotion rule is "must_not". 
+ if !config.Config.EnforceExactSemiSyncReplicas && !config.Config.RecoverLockedSemiSyncMaster && instance.SemiSyncPriority > 0 { + // Send ACK only from promotable instances; always disable master setting, in case we're converting a former master. + enableReplica := instance.PromotionRule != MustNotPromoteRule + log.Infof("semi-sync: %+v: setting rpl_semi_sync_master_enabled = %t, rpl_semi_sync_slave_enabled = %t (legacy behavior)", &instance.Key, false, enableReplica) + _, err := ExecInstance(&instance.Key, `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, false, enableReplica) + if err != nil { + return instance, log.Errore(err) + } + } + + // New logic: If EnforceExactSemiSyncReplicas or RecoverLockedSemiSyncMaster are set, we enable semi-sync only if the + // given replica instance is in the list of replicas to have semi-sync enabled (according to the priority). + masterInstance, err := ReadTopologyInstance(&instance.MasterKey) + if err != nil { + return instance, log.Errore(err) + } + replicas, err := ReadReplicaInstances(&instance.MasterKey) + if err != nil { + return instance, log.Errore(err) + } + + possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := ClassifyAndPrioritizeReplicas(replicas, false) + actions := DetermineSemiSyncReplicaActions(possibleSemiSyncReplicas, asyncReplicas, masterInstance.SemiSyncMasterWaitForReplicaCount, masterInstance.SemiSyncMasterClients, config.Config.EnforceExactSemiSyncReplicas) + + log.Debugf("semi-sync: determining if %+v should enable rpl_semi_sync_slave_enabled or not", instance.Key) + log.Debugf("semi-sync: master = %+v, master semi-sync wait count = %d, master semi-sync replica count = %d", instance.MasterKey, masterInstance.SemiSyncMasterWaitForReplicaCount, masterInstance.SemiSyncMasterClients) + log.Debug("semi-sync: analysis:") + debugReplicas("semi-sync: possible semi-sync replicas (in priority order)", possibleSemiSyncReplicas) + debugReplicas("semi-sync: always-async replicas", asyncReplicas) + debugReplicas("semi-sync: excluded replicas (downtimed/defunct)", excludedReplicas) + + for replica, enable := range actions { + if replica.Key.Equals(&instance.Key) { + log.Infof("semi-sync: %s: setting rpl_semi_sync_slave_enabled: %t", &instance.Key, enable) + return SetSemiSyncReplica(&instance.Key, enable) // override "instance", since we re-read it in this function + } + } + + log.Infof("semi-sync: %+v: no action taken", &instance.Key) + return instance, nil +} + +func debugReplicas(description string, replicas []*Instance) { + if len(replicas) > 0 { + log.Debugf("semi-sync: %s:", description) + for _, replica := range replicas { + log.Debugf("semi-sync: - %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning()) + } + } else { + log.Debugf("semi-sync: %s: (none)", description) + } +} + +func ClassifyAndPrioritizeReplicas(replicas []*Instance, excludeNotReplicatingReplicas bool) (possibleSemiSyncReplicas []*Instance, asyncReplicas []*Instance, excludedReplicas []*Instance) { + // Filter out downtimed and down replicas + possibleSemiSyncReplicas = make([]*Instance, 0) + asyncReplicas = make([]*Instance, 0) + excludedReplicas = make([]*Instance, 0) + for _, replica := range replicas { + if replica.IsDowntimed || !replica.IsLastCheckValid || 
(excludeNotReplicatingReplicas && !replica.ReplicaRunning()) { + excludedReplicas = append(excludedReplicas, replica) + // TODO make this more resilient: re-read instance if its last check was invalid + } else if replica.SemiSyncPriority == 0 { + asyncReplicas = append(asyncReplicas, replica) + } else { + possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) + } + } + + // Sort replicas by priority, promotion rule and name + sort.Slice(possibleSemiSyncReplicas, func(i, j int) bool { + if possibleSemiSyncReplicas[i].SemiSyncPriority != possibleSemiSyncReplicas[j].SemiSyncPriority { + return possibleSemiSyncReplicas[i].SemiSyncPriority < possibleSemiSyncReplicas[j].SemiSyncPriority + } + if possibleSemiSyncReplicas[i].PromotionRule != possibleSemiSyncReplicas[j].PromotionRule { + return possibleSemiSyncReplicas[i].PromotionRule.BetterThan(possibleSemiSyncReplicas[j].PromotionRule) + } + return strings.Compare(possibleSemiSyncReplicas[i].Key.String(), possibleSemiSyncReplicas[j].Key.String()) < 0 + }) + + return +} + +func DetermineSemiSyncReplicaActions(possibleSemiSyncReplicas []*Instance, asyncReplicas []*Instance, waitCount uint, currentSemiSyncReplicas uint, exactReplicaTopology bool) map[*Instance]bool { + actions := make(map[*Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync + if exactReplicaTopology { + for i, replica := range possibleSemiSyncReplicas { + isSemiSyncEnabled := replica.SemiSyncReplicaEnabled + shouldSemiSyncBeEnabled := uint(i) < waitCount + if shouldSemiSyncBeEnabled && !isSemiSyncEnabled { + actions[replica] = true + } else if exactReplicaTopology && !shouldSemiSyncBeEnabled && isSemiSyncEnabled { + actions[replica] = false + } + } + for _, replica := range asyncReplicas { + if replica.SemiSyncReplicaEnabled { + actions[replica] = false + } + } + } else { + enabled := uint(0) + for _, replica := range possibleSemiSyncReplicas { + if !replica.SemiSyncReplicaEnabled { + actions[replica] = true + enabled++ + } + if enabled == waitCount-currentSemiSyncReplicas { + break + } + } + } + return actions } // DelayReplication set the replication delay given seconds @@ -1090,10 +1193,8 @@ func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { // If async fallback is disallowed, we're responsible for flipping the master // semi-sync switch ON before accepting writes. The setting is off by default. - if instance.SemiSyncPriority > 0 && !readOnly { - // Send ACK only from promotable instances. - sendACK := instance.PromotionRule != MustNotPromoteRule - if err := EnableSemiSync(instanceKey, true, sendACK); err != nil { + if instance.SemiSyncPriority > 0 && !readOnly { // TODO I feel like this flag is a little overloaded: this is used here to determine if the MASTER's semi-sync is enabled... + if _, err := SetSemiSyncMaster(instanceKey, true); err != nil { return instance, log.Errore(err) } } @@ -1111,13 +1212,14 @@ func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { } } instance, err = ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } // If we just went read-only, it's safe to flip the master semi-sync switch // OFF, which is the default value so that replicas can make progress. - if instance.SemiSyncPriority > 0 && readOnly { - // Send ACK only from promotable instances. 
- sendACK := instance.PromotionRule != MustNotPromoteRule - if err := EnableSemiSync(instanceKey, false, sendACK); err != nil { + if instance.SemiSyncPriority > 0 && readOnly { // TODO I feel like this flag is a little overloaded: this is used here to determine if the MASTER's semi-sync is enabled... + if _, err := SetSemiSyncMaster(instanceKey, false); err != nil { return instance, log.Errore(err) } } diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 4f31c6b1e..47b1f92fb 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1522,38 +1522,9 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i return false, topologyRecovery, nil } - // Classify and prioritize replicas - possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := classifyAndPrioritizeReplicas(replicas) - - // Figure out which replicas need to be acted upon - actions := make(map[*inst.Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync - if exactReplicaTopology { - for i, replica := range possibleSemiSyncReplicas { - isSemiSyncEnabled := replica.SemiSyncReplicaEnabled - shouldSemiSyncBeEnabled := uint(i) < analysisEntry.SemiSyncMasterWaitForReplicaCount - if shouldSemiSyncBeEnabled && !isSemiSyncEnabled { - actions[replica] = true - } else if exactReplicaTopology && !shouldSemiSyncBeEnabled && isSemiSyncEnabled { - actions[replica] = false - } - } - for _, replica := range asyncReplicas { - if replica.SemiSyncReplicaEnabled { - actions[replica] = false - } - } - } else { - enabled := uint(0) - for _, replica := range possibleSemiSyncReplicas { - if !replica.SemiSyncReplicaEnabled { - actions[replica] = true - enabled++ - } - if enabled == analysisEntry.SemiSyncMasterWaitForReplicaCount-analysisEntry.SemiSyncMasterClients { - break - } - } - } + // Classify and prioritize replicas & figure out which replicas need to be acted upon + possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := inst.ClassifyAndPrioritizeReplicas(replicas, true) + actions := inst.DetermineSemiSyncReplicaActions(possibleSemiSyncReplicas, asyncReplicas, analysisEntry.SemiSyncMasterWaitForReplicaCount, analysisEntry.SemiSyncMasterClients, exactReplicaTopology) // Summarize what we've determined AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d, currently we have %d semi-sync replica(s)", analysisEntry.SemiSyncMasterWaitForReplicaCount, analysisEntry.SemiSyncMasterClients)) @@ -1595,36 +1566,6 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i return true, topologyRecovery, nil } -func classifyAndPrioritizeReplicas(replicas []*inst.Instance) (possibleSemiSyncReplicas []*inst.Instance, asyncReplicas []*inst.Instance, excludedReplicas []*inst.Instance) { - // Filter out downtimed and down replicas - possibleSemiSyncReplicas = make([]*inst.Instance, 0) - asyncReplicas = make([]*inst.Instance, 0) - excludedReplicas = make([]*inst.Instance, 0) - for _, replica := range replicas { - if replica.IsDowntimed || !replica.IsLastCheckValid || !replica.ReplicaRunning() { - excludedReplicas = append(excludedReplicas, replica) - // TODO make this more resilient: re-read instance if its last check was invalid - } else if replica.SemiSyncPriority == 0 { - asyncReplicas = append(asyncReplicas, replica) - } else { - possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) - } - } - - // Sort replicas by priority, promotion rule and name - sort.Slice(possibleSemiSyncReplicas, 
func(i, j int) bool { - if possibleSemiSyncReplicas[i].SemiSyncPriority != possibleSemiSyncReplicas[j].SemiSyncPriority { - return possibleSemiSyncReplicas[i].SemiSyncPriority < possibleSemiSyncReplicas[j].SemiSyncPriority - } - if possibleSemiSyncReplicas[i].PromotionRule != possibleSemiSyncReplicas[j].PromotionRule { - return possibleSemiSyncReplicas[i].PromotionRule.BetterThan(possibleSemiSyncReplicas[j].PromotionRule) - } - return strings.Compare(possibleSemiSyncReplicas[i].Key.String(), possibleSemiSyncReplicas[j].Key.String()) < 0 - }) - - return -} - func logReplicas(topologyRecovery *TopologyRecovery, description string, replicas []*inst.Instance) { if len(replicas) > 0 { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%s:", description)) From e83dbad8794465a199a96ef7e632585216c0d898 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Thu, 1 Jul 2021 14:24:27 -0400 Subject: [PATCH 16/18] Fix backwards compatible logic --- go/inst/instance_topology_dao.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go index aa1b99d1f..5a91a6f4a 100644 --- a/go/inst/instance_topology_dao.go +++ b/go/inst/instance_topology_dao.go @@ -582,14 +582,17 @@ func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoor func MaybeEnableSemiSyncReplica(instance *Instance) (*Instance, error) { // Backwards compatible logic: Enable semi-sync if SemiSyncPriority > 0 (formerly SemiSyncEnforced) // Note that this logic NEVER enables semi-sync if the promotion rule is "must_not". - if !config.Config.EnforceExactSemiSyncReplicas && !config.Config.RecoverLockedSemiSyncMaster && instance.SemiSyncPriority > 0 { - // Send ACK only from promotable instances; always disable master setting, in case we're converting a former master. - enableReplica := instance.PromotionRule != MustNotPromoteRule - log.Infof("semi-sync: %+v: setting rpl_semi_sync_master_enabled = %t, rpl_semi_sync_slave_enabled = %t (legacy behavior)", &instance.Key, false, enableReplica) - _, err := ExecInstance(&instance.Key, `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, false, enableReplica) - if err != nil { - return instance, log.Errore(err) + if !config.Config.EnforceExactSemiSyncReplicas && !config.Config.RecoverLockedSemiSyncMaster { + if instance.SemiSyncPriority > 0 { + // Send ACK only from promotable instances; always disable master setting, in case we're converting a former master. 
+			enableReplica := instance.PromotionRule != MustNotPromoteRule
+			log.Infof("semi-sync: %+v: setting rpl_semi_sync_master_enabled = %t, rpl_semi_sync_slave_enabled = %t (legacy behavior)", &instance.Key, false, enableReplica)
+			_, err := ExecInstance(&instance.Key, `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, false, enableReplica)
+			if err != nil {
+				return instance, log.Errore(err)
+			}
 		}
+		return instance, nil
 	}

 	// New logic: If EnforceExactSemiSyncReplicas or RecoverLockedSemiSyncMaster are set, we enable semi-sync only if the

From a5566d896738fed1c3fe45c6bac334086b225ae9 Mon Sep 17 00:00:00 2001
From: Philipp Heckel
Date: Thu, 1 Jul 2021 14:35:30 -0400
Subject: [PATCH 17/18] Split determine* function

---
 go/inst/instance_topology_dao.go | 57 +++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go
index 5a91a6f4a..4438655aa 100644
--- a/go/inst/instance_topology_dao.go
+++ b/go/inst/instance_topology_dao.go
@@ -668,33 +668,44 @@ func ClassifyAndPrioritizeReplicas(replicas []*Instance, excludeNotReplicatingRe
 	return
 }

+// DetermineSemiSyncReplicaActions returns a map of replicas for which to change the semi-sync replica setting.
+// A value of true indicates semi-sync needs to be enabled, false that it needs to be disabled.
 func DetermineSemiSyncReplicaActions(possibleSemiSyncReplicas []*Instance, asyncReplicas []*Instance, waitCount uint, currentSemiSyncReplicas uint, exactReplicaTopology bool) map[*Instance]bool {
-	actions := make(map[*Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync
 	if exactReplicaTopology {
-		for i, replica := range possibleSemiSyncReplicas {
-			isSemiSyncEnabled := replica.SemiSyncReplicaEnabled
-			shouldSemiSyncBeEnabled := uint(i) < waitCount
-			if shouldSemiSyncBeEnabled && !isSemiSyncEnabled {
-				actions[replica] = true
-			} else if exactReplicaTopology && !shouldSemiSyncBeEnabled && isSemiSyncEnabled {
-				actions[replica] = false
-			}
+		return determineSemiSyncReplicaActionsForExactTopology(possibleSemiSyncReplicas, asyncReplicas, waitCount)
+	}
+	return determineSemiSyncReplicaActionsForEnoughTopology(possibleSemiSyncReplicas, waitCount, currentSemiSyncReplicas)
+}
+
+func determineSemiSyncReplicaActionsForExactTopology(possibleSemiSyncReplicas []*Instance, asyncReplicas []*Instance, waitCount uint) map[*Instance]bool {
+	actions := make(map[*Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync
+	for i, replica := range possibleSemiSyncReplicas {
+		isSemiSyncEnabled := replica.SemiSyncReplicaEnabled
+		shouldSemiSyncBeEnabled := uint(i) < waitCount
+		if shouldSemiSyncBeEnabled && !isSemiSyncEnabled {
+			actions[replica] = true
+		} else if !shouldSemiSyncBeEnabled && isSemiSyncEnabled {
+			actions[replica] = false
 		}
-		for _, replica := range asyncReplicas {
-			if replica.SemiSyncReplicaEnabled {
-				actions[replica] = false
-			}
+	}
+	for _, replica := range asyncReplicas {
+		if replica.SemiSyncReplicaEnabled {
+			actions[replica] = false
 		}
-	} else {
-		enabled := uint(0)
-		for _, replica := range possibleSemiSyncReplicas {
-			if !replica.SemiSyncReplicaEnabled {
-				actions[replica] = true
-				enabled++
-			}
-			if enabled == waitCount-currentSemiSyncReplicas {
-				break
-			}
+	}
+	return actions
+}
+
+func determineSemiSyncReplicaActionsForEnoughTopology(possibleSemiSyncReplicas []*Instance, waitCount uint, currentSemiSyncReplicas uint) map[*Instance]bool {
+	actions := make(map[*Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync
+	enabled := uint(0)
+	for _, replica := range possibleSemiSyncReplicas {
+		if !replica.SemiSyncReplicaEnabled {
+			actions[replica] = true
+			enabled++
+		}
+		if enabled == waitCount-currentSemiSyncReplicas {
+			break
 		}
 	}
 	return actions

From 2bd2761b3e5f4efa9fcac7b8c7bab59cfb3eafb5 Mon Sep 17 00:00:00 2001
From: Philipp Heckel
Date: Thu, 1 Jul 2021 16:05:11 -0400
Subject: [PATCH 18/18] Better logging; formatting

---
 go/inst/instance_topology_dao.go | 85 +++++++++++++++++++-------------
 go/logic/topology_recovery.go    | 37 ++++++--------
 2 files changed, 67 insertions(+), 55 deletions(-)

diff --git a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go
index 4438655aa..3465b430f 100644
--- a/go/inst/instance_topology_dao.go
+++ b/go/inst/instance_topology_dao.go
@@ -579,65 +579,57 @@ func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoor

 // MaybeEnableSemiSyncReplica sets the rpl_semi_sync_(master|replica)_enabled variables on a given instance based on the
 // config and state of the world.
-func MaybeEnableSemiSyncReplica(instance *Instance) (*Instance, error) {
+func MaybeEnableSemiSyncReplica(replicaInstance *Instance) (*Instance, error) {
 	// Backwards compatible logic: Enable semi-sync if SemiSyncPriority > 0 (formerly SemiSyncEnforced)
 	// Note that this logic NEVER enables semi-sync if the promotion rule is "must_not".
 	if !config.Config.EnforceExactSemiSyncReplicas && !config.Config.RecoverLockedSemiSyncMaster {
-		if instance.SemiSyncPriority > 0 {
+		if replicaInstance.SemiSyncPriority > 0 {
 			// Send ACK only from promotable instances; always disable master setting, in case we're converting a former master.
-			enableReplica := instance.PromotionRule != MustNotPromoteRule
+			enableReplica := replicaInstance.PromotionRule != MustNotPromoteRule
-			log.Infof("semi-sync: %+v: setting rpl_semi_sync_master_enabled = %t, rpl_semi_sync_slave_enabled = %t (legacy behavior)", &instance.Key, false, enableReplica)
+			log.Infof("semi-sync: %+v: setting rpl_semi_sync_master_enabled = %t, rpl_semi_sync_slave_enabled = %t (legacy behavior)", &replicaInstance.Key, false, enableReplica)
-			_, err := ExecInstance(&instance.Key, `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, false, enableReplica)
+			_, err := ExecInstance(&replicaInstance.Key, `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, false, enableReplica)
 			if err != nil {
-				return instance, log.Errore(err)
+				return replicaInstance, log.Errore(err)
 			}
 		}
-		return instance, nil
+		return replicaInstance, nil
 	}

 	// New logic: If EnforceExactSemiSyncReplicas or RecoverLockedSemiSyncMaster are set, we enable semi-sync only if the
 	// given replica instance is in the list of replicas to have semi-sync enabled (according to the priority).
-	masterInstance, err := ReadTopologyInstance(&instance.MasterKey)
+	masterInstance, err := ReadTopologyInstance(&replicaInstance.MasterKey)
 	if err != nil {
-		return instance, log.Errore(err)
+		return replicaInstance, log.Errore(err)
 	}
-	replicas, err := ReadReplicaInstances(&instance.MasterKey)
+	replicas, err := ReadReplicaInstances(&replicaInstance.MasterKey)
 	if err != nil {
-		return instance, log.Errore(err)
+		return replicaInstance, log.Errore(err)
 	}
 	possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := ClassifyAndPrioritizeReplicas(replicas, false)
 	actions := DetermineSemiSyncReplicaActions(possibleSemiSyncReplicas, asyncReplicas, masterInstance.SemiSyncMasterWaitForReplicaCount, masterInstance.SemiSyncMasterClients, config.Config.EnforceExactSemiSyncReplicas)
-	log.Debugf("semi-sync: determining if %+v should enable rpl_semi_sync_slave_enabled or not", instance.Key)
-	log.Debugf("semi-sync: master = %+v, master semi-sync wait count = %d, master semi-sync replica count = %d", instance.MasterKey, masterInstance.SemiSyncMasterWaitForReplicaCount, masterInstance.SemiSyncMasterClients)
-	log.Debug("semi-sync: analysis:")
-	debugReplicas("semi-sync: possible semi-sync replicas (in priority order)", possibleSemiSyncReplicas)
-	debugReplicas("semi-sync: always-async replicas", asyncReplicas)
-	debugReplicas("semi-sync: excluded replicas (downtimed/defunct)", excludedReplicas)
+	log.Debugf("semi-sync: %+v: determining whether to enable rpl_semi_sync_slave_enabled", replicaInstance.Key)
+	log.Debugf("semi-sync: master = %+v, master semi-sync wait count = %d, master semi-sync replica count = %d", replicaInstance.MasterKey, masterInstance.SemiSyncMasterWaitForReplicaCount, masterInstance.SemiSyncMasterClients)
+	LogSemiSyncReplicaAnalysis(possibleSemiSyncReplicas, asyncReplicas, excludedReplicas, actions, func(s string, a ...interface{}) { log.Debugf(s, a...) })
+	// TODO This is a little odd: we sometimes get actions for replicas other than the one we are acting on. If we do not take
+	// TODO those actions, we will see a MasterWithTooManySemiSyncReplicas event right after this recovery. We could prevent that
+	// TODO by executing all actions here, but this function is scoped to `replicaInstance`, so for now we only act on ourselves.
 	for replica, enable := range actions {
-		if replica.Key.Equals(&instance.Key) {
-			log.Infof("semi-sync: %s: setting rpl_semi_sync_slave_enabled: %t", &instance.Key, enable)
-			return SetSemiSyncReplica(&instance.Key, enable) // override "instance", since we re-read it in this function
+		if replica.Key.Equals(&replicaInstance.Key) {
+			log.Infof("semi-sync: %s: setting rpl_semi_sync_slave_enabled: %t", &replicaInstance.Key, enable)
+			return SetSemiSyncReplica(&replicaInstance.Key, enable) // override "instance", since we re-read it in this function
 		}
 	}
-	log.Infof("semi-sync: %+v: no action taken", &instance.Key)
-	return instance, nil
-}
-
-func debugReplicas(description string, replicas []*Instance) {
-	if len(replicas) > 0 {
-		log.Debugf("semi-sync: %s:", description)
-		for _, replica := range replicas {
-			log.Debugf("semi-sync: - %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())
-		}
-	} else {
-		log.Debugf("semi-sync: %s: (none)", description)
-	}
+	log.Infof("semi-sync: %+v: no action taken", &replicaInstance.Key)
+	return replicaInstance, nil
 }

+// ClassifyAndPrioritizeReplicas takes a list of replica instances and classifies them based on their semi-sync priority, excluding
+// replicas that are downtimed or down. It furthermore prioritizes the possible semi-sync replicas based on SemiSyncPriority, PromotionRule
+// and hostname (fallback).
 func ClassifyAndPrioritizeReplicas(replicas []*Instance, excludeNotReplicatingReplicas bool) (possibleSemiSyncReplicas []*Instance, asyncReplicas []*Instance, excludedReplicas []*Instance) {
 	// Filter out downtimed and down replicas
 	possibleSemiSyncReplicas = make([]*Instance, 0)
@@ -646,7 +638,6 @@ func ClassifyAndPrioritizeReplicas(replicas []*Instance, excludeNotReplicatingRe
 	for _, replica := range replicas {
 		if replica.IsDowntimed || !replica.IsLastCheckValid || (excludeNotReplicatingReplicas && !replica.ReplicaRunning()) {
 			excludedReplicas = append(excludedReplicas, replica)
-			// TODO make this more resilient: re-read instance if its last check was invalid
 		} else if replica.SemiSyncPriority == 0 {
 			asyncReplicas = append(asyncReplicas, replica)
 		} else {
@@ -711,6 +702,32 @@ func determineSemiSyncReplicaActionsForEnoughTopology(possibleSemiSyncReplicas [
 	return actions
 }

+// LogSemiSyncReplicaAnalysis outputs the analysis results for a semi-sync analysis using the given log function
+func LogSemiSyncReplicaAnalysis(possibleSemiSyncReplicas []*Instance, asyncReplicas []*Instance, excludedReplicas []*Instance, actions map[*Instance]bool, logf func(s string, a ...interface{})) {
+	logReplicas("possible semi-sync replicas (in priority order)", possibleSemiSyncReplicas, logf)
+	logReplicas("always-async replicas", asyncReplicas, logf)
+	logReplicas("excluded replicas (downtimed/defunct)", excludedReplicas, logf)
+	if len(actions) > 0 {
+		logf("semi-sync: suggested actions:")
+		for replica, enable := range actions {
+			logf("semi-sync: - %+v: should set semi-sync enabled = %t", replica.Key, enable)
+		}
+	} else {
+		logf("semi-sync: suggested actions: (none)")
+	}
+}
+
+func logReplicas(description string, replicas []*Instance, logf func(s string, a ...interface{})) {
+	if len(replicas) > 0 {
+		logf("semi-sync: %s:", description)
+		for _, replica := range replicas {
+			logf("semi-sync: - %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())
+		}
+	} else {
+		logf("semi-sync: %s: (none)", description)
+	}
+}
+
 // DelayReplication set the replication delay given seconds
 // keeping the current state of the replication threads.
 func DelayReplication(instanceKey *InstanceKey, seconds int) error {
diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go
index 47b1f92fb..b182cb4dd 100644
--- a/go/logic/topology_recovery.go
+++ b/go/logic/topology_recovery.go
@@ -1505,20 +1505,20 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica
 }

 func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, exactReplicaTopology bool) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) {
-	instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey)
+	masterInstance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey)
 	if !found || err != nil {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey))
+		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey))
 		return false, topologyRecovery, err
 	}

 	// Read all replicas
 	replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey)
 	if err != nil {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("could not read replica instances for %+v: %s", analysisEntry.AnalyzedInstanceKey, err.Error()))
+		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: could not read replica instances for %+v: %s", analysisEntry.AnalyzedInstanceKey, err.Error()))
 		return false, topologyRecovery, err
 	}
 	if len(replicas) == 0 {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no replicas found for %+v; cannot recover", analysisEntry.AnalyzedInstanceKey))
+		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: no replicas found for %+v; cannot recover", analysisEntry.AnalyzedInstanceKey))
 		return false, topologyRecovery, nil
 	}

@@ -1526,42 +1526,37 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i
 	possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := inst.ClassifyAndPrioritizeReplicas(replicas, true)
 	actions := inst.DetermineSemiSyncReplicaActions(possibleSemiSyncReplicas, asyncReplicas, analysisEntry.SemiSyncMasterWaitForReplicaCount, analysisEntry.SemiSyncMasterClients, exactReplicaTopology)

-	// Summarize what we've determined
-	AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d, currently we have %d semi-sync replica(s)", analysisEntry.SemiSyncMasterWaitForReplicaCount, analysisEntry.SemiSyncMasterClients))
-	logReplicas(topologyRecovery, "possible semi-sync replicas (in priority order)", possibleSemiSyncReplicas)
-	logReplicas(topologyRecovery, "always-async replicas", asyncReplicas)
-	logReplicas(topologyRecovery, "excluded replicas (downtimed/defunct)", excludedReplicas)
+	// Log analysis
+	AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: analysis results for recovery of %+v:", masterInstance.Key))
+	AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: semi-sync wait count = %d, semi-sync replica count = %d", analysisEntry.SemiSyncMasterWaitForReplicaCount, analysisEntry.SemiSyncMasterClients))
+	inst.LogSemiSyncReplicaAnalysis(possibleSemiSyncReplicas, asyncReplicas, excludedReplicas, actions, func(s string, a ...interface{}) { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf(s, a...)) })

 	// Bail out if we cannot succeed
 	if uint(len(possibleSemiSyncReplicas)) < analysisEntry.SemiSyncMasterWaitForReplicaCount {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey))
+		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey))
 		return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this?
 	}
 	if len(actions) == 0 {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot determine actions based on possible semi-sync replicas; cannot recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey))
+		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: cannot determine actions based on possible semi-sync replicas; cannot recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey))
 		return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this?
 	}

 	// Take action
-	AuditTopologyRecovery(topologyRecovery, "taking actions:")
+	AuditTopologyRecovery(topologyRecovery, "semi-sync: taking actions:")
 	for replica, enable := range actions {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: setting rpl_semi_sync_slave_enabled=%t, restarting slave_io thread", replica.Key.String(), enable))
+		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: - %s: setting rpl_semi_sync_slave_enabled=%t, restarting slave_io thread", replica.Key.String(), enable))
 		_, err := inst.SetSemiSyncReplica(&replica.Key, enable)
 		if err != nil {
-			AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot change semi sync on replica %+v.", replica.Key))
+			AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: cannot change semi sync on replica %+v.", replica.Key))
 			return true, topologyRecovery, nil
 		}
 	}

 	// Re-read source instance to avoid re-triggering the same condition
-	// TODO this does not help; it is still re-triggering the same analysis until the next polling interval. WHY?
-	instance, err = inst.ReadTopologyInstance(&instance.Key)
-	if err != nil {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("warning: cannot re-read instance after recovery of %s on instance %+v; recovery might be retriggered, but this is harmless.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey))
-	}
+	// TODO even though we resolve correctly here, we are re-triggering the same analysis until the next polling interval. WHY?

-	resolveRecovery(topologyRecovery, instance)
-	AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("recovery complete; success = %t", topologyRecovery.IsSuccessful))
+	resolveRecovery(topologyRecovery, masterInstance)
+	AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: recovery complete; success = %t", topologyRecovery.IsSuccessful))
 	return true, topologyRecovery, nil
 }
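
Editor's note: both recovery paths in this series (MaybeEnableSemiSyncReplica on a replica, recoverSemiSyncReplicas on the master) funnel through ClassifyAndPrioritizeReplicas and DetermineSemiSyncReplicaActions. The following is a minimal, self-contained Go sketch of that decision flow, for illustration only: the replica struct, the promotionWeight field, the host-keyed action map, and the ordering direction are simplified assumptions, not orchestrator code, which operates on *inst.Instance, promotion rules, and the master's rpl_semi_sync_master_wait_for_slave_count.

// semisync_sketch.go - illustrative only; not part of the patches above.
package main

import (
	"fmt"
	"sort"
)

type replica struct {
	host            string
	priority        uint // 0 = never use as a semi-sync replica ("always async")
	promotionWeight int  // stand-in for PromotionRule; higher = preferred
	semiSyncEnabled bool // current rpl_semi_sync_slave_enabled state
}

// classifyAndPrioritize splits replicas into "possible semi-sync" and "always async" and
// orders the possible ones by priority, promotion weight and hostname (tie-breaker).
func classifyAndPrioritize(replicas []replica) (possible, async []replica) {
	for _, r := range replicas {
		if r.priority == 0 {
			async = append(async, r)
			continue
		}
		possible = append(possible, r)
	}
	sort.Slice(possible, func(i, j int) bool {
		if possible[i].priority != possible[j].priority {
			return possible[i].priority > possible[j].priority // assumption: higher priority first
		}
		if possible[i].promotionWeight != possible[j].promotionWeight {
			return possible[i].promotionWeight > possible[j].promotionWeight
		}
		return possible[i].host < possible[j].host
	})
	return possible, async
}

// determineActions returns host -> enable(true)/disable(false). In "exact" mode the first
// waitCount prioritized replicas must be semi-sync and everything else must not be; in
// "enough" mode we only enable additional replicas until the wait count can be satisfied.
func determineActions(possible, async []replica, waitCount, current uint, exact bool) map[string]bool {
	actions := map[string]bool{}
	if exact {
		for i, r := range possible {
			shouldBeEnabled := uint(i) < waitCount
			if shouldBeEnabled && !r.semiSyncEnabled {
				actions[r.host] = true
			} else if !shouldBeEnabled && r.semiSyncEnabled {
				actions[r.host] = false
			}
		}
		for _, r := range async {
			if r.semiSyncEnabled {
				actions[r.host] = false
			}
		}
		return actions
	}
	if current >= waitCount { // guard the unsigned subtraction below
		return actions
	}
	missing := waitCount - current
	for _, r := range possible {
		if missing == 0 {
			break
		}
		if !r.semiSyncEnabled {
			actions[r.host] = true
			missing--
		}
	}
	return actions
}

func main() {
	replicas := []replica{
		{host: "db2", priority: 1, promotionWeight: 2, semiSyncEnabled: false},
		{host: "db3", priority: 1, promotionWeight: 1, semiSyncEnabled: true},
		{host: "db4", priority: 0, promotionWeight: 0, semiSyncEnabled: true}, // async-only but currently enabled
	}
	possible, async := classifyAndPrioritize(replicas)
	// wait count = 1, one semi-sync client currently connected:
	fmt.Println("exact: ", determineActions(possible, async, 1, 1, true))  // db2 on, db3 off, db4 off
	fmt.Println("enough:", determineActions(possible, async, 1, 1, false)) // nothing to do
}

Running the sketch shows the practical difference between the two modes: the "exact" strategy also emits disable actions (which is what MasterWithTooManySemiSyncReplicas relies on), while the "enough" strategy is purely additive.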