From 6a4729790072c357ea29b41612231e963ca98dd6 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Fri, 18 Jun 2021 21:06:55 -0400 Subject: [PATCH 01/18] WIP: EnforceSemiSyncReplicaCount --- go/config/config.go | 2 + go/inst/analysis.go | 41 ++++++------ go/inst/analysis_dao.go | 4 ++ go/logic/topology_recovery.go | 116 +++++++++++++++++++++++++++++++++- 4 files changed, 141 insertions(+), 22 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index 4cae4ebff..b944b654f 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -275,6 +275,7 @@ type Configuration struct { KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" WebMessage string // If provided, will be shown on all web pages below the title bar MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas + EnforceSemiSyncReplicaCount bool } // ToJSONString will marshal this configuration as JSON @@ -446,6 +447,7 @@ func newConfiguration() *Configuration { KVClusterMasterPrefix: "mysql/master", WebMessage: "", MaxConcurrentReplicaOperations: 5, + EnforceSemiSyncReplicaCount: false, } } diff --git a/go/inst/analysis.go b/go/inst/analysis.go index 1d6ab3692..633bd8f10 100644 --- a/go/inst/analysis.go +++ b/go/inst/analysis.go @@ -30,26 +30,27 @@ const ( NoProblem AnalysisCode = "NoProblem" DeadMasterWithoutReplicas = "DeadMasterWithoutReplicas" DeadMaster = "DeadMaster" - DeadMasterAndReplicas = "DeadMasterAndReplicas" - DeadMasterAndSomeReplicas = "DeadMasterAndSomeReplicas" - UnreachableMasterWithLaggingReplicas = "UnreachableMasterWithLaggingReplicas" - UnreachableMaster = "UnreachableMaster" - MasterSingleReplicaNotReplicating = "MasterSingleReplicaNotReplicating" - MasterSingleReplicaDead = "MasterSingleReplicaDead" - AllMasterReplicasNotReplicating = "AllMasterReplicasNotReplicating" - AllMasterReplicasNotReplicatingOrDead = "AllMasterReplicasNotReplicatingOrDead" - LockedSemiSyncMasterHypothesis = "LockedSemiSyncMasterHypothesis" - LockedSemiSyncMaster = "LockedSemiSyncMaster" - MasterWithoutReplicas = "MasterWithoutReplicas" - DeadCoMaster = "DeadCoMaster" - DeadCoMasterAndSomeReplicas = "DeadCoMasterAndSomeReplicas" - UnreachableCoMaster = "UnreachableCoMaster" - AllCoMasterReplicasNotReplicating = "AllCoMasterReplicasNotReplicating" - DeadIntermediateMaster = "DeadIntermediateMaster" - DeadIntermediateMasterWithSingleReplica = "DeadIntermediateMasterWithSingleReplica" - DeadIntermediateMasterWithSingleReplicaFailingToConnect = "DeadIntermediateMasterWithSingleReplicaFailingToConnect" - DeadIntermediateMasterAndSomeReplicas = "DeadIntermediateMasterAndSomeReplicas" - DeadIntermediateMasterAndReplicas = "DeadIntermediateMasterAndReplicas" + DeadMasterAndReplicas = "DeadMasterAndReplicas" + DeadMasterAndSomeReplicas = "DeadMasterAndSomeReplicas" + UnreachableMasterWithLaggingReplicas = "UnreachableMasterWithLaggingReplicas" + UnreachableMaster = "UnreachableMaster" + MasterSingleReplicaNotReplicating = "MasterSingleReplicaNotReplicating" + MasterSingleReplicaDead = "MasterSingleReplicaDead" + AllMasterReplicasNotReplicating = "AllMasterReplicasNotReplicating" + AllMasterReplicasNotReplicatingOrDead = "AllMasterReplicasNotReplicatingOrDead" + LockedSemiSyncMasterHypothesis = "LockedSemiSyncMasterHypothesis" + LockedSemiSyncMaster = "LockedSemiSyncMaster" + MasterWithTooManySemiSyncReplicas = "MasterWithTooManySemiSyncReplicas" + MasterWithoutReplicas = "MasterWithoutReplicas" + DeadCoMaster = 
"DeadCoMaster" + DeadCoMasterAndSomeReplicas = "DeadCoMasterAndSomeReplicas" + UnreachableCoMaster = "UnreachableCoMaster" + AllCoMasterReplicasNotReplicating = "AllCoMasterReplicasNotReplicating" + DeadIntermediateMaster = "DeadIntermediateMaster" + DeadIntermediateMasterWithSingleReplica = "DeadIntermediateMasterWithSingleReplica" + DeadIntermediateMasterWithSingleReplicaFailingToConnect = "DeadIntermediateMasterWithSingleReplicaFailingToConnect" + DeadIntermediateMasterAndSomeReplicas = "DeadIntermediateMasterAndSomeReplicas" + DeadIntermediateMasterAndReplicas = "DeadIntermediateMasterAndReplicas" UnreachableIntermediateMasterWithLaggingReplicas = "UnreachableIntermediateMasterWithLaggingReplicas" UnreachableIntermediateMaster = "UnreachableIntermediateMaster" AllIntermediateMasterReplicasFailingToConnectOrDead = "AllIntermediateMasterReplicasFailingToConnectOrDead" diff --git a/go/inst/analysis_dao.go b/go/inst/analysis_dao.go index 4b00281e4..bef8463e5 100644 --- a/go/inst/analysis_dao.go +++ b/go/inst/analysis_dao.go @@ -531,6 +531,10 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.Description = "Semi sync master seems to be locked, more samplings needed to validate" } // + } else if config.Config.EnforceSemiSyncReplicaCount && a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients > a.SemiSyncMasterWaitForReplicaCount { + a.Analysis = MasterWithTooManySemiSyncReplicas + a.Description = "Semi sync master has more semi sync replicas than configured" + // } else if a.IsMaster && a.LastCheckValid && a.IsReadOnly && a.CountValidReplicatingReplicas > 0 && config.Config.RecoverNonWriteableMaster { a.Analysis = NoWriteableMasterStructureWarning a.Description = "Master with replicas is read_only" diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 39a46bfa5..c46788aba 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1479,14 +1479,124 @@ func checkAndRecoverNonWriteableMaster(analysisEntry inst.ReplicationAnalysis, c // checkAndRecoverLockedSemiSyncMaster func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { - topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, true, true) if topologyRecovery == nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverLockedSemiSyncMaster.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - return false, nil, nil + if !config.Config.EnforceSemiSyncReplicaCount { + return false, nil, nil + } + + replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) + if err != nil { + return false, topologyRecovery, err + } + if len(replicas) == 0 { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no replicas found for %+v.", analysisEntry.AnalyzedInstanceKey)) + return false, topologyRecovery, nil + } + + reversePriorityReplicas := replicas // TODO sort replicas by priority + enableCount := int(analysisEntry.SemiSyncMasterWaitForReplicaCount) - int(analysisEntry.CountSemiSyncReplicasEnabled) + if enableCount <= 0 || enableCount > len(replicas) { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("invalid state for recovery %+v. 
Wait count is lower or equal to semi sync replica count or replica count too low.", analysisEntry.AnalyzedInstanceKey)) + return false, nil, err + } + + // No way back. We're doing this. + + // TODO before actually doing it, do sanity check (downtimed, etc.) + + enabled := 0 + for _, replica := range reversePriorityReplicas { + if replica.IsDowntimed || replica.SemiSyncReplicaEnabled { + continue + } + err := inst.EnableSemiSync(&replica.Key, false, true) + if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot enable semi sync on replica %+v.", replica.Key)) + return true, topologyRecovery, nil + } + err = inst.RestartReplicationQuick(&replica.Key) + if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot restart replication on replica %+v.", replica.Key)) + return true, topologyRecovery, nil + } + enabled++ + if enabled == enableCount { + break + } + } + + resolveRecovery(topologyRecovery, nil) + return true, topologyRecovery, nil +} + +// checkAndRecoverMasterWithTooManySemiSyncReplicas +func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { + topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, true, true) + if topologyRecovery == nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverMasterWithTooManySemiSyncReplicas.", analysisEntry.AnalyzedInstanceKey)) + return false, nil, err + } + + // FIXME is this the right spot for this check? We shouldn't even get here right? + if !config.Config.EnforceSemiSyncReplicaCount { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi sync replica count enforcement not enabled. Bailng out on recovery of %+v.", analysisEntry.AnalyzedInstanceKey)) + return false, topologyRecovery, err + } + + instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey) + if !found || err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey)) + return false, topologyRecovery, err + } + + replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) + if err != nil { + return false, topologyRecovery, err + } + if len(replicas) == 0 { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no replicas found for %+v.", analysisEntry.AnalyzedInstanceKey)) + return false, topologyRecovery, nil + } + + reversePriorityReplicas := replicas // TODO reverse-sort replicas by priority + disableCount := int(analysisEntry.CountSemiSyncReplicasEnabled) - int(analysisEntry.SemiSyncMasterWaitForReplicaCount) + if disableCount <= 0 || disableCount > len(replicas) { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("invalid state for recovery %+v. Wait count is greater or equal to semi sync replica count or replica count too low.", analysisEntry.AnalyzedInstanceKey)) + return false, nil, err + } + + // No way back. We're doing this. 
+ + // TODO do preflight checks + + disabled := 0 + for _, replica := range reversePriorityReplicas { + if !replica.SemiSyncReplicaEnabled { + continue + } + err := inst.EnableSemiSync(&replica.Key, false, false) + if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot disable semi sync on replica %+v.", replica.Key)) + return true, topologyRecovery, nil + } + err = inst.RestartReplicationQuick(&replica.Key) + if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot restart replication on replica %+v.", replica.Key)) + return true, topologyRecovery, nil + } + disabled++ + if disabled == disableCount { + break + } + } + + resolveRecovery(topologyRecovery, instance) + return true, topologyRecovery, nil } // checkAndRecoverGenericProblem is a general-purpose recovery function @@ -1659,6 +1769,8 @@ func getCheckAndRecoverFunction(analysisCode inst.AnalysisCode, analyzedInstance } else { return checkAndRecoverLockedSemiSyncMaster, true } + case inst.MasterWithTooManySemiSyncReplicas: + return checkAndRecoverMasterWithTooManySemiSyncReplicas, true // intermediate master case inst.DeadIntermediateMaster: return checkAndRecoverDeadIntermediateMaster, true From 26f85da9b6c03353eaa08f59494384095fcc78fb Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Mon, 28 Jun 2021 20:53:13 -0400 Subject: [PATCH 02/18] Redo SemiSyncEnforced as priority; sort replicas by priority, then promotion_rule; only handle "exact" case for now --- go/config/config.go | 46 +++++++++++++------------- go/inst/analysis_dao.go | 2 +- go/inst/instance.go | 2 +- go/inst/instance_dao.go | 2 +- go/inst/instance_topology_dao.go | 8 ++--- go/logic/topology_recovery.go | 55 +++++++++++++++++++++++++++++--- 6 files changed, 81 insertions(+), 34 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index 41ab044e7..c8fb179a7 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -62,6 +62,8 @@ const ( SelectTrueQuery = "select 1" ConsulKVsPerCluster = 5 // KVs: "/", "/hostname", "/ipv4", "/ipv6" and "/port" ConsulMaxTransactionOps = 64 + EnforceExactSemiSyncReplicas = "exact" + EnforceEnoughSemiSyncReplicas = "enough" ) var deprecatedConfigurationVariables = []string{ @@ -265,17 +267,17 @@ type Configuration struct { DiscoveryIgnoreReplicaHostnameFilters []string // Regexp filters to apply to prevent auto-discovering new replicas. Usage: unreachable servers due to firewalls, applications which trigger binlog dumps DiscoveryIgnoreMasterHostnameFilters []string // Regexp filters to apply to prevent auto-discovering a master. Usage: pointing your master temporarily to replicate seom data from external host DiscoveryIgnoreHostnameFilters []string // Regexp filters to apply to prevent discovering instances of any kind - ConsulAddress string // Address where Consul HTTP api is found. Example: 127.0.0.1:8500 - ConsulScheme string // Scheme (http or https) for Consul - ConsulAclToken string // ACL token used to write to Consul KV - ConsulCrossDataCenterDistribution bool // should orchestrator automatically auto-deduce all consul DCs and write KVs in all DCs - ConsulKVStoreProvider string // Consul KV store provider (consul or consul-txn), default: "consul" - ConsulMaxKVsPerTransaction int // Maximum number of KV operations to perform in a single Consul Transaction. Requires the "consul-txn" ConsulKVStoreProvider - ZkAddress string // UNSUPPERTED YET. Address where (single or multiple) ZooKeeper servers are found, in `srv1[:port1][,srv2[:port2]...]` format. Default port is 2181. 
Example: srv-a,srv-b:12181,srv-c - KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" - WebMessage string // If provided, will be shown on all web pages below the title bar - MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas - EnforceSemiSyncReplicaCount bool + ConsulAddress string // Address where Consul HTTP api is found. Example: 127.0.0.1:8500 + ConsulScheme string // Scheme (http or https) for Consul + ConsulAclToken string // ACL token used to write to Consul KV + ConsulCrossDataCenterDistribution bool // should orchestrator automatically auto-deduce all consul DCs and write KVs in all DCs + ConsulKVStoreProvider string // Consul KV store provider (consul or consul-txn), default: "consul" + ConsulMaxKVsPerTransaction int // Maximum number of KV operations to perform in a single Consul Transaction. Requires the "consul-txn" ConsulKVStoreProvider + ZkAddress string // UNSUPPERTED YET. Address where (single or multiple) ZooKeeper servers are found, in `srv1[:port1][,srv2[:port2]...]` format. Default port is 2181. Example: srv-a,srv-b:12181,srv-c + KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" + WebMessage string // If provided, will be shown on all web pages below the title bar + MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas + EnforceSemiSyncReplicas string // If empty, semi-sync replicas will not be touched; if "exact", semi-sync replicas will be enabled/disabled to match the wait count; if "enough", semi-sync replicas will be enabled until the wait count is reached } // ToJSONString will marshal this configuration as JSON @@ -437,17 +439,17 @@ func newConfiguration() *Configuration { GraphitePollSeconds: 60, URLPrefix: "", DiscoveryIgnoreReplicaHostnameFilters: []string{}, - ConsulAddress: "", - ConsulScheme: "http", - ConsulAclToken: "", - ConsulCrossDataCenterDistribution: false, - ConsulKVStoreProvider: "consul", - ConsulMaxKVsPerTransaction: ConsulKVsPerCluster, - ZkAddress: "", - KVClusterMasterPrefix: "mysql/master", - WebMessage: "", - MaxConcurrentReplicaOperations: 5, - EnforceSemiSyncReplicaCount: false, + ConsulAddress: "", + ConsulScheme: "http", + ConsulAclToken: "", + ConsulCrossDataCenterDistribution: false, + ConsulKVStoreProvider: "consul", + ConsulMaxKVsPerTransaction: ConsulKVsPerCluster, + ZkAddress: "", + KVClusterMasterPrefix: "mysql/master", + WebMessage: "", + MaxConcurrentReplicaOperations: 5, + EnforceSemiSyncReplicas: "", } } diff --git a/go/inst/analysis_dao.go b/go/inst/analysis_dao.go index bef8463e5..607193d9e 100644 --- a/go/inst/analysis_dao.go +++ b/go/inst/analysis_dao.go @@ -531,7 +531,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.Description = "Semi sync master seems to be locked, more samplings needed to validate" } // - } else if config.Config.EnforceSemiSyncReplicaCount && a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients > a.SemiSyncMasterWaitForReplicaCount { + } else if config.Config.EnforceSemiSyncReplicas == config.EnforceExactSemiSyncReplicas && a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients > a.SemiSyncMasterWaitForReplicaCount { a.Analysis = 
MasterWithTooManySemiSyncReplicas a.Description = "Semi sync master has more semi sync replicas than configured" // diff --git a/go/inst/instance.go b/go/inst/instance.go index 5b0d4e115..3e91e9cb8 100644 --- a/go/inst/instance.go +++ b/go/inst/instance.go @@ -94,7 +94,7 @@ type Instance struct { HasReplicationCredentials bool ReplicationCredentialsAvailable bool SemiSyncAvailable bool // when both semi sync plugins (master & replica) are loaded - SemiSyncEnforced bool + SemiSyncEnforced uint SemiSyncMasterEnabled bool SemiSyncReplicaEnabled bool SemiSyncMasterTimeout uint64 diff --git a/go/inst/instance_dao.go b/go/inst/instance_dao.go index 04e4168d7..379fa63e3 100644 --- a/go/inst/instance_dao.go +++ b/go/inst/instance_dao.go @@ -1209,7 +1209,7 @@ func readInstanceRow(m sqlutils.RowMap) *Instance { instance.DataCenter = m.GetString("data_center") instance.Region = m.GetString("region") instance.PhysicalEnvironment = m.GetString("physical_environment") - instance.SemiSyncEnforced = m.GetBool("semi_sync_enforced") + instance.SemiSyncEnforced = m.GetUint("semi_sync_enforced") instance.SemiSyncAvailable = m.GetBool("semi_sync_available") instance.SemiSyncMasterEnabled = m.GetBool("semi_sync_master_enabled") instance.SemiSyncMasterTimeout = m.GetUint64("semi_sync_master_timeout") diff --git a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go index 1631fe8eb..d3a812028 100644 --- a/go/inst/instance_topology_dao.go +++ b/go/inst/instance_topology_dao.go @@ -456,7 +456,7 @@ func StartReplication(instanceKey *InstanceKey) (*Instance, error) { // some replicas (those that must never be promoted) should never ACK. // Note: We assume that replicas use 'skip-slave-start' so they won't // START SLAVE on their own upon restart. - if instance.SemiSyncEnforced { + if instance.SemiSyncEnforced > 0 { // Send ACK only from promotable instances. sendACK := instance.PromotionRule != MustNotPromoteRule // Always disable master setting, in case we're converting a former master. @@ -550,7 +550,7 @@ func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoor log.Infof("Will start replication on %+v until coordinates: %+v", instanceKey, masterCoordinates) - if instance.SemiSyncEnforced { + if instance.SemiSyncEnforced > 0 { // Send ACK only from promotable instances. sendACK := instance.PromotionRule != MustNotPromoteRule // Always disable master setting, in case we're converting a former master. @@ -1090,7 +1090,7 @@ func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { // If async fallback is disallowed, we're responsible for flipping the master // semi-sync switch ON before accepting writes. The setting is off by default. - if instance.SemiSyncEnforced && !readOnly { + if instance.SemiSyncEnforced > 0 && !readOnly { // Send ACK only from promotable instances. sendACK := instance.PromotionRule != MustNotPromoteRule if err := EnableSemiSync(instanceKey, true, sendACK); err != nil { @@ -1114,7 +1114,7 @@ func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { // If we just went read-only, it's safe to flip the master semi-sync switch // OFF, which is the default value so that replicas can make progress. - if instance.SemiSyncEnforced && readOnly { + if instance.SemiSyncEnforced > 0 && readOnly { // Send ACK only from promotable instances. 
sendACK := instance.PromotionRule != MustNotPromoteRule if err := EnableSemiSync(instanceKey, false, sendACK); err != nil { diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index c46788aba..1d643bb47 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1485,9 +1485,18 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, return false, nil, err } - if !config.Config.EnforceSemiSyncReplicaCount { + if config.Config.EnforceSemiSyncReplicas != config.EnforceExactSemiSyncReplicas && config.Config.EnforceSemiSyncReplicas != config.EnforceEnoughSemiSyncReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync enforcement is not enabled on %+v. Will not issue another RecoverLockedSemiSyncMaster.", analysisEntry.AnalyzedInstanceKey)) return false, nil, nil } + if config.Config.EnforceSemiSyncReplicas == config.EnforceExactSemiSyncReplicas { + return recoverExactSemiSyncReplicas(analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + } + + panic("not supported") + + // Proceed with "enough"-style recovery: + // in priority order, enable semi-sync on replicas until there are enough to match the wait count replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) if err != nil { @@ -1498,6 +1507,10 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, return false, topologyRecovery, nil } + if config.Config.EnforceSemiSyncReplicas == config.EnforceEnoughSemiSyncReplicas { + + } + reversePriorityReplicas := replicas // TODO sort replicas by priority enableCount := int(analysisEntry.SemiSyncMasterWaitForReplicaCount) - int(analysisEntry.CountSemiSyncReplicasEnabled) if enableCount <= 0 || enableCount > len(replicas) { @@ -1542,18 +1555,22 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica return false, nil, err } - // FIXME is this the right spot for this check? We shouldn't even get here right? - if !config.Config.EnforceSemiSyncReplicaCount { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi sync replica count enforcement not enabled. Bailng out on recovery of %+v.", analysisEntry.AnalyzedInstanceKey)) + if config.Config.EnforceSemiSyncReplicas != config.EnforceExactSemiSyncReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi sync replica count enforcement not set to %s. 
Bailing out on recovery of %+v.", config.EnforceExactSemiSyncReplicas, analysisEntry.AnalyzedInstanceKey)) return false, topologyRecovery, err } + return recoverExactSemiSyncReplicas(analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) +} + +func recoverExactSemiSyncReplicas(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey) if !found || err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey)) return false, topologyRecovery, err } + // Read all replicas replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) if err != nil { return false, topologyRecovery, err @@ -1563,6 +1580,34 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica return false, topologyRecovery, nil } + // Filter out downtimed and down replicas + validSemiSyncReplicas := make([]*inst.Instance, 0) + for _, replica := range replicas { + if replica.IsDowntimed || replica.SemiSyncEnforced == 0 { // TODO do i need to check last seen? + continue + } + validSemiSyncReplicas = append(validSemiSyncReplicas, replica) + } + + // Sort replicas by priority & promotion rule + sort.SliceStable(validSemiSyncReplicas, func(i, j int) bool { + if validSemiSyncReplicas[i].SemiSyncEnforced != validSemiSyncReplicas[j].SemiSyncEnforced { + return validSemiSyncReplicas[i].SemiSyncEnforced < validSemiSyncReplicas[j].SemiSyncEnforced + } + if validSemiSyncReplicas[i].PromotionRule != validSemiSyncReplicas[j].PromotionRule { + return validSemiSyncReplicas[i].PromotionRule.BetterThan(validSemiSyncReplicas[j].PromotionRule) + } + return strings.Compare(validSemiSyncReplicas[i].Key.String(), validSemiSyncReplicas[j].Key.String()) < 0 + }) + + //desiredCount := analysisEntry.SemiSyncMasterWaitForReplicaCount + + + for _, replica := range validSemiSyncReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("replica: %s %d %+v %#v\n", replica.Key, replica.SemiSyncEnforced, replica.PromotionRule, replica)) + } + return false, nil, nil + reversePriorityReplicas := replicas // TODO reverse-sort replicas by priority disableCount := int(analysisEntry.CountSemiSyncReplicasEnabled) - int(analysisEntry.SemiSyncMasterWaitForReplicaCount) if disableCount <= 0 || disableCount > len(replicas) { @@ -1579,7 +1624,7 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica if !replica.SemiSyncReplicaEnabled { continue } - err := inst.EnableSemiSync(&replica.Key, false, false) + _, err := inst.SetSemiSyncReplica(&replica.Key, false) if err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot disable semi sync on replica %+v.", replica.Key)) return true, topologyRecovery, nil From d95345a8a703f0e41a2651c0ba16bb938f07525d Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 08:21:24 -0400 Subject: [PATCH 03/18] Progress on recoverExactSemiSyncReplicas; works now, with issues --- go/logic/topology_recovery.go | 131 +++++++++++----------------------- 1 file changed, 41 insertions(+), 90 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 1d643bb47..08c436b3b 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1490,7 +1490,7 @@ func 
checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, return false, nil, nil } if config.Config.EnforceSemiSyncReplicas == config.EnforceExactSemiSyncReplicas { - return recoverExactSemiSyncReplicas(analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) } panic("not supported") @@ -1498,52 +1498,9 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, // Proceed with "enough"-style recovery: // in priority order, enable semi-sync on replicas until there are enough to match the wait count - replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) - if err != nil { - return false, topologyRecovery, err - } - if len(replicas) == 0 { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no replicas found for %+v.", analysisEntry.AnalyzedInstanceKey)) - return false, topologyRecovery, nil - } - - if config.Config.EnforceSemiSyncReplicas == config.EnforceEnoughSemiSyncReplicas { - - } - - reversePriorityReplicas := replicas // TODO sort replicas by priority - enableCount := int(analysisEntry.SemiSyncMasterWaitForReplicaCount) - int(analysisEntry.CountSemiSyncReplicasEnabled) - if enableCount <= 0 || enableCount > len(replicas) { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("invalid state for recovery %+v. Wait count is lower or equal to semi sync replica count or replica count too low.", analysisEntry.AnalyzedInstanceKey)) - return false, nil, err - } - - // No way back. We're doing this. - - // TODO before actually doing it, do sanity check (downtimed, etc.) - - enabled := 0 - for _, replica := range reversePriorityReplicas { - if replica.IsDowntimed || replica.SemiSyncReplicaEnabled { - continue - } - err := inst.EnableSemiSync(&replica.Key, false, true) - if err != nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot enable semi sync on replica %+v.", replica.Key)) - return true, topologyRecovery, nil - } - err = inst.RestartReplicationQuick(&replica.Key) - if err != nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot restart replication on replica %+v.", replica.Key)) - return true, topologyRecovery, nil - } - enabled++ - if enabled == enableCount { - break - } - } + // TODO Jun 29 12:05:14 pheckel-devm-orch-1 orchestrator[564933]: 2021-06-29 12:05:14 INFO checkAndExecuteFailureDetectionProcesses: could not register LockedSemiSyncMaster detection on pheckel-devm-db-g0-1:3306 + // TODO IMPLEMENT THIS - resolveRecovery(topologyRecovery, nil) return true, topologyRecovery, nil } @@ -1560,10 +1517,10 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica return false, topologyRecovery, err } - return recoverExactSemiSyncReplicas(analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) } -func recoverExactSemiSyncReplicas(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecoveryOut 
*TopologyRecovery, err error) { instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey) if !found || err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey)) @@ -1581,63 +1538,57 @@ func recoverExactSemiSyncReplicas(analysisEntry inst.ReplicationAnalysis, candid } // Filter out downtimed and down replicas - validSemiSyncReplicas := make([]*inst.Instance, 0) + desiredSemiSyncReplicaCount := analysisEntry.SemiSyncMasterWaitForReplicaCount + possibleSemiSyncReplicas := make([]*inst.Instance, 0) for _, replica := range replicas { if replica.IsDowntimed || replica.SemiSyncEnforced == 0 { // TODO do i need to check last seen? continue } - validSemiSyncReplicas = append(validSemiSyncReplicas, replica) + possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) + } + if uint(len(possibleSemiSyncReplicas)) < desiredSemiSyncReplicaCount { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) + return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? } - // Sort replicas by priority & promotion rule - sort.SliceStable(validSemiSyncReplicas, func(i, j int) bool { - if validSemiSyncReplicas[i].SemiSyncEnforced != validSemiSyncReplicas[j].SemiSyncEnforced { - return validSemiSyncReplicas[i].SemiSyncEnforced < validSemiSyncReplicas[j].SemiSyncEnforced + // Sort replicas by priority, promotion rule and name + sort.SliceStable(possibleSemiSyncReplicas, func(i, j int) bool { + if possibleSemiSyncReplicas[i].SemiSyncEnforced != possibleSemiSyncReplicas[j].SemiSyncEnforced { + return possibleSemiSyncReplicas[i].SemiSyncEnforced < possibleSemiSyncReplicas[j].SemiSyncEnforced } - if validSemiSyncReplicas[i].PromotionRule != validSemiSyncReplicas[j].PromotionRule { - return validSemiSyncReplicas[i].PromotionRule.BetterThan(validSemiSyncReplicas[j].PromotionRule) + if possibleSemiSyncReplicas[i].PromotionRule != possibleSemiSyncReplicas[j].PromotionRule { + return possibleSemiSyncReplicas[i].PromotionRule.BetterThan(possibleSemiSyncReplicas[j].PromotionRule) } - return strings.Compare(validSemiSyncReplicas[i].Key.String(), validSemiSyncReplicas[j].Key.String()) < 0 + return strings.Compare(possibleSemiSyncReplicas[i].Key.String(), possibleSemiSyncReplicas[j].Key.String()) < 0 }) - //desiredCount := analysisEntry.SemiSyncMasterWaitForReplicaCount - - - for _, replica := range validSemiSyncReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("replica: %s %d %+v %#v\n", replica.Key, replica.SemiSyncEnforced, replica.PromotionRule, replica)) - } - return false, nil, nil - - reversePriorityReplicas := replicas // TODO reverse-sort replicas by priority - disableCount := int(analysisEntry.CountSemiSyncReplicasEnabled) - int(analysisEntry.SemiSyncMasterWaitForReplicaCount) - if disableCount <= 0 || disableCount > len(replicas) { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("invalid state for recovery %+v. 
Wait count is greater or equal to semi sync replica count or replica count too low.", analysisEntry.AnalyzedInstanceKey)) - return false, nil, err + // Figure out which replicas need to be acted upon + actions := make(map[*inst.Instance]bool, 0) + for i, replica := range possibleSemiSyncReplicas { + isSemiSyncEnabled := replica.SemiSyncReplicaEnabled + shouldSemiSyncBeEnabled := uint(i) < desiredSemiSyncReplicaCount + if shouldSemiSyncBeEnabled && !isSemiSyncEnabled { + actions[replica] = true + } else if !shouldSemiSyncBeEnabled && isSemiSyncEnabled { + actions[replica] = false + } } - // No way back. We're doing this. - - // TODO do preflight checks - - disabled := 0 - for _, replica := range reversePriorityReplicas { - if !replica.SemiSyncReplicaEnabled { - continue - } - _, err := inst.SetSemiSyncReplica(&replica.Key, false) + // Take action + for replica, enable := range actions { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("replica %s: setting rpl_semi_sync_slave_enabled=%t (including potential IO thread restart) to match desired state for recovery of %s on %+v", replica.Key.String(), enable, analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) + _, err := inst.SetSemiSyncReplica(&replica.Key, enable) if err != nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot disable semi sync on replica %+v.", replica.Key)) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot change semi sync on replica %+v.", replica.Key)) return true, topologyRecovery, nil } - err = inst.RestartReplicationQuick(&replica.Key) - if err != nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot restart replication on replica %+v.", replica.Key)) - return true, topologyRecovery, nil - } - disabled++ - if disabled == disableCount { - break - } + } + + // Re-read source instance to avoid re-triggering the same condition + // TODO this does not help; it is still re-triggering the same analysis until the next polling interval. WHY? + instance, err = inst.ReadTopologyInstance(&instance.Key) + if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("warning: cannot re-read instance after recovery of %s on instance %+v; recovery might be retriggered, but this is harmless.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) } resolveRecovery(topologyRecovery, instance) From e36ffbf63aaccd7b3abe2ee3fa73f12eb1609eef Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 08:36:42 -0400 Subject: [PATCH 04/18] gofmt --- go/config/config.go | 44 ++++++++++++++++++++++---------------------- go/inst/analysis.go | 42 +++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index c8fb179a7..0235e97a1 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -267,17 +267,17 @@ type Configuration struct { DiscoveryIgnoreReplicaHostnameFilters []string // Regexp filters to apply to prevent auto-discovering new replicas. Usage: unreachable servers due to firewalls, applications which trigger binlog dumps DiscoveryIgnoreMasterHostnameFilters []string // Regexp filters to apply to prevent auto-discovering a master. Usage: pointing your master temporarily to replicate seom data from external host DiscoveryIgnoreHostnameFilters []string // Regexp filters to apply to prevent discovering instances of any kind - ConsulAddress string // Address where Consul HTTP api is found. 
Example: 127.0.0.1:8500 - ConsulScheme string // Scheme (http or https) for Consul - ConsulAclToken string // ACL token used to write to Consul KV - ConsulCrossDataCenterDistribution bool // should orchestrator automatically auto-deduce all consul DCs and write KVs in all DCs - ConsulKVStoreProvider string // Consul KV store provider (consul or consul-txn), default: "consul" - ConsulMaxKVsPerTransaction int // Maximum number of KV operations to perform in a single Consul Transaction. Requires the "consul-txn" ConsulKVStoreProvider - ZkAddress string // UNSUPPERTED YET. Address where (single or multiple) ZooKeeper servers are found, in `srv1[:port1][,srv2[:port2]...]` format. Default port is 2181. Example: srv-a,srv-b:12181,srv-c - KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" - WebMessage string // If provided, will be shown on all web pages below the title bar - MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas - EnforceSemiSyncReplicas string // If empty, semi-sync replicas will not be touched; if "exact", semi-sync replicas will be enabled/disabled to match the wait count; if "enough", semi-sync replicas will be enabled until the wait count is reached + ConsulAddress string // Address where Consul HTTP api is found. Example: 127.0.0.1:8500 + ConsulScheme string // Scheme (http or https) for Consul + ConsulAclToken string // ACL token used to write to Consul KV + ConsulCrossDataCenterDistribution bool // should orchestrator automatically auto-deduce all consul DCs and write KVs in all DCs + ConsulKVStoreProvider string // Consul KV store provider (consul or consul-txn), default: "consul" + ConsulMaxKVsPerTransaction int // Maximum number of KV operations to perform in a single Consul Transaction. Requires the "consul-txn" ConsulKVStoreProvider + ZkAddress string // UNSUPPERTED YET. Address where (single or multiple) ZooKeeper servers are found, in `srv1[:port1][,srv2[:port2]...]` format. Default port is 2181. 
Example: srv-a,srv-b:12181,srv-c + KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" + WebMessage string // If provided, will be shown on all web pages below the title bar + MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas + EnforceSemiSyncReplicas string // If empty, semi-sync replicas will not be touched; if "exact", semi-sync replicas will be enabled/disabled to match the wait count; if "enough", semi-sync replicas will be enabled until the wait count is reached } // ToJSONString will marshal this configuration as JSON @@ -439,17 +439,17 @@ func newConfiguration() *Configuration { GraphitePollSeconds: 60, URLPrefix: "", DiscoveryIgnoreReplicaHostnameFilters: []string{}, - ConsulAddress: "", - ConsulScheme: "http", - ConsulAclToken: "", - ConsulCrossDataCenterDistribution: false, - ConsulKVStoreProvider: "consul", - ConsulMaxKVsPerTransaction: ConsulKVsPerCluster, - ZkAddress: "", - KVClusterMasterPrefix: "mysql/master", - WebMessage: "", - MaxConcurrentReplicaOperations: 5, - EnforceSemiSyncReplicas: "", + ConsulAddress: "", + ConsulScheme: "http", + ConsulAclToken: "", + ConsulCrossDataCenterDistribution: false, + ConsulKVStoreProvider: "consul", + ConsulMaxKVsPerTransaction: ConsulKVsPerCluster, + ZkAddress: "", + KVClusterMasterPrefix: "mysql/master", + WebMessage: "", + MaxConcurrentReplicaOperations: 5, + EnforceSemiSyncReplicas: "", } } diff --git a/go/inst/analysis.go b/go/inst/analysis.go index 633bd8f10..d34d8f1d2 100644 --- a/go/inst/analysis.go +++ b/go/inst/analysis.go @@ -30,27 +30,27 @@ const ( NoProblem AnalysisCode = "NoProblem" DeadMasterWithoutReplicas = "DeadMasterWithoutReplicas" DeadMaster = "DeadMaster" - DeadMasterAndReplicas = "DeadMasterAndReplicas" - DeadMasterAndSomeReplicas = "DeadMasterAndSomeReplicas" - UnreachableMasterWithLaggingReplicas = "UnreachableMasterWithLaggingReplicas" - UnreachableMaster = "UnreachableMaster" - MasterSingleReplicaNotReplicating = "MasterSingleReplicaNotReplicating" - MasterSingleReplicaDead = "MasterSingleReplicaDead" - AllMasterReplicasNotReplicating = "AllMasterReplicasNotReplicating" - AllMasterReplicasNotReplicatingOrDead = "AllMasterReplicasNotReplicatingOrDead" - LockedSemiSyncMasterHypothesis = "LockedSemiSyncMasterHypothesis" - LockedSemiSyncMaster = "LockedSemiSyncMaster" - MasterWithTooManySemiSyncReplicas = "MasterWithTooManySemiSyncReplicas" - MasterWithoutReplicas = "MasterWithoutReplicas" - DeadCoMaster = "DeadCoMaster" - DeadCoMasterAndSomeReplicas = "DeadCoMasterAndSomeReplicas" - UnreachableCoMaster = "UnreachableCoMaster" - AllCoMasterReplicasNotReplicating = "AllCoMasterReplicasNotReplicating" - DeadIntermediateMaster = "DeadIntermediateMaster" - DeadIntermediateMasterWithSingleReplica = "DeadIntermediateMasterWithSingleReplica" - DeadIntermediateMasterWithSingleReplicaFailingToConnect = "DeadIntermediateMasterWithSingleReplicaFailingToConnect" - DeadIntermediateMasterAndSomeReplicas = "DeadIntermediateMasterAndSomeReplicas" - DeadIntermediateMasterAndReplicas = "DeadIntermediateMasterAndReplicas" + DeadMasterAndReplicas = "DeadMasterAndReplicas" + DeadMasterAndSomeReplicas = "DeadMasterAndSomeReplicas" + UnreachableMasterWithLaggingReplicas = "UnreachableMasterWithLaggingReplicas" + UnreachableMaster = "UnreachableMaster" + MasterSingleReplicaNotReplicating = "MasterSingleReplicaNotReplicating" + MasterSingleReplicaDead = "MasterSingleReplicaDead" + AllMasterReplicasNotReplicating = 
"AllMasterReplicasNotReplicating" + AllMasterReplicasNotReplicatingOrDead = "AllMasterReplicasNotReplicatingOrDead" + LockedSemiSyncMasterHypothesis = "LockedSemiSyncMasterHypothesis" + LockedSemiSyncMaster = "LockedSemiSyncMaster" + MasterWithTooManySemiSyncReplicas = "MasterWithTooManySemiSyncReplicas" + MasterWithoutReplicas = "MasterWithoutReplicas" + DeadCoMaster = "DeadCoMaster" + DeadCoMasterAndSomeReplicas = "DeadCoMasterAndSomeReplicas" + UnreachableCoMaster = "UnreachableCoMaster" + AllCoMasterReplicasNotReplicating = "AllCoMasterReplicasNotReplicating" + DeadIntermediateMaster = "DeadIntermediateMaster" + DeadIntermediateMasterWithSingleReplica = "DeadIntermediateMasterWithSingleReplica" + DeadIntermediateMasterWithSingleReplicaFailingToConnect = "DeadIntermediateMasterWithSingleReplicaFailingToConnect" + DeadIntermediateMasterAndSomeReplicas = "DeadIntermediateMasterAndSomeReplicas" + DeadIntermediateMasterAndReplicas = "DeadIntermediateMasterAndReplicas" UnreachableIntermediateMasterWithLaggingReplicas = "UnreachableIntermediateMasterWithLaggingReplicas" UnreachableIntermediateMaster = "UnreachableIntermediateMaster" AllIntermediateMasterReplicasFailingToConnectOrDead = "AllIntermediateMasterReplicasFailingToConnectOrDead" From 7212bf601dc8843da58617f9f82186bb444c75cb Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 12:52:41 -0400 Subject: [PATCH 05/18] Split and rename config variables; introduce ReasonableStaleBinlogCoordinatesSeconds, RecoverLockedSemiSyncMaster and EnforceExactSemiSyncReplicas --- go/config/config.go | 10 +++++++--- go/inst/analysis.go | 7 +++++++ go/inst/analysis_dao.go | 4 ++-- go/logic/topology_recovery.go | 17 +++++------------ 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index 0235e97a1..861451d07 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -265,7 +265,7 @@ type Configuration struct { GraphitePollSeconds int // Graphite writes interval. 0 disables. URLPrefix string // URL prefix to run orchestrator on non-root web path, e.g. /orchestrator to put it behind nginx. DiscoveryIgnoreReplicaHostnameFilters []string // Regexp filters to apply to prevent auto-discovering new replicas. Usage: unreachable servers due to firewalls, applications which trigger binlog dumps - DiscoveryIgnoreMasterHostnameFilters []string // Regexp filters to apply to prevent auto-discovering a master. Usage: pointing your master temporarily to replicate seom data from external host + DiscoveryIgnoreMasterHostnameFilters []string // Regexp filters to apply to prevent auto-discovering a master. Usage: pointing your master temporarily to replicate some data from external host DiscoveryIgnoreHostnameFilters []string // Regexp filters to apply to prevent discovering instances of any kind ConsulAddress string // Address where Consul HTTP api is found. 
Example: 127.0.0.1:8500 ConsulScheme string // Scheme (http or https) for Consul @@ -277,7 +277,9 @@ type Configuration struct { KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" WebMessage string // If provided, will be shown on all web pages below the title bar MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas - EnforceSemiSyncReplicas string // If empty, semi-sync replicas will not be touched; if "exact", semi-sync replicas will be enabled/disabled to match the wait count; if "enough", semi-sync replicas will be enabled until the wait count is reached + EnforceExactSemiSyncReplicas bool // If true, semi-sync replicas will be enabled/disabled to match the wait count in the desired priority order; this applies to LockedSemiSyncMaster and MasterWithTooManySemiSyncReplicas + RecoverLockedSemiSync bool // If true, orchestrator will recover from a LockedSemiSync state by enabling semi-sync on replicas to match the wait count; this behavior can be overridden by EnforceExactSemiSyncReplicas + ReasonableStaleBinlogCoordinatesSeconds uint // Time to evaluate the LockedSemiSyncHypothesis before triggering the LockedSemiSync analysis; falls back to ReasonableReplicationLagSeconds if not set } // ToJSONString will marshal this configuration as JSON @@ -449,7 +451,9 @@ func newConfiguration() *Configuration { KVClusterMasterPrefix: "mysql/master", WebMessage: "", MaxConcurrentReplicaOperations: 5, - EnforceSemiSyncReplicas: "", + EnforceExactSemiSyncReplicas: false, + RecoverLockedSemiSync: false, + ReasonableStaleBinlogCoordinatesSeconds: 0, } } diff --git a/go/inst/analysis.go b/go/inst/analysis.go index d34d8f1d2..a4e525b8b 100644 --- a/go/inst/analysis.go +++ b/go/inst/analysis.go @@ -231,3 +231,10 @@ func (this *ReplicationAnalysis) GetAnalysisInstanceType() AnalysisInstanceType func ValidSecondsFromSeenToLastAttemptedCheck() uint { return config.Config.InstancePollSeconds + config.Config.ReasonableInstanceCheckSeconds } + +func ReasonableStaleBinlogCoordinatesSeconds() uint { + if config.Config.ReasonableStaleBinlogCoordinatesSeconds == 0 { + return uint(config.Config.ReasonableReplicationLagSeconds) + } + return config.Config.ReasonableStaleBinlogCoordinatesSeconds +} diff --git a/go/inst/analysis_dao.go b/go/inst/analysis_dao.go index 607193d9e..ce1343933 100644 --- a/go/inst/analysis_dao.go +++ b/go/inst/analysis_dao.go @@ -55,7 +55,7 @@ func initializeAnalysisDaoPostConfiguration() { func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) ([]ReplicationAnalysis, error) { result := []ReplicationAnalysis{} - args := sqlutils.Args(config.Config.ReasonableReplicationLagSeconds, ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, clusterName) + args := sqlutils.Args(ReasonableStaleBinlogCoordinatesSeconds(), ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, clusterName) analysisQueryReductionClause := `` if config.Config.ReduceReplicationAnalysisCount { @@ -531,7 +531,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.Description = "Semi sync master seems to be locked, more samplings needed to validate" } // - } else if config.Config.EnforceSemiSyncReplicas == config.EnforceExactSemiSyncReplicas && a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients > 
a.SemiSyncMasterWaitForReplicaCount { + } else if config.Config.EnforceExactSemiSyncReplicas && a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients > a.SemiSyncMasterWaitForReplicaCount { a.Analysis = MasterWithTooManySemiSyncReplicas a.Description = "Semi sync master has more semi sync replicas than configured" // diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 08c436b3b..d5064aaaf 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1484,14 +1484,13 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverLockedSemiSyncMaster.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - - if config.Config.EnforceSemiSyncReplicas != config.EnforceExactSemiSyncReplicas && config.Config.EnforceSemiSyncReplicas != config.EnforceEnoughSemiSyncReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync enforcement is not enabled on %+v. Will not issue another RecoverLockedSemiSyncMaster.", analysisEntry.AnalyzedInstanceKey)) - return false, nil, nil - } - if config.Config.EnforceSemiSyncReplicas == config.EnforceExactSemiSyncReplicas { + if config.Config.EnforceExactSemiSyncReplicas { return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) } + if !config.Config.RecoverLockedSemiSync { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no action taken to recover locked semi sync master on %+v. Enable RecoverLockedSemiSync or EnforceExactSemiSyncReplicas change this behavior.", analysisEntry.AnalyzedInstanceKey)) + return false, nil, err + } panic("not supported") @@ -1511,12 +1510,6 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverMasterWithTooManySemiSyncReplicas.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - - if config.Config.EnforceSemiSyncReplicas != config.EnforceExactSemiSyncReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi sync replica count enforcement not set to %s. 
Bailing out on recovery of %+v.", config.EnforceExactSemiSyncReplicas, analysisEntry.AnalyzedInstanceKey)) - return false, topologyRecovery, err - } - return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) } From 6fb8a1da75642a06da8893fbf542646e49c6c66d Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 12:54:05 -0400 Subject: [PATCH 06/18] Rename variable again --- go/config/config.go | 4 ++-- go/logic/topology_recovery.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index 861451d07..9efe4b2db 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -278,7 +278,7 @@ type Configuration struct { WebMessage string // If provided, will be shown on all web pages below the title bar MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas EnforceExactSemiSyncReplicas bool // If true, semi-sync replicas will be enabled/disabled to match the wait count in the desired priority order; this applies to LockedSemiSyncMaster and MasterWithTooManySemiSyncReplicas - RecoverLockedSemiSync bool // If true, orchestrator will recover from a LockedSemiSync state by enabling semi-sync on replicas to match the wait count; this behavior can be overridden by EnforceExactSemiSyncReplicas + RecoverLockedSemiSyncMaster bool // If true, orchestrator will recover from a LockedSemiSync state by enabling semi-sync on replicas to match the wait count; this behavior can be overridden by EnforceExactSemiSyncReplicas ReasonableStaleBinlogCoordinatesSeconds uint // Time to evaluate the LockedSemiSyncHypothesis before triggering the LockedSemiSync analysis; falls back to ReasonableReplicationLagSeconds if not set } @@ -452,7 +452,7 @@ func newConfiguration() *Configuration { WebMessage: "", MaxConcurrentReplicaOperations: 5, EnforceExactSemiSyncReplicas: false, - RecoverLockedSemiSync: false, + RecoverLockedSemiSyncMaster: false, ReasonableStaleBinlogCoordinatesSeconds: 0, } } diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index d5064aaaf..e01396aca 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1487,8 +1487,8 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, if config.Config.EnforceExactSemiSyncReplicas { return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) } - if !config.Config.RecoverLockedSemiSync { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no action taken to recover locked semi sync master on %+v. Enable RecoverLockedSemiSync or EnforceExactSemiSyncReplicas change this behavior.", analysisEntry.AnalyzedInstanceKey)) + if !config.Config.RecoverLockedSemiSyncMaster { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no action taken to recover locked semi sync master on %+v. 
Enable RecoverLockedSemiSyncMaster or EnforceExactSemiSyncReplicas change this behavior.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } From bc3cc8d66b0ce607bfd669032ca7ca0fd6da2912 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 16:49:02 -0400 Subject: [PATCH 07/18] Add logging; add check for replication running and last check valid --- go/config/config.go | 2 -- go/logic/topology_recovery.go | 38 ++++++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index 9efe4b2db..33cd087fc 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -62,8 +62,6 @@ const ( SelectTrueQuery = "select 1" ConsulKVsPerCluster = 5 // KVs: "/", "/hostname", "/ipv4", "/ipv6" and "/port" ConsulMaxTransactionOps = 64 - EnforceExactSemiSyncReplicas = "exact" - EnforceEnoughSemiSyncReplicas = "enough" ) var deprecatedConfigurationVariables = []string{ diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index e01396aca..406ae2569 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1485,7 +1485,7 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, return false, nil, err } if config.Config.EnforceExactSemiSyncReplicas { - return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry) } if !config.Config.RecoverLockedSemiSyncMaster { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no action taken to recover locked semi sync master on %+v. Enable RecoverLockedSemiSyncMaster or EnforceExactSemiSyncReplicas change this behavior.", analysisEntry.AnalyzedInstanceKey)) @@ -1510,10 +1510,10 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. 
Will not issue another RecoverMasterWithTooManySemiSyncReplicas.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry) } -func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) { +func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) { instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey) if !found || err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey)) @@ -1523,6 +1523,7 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn // Read all replicas replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("could not read replica instancesfor %+v: %s", analysisEntry.AnalyzedInstanceKey, err.Error())) return false, topologyRecovery, err } if len(replicas) == 0 { @@ -1533,12 +1534,34 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn // Filter out downtimed and down replicas desiredSemiSyncReplicaCount := analysisEntry.SemiSyncMasterWaitForReplicaCount possibleSemiSyncReplicas := make([]*inst.Instance, 0) + excludedReplicas := make([]*inst.Instance, 0) for _, replica := range replicas { - if replica.IsDowntimed || replica.SemiSyncEnforced == 0 { // TODO do i need to check last seen? 
+ if replica.IsDowntimed || replica.SemiSyncEnforced == 0 || !replica.IsLastCheckValid || !replica.ReplicaRunning() { + excludedReplicas = append(excludedReplicas, replica) + // TODO make this more resilient: re-read instance if its last check was invalid continue } possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) } + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d; we have %d valid possible semi-sync replicas and %d excluded replicas", desiredSemiSyncReplicaCount, len(possibleSemiSyncReplicas), len(excludedReplicas))) + if len(possibleSemiSyncReplicas) > 0 { + AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas:") + for _, replica := range possibleSemiSyncReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: downtimed = %t, semi-sync enforced priority = %d, last check valid = %t, replication runnning = %t", replica.Key.String(), replica.IsDowntimed, replica.SemiSyncEnforced, replica.IsLastCheckValid, replica.ReplicaRunning())) + } + } else { + AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas: (none)") + } + if len(excludedReplicas) > 0 { + AuditTopologyRecovery(topologyRecovery, "excluded replicas:") + for _, replica := range excludedReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: downtimed = %t, semi-sync enforced priority = %d, last check valid = %t, replication runnning = %t", replica.Key.String(), replica.IsDowntimed, replica.SemiSyncEnforced, replica.IsLastCheckValid, replica.ReplicaRunning())) + } + } else { + AuditTopologyRecovery(topologyRecovery, "excluded replicas: (none)") + } + + // Bail out if we cannot succeed if uint(len(possibleSemiSyncReplicas)) < desiredSemiSyncReplicaCount { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? @@ -1566,10 +1589,15 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn actions[replica] = false } } + if len(actions) == 0 { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot determine actions based on possible semi-sync replicas; cannot recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) + return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? 
+ } // Take action + AuditTopologyRecovery(topologyRecovery, "taking actions:") for replica, enable := range actions { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("replica %s: setting rpl_semi_sync_slave_enabled=%t (including potential IO thread restart) to match desired state for recovery of %s on %+v", replica.Key.String(), enable, analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: setting rpl_semi_sync_slave_enabled=%t (including potential IO thread restart)", replica.Key.String(), enable)) _, err := inst.SetSemiSyncReplica(&replica.Key, enable) if err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot change semi sync on replica %+v.", replica.Key)) From ac1b89985db4e82ac6f281fb08bc7897e55c7936 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 20:05:21 -0400 Subject: [PATCH 08/18] Re-order; add logging --- go/logic/topology_recovery.go | 54 +++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 406ae2569..84f55a3cd 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1543,32 +1543,9 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn } possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) } - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d; we have %d valid possible semi-sync replicas and %d excluded replicas", desiredSemiSyncReplicaCount, len(possibleSemiSyncReplicas), len(excludedReplicas))) - if len(possibleSemiSyncReplicas) > 0 { - AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas:") - for _, replica := range possibleSemiSyncReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: downtimed = %t, semi-sync enforced priority = %d, last check valid = %t, replication runnning = %t", replica.Key.String(), replica.IsDowntimed, replica.SemiSyncEnforced, replica.IsLastCheckValid, replica.ReplicaRunning())) - } - } else { - AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas: (none)") - } - if len(excludedReplicas) > 0 { - AuditTopologyRecovery(topologyRecovery, "excluded replicas:") - for _, replica := range excludedReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: downtimed = %t, semi-sync enforced priority = %d, last check valid = %t, replication runnning = %t", replica.Key.String(), replica.IsDowntimed, replica.SemiSyncEnforced, replica.IsLastCheckValid, replica.ReplicaRunning())) - } - } else { - AuditTopologyRecovery(topologyRecovery, "excluded replicas: (none)") - } - - // Bail out if we cannot succeed - if uint(len(possibleSemiSyncReplicas)) < desiredSemiSyncReplicaCount { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) - return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? 
- } // Sort replicas by priority, promotion rule and name - sort.SliceStable(possibleSemiSyncReplicas, func(i, j int) bool { + sort.Slice(possibleSemiSyncReplicas, func(i, j int) bool { if possibleSemiSyncReplicas[i].SemiSyncEnforced != possibleSemiSyncReplicas[j].SemiSyncEnforced { return possibleSemiSyncReplicas[i].SemiSyncEnforced < possibleSemiSyncReplicas[j].SemiSyncEnforced } @@ -1589,6 +1566,31 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn actions[replica] = false } } + + // Summarize what we've determined + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d; we have %d valid possible semi-sync replica(s) and %d excluded replica(s)", desiredSemiSyncReplicaCount, len(possibleSemiSyncReplicas), len(excludedReplicas))) + if len(possibleSemiSyncReplicas) > 0 { + AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas (in priority order):") + for _, replica := range possibleSemiSyncReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncEnforced, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) + } + } else { + AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas: (none)") + } + if len(excludedReplicas) > 0 { + AuditTopologyRecovery(topologyRecovery, "excluded replicas:") + for _, replica := range excludedReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncEnforced, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) + } + } else { + AuditTopologyRecovery(topologyRecovery, "excluded replicas: (none)") + } + + // Bail out if we cannot succeed + if uint(len(possibleSemiSyncReplicas)) < desiredSemiSyncReplicaCount { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) + return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? + } if len(actions) == 0 { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot determine actions based on possible semi-sync replicas; cannot recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? 
@@ -1597,7 +1599,7 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn // Take action AuditTopologyRecovery(topologyRecovery, "taking actions:") for replica, enable := range actions { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: setting rpl_semi_sync_slave_enabled=%t (including potential IO thread restart)", replica.Key.String(), enable)) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: setting rpl_semi_sync_slave_enabled=%t, restarting slave_io thread", replica.Key.String(), enable)) _, err := inst.SetSemiSyncReplica(&replica.Key, enable) if err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot change semi sync on replica %+v.", replica.Key)) @@ -1613,6 +1615,8 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn } resolveRecovery(topologyRecovery, instance) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("recovery complete; success = %t", topologyRecovery.IsSuccessful)) + return true, topologyRecovery, nil } From 7f7045880e78b2b7d87057228b80e77d787e5436 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 20:07:59 -0400 Subject: [PATCH 09/18] Rename field --- go/inst/instance.go | 2 +- go/inst/instance_dao.go | 6 +++--- go/inst/instance_topology_dao.go | 8 ++++---- go/logic/topology_recovery.go | 10 +++++----- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/go/inst/instance.go b/go/inst/instance.go index 3e91e9cb8..521895eb8 100644 --- a/go/inst/instance.go +++ b/go/inst/instance.go @@ -94,7 +94,7 @@ type Instance struct { HasReplicationCredentials bool ReplicationCredentialsAvailable bool SemiSyncAvailable bool // when both semi sync plugins (master & replica) are loaded - SemiSyncEnforced uint + SemiSyncPriority uint SemiSyncMasterEnabled bool SemiSyncReplicaEnabled bool SemiSyncMasterTimeout uint64 diff --git a/go/inst/instance_dao.go b/go/inst/instance_dao.go index 379fa63e3..c3c2cab8a 100644 --- a/go/inst/instance_dao.go +++ b/go/inst/instance_dao.go @@ -776,7 +776,7 @@ func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, bufferWrites bool, waitGroup.Add(1) go func() { defer waitGroup.Done() - err := db.QueryRow(config.Config.DetectSemiSyncEnforcedQuery).Scan(&instance.SemiSyncEnforced) + err := db.QueryRow(config.Config.DetectSemiSyncEnforcedQuery).Scan(&instance.SemiSyncPriority) logReadTopologyInstanceError(instanceKey, "DetectSemiSyncEnforcedQuery", err) }() } @@ -1209,7 +1209,7 @@ func readInstanceRow(m sqlutils.RowMap) *Instance { instance.DataCenter = m.GetString("data_center") instance.Region = m.GetString("region") instance.PhysicalEnvironment = m.GetString("physical_environment") - instance.SemiSyncEnforced = m.GetUint("semi_sync_enforced") + instance.SemiSyncPriority = m.GetUint("semi_sync_enforced") instance.SemiSyncAvailable = m.GetBool("semi_sync_available") instance.SemiSyncMasterEnabled = m.GetBool("semi_sync_master_enabled") instance.SemiSyncMasterTimeout = m.GetUint64("semi_sync_master_timeout") @@ -2610,7 +2610,7 @@ func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bo args = append(args, instance.ReplicationCredentialsAvailable) args = append(args, instance.HasReplicationCredentials) args = append(args, instance.AllowTLS) - args = append(args, instance.SemiSyncEnforced) + args = append(args, instance.SemiSyncPriority) args = append(args, instance.SemiSyncAvailable) args = append(args, instance.SemiSyncMasterEnabled) args = append(args, instance.SemiSyncMasterTimeout) diff --git 
a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go index d3a812028..9c95a50c1 100644 --- a/go/inst/instance_topology_dao.go +++ b/go/inst/instance_topology_dao.go @@ -456,7 +456,7 @@ func StartReplication(instanceKey *InstanceKey) (*Instance, error) { // some replicas (those that must never be promoted) should never ACK. // Note: We assume that replicas use 'skip-slave-start' so they won't // START SLAVE on their own upon restart. - if instance.SemiSyncEnforced > 0 { + if instance.SemiSyncPriority > 0 { // Send ACK only from promotable instances. sendACK := instance.PromotionRule != MustNotPromoteRule // Always disable master setting, in case we're converting a former master. @@ -550,7 +550,7 @@ func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoor log.Infof("Will start replication on %+v until coordinates: %+v", instanceKey, masterCoordinates) - if instance.SemiSyncEnforced > 0 { + if instance.SemiSyncPriority > 0 { // Send ACK only from promotable instances. sendACK := instance.PromotionRule != MustNotPromoteRule // Always disable master setting, in case we're converting a former master. @@ -1090,7 +1090,7 @@ func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { // If async fallback is disallowed, we're responsible for flipping the master // semi-sync switch ON before accepting writes. The setting is off by default. - if instance.SemiSyncEnforced > 0 && !readOnly { + if instance.SemiSyncPriority > 0 && !readOnly { // Send ACK only from promotable instances. sendACK := instance.PromotionRule != MustNotPromoteRule if err := EnableSemiSync(instanceKey, true, sendACK); err != nil { @@ -1114,7 +1114,7 @@ func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { // If we just went read-only, it's safe to flip the master semi-sync switch // OFF, which is the default value so that replicas can make progress. - if instance.SemiSyncEnforced > 0 && readOnly { + if instance.SemiSyncPriority > 0 && readOnly { // Send ACK only from promotable instances. 
sendACK := instance.PromotionRule != MustNotPromoteRule if err := EnableSemiSync(instanceKey, false, sendACK); err != nil { diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 84f55a3cd..401119173 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1536,7 +1536,7 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn possibleSemiSyncReplicas := make([]*inst.Instance, 0) excludedReplicas := make([]*inst.Instance, 0) for _, replica := range replicas { - if replica.IsDowntimed || replica.SemiSyncEnforced == 0 || !replica.IsLastCheckValid || !replica.ReplicaRunning() { + if replica.IsDowntimed || replica.SemiSyncPriority == 0 || !replica.IsLastCheckValid || !replica.ReplicaRunning() { excludedReplicas = append(excludedReplicas, replica) // TODO make this more resilient: re-read instance if its last check was invalid continue @@ -1546,8 +1546,8 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn // Sort replicas by priority, promotion rule and name sort.Slice(possibleSemiSyncReplicas, func(i, j int) bool { - if possibleSemiSyncReplicas[i].SemiSyncEnforced != possibleSemiSyncReplicas[j].SemiSyncEnforced { - return possibleSemiSyncReplicas[i].SemiSyncEnforced < possibleSemiSyncReplicas[j].SemiSyncEnforced + if possibleSemiSyncReplicas[i].SemiSyncPriority != possibleSemiSyncReplicas[j].SemiSyncPriority { + return possibleSemiSyncReplicas[i].SemiSyncPriority < possibleSemiSyncReplicas[j].SemiSyncPriority } if possibleSemiSyncReplicas[i].PromotionRule != possibleSemiSyncReplicas[j].PromotionRule { return possibleSemiSyncReplicas[i].PromotionRule.BetterThan(possibleSemiSyncReplicas[j].PromotionRule) @@ -1572,7 +1572,7 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn if len(possibleSemiSyncReplicas) > 0 { AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas (in priority order):") for _, replica := range possibleSemiSyncReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncEnforced, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) } } else { AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas: (none)") @@ -1580,7 +1580,7 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn if len(excludedReplicas) > 0 { AuditTopologyRecovery(topologyRecovery, "excluded replicas:") for _, replica := range excludedReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncEnforced, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last 
check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) } } else { AuditTopologyRecovery(topologyRecovery, "excluded replicas: (none)") From 7916658ce51c6d9bd850565a9d57b044efd0af71 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 20:22:43 -0400 Subject: [PATCH 10/18] Also handle async replicas --- go/logic/topology_recovery.go | 44 ++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 401119173..9f846a9da 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1534,14 +1534,17 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn // Filter out downtimed and down replicas desiredSemiSyncReplicaCount := analysisEntry.SemiSyncMasterWaitForReplicaCount possibleSemiSyncReplicas := make([]*inst.Instance, 0) + asyncReplicas := make([]*inst.Instance, 0) excludedReplicas := make([]*inst.Instance, 0) for _, replica := range replicas { - if replica.IsDowntimed || replica.SemiSyncPriority == 0 || !replica.IsLastCheckValid || !replica.ReplicaRunning() { + if replica.IsDowntimed || !replica.IsLastCheckValid || !replica.ReplicaRunning() { excludedReplicas = append(excludedReplicas, replica) // TODO make this more resilient: re-read instance if its last check was invalid - continue + } else if replica.SemiSyncPriority == 0 { + asyncReplicas = append(asyncReplicas, replica) + } else { + possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) } - possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) } // Sort replicas by priority, promotion rule and name @@ -1566,25 +1569,17 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn actions[replica] = false } } + for _, replica := range asyncReplicas { + if replica.SemiSyncReplicaEnabled { + actions[replica] = false + } + } // Summarize what we've determined AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d; we have %d valid possible semi-sync replica(s) and %d excluded replica(s)", desiredSemiSyncReplicaCount, len(possibleSemiSyncReplicas), len(excludedReplicas))) - if len(possibleSemiSyncReplicas) > 0 { - AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas (in priority order):") - for _, replica := range possibleSemiSyncReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) - } - } else { - AuditTopologyRecovery(topologyRecovery, "valid possible semi-sync replicas: (none)") - } - if len(excludedReplicas) > 0 { - AuditTopologyRecovery(topologyRecovery, "excluded replicas:") - for _, replica := range excludedReplicas { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) - } - } else { - 
AuditTopologyRecovery(topologyRecovery, "excluded replicas: (none)") - } + logReplicas(topologyRecovery, "possible semi-sync replicas (in priority order)", possibleSemiSyncReplicas) + logReplicas(topologyRecovery, "always-async replicas", asyncReplicas) + logReplicas(topologyRecovery, "excluded replicas (downtimed/defunct)", excludedReplicas) // Bail out if we cannot succeed if uint(len(possibleSemiSyncReplicas)) < desiredSemiSyncReplicaCount { @@ -1620,6 +1615,17 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn return true, topologyRecovery, nil } +func logReplicas(topologyRecovery *TopologyRecovery, description string, replicas []*inst.Instance) { + if len(replicas) > 0 { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%s:", description)) + for _, replica := range replicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())) + } + } else { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%s: (none)", description)) + } +} + // checkAndRecoverGenericProblem is a general-purpose recovery function func checkAndRecoverGenericProblem(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { return false, nil, nil From e6893912d2843a97ca7025f9c6561d8b0755349a Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 20:47:30 -0400 Subject: [PATCH 11/18] Split out classify function --- go/logic/topology_recovery.go | 59 ++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 9f846a9da..8c8077e79 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1531,34 +1531,11 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn return false, topologyRecovery, nil } - // Filter out downtimed and down replicas - desiredSemiSyncReplicaCount := analysisEntry.SemiSyncMasterWaitForReplicaCount - possibleSemiSyncReplicas := make([]*inst.Instance, 0) - asyncReplicas := make([]*inst.Instance, 0) - excludedReplicas := make([]*inst.Instance, 0) - for _, replica := range replicas { - if replica.IsDowntimed || !replica.IsLastCheckValid || !replica.ReplicaRunning() { - excludedReplicas = append(excludedReplicas, replica) - // TODO make this more resilient: re-read instance if its last check was invalid - } else if replica.SemiSyncPriority == 0 { - asyncReplicas = append(asyncReplicas, replica) - } else { - possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) - } - } - - // Sort replicas by priority, promotion rule and name - sort.Slice(possibleSemiSyncReplicas, func(i, j int) bool { - if possibleSemiSyncReplicas[i].SemiSyncPriority != possibleSemiSyncReplicas[j].SemiSyncPriority { - return possibleSemiSyncReplicas[i].SemiSyncPriority < possibleSemiSyncReplicas[j].SemiSyncPriority - } - if possibleSemiSyncReplicas[i].PromotionRule != possibleSemiSyncReplicas[j].PromotionRule { - return possibleSemiSyncReplicas[i].PromotionRule.BetterThan(possibleSemiSyncReplicas[j].PromotionRule) - } - return strings.Compare(possibleSemiSyncReplicas[i].Key.String(), 
possibleSemiSyncReplicas[j].Key.String()) < 0 - }) + // Classify and prioritize replicas + possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := classifyAndPrioritizeReplicas(replicas) // Figure out which replicas need to be acted upon + desiredSemiSyncReplicaCount := analysisEntry.SemiSyncMasterWaitForReplicaCount actions := make(map[*inst.Instance]bool, 0) for i, replica := range possibleSemiSyncReplicas { isSemiSyncEnabled := replica.SemiSyncReplicaEnabled @@ -1615,6 +1592,36 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn return true, topologyRecovery, nil } +func classifyAndPrioritizeReplicas(replicas []*inst.Instance) (possibleSemiSyncReplicas []*inst.Instance, asyncReplicas []*inst.Instance, excludedReplicas[]*inst.Instance) { + // Filter out downtimed and down replicas + possibleSemiSyncReplicas = make([]*inst.Instance, 0) + asyncReplicas = make([]*inst.Instance, 0) + excludedReplicas = make([]*inst.Instance, 0) + for _, replica := range replicas { + if replica.IsDowntimed || !replica.IsLastCheckValid || !replica.ReplicaRunning() { + excludedReplicas = append(excludedReplicas, replica) + // TODO make this more resilient: re-read instance if its last check was invalid + } else if replica.SemiSyncPriority == 0 { + asyncReplicas = append(asyncReplicas, replica) + } else { + possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) + } + } + + // Sort replicas by priority, promotion rule and name + sort.Slice(possibleSemiSyncReplicas, func(i, j int) bool { + if possibleSemiSyncReplicas[i].SemiSyncPriority != possibleSemiSyncReplicas[j].SemiSyncPriority { + return possibleSemiSyncReplicas[i].SemiSyncPriority < possibleSemiSyncReplicas[j].SemiSyncPriority + } + if possibleSemiSyncReplicas[i].PromotionRule != possibleSemiSyncReplicas[j].PromotionRule { + return possibleSemiSyncReplicas[i].PromotionRule.BetterThan(possibleSemiSyncReplicas[j].PromotionRule) + } + return strings.Compare(possibleSemiSyncReplicas[i].Key.String(), possibleSemiSyncReplicas[j].Key.String()) < 0 + }) + + return +} + func logReplicas(topologyRecovery *TopologyRecovery, description string, replicas []*inst.Instance) { if len(replicas) > 0 { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%s:", description)) From 684b02b3e93f344ac85ff45d846cb8fb2bb183e7 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 21:10:24 -0400 Subject: [PATCH 12/18] Implement RecoverLockedSemiSyncMaster without exact counts (enable-only mode) --- go/logic/topology_recovery.go | 63 ++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 8c8077e79..cc41c7f14 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1485,22 +1485,13 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, return false, nil, err } if config.Config.EnforceExactSemiSyncReplicas { - return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry) + return recoverSemiSyncReplicas(topologyRecovery, analysisEntry, true) } if !config.Config.RecoverLockedSemiSyncMaster { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no action taken to recover locked semi sync master on %+v. 
Enable RecoverLockedSemiSyncMaster or EnforceExactSemiSyncReplicas change this behavior.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - - panic("not supported") - - // Proceed with "enough"-style recovery: - // in priority order, enable semi-sync on replicas until there are enough to match the wait count - - // TODO Jun 29 12:05:14 pheckel-devm-orch-1 orchestrator[564933]: 2021-06-29 12:05:14 INFO checkAndExecuteFailureDetectionProcesses: could not register LockedSemiSyncMaster detection on pheckel-devm-db-g0-1:3306 - // TODO IMPLEMENT THIS - - return true, topologyRecovery, nil + return recoverSemiSyncReplicas(topologyRecovery, analysisEntry, false) } // checkAndRecoverMasterWithTooManySemiSyncReplicas @@ -1510,10 +1501,10 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverMasterWithTooManySemiSyncReplicas.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - return recoverExactSemiSyncReplicas(topologyRecovery, analysisEntry) + return recoverSemiSyncReplicas(topologyRecovery, analysisEntry, true) } -func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) { +func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, allowDisable bool) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) { instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey) if !found || err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey)) @@ -1523,11 +1514,11 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn // Read all replicas replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey) if err != nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("could not read replica instancesfor %+v: %s", analysisEntry.AnalyzedInstanceKey, err.Error())) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("could not read replica instances for %+v: %s", analysisEntry.AnalyzedInstanceKey, err.Error())) return false, topologyRecovery, err } if len(replicas) == 0 { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no replicas found for %+v.", analysisEntry.AnalyzedInstanceKey)) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no replicas found for %+v; cannot recover", analysisEntry.AnalyzedInstanceKey)) return false, topologyRecovery, nil } @@ -1535,31 +1526,43 @@ func recoverExactSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEn possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := classifyAndPrioritizeReplicas(replicas) // Figure out which replicas need to be acted upon - desiredSemiSyncReplicaCount := analysisEntry.SemiSyncMasterWaitForReplicaCount - actions := make(map[*inst.Instance]bool, 0) - for i, replica := range possibleSemiSyncReplicas { - isSemiSyncEnabled := replica.SemiSyncReplicaEnabled - shouldSemiSyncBeEnabled := uint(i) < desiredSemiSyncReplicaCount - if shouldSemiSyncBeEnabled && !isSemiSyncEnabled { - actions[replica] = true - } else if !shouldSemiSyncBeEnabled && isSemiSyncEnabled { - actions[replica] = false + actions := make(map[*inst.Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync + if allowDisable { + for 
i, replica := range possibleSemiSyncReplicas { + isSemiSyncEnabled := replica.SemiSyncReplicaEnabled + shouldSemiSyncBeEnabled := uint(i) < analysisEntry.SemiSyncMasterWaitForReplicaCount + if shouldSemiSyncBeEnabled && !isSemiSyncEnabled { + actions[replica] = true + } else if allowDisable && !shouldSemiSyncBeEnabled && isSemiSyncEnabled { + actions[replica] = false + } } - } - for _, replica := range asyncReplicas { - if replica.SemiSyncReplicaEnabled { - actions[replica] = false + for _, replica := range asyncReplicas { + if replica.SemiSyncReplicaEnabled { + actions[replica] = false + } + } + } else { + enabled := uint(0) + for _, replica := range possibleSemiSyncReplicas { + if !replica.SemiSyncReplicaEnabled { + actions[replica] = true + enabled++ + } + if enabled == analysisEntry.SemiSyncMasterWaitForReplicaCount - analysisEntry.SemiSyncMasterClients { + break + } } } // Summarize what we've determined - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d; we have %d valid possible semi-sync replica(s) and %d excluded replica(s)", desiredSemiSyncReplicaCount, len(possibleSemiSyncReplicas), len(excludedReplicas))) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d, currently we have %d semi-sync replica(s)", analysisEntry.SemiSyncMasterWaitForReplicaCount, analysisEntry.SemiSyncMasterClients)) logReplicas(topologyRecovery, "possible semi-sync replicas (in priority order)", possibleSemiSyncReplicas) logReplicas(topologyRecovery, "always-async replicas", asyncReplicas) logReplicas(topologyRecovery, "excluded replicas (downtimed/defunct)", excludedReplicas) // Bail out if we cannot succeed - if uint(len(possibleSemiSyncReplicas)) < desiredSemiSyncReplicaCount { + if uint(len(possibleSemiSyncReplicas)) < analysisEntry.SemiSyncMasterWaitForReplicaCount { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey)) return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this? } From 32835c77a37a4fcbc41aafbb3fc1f6fbd20cd17b Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Tue, 29 Jun 2021 21:11:06 -0400 Subject: [PATCH 13/18] gofmt --- go/logic/topology_recovery.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index cc41c7f14..7faad8d16 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1491,7 +1491,7 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no action taken to recover locked semi sync master on %+v. 
Enable RecoverLockedSemiSyncMaster or EnforceExactSemiSyncReplicas change this behavior.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - return recoverSemiSyncReplicas(topologyRecovery, analysisEntry, false) + return recoverSemiSyncReplicas(topologyRecovery, analysisEntry, false) } // checkAndRecoverMasterWithTooManySemiSyncReplicas @@ -1549,7 +1549,7 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i actions[replica] = true enabled++ } - if enabled == analysisEntry.SemiSyncMasterWaitForReplicaCount - analysisEntry.SemiSyncMasterClients { + if enabled == analysisEntry.SemiSyncMasterWaitForReplicaCount-analysisEntry.SemiSyncMasterClients { break } } @@ -1595,7 +1595,7 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i return true, topologyRecovery, nil } -func classifyAndPrioritizeReplicas(replicas []*inst.Instance) (possibleSemiSyncReplicas []*inst.Instance, asyncReplicas []*inst.Instance, excludedReplicas[]*inst.Instance) { +func classifyAndPrioritizeReplicas(replicas []*inst.Instance) (possibleSemiSyncReplicas []*inst.Instance, asyncReplicas []*inst.Instance, excludedReplicas []*inst.Instance) { // Filter out downtimed and down replicas possibleSemiSyncReplicas = make([]*inst.Instance, 0) asyncReplicas = make([]*inst.Instance, 0) From 303c9dc97dc6a4b8726b7403d144e3422661a988 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Wed, 30 Jun 2021 15:59:28 -0400 Subject: [PATCH 14/18] Rename variable --- go/logic/topology_recovery.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 7faad8d16..4f31c6b1e 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1504,7 +1504,7 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica return recoverSemiSyncReplicas(topologyRecovery, analysisEntry, true) } -func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, allowDisable bool) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) { +func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, exactReplicaTopology bool) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) { instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey) if !found || err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey)) @@ -1527,13 +1527,13 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i // Figure out which replicas need to be acted upon actions := make(map[*inst.Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync - if allowDisable { + if exactReplicaTopology { for i, replica := range possibleSemiSyncReplicas { isSemiSyncEnabled := replica.SemiSyncReplicaEnabled shouldSemiSyncBeEnabled := uint(i) < analysisEntry.SemiSyncMasterWaitForReplicaCount if shouldSemiSyncBeEnabled && !isSemiSyncEnabled { actions[replica] = true - } else if allowDisable && !shouldSemiSyncBeEnabled && isSemiSyncEnabled { + } else if exactReplicaTopology && !shouldSemiSyncBeEnabled && isSemiSyncEnabled { actions[replica] = false } } From 1b7ed20e83965196495d1ce8514df74f8d918228 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Thu, 1 Jul 2021 13:58:13 -0400 Subject: [PATCH 15/18] Make setting the semi-sync flag work during failover 
--- go/inst/instance_topology_dao.go | 164 +++++++++++++++++++++++++------ go/logic/topology_recovery.go | 65 +----------- 2 files changed, 136 insertions(+), 93 deletions(-) diff --git a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go index 9c95a50c1..aa1b99d1f 100644 --- a/go/inst/instance_topology_dao.go +++ b/go/inst/instance_topology_dao.go @@ -20,6 +20,7 @@ import ( "context" "database/sql" "fmt" + "sort" "strings" "time" @@ -456,13 +457,9 @@ func StartReplication(instanceKey *InstanceKey) (*Instance, error) { // some replicas (those that must never be promoted) should never ACK. // Note: We assume that replicas use 'skip-slave-start' so they won't // START SLAVE on their own upon restart. - if instance.SemiSyncPriority > 0 { - // Send ACK only from promotable instances. - sendACK := instance.PromotionRule != MustNotPromoteRule - // Always disable master setting, in case we're converting a former master. - if err := EnableSemiSync(instanceKey, false, sendACK); err != nil { - return instance, log.Errore(err) - } + instance, err = MaybeEnableSemiSyncReplica(instance) + if err != nil { + return instance, log.Errore(err) } _, err = ExecInstance(instanceKey, `start slave`) @@ -534,7 +531,7 @@ func WaitForExecBinlogCoordinatesToReach(instanceKey *InstanceKey, coordinates * } } -// StartReplicationUntilMasterCoordinates issuesa START SLAVE UNTIL... statement on given instance +// StartReplicationUntilMasterCoordinates issues a START SLAVE UNTIL... statement on given instance func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoordinates *BinlogCoordinates) (*Instance, error) { instance, err := ReadTopologyInstance(instanceKey) if err != nil { @@ -550,13 +547,9 @@ func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoor log.Infof("Will start replication on %+v until coordinates: %+v", instanceKey, masterCoordinates) - if instance.SemiSyncPriority > 0 { - // Send ACK only from promotable instances. - sendACK := instance.PromotionRule != MustNotPromoteRule - // Always disable master setting, in case we're converting a former master. - if err := EnableSemiSync(instanceKey, false, sendACK); err != nil { - return instance, log.Errore(err) - } + instance, err = MaybeEnableSemiSyncReplica(instance) + if err != nil { + return instance, log.Errore(err) } // MariaDB has a bug: a CHANGE MASTER TO statement does not work properly with prepared statement... :P @@ -584,14 +577,124 @@ func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoor return instance, err } -// EnableSemiSync sets the rpl_semi_sync_(master|replica)_enabled variables -// on a given instance. -func EnableSemiSync(instanceKey *InstanceKey, master, replica bool) error { - log.Infof("instance %+v rpl_semi_sync_master_enabled: %t, rpl_semi_sync_slave_enabled: %t", instanceKey, master, replica) - _, err := ExecInstance(instanceKey, - `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, - master, replica) - return err +// MaybeEnableSemiSyncReplica sets the rpl_semi_sync_(master|replica)_enabled variables on a given instance based on the +// config and state of the world. +func MaybeEnableSemiSyncReplica(instance *Instance) (*Instance, error) { + // Backwards compatible logic: Enable semi-sync if SemiSyncPriority > 0 (formerly SemiSyncEnforced) + // Note that this logic NEVER enables semi-sync if the promotion rule is "must_not". 
+ if !config.Config.EnforceExactSemiSyncReplicas && !config.Config.RecoverLockedSemiSyncMaster && instance.SemiSyncPriority > 0 { + // Send ACK only from promotable instances; always disable master setting, in case we're converting a former master. + enableReplica := instance.PromotionRule != MustNotPromoteRule + log.Infof("semi-sync: %+v: setting rpl_semi_sync_master_enabled = %t, rpl_semi_sync_slave_enabled = %t (legacy behavior)", &instance.Key, false, enableReplica) + _, err := ExecInstance(&instance.Key, `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, false, enableReplica) + if err != nil { + return instance, log.Errore(err) + } + } + + // New logic: If EnforceExactSemiSyncReplicas or RecoverLockedSemiSyncMaster are set, we enable semi-sync only if the + // given replica instance is in the list of replicas to have semi-sync enabled (according to the priority). + masterInstance, err := ReadTopologyInstance(&instance.MasterKey) + if err != nil { + return instance, log.Errore(err) + } + replicas, err := ReadReplicaInstances(&instance.MasterKey) + if err != nil { + return instance, log.Errore(err) + } + + possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := ClassifyAndPrioritizeReplicas(replicas, false) + actions := DetermineSemiSyncReplicaActions(possibleSemiSyncReplicas, asyncReplicas, masterInstance.SemiSyncMasterWaitForReplicaCount, masterInstance.SemiSyncMasterClients, config.Config.EnforceExactSemiSyncReplicas) + + log.Debugf("semi-sync: determining if %+v should enable rpl_semi_sync_slave_enabled or not", instance.Key) + log.Debugf("semi-sync: master = %+v, master semi-sync wait count = %d, master semi-sync replica count = %d", instance.MasterKey, masterInstance.SemiSyncMasterWaitForReplicaCount, masterInstance.SemiSyncMasterClients) + log.Debug("semi-sync: analysis:") + debugReplicas("semi-sync: possible semi-sync replicas (in priority order)", possibleSemiSyncReplicas) + debugReplicas("semi-sync: always-async replicas", asyncReplicas) + debugReplicas("semi-sync: excluded replicas (downtimed/defunct)", excludedReplicas) + + for replica, enable := range actions { + if replica.Key.Equals(&instance.Key) { + log.Infof("semi-sync: %s: setting rpl_semi_sync_slave_enabled: %t", &instance.Key, enable) + return SetSemiSyncReplica(&instance.Key, enable) // override "instance", since we re-read it in this function + } + } + + log.Infof("semi-sync: %+v: no action taken", &instance.Key) + return instance, nil +} + +func debugReplicas(description string, replicas []*Instance) { + if len(replicas) > 0 { + log.Debugf("semi-sync: %s:", description) + for _, replica := range replicas { + log.Debugf("semi-sync: - %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning()) + } + } else { + log.Debugf("semi-sync: %s: (none)", description) + } +} + +func ClassifyAndPrioritizeReplicas(replicas []*Instance, excludeNotReplicatingReplicas bool) (possibleSemiSyncReplicas []*Instance, asyncReplicas []*Instance, excludedReplicas []*Instance) { + // Filter out downtimed and down replicas + possibleSemiSyncReplicas = make([]*Instance, 0) + asyncReplicas = make([]*Instance, 0) + excludedReplicas = make([]*Instance, 0) + for _, replica := range replicas { + if replica.IsDowntimed || !replica.IsLastCheckValid || 
(excludeNotReplicatingReplicas && !replica.ReplicaRunning()) { + excludedReplicas = append(excludedReplicas, replica) + // TODO make this more resilient: re-read instance if its last check was invalid + } else if replica.SemiSyncPriority == 0 { + asyncReplicas = append(asyncReplicas, replica) + } else { + possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) + } + } + + // Sort replicas by priority, promotion rule and name + sort.Slice(possibleSemiSyncReplicas, func(i, j int) bool { + if possibleSemiSyncReplicas[i].SemiSyncPriority != possibleSemiSyncReplicas[j].SemiSyncPriority { + return possibleSemiSyncReplicas[i].SemiSyncPriority < possibleSemiSyncReplicas[j].SemiSyncPriority + } + if possibleSemiSyncReplicas[i].PromotionRule != possibleSemiSyncReplicas[j].PromotionRule { + return possibleSemiSyncReplicas[i].PromotionRule.BetterThan(possibleSemiSyncReplicas[j].PromotionRule) + } + return strings.Compare(possibleSemiSyncReplicas[i].Key.String(), possibleSemiSyncReplicas[j].Key.String()) < 0 + }) + + return +} + +func DetermineSemiSyncReplicaActions(possibleSemiSyncReplicas []*Instance, asyncReplicas []*Instance, waitCount uint, currentSemiSyncReplicas uint, exactReplicaTopology bool) map[*Instance]bool { + actions := make(map[*Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync + if exactReplicaTopology { + for i, replica := range possibleSemiSyncReplicas { + isSemiSyncEnabled := replica.SemiSyncReplicaEnabled + shouldSemiSyncBeEnabled := uint(i) < waitCount + if shouldSemiSyncBeEnabled && !isSemiSyncEnabled { + actions[replica] = true + } else if exactReplicaTopology && !shouldSemiSyncBeEnabled && isSemiSyncEnabled { + actions[replica] = false + } + } + for _, replica := range asyncReplicas { + if replica.SemiSyncReplicaEnabled { + actions[replica] = false + } + } + } else { + enabled := uint(0) + for _, replica := range possibleSemiSyncReplicas { + if !replica.SemiSyncReplicaEnabled { + actions[replica] = true + enabled++ + } + if enabled == waitCount-currentSemiSyncReplicas { + break + } + } + } + return actions } // DelayReplication set the replication delay given seconds @@ -1090,10 +1193,8 @@ func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { // If async fallback is disallowed, we're responsible for flipping the master // semi-sync switch ON before accepting writes. The setting is off by default. - if instance.SemiSyncPriority > 0 && !readOnly { - // Send ACK only from promotable instances. - sendACK := instance.PromotionRule != MustNotPromoteRule - if err := EnableSemiSync(instanceKey, true, sendACK); err != nil { + if instance.SemiSyncPriority > 0 && !readOnly { // TODO I feel like this flag is a little overloaded: this is used here to determine if the MASTER's semi-sync is enabled... + if _, err := SetSemiSyncMaster(instanceKey, true); err != nil { return instance, log.Errore(err) } } @@ -1111,13 +1212,14 @@ func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { } } instance, err = ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } // If we just went read-only, it's safe to flip the master semi-sync switch // OFF, which is the default value so that replicas can make progress. - if instance.SemiSyncPriority > 0 && readOnly { - // Send ACK only from promotable instances. 
- sendACK := instance.PromotionRule != MustNotPromoteRule - if err := EnableSemiSync(instanceKey, false, sendACK); err != nil { + if instance.SemiSyncPriority > 0 && readOnly { // TODO I feel like this flag is a little overloaded: this is used here to determine if the MASTER's semi-sync is enabled... + if _, err := SetSemiSyncMaster(instanceKey, false); err != nil { return instance, log.Errore(err) } } diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 4f31c6b1e..47b1f92fb 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -1522,38 +1522,9 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i return false, topologyRecovery, nil } - // Classify and prioritize replicas - possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := classifyAndPrioritizeReplicas(replicas) - - // Figure out which replicas need to be acted upon - actions := make(map[*inst.Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync - if exactReplicaTopology { - for i, replica := range possibleSemiSyncReplicas { - isSemiSyncEnabled := replica.SemiSyncReplicaEnabled - shouldSemiSyncBeEnabled := uint(i) < analysisEntry.SemiSyncMasterWaitForReplicaCount - if shouldSemiSyncBeEnabled && !isSemiSyncEnabled { - actions[replica] = true - } else if exactReplicaTopology && !shouldSemiSyncBeEnabled && isSemiSyncEnabled { - actions[replica] = false - } - } - for _, replica := range asyncReplicas { - if replica.SemiSyncReplicaEnabled { - actions[replica] = false - } - } - } else { - enabled := uint(0) - for _, replica := range possibleSemiSyncReplicas { - if !replica.SemiSyncReplicaEnabled { - actions[replica] = true - enabled++ - } - if enabled == analysisEntry.SemiSyncMasterWaitForReplicaCount-analysisEntry.SemiSyncMasterClients { - break - } - } - } + // Classify and prioritize replicas & figure out which replicas need to be acted upon + possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := inst.ClassifyAndPrioritizeReplicas(replicas, true) + actions := inst.DetermineSemiSyncReplicaActions(possibleSemiSyncReplicas, asyncReplicas, analysisEntry.SemiSyncMasterWaitForReplicaCount, analysisEntry.SemiSyncMasterClients, exactReplicaTopology) // Summarize what we've determined AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d, currently we have %d semi-sync replica(s)", analysisEntry.SemiSyncMasterWaitForReplicaCount, analysisEntry.SemiSyncMasterClients)) @@ -1595,36 +1566,6 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i return true, topologyRecovery, nil } -func classifyAndPrioritizeReplicas(replicas []*inst.Instance) (possibleSemiSyncReplicas []*inst.Instance, asyncReplicas []*inst.Instance, excludedReplicas []*inst.Instance) { - // Filter out downtimed and down replicas - possibleSemiSyncReplicas = make([]*inst.Instance, 0) - asyncReplicas = make([]*inst.Instance, 0) - excludedReplicas = make([]*inst.Instance, 0) - for _, replica := range replicas { - if replica.IsDowntimed || !replica.IsLastCheckValid || !replica.ReplicaRunning() { - excludedReplicas = append(excludedReplicas, replica) - // TODO make this more resilient: re-read instance if its last check was invalid - } else if replica.SemiSyncPriority == 0 { - asyncReplicas = append(asyncReplicas, replica) - } else { - possibleSemiSyncReplicas = append(possibleSemiSyncReplicas, replica) - } - } - - // Sort replicas by priority, promotion rule and name - sort.Slice(possibleSemiSyncReplicas, 
func(i, j int) bool { - if possibleSemiSyncReplicas[i].SemiSyncPriority != possibleSemiSyncReplicas[j].SemiSyncPriority { - return possibleSemiSyncReplicas[i].SemiSyncPriority < possibleSemiSyncReplicas[j].SemiSyncPriority - } - if possibleSemiSyncReplicas[i].PromotionRule != possibleSemiSyncReplicas[j].PromotionRule { - return possibleSemiSyncReplicas[i].PromotionRule.BetterThan(possibleSemiSyncReplicas[j].PromotionRule) - } - return strings.Compare(possibleSemiSyncReplicas[i].Key.String(), possibleSemiSyncReplicas[j].Key.String()) < 0 - }) - - return -} - func logReplicas(topologyRecovery *TopologyRecovery, description string, replicas []*inst.Instance) { if len(replicas) > 0 { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%s:", description)) From e83dbad8794465a199a96ef7e632585216c0d898 Mon Sep 17 00:00:00 2001 From: Philipp Heckel Date: Thu, 1 Jul 2021 14:24:27 -0400 Subject: [PATCH 16/18] Fix backwards compatible logic --- go/inst/instance_topology_dao.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go index aa1b99d1f..5a91a6f4a 100644 --- a/go/inst/instance_topology_dao.go +++ b/go/inst/instance_topology_dao.go @@ -582,14 +582,17 @@ func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoor func MaybeEnableSemiSyncReplica(instance *Instance) (*Instance, error) { // Backwards compatible logic: Enable semi-sync if SemiSyncPriority > 0 (formerly SemiSyncEnforced) // Note that this logic NEVER enables semi-sync if the promotion rule is "must_not". - if !config.Config.EnforceExactSemiSyncReplicas && !config.Config.RecoverLockedSemiSyncMaster && instance.SemiSyncPriority > 0 { - // Send ACK only from promotable instances; always disable master setting, in case we're converting a former master. - enableReplica := instance.PromotionRule != MustNotPromoteRule - log.Infof("semi-sync: %+v: setting rpl_semi_sync_master_enabled = %t, rpl_semi_sync_slave_enabled = %t (legacy behavior)", &instance.Key, false, enableReplica) - _, err := ExecInstance(&instance.Key, `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, false, enableReplica) - if err != nil { - return instance, log.Errore(err) + if !config.Config.EnforceExactSemiSyncReplicas && !config.Config.RecoverLockedSemiSyncMaster { + if instance.SemiSyncPriority > 0 { + // Send ACK only from promotable instances; always disable master setting, in case we're converting a former master. 
+			enableReplica := instance.PromotionRule != MustNotPromoteRule
+			log.Infof("semi-sync: %+v: setting rpl_semi_sync_master_enabled = %t, rpl_semi_sync_slave_enabled = %t (legacy behavior)", &instance.Key, false, enableReplica)
+			_, err := ExecInstance(&instance.Key, `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, false, enableReplica)
+			if err != nil {
+				return instance, log.Errore(err)
+			}
 		}
+		return instance, nil
 	}

 	// New logic: If EnforceExactSemiSyncReplicas or RecoverLockedSemiSyncMaster are set, we enable semi-sync only if the

From a5566d896738fed1c3fe45c6bac334086b225ae9 Mon Sep 17 00:00:00 2001
From: Philipp Heckel
Date: Thu, 1 Jul 2021 14:35:30 -0400
Subject: [PATCH 17/18] Split determine* function

---
 go/inst/instance_topology_dao.go | 57 +++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go
index 5a91a6f4a..4438655aa 100644
--- a/go/inst/instance_topology_dao.go
+++ b/go/inst/instance_topology_dao.go
@@ -668,33 +668,44 @@ func ClassifyAndPrioritizeReplicas(replicas []*Instance, excludeNotReplicatingRe
 	return
 }

+// DetermineSemiSyncReplicaActions returns a map of replicas for which to change the semi-sync replica setting.
+// A value of true indicates semi-sync needs to be enabled, false that it needs to be disabled.
 func DetermineSemiSyncReplicaActions(possibleSemiSyncReplicas []*Instance, asyncReplicas []*Instance, waitCount uint, currentSemiSyncReplicas uint, exactReplicaTopology bool) map[*Instance]bool {
-	actions := make(map[*Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync
 	if exactReplicaTopology {
-		for i, replica := range possibleSemiSyncReplicas {
-			isSemiSyncEnabled := replica.SemiSyncReplicaEnabled
-			shouldSemiSyncBeEnabled := uint(i) < waitCount
-			if shouldSemiSyncBeEnabled && !isSemiSyncEnabled {
-				actions[replica] = true
-			} else if exactReplicaTopology && !shouldSemiSyncBeEnabled && isSemiSyncEnabled {
-				actions[replica] = false
-			}
+		return determineSemiSyncReplicaActionsForExactTopology(possibleSemiSyncReplicas, asyncReplicas, waitCount)
+	}
+	return determineSemiSyncReplicaActionsForEnoughTopology(possibleSemiSyncReplicas, waitCount, currentSemiSyncReplicas)
+}
+
+func determineSemiSyncReplicaActionsForExactTopology(possibleSemiSyncReplicas []*Instance, asyncReplicas []*Instance, waitCount uint) map[*Instance]bool {
+	actions := make(map[*Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync
+	for i, replica := range possibleSemiSyncReplicas {
+		isSemiSyncEnabled := replica.SemiSyncReplicaEnabled
+		shouldSemiSyncBeEnabled := uint(i) < waitCount
+		if shouldSemiSyncBeEnabled && !isSemiSyncEnabled {
+			actions[replica] = true
+		} else if !shouldSemiSyncBeEnabled && isSemiSyncEnabled {
+			actions[replica] = false
 		}
-		for _, replica := range asyncReplicas {
-			if replica.SemiSyncReplicaEnabled {
-				actions[replica] = false
-			}
+	}
+	for _, replica := range asyncReplicas {
+		if replica.SemiSyncReplicaEnabled {
+			actions[replica] = false
 		}
-	} else {
-		enabled := uint(0)
-		for _, replica := range possibleSemiSyncReplicas {
-			if !replica.SemiSyncReplicaEnabled {
-				actions[replica] = true
-				enabled++
-			}
-			if enabled == waitCount-currentSemiSyncReplicas {
-				break
-			}
+	}
+	return actions
+}
+
+func determineSemiSyncReplicaActionsForEnoughTopology(possibleSemiSyncReplicas []*Instance, waitCount uint, currentSemiSyncReplicas uint) map[*Instance]bool {
+	actions := make(map[*Instance]bool, 0) // true = enable semi-sync, false = disable semi-sync
+	enabled := uint(0)
+	for _, replica := range possibleSemiSyncReplicas {
+		if !replica.SemiSyncReplicaEnabled {
+			actions[replica] = true
+			enabled++
+		}
+		if enabled == waitCount-currentSemiSyncReplicas {
+			break
 		}
 	}
 	return actions

From 2bd2761b3e5f4efa9fcac7b8c7bab59cfb3eafb5 Mon Sep 17 00:00:00 2001
From: Philipp Heckel
Date: Thu, 1 Jul 2021 16:05:11 -0400
Subject: [PATCH 18/18] Better logging; formatting

---
 go/inst/instance_topology_dao.go | 85 +++++++++++++++++++-------------
 go/logic/topology_recovery.go    | 37 ++++++--------
 2 files changed, 67 insertions(+), 55 deletions(-)

diff --git a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go
index 4438655aa..3465b430f 100644
--- a/go/inst/instance_topology_dao.go
+++ b/go/inst/instance_topology_dao.go
@@ -579,65 +579,57 @@ func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoor

 // MaybeEnableSemiSyncReplica sets the rpl_semi_sync_(master|replica)_enabled variables on a given instance based on the
 // config and state of the world.
-func MaybeEnableSemiSyncReplica(instance *Instance) (*Instance, error) {
+func MaybeEnableSemiSyncReplica(replicaInstance *Instance) (*Instance, error) {
 	// Backwards compatible logic: Enable semi-sync if SemiSyncPriority > 0 (formerly SemiSyncEnforced)
 	// Note that this logic NEVER enables semi-sync if the promotion rule is "must_not".
 	if !config.Config.EnforceExactSemiSyncReplicas && !config.Config.RecoverLockedSemiSyncMaster {
-		if instance.SemiSyncPriority > 0 {
+		if replicaInstance.SemiSyncPriority > 0 {
 			// Send ACK only from promotable instances; always disable master setting, in case we're converting a former master.
-			enableReplica := instance.PromotionRule != MustNotPromoteRule
+			enableReplica := replicaInstance.PromotionRule != MustNotPromoteRule
-			log.Infof("semi-sync: %+v: setting rpl_semi_sync_master_enabled = %t, rpl_semi_sync_slave_enabled = %t (legacy behavior)", &instance.Key, false, enableReplica)
+			log.Infof("semi-sync: %+v: setting rpl_semi_sync_master_enabled = %t, rpl_semi_sync_slave_enabled = %t (legacy behavior)", &replicaInstance.Key, false, enableReplica)
-			_, err := ExecInstance(&instance.Key, `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, false, enableReplica)
+			_, err := ExecInstance(&replicaInstance.Key, `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, false, enableReplica)
 			if err != nil {
-				return instance, log.Errore(err)
+				return replicaInstance, log.Errore(err)
 			}
 		}
-		return instance, nil
+		return replicaInstance, nil
 	}

 	// New logic: If EnforceExactSemiSyncReplicas or RecoverLockedSemiSyncMaster are set, we enable semi-sync only if the
 	// given replica instance is in the list of replicas to have semi-sync enabled (according to the priority).
-	masterInstance, err := ReadTopologyInstance(&instance.MasterKey)
+	masterInstance, err := ReadTopologyInstance(&replicaInstance.MasterKey)
 	if err != nil {
-		return instance, log.Errore(err)
+		return replicaInstance, log.Errore(err)
 	}
-	replicas, err := ReadReplicaInstances(&instance.MasterKey)
+	replicas, err := ReadReplicaInstances(&replicaInstance.MasterKey)
 	if err != nil {
-		return instance, log.Errore(err)
+		return replicaInstance, log.Errore(err)
 	}
 	possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := ClassifyAndPrioritizeReplicas(replicas, false)
 	actions := DetermineSemiSyncReplicaActions(possibleSemiSyncReplicas, asyncReplicas, masterInstance.SemiSyncMasterWaitForReplicaCount, masterInstance.SemiSyncMasterClients, config.Config.EnforceExactSemiSyncReplicas)
-	log.Debugf("semi-sync: determining if %+v should enable rpl_semi_sync_slave_enabled or not", instance.Key)
-	log.Debugf("semi-sync: master = %+v, master semi-sync wait count = %d, master semi-sync replica count = %d", instance.MasterKey, masterInstance.SemiSyncMasterWaitForReplicaCount, masterInstance.SemiSyncMasterClients)
-	log.Debug("semi-sync: analysis:")
-	debugReplicas("semi-sync: possible semi-sync replicas (in priority order)", possibleSemiSyncReplicas)
-	debugReplicas("semi-sync: always-async replicas", asyncReplicas)
-	debugReplicas("semi-sync: excluded replicas (downtimed/defunct)", excludedReplicas)
+	log.Debugf("semi-sync: %+v: determining whether to enable rpl_semi_sync_slave_enabled", replicaInstance.Key)
+	log.Debugf("semi-sync: master = %+v, master semi-sync wait count = %d, master semi-sync replica count = %d", replicaInstance.MasterKey, masterInstance.SemiSyncMasterWaitForReplicaCount, masterInstance.SemiSyncMasterClients)
+	LogSemiSyncReplicaAnalysis(possibleSemiSyncReplicas, asyncReplicas, excludedReplicas, actions, func(s string, a ...interface{}) { log.Debugf(s, a...) })
+	// TODO This is a little odd: we sometimes get actions for replicas other than the one we are acting on. If we do not take
+	// TODO those actions, we will see a MasterWithTooManySemiSyncReplicas event right after this recovery. We could prevent that
+	// TODO by executing all actions here, but this function is scoped to `replicaInstance`, so for now we only act on ourselves.
 	for replica, enable := range actions {
-		if replica.Key.Equals(&instance.Key) {
-			log.Infof("semi-sync: %s: setting rpl_semi_sync_slave_enabled: %t", &instance.Key, enable)
-			return SetSemiSyncReplica(&instance.Key, enable) // override "instance", since we re-read it in this function
+		if replica.Key.Equals(&replicaInstance.Key) {
+			log.Infof("semi-sync: %s: setting rpl_semi_sync_slave_enabled: %t", &replicaInstance.Key, enable)
+			return SetSemiSyncReplica(&replicaInstance.Key, enable) // override "instance", since we re-read it in this function
 		}
 	}
-	log.Infof("semi-sync: %+v: no action taken", &instance.Key)
-	return instance, nil
-}
-
-func debugReplicas(description string, replicas []*Instance) {
-	if len(replicas) > 0 {
-		log.Debugf("semi-sync: %s:", description)
-		for _, replica := range replicas {
-			log.Debugf("semi-sync: - %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())
-		}
-	} else {
-		log.Debugf("semi-sync: %s: (none)", description)
-	}
+	log.Infof("semi-sync: %+v: no action taken", &replicaInstance.Key)
+	return replicaInstance, nil
 }

+// ClassifyAndPrioritizeReplicas takes a list of replica instances and classifies them based on their semi-sync priority, excluding
+// replicas that are downtimed or down. It furthermore prioritizes the possible semi-sync replicas based on SemiSyncPriority, PromotionRule
+// and hostname (fallback).
 func ClassifyAndPrioritizeReplicas(replicas []*Instance, excludeNotReplicatingReplicas bool) (possibleSemiSyncReplicas []*Instance, asyncReplicas []*Instance, excludedReplicas []*Instance) {
 	// Filter out downtimed and down replicas
 	possibleSemiSyncReplicas = make([]*Instance, 0)
@@ -646,7 +638,6 @@ func ClassifyAndPrioritizeReplicas(replicas []*Instance, excludeNotReplicatingRe
 	for _, replica := range replicas {
 		if replica.IsDowntimed || !replica.IsLastCheckValid || (excludeNotReplicatingReplicas && !replica.ReplicaRunning()) {
 			excludedReplicas = append(excludedReplicas, replica)
-			// TODO make this more resilient: re-read instance if its last check was invalid
 		} else if replica.SemiSyncPriority == 0 {
 			asyncReplicas = append(asyncReplicas, replica)
 		} else {
@@ -711,6 +702,32 @@ func determineSemiSyncReplicaActionsForEnoughTopology(possibleSemiSyncReplicas [
 	return actions
 }

+// LogSemiSyncReplicaAnalysis outputs the analysis results for a semi-sync analysis using the given log function
+func LogSemiSyncReplicaAnalysis(possibleSemiSyncReplicas []*Instance, asyncReplicas []*Instance, excludedReplicas []*Instance, actions map[*Instance]bool, logf func(s string, a ...interface{})) {
+	logReplicas("possible semi-sync replicas (in priority order)", possibleSemiSyncReplicas, logf)
+	logReplicas("always-async replicas", asyncReplicas, logf)
+	logReplicas("excluded replicas (downtimed/defunct)", excludedReplicas, logf)
+	if len(actions) > 0 {
+		logf("semi-sync: suggested actions:")
+		for replica, enable := range actions {
+			logf("semi-sync: - %+v: should set semi-sync enabled = %t", replica.Key, enable)
+		}
+	} else {
+		logf("semi-sync: suggested actions: (none)")
+	}
+}
+
+func logReplicas(description string, replicas []*Instance, logf func(s string, a ...interface{})) {
+	if len(replicas) > 0 {
+		logf("semi-sync: %s:", description)
+		for _, replica := range replicas {
+			logf("semi-sync: - %s: semi-sync enabled = %t, priority = %d, promotion rule = %s, downtimed = %t, last check = %t, replicating = %t", replica.Key.String(), replica.SemiSyncReplicaEnabled, replica.SemiSyncPriority, replica.PromotionRule, replica.IsDowntimed, replica.IsLastCheckValid, replica.ReplicaRunning())
+		}
+	} else {
+		logf("semi-sync: %s: (none)", description)
+	}
+}
+
 // DelayReplication set the replication delay given seconds
 // keeping the current state of the replication threads.
 func DelayReplication(instanceKey *InstanceKey, seconds int) error {
diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go
index 47b1f92fb..b182cb4dd 100644
--- a/go/logic/topology_recovery.go
+++ b/go/logic/topology_recovery.go
@@ -1505,20 +1505,20 @@ func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.Replica
 }

 func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, exactReplicaTopology bool) (recoveryAttempted bool, topologyRecoveryOut *TopologyRecovery, err error) {
-	instance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey)
+	masterInstance, found, err := inst.ReadInstance(&analysisEntry.AnalyzedInstanceKey)
 	if !found || err != nil {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey))
+		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: instance not found in recovery %+v.", analysisEntry.AnalyzedInstanceKey))
 		return false, topologyRecovery, err
 	}

 	// Read all replicas
 	replicas, err := inst.ReadReplicaInstances(&analysisEntry.AnalyzedInstanceKey)
 	if err != nil {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("could not read replica instances for %+v: %s", analysisEntry.AnalyzedInstanceKey, err.Error()))
+		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: could not read replica instances for %+v: %s", analysisEntry.AnalyzedInstanceKey, err.Error()))
 		return false, topologyRecovery, err
 	}
 	if len(replicas) == 0 {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no replicas found for %+v; cannot recover", analysisEntry.AnalyzedInstanceKey))
+		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: no replicas found for %+v; cannot recover", analysisEntry.AnalyzedInstanceKey))
 		return false, topologyRecovery, nil
 	}

@@ -1526,42 +1526,37 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i
 	possibleSemiSyncReplicas, asyncReplicas, excludedReplicas := inst.ClassifyAndPrioritizeReplicas(replicas, true)
 	actions := inst.DetermineSemiSyncReplicaActions(possibleSemiSyncReplicas, asyncReplicas, analysisEntry.SemiSyncMasterWaitForReplicaCount, analysisEntry.SemiSyncMasterClients, exactReplicaTopology)

-	// Summarize what we've determined
-	AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("master semi-sync wait count is %d, currently we have %d semi-sync replica(s)", analysisEntry.SemiSyncMasterWaitForReplicaCount, analysisEntry.SemiSyncMasterClients))
-	logReplicas(topologyRecovery, "possible semi-sync replicas (in priority order)", possibleSemiSyncReplicas)
-	logReplicas(topologyRecovery, "always-async replicas", asyncReplicas)
-	logReplicas(topologyRecovery, "excluded replicas (downtimed/defunct)", excludedReplicas)
+	// Log analysis
+	AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: analysis results for recovery of %+v:", masterInstance.Key))
+	AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: semi-sync wait count = %d, semi-sync replica count = %d", analysisEntry.SemiSyncMasterWaitForReplicaCount, analysisEntry.SemiSyncMasterClients))
+	inst.LogSemiSyncReplicaAnalysis(possibleSemiSyncReplicas, asyncReplicas, excludedReplicas, actions, func(s string, a ...interface{}) { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf(s, a...)) })

 	// Bail out if we cannot succeed
 	if uint(len(possibleSemiSyncReplicas)) < analysisEntry.SemiSyncMasterWaitForReplicaCount {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey))
+		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: not enough valid live replicas found to recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey))
 		return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this?
 	}
 	if len(actions) == 0 {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot determine actions based on possible semi-sync replicas; cannot recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey))
+		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: cannot determine actions based on possible semi-sync replicas; cannot recover from %s on %+v.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey))
 		return true, topologyRecovery, nil // TODO recoveryAttempted = true; is this correct? what are the implications of this?
 	}

 	// Take action
-	AuditTopologyRecovery(topologyRecovery, "taking actions:")
+	AuditTopologyRecovery(topologyRecovery, "semi-sync: taking actions:")
 	for replica, enable := range actions {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- %s: setting rpl_semi_sync_slave_enabled=%t, restarting slave_io thread", replica.Key.String(), enable))
+		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: - %s: setting rpl_semi_sync_slave_enabled=%t, restarting slave_io thread", replica.Key.String(), enable))
 		_, err := inst.SetSemiSyncReplica(&replica.Key, enable)
 		if err != nil {
-			AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("cannot change semi sync on replica %+v.", replica.Key))
+			AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: cannot change semi sync on replica %+v.", replica.Key))
 			return true, topologyRecovery, nil
 		}
 	}

 	// Re-read source instance to avoid re-triggering the same condition
-	// TODO this does not help; it is still re-triggering the same analysis until the next polling interval. WHY?
-	instance, err = inst.ReadTopologyInstance(&instance.Key)
-	if err != nil {
-		AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("warning: cannot re-read instance after recovery of %s on instance %+v; recovery might be retriggered, but this is harmless.", analysisEntry.AnalysisString(), analysisEntry.AnalyzedInstanceKey))
-	}
+	// TODO even though we resolve correctly here, we are re-triggering the same analysis until the next polling interval. WHY?

-	resolveRecovery(topologyRecovery, instance)
-	AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("recovery complete; success = %t", topologyRecovery.IsSuccessful))
+	resolveRecovery(topologyRecovery, masterInstance)
+	AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("semi-sync: recovery complete; success = %t", topologyRecovery.IsSuccessful))
 	return true, topologyRecovery, nil
 }
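
Editor's note: both recovery paths in this series (MaybeEnableSemiSyncReplica on a replica, recoverSemiSyncReplicas on the master) funnel through ClassifyAndPrioritizeReplicas and DetermineSemiSyncReplicaActions. The following is a minimal, self-contained Go sketch of that decision flow, for illustration only: the replica struct, the promotionWeight field, the host-keyed action map, and the ordering direction are simplified assumptions, not orchestrator code, which operates on *inst.Instance, promotion rules, and the master's rpl_semi_sync_master_wait_for_slave_count.

// semisync_sketch.go - illustrative only; not part of the patches above.
package main

import (
	"fmt"
	"sort"
)

type replica struct {
	host            string
	priority        uint // 0 = never use as a semi-sync replica ("always async")
	promotionWeight int  // stand-in for PromotionRule; higher = preferred
	semiSyncEnabled bool // current rpl_semi_sync_slave_enabled state
}

// classifyAndPrioritize splits replicas into "possible semi-sync" and "always async" and
// orders the possible ones by priority, promotion weight and hostname (tie-breaker).
func classifyAndPrioritize(replicas []replica) (possible, async []replica) {
	for _, r := range replicas {
		if r.priority == 0 {
			async = append(async, r)
			continue
		}
		possible = append(possible, r)
	}
	sort.Slice(possible, func(i, j int) bool {
		if possible[i].priority != possible[j].priority {
			return possible[i].priority > possible[j].priority // assumption: higher priority first
		}
		if possible[i].promotionWeight != possible[j].promotionWeight {
			return possible[i].promotionWeight > possible[j].promotionWeight
		}
		return possible[i].host < possible[j].host
	})
	return possible, async
}

// determineActions returns host -> enable(true)/disable(false). In "exact" mode the first
// waitCount prioritized replicas must be semi-sync and everything else must not be; in
// "enough" mode we only enable additional replicas until the wait count can be satisfied.
func determineActions(possible, async []replica, waitCount, current uint, exact bool) map[string]bool {
	actions := map[string]bool{}
	if exact {
		for i, r := range possible {
			shouldBeEnabled := uint(i) < waitCount
			if shouldBeEnabled && !r.semiSyncEnabled {
				actions[r.host] = true
			} else if !shouldBeEnabled && r.semiSyncEnabled {
				actions[r.host] = false
			}
		}
		for _, r := range async {
			if r.semiSyncEnabled {
				actions[r.host] = false
			}
		}
		return actions
	}
	if current >= waitCount { // guard the unsigned subtraction below
		return actions
	}
	missing := waitCount - current
	for _, r := range possible {
		if missing == 0 {
			break
		}
		if !r.semiSyncEnabled {
			actions[r.host] = true
			missing--
		}
	}
	return actions
}

func main() {
	replicas := []replica{
		{host: "db2", priority: 1, promotionWeight: 2, semiSyncEnabled: false},
		{host: "db3", priority: 1, promotionWeight: 1, semiSyncEnabled: true},
		{host: "db4", priority: 0, promotionWeight: 0, semiSyncEnabled: true}, // async-only but currently enabled
	}
	possible, async := classifyAndPrioritize(replicas)
	// wait count = 1, one semi-sync client currently connected:
	fmt.Println("exact: ", determineActions(possible, async, 1, 1, true))  // db2 on, db3 off, db4 off
	fmt.Println("enough:", determineActions(possible, async, 1, 1, false)) // nothing to do
}

Running the sketch shows the practical difference between the two modes: the "exact" strategy also emits disable actions (which is what MasterWithTooManySemiSyncReplicas relies on), while the "enough" strategy is purely additive.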