diff --git a/docs/failure-detection.md b/docs/failure-detection.md index 141dc083f..2c19bc73c 100644 --- a/docs/failure-detection.md +++ b/docs/failure-detection.md @@ -35,6 +35,7 @@ Observe the following list of potential failures: * DeadMasterAndSlaves * DeadMasterAndSomeSlaves * DeadMasterWithoutSlaves +* UnreachableMasterWithLaggingReplicas * UnreachableMaster * AllMasterSlavesNotReplicating * AllMasterSlavesNotReplicatingOrDead @@ -84,6 +85,16 @@ their time to figure out they were failing replication. This makes for a potential recovery process. +#### `UnreachableMasterWithLaggingReplicas`: + +1. Master cannot be reached +2. All of its immediate replicas (excluding SQL delayed) are lagging + +This scenario can happen when the master is overloaded. Clients would see a "Too many connections" error, while the replicas, whose connections were established long ago, still claim the master is fine. Similarly, if the master is locked by some metadata operation, new clients would block on connect while replicas _may claim_ everything's fine. However, since apps cannot connect to the master, no actual data gets written, and when a heartbeat mechanism such as `pt-heartbeat` is in use, we can observe growing lag on the replicas. + +`orchestrator` responds to this scenario by restarting replication on all of the master's immediate replicas. This closes those replicas' stale connections to the master and attempts to initiate new ones. These may now fail to connect, leading to a complete replication failure on all replicas, which in turn leads `orchestrator` to analyze a `DeadMaster`. + + ### Failures of no interest The following scenarios are of no interest to `orchestrator`, and while the information and state are available to `orchestrator`, it does not recognize such scenarios as _failures_ per se; there's no detection hooks invoked and obviously no recoveries attempted: diff --git a/go/config/config.go b/go/config/config.go index 02892eb24..0cf9353a7 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -230,7 +230,6 @@ type Configuration struct { PostMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses PostIntermediateMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses PostGracefulTakeoverProcesses []string // Processes to execute after runnign a graceful master takeover. Uses same placeholders as PostFailoverProcesses - UnreachableMasterWithStaleSlavesProcesses []string // Processes to execute when detecting an UnreachableMasterWithStaleSlaves scenario. CoMasterRecoveryMustPromoteOtherCoMaster bool // When 'false', anything can get promoted (and candidates are prefered over others). When 'true', orchestrator will promote the other co-master or else fail DetachLostSlavesAfterMasterFailover bool // synonym to DetachLostReplicasAfterMasterFailover DetachLostReplicasAfterMasterFailover bool // Should replicas that are not to be lost in master recovery (i.e.
were more up-to-date than promoted replica) be forcibly detached @@ -391,7 +390,6 @@ func newConfiguration() *Configuration { PostFailoverProcesses: []string{}, PostUnsuccessfulFailoverProcesses: []string{}, PostGracefulTakeoverProcesses: []string{}, - UnreachableMasterWithStaleSlavesProcesses: []string{}, CoMasterRecoveryMustPromoteOtherCoMaster: true, DetachLostSlavesAfterMasterFailover: true, ApplyMySQLPromotionAfterMasterFailover: true, diff --git a/go/inst/analysis.go b/go/inst/analysis.go index ed9a84d07..8feca3468 100644 --- a/go/inst/analysis.go +++ b/go/inst/analysis.go @@ -32,13 +32,12 @@ const ( DeadMaster = "DeadMaster" DeadMasterAndSlaves = "DeadMasterAndSlaves" DeadMasterAndSomeSlaves = "DeadMasterAndSomeSlaves" - UnreachableMasterWithStaleSlaves = "UnreachableMasterWithStaleSlaves" + UnreachableMasterWithLaggingReplicas = "UnreachableMasterWithLaggingReplicas" UnreachableMaster = "UnreachableMaster" MasterSingleSlaveNotReplicating = "MasterSingleSlaveNotReplicating" MasterSingleSlaveDead = "MasterSingleSlaveDead" AllMasterSlavesNotReplicating = "AllMasterSlavesNotReplicating" AllMasterSlavesNotReplicatingOrDead = "AllMasterSlavesNotReplicatingOrDead" - AllMasterSlavesStale = "AllMasterSlavesStale" MasterWithoutSlaves = "MasterWithoutSlaves" DeadCoMaster = "DeadCoMaster" DeadCoMasterAndSomeSlaves = "DeadCoMasterAndSomeSlaves" @@ -111,7 +110,6 @@ type ReplicationAnalysis struct { CountValidReplicas uint CountValidReplicatingReplicas uint CountReplicasFailingToConnectToMaster uint - CountStaleReplicas uint CountDowntimedReplicas uint ReplicationDepth uint SlaveHosts InstanceKeyMap @@ -132,6 +130,8 @@ type ReplicationAnalysis struct { CountMixedBasedLoggingReplicas uint CountRowBasedLoggingReplicas uint CountDistinctMajorVersionsLoggingReplicas uint + CountDelayedReplicas uint + CountLaggingReplicas uint IsActionableRecovery bool ProcessingNodeHostname string ProcessingNodeToken string diff --git a/go/inst/analysis_dao.go b/go/inst/analysis_dao.go index 289dc9793..40538a266 100644 --- a/go/inst/analysis_dao.go +++ b/go/inst/analysis_dao.go @@ -55,7 +55,7 @@ func initializeAnalysisDaoPostConfiguration() { func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) ([]ReplicationAnalysis, error) { result := []ReplicationAnalysis{} - args := sqlutils.Args(ValidSecondsFromSeenToLastAttemptedCheck(), clusterName) + args := sqlutils.Args(ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, clusterName) analysisQueryReductionClause := `` if config.Config.ReduceReplicationAnalysisCount { analysisQueryReductionClause = ` @@ -64,23 +64,23 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) master_instance.last_checked <= master_instance.last_seen and master_instance.last_attempted_check <= master_instance.last_seen + interval ? 
second ) = 1 /* AS is_last_check_valid */) = 0 - OR (IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen - AND slave_instance.slave_io_running = 0 - AND slave_instance.last_io_error like '%error %connecting to master%' - AND slave_instance.slave_sql_running = 1), + OR (IFNULL(SUM(replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.slave_io_running = 0 + AND replica_instance.last_io_error like '%error %connecting to master%' + AND replica_instance.slave_sql_running = 1), 0) /* AS count_slaves_failing_to_connect_to_master */ > 0) - OR (IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen), - 0) /* AS count_valid_slaves */ < COUNT(slave_instance.server_id) /* AS count_slaves */) - OR (IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen - AND slave_instance.slave_io_running != 0 - AND slave_instance.slave_sql_running != 0), - 0) /* AS count_valid_replicating_slaves */ < COUNT(slave_instance.server_id) /* AS count_slaves */) + OR (IFNULL(SUM(replica_instance.last_checked <= replica_instance.last_seen), + 0) /* AS count_valid_slaves */ < COUNT(replica_instance.server_id) /* AS count_slaves */) + OR (IFNULL(SUM(replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.slave_io_running != 0 + AND replica_instance.slave_sql_running != 0), + 0) /* AS count_valid_replicating_slaves */ < COUNT(replica_instance.server_id) /* AS count_slaves */) OR (MIN( master_instance.slave_sql_running = 1 AND master_instance.slave_io_running = 0 AND master_instance.last_io_error like '%error %connecting to master%' ) /* AS is_failing_to_connect_to_master */) - OR (COUNT(slave_instance.server_id) /* AS count_slaves */ > 0) + OR (COUNT(replica_instance.server_id) /* AS count_slaves */ > 0) ` args = append(args, ValidSecondsFromSeenToLastAttemptedCheck()) } @@ -109,20 +109,20 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) ':', master_instance.port) = master_instance.cluster_name) AS is_cluster_master, MIN(master_instance.gtid_mode) AS gtid_mode, - COUNT(slave_instance.server_id) AS count_slaves, - IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen), + COUNT(replica_instance.server_id) AS count_slaves, + IFNULL(SUM(replica_instance.last_checked <= replica_instance.last_seen), 0) AS count_valid_slaves, - IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen - AND slave_instance.slave_io_running != 0 - AND slave_instance.slave_sql_running != 0), + IFNULL(SUM(replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.slave_io_running != 0 + AND replica_instance.slave_sql_running != 0), 0) AS count_valid_replicating_slaves, - IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen - AND slave_instance.slave_io_running = 0 - AND slave_instance.last_io_error like '%%error %%connecting to master%%' - AND slave_instance.slave_sql_running = 1), + IFNULL(SUM(replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.slave_io_running = 0 + AND replica_instance.last_io_error like '%%error %%connecting to master%%' + AND replica_instance.slave_sql_running = 1), 0) AS count_slaves_failing_to_connect_to_master, MIN(master_instance.replication_depth) AS replication_depth, - GROUP_CONCAT(concat(slave_instance.Hostname, ':', slave_instance.Port)) as slave_hosts, + GROUP_CONCAT(concat(replica_instance.Hostname, ':', replica_instance.Port)) as slave_hosts, MIN( master_instance.slave_sql_running = 1 AND 
master_instance.slave_io_running = 0 @@ -148,49 +148,53 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) master_instance.supports_oracle_gtid ) AS supports_oracle_gtid, SUM( - slave_instance.oracle_gtid + replica_instance.oracle_gtid ) AS count_oracle_gtid_slaves, - IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen - AND slave_instance.oracle_gtid != 0), + IFNULL(SUM(replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.oracle_gtid != 0), 0) AS count_valid_oracle_gtid_slaves, SUM( - slave_instance.binlog_server + replica_instance.binlog_server ) AS count_binlog_server_slaves, - IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen - AND slave_instance.binlog_server != 0), + IFNULL(SUM(replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.binlog_server != 0), 0) AS count_valid_binlog_server_slaves, MIN( master_instance.mariadb_gtid ) AS is_mariadb_gtid, SUM( - slave_instance.mariadb_gtid + replica_instance.mariadb_gtid ) AS count_mariadb_gtid_slaves, - IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen - AND slave_instance.mariadb_gtid != 0), + IFNULL(SUM(replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.mariadb_gtid != 0), 0) AS count_valid_mariadb_gtid_slaves, - IFNULL(SUM(slave_instance.log_bin - AND slave_instance.log_slave_updates - AND slave_instance.binlog_format = 'STATEMENT'), + IFNULL(SUM(replica_instance.log_bin + AND replica_instance.log_slave_updates + AND replica_instance.binlog_format = 'STATEMENT'), 0) AS count_statement_based_loggin_slaves, - IFNULL(SUM(slave_instance.log_bin - AND slave_instance.log_slave_updates - AND slave_instance.binlog_format = 'MIXED'), + IFNULL(SUM(replica_instance.log_bin + AND replica_instance.log_slave_updates + AND replica_instance.binlog_format = 'MIXED'), 0) AS count_mixed_based_loggin_slaves, - IFNULL(SUM(slave_instance.log_bin - AND slave_instance.log_slave_updates - AND slave_instance.binlog_format = 'ROW'), + IFNULL(SUM(replica_instance.log_bin + AND replica_instance.log_slave_updates + AND replica_instance.binlog_format = 'ROW'), 0) AS count_row_based_loggin_slaves, - IFNULL(MIN(slave_instance.gtid_mode), '') + IFNULL(SUM(replica_instance.sql_delay > 0), + 0) AS count_delayed_replicas, + IFNULL(SUM(replica_instance.slave_lag_seconds > ?), + 0) AS count_lagging_replicas, + IFNULL(MIN(replica_instance.gtid_mode), '') AS min_replica_gtid_mode, - IFNULL(MAX(slave_instance.gtid_mode), '') + IFNULL(MAX(replica_instance.gtid_mode), '') AS max_replica_gtid_mode, IFNULL(SUM( replica_downtime.downtime_active is not null and ifnull(replica_downtime.end_timestamp, now()) > now()), 0) AS count_downtimed_replicas, COUNT(DISTINCT case - when slave_instance.log_bin AND slave_instance.log_slave_updates - then slave_instance.major_version + when replica_instance.log_bin AND replica_instance.log_slave_updates + then replica_instance.major_version else NULL end ) AS count_distinct_logging_major_versions @@ -199,9 +203,9 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) LEFT JOIN hostname_resolve ON (master_instance.hostname = hostname_resolve.hostname) LEFT JOIN - database_instance slave_instance ON (COALESCE(hostname_resolve.resolved_hostname, - master_instance.hostname) = slave_instance.master_host - AND master_instance.port = slave_instance.master_port) + database_instance replica_instance ON (COALESCE(hostname_resolve.resolved_hostname, + 
master_instance.hostname) = replica_instance.master_host + AND master_instance.port = replica_instance.master_port) LEFT JOIN database_instance_maintenance ON (master_instance.hostname = database_instance_maintenance.hostname AND master_instance.port = database_instance_maintenance.port @@ -211,15 +215,11 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) AND master_instance.port = master_downtime.port AND master_downtime.downtime_active = 1) LEFT JOIN - database_instance_downtime as replica_downtime ON (slave_instance.hostname = replica_downtime.hostname - AND slave_instance.port = replica_downtime.port + database_instance_downtime as replica_downtime ON (replica_instance.hostname = replica_downtime.hostname + AND replica_instance.port = replica_downtime.port AND replica_downtime.downtime_active = 1) LEFT JOIN cluster_alias ON (cluster_alias.cluster_name = master_instance.cluster_name) - LEFT JOIN - database_instance_recent_relaylog_history ON ( - slave_instance.hostname = database_instance_recent_relaylog_history.hostname - AND slave_instance.port = database_instance_recent_relaylog_history.port) WHERE database_instance_maintenance.database_instance_maintenance_id IS NULL AND ? IN ('', master_instance.cluster_name) @@ -255,7 +255,6 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.CountValidReplicatingReplicas = m.GetUint("count_valid_replicating_slaves") a.CountReplicasFailingToConnectToMaster = m.GetUint("count_slaves_failing_to_connect_to_master") a.CountDowntimedReplicas = m.GetUint("count_downtimed_replicas") - a.CountStaleReplicas = 0 a.ReplicationDepth = m.GetUint("replication_depth") a.IsFailingToConnectToMaster = m.GetBool("is_failing_to_connect_to_master") a.IsDowntimed = m.GetBool("is_downtimed") @@ -283,6 +282,17 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.CountRowBasedLoggingReplicas = m.GetUint("count_row_based_loggin_slaves") a.CountDistinctMajorVersionsLoggingReplicas = m.GetUint("count_distinct_logging_major_versions") + a.CountDelayedReplicas = m.GetUint("count_delayed_replicas") + a.CountLaggingReplicas = m.GetUint("count_lagging_replicas") + + if !a.LastCheckValid { + analysisMessage := fmt.Sprintf("analysis: IsMaster: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, ", + a.IsMaster, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, + ) + if util.ClearToLog("analysis_dao", analysisMessage) { + log.Debugf(analysisMessage) + } + } if a.IsMaster && !a.LastCheckValid && a.CountReplicas == 0 { a.Analysis = DeadMasterWithoutSlaves a.Description = "Master cannot be reached by orchestrator and has no slave" @@ -299,9 +309,9 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.Analysis = DeadMasterAndSomeSlaves a.Description = "Master cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating" // - } else if a.IsMaster && !a.LastCheckValid && a.CountStaleReplicas == a.CountReplicas && a.CountValidReplicatingReplicas > 0 { - a.Analysis = UnreachableMasterWithStaleSlaves - a.Description = "Master cannot be reached by orchestrator and has running yet stale replicas" + } else if a.IsMaster && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && 
a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0 { + a.Analysis = UnreachableMasterWithLaggingReplicas + a.Description = "Master cannot be reached by orchestrator and all of its replicas are lagging" // } else if a.IsMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { a.Analysis = UnreachableMaster @@ -323,10 +333,6 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.Analysis = AllMasterSlavesNotReplicatingOrDead a.Description = "Master is reachable but none of its replicas is replicating" // - } else if a.IsMaster && a.LastCheckValid && a.CountReplicas > 1 && a.CountStaleReplicas == a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { - a.Analysis = AllMasterSlavesStale - a.Description = "Master is reachable but all of its replicas are stale, although attempting to replicate" - // } else /* co-master */ if a.IsCoMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { a.Analysis = DeadCoMaster a.Description = "Co-master cannot be reached by orchestrator and none of its replicas is replicating" diff --git a/go/inst/instance_dao.go b/go/inst/instance_dao.go index 93b68ca46..8dce58a5b 100644 --- a/go/inst/instance_dao.go +++ b/go/inst/instance_dao.go @@ -2693,10 +2693,6 @@ func ResetInstanceRelaylogCoordinatesHistory(instanceKey *InstanceKey) error { return ExecDBWriteFunc(writeFunc) } -func ExpireInstanceBinlogFileHistory() error { - return ExpireTableData("database_instance_binlog_files_history", "last_seen") -} - // FigureClusterName will make a best effort to deduce a cluster name using either a given alias // or an instanceKey. First attempt is at alias, and if that doesn't work, we try instanceKey. func FigureClusterName(clusterHint string, instanceKey *InstanceKey, thisInstanceKey *InstanceKey) (clusterName string, err error) { diff --git a/go/inst/instance_topology.go b/go/inst/instance_topology.go index bd2f6f7e4..6f2ea853b 100644 --- a/go/inst/instance_topology.go +++ b/go/inst/instance_topology.go @@ -612,7 +612,7 @@ func moveReplicasViaGTID(replicas [](*Instance), other *Instance) (movedReplicas return movedReplicas, unmovedReplicas, nil, errs } - log.Infof("Will move %+v replicas below %+v via GTID", len(replicas), other.Key) + log.Infof("moveReplicasViaGTID: Will move %+v replicas below %+v via GTID", len(replicas), other.Key) barrier := make(chan *InstanceKey) replicaMutex := make(chan bool, 1) @@ -627,7 +627,7 @@ func moveReplicasViaGTID(replicas [](*Instance), other *Instance) (movedReplicas if _, _, canMove := canMoveViaGTID(replica, other); canMove { replica, replicaErr = moveInstanceBelowViaGTID(replica, other) } else { - replicaErr = fmt.Errorf("%+v cannot move below %+v via GTID", replica.Key, other.Key) + replicaErr = fmt.Errorf("moveReplicasViaGTID: %+v cannot move below %+v via GTID", replica.Key, other.Key) } func() { // Instantaneous mutex. 
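For reference, the new analysis branch above reduces to a simple predicate over the `ReplicationAnalysis` counters: the master's last check failed, every replica lags beyond `ReasonableReplicationLagSeconds`, at least one replica is not intentionally SQL-delayed, and at least one replica is still replicating. Below is a minimal sketch of that decision, assuming only the field names visible in this diff; the standalone `isUnreachableMasterWithLaggingReplicas` helper is illustrative and not part of the patch.

```go
package main

import "fmt"

// ReplicationAnalysis carries just the counters used by the new branch
// (field names taken from go/inst/analysis.go in this diff).
type ReplicationAnalysis struct {
	IsMaster                      bool
	LastCheckValid                bool
	CountReplicas                 uint
	CountValidReplicatingReplicas uint
	CountLaggingReplicas          uint // replicas with slave_lag_seconds > ReasonableReplicationLagSeconds
	CountDelayedReplicas          uint // replicas with sql_delay > 0
}

// isUnreachableMasterWithLaggingReplicas mirrors the condition added in
// GetReplicationAnalysis: master unreachable, all replicas lagging, not all
// of them SQL-delayed, and at least one still replicating.
func isUnreachableMasterWithLaggingReplicas(a ReplicationAnalysis) bool {
	return a.IsMaster &&
		!a.LastCheckValid &&
		a.CountLaggingReplicas == a.CountReplicas &&
		a.CountDelayedReplicas < a.CountReplicas &&
		a.CountValidReplicatingReplicas > 0
}

func main() {
	a := ReplicationAnalysis{
		IsMaster: true, LastCheckValid: false,
		CountReplicas: 3, CountValidReplicatingReplicas: 3,
		CountLaggingReplicas: 3, CountDelayedReplicas: 1,
	}
	fmt.Println(isUnreachableMasterWithLaggingReplicas(a)) // true
}
```

This also explains the new integration tests further below: the fixtures that mark all replicas as SQL-delayed, or leave some replicas non-lagging, are expected to fall through to plain `UnreachableMaster`.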
diff --git a/go/inst/instance_topology_dao.go b/go/inst/instance_topology_dao.go index 5b26af738..894ff7bba 100644 --- a/go/inst/instance_topology_dao.go +++ b/go/inst/instance_topology_dao.go @@ -259,6 +259,15 @@ func SetSemiSyncReplica(instanceKey *InstanceKey, enableReplica bool) (*Instance } +func RestartIOThread(instanceKey *InstanceKey) error { + for _, cmd := range []string{`stop slave io_thread`, `start slave io_thread`} { + if _, err := ExecInstance(instanceKey, cmd); err != nil { + return log.Errorf("%+v: RestartIOThread: '%q' failed: %+v", *instanceKey, cmd, err) + } + } + return nil +} + // StopSlaveNicely stops a replica such that SQL_thread and IO_thread are aligned (i.e. // SQL_thread consumes all relay log entries) // It will actually START the sql_thread even if the replica is completely stopped. @@ -274,8 +283,8 @@ func StopSlaveNicely(instanceKey *InstanceKey, timeout time.Duration) (*Instance // stop io_thread, start sql_thread but catch any errors for _, cmd := range []string{`stop slave io_thread`, `start slave sql_thread`} { - if _, err = ExecInstance(instanceKey, cmd); err != nil { - return nil, log.Errorf("%+v: StopSlaveNicely: %q failed: %+v", *instanceKey, cmd, err) + if _, err := ExecInstance(instanceKey, cmd); err != nil { + return nil, log.Errorf("%+v: StopSlaveNicely: '%q' failed: %+v", *instanceKey, cmd, err) } } diff --git a/go/logic/orchestrator.go b/go/logic/orchestrator.go index 72d52cfd7..2aeea99c3 100644 --- a/go/logic/orchestrator.go +++ b/go/logic/orchestrator.go @@ -488,7 +488,6 @@ func ContinuousDiscovery() { go inst.ExpireMasterPositionEquivalence() go inst.ExpirePoolInstances() go inst.FlushNontrivialResolveCacheToDatabase() - go inst.ExpireInstanceBinlogFileHistory() go inst.ExpireInjectedPseudoGTID() go process.ExpireNodesHistory() go process.ExpireAccessTokens() diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 62577d917..427e0254b 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -155,6 +155,8 @@ const ( ) var emergencyReadTopologyInstanceMap *cache.Cache +var emergencyRestartReplicaTopologyInstanceMap *cache.Cache +var emergencyOperationGracefulPeriodMap *cache.Cache // InstancesByCountReplicas sorts instances by umber of replicas, descending type InstancesByCountReplicas [](*inst.Instance) @@ -178,9 +180,6 @@ var recoverDeadIntermediateMasterFailureCounter = metrics.NewCounter() var recoverDeadCoMasterCounter = metrics.NewCounter() var recoverDeadCoMasterSuccessCounter = metrics.NewCounter() var recoverDeadCoMasterFailureCounter = metrics.NewCounter() -var recoverUnreachableMasterWithStaleSlavesCounter = metrics.NewCounter() -var recoverUnreachableMasterWithStaleSlavesSuccessCounter = metrics.NewCounter() -var recoverUnreachableMasterWithStaleSlavesFailureCounter = metrics.NewCounter() var countPendingRecoveriesGauge = metrics.NewGauge() func init() { @@ -193,9 +192,6 @@ func init() { metrics.Register("recover.dead_co_master.start", recoverDeadCoMasterCounter) metrics.Register("recover.dead_co_master.success", recoverDeadCoMasterSuccessCounter) metrics.Register("recover.dead_co_master.fail", recoverDeadCoMasterFailureCounter) - metrics.Register("recover.unreach_master_stale_slaves.start", recoverUnreachableMasterWithStaleSlavesCounter) - metrics.Register("recover.unreach_master_stale_slaves.success", recoverUnreachableMasterWithStaleSlavesSuccessCounter) - metrics.Register("recover.unreach_master_stale_slaves.fail", recoverUnreachableMasterWithStaleSlavesFailureCounter) 
metrics.Register("recover.pending", countPendingRecoveriesGauge) go initializeTopologyRecoveryPostConfiguration() @@ -213,6 +209,8 @@ func initializeTopologyRecoveryPostConfiguration() { config.WaitForConfigurationToBeLoaded() emergencyReadTopologyInstanceMap = cache.New(time.Second, time.Millisecond*250) + emergencyRestartReplicaTopologyInstanceMap = cache.New(time.Second*30, time.Second) + emergencyOperationGracefulPeriodMap = cache.New(time.Second*5, time.Millisecond*500) } // AuditTopologyRecovery audits a single step in a topology recovery process. @@ -1262,26 +1260,6 @@ func checkAndRecoverDeadCoMaster(analysisEntry inst.ReplicationAnalysis, candida return true, topologyRecovery, err } -// checkAndRecoverUnreachableMasterWithStaleSlaves executes an external process. No other action is taken. -// Returns false. -func checkAndRecoverUnreachableMasterWithStaleSlaves(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { - topologyRecovery, err := AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery) - if topologyRecovery == nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another UnreachableMasterWithStaleSlaves.", analysisEntry.AnalyzedInstanceKey)) - } else { - recoverUnreachableMasterWithStaleSlavesCounter.Inc(1) - if !skipProcesses { - err := executeProcesses(config.Config.UnreachableMasterWithStaleSlavesProcesses, "UnreachableMasterWithStaleSlavesProcesses", topologyRecovery, false) - if err != nil { - recoverUnreachableMasterWithStaleSlavesFailureCounter.Inc(1) - } else { - recoverUnreachableMasterWithStaleSlavesSuccessCounter.Inc(1) - } - } - } - return false, nil, err -} - // checkAndRecoverGenericProblem is a general-purpose recovery function func checkAndRecoverGenericProblem(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { return false, nil, nil @@ -1312,6 +1290,49 @@ func emergentlyReadTopologyInstanceReplicas(instanceKey *inst.InstanceKey, analy } } +// emergentlyRestartReplicationOnTopologyInstance forces a RestartSlave on a given instance. +func emergentlyRestartReplicationOnTopologyInstance(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) { + if existsInCacheError := emergencyRestartReplicaTopologyInstanceMap.Add(instanceKey.StringCode(), true, cache.DefaultExpiration); existsInCacheError != nil { + // Just recently attempted on this specific replica + return + } + go inst.ExecuteOnTopology(func() { + inst.RestartIOThread(instanceKey) + inst.AuditOperation("emergently-restart-replication-topology-instance", instanceKey, string(analysisCode)) + }) +} + +func beginEmergencyOperationGracefulPeriod(instanceKey *inst.InstanceKey) { + emergencyOperationGracefulPeriodMap.Set(instanceKey.StringCode(), true, cache.DefaultExpiration) +} + +func isInEmergencyOperationGracefulPeriod(instanceKey *inst.InstanceKey) bool { + _, found := emergencyOperationGracefulPeriodMap.Get(instanceKey.StringCode()) + return found +} + +// emergentlyRestartReplicationOnTopologyInstanceReplicas forces a stop slave + start slave on +// replicas of a given instance, in an attempt to cause them to re-evaluate their replication state. 
+// This can be useful in scenarios where the master has Too Many Connections, but long-time connected +// replicas are not seeing this; when they stop+start replication, they need to re-authenticate and +// that's where we hope they realize the master is bad. +func emergentlyRestartReplicationOnTopologyInstanceReplicas(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) { + if existsInCacheError := emergencyRestartReplicaTopologyInstanceMap.Add(instanceKey.StringCode(), true, cache.DefaultExpiration); existsInCacheError != nil { + // While each replica's RestartSlave() is throttled on its own, it's also wasteful to + // iterate all replicas all the time. This is the reason why we do grand-throttle check. + return + } + beginEmergencyOperationGracefulPeriod(instanceKey) + + replicas, err := inst.ReadReplicaInstancesIncludingBinlogServerSubReplicas(instanceKey) + if err != nil { + return + } + for _, replica := range replicas { + go emergentlyRestartReplicationOnTopologyInstance(&replica.Key, analysisCode) + } +} + // checkAndExecuteFailureDetectionProcesses tries to register for failure detection and potentially executes // failure-detection processes. func checkAndExecuteFailureDetectionProcesses(analysisEntry inst.ReplicationAnalysis, skipProcesses bool) (detectionRegistrationSuccess bool, processesExecutionAttempted bool, err error) { @@ -1330,18 +1351,18 @@ func checkAndExecuteFailureDetectionProcesses(analysisEntry inst.ReplicationAnal return true, true, err } -func getCheckAndRecoverFunction(analysisCode inst.AnalysisCode) ( +func getCheckAndRecoverFunction(analysisCode inst.AnalysisCode, analyzedInstanceKey *inst.InstanceKey) ( checkAndRecoverFunction func(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error), isActionableRecovery bool, ) { switch analysisCode { // master - case inst.DeadMaster: - return checkAndRecoverDeadMaster, true - case inst.DeadMasterAndSomeSlaves: - return checkAndRecoverDeadMaster, true - case inst.UnreachableMasterWithStaleSlaves: - return checkAndRecoverUnreachableMasterWithStaleSlaves, true + case inst.DeadMaster, inst.DeadMasterAndSomeSlaves: + if isInEmergencyOperationGracefulPeriod(analyzedInstanceKey) { + return checkAndRecoverGenericProblem, false + } else { + return checkAndRecoverDeadMaster, true + } // intermediate master case inst.DeadIntermediateMaster: return checkAndRecoverDeadIntermediateMaster, true @@ -1363,6 +1384,8 @@ func getCheckAndRecoverFunction(analysisCode inst.AnalysisCode) ( return checkAndRecoverGenericProblem, false case inst.UnreachableMaster: return checkAndRecoverGenericProblem, false + case inst.UnreachableMasterWithLaggingReplicas: + return checkAndRecoverGenericProblem, false case inst.AllMasterSlavesNotReplicating: return checkAndRecoverGenericProblem, false case inst.AllMasterSlavesNotReplicatingOrDead: @@ -1383,6 +1406,8 @@ func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) { case inst.UnreachableMaster: go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) go emergentlyReadTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + case inst.UnreachableMasterWithLaggingReplicas: + go emergentlyRestartReplicationOnTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) case inst.AllMasterSlavesNotReplicating: go 
emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) case inst.AllMasterSlavesNotReplicatingOrDead: @@ -1398,7 +1423,7 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand atomic.AddInt64(&countPendingRecoveries, 1) defer atomic.AddInt64(&countPendingRecoveries, -1) - checkAndRecoverFunction, isActionableRecovery := getCheckAndRecoverFunction(analysisEntry.Analysis) + checkAndRecoverFunction, isActionableRecovery := getCheckAndRecoverFunction(analysisEntry.Analysis, &analysisEntry.AnalyzedInstanceKey) analysisEntry.IsActionableRecovery = isActionableRecovery runEmergentOperations(&analysisEntry) diff --git a/resources/bin/orchestrator-client b/resources/bin/orchestrator-client index 4ed2791c7..1660f4b6f 100755 --- a/resources/bin/orchestrator-client +++ b/resources/bin/orchestrator-client @@ -186,7 +186,7 @@ function api() { set -o pipefail api_call_result=0 - for sleep_time in 0.1 0.2 0.5 1 2 5 10 0 ; do + for sleep_time in 0.1 0.2 0.5 1 2 2.5 5 0 ; do api_response=$(curl -b "${basic_auth}" -s "$uri" | jq '.') api_call_result=$? [ $api_call_result -eq 0 ] && break diff --git a/resources/public/js/cluster-analysis-shared.js b/resources/public/js/cluster-analysis-shared.js index 859dfb1ce..c48170363 100644 --- a/resources/public/js/cluster-analysis-shared.js +++ b/resources/public/js/cluster-analysis-shared.js @@ -4,6 +4,7 @@ var interestingAnalysis = { "DeadMasterAndSomeSlaves" : true, "DeadMasterWithoutSlaves" : true, "UnreachableMasterWithStaleSlaves": true, + "UnreachableMasterWithLaggingReplicas": true, "UnreachableMaster" : true, "AllMasterSlavesNotReplicating" : true, "AllMasterSlavesNotReplicatingOrDead" : true, diff --git a/tests/integration/analysis-unreachable-master-not-lagging-replicas/create.sql b/tests/integration/analysis-unreachable-master-not-lagging-replicas/create.sql new file mode 100644 index 000000000..a07dcd324 --- /dev/null +++ b/tests/integration/analysis-unreachable-master-not-lagging-replicas/create.sql @@ -0,0 +1,2 @@ +UPDATE database_instance SET last_seen=last_checked - interval 1 minute where port=22293; +UPDATE database_instance SET slave_lag_seconds=60 where port in (22295, 22296, 22297); diff --git a/tests/integration/analysis-unreachable-master-not-lagging-replicas/expect_output b/tests/integration/analysis-unreachable-master-not-lagging-replicas/expect_output new file mode 100644 index 000000000..602886b93 --- /dev/null +++ b/tests/integration/analysis-unreachable-master-not-lagging-replicas/expect_output @@ -0,0 +1 @@ +testhost:22293 (cluster testhost:22293): UnreachableMaster diff --git a/tests/integration/analysis-unreachable-master-not-lagging-replicas/extra_args b/tests/integration/analysis-unreachable-master-not-lagging-replicas/extra_args new file mode 100644 index 000000000..e294ffff2 --- /dev/null +++ b/tests/integration/analysis-unreachable-master-not-lagging-replicas/extra_args @@ -0,0 +1 @@ +-c replication-analysis diff --git a/tests/integration/analysis-unreachable-master-with-delayed-replicas/create.sql b/tests/integration/analysis-unreachable-master-with-delayed-replicas/create.sql new file mode 100644 index 000000000..12807e540 --- /dev/null +++ b/tests/integration/analysis-unreachable-master-with-delayed-replicas/create.sql @@ -0,0 +1,2 @@ +UPDATE database_instance SET last_seen=last_checked - interval 1 minute where port=22293; +UPDATE database_instance SET slave_lag_seconds=600, sql_delay=600 where port in (22294, 22296, 22297); diff --git 
a/tests/integration/analysis-unreachable-master-with-delayed-replicas/expect_output b/tests/integration/analysis-unreachable-master-with-delayed-replicas/expect_output new file mode 100644 index 000000000..602886b93 --- /dev/null +++ b/tests/integration/analysis-unreachable-master-with-delayed-replicas/expect_output @@ -0,0 +1 @@ +testhost:22293 (cluster testhost:22293): UnreachableMaster diff --git a/tests/integration/analysis-unreachable-master-with-delayed-replicas/extra_args b/tests/integration/analysis-unreachable-master-with-delayed-replicas/extra_args new file mode 100644 index 000000000..e294ffff2 --- /dev/null +++ b/tests/integration/analysis-unreachable-master-with-delayed-replicas/extra_args @@ -0,0 +1 @@ +-c replication-analysis diff --git a/tests/integration/analysis-unreachable-master-with-lagging-replicas-and-delayed-replicas/create.sql b/tests/integration/analysis-unreachable-master-with-lagging-replicas-and-delayed-replicas/create.sql new file mode 100644 index 000000000..303eeeeb3 --- /dev/null +++ b/tests/integration/analysis-unreachable-master-with-lagging-replicas-and-delayed-replicas/create.sql @@ -0,0 +1,3 @@ +UPDATE database_instance SET last_seen=last_checked - interval 1 minute where port=22293; +UPDATE database_instance SET slave_lag_seconds=600 where port in (22294, 22296, 22297); +UPDATE database_instance SET sql_delay=600 where port in (22297); diff --git a/tests/integration/analysis-unreachable-master-with-lagging-replicas-and-delayed-replicas/expect_output b/tests/integration/analysis-unreachable-master-with-lagging-replicas-and-delayed-replicas/expect_output new file mode 100644 index 000000000..372a85df4 --- /dev/null +++ b/tests/integration/analysis-unreachable-master-with-lagging-replicas-and-delayed-replicas/expect_output @@ -0,0 +1 @@ +testhost:22293 (cluster testhost:22293): UnreachableMasterWithLaggingReplicas diff --git a/tests/integration/analysis-unreachable-master-with-lagging-replicas-and-delayed-replicas/extra_args b/tests/integration/analysis-unreachable-master-with-lagging-replicas-and-delayed-replicas/extra_args new file mode 100644 index 000000000..e294ffff2 --- /dev/null +++ b/tests/integration/analysis-unreachable-master-with-lagging-replicas-and-delayed-replicas/extra_args @@ -0,0 +1 @@ +-c replication-analysis diff --git a/tests/integration/analysis-unreachable-master-with-lagging-replicas/create.sql b/tests/integration/analysis-unreachable-master-with-lagging-replicas/create.sql new file mode 100644 index 000000000..5379c6848 --- /dev/null +++ b/tests/integration/analysis-unreachable-master-with-lagging-replicas/create.sql @@ -0,0 +1,2 @@ +UPDATE database_instance SET last_seen=last_checked - interval 1 minute where port=22293; +UPDATE database_instance SET slave_lag_seconds=60 where port in (22294, 22296, 22297); diff --git a/tests/integration/analysis-unreachable-master-with-lagging-replicas/expect_output b/tests/integration/analysis-unreachable-master-with-lagging-replicas/expect_output new file mode 100644 index 000000000..372a85df4 --- /dev/null +++ b/tests/integration/analysis-unreachable-master-with-lagging-replicas/expect_output @@ -0,0 +1 @@ +testhost:22293 (cluster testhost:22293): UnreachableMasterWithLaggingReplicas diff --git a/tests/integration/analysis-unreachable-master-with-lagging-replicas/extra_args b/tests/integration/analysis-unreachable-master-with-lagging-replicas/extra_args new file mode 100644 index 000000000..e294ffff2 --- /dev/null +++ 
b/tests/integration/analysis-unreachable-master-with-lagging-replicas/extra_args @@ -0,0 +1 @@ +-c replication-analysis
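The recovery side of this change hinges on two small caches: `emergencyRestartReplicaTopologyInstanceMap` throttles per-replica IO-thread restarts, and `emergencyOperationGracefulPeriodMap` defers a follow-up `DeadMaster` recovery for a few seconds after the replicas have been restarted, so they get a chance to reconnect (or to confirm the master is truly gone) before failover logic kicks in. The patch uses the vendored go-cache package for both; the sketch below is a hypothetical standard-library-only simplification of the grace-period idea, not the patch's implementation.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// gracePeriod is a simplified stand-in for emergencyOperationGracefulPeriodMap:
// a key is considered active for a fixed duration after begin() is called.
type gracePeriod struct {
	mu      sync.Mutex
	until   map[string]time.Time
	howLong time.Duration
}

func newGracePeriod(d time.Duration) *gracePeriod {
	return &gracePeriod{until: map[string]time.Time{}, howLong: d}
}

// begin marks the start of the grace period for a given instance key.
func (g *gracePeriod) begin(instanceKey string) {
	g.mu.Lock()
	defer g.mu.Unlock()
	g.until[instanceKey] = time.Now().Add(g.howLong)
}

// active reports whether the instance is still inside its grace period;
// a missing key yields the zero time and therefore reports false.
func (g *gracePeriod) active(instanceKey string) bool {
	g.mu.Lock()
	defer g.mu.Unlock()
	return time.Now().Before(g.until[instanceKey])
}

func main() {
	g := newGracePeriod(5 * time.Second)
	g.begin("testhost:22293")
	// While the grace period is active, a DeadMaster analysis on this master
	// would be routed to the generic (non-acting) recovery function, as
	// getCheckAndRecoverFunction does in the patch.
	fmt.Println("defer DeadMaster recovery:", g.active("testhost:22293")) // true
}
```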