diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go
index f863236eee..8bacd94ab3 100644
--- a/cli/cmd/cluster.go
+++ b/cli/cmd/cluster.go
@@ -57,17 +57,17 @@ import (
 )
 
 var (
-	_flagClusterUpEnv             string
-	_flagClusterInfoEnv           string
-	_flagClusterScaleNodeGroup    string
-	_flagClusterScaleMinInstances int64
-	_flagClusterScaleMaxInstances int64
-	_flagClusterConfig            string
-	_flagClusterName              string
-	_flagClusterRegion            string
-	_flagClusterInfoDebug         bool
-	_flagClusterDisallowPrompt    bool
-	_flagClusterDownKeepVolumes   bool
+	_flagClusterUpEnv                string
+	_flagClusterInfoEnv              string
+	_flagClusterScaleNodeGroup       string
+	_flagClusterScaleMinInstances    int64
+	_flagClusterScaleMaxInstances    int64
+	_flagClusterConfig               string
+	_flagClusterName                 string
+	_flagClusterRegion               string
+	_flagClusterInfoDebug            bool
+	_flagClusterDisallowPrompt       bool
+	_flagClusterDownKeepAWSResources bool
 )
 
 var _eksctlPrefixRegex = regexp.MustCompile(`^.*[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} \[.+] {2}`)
@@ -100,7 +100,7 @@ func clusterInit() {
 	addClusterNameFlag(_clusterDownCmd)
 	addClusterRegionFlag(_clusterDownCmd)
 	_clusterDownCmd.Flags().BoolVarP(&_flagClusterDisallowPrompt, "yes", "y", false, "skip prompts")
-	_clusterDownCmd.Flags().BoolVar(&_flagClusterDownKeepVolumes, "keep-volumes", false, "keep cortex provisioned persistent volumes")
+	_clusterDownCmd.Flags().BoolVar(&_flagClusterDownKeepAWSResources, "keep-aws-resources", false, "skip deletion of resources that cortex provisioned on aws (bucket contents, ebs volumes, log group)")
 	_clusterCmd.AddCommand(_clusterDownCmd)
 
 	_clusterExportCmd.Flags().SortFlags = false
@@ -437,20 +437,33 @@ var _clusterDownCmd = &cobra.Command{
 		bucketName := clusterconfig.BucketName(accountID, accessConfig.ClusterName, accessConfig.Region)
 
 		warnIfNotAdmin(awsClient)
+		fmt.Println()
 
-		clusterState, err := clusterstate.GetClusterState(awsClient, accessConfig)
-		if err != nil && errors.GetKind(err) != clusterstate.ErrUnexpectedCloudFormationStatus {
-			exit.Error(err)
+		errorsList := []error{}
+
+		if _flagClusterDisallowPrompt {
+			fmt.Printf("your cluster named \"%s\" in %s will be spun down and all apis will be deleted\n\n", accessConfig.ClusterName, accessConfig.Region)
+		} else {
+			prompt.YesOrExit(fmt.Sprintf("your cluster named \"%s\" in %s will be spun down and all apis will be deleted, are you sure you want to continue?", accessConfig.ClusterName, accessConfig.Region), "", "")
 		}
-		if err == nil {
+
+		fmt.Print("○ retrieving cluster ... ")
+		var clusterExists bool
+		clusterState, err := clusterstate.GetClusterState(awsClient, accessConfig)
+		if err != nil {
+			errorsList = append(errorsList, err)
+			fmt.Print("failed ✗")
+			fmt.Printf("\n\ncouldn't retrieve cluster state; check the cluster stacks in the cloudformation console: https://%s.console.aws.amazon.com/cloudformation\n", accessConfig.Region)
+			errors.PrintError(err)
+			fmt.Println()
+		} else {
 			switch clusterState.Status {
 			case clusterstate.StatusNotFound:
-				exit.Error(clusterstate.ErrorClusterDoesNotExist(accessConfig.ClusterName, accessConfig.Region))
+				fmt.Println("cluster doesn't exist ✓")
 			case clusterstate.StatusDeleteComplete:
-				// silently clean up
 				awsClient.DeleteQueuesWithPrefix(clusterconfig.SQSNamePrefix(accessConfig.ClusterName))
 				awsClient.DeletePolicy(clusterconfig.DefaultPolicyARN(accountID, accessConfig.ClusterName, accessConfig.Region))
-				if !_flagClusterDownKeepVolumes {
+				if !_flagClusterDownKeepAWSResources {
 					volumes, err := listPVCVolumesForCluster(awsClient, accessConfig.ClusterName)
 					if err == nil {
 						for _, volume := range volumes {
@@ -458,76 +471,121 @@ var _clusterDownCmd = &cobra.Command{
 					}
 				}
 			}
-				exit.Error(clusterstate.ErrorClusterAlreadyDeleted(accessConfig.ClusterName, accessConfig.Region))
+				fmt.Println("already deleted ✓")
+			default:
+				fmt.Println("found ✓")
+				clusterExists = true
 			}
 		}
 
 		// updating CLI env is best-effort, so ignore errors
 		loadBalancer, _ := getLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
 
-		if _flagClusterDisallowPrompt {
-			fmt.Printf("your cluster named \"%s\" in %s will be spun down and all apis will be deleted\n\n", accessConfig.ClusterName, accessConfig.Region)
-		} else {
-			prompt.YesOrExit(fmt.Sprintf("your cluster named \"%s\" in %s will be spun down and all apis will be deleted, are you sure you want to continue?", accessConfig.ClusterName, accessConfig.Region), "", "")
-		}
-
-		fmt.Print("○ deleting sqs queues ")
-		err = awsClient.DeleteQueuesWithPrefix(clusterconfig.SQSNamePrefix(accessConfig.ClusterName))
-		if err != nil {
+		fmt.Print("○ deleting sqs queues ... ")
+		if queueExists, err := awsClient.DoesQueueExist(clusterconfig.SQSNamePrefix(accessConfig.ClusterName)); err != nil {
+			errorsList = append(errorsList, err)
+			fmt.Print("failed ✗")
 			fmt.Printf("\n\nfailed to delete all sqs queues; please delete queues starting with the name %s via the cloudwatch console: https://%s.console.aws.amazon.com/sqs/v2/home\n", clusterconfig.SQSNamePrefix(accessConfig.ClusterName), accessConfig.Region)
 			errors.PrintError(err)
 			fmt.Println()
+		} else if !queueExists {
+			fmt.Println("no sqs queues exist ✓")
 		} else {
-			fmt.Println("✓")
+			err = awsClient.DeleteQueuesWithPrefix(clusterconfig.SQSNamePrefix(accessConfig.ClusterName))
+			if err != nil {
+				fmt.Print("failed ✗")
+				errorsList = append(errorsList, err)
+				fmt.Printf("\n\nfailed to delete all sqs queues; please delete queues starting with the name %s via the cloudwatch console: https://%s.console.aws.amazon.com/sqs/v2/home\n", clusterconfig.SQSNamePrefix(accessConfig.ClusterName), accessConfig.Region)
+				errors.PrintError(err)
+				fmt.Println()
+			} else {
+				fmt.Println("✓")
+			}
 		}
 
-		fmt.Print("○ spinning down the cluster ...")
-
-		out, exitCode, err := runManagerAccessCommand("/root/uninstall.sh", *accessConfig, awsClient, nil, nil)
-		if err != nil {
-			errors.PrintError(err)
+		clusterDoesntExist := !clusterExists
+		if clusterExists {
+			fmt.Print("○ spinning down the cluster ...")
+			out, exitCode, err := runManagerAccessCommand("/root/uninstall.sh", *accessConfig, awsClient, nil, nil)
+			if err != nil {
+				errorsList = append(errorsList, err)
+				fmt.Println()
+				errors.PrintError(err)
+			} else if exitCode == nil || *exitCode != 0 {
+				template := "\nNote: if this error cannot be resolved, please ensure that all CloudFormation stacks for this cluster eventually become fully deleted (%s)."
+				template += " If the stack deletion process has failed, please delete the stacks directly from the AWS console (this may require manually deleting particular AWS resources that are blocking the stack deletion)."
+				template += " In addition to deleting the stacks manually from the AWS console, also make sure to empty and remove the %s bucket"
+				helpStr := fmt.Sprintf(template, clusterstate.CloudFormationURL(accessConfig.ClusterName, accessConfig.Region), bucketName)
+				fmt.Println(helpStr)
+				errorsList = append(errorsList, ErrorClusterDown(filterEKSCTLOutput(out)+helpStr))
+			} else {
+				clusterDoesntExist = true
+			}
 			fmt.Println()
-		} else if exitCode == nil || *exitCode != 0 {
-			out = filterEKSCTLOutput(out)
-			template := "\nNote: if this error cannot be resolved, please ensure that all CloudFormation stacks for this cluster eventually become fully deleted (%s)."
-			template += " If the stack deletion process has failed, please delete the stacks directly from the AWS console (this may require manually deleting particular AWS resources that are blocking the stack deletion)."
-			template += " In addition to deleting the stacks manually from the AWS console, also make sure to empty and remove the %s bucket"
-			helpStr := fmt.Sprintf(template, clusterstate.CloudFormationURL(accessConfig.ClusterName, accessConfig.Region), bucketName)
-			fmt.Println(helpStr)
-			exit.Error(ErrorClusterDown(out + helpStr))
 		}
 
 		// set lifecycle policy to clean the bucket
-		fmt.Printf("○ setting lifecycle policy to empty the %s bucket ", bucketName)
-		err = setLifecycleRulesOnClusterDown(awsClient, bucketName)
-		if err != nil {
-			fmt.Printf("\n\nfailed to set lifecycle policy to empty the %s bucket; you can remove the bucket manually via the s3 console: https://s3.console.aws.amazon.com/s3/management/%s\n", bucketName, bucketName)
-			errors.PrintError(err)
-			fmt.Println()
+		var bucketExists bool
+		if !_flagClusterDownKeepAWSResources {
+			fmt.Printf("○ setting lifecycle policy to empty the %s bucket ... ", bucketName)
+			bucketExists, err = awsClient.DoesBucketExist(bucketName)
+			if err != nil {
+				errorsList = append(errorsList, err)
+				fmt.Print("failed ✗")
+				fmt.Printf("\n\nfailed to set lifecycle policy to empty the %s bucket; you can remove the bucket manually via the s3 console: https://s3.console.aws.amazon.com/s3/management/%s\n", bucketName, bucketName)
+				errors.PrintError(err)
+				fmt.Println()
+			} else if !bucketExists {
+				fmt.Println("bucket doesn't exist ✓")
+			} else {
+				err = setLifecycleRulesOnClusterDown(awsClient, bucketName)
+				if err != nil {
+					errorsList = append(errorsList, err)
+					fmt.Print("failed ✗")
+					fmt.Printf("\n\nfailed to set lifecycle policy to empty the %s bucket; you can remove the bucket manually via the s3 console: https://s3.console.aws.amazon.com/s3/management/%s\n", bucketName, bucketName)
+					errors.PrintError(err)
+					fmt.Println()
+				} else {
+					fmt.Println("✓")
+				}
+			}
 		}
-		fmt.Println("✓")
 
 		// delete policy after spinning down the cluster (which deletes the roles) because policies can't be deleted if they are attached to roles
-		policyARN := clusterconfig.DefaultPolicyARN(accountID, accessConfig.ClusterName, accessConfig.Region)
-		fmt.Printf("○ deleting auto-generated iam policy %s ", policyARN)
-		err = awsClient.DeletePolicy(policyARN)
-		if err != nil {
-			fmt.Printf("\n\nfailed to delete auto-generated cortex policy %s; please delete the policy via the iam console: https://console.aws.amazon.com/iam/home#/policies\n", policyARN)
-			errors.PrintError(err)
-			fmt.Println()
-		} else {
-			fmt.Println("✓")
+		if clusterDoesntExist {
+			policyARN := clusterconfig.DefaultPolicyARN(accountID, accessConfig.ClusterName, accessConfig.Region)
+			fmt.Printf("○ deleting auto-generated iam policy %s ... ", policyARN)
+			if policy, err := awsClient.GetPolicyOrNil(policyARN); err != nil {
+				errorsList = append(errorsList, err)
+				fmt.Print("failed ✗")
+				fmt.Printf("\n\nfailed to delete auto-generated cortex policy %s; please delete the policy via the iam console: https://console.aws.amazon.com/iam/home#/policies\n", policyARN)
+				errors.PrintError(err)
+				fmt.Println()
+			} else if policy == nil {
+				fmt.Println("policy doesn't exist ✓")
+			} else {
+				err = awsClient.DeletePolicy(policyARN)
+				if err != nil {
+					errorsList = append(errorsList, err)
+					fmt.Print("failed ✗")
+					fmt.Printf("\n\nfailed to delete auto-generated cortex policy %s; please delete the policy via the iam console: https://console.aws.amazon.com/iam/home#/policies\n", policyARN)
+					errors.PrintError(err)
+					fmt.Println()
+				} else {
+					fmt.Println("✓")
+				}
+			}
 		}
 
-		// delete EBS volumes
-		if !_flagClusterDownKeepVolumes {
+		if !_flagClusterDownKeepAWSResources {
+			fmt.Print("○ deleting ebs volumes ... ")
 			volumes, err := listPVCVolumesForCluster(awsClient, accessConfig.ClusterName)
 			if err != nil {
-				fmt.Println("\nfailed to list volumes for deletion; please delete any volumes associated with your cluster via the ec2 console: https://console.aws.amazon.com/ec2/v2/home?#Volumes")
+				errorsList = append(errorsList, err)
+				fmt.Println("\n\nfailed to list volumes for deletion; please delete any volumes associated with your cluster via the ec2 console: https://console.aws.amazon.com/ec2/v2/home?#Volumes")
 				errors.PrintError(err)
 				fmt.Println()
 			} else {
-				fmt.Print("○ deleting ebs volumes ")
 				var failedToDeleteVolumes []string
 				var lastErr error
 				for _, volume := range volumes {
@@ -537,7 +595,10 @@ var _clusterDownCmd = &cobra.Command{
 						lastErr = err
 					}
 				}
-				if lastErr != nil {
+				if len(volumes) == 0 {
+					fmt.Println("no ebs volumes exist ✓")
+				} else if lastErr != nil {
+					errorsList = append(errorsList, lastErr)
 					fmt.Printf("\n\nfailed to delete %s %s; please delete %s via the ec2 console: https://console.aws.amazon.com/ec2/v2/home?#Volumes\n", s.PluralS("volume", len(failedToDeleteVolumes)), s.UserStrsAnd(failedToDeleteVolumes), s.PluralCustom("it", "them", len(failedToDeleteVolumes)))
 					errors.PrintError(lastErr)
 					fmt.Println()
@@ -545,8 +606,46 @@ var _clusterDownCmd = &cobra.Command{
 					fmt.Println("✓")
 				}
 			}
+
+			fmt.Printf("○ deleting log group %s ... ", accessConfig.ClusterName)
+			logGroupExists, err := awsClient.DoesLogGroupExist(accessConfig.ClusterName)
+			if err != nil {
+				errorsList = append(errorsList, err)
+				fmt.Print("failed ✗")
+				fmt.Printf("\n\nfailed to list log group for deletion; please delete the log group associated with your cluster via the cloudwatch console: https://%s.console.aws.amazon.com/cloudwatch/home?#logsV2:log-groups\n", accessConfig.Region)
+				errors.PrintError(err)
+				fmt.Println()
+			} else {
+				if !logGroupExists {
+					fmt.Println("log group doesn't exist ✓")
+				} else {
+					err = awsClient.DeleteLogGroup(accessConfig.ClusterName)
+					if err != nil {
+						errorsList = append(errorsList, err)
+						fmt.Print("failed ✗")
+						fmt.Printf("\n\nfailed to delete log group %s; please delete the log group associated with your cluster via the cloudwatch console: https://%s.console.aws.amazon.com/cloudwatch/home?#logsV2:log-groups\n", accessConfig.ClusterName, accessConfig.Region)
+						errors.PrintError(err)
+						fmt.Println()
+					} else {
+						fmt.Println("✓")
+					}
+				}
+			}
 		}
 
+		// best-effort deletion of cached config
+		cachedClusterConfigPath := cachedClusterConfigPath(accessConfig.ClusterName, accessConfig.Region)
+		os.Remove(cachedClusterConfigPath)
+
+		if len(errorsList) > 0 {
+			exit.Error(errors.ListOfErrors(ErrClusterDown, false, errorsList...))
+		}
+		fmt.Printf("\nplease check CloudFormation to ensure that all resources for the %s cluster eventually become successfully deleted: %s\n", accessConfig.ClusterName, clusterstate.CloudFormationURL(accessConfig.ClusterName, accessConfig.Region))
+		if !_flagClusterDownKeepAWSResources && bucketExists {
+			fmt.Printf("\na lifecycle rule has been applied to the cluster's %s bucket to empty its contents within the next 24 hours; you can delete the %s bucket via the s3 console once it has been emptied (or you can empty and delete it now): https://s3.console.aws.amazon.com/s3/management/%s\n", bucketName, bucketName, bucketName)
+		}
+		fmt.Println()
+
 		// best-effort deletion of cli environment(s)
 		if loadBalancer != nil {
 			envNames, isDefaultEnv, _ := getEnvNamesByOperatorEndpoint(*loadBalancer.DNSName)
@@ -569,12 +668,6 @@ var _clusterDownCmd = &cobra.Command{
 				}
 			}
 		}
-
-		fmt.Printf("\nplease check CloudFormation to ensure that all resources for the %s cluster eventually become successfully deleted: %s\n", accessConfig.ClusterName, clusterstate.CloudFormationURL(accessConfig.ClusterName, accessConfig.Region))
-		fmt.Printf("\na lifecycle rule has been applied to the cluster’s %s bucket to empty its contents later today; you can delete the %s bucket via the s3 console once it has been emptied: https://s3.console.aws.amazon.com/s3/management/%s\n", bucketName, bucketName, bucketName)
-
-		cachedClusterConfigPath := cachedClusterConfigPath(accessConfig.ClusterName, accessConfig.Region)
-		os.Remove(cachedClusterConfigPath)
 	},
 }
 
diff --git a/dev/minimum_aws_policy.json b/dev/minimum_aws_policy.json
index 6e30adedbf..194f1cf04f 100644
--- a/dev/minimum_aws_policy.json
+++ b/dev/minimum_aws_policy.json
@@ -76,6 +76,7 @@
                 "iam:ListInstanceProfiles",
                 "logs:CreateLogGroup",
                 "logs:PutLogEvents",
+                "logs:DeleteLogGroup",
                 "iam:CreateOpenIDConnectProvider",
                 "iam:GetOpenIDConnectProvider",
                 "iam:GetRolePolicy"
diff --git a/docs/clients/cli.md b/docs/clients/cli.md
index 838c0f898a..8c48f98093 100644
--- a/docs/clients/cli.md
+++ b/docs/clients/cli.md
@@ -164,12 +164,12 @@ Usage:
   cortex cluster down [flags]
 
 Flags:
-  -c, --config string   path to a cluster configuration file
-  -n, --name string     name of the cluster
-  -r, --region string   aws region of the cluster
-  -y, --yes             skip prompts
-      --keep-volumes    keep cortex provisioned persistent volumes
-  -h, --help            help for down
+  -c, --config string        path to a cluster configuration file
+  -n, --name string          name of the cluster
+  -r, --region string        aws region of the cluster
+  -y, --yes                  skip prompts
+      --keep-aws-resources   skip deletion of resources that cortex provisioned on aws (bucket contents, ebs volumes, log group)
+  -h, --help                 help for down
 ```
 
 ## cluster export
diff --git a/docs/clusters/management/auth.md b/docs/clusters/management/auth.md
index 90d2687055..b01ff08a56 100644
--- a/docs/clusters/management/auth.md
+++ b/docs/clusters/management/auth.md
@@ -141,6 +141,7 @@ Replace the following placeholders with their respective values in the policy te
                 "iam:ListInstanceProfiles",
                 "logs:CreateLogGroup",
                 "logs:PutLogEvents",
+                "logs:DeleteLogGroup",
                 "iam:CreateOpenIDConnectProvider",
                 "iam:GetOpenIDConnectProvider",
                 "iam:GetRolePolicy"
diff --git a/docs/clusters/management/delete.md b/docs/clusters/management/delete.md
index 2b3480fde7..964ce779a3 100644
--- a/docs/clusters/management/delete.md
+++ b/docs/clusters/management/delete.md
@@ -4,34 +4,25 @@
 cortex cluster down
 ```
 
-## Delete metadata and log groups
+## Bucket Contents
 
-Since you may wish to have access to your data after spinning down your cluster, Cortex's bucket, log groups, and
-Prometheus volume are not automatically deleted when running `cortex cluster down`.
-
-To delete them:
-
-```bash
-# identify the name of your cortex S3 bucket
-aws s3 ls
-
-# delete the S3 bucket
-aws s3 rb --force s3://<bucket>
-
-# delete the log group (replace <log_group> with the name of your cluster, default: cortex)
-aws logs describe-log-groups --log-group-name-prefix=<log_group> --query logGroups[*].[logGroupName] --output text | xargs -I {} aws logs delete-log-group --log-group-name {}
-```
+When a Cortex cluster is created, an S3 bucket is created for its internal use. When running `cortex cluster down`, a lifecycle rule is applied to the bucket such that its entire contents are removed within the next 24 hours. You can safely delete the bucket at any time after `cortex cluster down` has finished running.
 
 ## Delete Certificates
 
 If you've configured a custom domain for your APIs, you can remove the SSL Certificate and Hosted Zone for the domain by following these [instructions](../networking/custom-domain.md#cleanup).
 
-## Keep Cortex Volumes
+## Keep Cortex Resources
 
-The volumes used by Cortex's Prometheus and Grafana instances are deleted by default on a cluster down operation.
-If you want to keep the metrics and dashboards volumes for any reason,
-you can pass the `--keep-volumes` flag to the `cortex cluster down` command.
+The contents of Cortex's S3 bucket, the EBS volumes (used by Cortex's Prometheus and Grafana instances), and the log group are deleted by default when running `cortex cluster down`. If you want to keep these resources, you can pass the `--keep-aws-resources` flag to the `cortex cluster down` command.
 
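+For example, to empty and delete the bucket immediately instead of waiting for the lifecycle rule, you can use the AWS CLI (replace `<bucket>` with the name of your cluster's bucket, which you can find with `aws s3 ls`; this is irreversible):
+
+```bash
+# empty and delete the cluster's S3 bucket
+aws s3 rb --force s3://<bucket>
+```
+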
 ## Troubleshooting
diff --git a/pkg/lib/aws/cloudwatch.go b/pkg/lib/aws/cloudwatch.go
index 317972afce..507a8a1fca 100644
--- a/pkg/lib/aws/cloudwatch.go
+++ b/pkg/lib/aws/cloudwatch.go
@@ -105,6 +105,17 @@ func (c *Client) CreateLogGroup(logGroup string, tags map[string]string) error {
 	return nil
 }
 
+func (c *Client) DeleteLogGroup(logGroup string) error {
+	_, err := c.CloudWatchLogs().DeleteLogGroup(&cloudwatchlogs.DeleteLogGroupInput{
+		LogGroupName: aws.String(logGroup),
+	})
+	if err != nil {
+		return errors.Wrap(err, "log group "+logGroup)
+	}
+
+	return nil
+}
+
 func (c *Client) TagLogGroup(logGroup string, tagMap map[string]string) error {
 	tags := map[string]*string{}
 	for key, value := range tagMap {
diff --git a/pkg/lib/aws/iam.go b/pkg/lib/aws/iam.go
index 4490ab1902..547b36ad55 100644
--- a/pkg/lib/aws/iam.go
+++ b/pkg/lib/aws/iam.go
@@ -209,3 +209,20 @@ func (c *Client) DeletePolicy(policyARN string) error {
 	}
 	return nil
 }
+
+func (c *Client) GetPolicyOrNil(policyARN string) (*iam.Policy, error) {
+	policyOutput, err := c.IAM().GetPolicy(&iam.GetPolicyInput{
+		PolicyArn: aws.String(policyARN),
+	})
+	if err != nil {
+		if IsErrCode(err, iam.ErrCodeNoSuchEntityException) {
+			return nil, nil
+		}
+		return nil, errors.WithStack(err)
+	}
+
+	if policyOutput != nil {
+		return policyOutput.Policy, nil
+	}
+	return nil, nil
+}
diff --git a/pkg/lib/errors/errors.go b/pkg/lib/errors/errors.go
index 10ddd97b39..2758f23f7e 100644
--- a/pkg/lib/errors/errors.go
+++ b/pkg/lib/errors/errors.go
@@ -17,6 +17,7 @@ limitations under the License.
 package errors
 
 import (
+	"fmt"
 	"strings"
 
 	s "github.com/cortexlabs/cortex/pkg/lib/strings"
@@ -37,3 +38,20 @@ func ErrorUnexpected(msgs ...interface{}) error {
 		Message: strings.Join(strs, ": "),
 	})
 }
+
+func ListOfErrors(errKind string, shouldPrint bool, errors ...error) error {
+	var errorsContents string
+	for i, err := range errors {
+		if err != nil {
+			errorsContents += fmt.Sprintf("error #%d: %s\n", i+1, err.Error())
+		}
+	}
+	if errorsContents == "" {
+		return nil
+	}
+	return WithStack(&Error{
+		Kind:    errKind,
+		Message: errorsContents,
+		NoPrint: !shouldPrint,
+	})
+}
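
To verify the teardown after `cortex cluster down` finishes, the AWS CLI can be used to confirm that the cluster's resources are gone. A minimal sketch, assuming the default cluster name `cortex` and the region `us-west-2` (substitute your own values; the exact stack and bucket names depend on your cluster configuration):

```bash
# the cluster's CloudFormation stacks should eventually reach DELETE_COMPLETE (or stop being listed)
aws cloudformation list-stacks --region us-west-2 \
  --query "StackSummaries[?contains(StackName, 'cortex')].[StackName,StackStatus]" --output table

# the cluster's log group should no longer be listed
aws logs describe-log-groups --region us-west-2 --log-group-name-prefix cortex

# the cluster's bucket should disappear once the lifecycle rule has emptied it and it has been deleted
aws s3 ls | grep cortex
```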