Preview Envs on Harvester: Delete when inactive #10379

Merged · 1 commit · Jun 1, 2022
161 changes: 117 additions & 44 deletions .werft/platform-delete-preview-environments-cron.ts
@@ -4,7 +4,7 @@ import { SpanStatusCode } from '@opentelemetry/api';
import { wipePreviewEnvironmentAndNamespace, helmInstallName, listAllPreviewNamespaces } from './util/kubectl';
import { exec } from './util/shell';
import { previewNameFromBranchName } from './util/preview';
-import { CORE_DEV_KUBECONFIG_PATH, HARVESTER_KUBECONFIG_PATH } from './jobs/build/const';
+import {CORE_DEV_KUBECONFIG_PATH, HARVESTER_KUBECONFIG_PATH, PREVIEW_K3S_KUBECONFIG_PATH} from './jobs/build/const';
import {deleteDNSRecord} from "./util/gcloud";
import * as VM from "./vm/vm";

@@ -18,7 +18,7 @@ const SLICES = {
FETCHING_PREVIEW_ENVIRONMENTS: "Fetching preview environments",
FETCHING_BRANCHES: "Fetching branches",
CHECKING_FOR_STALE_BRANCHES: "Checking for stale branches",
-CHECKING_FOR_NO_DB_ACTIVITY: "Checking for DB activity",
+CHECKING_FOR_DB_ACTIVITY: "Checking for DB activity",
DETERMINING_STALE_PREVIEW_ENVIRONMENTS: "Determining stale preview environments",
DELETING_PREVIEW_ENVIRONMNETS: "Deleting preview environments"
}
@@ -54,9 +54,12 @@ class HarvesterPreviewEnvironment {
// The name of the namespace that the VM and related resources are in, e.g. preview-my-branch
namespace: string

-// Then name of the preview environment, e.g. my-branch
+// The name of the preview environment, e.g. my-branch
name: string

+// The namespace in the k3s cluster where all resources are (default)
+k3sNamespace: string = "default"

constructor (namespace: string) {
this.namespace = namespace
this.name = namespace.replace(HarvesterPreviewEnvironment.namespacePrefix, "")
@@ -80,12 +83,64 @@ class HarvesterPreviewEnvironment {
])
}

-isInactive(): boolean {
-// We'll port over the logic from CoreDevPreviewEnvironment later, for now we consider
-// Harvester preview environments to never be stale due to inactivity.
-const sliceID = SLICES.CHECKING_FOR_NO_DB_ACTIVITY
-werft.log(sliceID, `${this.name} (${this.namespace}) - is-inactive=false - Harvester based `)
-return false
+/**
+ * Checks whether a preview environment is active based on the db activity.
+ *
+ * It errs on the side of caution, so in case of connection issues etc. it will consider the
+ * preview environment active.
+ */
+isActive(): boolean {
+const sliceID = SLICES.CHECKING_FOR_DB_ACTIVITY
+try {
+try {
+VM.get({name: this.name});
+} catch(e){
+if (e instanceof VM.NotFoundError){
+werft.log(sliceID, `${this.name} - is-active=false - The VM doesn't exist, deleting the environment`)
+return false
+}
+werft.log(sliceID, `${this.name} - is-active=true - Unexpected error trying to get the VM. Marking env as active: ${e.message}`)
+return true
+}

+// The preview env is its own k3s cluster, so we need to get the kubeconfig for it
+VM.startSSHProxy({ name: this.name, slice: sliceID })
+exec('sleep 5', { silent: true, slice: sliceID })

+VM.copyk3sKubeconfig({ name: this.name, timeoutMS: 1000 * 60 * 3, slice: sliceID })
+const kubectclCmd = `KUBECONFIG=${PREVIEW_K3S_KUBECONFIG_PATH} kubectl --insecure-skip-tls-verify`

+werft.log(sliceID, `${this.name} (${this.k3sNamespace}) - Checking status of the MySQL pod`)
+const statusDB = exec(`${kubectclCmd} get pods mysql-0 -n ${this.k3sNamespace} -o jsonpath='{.status.phase}'`, { slice: sliceID})
+const statusDbContainer = exec(`${kubectclCmd} get pods mysql-0 -n ${this.k3sNamespace} -o jsonpath='{.status.containerStatuses.*.ready}'`, { slice: sliceID})

+if (statusDB.code != 0 || statusDB != "Running" || statusDbContainer == "false") {
+werft.log(sliceID, `${this.name} (${this.k3sNamespace}) - is-active=true - The database is not reachable, assuming env is active`)
+return true
+}

+const dbPassword = exec(`${kubectclCmd} get secret db-password -n ${this.k3sNamespace} -o jsonpath='{.data.mysql-root-password}' | base64 -d`, {silent: true}).stdout.trim()

+// MySQL runs in the preview env cluster that is not reachable from the job's pod, so we have to port forward
+exec(`${kubectclCmd} -n ${this.k3sNamespace} port-forward svc/mysql 33061:3306`, { async: true, silent:true, slice: sliceID, dontCheckRc: true })
+exec('sleep 5', { silent: true, slice: sliceID })

+// Using MYSQL_PWD instead of a flag for the pwd suppresses "[Warning] Using a password on the command line interface can be insecure."
+const dbConn = `MYSQL_PWD=${dbPassword} mysql --host=127.0.0.1 --port=33061 --user=root --database=gitpod -s -N`
+const active = isDbActive(this, dbConn, sliceID)

+// clean up after ourselves, as we'll be running this for quite a few environments
+VM.stopKubectlPortForwards()
+exec(`rm ${PREVIEW_K3S_KUBECONFIG_PATH}`, { silent :true, slice: sliceID })

+return active
+} catch (err) {
+// cleanup in case of an error
+VM.stopKubectlPortForwards()
+exec(`rm ${PREVIEW_K3S_KUBECONFIG_PATH}`, { silent :true, slice: sliceID })
+werft.log(sliceID, `${this.name} (${this.k3sNamespace}) - is-active=true - Unable to check DB activity, assuming env is active`)
+return true
+}
}

/**
@@ -132,51 +187,38 @@ class CoreDevPreviewEnvironment {
}

/**
-* Checks whether or not a preview environment is considered inactive.
+* Checks whether a preview environment is active based on the db activity.
*
-* It errors on the side of caution, so in case of connection issues etc. it will consider the
+* It errs on the side of caution, so in case of connection issues etc. it will consider the
* preview environment active.
*/
-isInactive(): boolean {
-const sliceID = SLICES.CHECKING_FOR_NO_DB_ACTIVITY
+isActive(): boolean {
+const sliceID = SLICES.CHECKING_FOR_DB_ACTIVITY
try {
const statusNS = exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get ns ${this.namespace} -o jsonpath='{.status.phase}'`, { slice: sliceID })

if (statusNS != "Active") {
-werft.log(sliceID, `${this.name} (${this.namespace}) - is-inactive=false - The namespace is ${statusNS}`)
-return false
+werft.log(sliceID, `${this.name} (${this.namespace}) - is-active=true - The namespace is ${statusNS}, assuming env is active`)
+return true
}

werft.log(sliceID, `${this.name} (${this.namespace}) - Checking status of the MySQL pod`)
const statusDB = exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get pods mysql-0 -n ${this.namespace} -o jsonpath='{.status.phase}'`, { slice: sliceID})
const statusDbContainer = exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get pods mysql-0 -n ${this.namespace} -o jsonpath='{.status.containerStatuses.*.ready}'`, { slice: sliceID})

if (statusDB.code != 0 || statusDB != "Running" || statusDbContainer == "false") {
-werft.log(sliceID, `${this.name} (${this.namespace}) - is-inactive=false - The database is not reachable`)
-return false
+werft.log(sliceID, `${this.name} (${this.namespace}) - is-active=true - The database is not reachable, assuming env is active`)
+return true
}

const dbPassword = exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get secret db-password -n ${this.namespace} -o jsonpath='{.data.mysql-root-password}' | base64 -d`, {silent: true}).stdout.trim()
-const connectionToDb = `mysql --host=db.${this.namespace}.svc.cluster.local --port=3306 --user=root --database=gitpod -s -N --password=${dbPassword}`

-const latestInstanceTimeout = 48
-const latestInstance = exec(`${connectionToDb} --execute="SELECT creationTime FROM d_b_workspace_instance WHERE creationTime > DATE_SUB(NOW(), INTERVAL '${latestInstanceTimeout}' HOUR) LIMIT 1"`, { slice: sliceID})

-const latestUserTimeout = 48
-const latestUser= exec(`${connectionToDb} --execute="SELECT creationDate FROM d_b_user WHERE creationDate > DATE_SUB(NOW(), INTERVAL '${latestUserTimeout}' HOUR) LIMIT 1"`, { slice: sliceID})

-const lastModifiedTimeout = 48
-const lastModified= exec(`${connectionToDb} --execute="SELECT _lastModified FROM d_b_user WHERE _lastModified > DATE_SUB(NOW(), INTERVAL '${lastModifiedTimeout}' HOUR) LIMIT 1"`, { slice: sliceID})
+const dbConn = `MYSQL_PWD=${dbPassword} mysql --host=db.${this.namespace}.svc.cluster.local --port=3306 --user=root --database=gitpod -s -N`

-const heartbeatTimeout = 48
-const heartbeat= exec(`${connectionToDb} --execute="SELECT lastSeen FROM d_b_workspace_instance_user WHERE lastSeen > DATE_SUB(NOW(), INTERVAL '${heartbeatTimeout}' HOUR) LIMIT 1"`, { slice: sliceID})

-const isInactive = (heartbeat.length < 1) && (latestInstance.length < 1) && (latestUser.length < 1) && (lastModified.length < 1)
-werft.log(sliceID, `${this.name} (${this.namespace}) - is-inactive=${isInactive}`)
-return isInactive
+return isDbActive(this, dbConn, sliceID)
} catch (err) {
-werft.log(sliceID, `${this.name} (${this.namespace}) - is-inactive=false - Unable to check DB activity`)
-return false
+werft.log(sliceID, `${this.name} (${this.namespace}) - is-active=true - Unable to check DB activity, assuming env is active`)
+return true
}
}
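Worth noting how the two implementations differ only in the path to MySQL: a core-dev preview lives in the same cluster as the cron job, so the DB is reachable through in-cluster DNS, while a Harvester preview is its own k3s cluster, so the job first copies the VM's kubeconfig and port-forwards. Side by side (both connection strings as in the diff above):

// core-dev: the job pod resolves the preview's service directly
const coreDevConn = `MYSQL_PWD=${dbPassword} mysql --host=db.${this.namespace}.svc.cluster.local --port=3306 --user=root --database=gitpod -s -N`
// harvester: tunnel into the preview's own k3s cluster first, then talk to localhost
// (set up by the preceding `kubectl port-forward svc/mysql 33061:3306`)
const harvesterConn = `MYSQL_PWD=${dbPassword} mysql --host=127.0.0.1 --port=33061 --user=root --database=gitpod -s -N`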

@@ -323,28 +365,23 @@ async function determineStalePreviewEnvironments(options: {previews: PreviewEnvi
]))
werft.done(SLICES.CHECKING_FOR_STALE_BRANCHES)

-werft.log(SLICES.CHECKING_FOR_NO_DB_ACTIVITY, `Checking ${previews.length} preview environments for DB activity`)
+werft.log(SLICES.CHECKING_FOR_DB_ACTIVITY, `Checking ${previews.length} preview environments for DB activity`)
const previewNamespacesWithNoDBActivity = new Set(
previews
-.filter((preview) => preview.isInactive())
+.filter((preview) => !preview.isActive())
.map((preview) => preview.namespace)
)

-werft.done(SLICES.CHECKING_FOR_NO_DB_ACTIVITY)
+werft.done(SLICES.CHECKING_FOR_DB_ACTIVITY)

const previewsToDelete = previews.filter((preview: PreviewEnvironment) => {
if (!previewNamespaceBasedOnBranches.has(preview.namespace)) {
werft.log(SLICES.DETERMINING_STALE_PREVIEW_ENVIRONMENTS, `Considering ${preview.name} (${preview.namespace}) stale due to missing branch`)
return true
}

-if (previewNamespaceBasedOnStaleBranches.has(preview.namespace)) {
-werft.log(SLICES.DETERMINING_STALE_PREVIEW_ENVIRONMENTS, `Considering ${preview.name} (${preview.namespace}) stale due to no recent commit activity`)
-return true
-}

-if (previewNamespacesWithNoDBActivity.has(preview.namespace)) {
-werft.log(SLICES.DETERMINING_STALE_PREVIEW_ENVIRONMENTS, `Considering ${preview.name} (${preview.namespace}) stale due to no recent DB activity`)
+if (previewNamespaceBasedOnStaleBranches.has(preview.namespace) && previewNamespacesWithNoDBActivity.has(preview.namespace)) {
+werft.log(SLICES.DETERMINING_STALE_PREVIEW_ENVIRONMENTS, `Considering ${preview.name} (${preview.namespace}) stale due to no recent commit and DB activity`)
return true
}
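The net effect of this change: previously a stale branch alone, or missing DB activity alone, was enough to delete an environment; now an environment whose branch still exists is only deleted when the branch is stale and the DB shows no recent activity. Condensed into a single predicate (same sets as in the filter above):

// delete when the branch is gone, OR when it is stale AND the DB is quiet
const shouldDelete =
    !previewNamespaceBasedOnBranches.has(preview.namespace) ||
    (previewNamespaceBasedOnStaleBranches.has(preview.namespace) && previewNamespacesWithNoDBActivity.has(preview.namespace))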

@@ -407,3 +444,39 @@ async function cleanLoadbalancer() {
function getAllBranches(): string[] {
return exec(`git branch -r | grep -v '\\->' | sed "s,\\x1B\\[[0-9;]*[a-zA-Z],,g" | while read remote; do echo "\${remote#origin/}"; done`).stdout.trim().split('\n');
}

+/**
+* Determines if the db of a preview environment is active
+* by checking whether there were relevant entries in the workspace and user tables in the last 48h
+*/
+function isDbActive(previewEnvironment: PreviewEnvironment, dbConn: string, sliceID: string): boolean{
+const timeout = 48
+let isActive = false

+const queries = {
+"d_b_workspace_instance": `SELECT TIMESTAMPDIFF(HOUR, creationTime, NOW()) FROM d_b_workspace_instance WHERE creationTime > DATE_SUB(NOW(), INTERVAL '${timeout}' HOUR) ORDER BY creationTime DESC LIMIT 1`,
+"d_b_user-created": `SELECT TIMESTAMPDIFF(HOUR, creationDate, NOW()) FROM d_b_user WHERE creationDate > DATE_SUB(NOW(), INTERVAL '${timeout}' HOUR) ORDER BY creationDate DESC LIMIT 1`,
+"d_b_user-modified": `SELECT TIMESTAMPDIFF(HOUR, _lastModified, NOW()) FROM d_b_user WHERE _lastModified > DATE_SUB(NOW(), INTERVAL '${timeout}' HOUR) ORDER BY _lastModified DESC LIMIT 1`,
+"d_b_workspace_instance_user": `SELECT TIMESTAMPDIFF(HOUR, lastSeen, NOW()) FROM d_b_workspace_instance_user WHERE lastSeen > DATE_SUB(NOW(), INTERVAL '${timeout}' HOUR) ORDER BY lastSeen DESC LIMIT 1`
+}

+const result = {}
+for (const [key, query] of Object.entries(queries)) {
+// explicitly set to null, so we get an output in the logs for those queries
+result[key] = null
+const queryResult = exec(`${dbConn} --execute="${query}"`, { silent:true, slice: sliceID})
+if (queryResult.length > 0) {
+result[key] = queryResult.stdout.trim()
+isActive = true
+}
+}

+const logLines = Object.entries(result).map((kv) => `${kv.join(":")}`)
+const logLine = `Last Activity (hours ago): ${logLines.join(",")}`

+werft.log(sliceID, `${previewEnvironment.name} (${previewEnvironment.namespace}) - is-active=${isActive} ${logLine}`)

+return isActive
+}
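To make the logging concrete: for a hypothetical preview environment my-branch where a workspace instance was created three hours ago but no user rows were touched, the emitted line would look roughly like:

// my-branch (preview-my-branch) - is-active=true Last Activity (hours ago): d_b_workspace_instance:3,d_b_user-created:,d_b_user-modified:,d_b_workspace_instance_user:

Tables with no recent activity render with an empty value, because their null placeholder is joined straight into the string.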
10 changes: 10 additions & 0 deletions .werft/platform-delete-preview-environments-cron.yaml
@@ -22,6 +22,9 @@ pod:
- name: harvester-k3s-dockerhub-pull-account
secret:
secretName: harvester-k3s-dockerhub-pull-account
+- name: harvester-vm-ssh-keys
+secret:
+secretName: harvester-vm-ssh-keys
containers:
- name: build
image: eu.gcr.io/gitpod-core-dev/dev/dev-environment:me-cmctl.0
@@ -36,6 +39,8 @@ pod:
readOnly: true
- name: harvester-kubeconfig
mountPath: /mnt/secrets/harvester-kubeconfig
+- name: harvester-vm-ssh-keys
+mountPath: /mnt/secrets/harvester-vm-ssh-keys
- name: harvester-k3s-dockerhub-pull-account
mountPath: /mnt/secrets/harvester-k3s-dockerhub-pull-account
env:
@@ -56,6 +61,11 @@ pod:
set -Eeuo pipefail

sudo chown -R gitpod:gitpod /workspace
+mkdir /workspace/.ssh
+cp /mnt/secrets/harvester-vm-ssh-keys/id_rsa /workspace/.ssh/id_rsa_harvester_vm
+cp /mnt/secrets/harvester-vm-ssh-keys/id_rsa.pub /workspace/.ssh/id_rsa_harvester_vm.pub
+sudo chmod 600 /workspace/.ssh/id_rsa_harvester_vm
+sudo chmod 644 /workspace/.ssh/id_rsa_harvester_vm.pub

(cd .werft && yarn install && mv node_modules ..) | werft log slice prep

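The strict permissions above are load-bearing: OpenSSH refuses to use a private key that is group- or world-readable, so without the chmod 600 the SSH proxy into the preview VM would fail and the job would conservatively treat every Harvester environment as active. Presumably VM.startSSHProxy picks this identity up along these lines (an illustrative sketch only — the host, user, and ports are made up, and the real implementation lives in .werft/vm/vm.ts):

// hypothetical: open a tunnel to the preview VM with the key copied above
exec(`ssh -i /workspace/.ssh/id_rsa_harvester_vm -o StrictHostKeyChecking=no -N -L 2200:127.0.0.1:22 ubuntu@${vmHost}`, { async: true, silent: true })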
36 changes: 35 additions & 1 deletion .werft/vm/vm.ts
@@ -3,6 +3,7 @@ import { exec } from '../util/shell';
import { getGlobalWerftInstance } from '../util/werft';

import * as Manifests from './manifests'
+import * as shell from "shelljs";

/**
* Convenience function to kubectl apply a manifest from stdin.
@@ -117,6 +118,39 @@ export function vmExists(options: { name: string }) {
return status.code == 0
}

+export class NotFoundError extends Error {
+constructor(message: string) {
+super(message);
+this.name = "NotFoundError";
+}
+}

+export class KubectlError extends Error {
+constructor(message: string) {
+super(message);
+this.name = "KubectlError";
+}
+}

+export function get(options: { name: string }): shell.ShellString {
+const namespace = `preview-${options.name}`
+const vmErrNotFound = `Error from server (NotFound): virtualmachineinstances.kubevirt.io "${options.name}" not found`
+const namespaceErrNotFound = `Error from server (NotFound): namespaces "${namespace}" not found`
+const vm = exec(`kubectl --kubeconfig ${HARVESTER_KUBECONFIG_PATH} -n ${namespace} get vmi ${options.name}`, { dontCheckRc: true, silent: true })

+if (vm.code != 0){
+switch (vm.stderr){
+case vmErrNotFound:
+case namespaceErrNotFound:
+throw new NotFoundError("The VM or Namespace doesn't exist")
+default:
+throw new KubectlError(vm.stderr)
+}
+}

+return vm
+}
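Callers are meant to treat NotFoundError as the one signal that the VM is gone, and anything else as a reason to keep the environment. A minimal usage sketch, mirroring the call site in platform-delete-preview-environments-cron.ts above:

// probe a (hypothetical) preview VM and classify the failure
try {
    VM.get({ name: "my-branch" })
    // VM exists - go on to check DB activity
} catch (e) {
    if (e instanceof VM.NotFoundError) {
        // VM or namespace is gone - safe to tear the environment down
    } else {
        // KubectlError or anything unexpected - err on the side of keeping it
    }
}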

/**
* Wait until the VM Instance reaches the Running status.
* If the VM Instance doesn't reach Running before the timeoutMS it will throw an Error.
@@ -186,4 +220,4 @@ export function installFluentBit(options: { namespace: string, kubeconfig: strin
exec(`helm3 --kubeconfig ${options.kubeconfig} repo add fluent https://fluent.github.io/helm-charts`, { slice: options.slice, dontCheckRc: true })
exec(`helm3 --kubeconfig ${options.kubeconfig} repo update`, { slice: options.slice, dontCheckRc: true })
exec(`helm3 --kubeconfig ${options.kubeconfig} upgrade --install fluent-bit fluent/fluent-bit -n ${options.namespace} -f .werft/vm/charts/fluentbit/values.yaml`, { slice: options.slice, dontCheckRc: true })
-}
\ No newline at end of file
+}