Skip to content

Commit dd325c1

Browse files
azychPer Goncalves da Silva
authored and
Per Goncalves da Silva
committed
Handle interrupted helm releases in applier
Introduces a workaround for 'interrupted' helm releases which enter into a 'pending' (-install/uninstall/rollback) state. If that happens, for example because of immediate application exit with one of those operations being in flight, helm is not able to resolve it automatically which means we end up in a permanent reconcile error state. One of the workarounds for this that has been repeated in the community is to remove metadata of the pending release, which is the solution chosen here. For full context see: #1776 helm/helm#5595
1 parent 532c306 commit dd325c1

File tree

6 files changed

+119
-5
lines changed

6 files changed

+119
-5
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ require (
1919
github.com/opencontainers/image-spec v1.1.1
2020
github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11
2121
github.com/operator-framework/api v0.31.0
22-
github.com/operator-framework/helm-operator-plugins v0.8.0
22+
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b
2323
github.com/operator-framework/operator-registry v1.55.0
2424
github.com/prometheus/client_golang v1.22.0
2525
github.com/spf13/cobra v1.9.1

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -402,8 +402,8 @@ github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11 h1:eT
402402
github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11/go.mod h1:EmVJt97N+pfWFsli/ipXTBZqSG5F5KGQhm3c3IsGq1o=
403403
github.com/operator-framework/api v0.31.0 h1:tRsFTuZ51xD8U5QgiPo3+mZgVipHZVgRXYrI6RRXOh8=
404404
github.com/operator-framework/api v0.31.0/go.mod h1:57oCiHNeWcxmzu1Se8qlnwEKr/GGXnuHvspIYFCcXmY=
405-
github.com/operator-framework/helm-operator-plugins v0.8.0 h1:0f6HOQC5likkf0b/OvGvw7nhDb6h8Cj5twdCNjwNzMc=
406-
github.com/operator-framework/helm-operator-plugins v0.8.0/go.mod h1:Sc+8bE38xTCgCChBUvtq/PxatEg9fAypr7S5iAw8nlA=
405+
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b h1:andoOL7lpEafTXdjPGDuNLJlJQh0UmRzgj0+D31K8HU=
406+
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b/go.mod h1:qioyxwOSI89RAtTWhccc+RyaPQdKTTJRc6sFkT6kqys=
407407
github.com/operator-framework/operator-lib v0.17.0 h1:cbz51wZ9+GpWR1ZYP4CSKSSBxDlWxmmnseaHVZZjZt4=
408408
github.com/operator-framework/operator-lib v0.17.0/go.mod h1:TGopBxIE8L6E/Cojzo26R3NFp1eNlqhQNmzqhOblaLw=
409409
github.com/operator-framework/operator-registry v1.55.0 h1:iXlv53fYyg2VtLqSDEalXD72/5Uzc7Rfx17j35+8plA=

internal/operator-controller/action/helm_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/stretchr/testify/assert"
1010
"github.com/stretchr/testify/mock"
1111
"github.com/stretchr/testify/require"
12+
"helm.sh/helm/v3/pkg/action"
1213
"helm.sh/helm/v3/pkg/chart"
1314
"helm.sh/helm/v3/pkg/release"
1415
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -70,6 +71,11 @@ func (m *mockActionClient) Reconcile(rel *release.Release) error {
7071
return args.Error(0)
7172
}
7273

74+
func (m *mockActionClient) Config() *action.Configuration {
75+
args := m.Called()
76+
return args.Get(0).(*action.Configuration)
77+
}
78+
7379
var _ actionclient.ActionClientGetter = &mockActionClientGetter{}
7480

7581
type mockActionClientGetter struct {

internal/operator-controller/applier/helm.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
143143
return nil, "", err
144144
}
145145

146-
rel, desiredRel, state, err := h.getReleaseState(ac, ext, chrt, values, post)
146+
rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post)
147147
if err != nil {
148148
return nil, "", fmt.Errorf("failed to get release state using server-side dry-run: %w", err)
149149
}
@@ -240,9 +240,30 @@ func (h *Helm) renderClientOnlyRelease(ctx context.Context, ext *ocv1.ClusterExt
240240
}, helmclient.AppendInstallPostRenderer(post))
241241
}
242242

243-
func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
243+
func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
244+
logger := log.FromContext(ctx)
244245
currentRelease, err := cl.Get(ext.GetName())
246+
247+
// if a release is pending at this point, that means that a helm action
248+
// (installation/upgrade) we were attempting was likely interrupted in-flight.
249+
// Pending release would leave us in reconciliation error loop because helm
250+
// wouldn't be able to progress automatically from it.
251+
//
252+
// one of the workarounds is to try and remove helm metadata relating to
253+
// that pending release which should 'reset' its state communicated to helm
254+
// and the next reconciliation should be able to successfully pick up from here
255+
// for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476
256+
// and the discussion in https://github.com/operator-framework/operator-controller/pull/1776
257+
if err == nil && currentRelease.Info.Status.IsPending() {
258+
if _, err = cl.Config().Releases.Delete(currentRelease.Name, currentRelease.Version); err != nil {
259+
return nil, nil, StateError, fmt.Errorf("failed removing interrupted release %q version %d metadata: %w", currentRelease.Name, currentRelease.Version, err)
260+
}
261+
// return error to try to detect proper state (installation/upgrade) at next reconciliation
262+
return nil, nil, StateError, fmt.Errorf("removed interrupted release %q version %d metadata", currentRelease.Name, currentRelease.Version)
263+
}
264+
245265
if errors.Is(err, driver.ErrReleaseNotFound) {
266+
logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName())
246267
desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error {
247268
i.DryRun = true
248269
i.DryRunOption = "server"
@@ -258,6 +279,7 @@ func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterE
258279
}
259280

260281
desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error {
282+
logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName())
261283
upgrade.MaxHistory = maxHelmReleaseHistory
262284
upgrade.DryRun = true
263285
upgrade.DryRunOption = "server"

internal/operator-controller/applier/helm_test.go

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"helm.sh/helm/v3/pkg/action"
1414
"helm.sh/helm/v3/pkg/chart"
1515
"helm.sh/helm/v3/pkg/release"
16+
"helm.sh/helm/v3/pkg/storage"
1617
"helm.sh/helm/v3/pkg/storage/driver"
1718
corev1 "k8s.io/api/core/v1"
1819
rbacv1 "k8s.io/api/rbac/v1"
@@ -66,6 +67,7 @@ type mockActionGetter struct {
6667
reconcileErr error
6768
desiredRel *release.Release
6869
currentRel *release.Release
70+
config *action.Configuration
6971
}
7072

7173
func (mag *mockActionGetter) ActionClientFor(ctx context.Context, obj client.Object) (helmclient.ActionInterface, error) {
@@ -114,6 +116,10 @@ func (mag *mockActionGetter) Reconcile(rel *release.Release) error {
114116
return mag.reconcileErr
115117
}
116118

119+
func (mag *mockActionGetter) Config() *action.Configuration {
120+
return mag.config
121+
}
122+
117123
var (
118124
// required for unmockable call to convert.RegistryV1ToHelmChart
119125
validFS = fstest.MapFS{
@@ -223,6 +229,80 @@ func TestApply_Base(t *testing.T) {
223229
})
224230
}
225231

232+
func TestApply_InterruptedRelease(t *testing.T) {
233+
t.Run("fails removing an interrupted install release", func(t *testing.T) {
234+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
235+
testStorage := storage.Init(driver.NewMemory())
236+
237+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
238+
helmApplier := applier.Helm{
239+
ActionClientGetter: mockAcg,
240+
BundleToHelmChartConverter: &convert.BundleToHelmChartConverter{},
241+
}
242+
243+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
244+
require.Error(t, err)
245+
require.ErrorContains(t, err, "removing interrupted release")
246+
require.Nil(t, objs)
247+
require.Empty(t, state)
248+
})
249+
250+
t.Run("fails removing an interrupted upgrade release", func(t *testing.T) {
251+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
252+
testStorage := storage.Init(driver.NewMemory())
253+
254+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
255+
helmApplier := applier.Helm{
256+
ActionClientGetter: mockAcg,
257+
BundleToHelmChartConverter: &convert.BundleToHelmChartConverter{},
258+
}
259+
260+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
261+
require.Error(t, err)
262+
require.ErrorContains(t, err, "removing interrupted release")
263+
require.Nil(t, objs)
264+
require.Empty(t, state)
265+
})
266+
267+
t.Run("successfully removes an interrupted install release", func(t *testing.T) {
268+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
269+
testStorage := storage.Init(driver.NewMemory())
270+
err := testStorage.Create(testRel)
271+
require.NoError(t, err)
272+
273+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
274+
helmApplier := applier.Helm{
275+
ActionClientGetter: mockAcg,
276+
BundleToHelmChartConverter: &convert.BundleToHelmChartConverter{},
277+
}
278+
279+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
280+
require.Error(t, err)
281+
require.ErrorContains(t, err, "removed interrupted release")
282+
require.Nil(t, objs)
283+
require.Empty(t, state)
284+
})
285+
286+
t.Run("successfully removes an interrupted upgrade release", func(t *testing.T) {
287+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
288+
testStorage := storage.Init(driver.NewMemory())
289+
err := testStorage.Create(testRel)
290+
require.NoError(t, err)
291+
292+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
293+
helmApplier := applier.Helm{
294+
ActionClientGetter: mockAcg,
295+
BundleToHelmChartConverter: &convert.BundleToHelmChartConverter{},
296+
}
297+
298+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
299+
require.Error(t, err)
300+
require.ErrorContains(t, err, "removed interrupted release")
301+
require.Nil(t, objs)
302+
require.Empty(t, state)
303+
})
304+
}
305+
226306
func TestApply_Installation(t *testing.T) {
227307
t.Run("fails during dry-run installation", func(t *testing.T) {
228308
mockAcg := &mockActionGetter{
@@ -439,6 +519,7 @@ func TestApply_Upgrade(t *testing.T) {
439519

440520
t.Run("fails during dry-run upgrade", func(t *testing.T) {
441521
mockAcg := &mockActionGetter{
522+
currentRel: testCurrentRelease,
442523
dryRunUpgradeErr: errors.New("failed attempting to dry-run upgrade chart"),
443524
}
444525
helmApplier := applier.Helm{

internal/operator-controller/controllers/clusterextension_controller_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/google/go-cmp/cmp/cmpopts"
1313
"github.com/stretchr/testify/assert"
1414
"github.com/stretchr/testify/require"
15+
"helm.sh/helm/v3/pkg/action"
1516
"helm.sh/helm/v3/pkg/chart"
1617
"helm.sh/helm/v3/pkg/release"
1718
"helm.sh/helm/v3/pkg/storage/driver"
@@ -1411,6 +1412,10 @@ func (mag *MockActionGetter) Reconcile(rel *release.Release) error {
14111412
return nil
14121413
}
14131414

1415+
func (mag *MockActionGetter) Config() *action.Configuration {
1416+
return nil
1417+
}
1418+
14141419
func TestGetInstalledBundleHistory(t *testing.T) {
14151420
getter := controllers.DefaultInstalledBundleGetter{}
14161421

0 commit comments

Comments
 (0)