From cf901c914498d2df5f700c2d18511400d98efb49 Mon Sep 17 00:00:00 2001 From: Deep Mistry Date: Mon, 2 Feb 2026 14:14:12 -0600 Subject: [PATCH 1/2] DPTP-4077: Always run post steps on cancellation to prevent resource leaks --- pkg/steps/multi_stage/gen.go | 4 +++ pkg/steps/multi_stage/multi_stage.go | 44 ++++++++++++++++++++++++---- pkg/steps/multi_stage/run.go | 3 ++ 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/pkg/steps/multi_stage/gen.go b/pkg/steps/multi_stage/gen.go index 2a0ba9e3c50..3c8fadc69d3 100644 --- a/pkg/steps/multi_stage/gen.go +++ b/pkg/steps/multi_stage/gen.go @@ -53,6 +53,7 @@ func (s *multiStageTestStep) generateObservers( type generatePodOptions struct { IsObserver bool enableSecretsStoreCSIDriver bool + phase string } func defaultGeneratePodOptions() *generatePodOptions { @@ -147,6 +148,9 @@ func (s *multiStageTestStep) generatePods( delete(pod.Labels, base_steps.ProwJobIdLabel) pod.Annotations[base_steps.AnnotationSaveContainerLogs] = "true" pod.Labels[MultiStageTestLabel] = s.name + if genPodOpts.phase != "" { + pod.Labels[MultiStageTestPhaseLabel] = genPodOpts.phase + } needsKubeConfig := isKubeconfigNeeded(&step, genPodOpts) if needsKubeConfig { pod.Spec.ServiceAccountName = s.name diff --git a/pkg/steps/multi_stage/multi_stage.go b/pkg/steps/multi_stage/multi_stage.go index b34c8402f92..68e485b23b8 100644 --- a/pkg/steps/multi_stage/multi_stage.go +++ b/pkg/steps/multi_stage/multi_stage.go @@ -59,6 +59,8 @@ const ( const ( // MultiStageTestLabel is the label we use to mark a pod as part of a multi-stage test MultiStageTestLabel = "ci.openshift.io/multi-stage-test" + // MultiStageTestPhaseLabel is the label we use to mark which phase (pre, test, post) a pod belongs to + MultiStageTestPhaseLabel = "ci.openshift.io/multi-stage-test-phase" // ClusterProfileMountPath is where we mount the cluster profile in a pod ClusterProfileMountPath = "/var/run/secrets/ci.openshift.io/cluster-profile" // SecretMountPath is where we mount the shared dir secret @@ -243,16 +245,48 @@ func (s *multiStageTestStep) run(ctx context.Context) error { observerDone := make(chan struct{}) go s.runObservers(observerContext, ctx, observers, observerDone) s.flags |= shortCircuit - if err := s.runSteps(ctx, "pre", s.pre, env, secretVolumes, secretVolumeMounts); err != nil { - errs = append(errs, fmt.Errorf("%q pre steps failed: %w", s.name, err)) - } else if err := s.runSteps(ctx, "test", s.test, env, secretVolumes, secretVolumeMounts); err != nil { - errs = append(errs, fmt.Errorf("%q test steps failed: %w", s.name, err)) + + // Track if cancellation occurred during pre or test phases + cancelledDuringPreOrTest := false + + // Run pre phase + preErr := s.runSteps(ctx, "pre", s.pre, env, secretVolumes, secretVolumeMounts) + if preErr != nil { + errs = append(errs, fmt.Errorf("%q pre steps failed: %w", s.name, preErr)) + } + // Check if cancellation occurred during pre phase (check after runSteps returns) + if ctx.Err() != nil { + cancelledDuringPreOrTest = true + logrus.Warnf("Job was cancelled during pre phase for test %q", s.name) + } else if preErr == nil { + // Run test phase only if pre succeeded and wasn't cancelled + testErr := s.runSteps(ctx, "test", s.test, env, secretVolumes, secretVolumeMounts) + if testErr != nil { + errs = append(errs, fmt.Errorf("%q test steps failed: %w", s.name, testErr)) + } + // Check if cancellation occurred during test phase + if ctx.Err() != nil { + cancelledDuringPreOrTest = true + logrus.Warnf("Job was cancelled during test phase for test %q", s.name) + } } + s.cancelObserversContext(cancel) // signal to observers that we're tearing down s.flags &= ^shortCircuit - if err := s.runSteps(context.Background(), "post", s.post, env, secretVolumes, secretVolumeMounts); err != nil { + + // Always run post steps, even if the job was cancelled during pre or test phases. + // This ensures proper cleanup (e.g., deprovisioning resources) to prevent resource leaks. + // Use context.Background() for post steps so they run even if the main context is cancelled. + postCtx := context.Background() + if cancelledDuringPreOrTest { + logrus.Infof("Job was cancelled during pre or test phase, running post steps with background context to ensure cleanup completes for test %q", s.name) + } + if err := s.runSteps(postCtx, "post", s.post, env, secretVolumes, secretVolumeMounts); err != nil { errs = append(errs, fmt.Errorf("%q post steps failed: %w", s.name, err)) + } else if cancelledDuringPreOrTest { + logrus.Infof("Post steps completed successfully after cancellation for test %q", s.name) } + <-observerDone // wait for the observers to finish so we get their jUnit return utilerrors.NewAggregate(errs) } diff --git a/pkg/steps/multi_stage/run.go b/pkg/steps/multi_stage/run.go index c8695a76c4d..3f0664c904e 100644 --- a/pkg/steps/multi_stage/run.go +++ b/pkg/steps/multi_stage/run.go @@ -35,6 +35,7 @@ func (s *multiStageTestStep) runSteps( logrus.Infof("Running multi-stage phase %s", phase) pods, bestEffortSteps, err := s.generatePods(steps, env, secretVolumes, secretVolumeMounts, &generatePodOptions{ enableSecretsStoreCSIDriver: s.enableSecretsStoreCSIDriver, + phase: phase, }) if err != nil { s.flags |= hasPrevErrs @@ -75,7 +76,9 @@ func (s *multiStageTestStep) runSteps( Output: err.Error(), } } + s.subLock.Lock() s.subTests = append(s.subTests, testCase) + s.subLock.Unlock() logrus.Infof("Step phase %s %s after %s.", phase, verb, duration.Truncate(time.Second)) return err From aafadc225d93963b991370b8b9a5e1de08fd6c11 Mon Sep 17 00:00:00 2001 From: Deep Mistry Date: Mon, 2 Feb 2026 15:08:18 -0600 Subject: [PATCH 2/2] DPTP-4077: Use original context for post steps unless cancellation occurred --- pkg/steps/multi_stage/multi_stage.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/steps/multi_stage/multi_stage.go b/pkg/steps/multi_stage/multi_stage.go index 68e485b23b8..f54f3950d1a 100644 --- a/pkg/steps/multi_stage/multi_stage.go +++ b/pkg/steps/multi_stage/multi_stage.go @@ -276,9 +276,11 @@ func (s *multiStageTestStep) run(ctx context.Context) error { // Always run post steps, even if the job was cancelled during pre or test phases. // This ensures proper cleanup (e.g., deprovisioning resources) to prevent resource leaks. - // Use context.Background() for post steps so they run even if the main context is cancelled. - postCtx := context.Background() + // Use the original context by default to respect job deadlines/cancellation. + // Only switch to context.Background() if cancellation occurred during pre/test to ensure cleanup completes. + postCtx := ctx if cancelledDuringPreOrTest { + postCtx = context.Background() logrus.Infof("Job was cancelled during pre or test phase, running post steps with background context to ensure cleanup completes for test %q", s.name) } if err := s.runSteps(postCtx, "post", s.post, env, secretVolumes, secretVolumeMounts); err != nil {