pkg/steps/multi_stage/gen.go

-Original file line number
+Diff line change
@@ Expand Up / @@ -53,6 +53,7 @@ func (s *multiStageTestStep) generateObservers( @@
     // generatePodOptions carries the per-call settings that callers such as
     // runSteps pass to generatePods when building the pods for a set of steps.
     type generatePodOptions struct {
     	// IsObserver presumably marks that pods are being generated for
     	// observers; its use is not visible in this hunk (TODO confirm).
     	IsObserver                  bool
     	// enableSecretsStoreCSIDriver is populated by runSteps from the step's
     	// field of the same name; what it toggles is not visible in this hunk.
     	enableSecretsStoreCSIDriver bool
     	// phase is the name of the phase being run, as passed to runSteps. When
     	// non-empty, generatePods stores it on each pod under the
     	// MultiStageTestPhaseLabel label; when empty, no phase label is set.
+    	phase                       string
     }
     func defaultGeneratePodOptions() *generatePodOptions {
@@ Expand Down Expand Up / @@ -147,6 +148,9 @@ func (s *multiStageTestStep) generatePods( @@
     		delete(pod.Labels, base_steps.ProwJobIdLabel)
     		pod.Annotations[base_steps.AnnotationSaveContainerLogs] = "true"
     		pod.Labels[MultiStageTestLabel] = s.name
+    		if genPodOpts.phase != "" {
+    			pod.Labels[MultiStageTestPhaseLabel] = genPodOpts.phase
+    		}
     		needsKubeConfig := isKubeconfigNeeded(&step, genPodOpts)
     		if needsKubeConfig {
     			pod.Spec.ServiceAccountName = s.name
@@ Expand Down @@

pkg/steps/multi_stage/multi_stage.go

-Original file line number
+Diff line change
@@ Expand Up / @@ -59,6 +59,8 @@ const ( @@
     const (
     	// MultiStageTestLabel is the label we use to mark a pod as part of a multi-stage test
     	MultiStageTestLabel = "ci.openshift.io/multi-stage-test"
+    	// MultiStageTestPhaseLabel is the label we use to mark which phase (pre, test, post) a pod belongs to
+    	MultiStageTestPhaseLabel = "ci.openshift.io/multi-stage-test-phase"
     	// ClusterProfileMountPath is where we mount the cluster profile in a pod
     	ClusterProfileMountPath = "/var/run/secrets/ci.openshift.io/cluster-profile"
     	// SecretMountPath is where we mount the shared dir secret
@@ Expand Down Expand Up @@
     	observerDone := make(chan struct{})
     	go s.runObservers(observerContext, ctx, observers, observerDone)
     	s.flags |= shortCircuit
-    	if err := s.runSteps(ctx, "pre", s.pre, env, secretVolumes, secretVolumeMounts); err != nil {
-    		errs = append(errs, fmt.Errorf("%q pre steps failed: %w", s.name, err))
-    	} else if err := s.runSteps(ctx, "test", s.test, env, secretVolumes, secretVolumeMounts); err != nil {
-    		errs = append(errs, fmt.Errorf("%q test steps failed: %w", s.name, err))
+    	// Track if cancellation occurred during pre or test phases
+    	cancelledDuringPreOrTest := false
+    	// Run pre phase
+    	preErr := s.runSteps(ctx, "pre", s.pre, env, secretVolumes, secretVolumeMounts)
+    	if preErr != nil {
+    		errs = append(errs, fmt.Errorf("%q pre steps failed: %w", s.name, preErr))
+    	}
+    	// Check if cancellation occurred during pre phase (check after runSteps returns)
+    	if ctx.Err() != nil {
+    		cancelledDuringPreOrTest = true
+    		logrus.Warnf("Job was cancelled during pre phase for test %q", s.name)
+    	} else if preErr == nil {
+    		// Run test phase only if pre succeeded and wasn't cancelled
+    		testErr := s.runSteps(ctx, "test", s.test, env, secretVolumes, secretVolumeMounts)
+    		if testErr != nil {
+    			errs = append(errs, fmt.Errorf("%q test steps failed: %w", s.name, testErr))
+    		}
+    		// Check if cancellation occurred during test phase
+    		if ctx.Err() != nil {
+    			cancelledDuringPreOrTest = true
+    			logrus.Warnf("Job was cancelled during test phase for test %q", s.name)
+    		}
     	}
     	s.cancelObserversContext(cancel) // signal to observers that we're tearing down
     	s.flags &= ^shortCircuit
-    	if err := s.runSteps(context.Background(), "post", s.post, env, secretVolumes, secretVolumeMounts); err != nil {
+    	// Always run post steps, even if the job was cancelled during pre or test phases.
+    	// This ensures proper cleanup (e.g., deprovisioning resources) to prevent resource leaks.
+    	// Use the original context by default to respect job deadlines/cancellation.
+    	// Only switch to context.Background() if cancellation occurred during pre/test to ensure cleanup completes.
+    	postCtx := ctx
+    	if cancelledDuringPreOrTest {
+    		postCtx = context.Background()
+    		logrus.Infof("Job was cancelled during pre or test phase, running post steps with background context to ensure cleanup completes for test %q", s.name)
+    	}
+    	if err := s.runSteps(postCtx, "post", s.post, env, secretVolumes, secretVolumeMounts); err != nil {
     		errs = append(errs, fmt.Errorf("%q post steps failed: %w", s.name, err))
+    	} else if cancelledDuringPreOrTest {
+    		logrus.Infof("Post steps completed successfully after cancellation for test %q", s.name)
     	}
     	<-observerDone // wait for the observers to finish so we get their jUnit
     	return utilerrors.NewAggregate(errs)
     }
@@ Expand Down @@

pkg/steps/multi_stage/run.go

-Original file line number
+Diff line change
@@ Expand Up / @@ -35,6 +35,7 @@ func (s *multiStageTestStep) runSteps( @@
     	logrus.Infof("Running multi-stage phase %s", phase)
     	pods, bestEffortSteps, err := s.generatePods(steps, env, secretVolumes, secretVolumeMounts, &generatePodOptions{
     		enableSecretsStoreCSIDriver: s.enableSecretsStoreCSIDriver,
+    		phase:                       phase,
     	})
     	if err != nil {
     		s.flags |= hasPrevErrs
@@ Expand Down Expand Up / @@ -75,7 +76,9 @@ func (s *multiStageTestStep) runSteps( @@
     			Output: err.Error(),
     		}
     	}
+    	s.subLock.Lock()
     	s.subTests = append(s.subTests, testCase)
+    	s.subLock.Unlock()
     	logrus.Infof("Step phase %s %s after %s.", phase, verb, duration.Truncate(time.Second))
     	return err
@@ Expand Down @@

DPTP-4077: Always run post steps on cancellation to prevent resource leaks #4920

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

deepsm007 wants to merge 2 commits into openshift:main from deepsm007:fix-post-deprovision

+48 −5

-Original file line number
+Diff line change
@@ Expand Up / @@ -53,6 +53,7 @@ func (s *multiStageTestStep) generateObservers( @@
     // generatePodOptions carries the per-call settings that callers such as
     // runSteps pass to generatePods when building the pods for a set of steps.
     type generatePodOptions struct {
     	// IsObserver presumably marks that pods are being generated for
     	// observers; its use is not visible in this hunk (TODO confirm).
     	IsObserver                  bool
     	// enableSecretsStoreCSIDriver is populated by runSteps from the step's
     	// field of the same name; what it toggles is not visible in this hunk.
     	enableSecretsStoreCSIDriver bool
     	// phase is the name of the phase being run, as passed to runSteps. When
     	// non-empty, generatePods stores it on each pod under the
     	// MultiStageTestPhaseLabel label; when empty, no phase label is set.
+    	phase                       string
     }
     func defaultGeneratePodOptions() *generatePodOptions {
@@ Expand Down Expand Up / @@ -147,6 +148,9 @@ func (s *multiStageTestStep) generatePods( @@
     		delete(pod.Labels, base_steps.ProwJobIdLabel)
     		pod.Annotations[base_steps.AnnotationSaveContainerLogs] = "true"
     		pod.Labels[MultiStageTestLabel] = s.name
+    		if genPodOpts.phase != "" {
+    			pod.Labels[MultiStageTestPhaseLabel] = genPodOpts.phase
+    		}
     		needsKubeConfig := isKubeconfigNeeded(&step, genPodOpts)
     		if needsKubeConfig {
     			pod.Spec.ServiceAccountName = s.name
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -59,6 +59,8 @@ const ( @@
     const (
     	// MultiStageTestLabel is the label we use to mark a pod as part of a multi-stage test
     	MultiStageTestLabel = "ci.openshift.io/multi-stage-test"
+    	// MultiStageTestPhaseLabel is the label we use to mark which phase (pre, test, post) a pod belongs to
+    	MultiStageTestPhaseLabel = "ci.openshift.io/multi-stage-test-phase"
     	// ClusterProfileMountPath is where we mount the cluster profile in a pod
     	ClusterProfileMountPath = "/var/run/secrets/ci.openshift.io/cluster-profile"
     	// SecretMountPath is where we mount the shared dir secret
@@ Expand Down Expand Up @@
     	observerDone := make(chan struct{})
     	go s.runObservers(observerContext, ctx, observers, observerDone)
     	s.flags |= shortCircuit
-    	if err := s.runSteps(ctx, "pre", s.pre, env, secretVolumes, secretVolumeMounts); err != nil {
-    		errs = append(errs, fmt.Errorf("%q pre steps failed: %w", s.name, err))
-    	} else if err := s.runSteps(ctx, "test", s.test, env, secretVolumes, secretVolumeMounts); err != nil {
-    		errs = append(errs, fmt.Errorf("%q test steps failed: %w", s.name, err))
+    	// Track if cancellation occurred during pre or test phases
+    	cancelledDuringPreOrTest := false
+    	// Run pre phase
+    	preErr := s.runSteps(ctx, "pre", s.pre, env, secretVolumes, secretVolumeMounts)
+    	if preErr != nil {
+    		errs = append(errs, fmt.Errorf("%q pre steps failed: %w", s.name, preErr))
+    	}
+    	// Check if cancellation occurred during pre phase (check after runSteps returns)
+    	if ctx.Err() != nil {
+    		cancelledDuringPreOrTest = true
+    		logrus.Warnf("Job was cancelled during pre phase for test %q", s.name)
+    	} else if preErr == nil {
+    		// Run test phase only if pre succeeded and wasn't cancelled
+    		testErr := s.runSteps(ctx, "test", s.test, env, secretVolumes, secretVolumeMounts)
+    		if testErr != nil {
+    			errs = append(errs, fmt.Errorf("%q test steps failed: %w", s.name, testErr))
+    		}
+    		// Check if cancellation occurred during test phase
+    		if ctx.Err() != nil {
+    			cancelledDuringPreOrTest = true
+    			logrus.Warnf("Job was cancelled during test phase for test %q", s.name)
+    		}
     	}
     	s.cancelObserversContext(cancel) // signal to observers that we're tearing down
     	s.flags &= ^shortCircuit
-    	if err := s.runSteps(context.Background(), "post", s.post, env, secretVolumes, secretVolumeMounts); err != nil {
+    	// Always run post steps, even if the job was cancelled during pre or test phases.
+    	// This ensures proper cleanup (e.g., deprovisioning resources) to prevent resource leaks.
+    	// Use the original context by default to respect job deadlines/cancellation.
+    	// Only switch to context.Background() if cancellation occurred during pre/test to ensure cleanup completes.
+    	postCtx := ctx
+    	if cancelledDuringPreOrTest {
+    		postCtx = context.Background()
+    		logrus.Infof("Job was cancelled during pre or test phase, running post steps with background context to ensure cleanup completes for test %q", s.name)
+    	}
+    	if err := s.runSteps(postCtx, "post", s.post, env, secretVolumes, secretVolumeMounts); err != nil {
     		errs = append(errs, fmt.Errorf("%q post steps failed: %w", s.name, err))
+    	} else if cancelledDuringPreOrTest {
+    		logrus.Infof("Post steps completed successfully after cancellation for test %q", s.name)
     	}
     	<-observerDone // wait for the observers to finish so we get their jUnit
     	return utilerrors.NewAggregate(errs)
     }
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -35,6 +35,7 @@ func (s *multiStageTestStep) runSteps( @@
     	logrus.Infof("Running multi-stage phase %s", phase)
     	pods, bestEffortSteps, err := s.generatePods(steps, env, secretVolumes, secretVolumeMounts, &generatePodOptions{
     		enableSecretsStoreCSIDriver: s.enableSecretsStoreCSIDriver,
+    		phase:                       phase,
     	})
     	if err != nil {
     		s.flags |= hasPrevErrs
@@ Expand Down Expand Up / @@ -75,7 +76,9 @@ func (s *multiStageTestStep) runSteps( @@
     			Output: err.Error(),
     		}
     	}
+    	s.subLock.Lock()
     	s.subTests = append(s.subTests, testCase)
+    	s.subLock.Unlock()
     	logrus.Infof("Step phase %s %s after %s.", phase, verb, duration.Truncate(time.Second))
     	return err
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

DPTP-4077: Always run post steps on cancellation to prevent resource leaks #4920

Diff view

Diff view

There are no files selected for viewing

Uh oh!

DPTP-4077: Always run post steps on cancellation to prevent resource leaks #4920

Are you sure you want to change the base?

DPTP-4077: Always run post steps on cancellation to prevent resource leaks #4920

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!