Skip to content

Commit 50038fb

Browse files
authored
Re-schedule if the failed reason starts with OutOf (#4336)
1 parent 82d5579 commit 50038fb

File tree

2 files changed

+113
-43
lines changed

2 files changed

+113
-43
lines changed

controllers/actions.github.com/ephemeralrunner_controller.go

Lines changed: 67 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -312,26 +312,45 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
312312

313313
cs := runnerContainerStatus(pod)
314314
switch {
315-
case cs == nil:
316-
// starting, no container state yet
317-
log.Info("Waiting for runner container status to be available")
318-
return ctrl.Result{}, nil
319-
case cs.State.Terminated == nil: // still running or evicted
320-
if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "Evicted" {
321-
log.Info("Pod set the termination phase, but container state is not terminated. Deleting pod",
322-
"PodPhase", pod.Status.Phase,
323-
"PodReason", pod.Status.Reason,
324-
"PodMessage", pod.Status.Message,
315+
case pod.Status.Phase == corev1.PodFailed: // All containers are stopped
316+
switch {
317+
case pod.Status.Reason == "Evicted":
318+
log.Info("Pod evicted; Deleting ephemeral runner or pod",
319+
"podPhase", pod.Status.Phase,
320+
"podReason", pod.Status.Reason,
321+
"podMessage", pod.Status.Message,
325322
)
326323

327-
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
328-
log.Error(err, "failed to delete pod as failed on pod.Status.Phase: Failed")
324+
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
325+
326+
case strings.HasPrefix(pod.Status.Reason, "OutOf"): // most likely a transient issue.
327+
log.Info("Pod failed with reason starting with OutOf. Deleting ephemeral runner or pod",
328+
"podPhase", pod.Status.Phase,
329+
"podReason", pod.Status.Reason,
330+
"podMessage", pod.Status.Message,
331+
)
332+
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
333+
334+
default:
335+
log.Info("Pod is in failed phase; updating ephemeral runner status",
336+
"podPhase", pod.Status.Phase,
337+
"podReason", pod.Status.Reason,
338+
"podMessage", pod.Status.Message,
339+
)
340+
if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil {
341+
log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
329342
return ctrl.Result{}, err
330343
}
331344
return ctrl.Result{}, nil
332345
}
333346

334-
log.Info("Ephemeral runner container is still running")
347+
case cs == nil:
348+
// starting, no container state yet
349+
log.Info("Waiting for runner container status to be available")
350+
return ctrl.Result{}, nil
351+
352+
case cs.State.Terminated == nil: // container is not terminated and pod phase is not failed, so runner is still running
353+
log.Info("Runner container is still running; updating ephemeral runner status")
335354
if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil {
336355
log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
337356
return ctrl.Result{}, err
@@ -340,36 +359,7 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
340359

341360
case cs.State.Terminated.ExitCode != 0: // failed
342361
log.Info("Ephemeral runner container failed", "exitCode", cs.State.Terminated.ExitCode)
343-
if ephemeralRunner.HasJob() {
344-
log.Error(
345-
errors.New("ephemeral runner has a job assigned, but the pod has failed"),
346-
"Ephemeral runner either has faulty entrypoint or something external killing the runner",
347-
)
348-
log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed")
349-
if err := r.Delete(ctx, ephemeralRunner); err != nil {
350-
log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed")
351-
return ctrl.Result{}, err
352-
}
353-
354-
log.Info("Deleted the ephemeral runner that has a job assigned but the pod has failed")
355-
log.Info("Trying to remove the runner from the service")
356-
actionsClient, err := r.GetActionsService(ctx, ephemeralRunner)
357-
if err != nil {
358-
log.Error(err, "Failed to get actions client for removing the runner from the service")
359-
return ctrl.Result{}, nil
360-
}
361-
if err := actionsClient.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId)); err != nil {
362-
log.Error(err, "Failed to remove the runner from the service")
363-
return ctrl.Result{}, nil
364-
}
365-
log.Info("Removed the runner from the service")
366-
return ctrl.Result{}, nil
367-
}
368-
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
369-
log.Error(err, "Failed to delete runner pod on failure")
370-
return ctrl.Result{}, err
371-
}
372-
return ctrl.Result{}, nil
362+
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
373363

374364
default: // succeeded
375365
log.Info("Ephemeral runner has finished successfully, deleting ephemeral runner", "exitCode", cs.State.Terminated.ExitCode)
@@ -381,6 +371,40 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
381371
}
382372
}
383373

374+
func (r *EphemeralRunnerReconciler) deleteEphemeralRunnerOrPod(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {
375+
if ephemeralRunner.HasJob() {
376+
log.Error(
377+
errors.New("ephemeral runner has a job assigned, but the pod has failed"),
378+
"Ephemeral runner either has faulty entrypoint or something external killing the runner",
379+
)
380+
log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed")
381+
if err := r.Delete(ctx, ephemeralRunner); err != nil {
382+
log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed")
383+
return err
384+
}
385+
386+
log.Info("Deleted the ephemeral runner that has a job assigned but the pod has failed")
387+
log.Info("Trying to remove the runner from the service")
388+
actionsClient, err := r.GetActionsService(ctx, ephemeralRunner)
389+
if err != nil {
390+
log.Error(err, "Failed to get actions client for removing the runner from the service")
391+
return nil
392+
}
393+
if err := actionsClient.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId)); err != nil {
394+
log.Error(err, "Failed to remove the runner from the service")
395+
return nil
396+
}
397+
log.Info("Removed the runner from the service")
398+
return nil
399+
}
400+
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
401+
log.Error(err, "Failed to delete runner pod on failure")
402+
return err
403+
}
404+
405+
return nil
406+
}
407+
384408
func (r *EphemeralRunnerReconciler) cleanupRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (ok bool, err error) {
385409
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
386410
actionsError := &actions.ActionsError{}

controllers/actions.github.com/ephemeralrunner_controller_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,52 @@ var _ = Describe("EphemeralRunner", func() {
745745
).Should(BeEquivalentTo(true))
746746
})
747747

748+
It("It should re-create pod on reason starting with OutOf", func() {
749+
pod := new(corev1.Pod)
750+
Eventually(
751+
func() (bool, error) {
752+
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
753+
if err != nil {
754+
return false, err
755+
}
756+
return true, nil
757+
},
758+
ephemeralRunnerTimeout,
759+
ephemeralRunnerInterval,
760+
).Should(BeEquivalentTo(true))
761+
762+
pod.Status.Phase = corev1.PodFailed
763+
pod.Status.Reason = "OutOfpods"
764+
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
765+
Name: v1alpha1.EphemeralRunnerContainerName,
766+
State: corev1.ContainerState{},
767+
})
768+
err := k8sClient.Status().Update(ctx, pod)
769+
Expect(err).To(BeNil(), "failed to patch pod status")
770+
771+
updated := new(v1alpha1.EphemeralRunner)
772+
Eventually(func() (bool, error) {
773+
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
774+
if err != nil {
775+
return false, err
776+
}
777+
return len(updated.Status.Failures) == 1, nil
778+
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true))
779+
780+
// should re-create after failure
781+
Eventually(
782+
func() (bool, error) {
783+
pod := new(corev1.Pod)
784+
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
785+
return false, err
786+
}
787+
return true, nil
788+
},
789+
ephemeralRunnerTimeout,
790+
ephemeralRunnerInterval,
791+
).Should(BeEquivalentTo(true))
792+
})
793+
748794
It("It should not set the phase to succeeded without pod termination status", func() {
749795
pod := new(corev1.Pod)
750796
Eventually(

0 commit comments

Comments
 (0)