@@ -312,26 +312,45 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
312312
313313 cs := runnerContainerStatus (pod )
314314 switch {
315- case cs == nil :
316- // starting, no container state yet
317- log .Info ("Waiting for runner container status to be available" )
318- return ctrl.Result {}, nil
319- case cs .State .Terminated == nil : // still running or evicted
320- if pod .Status .Phase == corev1 .PodFailed && pod .Status .Reason == "Evicted" {
321- log .Info ("Pod set the termination phase, but container state is not terminated. Deleting pod" ,
322- "PodPhase" , pod .Status .Phase ,
323- "PodReason" , pod .Status .Reason ,
324- "PodMessage" , pod .Status .Message ,
315+ case pod .Status .Phase == corev1 .PodFailed : // All containers are stopped
316+ switch {
317+ case pod .Status .Reason == "Evicted" :
318+ log .Info ("Pod evicted; Deleting ephemeral runner or pod" ,
319+ "podPhase" , pod .Status .Phase ,
320+ "podReason" , pod .Status .Reason ,
321+ "podMessage" , pod .Status .Message ,
325322 )
326323
327- if err := r .deletePodAsFailed (ctx , ephemeralRunner , pod , log ); err != nil {
328- log .Error (err , "failed to delete pod as failed on pod.Status.Phase: Failed" )
324+ return ctrl.Result {}, r .deleteEphemeralRunnerOrPod (ctx , ephemeralRunner , pod , log )
325+
326+ case strings .HasPrefix (pod .Status .Reason , "OutOf" ): // most likely a transient issue.
327+ log .Info ("Pod failed with reason starting with OutOf. Deleting ephemeral runner or pod" ,
328+ "podPhase" , pod .Status .Phase ,
329+ "podReason" , pod .Status .Reason ,
330+ "podMessage" , pod .Status .Message ,
331+ )
332+ return ctrl.Result {}, r .deleteEphemeralRunnerOrPod (ctx , ephemeralRunner , pod , log )
333+
334+ default :
335+ log .Info ("Pod is in failed phase; updating ephemeral runner status" ,
336+ "podPhase" , pod .Status .Phase ,
337+ "podReason" , pod .Status .Reason ,
338+ "podMessage" , pod .Status .Message ,
339+ )
340+ if err := r .updateRunStatusFromPod (ctx , ephemeralRunner , pod , log ); err != nil {
341+ log .Info ("Failed to update ephemeral runner status. Requeue to not miss this event" )
329342 return ctrl.Result {}, err
330343 }
331344 return ctrl.Result {}, nil
332345 }
333346
334- log .Info ("Ephemeral runner container is still running" )
347+ case cs == nil :
348+ // starting, no container state yet
349+ log .Info ("Waiting for runner container status to be available" )
350+ return ctrl.Result {}, nil
351+
352+ case cs .State .Terminated == nil : // container is not terminated and pod phase is not failed, so runner is still running
353+ log .Info ("Runner container is still running; updating ephemeral runner status" )
335354 if err := r .updateRunStatusFromPod (ctx , ephemeralRunner , pod , log ); err != nil {
336355 log .Info ("Failed to update ephemeral runner status. Requeue to not miss this event" )
337356 return ctrl.Result {}, err
@@ -340,36 +359,7 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
340359
341360 case cs .State .Terminated .ExitCode != 0 : // failed
342361 log .Info ("Ephemeral runner container failed" , "exitCode" , cs .State .Terminated .ExitCode )
343- if ephemeralRunner .HasJob () {
344- log .Error (
345- errors .New ("ephemeral runner has a job assigned, but the pod has failed" ),
346- "Ephemeral runner either has faulty entrypoint or something external killing the runner" ,
347- )
348- log .Info ("Deleting the ephemeral runner that has a job assigned but the pod has failed" )
349- if err := r .Delete (ctx , ephemeralRunner ); err != nil {
350- log .Error (err , "Failed to delete the ephemeral runner that has a job assigned but the pod has failed" )
351- return ctrl.Result {}, err
352- }
353-
354- log .Info ("Deleted the ephemeral runner that has a job assigned but the pod has failed" )
355- log .Info ("Trying to remove the runner from the service" )
356- actionsClient , err := r .GetActionsService (ctx , ephemeralRunner )
357- if err != nil {
358- log .Error (err , "Failed to get actions client for removing the runner from the service" )
359- return ctrl.Result {}, nil
360- }
361- if err := actionsClient .RemoveRunner (ctx , int64 (ephemeralRunner .Status .RunnerId )); err != nil {
362- log .Error (err , "Failed to remove the runner from the service" )
363- return ctrl.Result {}, nil
364- }
365- log .Info ("Removed the runner from the service" )
366- return ctrl.Result {}, nil
367- }
368- if err := r .deletePodAsFailed (ctx , ephemeralRunner , pod , log ); err != nil {
369- log .Error (err , "Failed to delete runner pod on failure" )
370- return ctrl.Result {}, err
371- }
372- return ctrl.Result {}, nil
362+ return ctrl.Result {}, r .deleteEphemeralRunnerOrPod (ctx , ephemeralRunner , pod , log )
373363
374364 default : // succeeded
375365 log .Info ("Ephemeral runner has finished successfully, deleting ephemeral runner" , "exitCode" , cs .State .Terminated .ExitCode )
@@ -381,6 +371,40 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
381371 }
382372}
383373
374+ func (r * EphemeralRunnerReconciler ) deleteEphemeralRunnerOrPod (ctx context.Context , ephemeralRunner * v1alpha1.EphemeralRunner , pod * corev1.Pod , log logr.Logger ) error {
375+ if ephemeralRunner .HasJob () {
376+ log .Error (
377+ errors .New ("ephemeral runner has a job assigned, but the pod has failed" ),
378+ "Ephemeral runner either has faulty entrypoint or something external killing the runner" ,
379+ )
380+ log .Info ("Deleting the ephemeral runner that has a job assigned but the pod has failed" )
381+ if err := r .Delete (ctx , ephemeralRunner ); err != nil {
382+ log .Error (err , "Failed to delete the ephemeral runner that has a job assigned but the pod has failed" )
383+ return err
384+ }
385+
386+ log .Info ("Deleted the ephemeral runner that has a job assigned but the pod has failed" )
387+ log .Info ("Trying to remove the runner from the service" )
388+ actionsClient , err := r .GetActionsService (ctx , ephemeralRunner )
389+ if err != nil {
390+ log .Error (err , "Failed to get actions client for removing the runner from the service" )
391+ return nil
392+ }
393+ if err := actionsClient .RemoveRunner (ctx , int64 (ephemeralRunner .Status .RunnerId )); err != nil {
394+ log .Error (err , "Failed to remove the runner from the service" )
395+ return nil
396+ }
397+ log .Info ("Removed the runner from the service" )
398+ return nil
399+ }
400+ if err := r .deletePodAsFailed (ctx , ephemeralRunner , pod , log ); err != nil {
401+ log .Error (err , "Failed to delete runner pod on failure" )
402+ return err
403+ }
404+
405+ return nil
406+ }
407+
384408func (r * EphemeralRunnerReconciler ) cleanupRunnerFromService (ctx context.Context , ephemeralRunner * v1alpha1.EphemeralRunner , log logr.Logger ) (ok bool , err error ) {
385409 if err := r .deleteRunnerFromService (ctx , ephemeralRunner , log ); err != nil {
386410 actionsError := & actions.ActionsError {}
0 commit comments