@@ -22,10 +22,15 @@ class Kubernetes(Runtime):
2222 """
2323
2424 JOB_NAME_CHARACTERS = string .ascii_lowercase + string .digits
25+ # Fallback timeout, in seconds, for Kubernetes API calls; used when the
26+ # config has no api_timeout. Prevents indefinite hangs on network issues.
27+ DEFAULT_API_TIMEOUT = 30
2528
2629 def __init__ (self , * args , ** kwargs ):
2730 super ().__init__ (* args , ** kwargs )
2831 self .kcontext = None
32+ # API timeout in seconds: self.config.api_timeout if present, else DEFAULT_API_TIMEOUT
33+ self .api_timeout = getattr (self .config , 'api_timeout' , self .DEFAULT_API_TIMEOUT )
2934
3035 @classmethod
3136 def _get_job_file_name (cls , params ):
@@ -56,11 +61,21 @@ def _fetch_load(self, ctxname):
5661 pods = None
5762 for _ in range (3 ):
5863 try :
59- pods = core_v1 .list_namespaced_pod (namespace = 'default' )
64+ # Add client-side timeout to prevent indefinite hangs on network issues
65+ # _request_timeout is passed to urllib3 and controls socket timeout
66+ pods = core_v1 .list_namespaced_pod (
67+ namespace = 'default' ,
68+ timeout_seconds = 60 , # Server-side timeout
69+ _request_timeout = self .api_timeout # Client-side timeout
70+ )
6071 break
6172 except kubernetes .client .rest .ApiException as error :
6273 print (f'Error listing pods in { ctxname } : { error } ' )
6374 continue
75+ except Exception as error : # pylint: disable=broad-except
76+ # Catch timeout and other exceptions (e.g., urllib3.exceptions.ReadTimeoutError)
77+ print (f'Exception listing pods in { ctxname } : { error } ' )
78+ continue
6479
6580 if not pods :
6681 print (f'No pods found in { ctxname } , returning 1000' )
@@ -104,7 +119,10 @@ def wait(self, job_object):
104119 core_v1 = kubernetes .client .CoreV1Api ()
105120 job_name = job_object [0 ][0 ].metadata .labels ['job-name' ]
106121 for event in watch .stream (
107- func = core_v1 .list_namespaced_pod , namespace = 'default' ):
122+ func = core_v1 .list_namespaced_pod ,
123+ namespace = 'default' ,
124+ timeout_seconds = 3600 , # Server-side timeout (1 hour)
125+ _request_timeout = self .api_timeout ): # Client-side timeout
108126 if event ['type' ] != 'MODIFIED' :
109127 continue
110128 if job_name not in event ['object' ].metadata .name :
0 commit comments