Skip to content

Commit 653fd67

Browse files
committed
fix(termination-watcher): add retry for runner-busy 422 and fix undefined log message
- Add exponential backoff retry (5 attempts, 1s/2s/4s/8s/16s) when GitHub returns 422 "currently running a job" during runner deletion - Fix "Received spot notification for undefined" log message when metrics are disabled by omitting the metric name from the log
1 parent a9ca792 commit 653fd67

File tree

3 files changed

+118
-8
lines changed

3 files changed

+118
-8
lines changed

lambdas/functions/termination-watcher/src/deregister.test.ts

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,77 @@ describe('deregisterRunner', () => {
282282
await deregisterRunner(instance, baseConfig);
283283
expect(mockCreateAppAuth).not.toHaveBeenCalled();
284284
});
285+
286+
it('should retry when runner is currently running a job (422)', async () => {
287+
vi.useFakeTimers();
288+
mockApps.getOrgInstallation.mockResolvedValue({ data: { id: 999 } });
289+
290+
async function* fakeIterator() {
291+
yield { data: [{ id: 42, name: `runner-i-12345678901234567` }] };
292+
}
293+
mockPaginate.iterator.mockReturnValue(fakeIterator());
294+
295+
const busyError = Object.assign(new Error('Bad request - Runner runner-i-12345678901234567 is currently running a job and cannot be deleted.'), { status: 422 });
296+
mockActions.deleteSelfHostedRunnerFromOrg
297+
.mockRejectedValueOnce(busyError)
298+
.mockRejectedValueOnce(busyError)
299+
.mockResolvedValueOnce({});
300+
301+
const promise = deregisterRunner(orgInstance, baseConfig);
302+
303+
// Advance through retry delays (1s, 2s)
304+
await vi.advanceTimersByTimeAsync(1000);
305+
await vi.advanceTimersByTimeAsync(2000);
306+
307+
await promise;
308+
309+
expect(mockActions.deleteSelfHostedRunnerFromOrg).toHaveBeenCalledTimes(3);
310+
vi.useRealTimers();
311+
});
312+
313+
it('should not retry on non-422 errors', async () => {
314+
mockApps.getOrgInstallation.mockResolvedValue({ data: { id: 999 } });
315+
316+
async function* fakeIterator() {
317+
yield { data: [{ id: 42, name: `runner-i-12345678901234567` }] };
318+
}
319+
mockPaginate.iterator.mockReturnValue(fakeIterator());
320+
321+
mockActions.deleteSelfHostedRunnerFromOrg.mockRejectedValueOnce(
322+
Object.assign(new Error('Internal Server Error'), { status: 500 }),
323+
);
324+
325+
// Should not throw — outer catch handles it
326+
await deregisterRunner(orgInstance, baseConfig);
327+
328+
expect(mockActions.deleteSelfHostedRunnerFromOrg).toHaveBeenCalledTimes(1);
329+
});
330+
331+
it('should give up after max retries for runner busy error', async () => {
332+
vi.useFakeTimers();
333+
mockApps.getOrgInstallation.mockResolvedValue({ data: { id: 999 } });
334+
335+
async function* fakeIterator() {
336+
yield { data: [{ id: 42, name: `runner-i-12345678901234567` }] };
337+
}
338+
mockPaginate.iterator.mockReturnValue(fakeIterator());
339+
340+
const busyError = Object.assign(new Error('Bad request - Runner is currently running a job and cannot be deleted.'), { status: 422 });
341+
mockActions.deleteSelfHostedRunnerFromOrg.mockRejectedValue(busyError);
342+
343+
const promise = deregisterRunner(orgInstance, baseConfig);
344+
345+
// Advance through all retry delays (1s, 2s, 4s, 8s)
346+
for (const delay of [1000, 2000, 4000, 8000]) {
347+
await vi.advanceTimersByTimeAsync(delay);
348+
}
349+
350+
await promise;
351+
352+
// 5 attempts total (initial + 4 retries)
353+
expect(mockActions.deleteSelfHostedRunnerFromOrg).toHaveBeenCalledTimes(5);
354+
vi.useRealTimers();
355+
});
285356
});
286357

287358
describe('createThrottleOptions', () => {

lambdas/functions/termination-watcher/src/deregister.ts

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,51 @@ async function deleteRunner(octokit: Octokit, owner: string, runnerId: number, r
155155
}
156156
}
157157

158+
const MAX_DELETE_RETRIES = 5;
159+
const INITIAL_RETRY_DELAY_MS = 1000;
160+
161+
async function deleteRunnerWithRetry(
162+
octokit: Octokit,
163+
owner: string,
164+
runnerId: number,
165+
runnerName: string,
166+
runnerType: string,
167+
instanceId: string,
168+
): Promise<void> {
169+
for (let attempt = 1; attempt <= MAX_DELETE_RETRIES; attempt++) {
170+
try {
171+
await deleteRunner(octokit, owner, runnerId, runnerType);
172+
logger.info('Successfully deregistered runner from GitHub', {
173+
instanceId,
174+
runnerId,
175+
runnerName,
176+
owner,
177+
});
178+
return;
179+
} catch (error) {
180+
const status = (error as { status?: number }).status;
181+
const message = (error as Error).message ?? '';
182+
const isRunnerBusy = status === 422 && message.includes('currently running a job');
183+
184+
if (isRunnerBusy && attempt < MAX_DELETE_RETRIES) {
185+
const delay = INITIAL_RETRY_DELAY_MS * Math.pow(2, attempt - 1);
186+
logger.warn('Runner is currently running a job, retrying after delay', {
187+
instanceId,
188+
runnerId,
189+
runnerName,
190+
owner,
191+
attempt,
192+
maxRetries: MAX_DELETE_RETRIES,
193+
delayMs: delay,
194+
});
195+
await new Promise((resolve) => setTimeout(resolve, delay));
196+
} else {
197+
throw error;
198+
}
199+
}
200+
}
201+
}
202+
158203
export async function deregisterRunner(instance: Instance, config: Config): Promise<void> {
159204
if (!config.enableRunnerDeregistration) {
160205
logger.debug('Runner deregistration is disabled, skipping');
@@ -187,13 +232,7 @@ export async function deregisterRunner(instance: Instance, config: Config): Prom
187232
return;
188233
}
189234

190-
await deleteRunner(installationOctokit, owner, runner.id, runnerType);
191-
logger.info('Successfully deregistered runner from GitHub', {
192-
instanceId,
193-
runnerId: runner.id,
194-
runnerName: runner.name,
195-
owner,
196-
});
235+
await deleteRunnerWithRetry(installationOctokit, owner, runner.id, runner.name, runnerType, instanceId);
197236
} catch (error) {
198237
logger.error('Failed to deregister runner from GitHub', {
199238
instanceId,

lambdas/functions/termination-watcher/src/metric-event.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export async function metricEvent(
1313
const instanceRunningTimeInSeconds = instance.LaunchTime
1414
? (new Date(event.time).getTime() - new Date(instance.LaunchTime).getTime()) / 1000
1515
: undefined;
16-
logger.info(`Received spot notification for ${metricName}`, {
16+
logger.info(`Received spot notification${metricName ? ` for ${metricName}` : ''}`, {
1717
instanceId: instance.InstanceId,
1818
instanceType: instance.InstanceType ?? 'unknown',
1919
instanceName: instance.Tags?.find((tag) => tag.Key === 'Name')?.Value,

0 commit comments

Comments
 (0)