Skip to content

Commit a5b332f

Browse files
committed
fix: add retry logic for GitHub API calls in scale-down Lambda
Add exponential backoff retry for transient GitHub API failures (5xx, 429) when de-registering runners during scale-down operations.

Key changes:
- Add withRetry() helper with configurable max retries (3) and delays
- Wrap deleteSelfHostedRunnerFromOrg/Repo calls with retry logic
- Only terminate EC2 instance if GitHub de-registration succeeds
- If de-registration fails after retries, leave instance running for next scale-down cycle to retry

This prevents stale runner entries in GitHub org settings caused by transient API failures (e.g., 502 Server Error) during scale-down.

Tests:
- Add unit tests for successful retry after transient 502 error
- Add unit tests verifying EC2 is NOT terminated when retries exhausted
1 parent 1f9805d commit a5b332f

File tree

2 files changed

+161
-15
lines changed

2 files changed

+161
-15
lines changed

lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,77 @@ describe('Scale down runners', () => {
631631
await expect(scaleDown()).resolves.not.toThrow();
632632
});
633633

634+
it(`Should retry on 502 error and succeed on second attempt.`, async () => {
635+
// setup
636+
const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, true)];
637+
638+
let callCount = 0;
639+
const error502 = new RequestError('Server Error', 502, {
640+
request: {
641+
method: 'DELETE',
642+
url: 'https://api.github.com/test',
643+
headers: {},
644+
},
645+
});
646+
647+
if (type === 'Org') {
648+
mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => {
649+
callCount++;
650+
if (callCount === 1) {
651+
throw error502;
652+
}
653+
return { status: 204 };
654+
});
655+
} else {
656+
mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => {
657+
callCount++;
658+
if (callCount === 1) {
659+
throw error502;
660+
}
661+
return { status: 204 };
662+
});
663+
}
664+
665+
mockGitHubRunners(runners);
666+
mockAwsRunners(runners);
667+
668+
// act
669+
await scaleDown();
670+
671+
// assert - should have retried and succeeded
672+
expect(callCount).toBe(2);
673+
checkTerminated(runners);
674+
});
675+
676+
it(`Should not terminate instance after all retries exhausted on 502 error.`, async () => {
677+
// setup
678+
const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)];
679+
680+
const error502 = new RequestError('Server Error', 502, {
681+
request: {
682+
method: 'DELETE',
683+
url: 'https://api.github.com/test',
684+
headers: {},
685+
},
686+
});
687+
688+
mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => {
689+
throw error502;
690+
});
691+
mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => {
692+
throw error502;
693+
});
694+
695+
mockGitHubRunners(runners);
696+
mockAwsRunners(runners);
697+
698+
// act
699+
await expect(scaleDown()).resolves.not.toThrow();
700+
701+
// assert - should NOT terminate since de-registration failed
702+
expect(terminateRunner).not.toHaveBeenCalled();
703+
});
704+
634705
const evictionStrategies = ['oldest_first', 'newest_first'];
635706
describe.each(evictionStrategies)('When idle config defined', (evictionStrategy) => {
636707
const defaultConfig = {

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 90 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,47 @@ import { getGitHubEnterpriseApiUrl } from './scale-up';
1414

1515
const logger = createChildLogger('scale-down');
1616

17+
const RETRY_CONFIG = {
18+
maxRetries: 3,
19+
initialDelayMs: 1000,
20+
maxDelayMs: 10000,
21+
};
22+
23+
async function sleep(ms: number): Promise<void> {
24+
return new Promise((resolve) => setTimeout(resolve, ms));
25+
}
26+
27+
function isRetryableError(error: unknown): boolean {
28+
if (error instanceof RequestError) {
29+
const status = (error as RequestError).status;
30+
// Retry on server errors (5xx) and rate limiting (429)
31+
return status >= 500 || status === 429;
32+
}
33+
return false;
34+
}
35+
36+
async function withRetry<T>(operation: () => Promise<T>, operationName: string, context: string): Promise<T> {
37+
let lastError: unknown;
38+
for (let attempt = 1; attempt <= RETRY_CONFIG.maxRetries; attempt++) {
39+
try {
40+
return await operation();
41+
} catch (error) {
42+
lastError = error;
43+
if (isRetryableError(error) && attempt < RETRY_CONFIG.maxRetries) {
44+
const delay = Math.min(RETRY_CONFIG.initialDelayMs * Math.pow(2, attempt - 1), RETRY_CONFIG.maxDelayMs);
45+
logger.warn(
46+
`${operationName} failed for ${context} (attempt ${attempt}/${RETRY_CONFIG.maxRetries}), ` +
47+
`retrying in ${delay}ms. Error: ${error}`,
48+
);
49+
await sleep(delay);
50+
} else {
51+
throw error;
52+
}
53+
}
54+
}
55+
throw lastError;
56+
}
57+
1758
type OrgRunnerList = Endpoints['GET /orgs/{org}/actions/runners']['response']['data']['runners'];
1859
type RepoRunnerList = Endpoints['GET /repos/{owner}/{repo}/actions/runners']['response']['data']['runners'];
1960
type RunnerState = OrgRunnerList[number] | RepoRunnerList[number];
@@ -127,6 +168,33 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
127168
return launchTimePlusMinimum < now;
128169
}
129170

171+
async function deleteGitHubRunner(
172+
githubAppClient: Octokit,
173+
ec2runner: RunnerInfo,
174+
ghRunnerId: number,
175+
): Promise<number> {
176+
const deleteOperation = async () => {
177+
const response =
178+
ec2runner.type === 'Org'
179+
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
180+
runner_id: ghRunnerId,
181+
org: ec2runner.owner,
182+
})
183+
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
184+
runner_id: ghRunnerId,
185+
owner: ec2runner.owner.split('/')[0],
186+
repo: ec2runner.owner.split('/')[1],
187+
});
188+
return response.status;
189+
};
190+
191+
return await withRetry(
192+
deleteOperation,
193+
'Delete GitHub runner',
194+
`runner ${ec2runner.instanceId} (GitHub ID: ${ghRunnerId})`,
195+
);
196+
}
197+
130198
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
131199
const githubAppClient = await getOrCreateOctokit(ec2runner);
132200
try {
@@ -146,28 +214,35 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi
146214
);
147215

148216
if (states.every((busy) => busy === false)) {
149-
const statuses = await Promise.all(
217+
const results = await Promise.all(
150218
ghRunnerIds.map(async (ghRunnerId) => {
151-
return (
152-
ec2runner.type === 'Org'
153-
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
154-
runner_id: ghRunnerId,
155-
org: ec2runner.owner,
156-
})
157-
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
158-
runner_id: ghRunnerId,
159-
owner: ec2runner.owner.split('/')[0],
160-
repo: ec2runner.owner.split('/')[1],
161-
})
162-
).status;
219+
try {
220+
const status = await deleteGitHubRunner(githubAppClient, ec2runner, ghRunnerId);
221+
return { ghRunnerId, status, success: status === 204 };
222+
} catch (error) {
223+
logger.error(
224+
`Failed to de-register GitHub runner ${ghRunnerId} for instance '${ec2runner.instanceId}' after retries. Error: ${error}`,
225+
{ error: error as Error },
226+
);
227+
return { ghRunnerId, status: 0, success: false };
228+
}
163229
}),
164230
);
165231

166-
if (statuses.every((status) => status == 204)) {
232+
const allSucceeded = results.every((r) => r.success);
233+
const failedRunners = results.filter((r) => !r.success);
234+
235+
if (allSucceeded) {
167236
await terminateRunner(ec2runner.instanceId);
168237
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
169238
} else {
170-
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
239+
// Only terminate EC2 if we successfully de-registered from GitHub
240+
// Otherwise, leave the instance running so the next scale-down cycle can retry
241+
logger.error(
242+
`Failed to de-register ${failedRunners.length} GitHub runner(s) for instance '${ec2runner.instanceId}'. ` +
243+
`Instance will NOT be terminated to allow retry on next scale-down cycle. ` +
244+
`Failed runner IDs: ${failedRunners.map((r) => r.ghRunnerId).join(', ')}`,
245+
);
171246
}
172247
} else {
173248
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);

0 commit comments

Comments
 (0)