Skip to content

Commit 8b5e8ef

Browse files
committed
fix: add retry logic for GitHub API calls in scale-down Lambda
Add exponential backoff retry for transient GitHub API failures (5xx, 429)
when de-registering runners during scale-down operations.

Key changes:
- Add withRetry() helper with configurable max retries (3) and delays
- Wrap deleteSelfHostedRunnerFromOrg/Repo calls with retry logic
- Only terminate EC2 instance if GitHub de-registration succeeds
- If de-registration fails after retries, leave instance running for next scale-down cycle to retry

This prevents stale runner entries in GitHub org settings caused by transient API failures (e.g., 502 Server Error) during scale-down.

Tests:
- Add unit tests for successful retry after transient 502 error
- Add unit tests verifying EC2 is NOT terminated when retries exhausted
1 parent d5efde5 commit 8b5e8ef

File tree

2 files changed

+168
-15
lines changed

2 files changed

+168
-15
lines changed

lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,77 @@ describe('Scale down runners', () => {
631631
await expect(scaleDown()).resolves.not.toThrow();
632632
});
633633

634+
it(`Should retry on 502 error and succeed on second attempt.`, async () => {
635+
// setup
636+
const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, true)];
637+
638+
let callCount = 0;
639+
const error502 = new RequestError('Server Error', 502, {
640+
request: {
641+
method: 'DELETE',
642+
url: 'https://api.github.com/test',
643+
headers: {},
644+
},
645+
});
646+
647+
if (type === 'Org') {
648+
mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => {
649+
callCount++;
650+
if (callCount === 1) {
651+
throw error502;
652+
}
653+
return { status: 204 };
654+
});
655+
} else {
656+
mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => {
657+
callCount++;
658+
if (callCount === 1) {
659+
throw error502;
660+
}
661+
return { status: 204 };
662+
});
663+
}
664+
665+
mockGitHubRunners(runners);
666+
mockAwsRunners(runners);
667+
668+
// act
669+
await scaleDown();
670+
671+
// assert - should have retried and succeeded
672+
expect(callCount).toBe(2);
673+
checkTerminated(runners);
674+
});
675+
676+
it(`Should not terminate instance after all retries exhausted on 502 error.`, async () => {
677+
// setup
678+
const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)];
679+
680+
const error502 = new RequestError('Server Error', 502, {
681+
request: {
682+
method: 'DELETE',
683+
url: 'https://api.github.com/test',
684+
headers: {},
685+
},
686+
});
687+
688+
mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => {
689+
throw error502;
690+
});
691+
mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => {
692+
throw error502;
693+
});
694+
695+
mockGitHubRunners(runners);
696+
mockAwsRunners(runners);
697+
698+
// act
699+
await expect(scaleDown()).resolves.not.toThrow();
700+
701+
// assert - should NOT terminate since de-registration failed
702+
expect(terminateRunner).not.toHaveBeenCalled();
703+
});
704+
634705
const evictionStrategies = ['oldest_first', 'newest_first'];
635706
describe.each(evictionStrategies)('When idle config defined', (evictionStrategy) => {
636707
const defaultConfig = {

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 97 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,54 @@ import { getGitHubEnterpriseApiUrl } from './scale-up';
1414

1515
const logger = createChildLogger('scale-down');
1616

17+
const RETRY_CONFIG = {
18+
maxRetries: 3,
19+
initialDelayMs: 1000,
20+
maxDelayMs: 10000,
21+
};
22+
23+
async function sleep(ms: number): Promise<void> {
24+
return new Promise((resolve) => setTimeout(resolve, ms));
25+
}
26+
27+
function isRetryableError(error: unknown): boolean {
28+
if (error instanceof RequestError) {
29+
const status = (error as RequestError).status;
30+
// Retry on server errors (5xx) and rate limiting (429)
31+
return status >= 500 || status === 429;
32+
}
33+
return false;
34+
}
35+
36+
async function withRetry<T>(
37+
operation: () => Promise<T>,
38+
operationName: string,
39+
context: string,
40+
): Promise<T> {
41+
let lastError: unknown;
42+
for (let attempt = 1; attempt <= RETRY_CONFIG.maxRetries; attempt++) {
43+
try {
44+
return await operation();
45+
} catch (error) {
46+
lastError = error;
47+
if (isRetryableError(error) && attempt < RETRY_CONFIG.maxRetries) {
48+
const delay = Math.min(
49+
RETRY_CONFIG.initialDelayMs * Math.pow(2, attempt - 1),
50+
RETRY_CONFIG.maxDelayMs,
51+
);
52+
logger.warn(
53+
`${operationName} failed for ${context} (attempt ${attempt}/${RETRY_CONFIG.maxRetries}), ` +
54+
`retrying in ${delay}ms. Error: ${error}`,
55+
);
56+
await sleep(delay);
57+
} else {
58+
throw error;
59+
}
60+
}
61+
}
62+
throw lastError;
63+
}
64+
1765
type OrgRunnerList = Endpoints['GET /orgs/{org}/actions/runners']['response']['data']['runners'];
1866
type RepoRunnerList = Endpoints['GET /repos/{owner}/{repo}/actions/runners']['response']['data']['runners'];
1967
type RunnerState = OrgRunnerList[number] | RepoRunnerList[number];
@@ -127,6 +175,33 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
127175
return launchTimePlusMinimum < now;
128176
}
129177

178+
async function deleteGitHubRunner(
179+
githubAppClient: Octokit,
180+
ec2runner: RunnerInfo,
181+
ghRunnerId: number,
182+
): Promise<number> {
183+
const deleteOperation = async () => {
184+
const response =
185+
ec2runner.type === 'Org'
186+
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
187+
runner_id: ghRunnerId,
188+
org: ec2runner.owner,
189+
})
190+
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
191+
runner_id: ghRunnerId,
192+
owner: ec2runner.owner.split('/')[0],
193+
repo: ec2runner.owner.split('/')[1],
194+
});
195+
return response.status;
196+
};
197+
198+
return await withRetry(
199+
deleteOperation,
200+
'Delete GitHub runner',
201+
`runner ${ec2runner.instanceId} (GitHub ID: ${ghRunnerId})`,
202+
);
203+
}
204+
130205
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
131206
const githubAppClient = await getOrCreateOctokit(ec2runner);
132207
try {
@@ -146,28 +221,35 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi
146221
);
147222

148223
if (states.every((busy) => busy === false)) {
149-
const statuses = await Promise.all(
224+
const results = await Promise.all(
150225
ghRunnerIds.map(async (ghRunnerId) => {
151-
return (
152-
ec2runner.type === 'Org'
153-
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
154-
runner_id: ghRunnerId,
155-
org: ec2runner.owner,
156-
})
157-
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
158-
runner_id: ghRunnerId,
159-
owner: ec2runner.owner.split('/')[0],
160-
repo: ec2runner.owner.split('/')[1],
161-
})
162-
).status;
226+
try {
227+
const status = await deleteGitHubRunner(githubAppClient, ec2runner, ghRunnerId);
228+
return { ghRunnerId, status, success: status === 204 };
229+
} catch (error) {
230+
logger.error(
231+
`Failed to de-register GitHub runner ${ghRunnerId} for instance '${ec2runner.instanceId}' after retries. Error: ${error}`,
232+
{ error: error as Error },
233+
);
234+
return { ghRunnerId, status: 0, success: false };
235+
}
163236
}),
164237
);
165238

166-
if (statuses.every((status) => status == 204)) {
239+
const allSucceeded = results.every((r) => r.success);
240+
const failedRunners = results.filter((r) => !r.success);
241+
242+
if (allSucceeded) {
167243
await terminateRunner(ec2runner.instanceId);
168244
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
169245
} else {
170-
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
246+
// Only terminate EC2 if we successfully de-registered from GitHub
247+
// Otherwise, leave the instance running so the next scale-down cycle can retry
248+
logger.error(
249+
`Failed to de-register ${failedRunners.length} GitHub runner(s) for instance '${ec2runner.instanceId}'. ` +
250+
`Instance will NOT be terminated to allow retry on next scale-down cycle. ` +
251+
`Failed runner IDs: ${failedRunners.map((r) => r.ghRunnerId).join(', ')}`,
252+
);
171253
}
172254
} else {
173255
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);

0 commit comments

Comments
 (0)