diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 2dfb190a38..8c787a505e 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -631,6 +631,35 @@ describe('Scale down runners', () => { await expect(scaleDown()).resolves.not.toThrow(); }); + it(`Should not terminate instance when de-registration throws an error.`, async () => { + // setup - runner should NOT be terminated because de-registration fails + const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)]; + + const error502 = new RequestError('Server Error', 502, { + request: { + method: 'DELETE', + url: 'https://api.github.com/test', + headers: {}, + }, + }); + + mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => { + throw error502; + }); + mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => { + throw error502; + }); + + mockGitHubRunners(runners); + mockAwsRunners(runners); + + // act + await expect(scaleDown()).resolves.not.toThrow(); + + // assert - should NOT terminate since de-registration failed + expect(terminateRunner).not.toHaveBeenCalled(); + }); + const evictionStrategies = ['oldest_first', 'newest_first']; describe.each(evictionStrategies)('When idle config defined', (evictionStrategy) => { const defaultConfig = { diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 6086af7714..1626c6d437 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -127,6 +127,33 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean { return launchTimePlusMinimum < now; } +async function deleteGitHubRunner( + githubAppClient: Octokit, + ec2runner: RunnerInfo, + ghRunnerId: number, +): Promise<{ ghRunnerId: number; status: number; success: boolean }> { + try { + const response = + ec2runner.type === 'Org' + ? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({ + runner_id: ghRunnerId, + org: ec2runner.owner, + }) + : await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({ + runner_id: ghRunnerId, + owner: ec2runner.owner.split('/')[0], + repo: ec2runner.owner.split('/')[1], + }); + return { ghRunnerId, status: response.status, success: response.status === 204 }; + } catch (error) { + logger.error( + `Failed to de-register GitHub runner ${ghRunnerId} for instance '${ec2runner.instanceId}'. Error: ${error}`, + { error: error as Error }, + ); + return { ghRunnerId, status: 0, success: false }; + } +} + async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise { const githubAppClient = await getOrCreateOctokit(ec2runner); try { @@ -146,28 +173,24 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi ); if (states.every((busy) => busy === false)) { - const statuses = await Promise.all( - ghRunnerIds.map(async (ghRunnerId) => { - return ( - ec2runner.type === 'Org' - ? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({ - runner_id: ghRunnerId, - org: ec2runner.owner, - }) - : await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({ - runner_id: ghRunnerId, - owner: ec2runner.owner.split('/')[0], - repo: ec2runner.owner.split('/')[1], - }) - ).status; - }), + const results = await Promise.all( + ghRunnerIds.map((ghRunnerId) => deleteGitHubRunner(githubAppClient, ec2runner, ghRunnerId)), ); - if (statuses.every((status) => status == 204)) { + const allSucceeded = results.every((r) => r.success); + const failedRunners = results.filter((r) => !r.success); + + if (allSucceeded) { await terminateRunner(ec2runner.instanceId); logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); } else { - logger.error(`Failed to de-register GitHub runner: ${statuses}`); + // Only terminate EC2 if we successfully de-registered from GitHub + // Otherwise, leave the instance running so the next scale-down cycle can retry + logger.error( + `Failed to de-register ${failedRunners.length} GitHub runner(s) for instance '${ec2runner.instanceId}'. ` + + `Instance will NOT be terminated to allow retry on next scale-down cycle. ` + + `Failed runner IDs: ${failedRunners.map((r) => r.ghRunnerId).join(', ')}`, + ); } } else { logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);