diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 2dfb190a38..8c787a505e 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -631,6 +631,35 @@ describe('Scale down runners', () => { await expect(scaleDown()).resolves.not.toThrow(); }); + it(`Should not terminate instance when de-registration throws an error.`, async () => { + // setup - runner should NOT be terminated because de-registration fails + const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)]; + + const error502 = new RequestError('Server Error', 502, { + request: { + method: 'DELETE', + url: 'https://api.github.com/test', + headers: {}, + }, + }); + + mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => { + throw error502; + }); + mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => { + throw error502; + }); + + mockGitHubRunners(runners); + mockAwsRunners(runners); + + // act + await expect(scaleDown()).resolves.not.toThrow(); + + // assert - should NOT terminate since de-registration failed + expect(terminateRunner).not.toHaveBeenCalled(); + }); + const evictionStrategies = ['oldest_first', 'newest_first']; describe.each(evictionStrategies)('When idle config defined', (evictionStrategy) => { const defaultConfig = { diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 6086af7714..94fea2516b 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -103,17 +103,20 @@ async function listGitHubRunners(runner: RunnerInfo): Promise { logger.debug(`[listGithubRunners] Cache miss for ${key}`); const client = await getOrCreateOctokit(runner); - const runners = - runner.type === 'Org' - ? await client.paginate(client.actions.listSelfHostedRunnersForOrg, { - org: runner.owner, - per_page: 100, - }) - : await client.paginate(client.actions.listSelfHostedRunnersForRepo, { - owner: runner.owner.split('/')[0], - repo: runner.owner.split('/')[1], - per_page: 100, - }); + let runners; + if (runner.type === 'Org') { + runners = await client.paginate(client.actions.listSelfHostedRunnersForOrg, { + org: runner.owner, + per_page: 100, + }); + } else { + const [owner, repo] = runner.owner.split('/'); + runners = await client.paginate(client.actions.listSelfHostedRunnersForRepo, { + owner, + repo, + per_page: 100, + }); + } githubCache.runners.set(key, runners); logger.debug(`[listGithubRunners] Cache set for ${key}`); logger.debug(`[listGithubRunners] Runners: ${JSON.stringify(runners)}`); @@ -127,8 +130,39 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean { return launchTimePlusMinimum < now; } +async function deleteGitHubRunner( + githubInstallationClient: Octokit, + ec2runner: RunnerInfo, + ghRunnerId: number, +): Promise<{ ghRunnerId: number; status: number; success: boolean }> { + try { + let response; + if (ec2runner.type === 'Org') { + response = await githubInstallationClient.actions.deleteSelfHostedRunnerFromOrg({ + runner_id: ghRunnerId, + org: ec2runner.owner, + }); + } else { + const [owner, repo] = ec2runner.owner.split('/'); + response = await githubInstallationClient.actions.deleteSelfHostedRunnerFromRepo({ + runner_id: ghRunnerId, + owner, + repo, + }); + } + return { ghRunnerId, status: response.status, success: response.status === 204 }; + } catch (error) { + logger.error( + `Failed to de-register GitHub runner ${ghRunnerId} for instance '${ec2runner.instanceId}'. ` + + `Error: ${error instanceof Error ? error.message : String(error)}`, + { error }, + ); + return { ghRunnerId, status: 0, success: false }; + } +} + async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise { - const githubAppClient = await getOrCreateOctokit(ec2runner); + const githubInstallationClient = await getOrCreateOctokit(ec2runner); try { const runnerList = ec2runner as unknown as RunnerList; if (runnerList.bypassRemoval) { @@ -141,41 +175,38 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi const states = await Promise.all( ghRunnerIds.map(async (ghRunnerId) => { // Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition. - return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId); + return await getGitHubRunnerBusyState(githubInstallationClient, ec2runner, ghRunnerId); }), ); if (states.every((busy) => busy === false)) { - const statuses = await Promise.all( - ghRunnerIds.map(async (ghRunnerId) => { - return ( - ec2runner.type === 'Org' - ? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({ - runner_id: ghRunnerId, - org: ec2runner.owner, - }) - : await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({ - runner_id: ghRunnerId, - owner: ec2runner.owner.split('/')[0], - repo: ec2runner.owner.split('/')[1], - }) - ).status; - }), + const results = await Promise.all( + ghRunnerIds.map((ghRunnerId) => deleteGitHubRunner(githubInstallationClient, ec2runner, ghRunnerId)), ); - if (statuses.every((status) => status == 204)) { + const allSucceeded = results.every((r) => r.success); + const failedRunners = results.filter((r) => !r.success); + + if (allSucceeded) { await terminateRunner(ec2runner.instanceId); logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); } else { - logger.error(`Failed to de-register GitHub runner: ${statuses}`); + // Only terminate EC2 if we successfully de-registered from GitHub + // Otherwise, leave the instance running so the next scale-down cycle can retry + logger.error( + `Failed to de-register ${failedRunners.length} GitHub runner(s) for instance '${ec2runner.instanceId}'. ` + + `Instance will NOT be terminated to allow retry on next scale-down cycle. ` + + `Failed runner IDs: ${failedRunners.map((r) => r.ghRunnerId).join(', ')}`, + ); } } else { logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`); } } catch (e) { - logger.error(`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e}`, { - error: e as Error, - }); + logger.error( + `Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e instanceof Error ? e.message : String(e)}`, + { error: e }, + ); } } diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index 8a10b82ca4..9346730cf9 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -228,6 +228,24 @@ describe('scaleUp with GHES', () => { expect(mockOctokit.actions.createRegistrationTokenForRepo).not.toBeCalled(); }); + it('does not create runners when current runners exceed maximum (race condition)', async () => { + process.env.RUNNERS_MAXIMUM_COUNT = '5'; + process.env.ENABLE_EPHEMERAL_RUNNERS = 'false'; + // Simulate race condition where pool lambda created more runners than max + mockListRunners.mockImplementation(async () => + Array.from({ length: 10 }, (_, i) => ({ + instanceId: `i-${i}`, + launchTime: new Date(), + type: 'Org', + owner: TEST_DATA_SINGLE.repositoryOwner, + })), + ); + await scaleUpModule.scaleUp(TEST_DATA); + // Should not attempt to create runners (would be negative without fix) + expect(createRunner).not.toBeCalled(); + expect(mockOctokit.actions.createRegistrationTokenForOrg).not.toBeCalled(); + }); + it('does create a runner if maximum is set to -1', async () => { process.env.RUNNERS_MAXIMUM_COUNT = '-1'; process.env.ENABLE_EPHEMERAL_RUNNERS = 'false'; diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 2a4c2c1c58..0c60e223c4 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -418,12 +418,14 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise