From 8599aac9e92ddc269e324c2d1b3c996dabefb5c9 Mon Sep 17 00:00:00 2001 From: Shiv <92113562+shivdesh@users.noreply.github.com> Date: Wed, 11 Mar 2026 13:47:25 -0700 Subject: [PATCH 1/2] fix: prevent EC2 termination when GitHub runner de-registration fails When the scale-down Lambda fails to de-register a runner from GitHub (even after automatic retries via @octokit/plugin-retry), the EC2 instance should NOT be terminated. This prevents stale runner entries in GitHub org settings. This change complements PR #4990 which added @octokit/plugin-retry for automatic retries. While that handles transient failures, this ensures that if de-registration ultimately fails, we don't leave orphaned GitHub runner entries by terminating the EC2 instance prematurely. Key changes: - Extract deleteGitHubRunner() helper that catches errors per-runner - Only terminate EC2 instance if ALL GitHub de-registrations succeed - If any de-registration fails, leave instance running for next cycle - Rename githubAppClient to githubInstallationClient for clarity - Refactor to split owner/repo once instead of multiple times - Fix error logging to handle non-Error objects properly The @octokit/plugin-retry (added in #4990) handles automatic retries at the client level, so no custom retry logic is needed here. Tests: - Add test verifying EC2 is NOT terminated when de-registration fails --- .../src/scale-runners/scale-down.test.ts | 29 ++++++ .../src/scale-runners/scale-down.ts | 97 ++++++++++++------- 2 files changed, 93 insertions(+), 33 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 2dfb190a38..8c787a505e 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -631,6 +631,35 @@ describe('Scale down runners', () => { await expect(scaleDown()).resolves.not.toThrow(); }); + it(`Should not terminate instance when de-registration throws an error.`, async () => { + // setup - runner should NOT be terminated because de-registration fails + const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)]; + + const error502 = new RequestError('Server Error', 502, { + request: { + method: 'DELETE', + url: 'https://api.github.com/test', + headers: {}, + }, + }); + + mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => { + throw error502; + }); + mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => { + throw error502; + }); + + mockGitHubRunners(runners); + mockAwsRunners(runners); + + // act + await expect(scaleDown()).resolves.not.toThrow(); + + // assert - should NOT terminate since de-registration failed + expect(terminateRunner).not.toHaveBeenCalled(); + }); + const evictionStrategies = ['oldest_first', 'newest_first']; describe.each(evictionStrategies)('When idle config defined', (evictionStrategy) => { const defaultConfig = { diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 6086af7714..94fea2516b 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -103,17 +103,20 @@ async function listGitHubRunners(runner: RunnerInfo): Promise { logger.debug(`[listGithubRunners] Cache miss for ${key}`); const client = await getOrCreateOctokit(runner); - const runners = - runner.type === 'Org' - ? await client.paginate(client.actions.listSelfHostedRunnersForOrg, { - org: runner.owner, - per_page: 100, - }) - : await client.paginate(client.actions.listSelfHostedRunnersForRepo, { - owner: runner.owner.split('/')[0], - repo: runner.owner.split('/')[1], - per_page: 100, - }); + let runners; + if (runner.type === 'Org') { + runners = await client.paginate(client.actions.listSelfHostedRunnersForOrg, { + org: runner.owner, + per_page: 100, + }); + } else { + const [owner, repo] = runner.owner.split('/'); + runners = await client.paginate(client.actions.listSelfHostedRunnersForRepo, { + owner, + repo, + per_page: 100, + }); + } githubCache.runners.set(key, runners); logger.debug(`[listGithubRunners] Cache set for ${key}`); logger.debug(`[listGithubRunners] Runners: ${JSON.stringify(runners)}`); @@ -127,8 +130,39 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean { return launchTimePlusMinimum < now; } +async function deleteGitHubRunner( + githubInstallationClient: Octokit, + ec2runner: RunnerInfo, + ghRunnerId: number, +): Promise<{ ghRunnerId: number; status: number; success: boolean }> { + try { + let response; + if (ec2runner.type === 'Org') { + response = await githubInstallationClient.actions.deleteSelfHostedRunnerFromOrg({ + runner_id: ghRunnerId, + org: ec2runner.owner, + }); + } else { + const [owner, repo] = ec2runner.owner.split('/'); + response = await githubInstallationClient.actions.deleteSelfHostedRunnerFromRepo({ + runner_id: ghRunnerId, + owner, + repo, + }); + } + return { ghRunnerId, status: response.status, success: response.status === 204 }; + } catch (error) { + logger.error( + `Failed to de-register GitHub runner ${ghRunnerId} for instance '${ec2runner.instanceId}'. ` + + `Error: ${error instanceof Error ? error.message : String(error)}`, + { error }, + ); + return { ghRunnerId, status: 0, success: false }; + } +} + async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise { - const githubAppClient = await getOrCreateOctokit(ec2runner); + const githubInstallationClient = await getOrCreateOctokit(ec2runner); try { const runnerList = ec2runner as unknown as RunnerList; if (runnerList.bypassRemoval) { @@ -141,41 +175,38 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi const states = await Promise.all( ghRunnerIds.map(async (ghRunnerId) => { // Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition. - return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId); + return await getGitHubRunnerBusyState(githubInstallationClient, ec2runner, ghRunnerId); }), ); if (states.every((busy) => busy === false)) { - const statuses = await Promise.all( - ghRunnerIds.map(async (ghRunnerId) => { - return ( - ec2runner.type === 'Org' - ? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({ - runner_id: ghRunnerId, - org: ec2runner.owner, - }) - : await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({ - runner_id: ghRunnerId, - owner: ec2runner.owner.split('/')[0], - repo: ec2runner.owner.split('/')[1], - }) - ).status; - }), + const results = await Promise.all( + ghRunnerIds.map((ghRunnerId) => deleteGitHubRunner(githubInstallationClient, ec2runner, ghRunnerId)), ); - if (statuses.every((status) => status == 204)) { + const allSucceeded = results.every((r) => r.success); + const failedRunners = results.filter((r) => !r.success); + + if (allSucceeded) { await terminateRunner(ec2runner.instanceId); logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); } else { - logger.error(`Failed to de-register GitHub runner: ${statuses}`); + // Only terminate EC2 if we successfully de-registered from GitHub + // Otherwise, leave the instance running so the next scale-down cycle can retry + logger.error( + `Failed to de-register ${failedRunners.length} GitHub runner(s) for instance '${ec2runner.instanceId}'. ` + + `Instance will NOT be terminated to allow retry on next scale-down cycle. ` + + `Failed runner IDs: ${failedRunners.map((r) => r.ghRunnerId).join(', ')}`, + ); } } else { logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`); } } catch (e) { - logger.error(`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e}`, { - error: e as Error, - }); + logger.error( + `Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e instanceof Error ? e.message : String(e)}`, + { error: e }, + ); } } From 7a368bcfab56d5dfed6ed0428a11ea8cac979a5c Mon Sep 17 00:00:00 2001 From: Shiv <92113562+shivdesh@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:20:12 -0700 Subject: [PATCH 2/2] fix(scale-up): prevent negative TotalTargetCapacity when runners exceed maximum When pool and scale-up lambdas run concurrently, currentRunners can temporarily exceed maximumRunners. This caused the calculation `maximumRunners - currentRunners` to produce a negative value, which was then passed to EC2 CreateFleet API, resulting in: InvalidTargetCapacitySpecification: TotalTargetCapacity should not be negative. This fix wraps the calculation with Math.max(0, ...) to ensure we never attempt to create a negative number of runners. Fixes race condition between pool-lambda and scale-up-lambda. --- .../src/scale-runners/scale-up.test.ts | 18 ++++++++++++++++++ .../src/scale-runners/scale-up.ts | 4 +++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index 8a10b82ca4..9346730cf9 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -228,6 +228,24 @@ describe('scaleUp with GHES', () => { expect(mockOctokit.actions.createRegistrationTokenForRepo).not.toBeCalled(); }); + it('does not create runners when current runners exceed maximum (race condition)', async () => { + process.env.RUNNERS_MAXIMUM_COUNT = '5'; + process.env.ENABLE_EPHEMERAL_RUNNERS = 'false'; + // Simulate race condition where pool lambda created more runners than max + mockListRunners.mockImplementation(async () => + Array.from({ length: 10 }, (_, i) => ({ + instanceId: `i-${i}`, + launchTime: new Date(), + type: 'Org', + owner: TEST_DATA_SINGLE.repositoryOwner, + })), + ); + await scaleUpModule.scaleUp(TEST_DATA); + // Should not attempt to create runners (would be negative without fix) + expect(createRunner).not.toBeCalled(); + expect(mockOctokit.actions.createRegistrationTokenForOrg).not.toBeCalled(); + }); + it('does create a runner if maximum is set to -1', async () => { process.env.RUNNERS_MAXIMUM_COUNT = '-1'; process.env.ENABLE_EPHEMERAL_RUNNERS = 'false'; diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 2a4c2c1c58..0c60e223c4 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -418,12 +418,14 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise