Skip to content

Commit 097ccc6

Browse files
committed
fix: add missing instance termination for failures
Instances that failed to start up because of incorrect configuration never got terminated. This is now updated and failed instances get terminated right away. Previously we relied on a scale-down to do this.
1 parent 9f37a04 commit 097ccc6

File tree

1 file changed

+54
-7
lines changed
  • lambdas/functions/control-plane/src/scale-runners

1 file changed

+54
-7
lines changed

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { getParameter, putParameter } from '@aws-github-runner/aws-ssm-util';
44
import yn from 'yn';
55

66
import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth';
7-
import { createRunner, listEC2Runners, tag } from './../aws/runners';
7+
import { createRunner, listEC2Runners, tag, terminateRunner } from './../aws/runners';
88
import { RunnerInputParameters } from './../aws/runners.d';
99
import { metricGitHubAppRateLimit } from '../github/rate-limit';
1010

@@ -222,7 +222,29 @@ export async function createRunners(
222222
...ec2RunnerConfig,
223223
});
224224
if (instances.length !== 0) {
225-
await createStartRunnerConfig(githubRunnerConfig, instances, ghClient);
225+
const failedInstances = await createStartRunnerConfig(githubRunnerConfig, instances, ghClient);
226+
227+
// Terminate instances that failed to get configured to avoid waste
228+
if (failedInstances.length > 0) {
229+
logger.warn('Terminating instances that failed to get configured', {
230+
failedInstances,
231+
failedCount: failedInstances.length,
232+
});
233+
234+
for (const instanceId of failedInstances) {
235+
try {
236+
await terminateRunner(instanceId);
237+
} catch (error) {
238+
logger.error('Failed to terminate instance', {
239+
instanceId,
240+
error: error instanceof Error ? error.message : String(error),
241+
});
242+
}
243+
}
244+
245+
// Remove failed instances from the returned list
246+
return instances.filter((id) => !failedInstances.includes(id));
247+
}
226248
}
227249

228250
return instances;
@@ -475,15 +497,21 @@ export function getGitHubEnterpriseApiUrl() {
475497
return { ghesApiUrl, ghesBaseUrl };
476498
}
477499

500+
/**
501+
* Creates the start configuration for runner instances by either generating JIT configs
502+
* or registration tokens.
503+
*
504+
* @returns Array of instance IDs that failed to get configured
505+
*/
478506
async function createStartRunnerConfig(
479507
githubRunnerConfig: CreateGitHubRunnerConfig,
480508
instances: string[],
481509
ghClient: Octokit,
482-
) {
510+
): Promise<string[]> {
483511
if (githubRunnerConfig.enableJitConfig && githubRunnerConfig.ephemeral) {
484-
await createJitConfig(githubRunnerConfig, instances, ghClient);
512+
return await createJitConfig(githubRunnerConfig, instances, ghClient);
485513
} else {
486-
await createRegistrationTokenConfig(githubRunnerConfig, instances, ghClient);
514+
return await createRegistrationTokenConfig(githubRunnerConfig, instances, ghClient);
487515
}
488516
}
489517

@@ -498,11 +526,16 @@ function addDelay(instances: string[]) {
498526
return { isDelay, delay };
499527
}
500528

529+
/**
530+
* Creates registration token configuration for non-ephemeral runners.
531+
*
532+
* @returns Empty array (this configuration method does not have failure cases)
533+
*/
501534
async function createRegistrationTokenConfig(
502535
githubRunnerConfig: CreateGitHubRunnerConfig,
503536
instances: string[],
504537
ghClient: Octokit,
505-
) {
538+
): Promise<string[]> {
506539
const { isDelay, delay } = addDelay(instances);
507540
const token = await getGithubRunnerRegistrationToken(githubRunnerConfig, ghClient);
508541
const runnerServiceConfig = generateRunnerServiceConfig(githubRunnerConfig, token);
@@ -520,6 +553,8 @@ async function createRegistrationTokenConfig(
520553
await delay(25);
521554
}
522555
}
556+
557+
return [];
523558
}
524559

525560
async function tagRunnerId(instanceId: string, runnerId: string): Promise<void> {
@@ -530,7 +565,17 @@ async function tagRunnerId(instanceId: string, runnerId: string): Promise<void>
530565
}
531566
}
532567

533-
async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, instances: string[], ghClient: Octokit) {
568+
/**
569+
* Creates JIT (Just-In-Time) configuration for ephemeral runners.
570+
* Continues processing remaining instances even if some fail.
571+
*
572+
* @returns Array of instance IDs that failed to get JIT configuration
573+
*/
574+
async function createJitConfig(
575+
githubRunnerConfig: CreateGitHubRunnerConfig,
576+
instances: string[],
577+
ghClient: Octokit,
578+
): Promise<string[]> {
534579
const runnerGroupId = await getRunnerGroupId(githubRunnerConfig, ghClient);
535580
const { isDelay, delay } = addDelay(instances);
536581
const runnerLabels = githubRunnerConfig.runnerLabels.split(',');
@@ -595,4 +640,6 @@ async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, ins
595640
successfulInstances: instances.length - failedInstances.length,
596641
});
597642
}
643+
644+
return failedInstances;
598645
}

0 commit comments

Comments
 (0)