Skip to content

Commit eb7236e

Browse files
feat: add support to use custom scale errors
1 parent 391a65f commit eb7236e

15 files changed

Lines changed: 61 additions & 1 deletion

File tree

lambdas/functions/control-plane/src/aws/runners.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,5 @@ export interface RunnerInputParameters {
4444
amiIdSsmParameterName?: string;
4545
tracingEnabled?: boolean;
4646
onDemandFailoverOnError?: string[];
47+
customScaleErrors?: string[];
4748
}

lambdas/functions/control-plane/src/aws/runners.test.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,17 @@ describe('create runner with errors', () => {
457457
expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
458458
});
459459

460+
it('test ScaleError with custom scale error.', async () => {
461+
createFleetMockWithErrors(['CustomAWSError']);
462+
463+
await expect(createRunner(createRunnerConfig({ ...defaultRunnerConfig, customScaleErrors: ['CustomAWSError'] }))).rejects.toBeInstanceOf(ScaleError);
464+
expect(mockEC2Client).toHaveReceivedCommandWith(
465+
CreateFleetCommand,
466+
expectedCreateFleetRequest(defaultExpectedFleetRequestValues),
467+
);
468+
expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
469+
});
470+
460471
it('test ScaleError with multiple error.', async () => {
461472
createFleetMockWithErrors(['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'NotMappedError']);
462473

@@ -699,6 +710,7 @@ interface RunnerConfig {
699710
amiIdSsmParameterName?: string;
700711
tracingEnabled?: boolean;
701712
onDemandFailoverOnError?: string[];
713+
customScaleErrors?: string[];
702714
}
703715

704716
function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -718,6 +730,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
718730
amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
719731
tracingEnabled: runnerConfig.tracingEnabled,
720732
onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
733+
customScaleErrors: runnerConfig.customScaleErrors,
721734
};
722735
}
723736

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ async function processFleetResult(
198198

199199
// Educated guess of errors that would make sense to retry based on the list
200200
// https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
201-
const scaleErrors = [
201+
const defaultScaleErrors = [
202202
'UnfulfillableCapacity',
203203
'MaxSpotInstanceCountExceeded',
204204
'TargetCapacityLimitExceededException',
@@ -209,6 +209,11 @@ async function processFleetResult(
209209
'InsufficientInstanceCapacity',
210210
];
211211

212+
const scaleErrors =
213+
runnerParameters.customScaleErrors && runnerParameters.customScaleErrors.length > 0
214+
? runnerParameters.customScaleErrors
215+
: defaultScaleErrors;
216+
212217
const failedCount = countScaleErrors(errors, scaleErrors);
213218
if (failedCount > 0) {
214219
logger.warn('Create fleet failed, ScaleError will be thrown to trigger retry for ephemeral runners.');

lambdas/functions/control-plane/src/pool/pool.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ export async function adjust(event: PoolEvent): Promise<void> {
4141
const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
4242
? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
4343
: [];
44+
const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS
45+
? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string])
46+
: [];
4447

4548
const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();
4649

@@ -95,6 +98,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
9598
amiIdSsmParameterName,
9699
tracingEnabled,
97100
onDemandFailoverOnError,
101+
customScaleErrors,
98102
},
99103
topUp,
100104
githubInstallationClient,

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = {
105105
subnets: ['subnet-123'],
106106
tracingEnabled: false,
107107
onDemandFailoverOnError: [],
108+
customScaleErrors: [],
108109
};
109110
let expectedRunnerParams: RunnerInputParameters;
110111

@@ -122,6 +123,7 @@ function setDefaults() {
122123
process.env.INSTANCE_TYPES = 'm5.large';
123124
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
124125
process.env.ENABLE_ON_DEMAND_FAILOVER = undefined;
126+
delete process.env.CUSTOM_SCALE_ERRORS;
125127
}
126128

127129
beforeEach(() => {
@@ -809,6 +811,16 @@ describe('scaleUp with public GH', () => {
809811
});
810812
});
811813

814+
it('creates a runner with correct config and labels and custom scale errors enabled.', async () => {
815+
process.env.RUNNER_LABELS = 'label1,label2';
816+
process.env.CUSTOM_SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
817+
await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
818+
expect(createRunner).toBeCalledWith({
819+
...expectedRunnerParams,
820+
customScaleErrors: ['RequestLimitExceeded'],
821+
});
822+
});
823+
812824
it('creates a runner and ensure the group argument is ignored', async () => {
813825
process.env.RUNNER_LABELS = 'label1,label2';
814826
process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED';

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ interface CreateEC2RunnerConfig {
6262
amiIdSsmParameterName?: string;
6363
tracingEnabled?: boolean;
6464
onDemandFailoverOnError?: string[];
65+
customScaleErrors?: string[];
6566
}
6667

6768
function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) {
@@ -255,6 +256,9 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
255256
const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
256257
? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
257258
: [];
259+
const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS
260+
? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string])
261+
: [];
258262

259263
const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();
260264

@@ -431,6 +435,7 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
431435
amiIdSsmParameterName,
432436
tracingEnabled,
433437
onDemandFailoverOnError,
438+
customScaleErrors,
434439
},
435440
newRunners,
436441
githubInstallationClient,

main.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ module "runners" {
187187
enable_jit_config = var.enable_jit_config
188188
enable_job_queued_check = var.enable_job_queued_check
189189
enable_on_demand_failover_for_errors = var.enable_runner_on_demand_failover_for_errors
190+
custom_scale_errors = var.custom_scale_errors
190191
disable_runner_autoupdate = var.disable_runner_autoupdate
191192
enable_managed_runner_security_group = var.enable_managed_runner_security_group
192193
enable_runner_detailed_monitoring = var.enable_runner_detailed_monitoring

modules/multi-runner/runners.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ module "runners" {
3232
github_app_parameters = local.github_app_parameters
3333
ebs_optimized = each.value.runner_config.ebs_optimized
3434
enable_on_demand_failover_for_errors = each.value.runner_config.enable_on_demand_failover_for_errors
35+
custom_scale_errors = each.value.runner_config.custom_scale_errors
3536
enable_organization_runners = each.value.runner_config.enable_organization_runners
3637
enable_ephemeral_runners = each.value.runner_config.enable_ephemeral_runners
3738
enable_jit_config = each.value.runner_config.enable_jit_config

modules/multi-runner/variables.tf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ variable "multi_runner_config" {
7979
enable_ephemeral_runners = optional(bool, false)
8080
enable_job_queued_check = optional(bool, null)
8181
enable_on_demand_failover_for_errors = optional(list(string), [])
82+
custom_scale_errors = optional(list(string), [])
8283
enable_organization_runners = optional(bool, false)
8384
enable_runner_binaries_syncer = optional(bool, true)
8485
enable_ssm_on_runners = optional(bool, false)
@@ -197,6 +198,7 @@ variable "multi_runner_config" {
197198
enable_ephemeral_runners: "Enable ephemeral runners, runners will only be used once."
198199
enable_job_queued_check: "Enables JIT configuration for creating runners instead of registration token based registration. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners and can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners."
199200
enable_on_demand_failover_for_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later."
201+
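custom_scale_errors: "List of custom AWS error codes that should trigger a retry during scale-up. When the list is non-empty it replaces the built-in default list of scale errors; when empty, the defaults are used."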
200202
enable_organization_runners: "Register runners to organization, instead of repo level"
201203
enable_runner_binaries_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI."
202204
enable_ssm_on_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances."

modules/runners/pool.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ module "pool" {
4242
ephemeral = var.enable_ephemeral_runners
4343
enable_jit_config = var.enable_jit_config
4444
enable_on_demand_failover_for_errors = var.enable_on_demand_failover_for_errors
45+
custom_scale_errors = var.custom_scale_errors
4546
boot_time_in_minutes = var.runner_boot_time_in_minutes
4647
labels = var.runner_labels
4748
launch_template = aws_launch_template.runner

0 commit comments

Comments
 (0)