Skip to content

Commit 43119b1

Browse files
feat: add support to use custom scale errors
1 parent e4b12ae commit 43119b1

13 files changed

Lines changed: 57 additions & 51 deletions

File tree

lambdas/functions/control-plane/src/aws/runners.d.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,5 @@ export interface RunnerInputParameters {
4545
amiIdSsmParameterName?: string;
4646
tracingEnabled?: boolean;
4747
onDemandFailoverOnError?: string[];
48-
scaleErrors: string[];
48+
customScaleErrors?: string[];
4949
}

lambdas/functions/control-plane/src/aws/runners.test.ts

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,17 @@ describe('create runner with errors', () => {
461461
expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
462462
});
463463

464+
it('test ScaleError with custom scale error.', async () => {
465+
createFleetMockWithErrors(['CustomAWSError']);
466+
467+
await expect(createRunner(createRunnerConfig({ ...defaultRunnerConfig, customScaleErrors: ['CustomAWSError'] }))).rejects.toBeInstanceOf(ScaleError);
468+
expect(mockEC2Client).toHaveReceivedCommandWith(
469+
CreateFleetCommand,
470+
expectedCreateFleetRequest(defaultExpectedFleetRequestValues),
471+
);
472+
expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
473+
});
474+
464475
it('test ScaleError with multiple error.', async () => {
465476
createFleetMockWithErrors(['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'NotMappedError']);
466477

@@ -703,7 +714,7 @@ interface RunnerConfig {
703714
amiIdSsmParameterName?: string;
704715
tracingEnabled?: boolean;
705716
onDemandFailoverOnError?: string[];
706-
scaleErrors: string[];
717+
customScaleErrors?: string[];
707718
}
708719

709720
function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -723,7 +734,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
723734
amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
724735
tracingEnabled: runnerConfig.tracingEnabled,
725736
onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
726-
scaleErrors: runnerConfig.scaleErrors,
737+
customScaleErrors: runnerConfig.customScaleErrors,
727738
};
728739
}
729740

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,8 +197,24 @@ async function processFleetResult(
197197
return instances;
198198
}
199199

200-
const scaleErrors = runnerParameters.scaleErrors;
200+
// Educated guess of errors that would make sense to retry based on the list
201+
// https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
202+
const defaultScaleErrors = [
203+
'UnfulfillableCapacity',
204+
'MaxSpotInstanceCountExceeded',
205+
'TargetCapacityLimitExceededException',
206+
'RequestLimitExceeded',
207+
'ResourceLimitExceeded',
208+
'InsufficientCapacityOnHost',
209+
'MaxSpotFleetRequestCountExceeded',
210+
'InsufficientInstanceCapacity',
211+
];
201212

213+
const scaleErrors =
214+
runnerParameters.customScaleErrors && runnerParameters.customScaleErrors.length > 0
215+
? runnerParameters.customScaleErrors
216+
: defaultScaleErrors;
217+
202218
const failedCount = countScaleErrors(errors, scaleErrors);
203219
if (failedCount > 0) {
204220
logger.warn('Create fleet failed, ScaleError will be thrown to trigger retry for ephemeral runners.');

lambdas/functions/control-plane/src/pool/pool.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
102102
amiIdSsmParameterName,
103103
tracingEnabled,
104104
onDemandFailoverOnError,
105-
scaleErrors,
105+
customScaleErrors,
106106
},
107107
topUp,
108108
githubInstallationClient,

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = {
112112
subnets: ['subnet-123'],
113113
tracingEnabled: false,
114114
onDemandFailoverOnError: [],
115-
scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'TargetCapacityLimitExceededException'],
115+
customScaleErrors: [],
116116
};
117117
let expectedRunnerParams: RunnerInputParameters;
118118

@@ -130,8 +130,7 @@ function setDefaults() {
130130
process.env.INSTANCE_TYPES = 'm5.large';
131131
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
132132
process.env.ENABLE_ON_DEMAND_FAILOVER = undefined;
133-
process.env.SCALE_ERRORS =
134-
'["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]';
133+
delete process.env.CUSTOM_SCALE_ERRORS;
135134
}
136135

137136
beforeEach(() => {
@@ -987,11 +986,11 @@ describe('scaleUp with public GH', () => {
987986

988987
it('creates a runner with correct config and labels and custom scale errors enabled.', async () => {
989988
process.env.RUNNER_LABELS = 'label1,label2';
990-
process.env.SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
991-
await scaleUpModule.scaleUp(TEST_DATA);
989+
process.env.CUSTOM_SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
990+
await scaleUpModule.scaleUp(TEST_DATA);
992991
expect(createRunner).toBeCalledWith({
993992
...expectedRunnerParams,
994-
scaleErrors: ['RequestLimitExceeded'],
993+
customScaleErrors: ['RequestLimitExceeded'],
995994
});
996995
});
997996

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ interface CreateEC2RunnerConfig {
6464
amiIdSsmParameterName?: string;
6565
tracingEnabled?: boolean;
6666
onDemandFailoverOnError?: string[];
67-
scaleErrors: string[];
67+
customScaleErrors?: string[];
6868
}
6969

7070
function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) {
@@ -503,7 +503,7 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
503503
amiIdSsmParameterName,
504504
tracingEnabled,
505505
onDemandFailoverOnError,
506-
scaleErrors,
506+
customScaleErrors,
507507
},
508508
newRunners,
509509
githubInstallationClient,

main.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ module "runners" {
188188
enable_jit_config = var.enable_jit_config
189189
enable_job_queued_check = var.enable_job_queued_check
190190
enable_on_demand_failover_for_errors = var.enable_runner_on_demand_failover_for_errors
191-
scale_errors = var.scale_errors
191+
custom_scale_errors = var.custom_scale_errors
192192
disable_runner_autoupdate = var.disable_runner_autoupdate
193193
enable_managed_runner_security_group = var.enable_managed_runner_security_group
194194
enable_runner_detailed_monitoring = var.enable_runner_detailed_monitoring

modules/multi-runner/runners.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ module "runners" {
3232
github_app_parameters = local.github_app_parameters
3333
ebs_optimized = each.value.runner_config.ebs_optimized
3434
enable_on_demand_failover_for_errors = each.value.runner_config.enable_on_demand_failover_for_errors
35-
scale_errors = each.value.runner_config.scale_errors
35+
custom_scale_errors = each.value.runner_config.custom_scale_errors
3636
enable_organization_runners = each.value.runner_config.enable_organization_runners
3737
enable_ephemeral_runners = each.value.runner_config.enable_ephemeral_runners
3838
enable_jit_config = each.value.runner_config.enable_jit_config

modules/multi-runner/variables.tf

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -71,25 +71,15 @@ variable "multi_runner_config" {
7171
id_ssm_parameter_arn = optional(string, null)
7272
kms_key_arn = optional(string, null)
7373
}), null)
74-
create_service_linked_role_spot = optional(bool, false)
75-
credit_specification = optional(string, null)
76-
delay_webhook_event = optional(number, 30)
77-
disable_runner_autoupdate = optional(bool, false)
78-
ebs_optimized = optional(bool, false)
79-
enable_ephemeral_runners = optional(bool, false)
80-
enable_job_queued_check = optional(bool, null)
81-
enable_on_demand_failover_for_errors = optional(list(string), [])
82-
scale_errors = optional(list(string), [
83-
"UnfulfillableCapacity",
84-
"MaxSpotInstanceCountExceeded",
85-
"TargetCapacityLimitExceededException",
86-
"RequestLimitExceeded",
87-
"ResourceLimitExceeded",
88-
"MaxSpotInstanceCountExceeded",
89-
"MaxSpotFleetRequestCountExceeded",
90-
"InsufficientInstanceCapacity",
91-
"InsufficientCapacityOnHost",
92-
])
74+
create_service_linked_role_spot = optional(bool, false)
75+
credit_specification = optional(string, null)
76+
delay_webhook_event = optional(number, 30)
77+
disable_runner_autoupdate = optional(bool, false)
78+
ebs_optimized = optional(bool, false)
79+
enable_ephemeral_runners = optional(bool, false)
80+
enable_job_queued_check = optional(bool, null)
81+
enable_on_demand_failover_for_errors = optional(list(string), [])
82+
custom_scale_errors = optional(list(string), [])
9383
enable_organization_runners = optional(bool, false)
9484
enable_runner_binaries_syncer = optional(bool, true)
9585
enable_ssm_on_runners = optional(bool, false)
@@ -209,7 +199,7 @@ variable "multi_runner_config" {
209199
enable_ephemeral_runners: "Enable ephemeral runners, runners will only be used once."
210200
enable_job_queued_check: "Enables JIT configuration for creating runners instead of registration token based registration. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners and can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners."
211201
enable_on_demand_failover_for_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later."
212-
scale_errors: "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
202+
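custom_scale_errors: "List of custom AWS error codes that should trigger a retry during scale up. When non-empty, this list replaces the default errors defined in the variable `defaultScaleErrors` in lambdas/functions/control-plane/src/aws/runners.ts; when empty, the defaults are used."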
213203
enable_organization_runners: "Register runners to organization, instead of repo level"
214204
enable_runner_binaries_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI."
215205
enable_ssm_on_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances."

modules/runners/pool.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ module "pool" {
4444
ephemeral = var.enable_ephemeral_runners
4545
enable_jit_config = var.enable_jit_config
4646
enable_on_demand_failover_for_errors = var.enable_on_demand_failover_for_errors
47-
scale_errors = var.scale_errors
47+
custom_scale_errors = var.custom_scale_errors
4848
boot_time_in_minutes = var.runner_boot_time_in_minutes
4949
labels = var.runner_labels
5050
launch_template = aws_launch_template.runner

0 commit comments

Comments
 (0)