Skip to content

Commit bfcc988

Browse files
feat: add support to use custom scale errors
1 parent 021354d commit bfcc988

15 files changed

Lines changed: 71 additions & 54 deletions

File tree

lambdas/functions/control-plane/src/aws/runners.d.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,5 @@ export interface RunnerInputParameters {
4545
amiIdSsmParameterName?: string;
4646
tracingEnabled?: boolean;
4747
onDemandFailoverOnError?: string[];
48-
scaleErrors: string[];
48+
customScaleErrors?: string[];
4949
}

lambdas/functions/control-plane/src/aws/runners.test.ts

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,17 @@ describe('create runner with errors', () => {
461461
expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
462462
});
463463

464+
it('test ScaleError with custom scale error.', async () => {
465+
createFleetMockWithErrors(['CustomAWSError']);
466+
467+
await expect(createRunner(createRunnerConfig({ ...defaultRunnerConfig, customScaleErrors: ['CustomAWSError'] }))).rejects.toBeInstanceOf(ScaleError);
468+
expect(mockEC2Client).toHaveReceivedCommandWith(
469+
CreateFleetCommand,
470+
expectedCreateFleetRequest(defaultExpectedFleetRequestValues),
471+
);
472+
expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
473+
});
474+
464475
it('test ScaleError with multiple error.', async () => {
465476
createFleetMockWithErrors(['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'NotMappedError']);
466477

@@ -703,7 +714,7 @@ interface RunnerConfig {
703714
amiIdSsmParameterName?: string;
704715
tracingEnabled?: boolean;
705716
onDemandFailoverOnError?: string[];
706-
scaleErrors: string[];
717+
customScaleErrors?: string[];
707718
}
708719

709720
function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -723,7 +734,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
723734
amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
724735
tracingEnabled: runnerConfig.tracingEnabled,
725736
onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
726-
scaleErrors: runnerConfig.scaleErrors,
737+
customScaleErrors: runnerConfig.customScaleErrors,
727738
};
728739
}
729740

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,8 +197,24 @@ async function processFleetResult(
197197
return instances;
198198
}
199199

200-
const scaleErrors = runnerParameters.scaleErrors;
200+
// Educated guess of errors that would make sense to retry based on the list
201+
// https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
202+
const defaultScaleErrors = [
203+
'UnfulfillableCapacity',
204+
'MaxSpotInstanceCountExceeded',
205+
'TargetCapacityLimitExceededException',
206+
'RequestLimitExceeded',
207+
'ResourceLimitExceeded',
208+
'MaxSpotInstanceCountExceeded',
209+
'MaxSpotFleetRequestCountExceeded',
210+
'InsufficientInstanceCapacity',
211+
];
201212

213+
const scaleErrors =
214+
runnerParameters.customScaleErrors && runnerParameters.customScaleErrors.length > 0
215+
? runnerParameters.customScaleErrors
216+
: defaultScaleErrors;
217+
202218
const failedCount = countScaleErrors(errors, scaleErrors);
203219
if (failedCount > 0) {
204220
logger.warn('Create fleet failed, ScaleError will be thrown to trigger retry for ephemeral runners.');

lambdas/functions/control-plane/src/pool/pool.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,9 @@ export async function adjust(event: PoolEvent): Promise<void> {
4141
const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
4242
? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
4343
: [];
44-
const scaleErrors = JSON.parse(process.env.SCALE_ERRORS) as [string];
44+
const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS
45+
? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string])
46+
: [];
4547

4648
const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();
4749

@@ -96,7 +98,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
9698
amiIdSsmParameterName,
9799
tracingEnabled,
98100
onDemandFailoverOnError,
99-
scaleErrors,
101+
customScaleErrors
100102
},
101103
topUp,
102104
githubInstallationClient,

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = {
105105
subnets: ['subnet-123'],
106106
tracingEnabled: false,
107107
onDemandFailoverOnError: [],
108-
scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'TargetCapacityLimitExceededException'],
108+
customScaleErrors: [],
109109
};
110110
let expectedRunnerParams: RunnerInputParameters;
111111

@@ -123,8 +123,7 @@ function setDefaults() {
123123
process.env.INSTANCE_TYPES = 'm5.large';
124124
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
125125
process.env.ENABLE_ON_DEMAND_FAILOVER = undefined;
126-
process.env.SCALE_ERRORS =
127-
'["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]';
126+
process.env.CUSTOM_SCALE_ERRORS = undefined;
128127
}
129128

130129
beforeEach(() => {
@@ -814,11 +813,11 @@ describe('scaleUp with public GH', () => {
814813

815814
it('creates a runner with correct config and labels and custom scale errors enabled.', async () => {
816815
process.env.RUNNER_LABELS = 'label1,label2';
817-
process.env.SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
818-
await scaleUpModule.scaleUp(TEST_DATA);
816+
process.env.CUSTOM_SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
817+
await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
819818
expect(createRunner).toBeCalledWith({
820819
...expectedRunnerParams,
821-
scaleErrors: ['RequestLimitExceeded'],
820+
customScaleErrors: ['RequestLimitExceeded'],
822821
});
823822
});
824823

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ interface CreateEC2RunnerConfig {
6262
amiIdSsmParameterName?: string;
6363
tracingEnabled?: boolean;
6464
onDemandFailoverOnError?: string[];
65-
scaleErrors: string[];
65+
customScaleErrors?: string[];
6666
}
6767

6868
function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) {
@@ -256,7 +256,9 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
256256
const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
257257
? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
258258
: [];
259-
const scaleErrors = JSON.parse(process.env.SCALE_ERRORS) as [string];
259+
const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS
260+
? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string])
261+
: [];
260262

261263
const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();
262264

@@ -433,7 +435,7 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
433435
amiIdSsmParameterName,
434436
tracingEnabled,
435437
onDemandFailoverOnError,
436-
scaleErrors,
438+
customScaleErrors,
437439
},
438440
newRunners,
439441
githubInstallationClient,

main.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ module "runners" {
187187
enable_jit_config = var.enable_jit_config
188188
enable_job_queued_check = var.enable_job_queued_check
189189
enable_on_demand_failover_for_errors = var.enable_runner_on_demand_failover_for_errors
190-
scale_errors = var.scale_errors
190+
custom_scale_errors                  = var.custom_scale_errors
191191
disable_runner_autoupdate = var.disable_runner_autoupdate
192192
enable_managed_runner_security_group = var.enable_managed_runner_security_group
193193
enable_runner_detailed_monitoring = var.enable_runner_detailed_monitoring

modules/multi-runner/runners.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ module "runners" {
3232
github_app_parameters = local.github_app_parameters
3333
ebs_optimized = each.value.runner_config.ebs_optimized
3434
enable_on_demand_failover_for_errors = each.value.runner_config.enable_on_demand_failover_for_errors
35-
scale_errors = each.value.runner_config.scale_errors
35+
custom_scale_errors = each.value.runner_config.custom_scale_errors
3636
enable_organization_runners = each.value.runner_config.enable_organization_runners
3737
enable_ephemeral_runners = each.value.runner_config.enable_ephemeral_runners
3838
enable_jit_config = each.value.runner_config.enable_jit_config

modules/multi-runner/variables.tf

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -71,25 +71,15 @@ variable "multi_runner_config" {
7171
id_ssm_parameter_arn = optional(string, null)
7272
kms_key_arn = optional(string, null)
7373
}), null)
74-
create_service_linked_role_spot = optional(bool, false)
75-
credit_specification = optional(string, null)
76-
delay_webhook_event = optional(number, 30)
77-
disable_runner_autoupdate = optional(bool, false)
78-
ebs_optimized = optional(bool, false)
79-
enable_ephemeral_runners = optional(bool, false)
80-
enable_job_queued_check = optional(bool, null)
81-
enable_on_demand_failover_for_errors = optional(list(string), [])
82-
scale_errors = optional(list(string), [
83-
"UnfulfillableCapacity",
84-
"MaxSpotInstanceCountExceeded",
85-
"TargetCapacityLimitExceededException",
86-
"RequestLimitExceeded",
87-
"ResourceLimitExceeded",
88-
"MaxSpotInstanceCountExceeded",
89-
"MaxSpotFleetRequestCountExceeded",
90-
"InsufficientInstanceCapacity",
91-
"InsufficientCapacityOnHost",
92-
])
74+
create_service_linked_role_spot = optional(bool, false)
75+
credit_specification = optional(string, null)
76+
delay_webhook_event = optional(number, 30)
77+
disable_runner_autoupdate = optional(bool, false)
78+
ebs_optimized = optional(bool, false)
79+
enable_ephemeral_runners = optional(bool, false)
80+
enable_job_queued_check = optional(bool, null)
81+
enable_on_demand_failover_for_errors = optional(list(string), [])
82+
custom_scale_errors = optional(list(string), [])
9383
enable_organization_runners = optional(bool, false)
9484
enable_runner_binaries_syncer = optional(bool, true)
9585
enable_ssm_on_runners = optional(bool, false)
@@ -208,7 +198,7 @@ variable "multi_runner_config" {
208198
enable_ephemeral_runners: "Enable ephemeral runners, runners will only be used once."
209199
enable_job_queued_check: "Enables JIT configuration for creating runners instead of registration token based registration. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners and can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners."
210200
enable_on_demand_failover_for_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later."
211-
scale_errors: "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
201+
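custom_scale_errors: "List of custom AWS error codes that should trigger a retry during scale up. When the list is non-empty it replaces the built-in default error list; when empty, the defaults are used."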
212202
enable_organization_runners: "Register runners to organization, instead of repo level"
213203
enable_runner_binaries_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-built AMI."
214204
enable_ssm_on_runners: "Enable to allow access to the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances."

modules/runners/pool.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ module "pool" {
4242
ephemeral = var.enable_ephemeral_runners
4343
enable_jit_config = var.enable_jit_config
4444
enable_on_demand_failover_for_errors = var.enable_on_demand_failover_for_errors
45-
scale_errors = var.scale_errors
45+
custom_scale_errors = var.custom_scale_errors
4646
boot_time_in_minutes = var.runner_boot_time_in_minutes
4747
labels = var.runner_labels
4848
launch_template = aws_launch_template.runner

0 commit comments

Comments
 (0)