feat: add support to use custom scale errors

edersonbrilhante · edersonbrilhante · commit bfcc98865e79 · 2026-01-13T19:04:46.000+01:00
diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts
@@ -45,5 +45,5 @@ export interface RunnerInputParameters {
   amiIdSsmParameterName?: string;
   tracingEnabled?: boolean;
   onDemandFailoverOnError?: string[];
-  scaleErrors: string[];
+  customScaleErrors?: string[];
 }
diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts
@@ -461,6 +461,17 @@ describe('create runner with errors', () => {
     expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
   });
 
+  it('test ScaleError with custom scale error.', async () => {
+    createFleetMockWithErrors(['CustomAWSError']);
+    
+    await expect(createRunner(createRunnerConfig({ ...defaultRunnerConfig, customScaleErrors: ['CustomAWSError'] }))).rejects.toBeInstanceOf(ScaleError);
+    expect(mockEC2Client).toHaveReceivedCommandWith(
+      CreateFleetCommand,
+      expectedCreateFleetRequest(defaultExpectedFleetRequestValues),
+    );
+    expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
+  });
+
   it('test ScaleError with multiple error.', async () => {
     createFleetMockWithErrors(['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'NotMappedError']);
 
@@ -703,7 +714,7 @@ interface RunnerConfig {
   amiIdSsmParameterName?: string;
   tracingEnabled?: boolean;
   onDemandFailoverOnError?: string[];
-  scaleErrors: string[];
+  customScaleErrors?: string[];
 }
 
 function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -723,7 +734,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
     amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
     tracingEnabled: runnerConfig.tracingEnabled,
     onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
-    scaleErrors: runnerConfig.scaleErrors,
+    customScaleErrors: runnerConfig.customScaleErrors,
   };
 }
 
diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts
@@ -197,8 +197,24 @@ async function processFleetResult(
     return instances;
   }
 
-  const scaleErrors = runnerParameters.scaleErrors;
+  // Educated guess of errors that would make sense to retry based on the list
+  // https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
+  const defaultScaleErrors = [
+    'UnfulfillableCapacity',
+    'MaxSpotInstanceCountExceeded',
+    'TargetCapacityLimitExceededException',
+    'RequestLimitExceeded',
+    'ResourceLimitExceeded',
+    'InsufficientCapacityOnHost',
+    'MaxSpotFleetRequestCountExceeded',
+    'InsufficientInstanceCapacity',
+  ];
 
+  const scaleErrors =
+    runnerParameters.customScaleErrors && runnerParameters.customScaleErrors.length > 0
+      ? runnerParameters.customScaleErrors
+      : defaultScaleErrors;
+
   const failedCount = countScaleErrors(errors, scaleErrors);
   if (failedCount > 0) {
     logger.warn('Create fleet failed, ScaleError will be thrown to trigger retry for ephemeral runners.');
diff --git a/lambdas/functions/control-plane/src/pool/pool.ts b/lambdas/functions/control-plane/src/pool/pool.ts
@@ -41,7 +41,9 @@ export async function adjust(event: PoolEvent): Promise<void> {
   const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
     ? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
     : [];
-  const scaleErrors = JSON.parse(process.env.SCALE_ERRORS) as [string];
+  const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS
+    ? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string])
+    : [];
 
   const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();
 
@@ -96,7 +98,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
         amiIdSsmParameterName,
         tracingEnabled,
         onDemandFailoverOnError,
-        scaleErrors,
+        customScaleErrors,
       },
       topUp,
       githubInstallationClient,
diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts
@@ -105,7 +105,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = {
   subnets: ['subnet-123'],
   tracingEnabled: false,
   onDemandFailoverOnError: [],
-  scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'TargetCapacityLimitExceededException'],
+  customScaleErrors: [],
 };
 let expectedRunnerParams: RunnerInputParameters;
 
@@ -123,8 +123,7 @@ function setDefaults() {
   process.env.INSTANCE_TYPES = 'm5.large';
   process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
   process.env.ENABLE_ON_DEMAND_FAILOVER = undefined;
-  process.env.SCALE_ERRORS =
-    '["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]';
+  delete process.env.CUSTOM_SCALE_ERRORS;
 }
 
 beforeEach(() => {
@@ -814,11 +813,11 @@ describe('scaleUp with public GH', () => {
 
     it('creates a runner with correct config and labels and custom scale errors enabled.', async () => {
       process.env.RUNNER_LABELS = 'label1,label2';
-      process.env.SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
+      process.env.CUSTOM_SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
       await scaleUpModule.scaleUp(TEST_DATA);
       expect(createRunner).toBeCalledWith({
         ...expectedRunnerParams,
-        scaleErrors: ['RequestLimitExceeded'],
+        customScaleErrors: ['RequestLimitExceeded'],
       });
     });
 
diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts
@@ -62,7 +62,7 @@ interface CreateEC2RunnerConfig {
   amiIdSsmParameterName?: string;
   tracingEnabled?: boolean;
   onDemandFailoverOnError?: string[];
-  scaleErrors: string[];
+  customScaleErrors?: string[];
 }
 
 function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) {
@@ -256,7 +256,9 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
   const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
     ? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
     : [];
-  const scaleErrors = JSON.parse(process.env.SCALE_ERRORS) as [string];
+  const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS
+    ? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string])
+    : [];
 
   const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();
 
@@ -433,7 +435,7 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
         amiIdSsmParameterName,
         tracingEnabled,
         onDemandFailoverOnError,
-        scaleErrors,
+        customScaleErrors,
       },
       newRunners,
       githubInstallationClient,
diff --git a/main.tf b/main.tf
@@ -187,7 +187,7 @@ module "runners" {
   enable_jit_config                    = var.enable_jit_config
   enable_job_queued_check              = var.enable_job_queued_check
   enable_on_demand_failover_for_errors = var.enable_runner_on_demand_failover_for_errors
-  scale_errors                         = var.scale_errors
+  custom_scale_errors                  = var.custom_scale_errors
   disable_runner_autoupdate            = var.disable_runner_autoupdate
   enable_managed_runner_security_group = var.enable_managed_runner_security_group
   enable_runner_detailed_monitoring    = var.enable_runner_detailed_monitoring
diff --git a/modules/multi-runner/runners.tf b/modules/multi-runner/runners.tf
@@ -32,7 +32,7 @@ module "runners" {
   github_app_parameters                = local.github_app_parameters
   ebs_optimized                        = each.value.runner_config.ebs_optimized
   enable_on_demand_failover_for_errors = each.value.runner_config.enable_on_demand_failover_for_errors
-  scale_errors                         = each.value.runner_config.scale_errors
+  custom_scale_errors                  = each.value.runner_config.custom_scale_errors
   enable_organization_runners          = each.value.runner_config.enable_organization_runners
   enable_ephemeral_runners             = each.value.runner_config.enable_ephemeral_runners
   enable_jit_config                    = each.value.runner_config.enable_jit_config
diff --git a/modules/multi-runner/variables.tf b/modules/multi-runner/variables.tf
@@ -71,25 +71,15 @@ variable "multi_runner_config" {
         id_ssm_parameter_arn = optional(string, null)
         kms_key_arn          = optional(string, null)
       }), null)
-      create_service_linked_role_spot      = optional(bool, false)
-      credit_specification                 = optional(string, null)
-      delay_webhook_event                  = optional(number, 30)
-      disable_runner_autoupdate            = optional(bool, false)
-      ebs_optimized                        = optional(bool, false)
-      enable_ephemeral_runners             = optional(bool, false)
-      enable_job_queued_check              = optional(bool, null)
-      enable_on_demand_failover_for_errors = optional(list(string), [])
-      scale_errors = optional(list(string), [
-        "UnfulfillableCapacity",
-        "MaxSpotInstanceCountExceeded",
-        "TargetCapacityLimitExceededException",
-        "RequestLimitExceeded",
-        "ResourceLimitExceeded",
-        "MaxSpotInstanceCountExceeded",
-        "MaxSpotFleetRequestCountExceeded",
-        "InsufficientInstanceCapacity",
-        "InsufficientCapacityOnHost",
-      ])
+      create_service_linked_role_spot         = optional(bool, false)
+      credit_specification                    = optional(string, null)
+      delay_webhook_event                     = optional(number, 30)
+      disable_runner_autoupdate               = optional(bool, false)
+      ebs_optimized                           = optional(bool, false)
+      enable_ephemeral_runners                = optional(bool, false)
+      enable_job_queued_check                 = optional(bool, null)
+      enable_on_demand_failover_for_errors    = optional(list(string), [])
+      custom_scale_errors                     = optional(list(string), [])
       enable_organization_runners             = optional(bool, false)
       enable_runner_binaries_syncer           = optional(bool, true)
       enable_ssm_on_runners                   = optional(bool, false)
@@ -208,7 +198,7 @@ variable "multi_runner_config" {
         enable_ephemeral_runners: "Enable ephemeral runners, runners will only be used once."
         enable_job_queued_check: "Enables JIT configuration for creating runners instead of registration token based registraton. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners an can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners."
         enable_on_demand_failover_for_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later."
-        scale_errors: "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
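+        custom_scale_errors: "List of custom AWS error codes that should trigger a retry during scale up. When the list is non-empty it replaces the built-in default error codes; when it is empty the defaults are used."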
         enable_organization_runners: "Register runners to organization, instead of repo level"
         enable_runner_binaries_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI."
         enable_ssm_on_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances."
diff --git a/modules/runners/pool.tf b/modules/runners/pool.tf
@@ -42,7 +42,7 @@ module "pool" {
       ephemeral                            = var.enable_ephemeral_runners
       enable_jit_config                    = var.enable_jit_config
       enable_on_demand_failover_for_errors = var.enable_on_demand_failover_for_errors
-      scale_errors                         = var.scale_errors
+      custom_scale_errors                  = var.custom_scale_errors
       boot_time_in_minutes                 = var.runner_boot_time_in_minutes
       labels                               = var.runner_labels
       launch_template                      = aws_launch_template.runner
diff --git a/modules/runners/pool/main.tf b/modules/runners/pool/main.tf
@@ -47,7 +47,7 @@ resource "aws_lambda_function" "pool" {
       POWERTOOLS_TRACER_CAPTURE_HTTPS_REQUESTS = var.tracing_config.capture_http_requests
       POWERTOOLS_TRACER_CAPTURE_ERROR          = var.tracing_config.capture_error
       ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS     = jsonencode(var.config.runner.enable_on_demand_failover_for_errors)
-      SCALE_ERRORS                             = jsonencode(var.config.runner.scale_errors)
+      CUSTOM_SCALE_ERRORS                      = jsonencode(var.config.runner.custom_scale_errors)
     }
   }
 
diff --git a/modules/runners/pool/variables.tf b/modules/runners/pool/variables.tf
@@ -32,7 +32,7 @@ variable "config" {
       ephemeral                            = bool
       enable_jit_config                    = bool
       enable_on_demand_failover_for_errors = list(string)
-      scale_errors                         = list(string)
+      custom_scale_errors                  = list(string)
       boot_time_in_minutes                 = number
       labels                               = list(string)
       launch_template = object({
diff --git a/modules/runners/scale-up.tf b/modules/runners/scale-up.tf
@@ -59,7 +59,7 @@ resource "aws_lambda_function" "scale_up" {
       SSM_CONFIG_PATH                          = "${var.ssm_paths.root}/${var.ssm_paths.config}"
       SUBNET_IDS                               = join(",", var.subnet_ids)
       ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS     = jsonencode(var.enable_on_demand_failover_for_errors)
-      SCALE_ERRORS                             = jsonencode(var.scale_errors)
+      CUSTOM_SCALE_ERRORS                      = jsonencode(var.custom_scale_errors)
       JOB_RETRY_CONFIG                         = jsonencode(local.job_retry_config)
     }
   }
diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf
@@ -701,20 +701,10 @@ variable "enable_on_demand_failover_for_errors" {
   default     = []
 }
 
-variable "scale_errors" {
-  description = "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
+variable "custom_scale_errors" {
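+  description = "List of custom AWS error codes that should trigger a retry during scale up. When the list is non-empty it replaces the built-in default error codes; when it is empty the defaults are used."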
   type        = list(string)
-  default = [
-    "UnfulfillableCapacity",
-    "MaxSpotInstanceCountExceeded",
-    "TargetCapacityLimitExceededException",
-    "RequestLimitExceeded",
-    "ResourceLimitExceeded",
-    "MaxSpotInstanceCountExceeded",
-    "MaxSpotFleetRequestCountExceeded",
-    "InsufficientInstanceCapacity",
-    "InsufficientCapacityOnHost",
-  ]
+  default     = []
 }
 
 variable "lambda_tags" {
diff --git a/variables.tf b/variables.tf
@@ -283,6 +283,7 @@ variable "enable_runner_on_demand_failover_for_errors" {
   default     = []
 }
 
+<<<<<<< HEAD
 variable "scale_errors" {
   description = "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
   type        = list(string)
@@ -297,6 +298,12 @@ variable "scale_errors" {
     "InsufficientInstanceCapacity",
     "InsufficientCapacityOnHost",
   ]
+=======
+variable "custom_scale_errors" {
+  description = "List of custom aws error codes that should trigger retry during scale up."
+  type        = list(string)
+  default     = []
+>>>>>>> eb7236e3 (feat: add support to use custom scale errors)
 }
 
 variable "enable_userdata" {

Original file line number	Diff line number	Diff line change
`@@ -45,5 +45,5 @@ export interface RunnerInputParameters {`
`45`	`45`	`amiIdSsmParameterName?: string;`
`46`	`46`	`tracingEnabled?: boolean;`
`47`	`47`	`onDemandFailoverOnError?: string[];`
`48`		`- scaleErrors: string[];`
	`48`	`+ customScaleErrors?: string[];`
`49`	`49`	`}`