Skip to content

Commit 26814ea

Browse files
authored
Merge branch 'main' into fix-placement-group-host-resource-group-arn-type
2 parents f2cadfc + 9843662 commit 26814ea

28 files changed

Lines changed: 934 additions & 868 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ Join our discord community via [this invite link](https://discord.gg/bxgXW8jJGh)
215215
| <a name="input_runners_scale_up_lambda_timeout"></a> [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no |
216216
| <a name="input_runners_ssm_housekeeper"></a> [runners\_ssm\_housekeeper](#input\_runners\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.<br/><br/> `schedule_expression`: is used to configure the schedule for the lambda.<br/> `enabled`: enable or disable the lambda trigger via the EventBridge.<br/> `lambda_memory_size`: lambda memory size limit.<br/> `lambda_timeout`: timeout for the lambda in seconds.<br/> `config`: configuration for the lambda function. Token path will be read by default from the module. | <pre>object({<br/> schedule_expression = optional(string, "rate(1 day)")<br/> enabled = optional(bool, true)<br/> lambda_memory_size = optional(number, 512)<br/> lambda_timeout = optional(number, 60)<br/> config = object({<br/> tokenPath = optional(string)<br/> minimumDaysOld = optional(number, 1)<br/> dryRun = optional(bool, false)<br/> })<br/> })</pre> | <pre>{<br/> "config": {}<br/>}</pre> | no |
217217
| <a name="input_scale_down_schedule_expression"></a> [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no |
218+
| <a name="input_scale_errors"></a> [scale\_errors](#input\_scale\_errors) | List of AWS error codes that should trigger a retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts | `list(string)` | <pre>[<br/> "UnfulfillableCapacity",<br/> "MaxSpotInstanceCountExceeded",<br/> "TargetCapacityLimitExceededException",<br/> "RequestLimitExceeded",<br/> "ResourceLimitExceeded",<br/> "MaxSpotInstanceCountExceeded",<br/> "MaxSpotFleetRequestCountExceeded",<br/> "InsufficientInstanceCapacity",<br/> "InsufficientCapacityOnHost"<br/>]</pre> | no |
218219
| <a name="input_scale_up_reserved_concurrent_executions"></a> [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no |
219220
| <a name="input_ssm_paths"></a> [ssm\_paths](#input\_ssm\_paths) | The root path used in SSM to store configuration and secrets. | <pre>object({<br/> root = optional(string, "github-action-runners")<br/> app = optional(string, "app")<br/> runners = optional(string, "runners")<br/> webhook = optional(string, "webhook")<br/> use_prefix = optional(bool, true)<br/> })</pre> | `{}` | no |
220221
| <a name="input_state_event_rule_binaries_syncer"></a> [state\_event\_rule\_binaries\_syncer](#input\_state\_event\_rule\_binaries\_syncer) | Option to disable EventBridge Lambda trigger for the binary syncer, useful to stop automatic updates of binary distribution | `string` | `"ENABLED"` | no |

lambdas/functions/ami-housekeeper/package.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"all": "yarn build && yarn format && yarn lint && yarn test"
1818
},
1919
"devDependencies": {
20-
"@aws-sdk/types": "^3.936.0",
20+
"@aws-sdk/types": "^3.953.0",
2121
"@types/aws-lambda": "^8.10.155",
2222
"@vercel/ncc": "^0.38.4",
2323
"aws-sdk-client-mock": "^4.1.0",
@@ -26,8 +26,8 @@
2626
"dependencies": {
2727
"@aws-github-runner/aws-powertools-util": "*",
2828
"@aws-github-runner/aws-ssm-util": "*",
29-
"@aws-sdk/client-ec2": "^3.948.0",
30-
"@aws-sdk/client-ssm": "^3.948.0",
29+
"@aws-sdk/client-ec2": "^3.953.0",
30+
"@aws-sdk/client-ssm": "^3.953.0",
3131
"cron-parser": "^5.4.0"
3232
},
3333
"nx": {

lambdas/functions/control-plane/package.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"all": "yarn build && yarn format && yarn lint && yarn test"
1818
},
1919
"devDependencies": {
20-
"@aws-sdk/types": "^3.936.0",
20+
"@aws-sdk/types": "^3.953.0",
2121
"@octokit/types": "^16.0.0",
2222
"@types/aws-lambda": "^8.10.155",
2323
"@types/node": "^22.19.0",
@@ -33,8 +33,8 @@
3333
"@aws-github-runner/aws-powertools-util": "*",
3434
"@aws-github-runner/aws-ssm-util": "*",
3535
"@aws-lambda-powertools/parameters": "^2.29.0",
36-
"@aws-sdk/client-ec2": "^3.948.0",
37-
"@aws-sdk/client-sqs": "^3.948.0",
36+
"@aws-sdk/client-ec2": "^3.953.0",
37+
"@aws-sdk/client-sqs": "^3.953.0",
3838
"@middy/core": "^6.4.5",
3939
"@octokit/auth-app": "8.1.2",
4040
"@octokit/core": "7.0.6",

lambdas/functions/control-plane/src/aws/runners.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,5 @@ export interface RunnerInputParameters {
4444
amiIdSsmParameterName?: string;
4545
tracingEnabled?: boolean;
4646
onDemandFailoverOnError?: string[];
47+
scaleErrors: string[];
4748
}

lambdas/functions/control-plane/src/aws/runners.test.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,7 @@ describe('create runner with errors', () => {
429429
allocationStrategy: SpotAllocationStrategy.CAPACITY_OPTIMIZED,
430430
capacityType: 'spot',
431431
type: 'Repo',
432+
scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded'],
432433
};
433434
const defaultExpectedFleetRequestValues: ExpectedFleetRequestValues = {
434435
type: 'Repo',
@@ -699,6 +700,7 @@ interface RunnerConfig {
699700
amiIdSsmParameterName?: string;
700701
tracingEnabled?: boolean;
701702
onDemandFailoverOnError?: string[];
703+
scaleErrors: string[];
702704
}
703705

704706
function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -718,6 +720,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
718720
amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
719721
tracingEnabled: runnerConfig.tracingEnabled,
720722
onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
723+
scaleErrors: runnerConfig.scaleErrors,
721724
};
722725
}
723726

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -196,18 +196,7 @@ async function processFleetResult(
196196
return instances;
197197
}
198198

199-
// Educated guess of errors that would make sense to retry based on the list
200-
// https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
201-
const scaleErrors = [
202-
'UnfulfillableCapacity',
203-
'MaxSpotInstanceCountExceeded',
204-
'TargetCapacityLimitExceededException',
205-
'RequestLimitExceeded',
206-
'ResourceLimitExceeded',
207-
'MaxSpotInstanceCountExceeded',
208-
'MaxSpotFleetRequestCountExceeded',
209-
'InsufficientInstanceCapacity',
210-
];
199+
const scaleErrors = runnerParameters.scaleErrors;
211200

212201
const failedCount = countScaleErrors(errors, scaleErrors);
213202
if (failedCount > 0) {

lambdas/functions/control-plane/src/modules.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ declare namespace NodeJS {
33
AWS_REGION: string;
44
ENABLE_METRIC_GITHUB_APP_RATE_LIMIT: string;
55
ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS: string;
6+
SCALE_ERRORS: string;
67
ENVIRONMENT: string;
78
GHES_URL: string;
89
JOB_RETRY_CONFIG: string;

lambdas/functions/control-plane/src/pool/pool.test.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,8 @@ beforeEach(() => {
140140
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
141141
process.env.RUNNER_OWNER = ORG;
142142
process.env.RUNNER_BOOT_TIME_IN_MINUTES = MINIMUM_TIME_RUNNING.toString();
143+
process.env.SCALE_ERRORS =
144+
'["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]';
143145

144146
const mockTokenReturnValue = {
145147
data: {

lambdas/functions/control-plane/src/pool/pool.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
4141
const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
4242
? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
4343
: [];
44+
const scaleErrors = JSON.parse(process.env.SCALE_ERRORS) as [string];
4445

4546
const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();
4647

@@ -95,6 +96,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
9596
amiIdSsmParameterName,
9697
tracingEnabled,
9798
onDemandFailoverOnError,
99+
scaleErrors,
98100
},
99101
topUp,
100102
githubInstallationClient,

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = {
105105
subnets: ['subnet-123'],
106106
tracingEnabled: false,
107107
onDemandFailoverOnError: [],
108+
scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'TargetCapacityLimitExceededException'],
108109
};
109110
let expectedRunnerParams: RunnerInputParameters;
110111

@@ -122,6 +123,8 @@ function setDefaults() {
122123
process.env.INSTANCE_TYPES = 'm5.large';
123124
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
124125
process.env.ENABLE_ON_DEMAND_FAILOVER = undefined;
126+
process.env.SCALE_ERRORS =
127+
'["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]';
125128
}
126129

127130
beforeEach(() => {
@@ -809,6 +812,16 @@ describe('scaleUp with public GH', () => {
809812
});
810813
});
811814

815+
it('creates a runner with correct config and labels and custom scale errors enabled.', async () => {
816+
process.env.RUNNER_LABELS = 'label1,label2';
817+
process.env.SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
818+
await scaleUpModule.scaleUp(TEST_DATA);
819+
expect(createRunner).toBeCalledWith({
820+
...expectedRunnerParams,
821+
scaleErrors: ['RequestLimitExceeded'],
822+
});
823+
});
824+
812825
it('creates a runner and ensure the group argument is ignored', async () => {
813826
process.env.RUNNER_LABELS = 'label1,label2';
814827
process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED';

0 commit comments

Comments
 (0)