Skip to content

Commit 26814ea

Browse files
authored
Merge branch 'main' into fix-placement-group-host-resource-group-arn-type
2 parents f2cadfc + 9843662 commit 26814ea

28 files changed

Lines changed: 934 additions & 868 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ Join our discord community via [this invite link](https://discord.gg/bxgXW8jJGh)
215215
| <a name="input_runners_scale_up_lambda_timeout"></a> [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no |
216216
| <a name="input_runners_ssm_housekeeper"></a> [runners\_ssm\_housekeeper](#input\_runners\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.<br/><br/> `schedule_expression`: is used to configure the schedule for the lambda.<br/> `enabled`: enable or disable the lambda trigger via the EventBridge.<br/> `lambda_memory_size`: lambda memory size limit.<br/> `lambda_timeout`: timeout for the lambda in seconds.<br/> `config`: configuration for the lambda function. Token path will be read by default from the module. | <pre>object({<br/> schedule_expression = optional(string, "rate(1 day)")<br/> enabled = optional(bool, true)<br/> lambda_memory_size = optional(number, 512)<br/> lambda_timeout = optional(number, 60)<br/> config = object({<br/> tokenPath = optional(string)<br/> minimumDaysOld = optional(number, 1)<br/> dryRun = optional(bool, false)<br/> })<br/> })</pre> | <pre>{<br/> "config": {}<br/>}</pre> | no |
217217
| <a name="input_scale_down_schedule_expression"></a> [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no |
218+
| <a name="input_scale_errors"></a> [scale\_errors](#input\_scale\_errors) | List of AWS error codes that should trigger a retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts | `list(string)` | <pre>[<br/> "UnfulfillableCapacity",<br/> "MaxSpotInstanceCountExceeded",<br/> "TargetCapacityLimitExceededException",<br/> "RequestLimitExceeded",<br/> "ResourceLimitExceeded",<br/> "MaxSpotInstanceCountExceeded",<br/> "MaxSpotFleetRequestCountExceeded",<br/> "InsufficientInstanceCapacity",<br/> "InsufficientCapacityOnHost"<br/>]</pre> | no |
218219
| <a name="input_scale_up_reserved_concurrent_executions"></a> [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no |
219220
| <a name="input_ssm_paths"></a> [ssm\_paths](#input\_ssm\_paths) | The root path used in SSM to store configuration and secrets. | <pre>object({<br/> root = optional(string, "github-action-runners")<br/> app = optional(string, "app")<br/> runners = optional(string, "runners")<br/> webhook = optional(string, "webhook")<br/> use_prefix = optional(bool, true)<br/> })</pre> | `{}` | no |
220221
| <a name="input_state_event_rule_binaries_syncer"></a> [state\_event\_rule\_binaries\_syncer](#input\_state\_event\_rule\_binaries\_syncer) | Option to disable EventBridge Lambda trigger for the binary syncer, useful to stop automatic updates of binary distribution | `string` | `"ENABLED"` | no |

lambdas/functions/ami-housekeeper/package.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"all": "yarn build && yarn format && yarn lint && yarn test"
1818
},
1919
"devDependencies": {
20-
"@aws-sdk/types": "^3.936.0",
20+
"@aws-sdk/types": "^3.953.0",
2121
"@types/aws-lambda": "^8.10.155",
2222
"@vercel/ncc": "^0.38.4",
2323
"aws-sdk-client-mock": "^4.1.0",
@@ -26,8 +26,8 @@
2626
"dependencies": {
2727
"@aws-github-runner/aws-powertools-util": "*",
2828
"@aws-github-runner/aws-ssm-util": "*",
29-
"@aws-sdk/client-ec2": "^3.948.0",
30-
"@aws-sdk/client-ssm": "^3.948.0",
29+
"@aws-sdk/client-ec2": "^3.953.0",
30+
"@aws-sdk/client-ssm": "^3.953.0",
3131
"cron-parser": "^5.4.0"
3232
},
3333
"nx": {

lambdas/functions/control-plane/package.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"all": "yarn build && yarn format && yarn lint && yarn test"
1818
},
1919
"devDependencies": {
20-
"@aws-sdk/types": "^3.936.0",
20+
"@aws-sdk/types": "^3.953.0",
2121
"@octokit/types": "^16.0.0",
2222
"@types/aws-lambda": "^8.10.155",
2323
"@types/node": "^22.19.0",
@@ -33,8 +33,8 @@
3333
"@aws-github-runner/aws-powertools-util": "*",
3434
"@aws-github-runner/aws-ssm-util": "*",
3535
"@aws-lambda-powertools/parameters": "^2.29.0",
36-
"@aws-sdk/client-ec2": "^3.948.0",
37-
"@aws-sdk/client-sqs": "^3.948.0",
36+
"@aws-sdk/client-ec2": "^3.953.0",
37+
"@aws-sdk/client-sqs": "^3.953.0",
3838
"@middy/core": "^6.4.5",
3939
"@octokit/auth-app": "8.1.2",
4040
"@octokit/core": "7.0.6",

lambdas/functions/control-plane/src/aws/runners.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,5 @@ export interface RunnerInputParameters {
4444
amiIdSsmParameterName?: string;
4545
tracingEnabled?: boolean;
4646
onDemandFailoverOnError?: string[];
47+
scaleErrors: string[];
4748
}

lambdas/functions/control-plane/src/aws/runners.test.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,7 @@ describe('create runner with errors', () => {
429429
allocationStrategy: SpotAllocationStrategy.CAPACITY_OPTIMIZED,
430430
capacityType: 'spot',
431431
type: 'Repo',
432+
scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded'],
432433
};
433434
const defaultExpectedFleetRequestValues: ExpectedFleetRequestValues = {
434435
type: 'Repo',
@@ -699,6 +700,7 @@ interface RunnerConfig {
699700
amiIdSsmParameterName?: string;
700701
tracingEnabled?: boolean;
701702
onDemandFailoverOnError?: string[];
703+
scaleErrors: string[];
702704
}
703705

704706
function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -718,6 +720,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
718720
amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
719721
tracingEnabled: runnerConfig.tracingEnabled,
720722
onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
723+
scaleErrors: runnerConfig.scaleErrors,
721724
};
722725
}
723726

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -196,18 +196,7 @@ async function processFleetResult(
196196
return instances;
197197
}
198198

199-
// Educated guess of errors that would make sense to retry based on the list
200-
// https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
201-
const scaleErrors = [
202-
'UnfulfillableCapacity',
203-
'MaxSpotInstanceCountExceeded',
204-
'TargetCapacityLimitExceededException',
205-
'RequestLimitExceeded',
206-
'ResourceLimitExceeded',
207-
'MaxSpotInstanceCountExceeded',
208-
'MaxSpotFleetRequestCountExceeded',
209-
'InsufficientInstanceCapacity',
210-
];
199+
const scaleErrors = runnerParameters.scaleErrors;
211200

212201
const failedCount = countScaleErrors(errors, scaleErrors);
213202
if (failedCount > 0) {

lambdas/functions/control-plane/src/modules.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ declare namespace NodeJS {
33
AWS_REGION: string;
44
ENABLE_METRIC_GITHUB_APP_RATE_LIMIT: string;
55
ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS: string;
6+
SCALE_ERRORS: string;
67
ENVIRONMENT: string;
78
GHES_URL: string;
89
JOB_RETRY_CONFIG: string;

lambdas/functions/control-plane/src/pool/pool.test.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,8 @@ beforeEach(() => {
140140
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
141141
process.env.RUNNER_OWNER = ORG;
142142
process.env.RUNNER_BOOT_TIME_IN_MINUTES = MINIMUM_TIME_RUNNING.toString();
143+
process.env.SCALE_ERRORS =
144+
'["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]';
143145

144146
const mockTokenReturnValue = {
145147
data: {

lambdas/functions/control-plane/src/pool/pool.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
4141
const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
4242
? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
4343
: [];
44+
const scaleErrors = JSON.parse(process.env.SCALE_ERRORS) as [string];
4445

4546
const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();
4647

@@ -95,6 +96,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
9596
amiIdSsmParameterName,
9697
tracingEnabled,
9798
onDemandFailoverOnError,
99+
scaleErrors,
98100
},
99101
topUp,
100102
githubInstallationClient,

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = {
105105
subnets: ['subnet-123'],
106106
tracingEnabled: false,
107107
onDemandFailoverOnError: [],
108+
scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'TargetCapacityLimitExceededException'],
108109
};
109110
let expectedRunnerParams: RunnerInputParameters;
110111

@@ -122,6 +123,8 @@ function setDefaults() {
122123
process.env.INSTANCE_TYPES = 'm5.large';
123124
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
124125
process.env.ENABLE_ON_DEMAND_FAILOVER = undefined;
126+
process.env.SCALE_ERRORS =
127+
'["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]';
125128
}
126129

127130
beforeEach(() => {
@@ -809,6 +812,16 @@ describe('scaleUp with public GH', () => {
809812
});
810813
});
811814

815+
it('creates a runner with correct config and labels and custom scale errors enabled.', async () => {
816+
process.env.RUNNER_LABELS = 'label1,label2';
817+
process.env.SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
818+
await scaleUpModule.scaleUp(TEST_DATA);
819+
expect(createRunner).toBeCalledWith({
820+
...expectedRunnerParams,
821+
scaleErrors: ['RequestLimitExceeded'],
822+
});
823+
});
824+
812825
it('creates a runner and ensure the group argument is ignored', async () => {
813826
process.env.RUNNER_LABELS = 'label1,label2';
814827
process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED';

0 commit comments

Comments
 (0)