Skip to content

Commit a83ece8

Browse files
committed
feat(multi-runner)!: support running the scale-down lambda once for every runner group
Iterating the list of active runners in the GitHub API can be slow and expensive in terms of rate limit consumption. It's a paginated API, returning up to 100 runners per page. With several thousand runners across many runner groups, running `scale-down` once per runner group can quickly eat up large portions of the rate limit. Here we break the Terraform `scale-down` module out into its own sub-module, so that `multi-runner` can create one instance of the Lambda function instead of the `runner` module managing it. A flag is added to the `runner` module to disable the `scale-down` function creation in the `multi-runner` case. Then the Lambda's code is modified to accept a list of configurations and process them all. With this, we only need to fetch the list of runners once for all runner groups. BREAKING CHANGE: When using the `multi-runner` module, the per-group `scale_down_schedule_expression` is no longer supported. Migration is only needed if you are using the `multi-runner` module. One instance of `scale-down` will now handle all runner groups. 1. Remove any `scale_down_schedule_expression` settings from your `multi_runner_config` runner configs. 2. To customise the frequency of the consolidated `scale-down` function, set the `scale_down_schedule_expression` variable on the `multi-runner` module itself.
1 parent 67fadae commit a83ece8

29 files changed

Lines changed: 974 additions & 315 deletions

examples/multi-runner/main.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ module "runners" {
9898
# runner_extra_labels = ["amazon"]
9999
# runners_maximum_count = 1
100100
# enable_ephemeral_runners = true
101-
# scale_down_schedule_expression = "cron(* * * * ? *)"
102101
# }
103102
# }
104103
# }
@@ -107,6 +106,7 @@ module "runners" {
107106
subnet_ids = module.base.vpc.private_subnets
108107
runners_scale_up_lambda_timeout = 60
109108
runners_scale_down_lambda_timeout = 60
109+
scale_down_schedule_expression = "cron(* * * * ? *)"
110110
prefix = local.environment
111111
tags = {
112112
Project = "ProjectX"

examples/multi-runner/templates/runner-configs/linux-arm64.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ runner_config:
1919
id_ssm_parameter_arn: ${ami_id_ssm_parameter_arn}
2020
runners_maximum_count: 1
2121
delay_webhook_event: 0
22-
scale_down_schedule_expression: cron(* * * * ? *)
2322
runner_hook_job_started: |
2423
echo "Running pre job hook as $(whoami)"
2524
runner_hook_job_completed: |

examples/multi-runner/templates/runner-configs/linux-x64-ubuntu-2204.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ runner_config:
1919
- m5a.large
2020
runners_maximum_count: 1
2121
delay_webhook_event: 0
22-
scale_down_schedule_expression: cron(* * * * ? *)
2322
userdata_template: ./templates/user-data.sh
2423
ami:
2524
owners:

examples/multi-runner/templates/runner-configs/linux-x64-ubuntu.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ runner_config:
2020
- m5a.large
2121
runners_maximum_count: 1
2222
delay_webhook_event: 0
23-
scale_down_schedule_expression: cron(* * * * ? *)
2423
userdata_template: ./templates/user-data.sh
2524
ami:
2625
owners:

examples/multi-runner/templates/runner-configs/linux-x64.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ runner_config:
2121
enable_on_demand_failover_for_errors: ['InsufficientInstanceCapacity']
2222
create_service_linked_role_spot: true
2323
delay_webhook_event: 0
24-
scale_down_schedule_expression: cron(* * * * ? *)
2524
runner_metadata_options:
2625
instance_metadata_tags: disabled
2726
http_endpoint: enabled

examples/multi-runner/templates/runner-configs/windows-x64.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ runner_config:
1313
- c5.large
1414
runners_maximum_count: 1
1515
delay_webhook_event: 5
16-
scale_down_schedule_expression: cron(* * * * ? *)
1716
runner_boot_time_in_minutes: 20
1817
ami_filter:
1918
name:

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -298,8 +298,7 @@ async function createInstances(
298298
}
299299

300300
// If launchTime is undefined, this will return false
301-
export function bootTimeExceeded(ec2Runner: { launchTime?: Date }): boolean {
302-
const runnerBootTimeInMinutes = process.env.RUNNER_BOOT_TIME_IN_MINUTES;
301+
export function bootTimeExceeded(ec2Runner: { launchTime?: Date }, runnerBootTimeInMinutes: number): boolean {
303302
const launchTimePlusBootTime = moment(ec2Runner.launchTime).utc().add(runnerBootTimeInMinutes, 'minutes');
304303
return launchTimePlusBootTime < moment(new Date()).utc();
305304
}

lambdas/functions/control-plane/src/modules.d.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ declare namespace NodeJS {
44
ENABLE_METRIC_GITHUB_APP_RATE_LIMIT: string;
55
ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS: string;
66
ENVIRONMENT: string;
7+
ENVIRONMENT_CONFIGS: string;
78
GHES_URL: string;
89
JOB_RETRY_CONFIG: string;
910
LAUNCH_TEMPLATE_NAME: string;
@@ -14,8 +15,8 @@ declare namespace NodeJS {
1415
PARAMETER_GITHUB_APP_CLIENT_SECRET_NAME: string;
1516
PARAMETER_GITHUB_APP_ID_NAME: string;
1617
PARAMETER_GITHUB_APP_KEY_BASE64_NAME: string;
18+
RUNNER_BOOT_TIME_IN_MINUTES: string;
1719
RUNNER_OWNER: string;
18-
SCALE_DOWN_CONFIG: string;
1920
SSM_TOKEN_PATH: string;
2021
SSM_CLEANUP_CONFIG: string;
2122
SUBNET_IDS: string;

lambdas/functions/control-plane/src/pool/pool.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
3737
const instanceAllocationStrategy = process.env.INSTANCE_ALLOCATION_STRATEGY || 'lowest-price'; // same as AWS default
3838
const runnerOwner = process.env.RUNNER_OWNER;
3939
const amiIdSsmParameterName = process.env.AMI_ID_SSM_PARAMETER_NAME;
40+
const runnerBootTimeInMinutes = parseInt(process.env.RUNNER_BOOT_TIME_IN_MINUTES || '5');
4041
const tracingEnabled = yn(process.env.POWERTOOLS_TRACE_ENABLED, { default: false });
4142
const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
4243
? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
@@ -63,7 +64,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
6364
statuses: ['running'],
6465
});
6566

66-
const numberOfRunnersInPool = calculatePooSize(ec2runners, runnerStatusses);
67+
const numberOfRunnersInPool = calculatePooSize(ec2runners, runnerStatusses, runnerBootTimeInMinutes);
6768
const topUp = event.poolSize - numberOfRunnersInPool;
6869

6970
if (topUp > 0) {
@@ -115,7 +116,7 @@ async function getInstallationId(ghesApiUrl: string, org: string): Promise<numbe
115116
).data.id;
116117
}
117118

118-
function calculatePooSize(ec2runners: RunnerList[], runnerStatus: Map<string, RunnerStatus>): number {
119+
function calculatePooSize(ec2runners: RunnerList[], runnerStatus: Map<string, RunnerStatus>, runnerBootTimeInMinutes: number): number {
119120
// Runner should be considered idle if it is still booting, or is idle in GitHub
120121
let numberOfRunnersInPool = 0;
121122
for (const ec2Instance of ec2runners) {
@@ -127,7 +128,7 @@ function calculatePooSize(ec2runners: RunnerList[], runnerStatus: Map<string, Ru
127128
logger.debug(`Runner ${ec2Instance.instanceId} is idle in GitHub and counted as part of the pool`);
128129
} else if (runnerStatus.get(ec2Instance.instanceId) != null) {
129130
logger.debug(`Runner ${ec2Instance.instanceId} is not idle in GitHub and NOT counted as part of the pool`);
130-
} else if (!bootTimeExceeded(ec2Instance)) {
131+
} else if (!bootTimeExceeded(ec2Instance, runnerBootTimeInMinutes)) {
131132
numberOfRunnersInPool++;
132133
logger.info(`Runner ${ec2Instance.instanceId} is still booting and counted as part of the pool`);
133134
} else {

lambdas/functions/control-plane/src/scale-runners/scale-down-config.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@ export interface ScalingDownConfig {
1111
evictionStrategy?: EvictionStrategy;
1212
}
1313

14+
export interface EnvironmentScaleDownConfig {
15+
environment: string;
16+
idle_config: ScalingDownConfig[];
17+
minimum_running_time_in_minutes: number;
18+
runner_boot_time_in_minutes: number;
19+
}
20+
1421
const logger = createChildLogger('scale-down-config.ts');
1522

1623
function inPeriod(period: ScalingDownConfig): boolean {

0 commit comments

Comments
 (0)