Skip to content

Commit 746f975

Browse files
committed
feat: implement standby pool management with max age termination for idle runners
1 parent 01fd784 commit 746f975

7 files changed

Lines changed: 169 additions & 26 deletions

File tree

lambdas/functions/control-plane/src/aws/runners.d.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { DefaultTargetCapacityType, SpotAllocationStrategy } from '@aws-sdk/client-ec2';
1+
import { DefaultTargetCapacityType, SpotAllocationStrategy, Tag } from '@aws-sdk/client-ec2';
22

33
export type RunnerType = 'Org' | 'Repo';
44

@@ -11,6 +11,7 @@ export interface RunnerList {
1111
org?: string;
1212
orphan?: boolean;
1313
runnerId?: string;
14+
tags?: Tag[];
1415
}
1516

1617
export interface RunnerInfo {

lambdas/functions/control-plane/src/aws/runners.test.ts

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -87,27 +87,27 @@ describe('list instances', () => {
8787
mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstances);
8888
const resp = await listEC2Runners();
8989
expect(resp.length).toBe(1);
90-
expect(resp).toContainEqual({
90+
expect(resp).toContainEqual(expect.objectContaining({
9191
instanceId: 'i-1234',
9292
launchTime: new Date('2020-10-10T14:48:00.000+09:00'),
9393
type: 'Org',
9494
owner: 'CoderToCat',
9595
orphan: false,
96-
});
96+
}));
9797
});
9898

9999
it('returns a list of instances (JIT)', async () => {
100100
mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstancesJit);
101101
const resp = await listEC2Runners();
102102
expect(resp.length).toBe(1);
103-
expect(resp).toContainEqual({
103+
expect(resp).toContainEqual(expect.objectContaining({
104104
instanceId: 'i-1234',
105105
launchTime: new Date('2020-10-10T14:48:00.000+09:00'),
106106
type: 'Org',
107107
owner: 'CoderToCat',
108108
orphan: false,
109109
runnerId: '9876543210',
110-
});
110+
}));
111111
});
112112

113113
it('check orphan tag.', async () => {
@@ -120,13 +120,13 @@ describe('list instances', () => {
120120

121121
const resp = await listEC2Runners();
122122
expect(resp.length).toBe(1);
123-
expect(resp).toContainEqual({
123+
expect(resp).toContainEqual(expect.objectContaining({
124124
instanceId: instances.Reservations![0].Instances![0].InstanceId!,
125125
launchTime: instances.Reservations![0].Instances![0].LaunchTime!,
126126
type: 'Org',
127127
owner: 'CoderToCat',
128128
orphan: true,
129-
});
129+
}));
130130
});
131131

132132
it('calls EC2 describe instances', async () => {
@@ -302,7 +302,7 @@ describe('stop runner', () => {
302302
Resources: [runner.instanceId],
303303
Tags: [
304304
{ Key: 'ghr:state', Value: 'standby' },
305-
{ Key: 'ghr:stopped_at', Value: expect.any(String) },
305+
{ Key: 'ghr:standby_time', Value: expect.any(String) },
306306
],
307307
});
308308
});
@@ -327,7 +327,7 @@ describe('start runner', () => {
327327
});
328328
expect(mockEC2Client).toHaveReceivedCommandWith(DeleteTagsCommand, {
329329
Resources: [runner.instanceId],
330-
Tags: [{ Key: 'ghr:state' }, { Key: 'ghr:stopped_at' }],
330+
Tags: [{ Key: 'ghr:state' }, { Key: 'ghr:standby_time' }],
331331
});
332332
});
333333
});

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) {
9898
org: i.Tags?.find((e) => e.Key === 'ghr:Org')?.Value as string,
9999
orphan: i.Tags?.find((e) => e.Key === 'ghr:orphan')?.Value === 'true',
100100
runnerId: i.Tags?.find((e) => e.Key === 'ghr:github_runner_id')?.Value as string,
101+
tags: i.Tags,
101102
});
102103
}
103104
}
@@ -119,7 +120,7 @@ export async function stopRunner(instanceId: string): Promise<void> {
119120
await ec2.send(new StopInstancesCommand({ InstanceIds: [instanceId] }));
120121
await tag(instanceId, [
121122
{ Key: 'ghr:state', Value: 'standby' },
122-
{ Key: 'ghr:stopped_at', Value: new Date().toISOString() },
123+
{ Key: 'ghr:standby_time', Value: new Date().toISOString() },
123124
]);
124125
logger.debug(`Runner ${instanceId} has been stopped.`);
125126
}
@@ -128,7 +129,7 @@ export async function startRunner(instanceId: string): Promise<void> {
128129
logger.debug(`Runner '${instanceId}' will be started.`);
129130
const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));
130131
await ec2.send(new StartInstancesCommand({ InstanceIds: [instanceId] }));
131-
await untag(instanceId, [{ Key: 'ghr:state' }, { Key: 'ghr:stopped_at' }]);
132+
await untag(instanceId, [{ Key: 'ghr:state' }, { Key: 'ghr:standby_time' }]);
132133
logger.debug(`Runner ${instanceId} has been started.`);
133134
}
134135

lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts

Lines changed: 96 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -290,15 +290,15 @@ describe('Scale down runners', () => {
290290
it(`Should stop idle runner to standby when standby pool enabled`, async () => {
291291
process.env.STANDBY_POOL_SIZE = '2';
292292
process.env.STANDBY_IDLE_TIME_MINUTES = '0';
293-
293+
294294
const runners = [
295295
createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false),
296296
createRunnerTestData('idle-2', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false),
297297
];
298298

299299
mockGitHubRunners(runners);
300300
mockAwsRunners(runners);
301-
301+
302302
const originalListRunners = mockListRunners.getMockImplementation();
303303
mockListRunners.mockImplementation(async (filters) => {
304304
if (filters?.statuses?.includes('stopped') && filters?.standby) {
@@ -311,11 +311,104 @@ describe('Scale down runners', () => {
311311

312312
expect(mockStopRunners).toHaveBeenCalledTimes(2);
313313
expect(mockTerminateRunners).not.toHaveBeenCalled();
314-
314+
315315
delete process.env.STANDBY_POOL_SIZE;
316316
delete process.env.STANDBY_IDLE_TIME_MINUTES;
317317
});
318318

319+
it(`Should terminate standby runners older than max age`, async () => {
320+
process.env.STANDBY_POOL_SIZE = '2';
321+
process.env.STANDBY_MAX_AGE_HOURS = '24';
322+
323+
const oldStandbyTime = moment().subtract(25, 'hours').toISOString();
324+
const recentStandbyTime = moment().subtract(1, 'hours').toISOString();
325+
326+
const oldStandbyRunner: RunnerList = {
327+
instanceId: 'i-old-standby',
328+
launchTime: moment().subtract(26, 'hours').toDate(),
329+
type,
330+
owner: `${TEST_DATA.repositoryOwner}/${TEST_DATA.repositoryName}`,
331+
tags: [
332+
{ Key: 'ghr:state', Value: 'standby' },
333+
{ Key: 'ghr:standby_time', Value: oldStandbyTime },
334+
],
335+
};
336+
337+
const recentStandbyRunner: RunnerList = {
338+
instanceId: 'i-recent-standby',
339+
launchTime: moment().subtract(2, 'hours').toDate(),
340+
type,
341+
owner: `${TEST_DATA.repositoryOwner}/${TEST_DATA.repositoryName}`,
342+
tags: [
343+
{ Key: 'ghr:state', Value: 'standby' },
344+
{ Key: 'ghr:standby_time', Value: recentStandbyTime },
345+
],
346+
};
347+
348+
const runners = [
349+
createRunnerTestData('active-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false),
350+
];
351+
352+
mockGitHubRunners(runners);
353+
mockAwsRunners(runners);
354+
355+
const originalListRunners = mockListRunners.getMockImplementation();
356+
mockListRunners.mockImplementation(async (filters) => {
357+
if (filters?.statuses?.includes('stopped') && filters?.standby) {
358+
return [oldStandbyRunner, recentStandbyRunner];
359+
}
360+
return originalListRunners ? originalListRunners(filters) : runners;
361+
});
362+
363+
await scaleDown();
364+
365+
expect(mockTerminateRunners).toHaveBeenCalledWith('i-old-standby');
366+
expect(mockTerminateRunners).not.toHaveBeenCalledWith('i-recent-standby');
367+
368+
delete process.env.STANDBY_POOL_SIZE;
369+
delete process.env.STANDBY_MAX_AGE_HOURS;
370+
});
371+
372+
it(`Should skip age termination when standby pool disabled`, async () => {
373+
process.env.STANDBY_POOL_SIZE = '0';
374+
process.env.STANDBY_MAX_AGE_HOURS = '24';
375+
376+
const oldStandbyTime = moment().subtract(25, 'hours').toISOString();
377+
378+
const oldStandbyRunner: RunnerList = {
379+
instanceId: 'i-old-standby',
380+
launchTime: moment().subtract(26, 'hours').toDate(),
381+
type,
382+
owner: `${TEST_DATA.repositoryOwner}/${TEST_DATA.repositoryName}`,
383+
tags: [
384+
{ Key: 'ghr:state', Value: 'standby' },
385+
{ Key: 'ghr:standby_time', Value: oldStandbyTime },
386+
],
387+
};
388+
389+
const runners = [
390+
createRunnerTestData('active-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false),
391+
];
392+
393+
mockGitHubRunners(runners);
394+
mockAwsRunners(runners);
395+
396+
const originalListRunners = mockListRunners.getMockImplementation();
397+
mockListRunners.mockImplementation(async (filters) => {
398+
if (filters?.statuses?.includes('stopped') && filters?.standby) {
399+
return [oldStandbyRunner];
400+
}
401+
return originalListRunners ? originalListRunners(filters) : runners;
402+
});
403+
404+
await scaleDown();
405+
406+
expect(mockTerminateRunners).not.toHaveBeenCalledWith('i-old-standby');
407+
408+
delete process.env.STANDBY_POOL_SIZE;
409+
delete process.env.STANDBY_MAX_AGE_HOURS;
410+
});
411+
319412
it(`Should not terminate a runner that became busy just before deregister runner.`, async () => {
320413
// setup
321414
const runners = [

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
130130
function runnerIdleTimeExceeded(runner: RunnerInfo): boolean {
131131
const idleTimeMinutes = parseInt(process.env.STANDBY_IDLE_TIME_MINUTES || '0');
132132
if (idleTimeMinutes === 0) return true;
133-
133+
134134
const launchTimePlusIdle = moment(runner.launchTime).utc().add(idleTimeMinutes, 'minutes');
135135
const now = moment(new Date()).utc();
136136
return launchTimePlusIdle < now;
@@ -169,7 +169,9 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[], should
169169

170170
if (statuses.every((status) => status == 204)) {
171171
await terminateRunner(ec2runner.instanceId);
172-
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
172+
logger.info(
173+
`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`,
174+
);
173175
} else {
174176
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
175177
}
@@ -191,7 +193,7 @@ async function evaluateAndRemoveRunners(
191193
let idleCounter = getIdleRunnerCount(scaleDownConfigs);
192194
const evictionStrategy = getEvictionStrategy(scaleDownConfigs);
193195
const ownerTags = new Set(ec2Runners.map((runner) => runner.owner));
194-
196+
195197
const standbyPoolSize = parseInt(process.env.STANDBY_POOL_SIZE || '0');
196198
const environment = process.env.ENVIRONMENT;
197199

@@ -201,15 +203,22 @@ async function evaluateAndRemoveRunners(
201203
.sort(evictionStrategy === 'oldest_first' ? oldestFirstStrategy : newestFirstStrategy);
202204
logger.debug(`Found: '${ec2RunnersFiltered.length}' active GitHub runners with owner tag: '${ownerTag}'`);
203205
logger.debug(`Active GitHub runners with owner tag: '${ownerTag}': ${JSON.stringify(ec2RunnersFiltered)}`);
204-
205-
const standbyRunners = standbyPoolSize > 0
206-
? await listEC2Runners({ environment, runnerType: 'Org', runnerOwner: ownerTag, statuses: ['stopped'], standby: true })
207-
: [];
206+
207+
const standbyRunners =
208+
standbyPoolSize > 0
209+
? await listEC2Runners({
210+
environment,
211+
runnerType: 'Org',
212+
runnerOwner: ownerTag,
213+
statuses: ['stopped'],
214+
standby: true,
215+
})
216+
: [];
208217
const currentStandbyCount = standbyRunners.length;
209218
let standbyCounter = Math.max(0, standbyPoolSize - currentStandbyCount);
210-
219+
211220
logger.debug(`Standby pool: target=${standbyPoolSize}, current=${currentStandbyCount}, needed=${standbyCounter}`);
212-
221+
213222
for (const ec2Runner of ec2RunnersFiltered) {
214223
const ghRunners = await listGitHubRunners(ec2Runner);
215224
const ghRunnersFiltered = ghRunners.filter((runner: { name: string }) =>
@@ -336,6 +345,42 @@ async function listRunners(environment: string) {
336345
});
337346
}
338347

348+
/**
 * Terminates stopped standby runners that have been in standby longer than the configured maximum age.
 *
 * Configuration is read from the environment:
 * - `STANDBY_POOL_SIZE`: when not a positive number the standby pool is treated as disabled and
 *   nothing is terminated (same `> 0` check used when evaluating runners for scale-down).
 * - `STANDBY_MAX_AGE_HOURS`: maximum time in standby, in hours; defaults to 168 when unset or
 *   not parseable as a number.
 *
 * The age of a runner is measured from its `ghr:standby_time` tag. Runners without that tag, or with
 * a value that cannot be parsed as a date, are left untouched.
 *
 * This is best-effort: failures to list or terminate runners are logged and never thrown.
 *
 * @param environment - environment name passed through to the EC2 runner listing filter.
 */
async function terminateOldStandbyRunners(environment: string): Promise<void> {
  const defaultMaxAgeHours = 168;
  const parsedMaxAgeHours = parseInt(process.env.STANDBY_MAX_AGE_HOURS || `${defaultMaxAgeHours}`, 10);
  // A non-numeric value would yield NaN and make every age comparison false, silently disabling cleanup.
  const maxAgeHours = Number.isNaN(parsedMaxAgeHours) ? defaultMaxAgeHours : parsedMaxAgeHours;
  const standbyPoolSize = parseInt(process.env.STANDBY_POOL_SIZE || '0', 10);

  // Negated `> 0` (rather than `=== 0`) so that NaN and negative sizes also count as "pool disabled".
  if (!(standbyPoolSize > 0)) {
    return;
  }

  try {
    const standbyRunners = await listEC2Runners({
      environment,
      statuses: ['stopped'],
      standby: true,
    });
    const now = moment();

    for (const runner of standbyRunners) {
      const standbyTimeTag = runner.tags?.find((t) => t.Key === 'ghr:standby_time')?.Value;
      if (!standbyTimeTag) {
        continue;
      }

      const standbyTime = moment(standbyTimeTag);
      if (!standbyTime.isValid()) {
        logger.warn(
          `Standby runner '${runner.instanceId}' has an unparseable 'ghr:standby_time' tag: '${standbyTimeTag}'`,
        );
        continue;
      }

      // Fractional diff: the default integer diff truncates, which would let a runner stay up to
      // one extra hour past the limit (e.g. 24h59m is reported as 24h and `24 > 24` is false).
      const ageHours = now.diff(standbyTime, 'hours', true);
      if (ageHours <= maxAgeHours) {
        continue;
      }

      logger.info(
        `Terminating standby runner '${runner.instanceId}' - age ${ageHours.toFixed(1)}h exceeds limit ${maxAgeHours}h`,
      );
      try {
        await terminateRunner(runner.instanceId);
      } catch (e) {
        // Keep going: one failed termination must not block clean-up of the remaining runners.
        logger.error(`Failed to terminate old standby runner '${runner.instanceId}'`, { error: e });
      }
    }
  } catch (e) {
    logger.warn(`Failure during old standby runner termination.`, { error: e });
  }
}
383+
339384
function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] {
340385
return ec2runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan) as RunnerInfo[];
341386
}
@@ -345,10 +390,10 @@ export async function scaleDown(): Promise<void> {
345390
const environment = process.env.ENVIRONMENT;
346391
const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig];
347392

348-
// first runners marked to be orphan.
349393
await terminateOrphan(environment);
350394

351-
// next scale down idle runners with respect to config and mark potential orphans
395+
await terminateOldStandbyRunners(environment);
396+
352397
const ec2Runners = await listRunners(environment);
353398
const activeEc2RunnersCount = ec2Runners.length;
354399
logger.info(`Found: '${activeEc2RunnersCount}' active GitHub EC2 runner instances before clean-up.`);

modules/runners/scale-down.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ resource "aws_lambda_function" "scale_down" {
3636
SCALE_DOWN_CONFIG = jsonencode(var.idle_config)
3737
STANDBY_POOL_SIZE = var.standby_pool_config.size
3838
STANDBY_IDLE_TIME_MINUTES = var.standby_pool_config.idle_time_minutes
39+
STANDBY_MAX_AGE_HOURS = var.standby_pool_config.max_age_hours
3940
POWERTOOLS_SERVICE_NAME = "runners-scale-down"
4041
POWERTOOLS_METRICS_NAMESPACE = var.metrics.namespace
4142
POWERTOOLS_TRACE_ENABLED = var.tracing_config.mode != null ? true : false

modules/runners/variables.tf

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -561,16 +561,18 @@ variable "pool_config" {
561561
}
562562

563563
variable "standby_pool_config" {
564-
description = "Configuration for the standby pool. Instances in the standby pool are stopped instead of terminated, allowing faster scale-up. The `size` specifies how many stopped instances to maintain. The `idle_time_minutes` specifies how long an idle runner should wait before being moved to standby (0 means immediate). Stopped instances only incur EBS storage costs."
564+
description = "Configuration for the standby pool. Instances in the standby pool are stopped instead of terminated, allowing faster scale-up. The `size` specifies how many stopped instances to maintain. The `idle_time_minutes` specifies how long an idle runner should wait before being moved to standby (0 means immediate). The `max_age_hours` specifies the maximum age of stopped instances before they are terminated (default 168 hours = 7 days). Stopped instances only incur EBS storage costs."
565565
type = object({
566566
enabled = optional(bool, false)
567567
size = optional(number, 0)
568568
idle_time_minutes = optional(number, 0)
569+
max_age_hours = optional(number, 168)
569570
})
570571
default = {
571572
enabled = false
572573
size = 0
573574
idle_time_minutes = 0
575+
max_age_hours = 168
574576
}
575577
}
576578

0 commit comments

Comments
 (0)