Skip to content

Commit 01fd784

Browse files
committed
feat: add standby pool functionality for EC2 runners with start/stop capabilities
1 parent 391a65f commit 01fd784

9 files changed

Lines changed: 311 additions & 60 deletions

File tree

lambdas/functions/control-plane/src/aws/runners.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ export interface ListRunnerFilters {
2626
environment?: string;
2727
orphan?: boolean;
2828
statuses?: string[];
29+
standby?: boolean;
2930
}
3031

3132
export interface RunnerInputParameters {

lambdas/functions/control-plane/src/aws/runners.test.ts

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import {
1111
type DescribeInstancesResult,
1212
EC2Client,
1313
SpotAllocationStrategy,
14+
StartInstancesCommand,
15+
StopInstancesCommand,
1416
TerminateInstancesCommand,
1517
} from '@aws-sdk/client-ec2';
1618
import { GetParameterCommand, type GetParameterResult, PutParameterCommand, SSMClient } from '@aws-sdk/client-ssm';
@@ -19,7 +21,7 @@ import 'aws-sdk-client-mock-jest/vitest';
1921

2022
import { beforeEach, describe, expect, it, vi } from 'vitest';
2123
import ScaleError from './../scale-runners/ScaleError';
22-
import { createRunner, listEC2Runners, tag, terminateRunner, untag } from './runners';
24+
import { createRunner, listEC2Runners, tag, terminateRunner, untag, stopRunner, startRunner } from './runners';
2325
import type { RunnerInfo, RunnerInputParameters, RunnerType } from './runners.d';
2426

2527
process.env.AWS_REGION = 'eu-east-1';
@@ -245,6 +247,19 @@ describe('list instances', () => {
245247
],
246248
});
247249
});
250+
251+
// Listing with `standby: true` must add the `tag:ghr:state = standby` filter
// next to the instance-state and environment filters.
it('Filter instances for standby state', async () => {
  const expectedFilters = [
    { Name: 'instance-state-name', Values: ['stopped'] },
    { Name: 'tag:ghr:environment', Values: [ENVIRONMENT] },
    { Name: 'tag:ghr:state', Values: ['standby'] },
    { Name: 'tag:ghr:Application', Values: ['github-action-runner'] },
  ];
  mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstances);

  await listEC2Runners({ environment: ENVIRONMENT, standby: true, statuses: ['stopped'] });

  expect(mockEC2Client).toHaveReceivedCommandWith(DescribeInstancesCommand, {
    Filters: expectedFilters,
  });
});
248263
});
249264

250265
describe('terminate runner', () => {
@@ -266,6 +281,57 @@ describe('terminate runner', () => {
266281
});
267282
});
268283

284+
// stopRunner is expected to issue StopInstances for the instance and then tag
// it with the standby state plus a stop timestamp.
describe('stop runner', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('calls stop instances and tags with standby state', async () => {
    const runner: RunnerInfo = { instanceId: 'instance-3', owner: 'owner-3', type: 'Org' };
    const { instanceId } = runner;
    mockEC2Client.on(StopInstancesCommand).resolves({});
    mockEC2Client.on(CreateTagsCommand).resolves({});

    await stopRunner(instanceId);

    expect(mockEC2Client).toHaveReceivedCommandWith(StopInstancesCommand, {
      InstanceIds: [instanceId],
    });
    // The timestamp is generated at call time, so only its type is asserted.
    const expectedTags = [
      { Key: 'ghr:state', Value: 'standby' },
      { Key: 'ghr:stopped_at', Value: expect.any(String) },
    ];
    expect(mockEC2Client).toHaveReceivedCommandWith(CreateTagsCommand, {
      Resources: [instanceId],
      Tags: expectedTags,
    });
  });
});
310+
311+
// startRunner is expected to issue StartInstances for the instance and then
// delete the standby bookkeeping tags.
describe('start runner', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('calls start instances and removes standby tags', async () => {
    const runner: RunnerInfo = { instanceId: 'instance-4', owner: 'owner-4', type: 'Org' };
    const { instanceId } = runner;
    mockEC2Client.on(StartInstancesCommand).resolves({});
    mockEC2Client.on(DeleteTagsCommand).resolves({});

    await startRunner(instanceId);

    expect(mockEC2Client).toHaveReceivedCommandWith(StartInstancesCommand, {
      InstanceIds: [instanceId],
    });
    const removedTags = [{ Key: 'ghr:state' }, { Key: 'ghr:stopped_at' }];
    expect(mockEC2Client).toHaveReceivedCommandWith(DeleteTagsCommand, {
      Resources: [instanceId],
      Tags: removedTags,
    });
  });
});
334+
269335
describe('tag runner', () => {
270336
beforeEach(() => {
271337
vi.clearAllMocks();

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import {
77
DescribeInstancesResult,
88
EC2Client,
99
FleetLaunchTemplateOverridesRequest,
10+
StartInstancesCommand,
11+
StopInstancesCommand,
1012
Tag,
1113
TerminateInstancesCommand,
1214
_InstanceType,
@@ -52,6 +54,9 @@ function constructFilters(filters?: Runners.ListRunnerFilters): Ec2Filter[][] {
5254
if (filters.orphan) {
5355
ec2FiltersBase.push({ Name: 'tag:ghr:orphan', Values: ['true'] });
5456
}
57+
if (filters.standby) {
58+
ec2FiltersBase.push({ Name: 'tag:ghr:state', Values: ['standby'] });
59+
}
5560
}
5661

5762
for (const key of ['tag:ghr:Application']) {
@@ -108,6 +113,25 @@ export async function terminateRunner(instanceId: string): Promise<void> {
108113
logger.debug(`Runner ${instanceId} has been terminated.`);
109114
}
110115

116+
/**
 * Stops an EC2 runner instance and marks it as a standby runner.
 *
 * Sends a StopInstances request for the instance, then tags it with
 * `ghr:state = standby` and `ghr:stopped_at = <current ISO timestamp>`.
 *
 * @param instanceId - ID of the EC2 instance to stop.
 */
export async function stopRunner(instanceId: string): Promise<void> {
  logger.debug(`Runner '${instanceId}' will be stopped.`);

  const client = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));
  const stopCommand = new StopInstancesCommand({ InstanceIds: [instanceId] });
  await client.send(stopCommand);

  // The timestamp is taken after the stop request has been accepted.
  const standbyTags: Tag[] = [
    { Key: 'ghr:state', Value: 'standby' },
    { Key: 'ghr:stopped_at', Value: new Date().toISOString() },
  ];
  await tag(instanceId, standbyTags);

  logger.debug(`Runner ${instanceId} has been stopped.`);
}
126+
127+
/**
 * Starts an EC2 runner instance and clears its standby markers.
 *
 * Sends a StartInstances request for the instance, then removes the
 * `ghr:state` and `ghr:stopped_at` tags.
 *
 * @param instanceId - ID of the EC2 instance to start.
 */
export async function startRunner(instanceId: string): Promise<void> {
  logger.debug(`Runner '${instanceId}' will be started.`);

  const client = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));
  const startCommand = new StartInstancesCommand({ InstanceIds: [instanceId] });
  await client.send(startCommand);

  const standbyTagKeys: Tag[] = [{ Key: 'ghr:state' }, { Key: 'ghr:stopped_at' }];
  await untag(instanceId, standbyTagKeys);

  logger.debug(`Runner ${instanceId} has been started.`);
}
134+
111135
export async function tag(instanceId: string, tags: Tag[]): Promise<void> {
112136
logger.debug(`Tagging '${instanceId}'`, { tags });
113137
const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));

lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import nock from 'nock';
55

66
import { RunnerInfo, RunnerList } from '../aws/runners.d';
77
import * as ghAuth from '../github/auth';
8-
import { listEC2Runners, terminateRunner, tag, untag } from './../aws/runners';
8+
import { listEC2Runners, terminateRunner, tag, untag, stopRunner } from './../aws/runners';
99
import { githubCache } from './cache';
1010
import { newestFirstStrategy, oldestFirstStrategy, scaleDown } from './scale-down';
1111
import { describe, it, expect, beforeEach, vi } from 'vitest';
@@ -32,12 +32,13 @@ vi.mock('@octokit/rest', () => ({
3232
}));
3333

3434
vi.mock('./../aws/runners', async (importOriginal) => {
35-
const actual = await importOriginal();
35+
const actual = await importOriginal<typeof import('./../aws/runners')>();
3636
return {
3737
...actual,
3838
tag: vi.fn(),
3939
untag: vi.fn(),
4040
terminateRunner: vi.fn(),
41+
stopRunner: vi.fn(),
4142
listEC2Runners: vi.fn(),
4243
};
4344
});
@@ -60,14 +61,14 @@ vi.mock('./cache', async () => ({
6061
},
6162
}));
6263

63-
const mocktokit = Octokit as vi.MockedClass<typeof Octokit>;
6464
const mockedAppAuth = vi.mocked(ghAuth.createGithubAppAuth);
6565
const mockedInstallationAuth = vi.mocked(ghAuth.createGithubInstallationAuth);
6666
const mockCreateClient = vi.mocked(ghAuth.createOctokitClient);
6767
const mockListRunners = vi.mocked(listEC2Runners);
6868
const mockTagRunners = vi.mocked(tag);
6969
const mockUntagRunners = vi.mocked(untag);
7070
const mockTerminateRunners = vi.mocked(terminateRunner);
71+
const mockStopRunners = vi.mocked(stopRunner);
7172

7273
export interface TestData {
7374
repositoryName: string;
@@ -180,7 +181,7 @@ describe('Scale down runners', () => {
180181
repositorySelection: 'all',
181182
installationId: 0,
182183
});
183-
mockCreateClient.mockResolvedValue(new mocktokit());
184+
mockCreateClient.mockResolvedValue(mockOctokit as unknown as Octokit);
184185
});
185186

186187
const endpoints = ['https://api.github.com', 'https://github.enterprise.something', 'https://companyname.ghe.com'];
@@ -286,6 +287,35 @@ describe('Scale down runners', () => {
286287
checkNonTerminated(runners);
287288
});
288289

290+
// With a standby pool configured and no stopped standby runners present, idle
// runners must be stopped (moved to standby) instead of terminated.
it(`Should stop idle runner to standby when standby pool enabled`, async () => {
  process.env.STANDBY_POOL_SIZE = '2';
  process.env.STANDBY_IDLE_TIME_MINUTES = '0';

  try {
    const runners = [
      createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false),
      createRunnerTestData('idle-2', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false),
    ];

    mockGitHubRunners(runners);
    mockAwsRunners(runners);

    // Report an empty standby pool for the "stopped + standby" query and fall
    // back to the previously installed implementation for every other query.
    const originalListRunners = mockListRunners.getMockImplementation();
    mockListRunners.mockImplementation(async (filters) => {
      if (filters?.statuses?.includes('stopped') && filters?.standby) {
        return [];
      }
      return originalListRunners ? originalListRunners(filters) : runners;
    });

    await scaleDown();

    expect(mockStopRunners).toHaveBeenCalledTimes(2);
    expect(mockTerminateRunners).not.toHaveBeenCalled();
  } finally {
    // Always restore the environment so a failing assertion cannot leak the
    // standby configuration into subsequent tests.
    delete process.env.STANDBY_POOL_SIZE;
    delete process.env.STANDBY_IDLE_TIME_MINUTES;
  }
});
318+
289319
it(`Should not terminate a runner that became busy just before deregister runner.`, async () => {
290320
// setup
291321
const runners = [

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 59 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
55
import moment from 'moment';
66

77
import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth';
8-
import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from './../aws/runners';
8+
import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner, stopRunner } from './../aws/runners';
99
import { RunnerInfo, RunnerList } from './../aws/runners.d';
1010
import { GhRunners, githubCache } from './cache';
1111
import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config';
@@ -127,39 +127,52 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
127127
return launchTimePlusMinimum < now;
128128
}
129129

130-
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
130+
/**
 * Decides whether a runner has waited long enough to be moved to standby.
 *
 * The threshold is read from the `STANDBY_IDLE_TIME_MINUTES` environment
 * variable. A missing, empty, non-numeric, zero or negative value disables the
 * wait, and the function returns `true` immediately.
 *
 * NOTE(review): the elapsed time is measured from `runner.launchTime`, i.e.
 * time since launch rather than time since the runner last executed a job.
 *
 * @param runner - Runner whose launch time is compared against the threshold.
 * @returns `true` when launch time plus the configured minutes lies in the past.
 */
function runnerIdleTimeExceeded(runner: RunnerInfo): boolean {
  // Explicit radix; garbage or negative configuration is treated as "no wait".
  const configuredMinutes = Number.parseInt(process.env.STANDBY_IDLE_TIME_MINUTES ?? '0', 10);
  const idleTimeMinutes = Number.isNaN(configuredMinutes) ? 0 : Math.max(0, configuredMinutes);
  if (idleTimeMinutes === 0) return true;

  const launchTimePlusIdle = moment(runner.launchTime).utc().add(idleTimeMinutes, 'minutes');
  const now = moment(new Date()).utc();
  return launchTimePlusIdle < now;
}
138+
139+
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[], shouldStop: boolean = false): Promise<void> {
131140
const githubAppClient = await getOrCreateOctokit(ec2runner);
132141
try {
133142
const states = await Promise.all(
134143
ghRunnerIds.map(async (ghRunnerId) => {
135-
// Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition.
136144
return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId);
137145
}),
138146
);
139147

140148
if (states.every((busy) => busy === false)) {
141-
const statuses = await Promise.all(
142-
ghRunnerIds.map(async (ghRunnerId) => {
143-
return (
144-
ec2runner.type === 'Org'
145-
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
146-
runner_id: ghRunnerId,
147-
org: ec2runner.owner,
148-
})
149-
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
150-
runner_id: ghRunnerId,
151-
owner: ec2runner.owner.split('/')[0],
152-
repo: ec2runner.owner.split('/')[1],
153-
})
154-
).status;
155-
}),
156-
);
157-
158-
if (statuses.every((status) => status == 204)) {
159-
await terminateRunner(ec2runner.instanceId);
160-
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
149+
if (shouldStop) {
150+
await stopRunner(ec2runner.instanceId);
151+
logger.info(`AWS runner instance '${ec2runner.instanceId}' is stopped and moved to standby.`);
161152
} else {
162-
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
153+
const statuses = await Promise.all(
154+
ghRunnerIds.map(async (ghRunnerId) => {
155+
return (
156+
ec2runner.type === 'Org'
157+
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
158+
runner_id: ghRunnerId,
159+
org: ec2runner.owner,
160+
})
161+
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
162+
runner_id: ghRunnerId,
163+
owner: ec2runner.owner.split('/')[0],
164+
repo: ec2runner.owner.split('/')[1],
165+
})
166+
).status;
167+
}),
168+
);
169+
170+
if (statuses.every((status) => status == 204)) {
171+
await terminateRunner(ec2runner.instanceId);
172+
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
173+
} else {
174+
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
175+
}
163176
}
164177
} else {
165178
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);
@@ -178,13 +191,25 @@ async function evaluateAndRemoveRunners(
178191
let idleCounter = getIdleRunnerCount(scaleDownConfigs);
179192
const evictionStrategy = getEvictionStrategy(scaleDownConfigs);
180193
const ownerTags = new Set(ec2Runners.map((runner) => runner.owner));
194+
195+
const standbyPoolSize = parseInt(process.env.STANDBY_POOL_SIZE || '0');
196+
const environment = process.env.ENVIRONMENT;
181197

182198
for (const ownerTag of ownerTags) {
183199
const ec2RunnersFiltered = ec2Runners
184200
.filter((runner) => runner.owner === ownerTag)
185201
.sort(evictionStrategy === 'oldest_first' ? oldestFirstStrategy : newestFirstStrategy);
186202
logger.debug(`Found: '${ec2RunnersFiltered.length}' active GitHub runners with owner tag: '${ownerTag}'`);
187203
logger.debug(`Active GitHub runners with owner tag: '${ownerTag}': ${JSON.stringify(ec2RunnersFiltered)}`);
204+
205+
const standbyRunners = standbyPoolSize > 0
206+
? await listEC2Runners({ environment, runnerType: 'Org', runnerOwner: ownerTag, statuses: ['stopped'], standby: true })
207+
: [];
208+
const currentStandbyCount = standbyRunners.length;
209+
let standbyCounter = Math.max(0, standbyPoolSize - currentStandbyCount);
210+
211+
logger.debug(`Standby pool: target=${standbyPoolSize}, current=${currentStandbyCount}, needed=${standbyCounter}`);
212+
188213
for (const ec2Runner of ec2RunnersFiltered) {
189214
const ghRunners = await listGitHubRunners(ec2Runner);
190215
const ghRunnersFiltered = ghRunners.filter((runner: { name: string }) =>
@@ -201,6 +226,16 @@ async function evaluateAndRemoveRunners(
201226
if (idleCounter > 0) {
202227
idleCounter--;
203228
logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`);
229+
} else if (standbyCounter > 0 && runnerIdleTimeExceeded(ec2Runner)) {
230+
standbyCounter--;
231+
logger.info(`Runner '${ec2Runner.instanceId}' will be moved to standby.`);
232+
await removeRunner(
233+
ec2Runner,
234+
ghRunnersFiltered.map((runner: { id: number }) => runner.id),
235+
true,
236+
);
237+
} else if (standbyCounter > 0) {
238+
logger.info(`Runner '${ec2Runner.instanceId}' waiting for idle time before moving to standby.`);
204239
} else {
205240
logger.info(`Terminating all non busy runners.`);
206241
await removeRunner(

0 commit comments

Comments
 (0)