-
Notifications
You must be signed in to change notification settings - Fork 721
Expand file tree
/
Copy pathscale-down.ts
More file actions
364 lines (325 loc) · 13.9 KB
/
scale-down.ts
File metadata and controls
364 lines (325 loc) · 13.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
import { Octokit } from '@octokit/rest';
import { Endpoints } from '@octokit/types';
import { RequestError } from '@octokit/request-error';
import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
import moment from 'moment';
import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth';
import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from './../aws/runners';
import { RunnerInfo, RunnerList } from './../aws/runners.d';
import { GhRunners, githubCache } from './cache';
import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config';
import { metricGitHubAppRateLimit } from '../github/rate-limit';
import { getGitHubEnterpriseApiUrl } from './scale-up';
// Module-scoped logger, created through createChildLogger with the child name 'scale-down'.
const logger = createChildLogger('scale-down');
// Runner array type taken from the response of the organization-level "list runners" endpoint.
type OrgRunnerList = Endpoints['GET /orgs/{org}/actions/runners']['response']['data']['runners'];
// Runner array type taken from the response of the repository-level "list runners" endpoint.
type RepoRunnerList = Endpoints['GET /repos/{owner}/{repo}/actions/runners']['response']['data']['runners'];
// A single runner entry from either the organization-level or the repository-level list.
type RunnerState = OrgRunnerList[number] | RepoRunnerList[number];
/**
 * Returns an Octokit client for the owner of the given runner, reusing the
 * client stored in `githubCache.clients` when one exists for that owner.
 *
 * On a cache miss an app-level client is created first and used to look up the
 * installation id: the org installation for `Org` runners, otherwise the repo
 * installation with `runner.owner` split on '/' into owner and repo. A client
 * authenticated for that installation is then created, cached and returned.
 *
 * @param runner - Runner whose `owner` is the cache key and whose `type` selects the lookup.
 * @returns The cached or newly created Octokit client.
 */
async function getOrCreateOctokit(runner: RunnerInfo): Promise<Octokit> {
  const cacheKey = runner.owner;
  const existingClient = githubCache.clients.get(cacheKey);
  if (existingClient) {
    logger.debug(`[createGitHubClientForRunner] Cache hit for ${cacheKey}`);
    return existingClient;
  }
  logger.debug(`[createGitHubClientForRunner] Cache miss for ${cacheKey}`);

  // App-level client: only used to resolve the installation id below.
  const { ghesApiUrl } = getGitHubEnterpriseApiUrl();
  const appAuth = await createGithubAppAuth(undefined, ghesApiUrl);
  const appClient = await createOctokitClient(appAuth.token, ghesApiUrl);

  let installationId: number;
  if (runner.type === 'Org') {
    const orgInstallation = await appClient.apps.getOrgInstallation({
      org: runner.owner,
    });
    installationId = orgInstallation.data.id;
  } else {
    const ownerParts = runner.owner.split('/');
    const repoInstallation = await appClient.apps.getRepoInstallation({
      owner: ownerParts[0],
      repo: ownerParts[1],
    });
    installationId = repoInstallation.data.id;
  }

  // Installation-level client: this is the one that gets cached and handed out.
  const installationAuth = await createGithubInstallationAuth(installationId, ghesApiUrl);
  const installationClient = await createOctokitClient(installationAuth.token, ghesApiUrl);
  githubCache.clients.set(cacheKey, installationClient);
  return installationClient;
}
/**
 * Fetches the GitHub record of a single self-hosted runner.
 *
 * `Org` runners are looked up at organization level; all others at repository
 * level, with `ec2runner.owner` split on '/' into owner and repo. The response
 * headers are passed to `metricGitHubAppRateLimit`.
 *
 * @param client - Octokit client used for the request.
 * @param ec2runner - EC2 runner supplying the type and owner for the lookup.
 * @param runnerId - GitHub runner id to fetch.
 * @returns The runner data, or `null` when GitHub answers with a 404 `RequestError`.
 * @throws Any other error raised by the request is rethrown unchanged.
 */
async function getGitHubSelfHostedRunnerState(
  client: Octokit,
  ec2runner: RunnerInfo,
  runnerId: number,
): Promise<RunnerState | null> {
  try {
    if (ec2runner.type === 'Org') {
      const orgResponse = await client.actions.getSelfHostedRunnerForOrg({
        runner_id: runnerId,
        org: ec2runner.owner,
      });
      metricGitHubAppRateLimit(orgResponse.headers);
      return orgResponse.data;
    }

    const ownerParts = ec2runner.owner.split('/');
    const repoResponse = await client.actions.getSelfHostedRunnerForRepo({
      runner_id: runnerId,
      owner: ownerParts[0],
      repo: ownerParts[1],
    });
    metricGitHubAppRateLimit(repoResponse.headers);
    return repoResponse.data;
  } catch (error) {
    const runnerNotFound = error instanceof RequestError && error.status === 404;
    if (!runnerNotFound) {
      throw error;
    }
    logger.info(`Runner '${ec2runner.instanceId}' with GitHub Runner ID '${runnerId}' not found on GitHub (404)`);
    return null;
  }
}
/**
 * Reports whether a GitHub runner is currently busy.
 *
 * @param client - Octokit client used for the lookup.
 * @param ec2runner - EC2 runner the GitHub runner belongs to.
 * @param runnerId - GitHub runner id to check.
 * @returns The runner's `busy` flag; `false` when the runner is not found on GitHub.
 */
async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise<boolean> {
  const runnerState = await getGitHubSelfHostedRunnerState(client, ec2runner, runnerId);
  if (runnerState !== null) {
    logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${runnerState.busy}`);
    return runnerState.busy;
  }
  // A runner that no longer exists on GitHub is treated as not busy.
  logger.info(
    `Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Not found on GitHub, treating as not busy`,
  );
  return false;
}
/**
 * Lists the self-hosted GitHub runners registered for the owner of the given runner.
 *
 * Results are cached per owner in `githubCache.runners`; on a miss the full list
 * is fetched with pagination (100 per page) at organization level for `Org`
 * runners and at repository level otherwise.
 *
 * @param runner - Runner whose `owner` is the cache key and whose `type` selects the endpoint.
 * @returns The cached or freshly fetched list of GitHub runners.
 */
async function listGitHubRunners(runner: RunnerInfo): Promise<GhRunners> {
  const cacheKey = runner.owner as string;
  const cachedList = githubCache.runners.get(cacheKey);
  if (cachedList) {
    logger.debug(`[listGithubRunners] Cache hit for ${cacheKey}`);
    return cachedList;
  }
  logger.debug(`[listGithubRunners] Cache miss for ${cacheKey}`);

  const octokit = await getOrCreateOctokit(runner);
  let fetchedRunners: GhRunners;
  if (runner.type === 'Org') {
    fetchedRunners = await octokit.paginate(octokit.actions.listSelfHostedRunnersForOrg, {
      org: runner.owner,
      per_page: 100,
    });
  } else {
    const ownerParts = runner.owner.split('/');
    fetchedRunners = await octokit.paginate(octokit.actions.listSelfHostedRunnersForRepo, {
      owner: ownerParts[0],
      repo: ownerParts[1],
      per_page: 100,
    });
  }

  githubCache.runners.set(cacheKey, fetchedRunners);
  logger.debug(`[listGithubRunners] Cache set for ${cacheKey}`);
  logger.debug(`[listGithubRunners] Runners: ${JSON.stringify(fetchedRunners)}`);
  return fetchedRunners;
}
/**
 * Checks whether a runner has been up longer than the configured minimum running time.
 *
 * The minimum is read from the `MINIMUM_RUNNING_TIME_IN_MINUTES` environment
 * variable and added, in minutes, to the runner's launch time.
 *
 * @param runner - Runner whose `launchTime` is evaluated.
 * @returns `true` when launch time plus the minimum lies before the current time.
 */
function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
  const minimumMinutes = process.env.MINIMUM_RUNNING_TIME_IN_MINUTES;
  const earliestRemoval = moment(runner.launchTime).utc().add(minimumMinutes, 'minutes');
  const currentTime = moment(new Date()).utc();
  // Compare the underlying timestamps, which is what `<` does on moment objects.
  return earliestRemoval.valueOf() < currentTime.valueOf();
}
/**
 * De-registers the GitHub runner(s) of an EC2 instance and terminates the instance
 * when none of them is busy.
 *
 * Runners flagged with `bypassRemoval` are skipped before any GitHub client is
 * requested. Errors raised by the busy checks, the de-registration or the
 * termination are logged and not rethrown; a failure to obtain the GitHub
 * client still propagates to the caller.
 *
 * @param ec2runner - EC2 runner instance to remove.
 * @param ghRunnerIds - Ids of the GitHub runners registered for that instance.
 */
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
  // The bypass flag is read through a RunnerList view of the runner. It is checked first so a
  // runner that will be skipped anyway never triggers a client lookup.
  const runnerList = ec2runner as unknown as RunnerList;
  if (runnerList.bypassRemoval) {
    logger.info(
      `Runner '${ec2runner.instanceId}' has bypass-removal tag set, skipping removal. Remove the tag to allow scale-down.`,
    );
    return;
  }

  const githubAppClient = await getOrCreateOctokit(ec2runner);

  // Resolves to true when none of the instance's GitHub runners reports busy.
  const allRunnersIdle = async (): Promise<boolean> => {
    const busyStates = await Promise.all(
      ghRunnerIds.map((ghRunnerId) => getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId)),
    );
    return busyStates.every((busy) => busy === false);
  };

  try {
    // Step 1: Check busy state as a fast-path to skip runners that are obviously busy.
    if (!(await allRunnersIdle())) {
      logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);
      return;
    }

    // Step 2: De-register the runner from GitHub. This prevents GitHub from assigning new jobs
    // to this runner, closing the race window where a job could be assigned between the busy
    // check above and the termination below.
    const statuses = await Promise.all(
      ghRunnerIds.map(async (ghRunnerId) => {
        const response =
          ec2runner.type === 'Org'
            ? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
                runner_id: ghRunnerId,
                org: ec2runner.owner,
              })
            : await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
                runner_id: ghRunnerId,
                owner: ec2runner.owner.split('/')[0],
                repo: ec2runner.owner.split('/')[1],
              });
        return response.status;
      }),
    );
    if (!statuses.every((status) => status === 204)) {
      logger.error(`Failed to de-register GitHub runner: ${statuses}`);
      return;
    }

    // Step 3: Re-check busy state after de-registration. A job may have been assigned between
    // step 1 and step 2. After de-registration no new jobs can be assigned, so this check is
    // now stable. If the runner is busy, the in-flight job will complete using its job-scoped
    // OAuth token (the runner worker uses credentials from the job message, not the runner
    // registration). We leave the instance running and it will be cleaned up as an orphan.
    if (await allRunnersIdle()) {
      await terminateRunner(ec2runner.instanceId);
      logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
    } else {
      logger.warn(
        `Runner '${ec2runner.instanceId}' became busy between idle check and de-registration. ` +
          `Skipping termination to allow the in-flight job to complete. ` +
          `The instance will be cleaned up as an orphan on a subsequent cycle.`,
      );
    }
  } catch (e) {
    logger.error(`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e}`, {
      error: e as Error,
    });
  }
}
/**
 * Walks all EC2 runners, grouped by owner, and decides per runner whether to keep it
 * idle, remove it, or tag it as an orphan.
 *
 * Each owner's runners are sorted with the comparator selected by the eviction
 * strategy and processed front to back. For a runner that has matching GitHub
 * runners (name ends with the instance id) and has exceeded the minimum running
 * time, the remaining idle count is consumed first; once it reaches zero the
 * runner is handed to `removeRunner`. A runner without matching GitHub runners
 * is tagged as orphan once its boot time is exceeded.
 *
 * @param ec2Runners - EC2 runners to evaluate.
 * @param scaleDownConfigs - Configs supplying the idle count and eviction strategy.
 */
async function evaluateAndRemoveRunners(
  ec2Runners: RunnerInfo[],
  scaleDownConfigs: ScalingDownConfig[],
): Promise<void> {
  let remainingIdleSlots = getIdleRunnerCount(scaleDownConfigs);
  const compareRunners =
    getEvictionStrategy(scaleDownConfigs) === 'oldest_first' ? oldestFirstStrategy : newestFirstStrategy;

  // Group runners by owner; owners keep the order of their first appearance.
  const runnersByOwner = new Map<RunnerInfo['owner'], RunnerInfo[]>();
  for (const runner of ec2Runners) {
    const group = runnersByOwner.get(runner.owner);
    if (group) {
      group.push(runner);
    } else {
      runnersByOwner.set(runner.owner, [runner]);
    }
  }

  for (const [ownerTag, ownerRunners] of runnersByOwner) {
    ownerRunners.sort(compareRunners);
    logger.debug(`Found: '${ownerRunners.length}' active GitHub runners with owner tag: '${ownerTag}'`);
    logger.debug(`Active GitHub runners with owner tag: '${ownerTag}': ${JSON.stringify(ownerRunners)}`);

    for (const ec2Runner of ownerRunners) {
      const ownerGhRunners = await listGitHubRunners(ec2Runner);
      const matchingGhRunners = ownerGhRunners.filter((ghRunner: { name: string }) =>
        ghRunner.name.endsWith(ec2Runner.instanceId),
      );
      logger.debug(
        `Found: '${matchingGhRunners.length}' GitHub runners for AWS runner instance: '${ec2Runner.instanceId}'`,
      );
      logger.debug(
        `GitHub runners for AWS runner instance: '${ec2Runner.instanceId}': ${JSON.stringify(matchingGhRunners)}`,
      );

      // No GitHub registration: either the instance is still booting or it is an orphan candidate.
      if (matchingGhRunners.length === 0) {
        if (bootTimeExceeded(ec2Runner)) {
          await markOrphan(ec2Runner.instanceId);
        } else {
          logger.debug(`Runner ${ec2Runner.instanceId} has not yet booted.`);
        }
        continue;
      }

      if (!runnerMinimumTimeExceeded(ec2Runner)) {
        continue;
      }

      if (remainingIdleSlots > 0) {
        remainingIdleSlots--;
        logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`);
        continue;
      }

      logger.info(`Terminating all non busy runners.`);
      await removeRunner(
        ec2Runner,
        matchingGhRunners.map((ghRunner: { id: number }) => ghRunner.id),
      );
    }
  }
}
/**
 * Tags an instance with `ghr:orphan=true`.
 * A tagging failure is logged and not rethrown.
 *
 * @param instanceId - Id of the instance to tag.
 */
async function markOrphan(instanceId: string): Promise<void> {
  const orphanTags = [{ Key: 'ghr:orphan', Value: 'true' }];
  try {
    await tag(instanceId, orphanTags);
    logger.info(`Runner '${instanceId}' tagged as orphan.`);
  } catch (tagError) {
    logger.error(`Failed to tag runner '${instanceId}' as orphan.`, { error: tagError });
  }
}
/**
 * Removes the `ghr:orphan=true` tag from an instance.
 * An un-tagging failure is logged and not rethrown.
 *
 * @param instanceId - Id of the instance to un-tag.
 */
async function unMarkOrphan(instanceId: string): Promise<void> {
  const orphanTags = [{ Key: 'ghr:orphan', Value: 'true' }];
  try {
    await untag(instanceId, orphanTags);
    logger.info(`Runner '${instanceId}' untagged as orphan.`);
  } catch (untagError) {
    logger.error(`Failed to un-tag runner '${instanceId}' as orphan.`, { error: untagError });
  }
}
/**
 * Final check against GitHub before an orphan-tagged runner is terminated.
 *
 * The runner counts as orphaned when GitHub no longer knows it, or when GitHub
 * reports it as both 'offline' and busy. The runner id is parsed from
 * `runner.runnerId`, falling back to '0' when it is empty.
 *
 * @param runner - Orphan-tagged runner to verify.
 * @returns `true` when the runner is judged to be orphaned.
 */
async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise<boolean> {
  const ec2Instance = runner as RunnerInfo;
  const client = await getOrCreateOctokit(ec2Instance);
  const runnerId = parseInt(runner.runnerId || '0');
  const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId);

  // Start from "orphaned" and clear the verdict only for a runner GitHub still knows
  // that is not in the offline-and-busy state.
  let isOrphan = true;
  if (state === null) {
    logger.debug(`Runner '${runner.instanceId}' not found on GitHub, treating as orphaned.`);
  } else {
    logger.debug(
      `Runner '${runner.instanceId}' is '${state.status}' and is currently '${state.busy ? 'busy' : 'idle'}'.`,
    );
    if (state.status !== 'offline' || !state.busy) {
      isOrphan = false;
    }
  }

  logger.info(`Runner '${runner.instanceId}' is judged to ${isOrphan ? 'be' : 'not be'} orphaned.`);
  return isOrphan;
}
/**
 * Terminates EC2 runners that are tagged as orphan in the given environment.
 *
 * Runners with a `runnerId` get a last-chance check against GitHub first: if they
 * are not judged orphaned the orphan tag is removed instead. Runners without a
 * `runnerId` are terminated directly.
 *
 * Each runner is handled in isolation: a failure while verifying or terminating
 * one runner is logged and the loop moves on, so one bad runner cannot stop the
 * remaining orphans from being cleaned up. A runner whose verification fails is
 * left running. Nothing is rethrown to the caller.
 *
 * @param environment - Environment whose orphan-tagged runners are processed.
 */
async function terminateOrphan(environment: string): Promise<void> {
  try {
    const orphanRunners = await listEC2Runners({ environment, orphan: true });

    for (const runner of orphanRunners) {
      // Verification step, only possible when the GitHub runner id is known.
      if (runner.runnerId) {
        try {
          const isOrphan = await lastChanceCheckOrphanRunner(runner);
          if (!isOrphan) {
            await unMarkOrphan(runner.instanceId);
            continue;
          }
        } catch (e) {
          // Without a verdict from GitHub the instance is left alone for this cycle.
          logger.error(`Failed to verify orphan runner '${runner.instanceId}'`, { error: e });
          continue;
        }
      }

      logger.info(`Terminating orphan runner '${runner.instanceId}'`);
      try {
        await terminateRunner(runner.instanceId);
      } catch (e) {
        logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e });
      }
    }
  } catch (e) {
    // Reached when listing the orphan runners itself fails.
    logger.warn(`Failure during orphan termination processing.`, { error: e });
  }
}
/**
 * Sort comparator for the 'oldest_first' eviction strategy.
 *
 * A runner with an earlier `launchTime` sorts AFTER one with a later
 * `launchTime`, so the sorted array has the newest runner first and the oldest
 * last. Runners without a `launchTime` sort after all runners that have one, and
 * two such runners compare as equal. This keeps the comparator consistent:
 * compare(a, b) and compare(b, a) always have opposite signs, or are both zero.
 *
 * @param a - First runner.
 * @param b - Second runner.
 * @returns 1 when `a` sorts after `b`, -1 when before, 0 when equal.
 */
export function oldestFirstStrategy(a: RunnerInfo, b: RunnerInfo): number {
  if (a.launchTime === undefined || b.launchTime === undefined) {
    // Equal only when both are undefined.
    if (a.launchTime === b.launchTime) return 0;
    return a.launchTime === undefined ? 1 : -1;
  }
  if (a.launchTime < b.launchTime) return 1;
  if (a.launchTime > b.launchTime) return -1;
  return 0;
}
/**
 * Sort comparator for the 'newest_first' eviction strategy: the negated result
 * of `oldestFirstStrategy` for the same pair.
 *
 * @param a - First runner.
 * @param b - Second runner.
 * @returns The result of `oldestFirstStrategy(a, b)` with its sign flipped.
 */
export function newestFirstStrategy(a: RunnerInfo, b: RunnerInfo): number {
  const oldestFirstOrder = oldestFirstStrategy(a, b);
  return -oldestFirstOrder;
}
/**
 * Lists the EC2 runners of the given environment via `listEC2Runners`.
 *
 * @param environment - Environment to list runners for.
 * @returns The runners returned by `listEC2Runners`.
 */
async function listRunners(environment: string) {
  const environmentRunners = await listEC2Runners({ environment });
  return environmentRunners;
}
/**
 * Keeps only the runners that have a `type` and are not flagged as orphan.
 *
 * @param ec2runners - Runners to filter; the input array is not modified.
 * @returns A new array with the matching runners, in their original order.
 */
function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] {
  const selected: RunnerList[] = [];
  for (const candidate of ec2runners) {
    if (candidate.type && !candidate.orphan) {
      selected.push(candidate);
    }
  }
  return selected as RunnerInfo[];
}
/**
 * Entry point of a scale-down run.
 *
 * Resets the GitHub cache, then reads the environment name from `ENVIRONMENT`
 * and the scale-down configuration (JSON) from `SCALE_DOWN_CONFIG`. It first
 * terminates runners already tagged as orphan, then lists the environment's EC2
 * runners, drops those without a type or flagged as orphan, and evaluates the
 * rest for idle retention, removal or orphan tagging. The runner count is logged
 * before and after the clean-up.
 */
export async function scaleDown(): Promise<void> {
  githubCache.reset();
  const environment = process.env.ENVIRONMENT;
  // The config is a list of scale-down configs, so it is typed as an array rather than a one-element tuple.
  const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as ScalingDownConfig[];

  // First, handle the runners that are already marked as orphan.
  await terminateOrphan(environment);

  // Next, scale down idle runners with respect to the config and mark potential orphans.
  const ec2Runners = await listRunners(environment);
  const activeEc2RunnersCount = ec2Runners.length;
  logger.info(`Found: '${activeEc2RunnersCount}' active GitHub EC2 runner instances before clean-up.`);
  logger.debug(`Active GitHub EC2 runner instances: ${JSON.stringify(ec2Runners)}`);

  if (activeEc2RunnersCount === 0) {
    logger.debug(`No active runners found for environment: '${environment}'`);
    return;
  }

  const runners = filterRunners(ec2Runners);
  await evaluateAndRemoveRunners(runners, scaleDownConfigs);

  // Listed again only to report how many instances remain.
  const activeEc2RunnersCountAfter = (await listRunners(environment)).length;
  logger.info(`Found: '${activeEc2RunnersCountAfter}' active GitHub EC2 runners instances after clean-up.`);
}