-
Notifications
You must be signed in to change notification settings - Fork 722
Expand file tree
/
Copy pathscale-down.ts
More file actions
371 lines (333 loc) · 13.9 KB
/
scale-down.ts
File metadata and controls
371 lines (333 loc) · 13.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
import { Octokit } from '@octokit/rest';
import { Endpoints } from '@octokit/types';
import { RequestError } from '@octokit/request-error';
import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
import moment from 'moment';
import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth';
import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from './../aws/runners';
import { RunnerInfo, RunnerList } from './../aws/runners.d';
import { GhRunners, githubCache } from './cache';
import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config';
import { metricGitHubAppRateLimit } from '../github/rate-limit';
import { getGitHubEnterpriseApiUrl } from './scale-up';
// Module-level logger, created as a child logger named 'scale-down'.
const logger = createChildLogger('scale-down');
// Runner list shapes taken from the Octokit endpoint typings for the
// organization-level and repository-level "list self-hosted runners" responses.
type OrgRunnerList = Endpoints['GET /orgs/{org}/actions/runners']['response']['data']['runners'];
type RepoRunnerList = Endpoints['GET /repos/{owner}/{repo}/actions/runners']['response']['data']['runners'];
// A single runner entry from either of the two lists above.
type RunnerState = OrgRunnerList[number] | RepoRunnerList[number];
/**
 * Returns an Octokit client for the owner of the given runner.
 *
 * Clients are cached in `githubCache.clients`, keyed by `runner.owner`. On a
 * cache miss, a client authenticated as the GitHub App is used to look up the
 * installation id for the organization (runner type 'Org') or for the
 * `owner/repo` pair (any other type); a second client is then created with the
 * installation token, cached, and returned.
 *
 * @param runner - Runner whose `owner` and `type` select the installation.
 * @returns An Octokit client created from the installation auth token.
 */
async function getOrCreateOctokit(runner: RunnerInfo): Promise<Octokit> {
  const cacheKey = runner.owner;
  const existingClient = githubCache.clients.get(cacheKey);
  if (existingClient) {
    logger.debug(`[createGitHubClientForRunner] Cache hit for ${cacheKey}`);
    return existingClient;
  }
  logger.debug(`[createGitHubClientForRunner] Cache miss for ${cacheKey}`);

  const { ghesApiUrl } = getGitHubEnterpriseApiUrl();

  // Step 1: authenticate as the app itself to resolve the installation.
  const appAuth = await createGithubAppAuth(undefined, ghesApiUrl);
  const appClient = await createOctokitClient(appAuth.token, ghesApiUrl);

  let installationId;
  if (runner.type === 'Org') {
    const installation = await appClient.apps.getOrgInstallation({
      org: runner.owner,
    });
    installationId = installation.data.id;
  } else {
    // For non-org runners the owner is split on '/' into owner and repo.
    const [owner, repo] = runner.owner.split('/');
    const installation = await appClient.apps.getRepoInstallation({
      owner,
      repo,
    });
    installationId = installation.data.id;
  }

  // Step 2: authenticate as the installation and cache the resulting client.
  const installationAuth = await createGithubInstallationAuth(installationId, ghesApiUrl);
  const installationClient = await createOctokitClient(installationAuth.token, ghesApiUrl);
  githubCache.clients.set(cacheKey, installationClient);
  return installationClient;
}
/**
 * Fetches the GitHub-side state of a single self-hosted runner.
 *
 * Uses the organization endpoint when the runner type is 'Org', otherwise the
 * repository endpoint with `owner` split on '/'. The response headers are
 * passed to `metricGitHubAppRateLimit`.
 *
 * @param client - Octokit client used for the request.
 * @param ec2runner - EC2 runner supplying `type`, `owner` and `instanceId`.
 * @param runnerId - GitHub runner id to look up.
 * @returns The runner state, or `null` when GitHub answers with a 404.
 * @throws Any error other than a `RequestError` with status 404 is re-thrown.
 */
async function getGitHubSelfHostedRunnerState(
  client: Octokit,
  ec2runner: RunnerInfo,
  runnerId: number,
): Promise<RunnerState | null> {
  try {
    let response;
    if (ec2runner.type === 'Org') {
      response = await client.actions.getSelfHostedRunnerForOrg({
        runner_id: runnerId,
        org: ec2runner.owner,
      });
    } else {
      const [owner, repo] = ec2runner.owner.split('/');
      response = await client.actions.getSelfHostedRunnerForRepo({
        runner_id: runnerId,
        owner,
        repo,
      });
    }
    metricGitHubAppRateLimit(response.headers);
    return response.data;
  } catch (error) {
    const runnerMissing = error instanceof RequestError && error.status === 404;
    if (!runnerMissing) {
      throw error;
    }
    logger.info(`Runner '${ec2runner.instanceId}' with GitHub Runner ID '${runnerId}' not found on GitHub (404)`);
    return null;
  }
}
/**
 * Reports whether GitHub considers the given runner busy.
 *
 * @param client - Octokit client used for the lookup.
 * @param ec2runner - EC2 runner the GitHub runner belongs to.
 * @param runnerId - GitHub runner id to check.
 * @returns The runner's `busy` flag; `false` when the runner is not found on GitHub.
 */
async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise<boolean> {
  const runnerState = await getGitHubSelfHostedRunnerState(client, ec2runner, runnerId);

  if (runnerState !== null) {
    logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${runnerState.busy}`);
    return runnerState.busy;
  }

  // A runner that is unknown to GitHub cannot be executing a job.
  logger.info(
    `Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Not found on GitHub, treating as not busy`,
  );
  return false;
}
/**
 * Lists all self-hosted runners registered on GitHub for the runner's owner.
 *
 * Results are cached in `githubCache.runners`, keyed by `runner.owner`. On a
 * cache miss the list is fetched via `client.paginate` with 100 entries per
 * page, from the organization endpoint for type 'Org' and from the repository
 * endpoint otherwise.
 *
 * @param runner - Runner whose `owner` and `type` select what to list.
 * @returns The cached or freshly fetched list of GitHub runners.
 */
async function listGitHubRunners(runner: RunnerInfo): Promise<GhRunners> {
  const cacheKey = runner.owner as string;

  const cachedList = githubCache.runners.get(cacheKey);
  if (cachedList) {
    logger.debug(`[listGithubRunners] Cache hit for ${cacheKey}`);
    return cachedList;
  }
  logger.debug(`[listGithubRunners] Cache miss for ${cacheKey}`);

  const client = await getOrCreateOctokit(runner);
  let fetched;
  if (runner.type !== 'Org') {
    const [owner, repo] = runner.owner.split('/');
    fetched = await client.paginate(client.actions.listSelfHostedRunnersForRepo, {
      owner,
      repo,
      per_page: 100,
    });
  } else {
    fetched = await client.paginate(client.actions.listSelfHostedRunnersForOrg, {
      org: runner.owner,
      per_page: 100,
    });
  }

  githubCache.runners.set(cacheKey, fetched);
  logger.debug(`[listGithubRunners] Cache set for ${cacheKey}`);
  logger.debug(`[listGithubRunners] Runners: ${JSON.stringify(fetched)}`);
  return fetched;
}
/**
 * Checks whether a runner has been up longer than the configured minimum.
 *
 * The minimum is read from the `MINIMUM_RUNNING_TIME_IN_MINUTES` environment
 * variable and added to the runner's launch time.
 *
 * @param runner - Runner whose `launchTime` is evaluated.
 * @returns `true` when launch time plus the minimum lies before the current time.
 */
function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
  const earliestRemovalTime = moment(runner.launchTime)
    .utc()
    .add(process.env.MINIMUM_RUNNING_TIME_IN_MINUTES, 'minutes');
  const currentTime = moment(new Date()).utc();
  return earliestRemovalTime < currentTime;
}
/**
 * De-registers one self-hosted runner from GitHub.
 *
 * Uses the organization endpoint for type 'Org' and the repository endpoint
 * otherwise. Failures are logged and reported in the result rather than thrown.
 *
 * @param githubInstallationClient - Octokit client used for the delete call.
 * @param ec2runner - EC2 runner supplying `type`, `owner` and `instanceId`.
 * @param ghRunnerId - GitHub runner id to delete.
 * @returns The runner id, the HTTP status (0 when the call threw) and
 *   `success`, which is `true` only for status 204.
 */
async function deleteGitHubRunner(
  githubInstallationClient: Octokit,
  ec2runner: RunnerInfo,
  ghRunnerId: number,
): Promise<{ ghRunnerId: number; status: number; success: boolean }> {
  try {
    let status;
    if (ec2runner.type === 'Org') {
      const orgResponse = await githubInstallationClient.actions.deleteSelfHostedRunnerFromOrg({
        runner_id: ghRunnerId,
        org: ec2runner.owner,
      });
      status = orgResponse.status;
    } else {
      const [owner, repo] = ec2runner.owner.split('/');
      const repoResponse = await githubInstallationClient.actions.deleteSelfHostedRunnerFromRepo({
        runner_id: ghRunnerId,
        owner,
        repo,
      });
      status = repoResponse.status;
    }
    return { ghRunnerId, status, success: status === 204 };
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.error(
      `Failed to de-register GitHub runner ${ghRunnerId} for instance '${ec2runner.instanceId}'. ` +
        `Error: ${reason}`,
      { error },
    );
    return { ghRunnerId, status: 0, success: false };
  }
}
/**
 * De-registers the GitHub runners of an EC2 instance and terminates the instance.
 *
 * Nothing is removed when the instance carries the bypass-removal flag or when
 * any of its GitHub runners is busy. The instance is terminated only if every
 * GitHub runner was de-registered successfully; otherwise it is left running.
 * Errors raised inside the removal steps are logged and not re-thrown.
 *
 * @param ec2runner - EC2 runner instance to remove.
 * @param ghRunnerIds - Ids of the GitHub runners that belong to the instance.
 */
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
  const client = await getOrCreateOctokit(ec2runner);
  const instanceId = ec2runner.instanceId;

  try {
    const { bypassRemoval } = ec2runner as unknown as RunnerList;
    if (bypassRemoval) {
      logger.info(
        `Runner '${instanceId}' has bypass-removal tag set, skipping removal. Remove the tag to allow scale-down.`,
      );
      return;
    }

    // Query the busy state per runner right before removal instead of relying on
    // the output of listGitHubRunners(...), to minimize the race condition.
    const busyFlags = await Promise.all(ghRunnerIds.map((id) => getGitHubRunnerBusyState(client, ec2runner, id)));
    const anyBusy = busyFlags.some((busy) => busy !== false);
    if (anyBusy) {
      logger.info(`Runner '${instanceId}' cannot be de-registered, because it is still busy.`);
      return;
    }

    const outcomes = await Promise.all(ghRunnerIds.map((id) => deleteGitHubRunner(client, ec2runner, id)));
    const failures = outcomes.filter((outcome) => !outcome.success);
    if (failures.length > 0) {
      // Only terminate EC2 if de-registration from GitHub succeeded; otherwise
      // leave the instance running so the next scale-down cycle can retry.
      logger.error(
        `Failed to de-register ${failures.length} GitHub runner(s) for instance '${instanceId}'. ` +
          `Instance will NOT be terminated to allow retry on next scale-down cycle. ` +
          `Failed runner IDs: ${failures.map((outcome) => outcome.ghRunnerId).join(', ')}`,
      );
      return;
    }

    await terminateRunner(instanceId);
    logger.info(`AWS runner instance '${instanceId}' is terminated and GitHub runner is de-registered.`);
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    logger.error(`Runner '${instanceId}' cannot be de-registered. Error: ${reason}`, { error: e });
  }
}
/**
 * Walks all EC2 runners, grouped by owner, and decides per instance whether to
 * keep it idle, remove it, or mark it as an orphan.
 *
 * Per owner the runners are sorted with the comparator selected by the eviction
 * strategy. For each instance the GitHub runners whose name ends with the
 * instance id are matched:
 * - with matches, once the minimum running time is exceeded, the instance is
 *   kept idle while the idle budget lasts and removed otherwise;
 * - without matches, the instance is marked as orphan once its boot time is exceeded.
 *
 * The idle budget is a single counter shared across all owners.
 *
 * @param ec2Runners - EC2 runners to evaluate.
 * @param scaleDownConfigs - Configuration supplying idle count and eviction strategy.
 */
async function evaluateAndRemoveRunners(
  ec2Runners: RunnerInfo[],
  scaleDownConfigs: ScalingDownConfig[],
): Promise<void> {
  let remainingIdleSlots = getIdleRunnerCount(scaleDownConfigs);
  const evictionStrategy = getEvictionStrategy(scaleDownConfigs);
  const compareRunners = evictionStrategy === 'oldest_first' ? oldestFirstStrategy : newestFirstStrategy;
  const owners = new Set(ec2Runners.map((candidate) => candidate.owner));

  for (const ownerTag of owners) {
    const runnersOfOwner = ec2Runners.filter((candidate) => candidate.owner === ownerTag).sort(compareRunners);
    logger.debug(`Found: '${runnersOfOwner.length}' active GitHub runners with owner tag: '${ownerTag}'`);
    logger.debug(`Active GitHub runners with owner tag: '${ownerTag}': ${JSON.stringify(runnersOfOwner)}`);

    for (const ec2Runner of runnersOfOwner) {
      const allGhRunners = await listGitHubRunners(ec2Runner);
      const matchingGhRunners = allGhRunners.filter((ghRunner: { name: string }) =>
        ghRunner.name.endsWith(ec2Runner.instanceId),
      );
      logger.debug(
        `Found: '${matchingGhRunners.length}' GitHub runners for AWS runner instance: '${ec2Runner.instanceId}'`,
      );
      logger.debug(
        `GitHub runners for AWS runner instance: '${ec2Runner.instanceId}': ${JSON.stringify(matchingGhRunners)}`,
      );

      // No GitHub registration: either still booting or a candidate orphan.
      if (!matchingGhRunners.length) {
        if (bootTimeExceeded(ec2Runner)) {
          await markOrphan(ec2Runner.instanceId);
        } else {
          logger.debug(`Runner ${ec2Runner.instanceId} has not yet booted.`);
        }
        continue;
      }

      if (!runnerMinimumTimeExceeded(ec2Runner)) {
        continue;
      }

      if (remainingIdleSlots > 0) {
        remainingIdleSlots--;
        logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`);
        continue;
      }

      logger.info(`Terminating all non busy runners.`);
      const ghRunnerIds = matchingGhRunners.map((ghRunner: { id: number }) => ghRunner.id);
      await removeRunner(ec2Runner, ghRunnerIds);
    }
  }
}
/**
 * Tags an instance with `ghr:orphan=true`.
 *
 * A tagging failure is logged and not re-thrown.
 *
 * @param instanceId - Id of the instance to tag.
 */
async function markOrphan(instanceId: string): Promise<void> {
  const orphanTags = [{ Key: 'ghr:orphan', Value: 'true' }];
  try {
    await tag(instanceId, orphanTags);
    logger.info(`Runner '${instanceId}' tagged as orphan.`);
  } catch (error) {
    logger.error(`Failed to tag runner '${instanceId}' as orphan.`, { error });
  }
}
/**
 * Removes the `ghr:orphan=true` tag from an instance.
 *
 * An un-tagging failure is logged and not re-thrown.
 *
 * @param instanceId - Id of the instance to un-tag.
 */
async function unMarkOrphan(instanceId: string): Promise<void> {
  const orphanTags = [{ Key: 'ghr:orphan', Value: 'true' }];
  try {
    await untag(instanceId, orphanTags);
    logger.info(`Runner '${instanceId}' untagged as orphan.`);
  } catch (error) {
    logger.error(`Failed to un-tag runner '${instanceId}' as orphan.`, { error });
  }
}
/**
 * Re-checks against GitHub whether a runner tagged as orphan really is one.
 *
 * The runner counts as orphaned when GitHub does not know it, or when GitHub
 * reports it as both 'offline' and busy.
 *
 * @param runner - Runner to check; its `runnerId` is parsed as the GitHub
 *   runner id, with '0' used when it is empty.
 * @returns `true` when the runner is judged to be orphaned.
 */
async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise<boolean> {
  const ec2Instance = runner as RunnerInfo;
  const client = await getOrCreateOctokit(ec2Instance);
  const ghRunnerId = parseInt(runner.runnerId || '0');
  const ghState = await getGitHubSelfHostedRunnerState(client, ec2Instance, ghRunnerId);

  let isOrphan = false;
  if (ghState !== null) {
    logger.debug(
      `Runner '${runner.instanceId}' is '${ghState.status}' and is currently '${ghState.busy ? 'busy' : 'idle'}'.`,
    );
    if (ghState.status === 'offline' && ghState.busy) {
      isOrphan = true;
    }
  } else {
    logger.debug(`Runner '${runner.instanceId}' not found on GitHub, treating as orphaned.`);
    isOrphan = true;
  }

  logger.info(`Runner '${runner.instanceId}' is judged to ${isOrphan ? 'be' : 'not be'} orphaned.`);
  return isOrphan;
}
/**
 * Terminates the instances of an environment that are tagged as orphan.
 *
 * Runners with a `runnerId` get a final check against GitHub first: confirmed
 * orphans are terminated, all others have their orphan tag removed. Runners
 * without a `runnerId` are terminated directly.
 *
 * Failures are isolated per runner: an error while verifying or terminating
 * one runner is logged and processing continues with the next runner, so one
 * failing runner cannot prevent the remaining orphans from being cleaned up.
 * A failure to list the orphan runners is logged as a warning. This function
 * does not re-throw.
 *
 * @param environment - Environment whose orphan runners are processed.
 */
async function terminateOrphan(environment: string): Promise<void> {
  try {
    const orphanRunners = await listEC2Runners({ environment, orphan: true });

    for (const runner of orphanRunners) {
      if (runner.runnerId) {
        let isOrphan: boolean;
        try {
          isOrphan = await lastChanceCheckOrphanRunner(runner);
        } catch (e) {
          // Without a verdict the runner is neither terminated nor un-tagged;
          // it keeps its orphan tag and is re-evaluated on the next cycle.
          logger.error(`Failed to verify orphan state of runner '${runner.instanceId}'`, { error: e });
          continue;
        }
        if (!isOrphan) {
          await unMarkOrphan(runner.instanceId);
          continue;
        }
      } else {
        logger.info(`Terminating orphan runner '${runner.instanceId}'`);
      }

      try {
        await terminateRunner(runner.instanceId);
      } catch (e) {
        logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e });
      }
    }
  } catch (e) {
    logger.warn(`Failure during orphan termination processing.`, { error: e });
  }
}
/**
 * Sort comparator used for the 'oldest_first' eviction strategy.
 *
 * Orders runners by descending launch time: the most recently launched runner
 * comes first and the oldest runner comes last. Runners without a launch time
 * are ordered after all runners that have one.
 *
 * The comparator is consistent, as `Array.prototype.sort` requires: two runners
 * without a launch time compare as equal, and swapping the arguments always
 * flips the sign of the result.
 *
 * @param a - First runner.
 * @param b - Second runner.
 * @returns A positive number when `a` sorts after `b`, a negative number when
 *   `a` sorts before `b`, and 0 when they are equivalent.
 */
export function oldestFirstStrategy(a: RunnerInfo, b: RunnerInfo): number {
  const aLaunch = a.launchTime;
  const bLaunch = b.launchTime;

  if (aLaunch === undefined || bLaunch === undefined) {
    if (aLaunch === bLaunch) return 0; // both unknown
    return aLaunch === undefined ? 1 : -1; // the unknown one goes last
  }

  if (aLaunch < bLaunch) return 1;
  if (aLaunch > bLaunch) return -1;
  return 0;
}
/**
 * Sort comparator that yields the reverse order of `oldestFirstStrategy`.
 *
 * @param a - First runner.
 * @param b - Second runner.
 * @returns The negated result of `oldestFirstStrategy(a, b)`.
 */
export function newestFirstStrategy(a: RunnerInfo, b: RunnerInfo): number {
  const oldestFirstOrder = oldestFirstStrategy(a, b);
  return -oldestFirstOrder;
}
/**
 * Lists the EC2 runners of an environment.
 *
 * @param environment - Environment passed as the filter to `listEC2Runners`.
 * @returns The runners returned by `listEC2Runners`.
 */
async function listRunners(environment: string) {
  const filters = { environment };
  const runners = await listEC2Runners(filters);
  return runners;
}
/**
 * Keeps only the runners that have a `type` and are not flagged as orphan.
 *
 * @param ec2runners - Runners to filter.
 * @returns The matching runners, in their original order.
 */
function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] {
  const eligible = ec2runners.filter((candidate) => {
    if (!candidate.type) {
      return false;
    }
    return !candidate.orphan;
  });
  return eligible as RunnerInfo[];
}
/**
 * Entry point of a scale-down cycle.
 *
 * Resets the GitHub cache, reads `ENVIRONMENT` and `SCALE_DOWN_CONFIG` from the
 * process environment, terminates runners already tagged as orphan, and then
 * evaluates the remaining runners for removal. Runner counts are logged before
 * and after the clean-up. The cycle ends early when no runners are found.
 */
export async function scaleDown(): Promise<void> {
  githubCache.reset();

  const environment = process.env.ENVIRONMENT;
  const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig];

  // Phase 1: handle runners that were marked as orphan earlier.
  await terminateOrphan(environment);

  // Phase 2: scale down idle runners according to the config and mark potential orphans.
  const instancesBefore = await listRunners(environment);
  logger.info(`Found: '${instancesBefore.length}' active GitHub EC2 runner instances before clean-up.`);
  logger.debug(`Active GitHub EC2 runner instances: ${JSON.stringify(instancesBefore)}`);

  if (instancesBefore.length === 0) {
    logger.debug(`No active runners found for environment: '${environment}'`);
    return;
  }

  await evaluateAndRemoveRunners(filterRunners(instancesBefore), scaleDownConfigs);

  const instancesAfter = await listRunners(environment);
  logger.info(`Found: '${instancesAfter.length}' active GitHub EC2 runners instances after clean-up.`);
}