Skip to content

Commit c0a9766

Browse files
authored
feat: add bypass-removal tag to prevent runner scale-down (#4995)
Add support for the ghr:bypass-removal EC2 tag, which allows engineers to manually tag runners to prevent them from being scaled down during debugging or investigation. When this tag is set to 'true', the runner is skipped during scale-down operations and a message is logged. Tested on our staging environment and things are working: <img width="967" height="102" alt="image" src="https://github.com/user-attachments/assets/2de96e22-4ec7-43ba-aace-6152f4a5fdb8" /> This small change makes debugging issues on a runner significantly easier...
1 parent 788570c commit c0a9766

File tree

6 files changed

+61
-0
lines changed

6 files changed

+61
-0
lines changed

docs/additional_notes.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,31 @@ If default labels are removed:
3232
| 'custom5' | Linux | no match |
3333
| 'custom5' | [ self-hosted, Linux ] | no match |
3434
| 'custom5' | [ custom5, self-hosted, Linux ] | no match |
35+
36+
# Preventing Runner Scale-Down for Debugging
37+
38+
The module supports a bypass mechanism that allows you to prevent specific runners from being scaled down during debugging or investigation. This is useful when you need to access a runner instance directly to troubleshoot issues.
39+
40+
## Usage
41+
42+
To prevent a runner from being terminated during scale-down operations, add the `ghr:bypass-removal` tag to the EC2 instance with a value of `true`:
43+
44+
```bash
45+
aws ec2 create-tags --resources <instance-id> --tags Key=ghr:bypass-removal,Value=true
46+
```
47+
48+
When this tag is set, the scale-down process will skip the runner and log a message indicating that the runner is protected:
49+
50+
```
51+
Runner 'i-xxxxxxxxxxxx' has bypass-removal tag set, skipping removal. Remove the tag to allow scale-down.
52+
```
53+
54+
## Removing the Protection
55+
56+
Once you've finished debugging and want to allow the runner to be scaled down normally, remove the tag (shown below) or set it to any value other than `true`:
57+
58+
```bash
59+
aws ec2 delete-tags --resources <instance-id> --tags Key=ghr:bypass-removal
60+
```
61+
62+
**Note:** The bypass-removal tag only prevents automatic scale-down. The runner will still continue to process job(s) as normal. Make sure to remove the tag after debugging to ensure proper resource management. An ephemeral runner will also still terminate itself once its job is complete.

lambdas/functions/control-plane/src/aws/runners.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ export interface RunnerList {
1111
org?: string;
1212
orphan?: boolean;
1313
runnerId?: string;
14+
bypassRemoval?: boolean;
1415
}
1516

1617
export interface RunnerInfo {

lambdas/functions/control-plane/src/aws/runners.test.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ describe('list instances', () => {
9191
type: 'Org',
9292
owner: 'CoderToCat',
9393
orphan: false,
94+
bypassRemoval: false,
9495
});
9596
});
9697

@@ -105,6 +106,7 @@ describe('list instances', () => {
105106
owner: 'CoderToCat',
106107
orphan: false,
107108
runnerId: '9876543210',
109+
bypassRemoval: false,
108110
});
109111
});
110112

@@ -124,6 +126,7 @@ describe('list instances', () => {
124126
type: 'Org',
125127
owner: 'CoderToCat',
126128
orphan: true,
129+
bypassRemoval: false,
127130
});
128131
});
129132

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) {
9393
org: i.Tags?.find((e) => e.Key === 'ghr:Org')?.Value as string,
9494
orphan: i.Tags?.find((e) => e.Key === 'ghr:orphan')?.Value === 'true',
9595
runnerId: i.Tags?.find((e) => e.Key === 'ghr:github_runner_id')?.Value as string,
96+
bypassRemoval: i.Tags?.find((e) => e.Key === 'ghr:bypass-removal')?.Value === 'true',
9697
});
9798
}
9899
}

lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,25 @@ describe('Scale down runners', () => {
286286
checkNonTerminated(runners);
287287
});
288288

289+
it(`Should not terminate runner with bypass-removal tag set.`, async () => {
290+
// setup
291+
const runners = [
292+
createRunnerTestData('idle-with-bypass', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 10, true, false, false),
293+
];
294+
// Set bypass-removal tag
295+
runners[0].bypassRemoval = true;
296+
297+
mockGitHubRunners(runners);
298+
mockAwsRunners(runners);
299+
300+
// act
301+
await scaleDown();
302+
303+
// assert
304+
expect(terminateRunner).not.toHaveBeenCalled();
305+
checkNonTerminated(runners);
306+
});
307+
289308
it(`Should not terminate a runner that became busy just before deregister runner.`, async () => {
290309
// setup
291310
const runners = [
@@ -813,5 +832,6 @@ function createRunnerTestData(
813832
orphan,
814833
shouldBeTerminated,
815834
runnerId: runnerId !== undefined ? String(runnerId) : undefined,
835+
bypassRemoval: false,
816836
};
817837
}

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,14 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
130130
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
131131
const githubAppClient = await getOrCreateOctokit(ec2runner);
132132
try {
133+
const runnerList = ec2runner as unknown as RunnerList;
134+
if (runnerList.bypassRemoval) {
135+
logger.info(
136+
`Runner '${ec2runner.instanceId}' has bypass-removal tag set, skipping removal. Remove the tag to allow scale-down.`,
137+
);
138+
return;
139+
}
140+
133141
const states = await Promise.all(
134142
ghRunnerIds.map(async (ghRunnerId) => {
135143
// Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition.

0 commit comments

Comments
 (0)