Skip to content

Commit 03ce697

Browse files
jensenbox and claude committed
C-241 Add EventBridge rule for all EC2 terminations
The existing spot-specific rules (BidEvictedEvent, Spot Interruption Warning) only fire on AWS spot reclamations. Scale-down terminations and manual terminations — the most common causes of stale runners — were not covered.

Add an EC2 Instance State-change Notification rule (state: shutting-down) that catches ALL termination types. Reuses the same notification Lambda since both event types have detail['instance-id']. Gated behind enable_runner_deregistration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 56127d9 commit 03ce697

File tree

3 files changed

+45
-13
lines changed

3 files changed

+45
-13
lines changed

lambdas/functions/termination-watcher/src/deregister.ts

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,7 @@ export function createThrottleOptions() {
2525

2626
async function getAppCredentials(): Promise<{ appId: number; privateKey: string }> {
2727
const appId = parseInt(await getParameter(process.env.PARAMETER_GITHUB_APP_ID_NAME!));
28-
const privateKey = Buffer.from(
29-
await getParameter(process.env.PARAMETER_GITHUB_APP_KEY_BASE64_NAME!),
30-
'base64',
31-
)
28+
const privateKey = Buffer.from(await getParameter(process.env.PARAMETER_GITHUB_APP_KEY_BASE64_NAME!), 'base64')
3229
.toString()
3330
.replace('/[\\n]/g', String.fromCharCode(10));
3431
return { appId, privateKey };
@@ -142,12 +139,7 @@ async function findRunnerByInstanceId(
142139
return undefined;
143140
}
144141

145-
async function deleteRunner(
146-
octokit: Octokit,
147-
owner: string,
148-
runnerId: number,
149-
runnerType: string,
150-
): Promise<void> {
142+
async function deleteRunner(octokit: Octokit, owner: string, runnerId: number, runnerType: string): Promise<void> {
151143
if (runnerType === 'Repo') {
152144
const [repoOwner, repo] = owner.split('/');
153145
await octokit.actions.deleteSelfHostedRunnerFromRepo({

lambdas/vitest.base.config.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ const defaultConfig = defineConfig({
99
include: ['**/src/**/*.ts'],
1010
exclude: ['**/*local*.ts', '**/*.d.ts', '**/*.test.ts', '**/node_modules/**'],
1111
all: true,
12-
reportsDirectory: './coverage'
12+
reportsDirectory: './coverage',
1313
},
1414
globals: true,
15-
watch: false
16-
}
15+
watch: false,
16+
},
1717
});
1818

1919
export default defaultConfig;

modules/termination-watcher/notification/main.tf

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,46 @@ resource "aws_lambda_permission" "main" {
4242
source_arn = aws_cloudwatch_event_rule.spot_instance_termination_warning.arn
4343
}
4444

45+
# EC2 Instance State-change Notification — catches ALL termination types
46+
# (scale-down, manual, spot reclamation, ASG), not just spot-specific events.
47+
# Uses "shutting-down" state to deregister runners while instance metadata is still available.
48+
# Reuses the same Lambda as the spot interruption warning handler since both event
49+
# types have detail['instance-id'] — the handler extracts it identically.
50+
resource "aws_cloudwatch_event_rule" "ec2_instance_state_change" {
51+
count = var.config._enable_runner_deregistration ? 1 : 0
52+
53+
name = "${var.config.prefix != null ? format("%s-", var.config.prefix) : ""}instance-termination"
54+
description = "EC2 Instance Termination (all causes) — deregisters runners from GitHub"
55+
tags = local.config.tags
56+
57+
event_pattern = <<EOF
58+
{
59+
"source": ["aws.ec2"],
60+
"detail-type": ["EC2 Instance State-change Notification"],
61+
"detail": {
62+
"state": ["shutting-down"]
63+
}
64+
}
65+
EOF
66+
}
67+
68+
resource "aws_cloudwatch_event_target" "state_change" {
69+
count = var.config._enable_runner_deregistration ? 1 : 0
70+
71+
rule = aws_cloudwatch_event_rule.ec2_instance_state_change[0].name
72+
arn = module.termination_warning_watcher.lambda.function.arn
73+
}
74+
75+
resource "aws_lambda_permission" "state_change" {
76+
count = var.config._enable_runner_deregistration ? 1 : 0
77+
78+
statement_id = "AllowExecutionFromCloudWatchStateChange"
79+
action = "lambda:InvokeFunction"
80+
function_name = module.termination_warning_watcher.lambda.function.function_name
81+
principal = "events.amazonaws.com"
82+
source_arn = aws_cloudwatch_event_rule.ec2_instance_state_change[0].arn
83+
}
84+
4585
resource "aws_iam_role_policy" "lambda_policy" {
4686
name = "lambda-policy"
4787
role = module.termination_warning_watcher.lambda.role.name

0 commit comments

Comments
 (0)