Skip to content

Commit db6a268

Browse files
jensenbox and claude committed
C-241 Add EventBridge rule for all EC2 terminations
The existing spot-specific rules (BidEvictedEvent, Spot Interruption Warning) only fire on AWS spot reclamations. Scale-down terminations and manual terminations — the most common causes of stale runners — were not covered. Add an EC2 Instance State-change Notification rule (state: shutting-down) that catches ALL termination types. Reuses the same notification Lambda since both event types have detail['instance-id']. Gated behind enable_runner_deregistration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 56127d9 commit db6a268

File tree

9 files changed

+101
-69
lines changed

9 files changed

+101
-69
lines changed

lambdas/functions/termination-watcher/src/deregister.ts

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,7 @@ export function createThrottleOptions() {
2525

2626
async function getAppCredentials(): Promise<{ appId: number; privateKey: string }> {
2727
const appId = parseInt(await getParameter(process.env.PARAMETER_GITHUB_APP_ID_NAME!));
28-
const privateKey = Buffer.from(
29-
await getParameter(process.env.PARAMETER_GITHUB_APP_KEY_BASE64_NAME!),
30-
'base64',
31-
)
28+
const privateKey = Buffer.from(await getParameter(process.env.PARAMETER_GITHUB_APP_KEY_BASE64_NAME!), 'base64')
3229
.toString()
3330
.replace('/[\\n]/g', String.fromCharCode(10));
3431
return { appId, privateKey };
@@ -142,12 +139,7 @@ async function findRunnerByInstanceId(
142139
return undefined;
143140
}
144141

145-
async function deleteRunner(
146-
octokit: Octokit,
147-
owner: string,
148-
runnerId: number,
149-
runnerType: string,
150-
): Promise<void> {
142+
async function deleteRunner(octokit: Octokit, owner: string, runnerId: number, runnerType: string): Promise<void> {
151143
if (runnerType === 'Repo') {
152144
const [repoOwner, repo] = owner.split('/');
153145
await octokit.actions.deleteSelfHostedRunnerFromRepo({

lambdas/vitest.base.config.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ const defaultConfig = defineConfig({
99
include: ['**/src/**/*.ts'],
1010
exclude: ['**/*local*.ts', '**/*.d.ts', '**/*.test.ts', '**/node_modules/**'],
1111
all: true,
12-
reportsDirectory: './coverage'
12+
reportsDirectory: './coverage',
1313
},
1414
globals: true,
15-
watch: false
16-
}
15+
watch: false,
16+
},
1717
});
1818

1919
export default defaultConfig;

main.tf

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -363,26 +363,26 @@ module "ami_housekeeper" {
363363

364364
locals {
365365
lambda_instance_termination_watcher = {
366-
prefix = var.prefix
367-
tags = local.tags
368-
aws_partition = var.aws_partition
369-
architecture = var.lambda_architecture
370-
principals = var.lambda_principals
371-
runtime = var.lambda_runtime
372-
security_group_ids = var.lambda_security_group_ids
373-
subnet_ids = var.lambda_subnet_ids
374-
lambda_tags = var.lambda_tags
375-
log_level = var.log_level
376-
log_class = var.log_class
377-
logging_kms_key_id = var.logging_kms_key_id
378-
logging_retention_in_days = var.logging_retention_in_days
379-
role_path = var.role_path
380-
role_permissions_boundary = var.role_permissions_boundary
381-
s3_bucket = var.lambda_s3_bucket
382-
tracing_config = var.tracing_config
383-
metrics = var.metrics
366+
prefix = var.prefix
367+
tags = local.tags
368+
aws_partition = var.aws_partition
369+
architecture = var.lambda_architecture
370+
principals = var.lambda_principals
371+
runtime = var.lambda_runtime
372+
security_group_ids = var.lambda_security_group_ids
373+
subnet_ids = var.lambda_subnet_ids
374+
lambda_tags = var.lambda_tags
375+
log_level = var.log_level
376+
log_class = var.log_class
377+
logging_kms_key_id = var.logging_kms_key_id
378+
logging_retention_in_days = var.logging_retention_in_days
379+
role_path = var.role_path
380+
role_permissions_boundary = var.role_permissions_boundary
381+
s3_bucket = var.lambda_s3_bucket
382+
tracing_config = var.tracing_config
383+
metrics = var.metrics
384384
enable_runner_deregistration = var.instance_termination_watcher.enable_runner_deregistration
385-
github_app_parameters = var.instance_termination_watcher.enable_runner_deregistration ? {
385+
github_app_parameters = var.instance_termination_watcher.enable_runner_deregistration ? {
386386
id = local.github_app_parameters.id
387387
key_base64 = local.github_app_parameters.key_base64
388388
} : null

modules/multi-runner/termination-watcher.tf

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,25 @@
11
locals {
22
lambda_instance_termination_watcher = {
3-
prefix = var.prefix
4-
tags = local.tags
5-
aws_partition = var.aws_partition
6-
architecture = var.lambda_architecture
7-
principals = var.lambda_principals
8-
runtime = var.lambda_runtime
9-
security_group_ids = var.lambda_security_group_ids
10-
subnet_ids = var.lambda_subnet_ids
11-
log_level = var.log_level
12-
log_class = var.log_class
13-
logging_kms_key_id = var.logging_kms_key_id
14-
logging_retention_in_days = var.logging_retention_in_days
15-
role_path = var.role_path
16-
role_permissions_boundary = var.role_permissions_boundary
17-
s3_bucket = var.lambda_s3_bucket
18-
tracing_config = var.tracing_config
19-
lambda_tags = var.lambda_tags
20-
metrics = var.metrics
3+
prefix = var.prefix
4+
tags = local.tags
5+
aws_partition = var.aws_partition
6+
architecture = var.lambda_architecture
7+
principals = var.lambda_principals
8+
runtime = var.lambda_runtime
9+
security_group_ids = var.lambda_security_group_ids
10+
subnet_ids = var.lambda_subnet_ids
11+
log_level = var.log_level
12+
log_class = var.log_class
13+
logging_kms_key_id = var.logging_kms_key_id
14+
logging_retention_in_days = var.logging_retention_in_days
15+
role_path = var.role_path
16+
role_permissions_boundary = var.role_permissions_boundary
17+
s3_bucket = var.lambda_s3_bucket
18+
tracing_config = var.tracing_config
19+
lambda_tags = var.lambda_tags
20+
metrics = var.metrics
2121
enable_runner_deregistration = var.instance_termination_watcher.enable_runner_deregistration
22-
github_app_parameters = var.instance_termination_watcher.enable_runner_deregistration ? {
22+
github_app_parameters = var.instance_termination_watcher.enable_runner_deregistration ? {
2323
id = local.github_app_parameters.id
2424
key_base64 = local.github_app_parameters.key_base64
2525
} : null

modules/multi-runner/variables.tf

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -699,11 +699,11 @@ variable "instance_termination_watcher" {
699699
enable_spot_termination_notification_watcher = optional(bool, true)
700700
}), {})
701701
enable_runner_deregistration = optional(bool, true)
702-
memory_size = optional(number, null)
703-
s3_key = optional(string, null)
704-
s3_object_version = optional(string, null)
705-
timeout = optional(number, null)
706-
zip = optional(string, null)
702+
memory_size = optional(number, null)
703+
s3_key = optional(string, null)
704+
s3_object_version = optional(string, null)
705+
timeout = optional(number, null)
706+
zip = optional(string, null)
707707
})
708708
default = {}
709709
}

modules/termination-watcher/main.tf

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@ locals {
2222
}
2323

2424
config = merge(var.config, {
25-
name = local.name,
26-
handler = "index.interruptionWarning",
27-
zip = local.lambda_zip,
28-
environment_variables = local.environment_variables
29-
metrics_namespace = var.config.metrics.namespace
30-
_deregistration_env_vars = local.deregistration_env_vars
31-
_ssm_parameter_arns = local.ssm_parameter_arns
25+
name = local.name,
26+
handler = "index.interruptionWarning",
27+
zip = local.lambda_zip,
28+
environment_variables = local.environment_variables
29+
metrics_namespace = var.config.metrics.namespace
30+
_deregistration_env_vars = local.deregistration_env_vars
31+
_ssm_parameter_arns = local.ssm_parameter_arns
3232
_enable_runner_deregistration = local.enable_runner_deregistration
3333
})
3434
}

modules/termination-watcher/notification/main.tf

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,46 @@ resource "aws_lambda_permission" "main" {
4242
source_arn = aws_cloudwatch_event_rule.spot_instance_termination_warning.arn
4343
}
4444

45+
# EC2 Instance State-change Notification — catches ALL termination types
46+
# (scale-down, manual, spot reclamation, ASG) not just spot-specific events.
47+
# Uses "shutting-down" state to deregister runners while instance metadata is still available.
48+
# Reuses the same Lambda as the spot interruption warning handler since both event
49+
# types have detail['instance-id'] — the handler extracts it identically.
50+
resource "aws_cloudwatch_event_rule" "ec2_instance_state_change" {
51+
count = var.config._enable_runner_deregistration ? 1 : 0
52+
53+
name = "${var.config.prefix != null ? format("%s-", var.config.prefix) : ""}instance-termination"
54+
description = "EC2 Instance Termination (all causes) — deregisters runners from GitHub"
55+
tags = local.config.tags
56+
57+
event_pattern = <<EOF
58+
{
59+
"source": ["aws.ec2"],
60+
"detail-type": ["EC2 Instance State-change Notification"],
61+
"detail": {
62+
"state": ["shutting-down"]
63+
}
64+
}
65+
EOF
66+
}
67+
68+
resource "aws_cloudwatch_event_target" "state_change" {
69+
count = var.config._enable_runner_deregistration ? 1 : 0
70+
71+
rule = aws_cloudwatch_event_rule.ec2_instance_state_change[0].name
72+
arn = module.termination_warning_watcher.lambda.function.arn
73+
}
74+
75+
resource "aws_lambda_permission" "state_change" {
76+
count = var.config._enable_runner_deregistration ? 1 : 0
77+
78+
statement_id = "AllowExecutionFromCloudWatchStateChange"
79+
action = "lambda:InvokeFunction"
80+
function_name = module.termination_warning_watcher.lambda.function.function_name
81+
principal = "events.amazonaws.com"
82+
source_arn = aws_cloudwatch_event_rule.ec2_instance_state_change[0].arn
83+
}
84+
4585
resource "aws_iam_role_policy" "lambda_policy" {
4686
name = "lambda-policy"
4787
role = module.termination_warning_watcher.lambda.role.name

modules/termination-watcher/variables.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ variable "config" {
7171
capture_http_requests = optional(bool, false)
7272
capture_error = optional(bool, false)
7373
}), {})
74-
zip = optional(string, null)
74+
zip = optional(string, null)
7575
enable_runner_deregistration = optional(bool, false)
7676
github_app_parameters = optional(object({
7777
id = map(string)

variables.tf

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -972,11 +972,11 @@ variable "instance_termination_watcher" {
972972
enable_spot_termination_notification_watcher = optional(bool, true)
973973
}), {})
974974
enable_runner_deregistration = optional(bool, true)
975-
memory_size = optional(number, null)
976-
s3_key = optional(string, null)
977-
s3_object_version = optional(string, null)
978-
timeout = optional(number, null)
979-
zip = optional(string, null)
975+
memory_size = optional(number, null)
976+
s3_key = optional(string, null)
977+
s3_object_version = optional(string, null)
978+
timeout = optional(number, null)
979+
zip = optional(string, null)
980980
})
981981
default = {}
982982

0 commit comments

Comments
 (0)