Skip to content

Commit 60fed70

Browse files
SRE EngineerPaperclip-Paperclip
andcommitted
Add SQS retry infrastructure for runner deregistration (C-1841)
Add Terraform resources to support the SQS-based deregistration retry that was added in ed30bf8. When GitHub returns 422 (runner busy), the termination-watcher Lambda now has infrastructure to queue a delayed retry:

- SQS queue with 5-minute delivery delay for retry messages
- Dead-letter queue (14-day retention, 3 max receives) for failures
- Dedicated Lambda function (index.deregisterRetry handler)
- SQS event source mapping to trigger the retry Lambda
- IAM policies: SQS send/receive, SSM read, EC2 describe
- IAM policies on notification/termination Lambdas for SQS:SendMessage
- Pass DEREGISTER_RETRY_QUEUE_URL env var to all termination Lambdas
- Rebuild termination-watcher.zip with latest code

Co-Authored-By: Paperclip <noreply@paperclip.ing>
1 parent ed30bf8 commit 60fed70

File tree

4 files changed

+171
-2
lines changed

4 files changed

+171
-2
lines changed
83.4 KB
Binary file not shown.
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# SQS-based deregistration retry for runners that return 422 (busy executing a job).
2+
# When a runner can't be deregistered immediately, the termination-watcher Lambda
3+
# sends a message to this queue with a 5-minute delay. By the time the message
4+
# becomes visible, the EC2 instance has terminated and the runner appears offline,
5+
# allowing clean GitHub API deletion.
6+
7+
# Dead-letter queue for the deregistration retry queue. The retry queue's
# redrive_policy (maxReceiveCount = 3) moves a message here once it has been
# received three times without being deleted, so it can be investigated.
8+
resource "aws_sqs_queue" "deregister_retry_dlq" {
9+
# Created only when runner deregistration is enabled.
count = local.enable_runner_deregistration ? 1 : 0
10+
11+
name = "${var.config.prefix}-deregister-retry-dlq"
12+
message_retention_seconds = 1209600 # 14 days, expressed in seconds
13+
tags = var.config.tags
14+
}
15+
16+
# Main retry queue. Every message sent here is held back for 5 minutes
# (delay_seconds) before it becomes visible, which gives the EC2 instance time
# to terminate before the retry Lambda attempts deregistration.
17+
resource "aws_sqs_queue" "deregister_retry" {
18+
# Created only when runner deregistration is enabled.
count = local.enable_runner_deregistration ? 1 : 0
19+
20+
name = "${var.config.prefix}-deregister-retry"
21+
# NOTE(review): SQS applies delay_seconds when a message is sent. A message
# that fails processing reappears after the visibility timeout below, not
# after another 5 minutes. Confirm that retry cadence is intended.
delay_seconds = 300 # 5 minutes
22+
message_retention_seconds = 86400 # 24 hours
23+
# NOTE(review): the retry Lambda's timeout is configured outside this file.
# AWS recommends a visibility timeout of at least six times the function
# timeout for SQS event sources. Confirm that 60 s satisfies that.
visibility_timeout_seconds = 60 # Lambda timeout + buffer
24+
tags = var.config.tags
25+
26+
# After 3 receives without a successful delete, hand the message to the DLQ.
redrive_policy = jsonencode({
27+
deadLetterTargetArn = aws_sqs_queue.deregister_retry_dlq[0].arn
28+
maxReceiveCount = 3
29+
})
30+
}
31+
32+
# Dedicated Lambda function for processing SQS retry messages.
33+
# Uses the same code package as the termination-watcher but with
34+
# handler index.deregisterRetry (SQS event handler).
35+
module "deregister_retry_lambda" {
36+
# Created only when runner deregistration is enabled.
count = local.enable_runner_deregistration ? 1 : 0
37+
source = "../lambda"
38+
39+
# Start from the shared local.config and override name, handler and
# environment for this function (later merge() arguments win).
lambda = merge(local.config, {
40+
name = "deregister-retry"
41+
handler = "index.deregisterRetry"
42+
# Precedence, lowest to highest: deregistration defaults, caller-supplied
# environment variables, then the two values pinned in the literal below.
environment_variables = merge(
43+
local.deregistration_env_vars,
44+
var.config.environment_variables,
45+
{
46+
# Pinned last so caller-supplied variables cannot override the queue URL.
DEREGISTER_RETRY_QUEUE_URL = aws_sqs_queue.deregister_retry[0].url
47+
# Tag filters are passed to the function as a JSON-encoded string.
TAG_FILTERS = jsonencode(var.config.tag_filters)
48+
}
49+
)
50+
})
51+
}
52+
53+
# SQS event source mapping: invokes the retry Lambda with messages from the
# retry queue once they become visible.
54+
resource "aws_lambda_event_source_mapping" "deregister_retry" {
55+
# Created only when runner deregistration is enabled.
count = local.enable_runner_deregistration ? 1 : 0
56+
57+
event_source_arn = aws_sqs_queue.deregister_retry[0].arn
58+
function_name = module.deregister_retry_lambda[0].lambda.function.arn
59+
# NOTE(review): batch_size caps messages per invocation, not the number of
# concurrent invocations. If limiting GitHub API calls is the goal, confirm
# whether a concurrency limit is also needed.
batch_size = 1 # Process one retry at a time to avoid GitHub rate limits
60+
enabled = true
61+
}
62+
63+
# IAM: Allow the retry Lambda to receive, delete, inspect and send messages on
# both the retry queue and its dead-letter queue.
64+
resource "aws_iam_role_policy" "deregister_retry_sqs" {
65+
# Created only when runner deregistration is enabled.
count = local.enable_runner_deregistration ? 1 : 0
66+
67+
name = "sqs-deregister-retry"
68+
role = module.deregister_retry_lambda[0].lambda.role.name
69+
70+
policy = jsonencode({
71+
Version = "2012-10-17"
72+
Statement = [
73+
{
74+
Effect = "Allow"
75+
# Receive/Delete/GetQueueAttributes are what an SQS event source mapping
# uses to poll the queue. SendMessage is presumably for re-queueing a
# retry. Verify against the handler code.
Action = [
76+
"sqs:ReceiveMessage",
77+
"sqs:DeleteMessage",
78+
"sqs:GetQueueAttributes",
79+
"sqs:SendMessage"
80+
]
81+
# NOTE(review): the DLQ is included for every action above. Messages reach
# the DLQ through the queue's redrive_policy. Confirm whether the Lambda
# itself needs DLQ access, and narrow this list if it does not.
Resource = [
82+
aws_sqs_queue.deregister_retry[0].arn,
83+
aws_sqs_queue.deregister_retry_dlq[0].arn
84+
]
85+
}
86+
]
87+
})
88+
}
89+
90+
# IAM: Allow the retry Lambda to read SSM parameters (GitHub App credentials)
91+
resource "aws_iam_role_policy" "deregister_retry_ssm" {
92+
# Created only when runner deregistration is enabled.
count = local.enable_runner_deregistration ? 1 : 0
93+
94+
name = "ssm-deregister-retry"
95+
role = module.deregister_retry_lambda[0].lambda.role.name
96+
97+
policy = jsonencode({
98+
Version = "2012-10-17"
99+
Statement = [
100+
{
101+
Effect = "Allow"
102+
Action = ["ssm:GetParameter"]
103+
# Read access is limited to the parameter ARNs collected in
# local.ssm_parameter_arns.
Resource = local.ssm_parameter_arns
104+
}
105+
]
106+
})
107+
}
108+
109+
# IAM: Attach the module's policies/lambda.json policy to the retry Lambda role.
# NOTE(review): that file is expected to grant EC2 describe access for tag
# lookups. Its contents are not shown here, so confirm.
110+
resource "aws_iam_role_policy" "deregister_retry_ec2" {
111+
# Created only when runner deregistration is enabled.
count = local.enable_runner_deregistration ? 1 : 0
112+
113+
name = "ec2-deregister-retry"
114+
role = module.deregister_retry_lambda[0].lambda.role.name
115+
116+
# Rendered with an empty variable map, so the JSON file is used as-is.
policy = templatefile("${path.module}/policies/lambda.json", {})
117+
}
118+
119+
# IAM: Allow the notification Lambda to send messages to the retry queue
120+
resource "aws_iam_role_policy" "notification_sqs_send" {
121+
# Needs both runner deregistration and the spot termination notification
# watcher to be enabled; otherwise the role referenced below does not exist.
count = local.enable_runner_deregistration && var.config.features.enable_spot_termination_notification_watcher ? 1 : 0
122+
123+
name = "sqs-deregister-retry-send"
124+
role = module.termination_notification[0].lambda.role.name
125+
126+
policy = jsonencode({
127+
Version = "2012-10-17"
128+
Statement = [
129+
{
130+
Effect = "Allow"
131+
# Send-only, and only to the main retry queue (not the DLQ).
Action = ["sqs:SendMessage"]
132+
Resource = aws_sqs_queue.deregister_retry[0].arn
133+
}
134+
]
135+
})
136+
}
137+
138+
# IAM: Allow the termination handler Lambda to send messages to the retry queue
139+
resource "aws_iam_role_policy" "termination_sqs_send" {
140+
# Needs both runner deregistration and the spot termination handler to be
# enabled; otherwise the role referenced below does not exist.
count = local.enable_runner_deregistration && var.config.features.enable_spot_termination_handler ? 1 : 0
141+
142+
name = "sqs-deregister-retry-send"
143+
role = module.termination_handler[0].lambda.role.name
144+
145+
policy = jsonencode({
146+
Version = "2012-10-17"
147+
Statement = [
148+
{
149+
Effect = "Allow"
150+
# Send-only, and only to the main retry queue (not the DLQ).
Action = ["sqs:SendMessage"]
151+
Resource = aws_sqs_queue.deregister_retry[0].arn
152+
}
153+
]
154+
})
155+
}

modules/termination-watcher/main.tf

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@ locals {
44

55
enable_runner_deregistration = var.config.enable_runner_deregistration && var.config.github_app_parameters != null
66

7-
deregistration_env_vars = local.enable_runner_deregistration ? {
7+
deregistration_env_vars = local.enable_runner_deregistration ? merge({
88
ENABLE_RUNNER_DEREGISTRATION = "true"
99
PARAMETER_GITHUB_APP_ID_NAME = var.config.github_app_parameters.id.name
1010
PARAMETER_GITHUB_APP_KEY_BASE64_NAME = var.config.github_app_parameters.key_base64.name
1111
GHES_URL = var.config.ghes_url != null ? var.config.ghes_url : ""
12-
} : {}
12+
}, length(aws_sqs_queue.deregister_retry) > 0 ? {
13+
DEREGISTER_RETRY_QUEUE_URL = aws_sqs_queue.deregister_retry[0].url
14+
} : {}) : {}
1315

1416
ssm_parameter_arns = local.enable_runner_deregistration ? [
1517
var.config.github_app_parameters.id.arn,

modules/termination-watcher/outputs.tf

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,15 @@ output "spot_termination_handler" {
1313
lambda_role = module.termination_handler[0].lambda.role
1414
} : null
1515
}
16+
17+
# Exposes the deregistration retry resources: URL and ARN of the retry queue
# and its dead-letter queue, plus the retry Lambda's function, log group and
# role. Evaluates to null when runner deregistration is disabled.
output "deregister_retry" {
18+
value = local.enable_runner_deregistration ? {
19+
queue_url = aws_sqs_queue.deregister_retry[0].url
20+
queue_arn = aws_sqs_queue.deregister_retry[0].arn
21+
dlq_url = aws_sqs_queue.deregister_retry_dlq[0].url
22+
dlq_arn = aws_sqs_queue.deregister_retry_dlq[0].arn
23+
lambda = module.deregister_retry_lambda[0].lambda.function
24+
lambda_log_group = module.deregister_retry_lambda[0].lambda.log_group
25+
lambda_role = module.deregister_retry_lambda[0].lambda.role
26+
} : null
27+
}

0 commit comments

Comments
 (0)