|
| 1 | +# SQS-based deregistration retry for runners whose GitHub API deletion returns HTTP 422 (runner is busy executing a job). |
| 2 | +# When a runner can't be deregistered immediately, the termination-watcher Lambda |
| 3 | +# sends a message to this queue with a 5-minute delay. By the time the message |
| 4 | +# becomes visible, the EC2 instance has terminated and the runner appears offline, |
| 5 | +# allowing clean GitHub API deletion. |
| 6 | + |
| 7 | +# Dead-letter queue — retry messages that exhaust their 3 receive attempts are parked here for investigation |
| 8 | +resource "aws_sqs_queue" "deregister_retry_dlq" { |
| 9 | +  count = local.enable_runner_deregistration ? 1 : 0 |
| 10 | + |
| 11 | +  name                      = "${var.config.prefix}-deregister-retry-dlq" |
| 12 | +  tags                      = var.config.tags |
| 13 | +  message_retention_seconds = 14 * 24 * 60 * 60 # 14 days, expressed in seconds |
| 14 | +} |
| 15 | + |
| 16 | +# Main retry queue — every message is held back for 5 minutes before delivery, giving EC2 time to terminate |
| 17 | +resource "aws_sqs_queue" "deregister_retry" { |
| 18 | +  count = local.enable_runner_deregistration ? 1 : 0 |
| 19 | + |
| 20 | +  name                       = "${var.config.prefix}-deregister-retry" |
| 21 | +  tags                       = var.config.tags |
| 22 | +  delay_seconds              = 5 * 60       # 5 minutes |
| 23 | +  message_retention_seconds  = 24 * 60 * 60 # 24 hours |
| 24 | +  visibility_timeout_seconds = 60           # Lambda timeout + buffer. NOTE(review): AWS suggests >= 6x the consumer function timeout — confirm |
| 25 | + |
| 26 | +  redrive_policy = jsonencode({ |
| 27 | +    maxReceiveCount     = 3 # after 3 receives SQS moves the message to the dead-letter queue |
| 28 | +    deadLetterTargetArn = aws_sqs_queue.deregister_retry_dlq[0].arn |
| 29 | +  }) |
| 30 | +} |
| 31 | + |
| 32 | +# Dedicated Lambda function that consumes the SQS retry messages. |
| 33 | +# It reuses the termination-watcher code package and only swaps the entry point: |
| 34 | +# handler index.deregisterRetry is the SQS event handler. |
| 35 | +module "deregister_retry_lambda" { |
| 36 | +  source = "../lambda" |
| 37 | +  count  = local.enable_runner_deregistration ? 1 : 0 |
| 38 | + |
| 39 | +  lambda = merge(local.config, { |
| 40 | +    handler = "index.deregisterRetry" |
| 41 | +    name    = "deregister-retry" |
| 42 | +    environment_variables = merge( |
| 43 | +      local.deregistration_env_vars, |
| 44 | +      var.config.environment_variables, |
| 45 | +      { # merged last, so these two keys take precedence over the maps above |
| 46 | +        TAG_FILTERS                = jsonencode(var.config.tag_filters) |
| 47 | +        DEREGISTER_RETRY_QUEUE_URL = aws_sqs_queue.deregister_retry[0].url |
| 48 | +      } |
| 49 | +    ) |
| 50 | +  }) |
| 51 | +} |
| 52 | + |
| 53 | +# SQS event source mapping — triggers the retry Lambda when messages arrive |
| 54 | +resource "aws_lambda_event_source_mapping" "deregister_retry" { |
| 55 | +  count = local.enable_runner_deregistration ? 1 : 0 |
| 56 | + |
| 57 | +  event_source_arn = aws_sqs_queue.deregister_retry[0].arn |
| 58 | +  function_name    = module.deregister_retry_lambda[0].lambda.function.arn |
| 59 | +  batch_size       = 1 # Process one retry at a time to avoid GitHub rate limits |
| 60 | +  enabled          = true |
| 61 | +  depends_on       = [aws_iam_role_policy.deregister_retry_sqs] # Lambda checks the role's SQS permissions when the mapping is created, so the policy must exist first |
| 62 | +} |
| 62 | + |
| 63 | +# IAM: Allow the retry Lambda to consume (receive/delete) from the retry queue and to send messages back to it |
| 64 | +resource "aws_iam_role_policy" "deregister_retry_sqs" { |
| 65 | +  count = local.enable_runner_deregistration ? 1 : 0 |
| 66 | + |
| 67 | +  name = "sqs-deregister-retry" |
| 68 | +  role = module.deregister_retry_lambda[0].lambda.role.name |
| 69 | + |
| 70 | +  policy = jsonencode({ |
| 71 | +    Version = "2012-10-17" |
| 72 | +    Statement = [ |
| 73 | +      { |
| 74 | +        Effect = "Allow" |
| 75 | +        Action = [ |
| 76 | +          "sqs:ReceiveMessage", |
| 77 | +          "sqs:DeleteMessage", |
| 78 | +          "sqs:GetQueueAttributes", |
| 79 | +          "sqs:SendMessage" |
| 80 | +        ] |
| 81 | +        Resource = [ |
| 82 | +          # Least privilege: the dead-letter queue is not listed because SQS performs the redrive itself. |
| 83 | +          aws_sqs_queue.deregister_retry[0].arn |
| 84 | +        ] |
| 85 | +      } |
| 86 | +    ] |
| 87 | +  }) |
| 88 | +} |
| 89 | + |
| 90 | +# IAM: Grant the retry Lambda read access to the SSM parameters (GitHub App credentials) |
| 91 | +resource "aws_iam_role_policy" "deregister_retry_ssm" { |
| 92 | +  count = local.enable_runner_deregistration ? 1 : 0 |
| 93 | + |
| 94 | +  role = module.deregister_retry_lambda[0].lambda.role.name |
| 95 | +  name = "ssm-deregister-retry" |
| 96 | + |
| 97 | +  policy = jsonencode({ |
| 98 | +    Version = "2012-10-17" |
| 99 | +    Statement = [ |
| 100 | +      { |
| 101 | +        Action   = ["ssm:GetParameter"] |
| 102 | +        Effect   = "Allow" |
| 103 | +        Resource = local.ssm_parameter_arns |
| 104 | +      } |
| 105 | +    ] |
| 106 | +  }) |
| 107 | +} |
| 108 | + |
| 109 | +# IAM: Attach policies/lambda.json to the retry Lambda role so it can describe EC2 instances (for tag lookups) |
| 110 | +resource "aws_iam_role_policy" "deregister_retry_ec2" { |
| 111 | +  count = local.enable_runner_deregistration ? 1 : 0 |
| 112 | + |
| 113 | +  role = module.deregister_retry_lambda[0].lambda.role.name |
| 114 | +  name = "ec2-deregister-retry" |
| 115 | + |
| 116 | +  policy = templatefile("${path.module}/policies/lambda.json", {}) # rendered with no template variables |
| 117 | +} |
| 118 | + |
| 119 | +# IAM: Let the termination-notification Lambda enqueue messages on the retry queue (only when that watcher is enabled) |
| 120 | +resource "aws_iam_role_policy" "notification_sqs_send" { |
| 121 | +  count = (local.enable_runner_deregistration && var.config.features.enable_spot_termination_notification_watcher) ? 1 : 0 |
| 122 | + |
| 123 | +  role = module.termination_notification[0].lambda.role.name |
| 124 | +  name = "sqs-deregister-retry-send" |
| 125 | + |
| 126 | +  policy = jsonencode({ |
| 127 | +    Version = "2012-10-17" |
| 128 | +    Statement = [ |
| 129 | +      { |
| 130 | +        Action   = ["sqs:SendMessage"] |
| 131 | +        Effect   = "Allow" |
| 132 | +        Resource = aws_sqs_queue.deregister_retry[0].arn |
| 133 | +      } |
| 134 | +    ] |
| 135 | +  }) |
| 136 | +} |
| 137 | + |
| 138 | +# IAM: Let the termination handler Lambda enqueue messages on the retry queue (only when that handler is enabled) |
| 139 | +resource "aws_iam_role_policy" "termination_sqs_send" { |
| 140 | +  count = (local.enable_runner_deregistration && var.config.features.enable_spot_termination_handler) ? 1 : 0 |
| 141 | + |
| 142 | +  role = module.termination_handler[0].lambda.role.name |
| 143 | +  name = "sqs-deregister-retry-send" |
| 144 | + |
| 145 | +  policy = jsonencode({ |
| 146 | +    Version = "2012-10-17" |
| 147 | +    Statement = [ |
| 148 | +      { |
| 149 | +        Action   = ["sqs:SendMessage"] |
| 150 | +        Effect   = "Allow" |
| 151 | +        Resource = aws_sqs_queue.deregister_retry[0].arn |
| 152 | +      } |
| 153 | +    ] |
| 154 | +  }) |
| 155 | +} |
0 commit comments