Skip to content

Commit dba3dab

Browse files
committed
fix: address Copilot review comments on runner count cache PR
- Fix double-counting issue: only count 'running' state, ignore 'pending' - Add warning log when DynamoDB returns negative counts - Add EC2 permission comment explaining why resource='*' is required - Fix 'Time out' typo to 'Timeout' in variable descriptions - Fix README AWS provider version mismatch (~>5.27 -> >=6.21) - Add race condition documentation between cache layers - Add debug log when stale DynamoDB cache is used - Fix Terraform formatting Signed-off-by: s1v4-d <161426787+s1v4-d@users.noreply.github.com>
1 parent eb7f6f8 commit dba3dab

9 files changed

Lines changed: 41 additions & 16 deletions

File tree

lambdas/functions/control-plane/src/scale-runners/cache.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,16 @@ export class dynamoDbRunnerCountCache {
228228

229229
logger.debug('DynamoDB cache hit', { pk, count, isStale, ageMs: Date.now() - updated });
230230

231+
// Normalize negative counts to zero. This can happen due to race conditions with
232+
// EventBridge events (e.g., termination event arrives before running event).
233+
if (count < 0) {
234+
logger.warn('DynamoDB cache returned negative count, normalizing to 0', {
235+
pk,
236+
rawCount: count,
237+
updated,
238+
});
239+
}
240+
231241
return { count: Math.max(0, count), isStale };
232242
} catch (error) {
233243
logger.warn('Failed to read from DynamoDB cache', { pk, error });

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,11 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
394394
group,
395395
cacheSource,
396396
});
397+
} else if (dynamoResult !== null && dynamoResult.isStale) {
398+
logger.debug('DynamoDB cache entry is stale, falling back to other caches', {
399+
staleCount: dynamoResult.count,
400+
group,
401+
});
397402
}
398403
}
399404

@@ -484,7 +489,11 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
484489
githubInstallationClient,
485490
);
486491

487-
// Update the cache with the new runner count to avoid stale data in subsequent iterations
492+
// Update the cache with the new runner count to avoid stale data in subsequent iterations.
493+
// NOTE: There's an inherent race condition between this in-memory cache increment and
494+
// the DynamoDB cache updates from EventBridge. The in-memory cache provides immediate
495+
// visibility within this Lambda invocation, while DynamoDB provides cross-invocation
496+
// consistency. The stale threshold ensures we eventually fall back to EC2 API for accuracy.
488497
if (instances.length > 0) {
489498
ec2RunnerCountCache.increment(environment, runnerType, group, instances.length);
490499
}

lambdas/functions/runner-count-cache/src/lambda.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,16 +153,19 @@ export async function handler(
153153
// Generate partition key
154154
const pk = `${tags.environment}#${tags.type}#${tags.owner}`;
155155

156-
// Determine increment based on state
156+
// Determine increment based on state.
157+
// IMPORTANT: We only count 'running' state as +1 to avoid double-counting when instances
158+
// transition from pending -> running. The 'pending' state is ignored because all instances
159+
// that reach 'running' must first pass through 'pending', which would cause double-counting.
157160
let increment = 0;
158-
if (state === 'running' || state === 'pending') {
161+
if (state === 'running') {
159162
increment = 1;
160163
} else if (state === 'terminated' || state === 'stopped' || state === 'shutting-down') {
161164
increment = -1;
162165
}
163166

164167
if (increment === 0) {
165-
logger.debug('State does not affect counter', { state });
168+
logger.debug('State does not affect counter (pending or other transitional states are ignored)', { state });
166169
return;
167170
}
168171

modules/runner-count-cache/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ The scale-up Lambda can be configured to use this cache by setting these environ
7575
| Name | Version |
7676
|------|---------|
7777
| terraform | >= 1.3.0 |
78-
| aws | ~> 5.27 |
78+
| aws | >= 6.21 |
7979

8080
## Inputs
8181

modules/runner-count-cache/lambda.tf

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@ resource "aws_lambda_function" "counter" {
2626

2727
environment {
2828
variables = {
29-
DYNAMODB_TABLE_NAME = aws_dynamodb_table.runner_counts.name
30-
ENVIRONMENT_FILTER = var.environment_filter
31-
TTL_SECONDS = var.ttl_seconds
32-
LOG_LEVEL = "info"
29+
DYNAMODB_TABLE_NAME = aws_dynamodb_table.runner_counts.name
30+
ENVIRONMENT_FILTER = var.environment_filter
31+
TTL_SECONDS = var.ttl_seconds
32+
LOG_LEVEL = "info"
3333
POWERTOOLS_SERVICE_NAME = "runner-count-cache"
3434
}
3535
}
@@ -110,6 +110,9 @@ data "aws_iam_policy_document" "counter_ec2" {
110110
"ec2:DescribeInstances",
111111
"ec2:DescribeTags",
112112
]
113+
# EC2 Describe* actions require resource = "*" - they cannot be scoped to specific
114+
# instance ARNs. See: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-policy-structure.html
115+
# The Lambda filters instances by tags after fetching to ensure only runner instances are counted.
113116
resources = ["*"]
114117
}
115118
}

modules/runner-count-cache/outputs.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ output "lambda_role" {
3333
output "cache_config" {
3434
description = "Configuration for scale-up Lambda to use the cache"
3535
value = {
36-
table_name = aws_dynamodb_table.runner_counts.name
37-
stale_threshold_ms = var.cache_stale_threshold_ms
36+
table_name = aws_dynamodb_table.runner_counts.name
37+
stale_threshold_ms = var.cache_stale_threshold_ms
3838
}
3939
}

modules/runner-count-cache/variables.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ variable "environment_filter" {
2121
}
2222

2323
variable "counter_lambda_timeout" {
24-
description = "Time out for the counter update lambda in seconds."
24+
description = "Timeout for the counter update lambda in seconds."
2525
type = number
2626
default = 30
2727
}

modules/runners/scale-up.tf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -182,9 +182,9 @@ resource "aws_iam_role_policy" "scale_up_runner_count_cache" {
182182
Version = "2012-10-17"
183183
Statement = [
184184
{
185-
Sid = "AllowDynamoDBRead"
186-
Effect = "Allow"
187-
Action = [
185+
Sid = "AllowDynamoDBRead"
186+
Effect = "Allow"
187+
Action = [
188188
"dynamodb:GetItem"
189189
]
190190
Resource = "arn:${var.aws_partition}:dynamodb:${var.aws_region}:${data.aws_caller_identity.current.account_id}:table/${var.runner_count_cache.table_name}"

variables.runner-count-cache.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ variable "runner_count_cache" {
1010
`stale_threshold_ms`: How long (in milliseconds) before a cached count is considered stale and falls back to EC2 API. Default 60000 (1 minute).
1111
`ttl_seconds`: TTL for DynamoDB items in seconds. Default 86400 (24 hours).
1212
`lambda_memory_size`: Memory size limit in MB of the counter lambda.
13-
`lambda_timeout`: Time out of the counter lambda in seconds.
13+
`lambda_timeout`: Timeout of the counter lambda in seconds.
1414
`lambda_s3_key`: S3 key for lambda function. Required if using S3 bucket to specify lambdas.
1515
`lambda_s3_object_version`: S3 object version for lambda function.
1616
EOF

0 commit comments

Comments
 (0)