Skip to content

Commit eb7f6f8

Browse files
committed
feat(scale-up): add runner count caching to reduce EC2 API rate limiting (#4710)
Implements a multi-tier caching strategy to address EC2 DescribeInstances API rate limiting in high-volume environments (20K+ runners/day):
- In-memory TTL cache (5s) for within-invocation deduplication
- DynamoDB-based persistent cache, updated from EC2 state change events via EventBridge, for cross-invocation consistency
- Atomic counters for accurate runner count tracking
- Feature is opt-in via `runner_count_cache = { enable = true }`

This can reduce EC2 API calls by 90%+ and eliminate 15+ second latency spikes caused by DescribeInstances throttling.

Closes #4710

Signed-off-by: s1v4-d <161426787+s1v4-d@users.noreply.github.com>
1 parent 1262a67 commit eb7f6f8

20 files changed

Lines changed: 2267 additions & 2 deletions

File tree

lambdas/functions/control-plane/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
"@aws-github-runner/aws-powertools-util": "*",
3434
"@aws-github-runner/aws-ssm-util": "*",
3535
"@aws-lambda-powertools/parameters": "^2.29.0",
36+
"@aws-sdk/client-dynamodb": "^3.948.0",
3637
"@aws-sdk/client-ec2": "^3.948.0",
3738
"@aws-sdk/client-sqs": "^3.948.0",
3839
"@middy/core": "^6.4.5",
Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
import { describe, it, expect, beforeEach, vi, afterEach } from 'vitest';
2+
import { ec2RunnerCountCache, dynamoDbRunnerCountCache } from './cache';
3+
import { DynamoDBClient, GetItemCommand } from '@aws-sdk/client-dynamodb';
4+
import { mockClient } from 'aws-sdk-client-mock';
5+
6+
// Shared aws-sdk-client-mock stub for DynamoDBClient; the dynamoDbRunnerCountCache
// tests in this file program its GetItemCommand responses and reset it before each test.
const mockDynamoDBClient = mockClient(DynamoDBClient);
7+
8+
// Unit tests for the in-memory ec2RunnerCountCache: get / set / increment / reset / size.
// Fake timers are installed so TTL expiry can be driven with vi.advanceTimersByTime.
// NOTE(review): the three string arguments look like (environment, runner type, owner);
// confirm against the cache implementation.
describe('ec2RunnerCountCache', () => {
9+
// Every test starts from an empty cache and a controllable clock.
beforeEach(() => {
10+
ec2RunnerCountCache.reset();
11+
vi.useFakeTimers();
12+
});
13+
14+
// Restore the real clock so fake timers do not leak into other suites.
afterEach(() => {
15+
vi.useRealTimers();
16+
});
17+
18+
// get(): a miss yields undefined, a hit yields the stored count until the TTL elapses.
describe('get', () => {
19+
it('should return undefined when cache is empty', () => {
20+
const result = ec2RunnerCountCache.get('prod', 'Org', 'my-org');
21+
expect(result).toBeUndefined();
22+
});
23+
24+
it('should return cached value when within TTL', () => {
25+
ec2RunnerCountCache.set('prod', 'Org', 'my-org', 10);
26+
27+
// Advance time by 3 seconds (within default 5s TTL)
28+
vi.advanceTimersByTime(3000);
29+
30+
const result = ec2RunnerCountCache.get('prod', 'Org', 'my-org');
31+
expect(result).toBe(10);
32+
});
33+
34+
it('should return undefined when cache entry is expired', () => {
35+
ec2RunnerCountCache.set('prod', 'Org', 'my-org', 10);
36+
37+
// Advance time by 6 seconds (past default 5s TTL)
38+
vi.advanceTimersByTime(6000);
39+
40+
const result = ec2RunnerCountCache.get('prod', 'Org', 'my-org');
41+
expect(result).toBeUndefined();
42+
});
43+
44+
// The optional 4th argument to get() appears to be a per-call TTL override in ms.
it('should respect custom TTL', () => {
45+
ec2RunnerCountCache.set('prod', 'Org', 'my-org', 10);
46+
47+
// Advance time by 8 seconds
48+
vi.advanceTimersByTime(8000);
49+
50+
// Phase 1: the 8s-old entry is expired when read with an explicit 5s TTL
51+
const expiredResult = ec2RunnerCountCache.get('prod', 'Org', 'my-org', 5000);
52+
expect(expiredResult).toBeUndefined();
53+
54+
// Phase 2: re-seed, age the entry by another 8s, then read with a 10s TTL; it must still be valid
ec2RunnerCountCache.set('prod', 'Org', 'my-org', 15);
55+
vi.advanceTimersByTime(8000);
56+
57+
const validResult = ec2RunnerCountCache.get('prod', 'Org', 'my-org', 10000);
58+
expect(validResult).toBe(15);
59+
});
60+
61+
// Entries that differ in owner or in runner type must not collide.
it('should return different values for different keys', () => {
62+
ec2RunnerCountCache.set('prod', 'Org', 'org-a', 10);
63+
ec2RunnerCountCache.set('prod', 'Org', 'org-b', 20);
64+
ec2RunnerCountCache.set('prod', 'Repo', 'owner/repo', 5);
65+
66+
expect(ec2RunnerCountCache.get('prod', 'Org', 'org-a')).toBe(10);
67+
expect(ec2RunnerCountCache.get('prod', 'Org', 'org-b')).toBe(20);
68+
expect(ec2RunnerCountCache.get('prod', 'Repo', 'owner/repo')).toBe(5);
69+
});
70+
});
71+
72+
// set(): stores a count and replaces any previous count for the same key.
describe('set', () => {
73+
it('should store value in cache', () => {
74+
ec2RunnerCountCache.set('prod', 'Org', 'my-org', 10);
75+
expect(ec2RunnerCountCache.get('prod', 'Org', 'my-org')).toBe(10);
76+
});
77+
78+
it('should overwrite existing value', () => {
79+
ec2RunnerCountCache.set('prod', 'Org', 'my-org', 10);
80+
ec2RunnerCountCache.set('prod', 'Org', 'my-org', 20);
81+
expect(ec2RunnerCountCache.get('prod', 'Org', 'my-org')).toBe(20);
82+
});
83+
});
84+
85+
// increment(): adjusts an existing entry by a signed delta and refreshes its TTL.
describe('increment', () => {
86+
it('should increment existing cached value', () => {
87+
ec2RunnerCountCache.set('prod', 'Org', 'my-org', 10);
88+
ec2RunnerCountCache.increment('prod', 'Org', 'my-org', 5);
89+
expect(ec2RunnerCountCache.get('prod', 'Org', 'my-org')).toBe(15);
90+
});
91+
92+
it('should handle negative increments (decrement)', () => {
93+
ec2RunnerCountCache.set('prod', 'Org', 'my-org', 10);
94+
ec2RunnerCountCache.increment('prod', 'Org', 'my-org', -3);
95+
expect(ec2RunnerCountCache.get('prod', 'Org', 'my-org')).toBe(7);
96+
});
97+
98+
// Incrementing a missing key must not create an entry.
it('should do nothing if cache entry does not exist', () => {
99+
ec2RunnerCountCache.increment('prod', 'Org', 'my-org', 5);
100+
expect(ec2RunnerCountCache.get('prod', 'Org', 'my-org')).toBeUndefined();
101+
});
102+
103+
it('should reset TTL on increment', () => {
104+
ec2RunnerCountCache.set('prod', 'Org', 'my-org', 10);
105+
106+
// Advance time by 4 seconds
107+
vi.advanceTimersByTime(4000);
108+
109+
// Increment, which should reset the TTL
110+
ec2RunnerCountCache.increment('prod', 'Org', 'my-org', 1);
111+
112+
// Advance another 4 seconds (total 8 seconds from original set, but only 4 from increment)
113+
vi.advanceTimersByTime(4000);
114+
115+
// Should still be valid because TTL was reset
116+
expect(ec2RunnerCountCache.get('prod', 'Org', 'my-org')).toBe(11);
117+
});
118+
});
119+
120+
// reset(): drops every entry, so size() returns 0 and lookups miss.
describe('reset', () => {
121+
it('should clear all cache entries', () => {
122+
ec2RunnerCountCache.set('prod', 'Org', 'org-a', 10);
123+
ec2RunnerCountCache.set('prod', 'Org', 'org-b', 20);
124+
125+
expect(ec2RunnerCountCache.size()).toBe(2);
126+
127+
ec2RunnerCountCache.reset();
128+
129+
expect(ec2RunnerCountCache.size()).toBe(0);
130+
expect(ec2RunnerCountCache.get('prod', 'Org', 'org-a')).toBeUndefined();
131+
});
132+
});
133+
134+
// size(): reports the number of distinct keys currently stored.
describe('size', () => {
135+
it('should return correct cache size', () => {
136+
expect(ec2RunnerCountCache.size()).toBe(0);
137+
138+
ec2RunnerCountCache.set('prod', 'Org', 'org-a', 10);
139+
expect(ec2RunnerCountCache.size()).toBe(1);
140+
141+
ec2RunnerCountCache.set('prod', 'Org', 'org-b', 20);
142+
expect(ec2RunnerCountCache.size()).toBe(2);
143+
});
144+
});
145+
});
146+
147+
// Unit tests for dynamoDbRunnerCountCache against a mocked DynamoDBClient:
// enablement via initialize(), and get() for missing, fresh, stale, negative and failing reads.
describe('dynamoDbRunnerCountCache', () => {
148+
// Start each test with the cache reset, no programmed mock responses, and fake timers installed.
beforeEach(() => {
149+
dynamoDbRunnerCountCache.reset();
150+
mockDynamoDBClient.reset();
151+
vi.useFakeTimers();
152+
});
153+
154+
// Restore the real clock so fake timers do not leak into other suites.
afterEach(() => {
155+
vi.useRealTimers();
156+
});
157+
158+
// isEnabled(): false after reset() (see beforeEach), true once initialize() has been called.
describe('isEnabled', () => {
159+
it('should return false when not initialized', () => {
160+
expect(dynamoDbRunnerCountCache.isEnabled()).toBe(false);
161+
});
162+
163+
it('should return true after initialization', () => {
164+
dynamoDbRunnerCountCache.initialize('test-table', 'us-east-1', 60000);
165+
expect(dynamoDbRunnerCountCache.isEnabled()).toBe(true);
166+
});
167+
});
168+
169+
// get(): resolves to null when there is nothing usable, otherwise { count, isStale }.
describe('get', () => {
170+
// NOTE(review): arguments look like (table name, region, staleness threshold in ms).
// The fresh (30s old) and stale (120s old) cases below straddle 60000 ms, which
// suggests the third argument is the staleness cutoff; confirm against the implementation.
beforeEach(() => {
171+
dynamoDbRunnerCountCache.initialize('test-table', 'us-east-1', 60000);
172+
});
173+
174+
it('should return null when item not found in DynamoDB', async () => {
175+
mockDynamoDBClient.on(GetItemCommand).resolves({
176+
Item: undefined,
177+
});
178+
179+
const result = await dynamoDbRunnerCountCache.get('prod', 'Org', 'my-org');
180+
expect(result).toBeNull();
181+
});
182+
183+
// NOTE(review): the mocked item suggests a schema of pk = 'env#type#owner', a numeric
// `count`, and `updated` as an epoch-milliseconds timestamp; verify against the table definition.
it('should return count and isStale=false when item is fresh', async () => {
184+
const now = Date.now();
185+
mockDynamoDBClient.on(GetItemCommand).resolves({
186+
Item: {
187+
pk: { S: 'prod#Org#my-org' },
188+
count: { N: '10' },
189+
updated: { N: String(now - 30000) }, // 30 seconds ago
190+
},
191+
});
192+
193+
const result = await dynamoDbRunnerCountCache.get('prod', 'Org', 'my-org');
194+
expect(result).toEqual({ count: 10, isStale: false });
195+
});
196+
197+
// A stale item is still returned; only the isStale flag changes.
it('should return count and isStale=true when item is stale', async () => {
198+
const now = Date.now();
199+
mockDynamoDBClient.on(GetItemCommand).resolves({
200+
Item: {
201+
pk: { S: 'prod#Org#my-org' },
202+
count: { N: '10' },
203+
updated: { N: String(now - 120000) }, // 2 minutes ago
204+
},
205+
});
206+
207+
const result = await dynamoDbRunnerCountCache.get('prod', 'Org', 'my-org');
208+
expect(result).toEqual({ count: 10, isStale: true });
209+
});
210+
211+
// A stored negative counter is expected to be reported as 0, not passed through.
it('should return count >= 0 even if DynamoDB count is negative', async () => {
212+
const now = Date.now();
213+
mockDynamoDBClient.on(GetItemCommand).resolves({
214+
Item: {
215+
pk: { S: 'prod#Org#my-org' },
216+
count: { N: '-5' }, // Negative count due to race conditions
217+
updated: { N: String(now) },
218+
},
219+
});
220+
221+
const result = await dynamoDbRunnerCountCache.get('prod', 'Org', 'my-org');
222+
expect(result).toEqual({ count: 0, isStale: false });
223+
});
224+
225+
// A rejected GetItemCommand must resolve to null rather than reject the get() promise.
it('should return null on DynamoDB error', async () => {
226+
mockDynamoDBClient.on(GetItemCommand).rejects(new Error('DynamoDB error'));
227+
228+
const result = await dynamoDbRunnerCountCache.get('prod', 'Org', 'my-org');
229+
expect(result).toBeNull();
230+
});
231+
232+
// reset() undoes the initialize() from this suite's beforeEach, so the cache is disabled again.
it('should return null when not enabled', async () => {
233+
dynamoDbRunnerCountCache.reset();
234+
235+
const result = await dynamoDbRunnerCountCache.get('prod', 'Org', 'my-org');
236+
expect(result).toBeNull();
237+
});
238+
});
239+
});

0 commit comments

Comments
 (0)