Skip to content

Commit a772fc2

Browse files
fix: retry SQS messages when GitHub API fails after EC2 instance creation
1 parent c39f4a3 commit a772fc2

File tree

5 files changed

+236
-6
lines changed

5 files changed

+236
-6
lines changed

lambdas/functions/control-plane/src/lambda.test.ts

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { Context, SQSEvent, SQSRecord } from 'aws-lambda';
33

44
import { addMiddleware, adjustPool, scaleDownHandler, scaleUpHandler, ssmHousekeeper, jobRetryCheck } from './lambda';
55
import { adjust } from './pool/pool';
6-
import ScaleError from './scale-runners/ScaleError';
6+
import ScaleError, { GHHttpError } from './scale-runners/ScaleError';
77
import { scaleDown } from './scale-runners/scale-down';
88
import { ActionRequestMessage, scaleUp } from './scale-runners/scale-up';
99
import { cleanSSMTokens } from './scale-runners/ssm-housekeeper';
@@ -229,6 +229,43 @@ describe('Test scale up lambda wrapper.', () => {
229229
batchItemFailures: [{ itemIdentifier: 'message-0' }, { itemIdentifier: 'message-1' }],
230230
});
231231
});
232+
233+
it('Should return all messages as batch item failures when scaleUp throws GHHttpError', async () => {
234+
const records = createMultipleRecords(3);
235+
const multiRecordEvent: SQSEvent = { Records: records };
236+
237+
const error = new GHHttpError('Validation Failed', 422);
238+
vi.mocked(scaleUp).mockRejectedValue(error);
239+
240+
const result = await scaleUpHandler(multiRecordEvent, context);
241+
expect(result).toEqual({
242+
batchItemFailures: [
243+
{ itemIdentifier: 'message-0' },
244+
{ itemIdentifier: 'message-1' },
245+
{ itemIdentifier: 'message-2' },
246+
],
247+
});
248+
});
249+
250+
it('Should return single message as batch item failure when scaleUp throws GHHttpError', async () => {
251+
const error = new GHHttpError('Bad credentials', 401);
252+
vi.mocked(scaleUp).mockRejectedValue(error);
253+
254+
const result = await scaleUpHandler(sqsEvent, context);
255+
expect(result).toEqual({
256+
batchItemFailures: [{ itemIdentifier: sqsRecord.messageId }],
257+
});
258+
});
259+
260+
it('Should not return batch item failures for generic errors (not ScaleError or GHHttpError)', async () => {
261+
const records = createMultipleRecords(2);
262+
const multiRecordEvent: SQSEvent = { Records: records };
263+
264+
vi.mocked(scaleUp).mockRejectedValue(new Error('Some unexpected error'));
265+
266+
const result = await scaleUpHandler(multiRecordEvent, context);
267+
expect(result).toEqual({ batchItemFailures: [] });
268+
});
232269
});
233270
});
234271

lambdas/functions/control-plane/src/scale-runners/ScaleError.test.ts

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { describe, expect, it } from 'vitest';
22
import type { ActionRequestMessageSQS } from './scale-up';
3-
import ScaleError from './ScaleError';
3+
import ScaleError, { GHHttpError } from './ScaleError';
44

55
describe('ScaleError', () => {
66
describe('detailedMessage', () => {
@@ -74,3 +74,87 @@ describe('ScaleError', () => {
7474
});
7575
});
7676
});
77+
78+
describe('GHHttpError', () => {
79+
describe('constructor', () => {
80+
it('should set name, message, and status', () => {
81+
const error = new GHHttpError('Validation Failed', 422);
82+
83+
expect(error.name).toBe('GHHttpError');
84+
expect(error.message).toBe('Validation Failed');
85+
expect(error.status).toBe(422);
86+
});
87+
88+
it('should be an instance of ScaleError', () => {
89+
const error = new GHHttpError('Not Found', 404);
90+
91+
expect(error).toBeInstanceOf(ScaleError);
92+
expect(error).toBeInstanceOf(GHHttpError);
93+
expect(error).toBeInstanceOf(Error);
94+
});
95+
});
96+
97+
describe('detailedMessage', () => {
98+
it.each([
99+
{ message: 'Validation Failed', status: 422, expected: 'GitHub API HTTP error (status 422): Validation Failed' },
100+
{ message: 'Bad credentials', status: 401, expected: 'GitHub API HTTP error (status 401): Bad credentials' },
101+
{ message: 'Not Found', status: 404, expected: 'GitHub API HTTP error (status 404): Not Found' },
102+
{
103+
message: 'Internal Server Error',
104+
status: 500,
105+
expected: 'GitHub API HTTP error (status 500): Internal Server Error',
106+
},
107+
])('should format message for status $status', ({ message, status, expected }) => {
108+
const error = new GHHttpError(message, status);
109+
110+
expect(error.detailedMessage).toBe(expected);
111+
});
112+
});
113+
114+
describe('toBatchItemFailures', () => {
115+
const mockMessages: ActionRequestMessageSQS[] = [
116+
{ messageId: 'msg-1', id: 1, eventType: 'workflow_job' },
117+
{ messageId: 'msg-2', id: 2, eventType: 'workflow_job' },
118+
{ messageId: 'msg-3', id: 3, eventType: 'workflow_job' },
119+
];
120+
121+
it('should retry ALL messages regardless of status', () => {
122+
const error = new GHHttpError('Validation Failed', 422);
123+
const failures = error.toBatchItemFailures(mockMessages);
124+
125+
expect(failures).toEqual([{ itemIdentifier: 'msg-1' }, { itemIdentifier: 'msg-2' }, { itemIdentifier: 'msg-3' }]);
126+
});
127+
128+
it('should retry the single message when only one is provided', () => {
129+
const error = new GHHttpError('Bad credentials', 401);
130+
const failures = error.toBatchItemFailures([mockMessages[0]]);
131+
132+
expect(failures).toEqual([{ itemIdentifier: 'msg-1' }]);
133+
});
134+
135+
it('should return empty array for empty messages', () => {
136+
const error = new GHHttpError('Server Error', 500);
137+
const failures = error.toBatchItemFailures([]);
138+
139+
expect(failures).toEqual([]);
140+
});
141+
142+
it('should retry all messages unlike ScaleError which retries only failedInstanceCount', () => {
143+
const messages: ActionRequestMessageSQS[] = [
144+
{ messageId: 'msg-1', id: 1, eventType: 'workflow_job' },
145+
{ messageId: 'msg-2', id: 2, eventType: 'workflow_job' },
146+
{ messageId: 'msg-3', id: 3, eventType: 'workflow_job' },
147+
{ messageId: 'msg-4', id: 4, eventType: 'workflow_job' },
148+
{ messageId: 'msg-5', id: 5, eventType: 'workflow_job' },
149+
];
150+
151+
// ScaleError with failedInstanceCount=1 retries only 1 message
152+
const scaleError = new ScaleError(1);
153+
expect(scaleError.toBatchItemFailures(messages)).toHaveLength(1);
154+
155+
// GHHttpError retries ALL messages
156+
const ghError = new GHHttpError('error', 422);
157+
expect(ghError.toBatchItemFailures(messages)).toHaveLength(5);
158+
});
159+
});
160+
});

lambdas/functions/control-plane/src/scale-runners/ScaleError.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,35 @@ class ScaleError extends Error {
2626
}
2727
}
2828

29+
/**
30+
* Custom error for GitHub HTTP API failures during runner config creation.
31+
* Extends ScaleError so it is caught by the same handler in the Lambda entry point.
32+
*
33+
* Unlike a plain ScaleError (which retries only `failedInstanceCount` messages),
34+
* a GHHttpError retries ALL messages because the GitHub API failure may affect
35+
* every instance that was just launched.
36+
*/
37+
export class GHHttpError extends ScaleError {
38+
public readonly status: number;
39+
40+
constructor(message: string, status: number) {
41+
super();
42+
this.message = message;
43+
this.name = 'GHHttpError';
44+
this.status = status;
45+
}
46+
47+
public override get detailedMessage(): string {
48+
return `GitHub API HTTP error (status ${this.status}): ${this.message}`;
49+
}
50+
51+
/**
52+
* Override: retry ALL messages because the GitHub API error affects the
53+
* entire batch of instances that were already created.
54+
*/
55+
public override toBatchItemFailures(messages: ActionRequestMessageSQS[]): SQSBatchItemFailure[] {
56+
return messages.map(({ messageId }) => ({ itemIdentifier: messageId }));
57+
}
58+
}
59+
2960
export default ScaleError;

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import * as ghAuth from '../github/auth';
99
import { createRunner, listEC2Runners } from './../aws/runners';
1010
import { RunnerInputParameters } from './../aws/runners.d';
1111
import * as scaleUpModule from './scale-up';
12+
import { GHHttpError } from './ScaleError';
1213
import { getParameter } from '@aws-github-runner/aws-ssm-util';
1314
import { publishRetryMessage } from './job-retry';
1415
import { describe, it, expect, beforeEach, vi } from 'vitest';
@@ -1852,6 +1853,69 @@ describe('Retry mechanism tests', () => {
18521853
});
18531854
});
18541855

1856+
describe('GHHttpError', () => {
1857+
it('should have correct name, message, and status properties', () => {
1858+
const error = new GHHttpError('test error', 422);
1859+
expect(error).toBeInstanceOf(Error);
1860+
expect(error).toBeInstanceOf(GHHttpError);
1861+
expect(error.name).toBe('GHHttpError');
1862+
expect(error.message).toBe('test error');
1863+
expect(error.status).toBe(422);
1864+
});
1865+
1866+
describe('createRunners throws GHHttpError on GitHub API HTTP failure', () => {
1867+
beforeEach(() => {
1868+
process.env.GHES_URL = 'https://github.enterprise.something';
1869+
process.env.ENABLE_ORGANIZATION_RUNNERS = 'true';
1870+
process.env.ENABLE_EPHEMERAL_RUNNERS = 'true';
1871+
process.env.ENABLE_JIT_CONFIG = 'true';
1872+
process.env.RUNNER_NAME_PREFIX = 'unit-test-';
1873+
process.env.RUNNER_GROUP_NAME = 'Default';
1874+
process.env.SSM_CONFIG_PATH = '/github-action-runners/default/runners/config';
1875+
process.env.SSM_TOKEN_PATH = '/github-action-runners/default/runners/config';
1876+
process.env.RUNNER_LABELS = 'label1,label2';
1877+
process.env.RUNNERS_MAXIMUM_COUNT = '3';
1878+
});
1879+
1880+
it('wraps JIT config GitHub HTTP error as GHHttpError', async () => {
1881+
const httpError = new Error('Validation Failed') as Error & { status: number };
1882+
httpError.name = 'HttpError';
1883+
httpError.status = 422;
1884+
mockOctokit.actions.generateRunnerJitconfigForOrg.mockRejectedValue(httpError);
1885+
1886+
await expect(scaleUpModule.scaleUp(TEST_DATA)).rejects.toThrow(GHHttpError);
1887+
await expect(scaleUpModule.scaleUp(TEST_DATA)).rejects.toMatchObject({
1888+
status: 422,
1889+
message: expect.stringContaining('Validation Failed'),
1890+
});
1891+
});
1892+
1893+
it('wraps registration token GitHub HTTP error as GHHttpError', async () => {
1894+
process.env.ENABLE_EPHEMERAL_RUNNERS = 'false';
1895+
process.env.ENABLE_JIT_CONFIG = 'false';
1896+
1897+
const httpError = new Error('Bad credentials') as Error & { status: number };
1898+
httpError.name = 'HttpError';
1899+
httpError.status = 401;
1900+
mockOctokit.actions.createRegistrationTokenForOrg.mockRejectedValue(httpError);
1901+
1902+
await expect(scaleUpModule.scaleUp(TEST_DATA)).rejects.toThrow(GHHttpError);
1903+
await expect(scaleUpModule.scaleUp(TEST_DATA)).rejects.toMatchObject({
1904+
status: 401,
1905+
message: expect.stringContaining('Bad credentials'),
1906+
});
1907+
});
1908+
1909+
it('does not wrap non-HTTP errors as GHHttpError', async () => {
1910+
const genericError = new Error('Some SSM error');
1911+
mockOctokit.actions.generateRunnerJitconfigForOrg.mockRejectedValue(genericError);
1912+
1913+
await expect(scaleUpModule.scaleUp(TEST_DATA)).rejects.toThrow('Some SSM error');
1914+
await expect(scaleUpModule.scaleUp(TEST_DATA)).rejects.not.toBeInstanceOf(GHHttpError);
1915+
});
1916+
});
1917+
});
1918+
18551919
function defaultOctokitMockImpl() {
18561920
mockOctokit.actions.getJobForWorkflowRun.mockImplementation(() => ({
18571921
data: {

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { createRunner, listEC2Runners, tag } from './../aws/runners';
88
import { RunnerInputParameters } from './../aws/runners.d';
99
import { metricGitHubAppRateLimit } from '../github/rate-limit';
1010
import { publishRetryMessage } from './job-retry';
11+
import { GHHttpError } from './ScaleError';
1112

1213
const logger = createChildLogger('scale-up');
1314

@@ -538,10 +539,23 @@ async function createStartRunnerConfig(
538539
instances: string[],
539540
ghClient: Octokit,
540541
) {
541-
if (githubRunnerConfig.enableJitConfig && githubRunnerConfig.ephemeral) {
542-
await createJitConfig(githubRunnerConfig, instances, ghClient);
543-
} else {
544-
await createRegistrationTokenConfig(githubRunnerConfig, instances, ghClient);
542+
try {
543+
if (githubRunnerConfig.enableJitConfig && githubRunnerConfig.ephemeral) {
544+
await createJitConfig(githubRunnerConfig, instances, ghClient);
545+
} else {
546+
await createRegistrationTokenConfig(githubRunnerConfig, instances, ghClient);
547+
}
548+
} catch (error) {
549+
if (error instanceof Error && error.name === 'HttpError') {
550+
const status = (error as Error & { status: number }).status;
551+
logger.error('GitHub API HTTP error during runner config creation', {
552+
status,
553+
message: error.message,
554+
instances,
555+
});
556+
throw new GHHttpError(`Failed to create runner start config: ${error.message}`, status);
557+
}
558+
throw error;
545559
}
546560
}
547561

0 commit comments

Comments
 (0)