Skip to content

Commit 584b846

Browse files
fix: job retry mechanism not triggering
Co-authored-by: Brend Smits <brend.smits@philips.com>
1 parent: 391a65f · commit: 584b846

2 files changed

Lines changed: 147 additions & 1 deletion

File tree

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import { createRunner, listEC2Runners } from './../aws/runners';
1010
import { RunnerInputParameters } from './../aws/runners.d';
1111
import * as scaleUpModule from './scale-up';
1212
import { getParameter } from '@aws-github-runner/aws-ssm-util';
13+
import { publishRetryMessage } from './job-retry';
1314
import { describe, it, expect, beforeEach, vi } from 'vitest';
1415
import type { Octokit } from '@octokit/rest';
1516

@@ -33,6 +34,7 @@ const mockCreateRunner = vi.mocked(createRunner);
3334
const mockListRunners = vi.mocked(listEC2Runners);
3435
const mockSSMClient = mockClient(SSMClient);
3536
const mockSSMgetParameter = vi.mocked(getParameter);
37+
const mockPublishRetryMessage = vi.mocked(publishRetryMessage);
3638

3739
vi.mock('@octokit/rest', () => ({
3840
Octokit: vi.fn().mockImplementation(function () {
@@ -63,6 +65,11 @@ vi.mock('@aws-github-runner/aws-ssm-util', async () => {
6365
};
6466
});
6567

68+
vi.mock('./job-retry', () => ({
69+
publishRetryMessage: vi.fn(),
70+
checkAndRetryJob: vi.fn(),
71+
}));
72+
6673
export type RunnerType = 'ephemeral' | 'non-ephemeral';
6774

6875
// for ephemeral and non-ephemeral runners
@@ -1667,6 +1674,143 @@ describe('scaleUp with Github Data Residency', () => {
16671674
});
16681675
});
16691676

1677+
describe('Retry mechanism tests', () => {
1678+
beforeEach(() => {
1679+
process.env.ENABLE_ORGANIZATION_RUNNERS = 'true';
1680+
process.env.ENABLE_EPHEMERAL_RUNNERS = 'true';
1681+
process.env.ENABLE_JOB_QUEUED_CHECK = 'true';
1682+
process.env.RUNNERS_MAXIMUM_COUNT = '10';
1683+
expectedRunnerParams = { ...EXPECTED_RUNNER_PARAMS };
1684+
mockSSMClient.reset();
1685+
});
1686+
1687+
const createTestMessages = (
1688+
count: number,
1689+
overrides: Partial<scaleUpModule.ActionRequestMessageSQS>[] = [],
1690+
): scaleUpModule.ActionRequestMessageSQS[] => {
1691+
return Array.from({ length: count }, (_, i) => ({
1692+
...TEST_DATA_SINGLE,
1693+
id: i + 1,
1694+
messageId: `message-${i + 1}`,
1695+
...overrides[i],
1696+
}));
1697+
};
1698+
1699+
it('calls publishRetryMessage for each valid message when job is queued', async () => {
1700+
const messages = createTestMessages(3);
1701+
1702+
await scaleUpModule.scaleUp(messages);
1703+
1704+
expect(mockPublishRetryMessage).toHaveBeenCalledTimes(3);
1705+
expect(mockPublishRetryMessage).toHaveBeenNthCalledWith(1, expect.objectContaining({
1706+
id: 1,
1707+
messageId: 'message-1',
1708+
}));
1709+
expect(mockPublishRetryMessage).toHaveBeenNthCalledWith(2, expect.objectContaining({
1710+
id: 2,
1711+
messageId: 'message-2',
1712+
}));
1713+
expect(mockPublishRetryMessage).toHaveBeenNthCalledWith(3, expect.objectContaining({
1714+
id: 3,
1715+
messageId: 'message-3',
1716+
}));
1717+
});
1718+
1719+
it('does not call publishRetryMessage when job is not queued', async () => {
1720+
mockOctokit.actions.getJobForWorkflowRun.mockImplementation((params) => {
1721+
const isQueued = params.job_id === 1; // Only job 1 is queued
1722+
return {
1723+
data: {
1724+
status: isQueued ? 'queued' : 'completed',
1725+
},
1726+
};
1727+
});
1728+
1729+
const messages = createTestMessages(3);
1730+
1731+
await scaleUpModule.scaleUp(messages);
1732+
1733+
// Only message with id 1 should trigger retry
1734+
expect(mockPublishRetryMessage).toHaveBeenCalledTimes(1);
1735+
expect(mockPublishRetryMessage).toHaveBeenCalledWith(expect.objectContaining({
1736+
id: 1,
1737+
messageId: 'message-1',
1738+
}));
1739+
});
1740+
1741+
it('calls publishRetryMessage even when maximum runners is reached', async () => {
1742+
process.env.RUNNERS_MAXIMUM_COUNT = '0'; // No runners can be created
1743+
1744+
const messages = createTestMessages(2);
1745+
1746+
await scaleUpModule.scaleUp(messages);
1747+
1748+
// publishRetryMessage should still be called even though no runners will be created
1749+
expect(mockPublishRetryMessage).toHaveBeenCalledTimes(2);
1750+
expect(createRunner).not.toHaveBeenCalled();
1751+
});
1752+
1753+
it('calls publishRetryMessage with correct message structure including retry counter', async () => {
1754+
const message = {
1755+
...TEST_DATA_SINGLE,
1756+
messageId: 'test-message-id',
1757+
retryCounter: 2,
1758+
};
1759+
1760+
await scaleUpModule.scaleUp([message]);
1761+
1762+
expect(mockPublishRetryMessage).toHaveBeenCalledWith(expect.objectContaining({
1763+
id: message.id,
1764+
messageId: 'test-message-id',
1765+
retryCounter: 2,
1766+
}));
1767+
});
1768+
1769+
it('calls publishRetryMessage when ENABLE_JOB_QUEUED_CHECK is false', async () => {
1770+
process.env.ENABLE_JOB_QUEUED_CHECK = 'false';
1771+
1772+
const messages = createTestMessages(2);
1773+
1774+
await scaleUpModule.scaleUp(messages);
1775+
1776+
// Should always call publishRetryMessage when queue check is disabled
1777+
expect(mockPublishRetryMessage).toHaveBeenCalledTimes(2);
1778+
expect(mockOctokit.actions.getJobForWorkflowRun).not.toHaveBeenCalled();
1779+
});
1780+
1781+
it('calls publishRetryMessage for each message in a multi-runner scenario', async () => {
1782+
const messages = createTestMessages(5);
1783+
1784+
await scaleUpModule.scaleUp(messages);
1785+
1786+
expect(mockPublishRetryMessage).toHaveBeenCalledTimes(5);
1787+
messages.forEach((msg, index) => {
1788+
expect(mockPublishRetryMessage).toHaveBeenNthCalledWith(index + 1, expect.objectContaining({
1789+
id: msg.id,
1790+
messageId: msg.messageId,
1791+
}));
1792+
});
1793+
});
1794+
1795+
it('calls publishRetryMessage before runner creation', async () => {
1796+
const messages = createTestMessages(1);
1797+
1798+
const callOrder: string[] = [];
1799+
mockPublishRetryMessage.mockImplementation(() => {
1800+
callOrder.push('publishRetryMessage');
1801+
return Promise.resolve();
1802+
});
1803+
mockCreateRunner.mockImplementation(async () => {
1804+
callOrder.push('createRunner');
1805+
return ['i-12345'];
1806+
});
1807+
1808+
await scaleUpModule.scaleUp(messages);
1809+
1810+
expect(callOrder).toEqual(['publishRetryMessage', 'createRunner']);
1811+
});
1812+
});
1813+
16701814
function defaultOctokitMockImpl() {
16711815
mockOctokit.actions.getJobForWorkflowRun.mockImplementation(() => ({
16721816
data: {

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient
77
import { createRunner, listEC2Runners, tag } from './../aws/runners';
88
import { RunnerInputParameters } from './../aws/runners.d';
99
import { metricGitHubAppRateLimit } from '../github/rate-limit';
10+
import { publishRetryMessage } from './job-retry';
1011

1112
const logger = createChildLogger('scale-up');
1213

@@ -356,6 +357,7 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
356357
}
357358

358359
scaleUp++;
360+
await publishRetryMessage(message as ActionRequestMessageRetry);
359361
}
360362

361363
if (scaleUp === 0) {
@@ -395,7 +397,7 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
395397
}
396398

397399
// No runners will be created, so skip calling the EC2 API.
398-
if (missingInstanceCount === scaleUp) {
400+
if (newRunners <= 0) {
399401
continue;
400402
}
401403
}

0 commit comments

Comments (0)