Skip to content

Commit 272cb36

Browse files
feat: add monitoring to cronjobs (#1835)
Closes #1792
1 parent d1cbe99 commit 272cb36

7 files changed

Lines changed: 237 additions & 0 deletions

File tree

.env.example

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,18 @@ PROMETHEUS_METRICS_ENABLED=False
6565
# This maps external port 2001 to the internal Prometheus metrics port
6666
EXTERNAL_PROM_METRICS_PORT=2001
6767

68+
# -----------------------------------------------------------------------------
69+
# Cronjob monitoring (optional)
70+
# -----------------------------------------------------------------------------
71+
# Healthchecks.io private tokens/UUIDs used by cronjob monitoring.
72+
# The public base URL is defined in code and not stored in env.
73+
HEALTHCHECK_ID_DELETE_UNUSED_HARDWARE_STATUS=
74+
HEALTHCHECK_ID_NOTIFICATIONS_HARDWARE_SUMMARY=
75+
HEALTHCHECK_ID_NOTIFICATIONS_METRICS_SUMMARY=
76+
HEALTHCHECK_ID_NOTIFICATIONS_NEW_ISSUES=
77+
HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MICROSOFT=
78+
HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MAESTRO=
79+
6880
# -----------------------------------------------------------------------------
6981
# Email / Notifications (optional)
7082
# -----------------------------------------------------------------------------

backend/kernelCI/settings.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,30 @@ def get_json_env_var(name, default):
124124
# To run cronjobs locally, execute
125125
# poetry run ./manage.py crontab arg
126126
# where "arg" is add, remove or show
127+
128+
# Public Healthchecks.io ping endpoint. Only the per-job tokens are secret,
# so the base URL lives in code while the tokens come from the environment.
HEALTHCHECK_BASE_URL = "https://hc-ping.com"

# Maps monitoring_id to the relative_path that will be appended to the base
# healthcheck URL. Each path is read from the HEALTHCHECK_ID_<MONITORING_ID>
# environment variable and defaults to "" (job not monitored) when unset.
HEALTHCHECK_MONITORING_PATH_MAP: dict[str, str] = {
    monitoring_id: os.environ.get(f"HEALTHCHECK_ID_{monitoring_id.upper()}", "")
    for monitoring_id in (
        "delete_unused_hardware_status",
        "notifications_hardware_summary",
        "notifications_metrics_summary",
        "notifications_new_issues",
        "notifications_summary_microsoft",
        "notifications_summary_maestro",
    )
}
150+
127151
SKIP_CRONJOBS = is_boolean_or_string_true(os.environ.get("SKIP_CRONJOBS", False))
128152
if SKIP_CRONJOBS:
129153
CRONJOBS = []
@@ -133,12 +157,15 @@ def get_json_env_var(name, default):
133157
"CRONTAB_COMMAND_SUFFIX", ">> /proc/1/fd/1 2>&1"
134158
)
135159
CRONJOBS = [
160+
# not using a monitoring_id in the first task since it should
161+
# be removed once the denormalization is set in stone
136162
("0 * * * *", "kernelCI_app.tasks.update_checkout_cache"),
137163
(
138164
"59 * * * *",
139165
"django.core.management.call_command",
140166
[
141167
"notifications",
168+
"--monitoring-id=notifications_new_issues",
142169
"--action=new_issues",
143170
"--to=kernelci-results@groups.io",
144171
"--cc=gus@collabora.com",
@@ -151,6 +178,7 @@ def get_json_env_var(name, default):
151178
"django.core.management.call_command",
152179
[
153180
"notifications",
181+
"--monitoring-id=notifications_summary_microsoft",
154182
"--action=summary",
155183
"--to=kernelcialerts@microsoft.com",
156184
"--cc=kernelci-results@groups.io",
@@ -165,6 +193,7 @@ def get_json_env_var(name, default):
165193
"django.core.management.call_command",
166194
[
167195
"notifications",
196+
"--monitoring-id=notifications_summary_maestro",
168197
"--action=summary",
169198
"--add-mailing-lists",
170199
"--send",
@@ -177,6 +206,7 @@ def get_json_env_var(name, default):
177206
"django.core.management.call_command",
178207
[
179208
"notifications",
209+
"--monitoring-id=notifications_hardware_summary",
180210
"--action=hardware_summary",
181211
"--cc=kernelci-results@groups.io",
182212
"--send",
@@ -188,13 +218,15 @@ def get_json_env_var(name, default):
188218
"django.core.management.call_command",
189219
[
190220
"delete_unused_hardware_status",
221+
"--monitoring-id=delete_unused_hardware_status",
191222
],
192223
),
193224
(
194225
"0 0 * * 6",
195226
"django.core.management.call_command",
196227
[
197228
"notifications",
229+
"--monitoring-id=notifications_metrics_summary",
198230
"--action=metrics_summary",
199231
"--to=kernelci@lists.linux.dev",
200232
"--cc=kernelci-results@groups.io",

backend/kernelCI_app/management/commands/delete_unused_hardware_status.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
import logging
88
from django.core.management.base import BaseCommand
99
from django.db import transaction
10+
from kernelCI_app.management.commands.helpers.healthcheck import (
11+
MONITORING_ID_PARAM_HELP_TEXT,
12+
run_with_healthcheck_monitoring,
13+
)
1014
from kernelCI_app.models import HardwareStatus, LatestCheckout, ProcessedListingItems
1115

1216
logger = logging.getLogger(__name__)
@@ -30,8 +34,21 @@ def add_arguments(self, parser):
3034
default=10000,
3135
help="Number of records to delete per batch (default: 10000)",
3236
)
37+
parser.add_argument(
38+
"--monitoring-id",
39+
type=str,
40+
default=None,
41+
help=MONITORING_ID_PARAM_HELP_TEXT,
42+
)
3343

3444
def handle(self, *args, **options):
45+
monitoring_id = options.get("monitoring_id")
46+
return run_with_healthcheck_monitoring(
47+
monitoring_id=monitoring_id,
48+
action=lambda: self._run_action(options),
49+
)
50+
51+
def _run_action(self, options):
3552
dry_run = options["dry_run"]
3653
batch_size = options["batch_size"]
3754

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
from collections.abc import Callable
2+
from typing import Any, Literal
3+
4+
from django.conf import settings
5+
6+
import requests
7+
8+
from kernelCI_app.helpers.logger import log_message
9+
10+
MONITORING_ID_PARAM_HELP_TEXT = (
11+
"Monitoring ID configured in settings for healthcheck.io pings "
12+
"(optional, used only for monitoring the command execution over time)"
13+
)
14+
type PingStatus = Literal["start", "fail", "success"]
15+
16+
17+
def _resolve_monitoring_url(*, monitoring_id: str, status: PingStatus) -> str | None:
    """Build the healthcheck ping URL for a monitored job.

    Args:
        monitoring_id: Key looked up in ``settings.HEALTHCHECK_MONITORING_PATH_MAP``.
        status: Which ping to send. ``"start"`` and ``"fail"`` are appended as a
            sub-path; ``"success"`` pings the check URL itself with no suffix.

    Returns:
        The full URL to ping, or ``None`` when no usable path is configured
        for ``monitoring_id`` (unknown id, empty value, or a value made up
        only of whitespace and slashes).
    """
    base_url: str = settings.HEALTHCHECK_BASE_URL
    path_map: dict[str, str] = settings.HEALTHCHECK_MONITORING_PATH_MAP

    # The path comes from an environment variable, so it may carry stray
    # whitespace or slashes. Normalise it *before* the emptiness check so a
    # value such as " " or "/" is treated as "not configured" instead of
    # producing a ping against the bare base URL.
    monitoring_path = (path_map.get(monitoring_id) or "").strip().strip("/")
    if not monitoring_path:
        return None

    # Success just needs to ping base url + uuid, no subpath
    status_suffix = "" if status == "success" else f"/{status}"

    return f"{base_url.rstrip('/')}/{monitoring_path}{status_suffix}"
29+
30+
31+
def _ping_healthcheck(*, monitoring_id: str, status: PingStatus) -> None:
    """Send a single best-effort ping for ``monitoring_id`` with the given status.

    The ping is an HTTP GET with a 10 second timeout. When no URL is
    configured for ``monitoring_id`` the ping is skipped and a message is
    logged. Request failures (``requests.RequestException``) are logged and
    not re-raised.

    The ping URL embeds the private check token, so it is deliberately kept
    out of every log message; only the monitoring id, the status and, on
    failure, the exception type and HTTP status code are logged.

    Args:
        monitoring_id: Key identifying the monitored job in settings.
        status: Which ping to send (``"start"``, ``"fail"`` or ``"success"``).
    """
    monitoring_url = _resolve_monitoring_url(monitoring_id=monitoring_id, status=status)
    if not monitoring_url:
        log_message(
            "No healthcheck URL configured for monitoring_id='%s', skipping %s ping."
            % (monitoring_id, status)
        )
        return

    try:
        response = requests.get(monitoring_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # str(e) from requests typically contains the request URL or path
        # (and therefore the private token), so report only the exception
        # type and the HTTP status code when a response was received.
        failed_response = getattr(e, "response", None)
        status_code = getattr(failed_response, "status_code", None)
        log_message(
            "ERROR: failed to ping healthcheck for monitoring_id='%s' and status='%s': "
            "%s (HTTP status: %s)"
            % (monitoring_id, status, type(e).__name__, status_code)
        )
        return

    log_message(
        "Success at pinging healthcheck with monitoring_id '%s' and status '%s'"
        % (monitoring_id, status)
    )
52+
53+
54+
def run_with_healthcheck_monitoring(
55+
*, monitoring_id: str | None, action: Callable[[], Any]
56+
) -> Any:
57+
if not monitoring_id:
58+
return action()
59+
60+
_ping_healthcheck(monitoring_id=monitoring_id, status="start")
61+
62+
try:
63+
result = action()
64+
except Exception:
65+
_ping_healthcheck(monitoring_id=monitoring_id, status="fail")
66+
raise
67+
68+
_ping_healthcheck(monitoring_id=monitoring_id, status="success")
69+
return result

backend/kernelCI_app/management/commands/notifications.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818
setup_jinja_template,
1919
send_email_report,
2020
)
21+
from kernelCI_app.management.commands.helpers.healthcheck import (
22+
MONITORING_ID_PARAM_HELP_TEXT,
23+
run_with_healthcheck_monitoring,
24+
)
2125

2226
from kernelCI_app.management.commands.helpers.summary import (
2327
SIGNUP_FOLDER,
@@ -897,6 +901,12 @@ def add_arguments(self, parser):
897901
action="store_true",
898902
help="Ignore recipients.yaml file (optional for all actions)",
899903
)
904+
parser.add_argument(
905+
"--monitoring-id",
906+
type=str,
907+
default=None,
908+
help=MONITORING_ID_PARAM_HELP_TEXT,
909+
)
900910

901911
# Action argument (replaces subparsers)
902912
actions = [
@@ -977,6 +987,13 @@ def add_arguments(self, parser):
977987
)
978988

979989
def handle(self, *args, **options):
990+
monitoring_id = options.get("monitoring_id")
991+
return run_with_healthcheck_monitoring(
992+
monitoring_id=monitoring_id,
993+
action=lambda: self._run_action(options),
994+
)
995+
996+
def _run_action(self, options):
980997
# Setup connections
981998
service = smtp_setup_connection()
982999

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
from django.test import SimpleTestCase, override_settings
2+
from unittest.mock import Mock, patch
3+
4+
from kernelCI_app.management.commands.helpers.healthcheck import (
5+
_resolve_monitoring_url,
6+
run_with_healthcheck_monitoring,
7+
)
8+
9+
TEST_BASE_URL = "https://example.com"
10+
11+
12+
@override_settings(
    HEALTHCHECK_MONITORING_PATH_MAP={
        "job-1": "private-token",
        "job-2": "something/with/slashes",
    },
    HEALTHCHECK_BASE_URL=TEST_BASE_URL,
)
class TestRunWithHealthcheckMonitoring(SimpleTestCase):
    """Tests for URL resolution and the start/success/fail ping lifecycle.

    ``requests.get`` is patched in every test that could reach the network,
    so no real HTTP request is made.
    """

    def test_resolve_monitoring_url_success(self):
        result = _resolve_monitoring_url(monitoring_id="job-1", status="start")
        self.assertEqual(result, f"{TEST_BASE_URL}/private-token/start")

    def test_resolve_monitoring_url_success_status_no_suffix(self):
        result = _resolve_monitoring_url(monitoring_id="job-1", status="success")
        self.assertEqual(result, f"{TEST_BASE_URL}/private-token")

    def test_resolve_monitoring_url_keeps_inner_slashes(self):
        # Slashes inside the configured path belong to the path and must survive.
        result = _resolve_monitoring_url(monitoring_id="job-2", status="fail")
        self.assertEqual(result, f"{TEST_BASE_URL}/something/with/slashes/fail")

    def test_resolve_monitoring_url_unknown_id_returns_none(self):
        result = _resolve_monitoring_url(monitoring_id="missing-id", status="start")
        self.assertIsNone(result)

    @patch("kernelCI_app.management.commands.helpers.healthcheck.requests.get")
    def test_success_path_pings_start_and_success(self, mock_get):
        response = Mock()
        response.raise_for_status.return_value = None
        mock_get.return_value = response

        result = run_with_healthcheck_monitoring(
            monitoring_id="job-1", action=lambda: "ok"
        )

        self.assertEqual(result, "ok")
        self.assertEqual(mock_get.call_count, 2)
        mock_get.assert_any_call(f"{TEST_BASE_URL}/private-token/start", timeout=10)
        mock_get.assert_any_call(f"{TEST_BASE_URL}/private-token", timeout=10)

    @patch("kernelCI_app.management.commands.helpers.healthcheck.requests.get")
    def test_failure_path_pings_start_and_fail(self, mock_get):
        response = Mock()
        response.raise_for_status.return_value = None
        mock_get.return_value = response

        with self.assertRaisesRegex(RuntimeError, "boom"):
            run_with_healthcheck_monitoring(
                monitoring_id="job-1",
                # A lambda cannot contain `raise`; throwing into an empty
                # generator raises the exception from inside the action.
                action=lambda: (_ for _ in ()).throw(RuntimeError("boom")),
            )

        self.assertEqual(mock_get.call_count, 2)
        mock_get.assert_any_call(f"{TEST_BASE_URL}/private-token/start", timeout=10)
        mock_get.assert_any_call(f"{TEST_BASE_URL}/private-token/fail", timeout=10)

    @patch("kernelCI_app.management.commands.helpers.healthcheck.requests.get")
    def test_no_monitoring_id_skips_pings(self, mock_get):
        result = run_with_healthcheck_monitoring(monitoring_id=None, action=lambda: 42)

        self.assertEqual(result, 42)
        mock_get.assert_not_called()

    @patch("kernelCI_app.management.commands.helpers.healthcheck.requests.get")
    def test_unknown_monitoring_id_skips_network_and_runs_action(self, mock_get):
        result = run_with_healthcheck_monitoring(
            monitoring_id="missing-id", action=lambda: "ran"
        )

        self.assertEqual(result, "ran")
        mock_get.assert_not_called()

docs/monitoring.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,23 @@ The monitoring system supports multi-worker Gunicorn deployments using Prometheu
9797
- `PROMETHEUS_METRICS_PORT`: Port for the metrics aggregator (default: `8001`)
9898
- `PROMETHEUS_MULTIPROC_DIR`: Directory for multiprocess metric files (default: `/tmp/prometheus_multiproc_dir`)
9999

100+
### Cronjob Healthchecks
101+
102+
The backend can ping Healthchecks.io (`hc-ping.com`) for cronjobs that run Django management commands.
103+
104+
- The public base URL is defined in code as `HEALTHCHECK_BASE_URL`.
105+
- Private monitor tokens stay in environment variables and are mapped in Django settings.
106+
- Each monitored cron run pings `/start` when it begins, then either the check URL itself (no suffix) on success or `/fail` if the command raises.
107+
108+
Configure these variables in `.env.backend`:
109+
110+
- `HEALTHCHECK_ID_DELETE_UNUSED_HARDWARE_STATUS`
111+
- `HEALTHCHECK_ID_NOTIFICATIONS_HARDWARE_SUMMARY`
112+
- `HEALTHCHECK_ID_NOTIFICATIONS_METRICS_SUMMARY`
113+
- `HEALTHCHECK_ID_NOTIFICATIONS_NEW_ISSUES`
114+
- `HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MICROSOFT`
115+
- `HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MAESTRO`
116+
100117
## `prometheus.yml`
101118
- **Target**: `host.docker.internal:8001` (backend running locally)
102119
- **Metrics Path**: `/metrics/`

0 commit comments

Comments
 (0)