Skip to content

Commit 25e2434

Browse files
feat: add monitoring metrics to process_pending container (#1760)
1 parent dd96de9 commit 25e2434

5 files changed

Lines changed: 205 additions & 1 deletion

File tree

backend/kernelCI_app/management/commands/process_pending_aggregations.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
import hashlib
2+
import os
23
import signal
34
import time
45
from datetime import datetime
56
from typing import Literal, Optional, Sequence, TypedDict, Union
7+
from django.conf import settings
68
from django.core.management.base import BaseCommand
79
from django.db import connection, transaction
810
from kernelCI_app.constants.general import MAESTRO_DUMMY_BUILD_PREFIX
911
from kernelCI_app.helpers.logger import out
1012
from kernelCI_app.management.commands.helpers.aggregation_helpers import simplify_status
13+
from prometheus_client import start_http_server
1114
from kernelCI_app.models import (
1215
Builds,
1316
Checkouts,
@@ -17,6 +20,14 @@
1720
SimplifiedStatusChoices,
1821
)
1922

23+
from prometheus_client import Counter
24+
25+
AGGREGATION_RECORDS_WRITTEN = Counter(
26+
"aggregation_records_written_total",
27+
"Total number of records written to destination tables",
28+
["table"], # label values: "tree_listing", "processed_items"; "hardware_status" is reserved but not yet incremented by this change
29+
)
30+
2031

2132
class ListingItemCount(TypedDict):
2233
build_pass: int
@@ -516,6 +527,11 @@ def handle(self, *args, **options):
516527
loop = options["loop"]
517528
interval = options["interval"]
518529

530+
metrics_port = int(os.environ.get("PROMETHEUS_METRICS_PORT", 8001))
531+
if settings.PROMETHEUS_METRICS_ENABLED:
532+
start_http_server(metrics_port)
533+
out(f"Prometheus metrics server started on port {metrics_port}")
534+
519535
if loop:
520536
signal.signal(signal.SIGTERM, self.signal_handler)
521537
signal.signal(signal.SIGINT, self.signal_handler)
@@ -585,6 +601,9 @@ def _process_new_processed_entries(
585601
f"bulk_create ProcessedListingItems: n={len(new_processed_entries)} "
586602
f"in {time.time() - t0:.3f}s"
587603
)
604+
AGGREGATION_RECORDS_WRITTEN.labels(table="processed_items").inc(
605+
len(new_processed_entries)
606+
)
588607

589608
def _process_tree_listing(
590609
self,
@@ -640,6 +659,7 @@ def _process_tree_listing(
640659
)
641660

642661
out(f"Inserted {len(values)} tree_listing records in {time.time() - t0:.3f}s")
662+
AGGREGATION_RECORDS_WRITTEN.labels(table="tree_listing").inc(len(values))
643663

644664
def _process_hardware_status(
645665
self,

docker-compose.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ services:
9393
networks:
9494
- private
9595
- public
96+
environment:
97+
- PENDING_AGGREGATIONS_METRICS_PORT=${PENDING_AGGREGATIONS_METRICS_PORT:-8003}
9698
command:
9799
- poetry
98100
- run
@@ -105,6 +107,11 @@ services:
105107
restart: always
106108
depends_on:
107109
- dashboard_db
110+
ports:
111+
- target: 8001
112+
published: ${PENDING_AGGREGATIONS_METRICS_PORT:-8003}
113+
protocol: tcp
114+
mode: host
108115
profiles: ["with_commands"]
109116

110117
dashboard_db:

docs/monitoring.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ poetry run python manage.py runserver 0.0.0.0:8000 --noreload
4949
3. Add data source
5050
4. Select "Prometheus". URL: `http://prometheus:9090`
5151
5. Import Dashboard by JSON File
52-
6. Select: `monitoring/dashboard.json`
52+
6. Select `monitoring/dashboard.json` to import the API metrics dashboard
53+
7. Repeat step 5 and select `monitoring/aggregation_process.json` to import the Aggregation Process metrics dashboard
5354

5455
### 4. Verify Everything Works
5556
- **Prometheus**: http://localhost:9090 (show targets)
@@ -58,6 +59,8 @@ poetry run python manage.py runserver 0.0.0.0:8000 --noreload
5859

5960
## Dashboard Features
6061

62+
### API Dashboard
63+
6164
After importing `monitoring/dashboard.json`, you'll have:
6265

6366
- **Average Response Time by Endpoint** - Shows response time per endpoint
@@ -69,6 +72,15 @@ After importing the dashboard, you'll have:
6972
- Average Response Time
7073
- Total Time (cumulative time per endpoint)
7174

75+
### Aggregation Process Dashboard
76+
77+
This dashboard provides visibility into the `process_pending_aggregations` command:
78+
79+
- **Records Written Rate**: Per-table rate of the `aggregation_records_written_total` counter, i.e. records written to the `tree_listing`, `hardware_status`, and `processed_items` tables.
80+
- **Health Status**: Time since the last successful batch processing (alerts if > 5 minutes).
81+
- **Batch Duration Percentiles**: p50, p95, and p99 duration of batch processing.
82+
- **Error Rate**: Rate of errors encountered during processing.
83+
7284
## Implementation Details
7385

7486
### Multi-Worker Gunicorn Support
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
{
2+
"annotations": {
3+
"list": [
4+
{
5+
"builtIn": 1,
6+
"datasource": {
7+
"type": "grafana",
8+
"uid": "-- Grafana --"
9+
},
10+
"enable": true,
11+
"hide": true,
12+
"iconColor": "rgba(0, 211, 255, 1)",
13+
"name": "Annotations & Alerts",
14+
"type": "dashboard"
15+
}
16+
]
17+
},
18+
"editable": true,
19+
"fiscalYearStartMonth": 0,
20+
"graphTooltip": 0,
21+
"id": 3,
22+
"links": [],
23+
"panels": [
24+
{
25+
"datasource": {
26+
"type": "prometheus",
27+
"uid": "PBFA97CFB590B2093"
28+
},
29+
"fieldConfig": {
30+
"defaults": {
31+
"color": {
32+
"mode": "palette-classic"
33+
},
34+
"custom": {
35+
"axisBorderShow": false,
36+
"axisCenteredZero": false,
37+
"axisColorMode": "text",
38+
"axisLabel": "",
39+
"axisPlacement": "auto",
40+
"barAlignment": 0,
41+
"barWidthFactor": 0.6,
42+
"drawStyle": "line",
43+
"fillOpacity": 0,
44+
"gradientMode": "none",
45+
"hideFrom": {
46+
"legend": false,
47+
"tooltip": false,
48+
"viz": false
49+
},
50+
"insertNulls": false,
51+
"lineInterpolation": "linear",
52+
"lineWidth": 1,
53+
"pointSize": 5,
54+
"scaleDistribution": {
55+
"type": "linear"
56+
},
57+
"showPoints": "auto",
58+
"spanNulls": false,
59+
"stacking": {
60+
"group": "A",
61+
"mode": "none"
62+
},
63+
"thresholdsStyle": {
64+
"mode": "off"
65+
}
66+
},
67+
"mappings": [],
68+
"thresholds": {
69+
"mode": "absolute",
70+
"steps": [
71+
{
72+
"color": "green",
73+
"value": 0
74+
},
75+
{
76+
"color": "red",
77+
"value": 80
78+
}
79+
]
80+
}
81+
},
82+
"overrides": []
83+
},
84+
"gridPos": {
85+
"h": 8,
86+
"w": 12,
87+
"x": 0,
88+
"y": 0
89+
},
90+
"id": 1,
91+
"options": {
92+
"legend": {
93+
"calcs": [],
94+
"displayMode": "list",
95+
"placement": "bottom",
96+
"showLegend": true
97+
},
98+
"tooltip": {
99+
"hideZeros": false,
100+
"mode": "single",
101+
"sort": "none"
102+
}
103+
},
104+
"pluginVersion": "12.2.0-17142428006",
105+
"targets": [
106+
{
107+
"datasource": {
108+
"type": "prometheus",
109+
"uid": "ef6i3x5negsu8f"
110+
},
111+
"editorMode": "code",
112+
"expr": "rate(aggregation_records_written_total{table=\"tree_listing\"}[$__rate_interval])",
113+
"legendFormat": "Tree Listing",
114+
"range": true,
115+
"refId": "A"
116+
},
117+
{
118+
"datasource": {
119+
"type": "prometheus",
120+
"uid": "ef6i3x5negsu8f"
121+
},
122+
"editorMode": "code",
123+
"expr": "rate(aggregation_records_written_total{table=\"hardware_status\"}[$__rate_interval])",
124+
"legendFormat": "Hardware Status",
125+
"range": true,
126+
"refId": "B"
127+
},
128+
{
129+
"datasource": {
130+
"type": "prometheus",
131+
"uid": "ef6i3x5negsu8f"
132+
},
133+
"editorMode": "code",
134+
"expr": "rate(aggregation_records_written_total{table=\"processed_items\"}[$__rate_interval])",
135+
"legendFormat": "Processed Items",
136+
"range": true,
137+
"refId": "C"
138+
}
139+
],
140+
"title": "Records Written Rate",
141+
"type": "timeseries"
142+
}
143+
],
144+
"preload": false,
145+
"refresh": "auto",
146+
"schemaVersion": 41,
147+
"tags": [],
148+
"templating": {
149+
"list": []
150+
},
151+
"time": {
152+
"from": "now-1h",
153+
"to": "now"
154+
},
155+
"timepicker": {},
156+
"timezone": "browser",
157+
"title": "Aggregation Process",
158+
"uid": "aggregation-process",
159+
"version": 5
160+
}

monitoring/prometheus.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,8 @@ scrape_configs:
1818
- targets: ['host.docker.internal:8002']
1919
metrics_path: '/metrics/'
2020
scrape_interval: 1s
21+
- job_name: 'kernelci-pending-aggregations-processor'
22+
static_configs:
23+
- targets: ['host.docker.internal:8003']
24+
metrics_path: '/metrics/'
25+
scrape_interval: 10s

0 commit comments

Comments
 (0)