|
| 1 | +-- Because summary.bytesHtmlDoc is reported incorrectly (it is always zero) in the pages table (see: https://github.com/HTTPArchive/wptagent/issues/47), |
| 2 | +-- we need to recalculate it by joining back to the requests table. |
| 3 | +-- To optimize performance, we first create a CTE to pre-filter and extract the necessary data |
| 4 | +-- from the requests table, then join that CTE in the main query. |
| 5 | +-- Once the upstream issue is fixed, we can revert to using summary.bytesHtmlDoc directly and remove the join and CTE. |
| 6 | +WITH HtmlDocRequests AS ( |
| 7 | + SELECT |
| 8 | + page, |
| 9 | + client, |
| 10 | + date, |
| 11 | + -- Extract and cast the response size once inside the CTE |
| 12 | + CAST(JSON_VALUE(summary.respSize) AS INT64) AS respSize_bytes |
| 13 | + FROM |
| 14 | + `httparchive.crawl.requests` |
| 15 | + WHERE |
| 16 | + -- Pre-filter the requests table for efficiency. |
| 17 | + -- This MUST match the date in the main query's WHERE clause. |
| 18 | + date = '2025-07-01' AND |
| 19 | + type = 'html' AND |
| 20 | + is_main_document |
| 21 | +) |
| 22 | + |
| 23 | +-- Main query now joins the CTE |
1 | 24 | SELECT |
2 | 25 | percentile, |
3 | | - client, |
4 | | - is_root_page, |
5 | | - APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesTotal) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS total_kbytes, |
6 | | - APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesHtml) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS html_kbytes, |
7 | | - APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesJS) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS js_kbytes, |
8 | | - APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesCss) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS css_kbytes, |
9 | | - APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesImg) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS img_kbytes, |
10 | | - APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesOther) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS other_kbytes, |
11 | | - APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesHtmlDoc) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS html_doc_kbytes, |
12 | | - APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesFont) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS font_kbytes |
| 26 | + p.client, |
| 27 | + p.is_root_page, |
| 28 | + APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesTotal) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS total_kbytes, |
| 29 | + APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesHtml) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS html_kbytes, |
| 30 | + APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesJS) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS js_kbytes, |
| 31 | + APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesCss) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS css_kbytes, |
| 32 | + APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesImg) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS img_kbytes, |
| 33 | + APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesOther) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS other_kbytes, |
| 34 | + -- Use the pre-calculated column from the CTE |
| 35 | + APPROX_QUANTILES(r.respSize_bytes / 1024, 1000)[OFFSET(percentile * 10)] AS html_doc_kbytes, |
| 36 | + APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesFont) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS font_kbytes |
13 | 37 | FROM |
14 | | - `httparchive.crawl.pages`, |
| 38 | + `httparchive.crawl.pages` p |
| 39 | +LEFT JOIN |
| 40 | + -- Join the new CTE on the matching keys |
| 41 | + HtmlDocRequests r |
| 42 | +ON |
| 43 | + p.page = r.page AND |
| 44 | + p.client = r.client AND |
| 45 | + p.date = r.date, |
15 | 46 | UNNEST([10, 25, 50, 75, 90, 100]) AS percentile |
16 | 47 | WHERE |
17 | | - date = '2025-07-01' |
| 48 | +-- This filter on the main 'pages' table is still required: the CTE's date filter only restricts the requests table |
| 49 | + p.date = '2025-07-01' |
18 | 50 | GROUP BY |
19 | 51 | percentile, |
20 | | - client, |
21 | | - is_root_page |
| 52 | + p.client, |
| 53 | + p.is_root_page |
22 | 54 | ORDER BY |
23 | | - client, |
24 | | - is_root_page, |
| 55 | + p.client, |
| 56 | + p.is_root_page, |
25 | 57 | percentile |
0 commit comments