Skip to content

Commit 43aee09

Browse files
joeviggianoJaiganesh Girinathantunethewebgithub-actions[bot]
authored
CDN 2024 chapter (#3836)
* changes for 2024 CDN chapter * changes for 2024 CDN chapter * migrated all CDN queries * added sql for image formats and mime type distribution for client hints * Added counts for client hints, added early hints counts by CDN * Updated client hints, added early hints cdn vs origin * Updated CDN markdown * Updated markdown and added static images * Fixed stats on TLS. * Optimised images with calibre/image-actions * Contributors * Smart quotes * Signiticant figures * Retake images * SQL linting * More linting * Misc edits * Link --------- Co-authored-by: Jaiganesh Girinathan <ganeshji@amazon.com> Co-authored-by: Barry Pollard <barrypollard@google.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 33d0eab commit 43aee09

37 files changed

Lines changed: 1224 additions & 14 deletions

sql/2024/cdn/README copy.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# 2024 CDN queries
2+
3+
<!--
4+
This directory contains all of the 2024 CDN chapter queries.
5+
6+
Each query should have a corresponding `metric_name.sql` file.
7+
Note that readers are linked to this directory, so try to make the SQL file names descriptive for easy browsing.
8+
9+
Analysts: if helpful, you can use this README to give additional info about the queries.
10+
-->
11+
12+
Query updates:
13+
- Dates have been updated
14+
15+
16+
17+
18+
## Resources
19+
20+
- [📄 Planning doc][~google-doc]
21+
- [📊 Results sheet][~google-sheets]
22+
- [📝 Markdown file][~chapter-markdown]
23+
24+
[~google-doc]: https://docs.google.com/document/d/11Yz8S-e3ltbYQPdzKX1E3oexfA2PwWLdA5tToDv98BI/edit
25+
[~google-sheets]: https://docs.google.com/spreadsheets/d/15YXQQjyoQ0Bnfw9KNSz_YuGDiCfW978_WKEHvDXjdm4/edit#gid=745368492
26+
[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/cdn.md
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#standardSQL
# cdn_usage_by_site_rank.sql : Distribution of HTML pages served by CDN vs Origin by rank
#
# Output: one row per (client, nested_rank, cdn) with the number of main-document
# requests, the total for that (client, nested_rank) and the resulting share.
# Rank buckets are cumulative: a page is counted in every bucket whose limit is
# greater than or equal to its rank.

WITH main_document_requests AS (
  SELECT
    client,
    rank,
    -- _cdn_provider lives in the request's summary JSON. It can hold a
    -- comma-separated list; only the first entry is looked at. A missing or
    -- empty value means no CDN was detected, which is reported as 'ORIGIN'.
    IF(
      COALESCE(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(resp.summary, '$._cdn_provider'), r'^([^,]*).*'), '') = '',
      'ORIGIN',
      'CDN'
    ) AS cdn
  FROM
    `httparchive.all.requests` AS resp
  INNER JOIN
    `httparchive.all.pages`
  USING (page, client, date)
  WHERE
    date = '2024-06-01' AND
    is_main_document -- formerly named firstHtml
)

SELECT
  client,
  nested_rank,
  cdn,
  COUNT(0) AS num_requests,
  SUM(COUNT(0)) OVER (PARTITION BY client, nested_rank) AS total,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client, nested_rank) AS pct_requests
FROM
  main_document_requests
CROSS JOIN
  UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS nested_rank -- Note extra rank since 2022
WHERE
  rank <= nested_rank
GROUP BY
  client,
  cdn,
  nested_rank
ORDER BY
  client,
  nested_rank,
  cdn
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#standardSQL
# Client Hints adoption, CDN vs origin: for each client and each of CDN/ORIGIN,
# the total number of responses and the number of responses that carry an
# `Accept-CH` response header.
#
# Both counts are returned side by side on a single row per (client, cdn).
# A UNION ALL of two SELECTs would label both result sets with the first
# SELECT's column names, making the totals and the client-hint counts
# indistinguishable in the output.
SELECT
  client,
  cdn,
  COUNT(0) AS Total,
  COUNTIF(has_accept_ch) AS ClientHints,
  COUNTIF(has_accept_ch) / COUNT(0) AS pct_client_hints
FROM (
  SELECT
    client,
    -- _cdn_provider can hold a comma-separated list; only the first entry is
    -- looked at. Missing or empty means no CDN was detected ('ORIGIN').
    IF(
      COALESCE(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), '') = '',
      'ORIGIN',
      'CDN'
    ) AS cdn,
    -- EXISTS counts each response once even if the header is repeated.
    -- HTTP field names are case-insensitive, so compare in lower case.
    EXISTS(
      SELECT 1
      FROM UNNEST(response_headers) AS header
      WHERE
        LOWER(header.name) = 'accept-ch' AND
        header.value IS NOT NULL
    ) AS has_accept_ch
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01'
)
GROUP BY
  client,
  cdn
ORDER BY
  client,
  cdn
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#standardSQL
# distribution_of_compression_types_by_cdn.sql : What compression formats are being used (gzip, brotli, etc) for compressed resources served by CDNs
#
# Output: one row per (client, cdn, compression_type) with the number of
# content-encoding headers seen, the total per (client, cdn) and the share.
# Only responses that carry a content-encoding header are considered.

SELECT
  client,
  cdn,
  compression_type,
  COUNT(0) AS num_requests,
  SUM(COUNT(0)) OVER (PARTITION BY client, cdn) AS total_compressed,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client, cdn) AS pct
FROM (
  SELECT
    client,
    -- Sometimes _cdn_provider detection includes multiple entries. We bias for
    -- the DNS detected entry which is the first entry. Missing or empty means
    -- no CDN was detected ('ORIGIN').
    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn,
    -- Content-coding values are case-insensitive in HTTP; normalise case and
    -- surrounding whitespace so e.g. 'GZIP' is not reported as 'other'.
    CASE LOWER(TRIM(header.value))
      WHEN 'gzip' THEN 'Gzip'
      WHEN 'br' THEN 'Brotli'
      WHEN '' THEN 'no text compression'
      ELSE 'other'
    END AS compression_type
  FROM
    `httparchive.all.requests`
  CROSS JOIN
    UNNEST(response_headers) AS header
  WHERE
    date = '2024-06-01' AND
    -- HTTP field names are case-insensitive, so compare in lower case.
    LOWER(header.name) = 'content-encoding'
)
GROUP BY
  client,
  cdn,
  compression_type
ORDER BY
  client,
  cdn,
  compression_type
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#standardSQL
# distribution_of_compression_types_cdn_vs_origin.sql : What compression formats are being used (gzip, brotli, etc) for compressed resources, CDN vs origin
#
# Output: one row per (client, cdn, compression_type), where cdn is either
# 'CDN' or 'ORIGIN', with the number of content-encoding headers seen, the
# total per (client, cdn) and the share.
# Only responses that carry a content-encoding header are considered.

SELECT
  client,
  cdn,
  compression_type,
  COUNT(0) AS num_requests,
  SUM(COUNT(0)) OVER (PARTITION BY client, cdn) AS total_compressed,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client, cdn) AS pct
FROM (
  SELECT
    client,
    -- _cdn_provider can hold a comma-separated list; only the first entry is
    -- looked at. Missing or empty means no CDN was detected ('ORIGIN'); any
    -- other value is collapsed into 'CDN'.
    IF(IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') = 'ORIGIN', 'ORIGIN', 'CDN') AS cdn,
    -- Content-coding values are case-insensitive in HTTP; normalise case and
    -- surrounding whitespace so e.g. 'GZIP' is not reported as 'other'.
    CASE LOWER(TRIM(header.value))
      WHEN 'gzip' THEN 'Gzip'
      WHEN 'br' THEN 'Brotli'
      WHEN '' THEN 'no text compression'
      ELSE 'other'
    END AS compression_type
  FROM
    `httparchive.all.requests`
  CROSS JOIN
    UNNEST(response_headers) AS header
  WHERE
    date = '2024-06-01' AND
    -- HTTP field names are case-insensitive, so compare in lower case.
    LOWER(header.name) = 'content-encoding'
)
GROUP BY
  client,
  cdn,
  compression_type
ORDER BY
  client,
  cdn,
  compression_type
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#standardSQL
# distribution_of_http_versions: Percentage of HTTPS responses by protocol
#
# Output: one row per (client, cdn, is_main_document) with the count and share
# of responses per HTTP version, the count and share of responses considered
# to be served over TLS, and the total number of responses.

WITH responses AS (
  -- One row per response that is not a redirect.
  SELECT
    client,
    page,
    is_main_document,
    CAST(JSON_EXTRACT(payload, '$._socket') AS INT64) AS socket,
    -- WPT is inconsistent with protocol population, so fall back through the
    -- available fields. 'unknown' and a bare 'HTTP/' are treated as missing.
    UPPER(COALESCE(
      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
      NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
      NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
    )) AS protocol,
    -- WPT joins CDN detection but we bias to the DNS detection which is the first entry
    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn,
    -- isSecure reports what the browser thought it was going to use, but it can
    -- get upgraded with STS OR UpgradeInsecure: 1.
    -- The IF(...) turns a NULL condition into FALSE.
    IF(STARTS_WITH(url, 'https') OR JSON_EXTRACT_SCALAR(payload, '$._tls_version') IS NOT NULL OR CAST(JSON_EXTRACT(payload, '$._is_secure') AS INT64) = 1, TRUE, FALSE) AS is_secure
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01' AND
    -- WPT changes the response fields based on a redirect (url becomes the
    -- Location path instead of the original) causing inconsistencies in the
    -- counts, so redirects are ignored: keep a response only when it has no
    -- non-empty Location header. Unnesting the headers and filtering on
    -- name = 'location' would instead keep ONLY responses that carry an
    -- (empty) Location header.
    NOT EXISTS (
      SELECT 1
      FROM UNNEST(response_headers) AS header
      WHERE
        LOWER(header.name) = 'location' AND
        header.value != ''
    )
),

socket_protocols AS (
  -- One protocol per (client, page, socket), taken from any TLS request on
  -- that socket that did report a protocol. Used as a fallback for responses
  -- on the same socket whose own protocol is missing.
  SELECT
    client,
    page,
    CAST(JSON_EXTRACT(payload, '$._socket') AS INT64) AS socket,
    ANY_VALUE(UPPER(COALESCE(
      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
      NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
      NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
    ))) AS protocol
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01' AND
    JSON_EXTRACT_SCALAR(payload, '$._tls_version') IS NOT NULL AND
    COALESCE(
      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
      NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
      NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
    ) IS NOT NULL AND
    JSON_EXTRACT(payload, '$._socket') IS NOT NULL
  GROUP BY
    client,
    page,
    socket
),

resolved AS (
  SELECT
    r.client,
    r.cdn,
    r.is_main_document,
    r.is_secure,
    IFNULL(r.protocol, s.protocol) AS protocol
  FROM
    responses AS r
  LEFT JOIN
    socket_protocols AS s
  ON
    r.client = s.client AND
    r.page = s.page AND
    r.socket = s.socket
),

classified AS (
  SELECT
    client,
    cdn,
    is_main_document,
    is_secure,
    -- HTTP/3 is reported either as 'HTTP/3' or by its ALPN token: 'H3' or a
    -- draft token such as 'H3-29' / 'H3-Q050' (values are upper-cased above).
    -- A response with no protocol at all stays NULL and is only counted in
    -- the total, not in any version bucket.
    CASE
      WHEN protocol IN ('HTTP/0.9', 'HTTP/1.0', 'HTTP/1.1', 'HTTP/2') THEN protocol
      WHEN protocol = 'HTTP/3' OR STARTS_WITH(protocol, 'H3') THEN 'HTTP/3'
      WHEN protocol IS NOT NULL THEN 'other'
    END AS http_version
  FROM
    resolved
)

SELECT
  client,
  cdn,
  is_main_document,
  COUNTIF(http_version = 'HTTP/0.9') AS http09,
  COUNTIF(http_version = 'HTTP/1.0') AS http10,
  COUNTIF(http_version = 'HTTP/1.1') AS http11,
  COUNTIF(http_version = 'HTTP/2') AS http2,
  COUNTIF(http_version = 'HTTP/3') AS http3,
  COUNTIF(http_version = 'other') AS http_other,
  COUNTIF(is_secure OR http_version = 'HTTP/2') AS tls_total,
  COUNTIF(http_version = 'HTTP/0.9') / COUNT(0) AS http09_pct,
  COUNTIF(http_version = 'HTTP/1.0') / COUNT(0) AS http10_pct,
  COUNTIF(http_version = 'HTTP/1.1') / COUNT(0) AS http11_pct,
  COUNTIF(http_version = 'HTTP/2') / COUNT(0) AS http2_pct,
  COUNTIF(http_version = 'HTTP/3') / COUNT(0) AS http3_pct,
  COUNTIF(http_version = 'other') / COUNT(0) AS http_other_pct,
  COUNTIF(is_secure OR http_version = 'HTTP/2') / COUNT(0) AS tls_pct,
  COUNT(0) AS total
FROM
  classified
GROUP BY
  client,
  cdn,
  is_main_document
ORDER BY
  client DESC,
  total DESC
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#standardSQL
# distribution_of_http_versions_cdn_vs_origin.sql 17_19: Percentage of HTTPS responses by protocol
#
# Output: one row per (client, cdn, is_main_document), where cdn is either
# 'CDN' or 'Origin', with the count and share of responses served over
# HTTP/1.1, over HTTP/2 or newer, and over any other reported protocol.

WITH responses AS (
  -- One row per response that is not a redirect.
  SELECT
    client,
    page,
    is_main_document,
    CAST(JSON_EXTRACT(payload, '$._socket') AS INT64) AS socket,
    -- WPT is inconsistent with protocol population, so fall back through the
    -- available fields. 'unknown' and a bare 'HTTP/' are treated as missing.
    UPPER(COALESCE(
      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
      NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
      NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
    )) AS protocol,
    -- WPT joins CDN detection but we bias to the DNS detection which is the first entry
    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'Origin') AS cdn_provider
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01' AND
    -- WPT changes the response fields based on a redirect (url becomes the
    -- Location path instead of the original) causing inconsistencies in the
    -- counts, so redirects are ignored: keep a response only when it has no
    -- non-empty Location header. Unnesting the headers and filtering on
    -- name = 'location' would instead keep ONLY responses that carry an
    -- (empty) Location header.
    NOT EXISTS (
      SELECT 1
      FROM UNNEST(response_headers) AS header
      WHERE
        LOWER(header.name) = 'location' AND
        header.value != ''
    )
),

socket_protocols AS (
  -- One protocol per (client, page, socket), taken from any TLS request on
  -- that socket that did report a protocol. Used as a fallback for responses
  -- on the same socket whose own protocol is missing.
  SELECT
    client,
    page,
    CAST(JSON_EXTRACT(payload, '$._socket') AS INT64) AS socket,
    ANY_VALUE(UPPER(COALESCE(
      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
      NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
      NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
    ))) AS protocol
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01' AND
    JSON_EXTRACT_SCALAR(payload, '$._tls_version') IS NOT NULL AND
    COALESCE(
      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
      NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
      NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
    ) IS NOT NULL AND
    JSON_EXTRACT(payload, '$._socket') IS NOT NULL
  GROUP BY
    client,
    page,
    socket
),

resolved AS (
  SELECT
    r.client,
    r.cdn_provider,
    r.is_main_document,
    IFNULL(r.protocol, s.protocol) AS protocol
  FROM
    responses AS r
  LEFT JOIN
    socket_protocols AS s
  ON
    r.client = s.client AND
    r.page = s.page AND
    r.socket = s.socket
),

classified AS (
  SELECT
    client,
    -- Collapse every detected provider into 'CDN'. Doing this before the
    -- final GROUP BY means the grouping key `cdn` cannot be confused with
    -- the per-provider column.
    IF(cdn_provider = 'Origin', 'Origin', 'CDN') AS cdn,
    is_main_document,
    -- HTTP/3 is reported either as 'HTTP/3' or by its ALPN token: 'H3' or a
    -- draft token such as 'H3-29' / 'H3-Q050' (values are upper-cased above).
    -- A response with no protocol at all stays NULL and is only counted in
    -- the total, not in any version bucket.
    CASE
      WHEN protocol = 'HTTP/1.1' THEN 'HTTP/1.1'
      WHEN protocol IN ('HTTP/2', 'HTTP/3') OR STARTS_WITH(protocol, 'H3') THEN 'HTTP/2+'
      WHEN protocol IS NOT NULL THEN 'other'
    END AS http_version
  FROM
    resolved
)

SELECT
  client,
  cdn,
  is_main_document,
  COUNTIF(http_version = 'HTTP/1.1') AS http11,
  COUNTIF(http_version = 'HTTP/2+') AS http2plus,
  COUNTIF(http_version = 'other') AS http_other,
  COUNT(0) AS total,
  COUNTIF(http_version = 'HTTP/1.1') / COUNT(0) AS http11_pct,
  COUNTIF(http_version = 'HTTP/2+') / COUNT(0) AS http2plus_pct,
  COUNTIF(http_version = 'other') / COUNT(0) AS http_other_pct
FROM
  classified
GROUP BY
  client,
  cdn,
  is_main_document
ORDER BY
  client DESC,
  total DESC
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#standardSQL
# distribution_of_tls_time_by_cdn.sql : Distribution of TLS negotiation time by CDN
#
# Output: one row per (client, cdn, is_main_document) with the number of
# requests that have a TLS timing and the approximate 10th/25th/50th/75th/90th
# percentiles of that timing (`timings.ssl` from the request payload).

WITH deduped_requests AS (
  -- The GROUP BY collapses rows that are identical on every listed key.
  SELECT
    client,
    JSON_EXTRACT_SCALAR(summary, '$.requestid') AS requestid,
    page,
    url,
    is_main_document,
    -- Sometimes _cdn_provider detection includes multiple entries. We bias for
    -- the DNS detected entry which is the first entry. Missing or empty means
    -- no CDN was detected ('ORIGIN').
    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn,
    CAST(JSON_EXTRACT(payload, '$.timings.ssl') AS INT64) AS tlstime,
    -- NOTE(review): SPLIT with an empty delimiter appears to yield one element
    -- per character, so this is the length of the raw sanList JSON text rather
    -- than the number of SAN entries. Downstream it is only tested for
    -- NULL-ness (i.e. "certificate details are present") - confirm intent.
    ARRAY_LENGTH(SPLIT(JSON_EXTRACT(payload, '$._securityDetails.sanList'), '')) AS sanLength
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client,
    requestid,
    page,
    url,
    is_main_document,
    cdn,
    tlstime,
    sanLength
)

SELECT
  client,
  cdn,
  is_main_document,
  COUNT(0) AS requests,
  APPROX_QUANTILES(tlstime, 1000)[OFFSET(100)] AS p10,
  APPROX_QUANTILES(tlstime, 1000)[OFFSET(250)] AS p25,
  APPROX_QUANTILES(tlstime, 1000)[OFFSET(500)] AS p50,
  APPROX_QUANTILES(tlstime, 1000)[OFFSET(750)] AS p75,
  APPROX_QUANTILES(tlstime, 1000)[OFFSET(900)] AS p90
FROM
  deduped_requests
WHERE
  -- Rows with a TLS timing of -1 (presumably "no TLS negotiation on this
  -- request" - confirm) or without sanList details are left out.
  tlstime != -1 AND
  sanLength IS NOT NULL
GROUP BY
  client,
  cdn,
  is_main_document
ORDER BY
  requests DESC
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#standardSQL
# distribution_of_tls_time_cdn_vs_origin.sql : Distribution of TLS negotiation for CDN vs Origin (ie, no CDN)
#
# Output: one row per (client, cdn, is_main_document), where cdn is either
# 'CDN' or 'ORIGIN', with the number of requests that have a TLS timing and the
# approximate 10th/25th/50th/75th/90th percentiles of that timing
# (`timings.ssl` from the request payload).

WITH deduped_requests AS (
  -- The GROUP BY collapses rows that are identical on every listed key.
  SELECT
    client,
    JSON_EXTRACT_SCALAR(summary, '$.requestid') AS requestid,
    page,
    url,
    is_main_document,
    -- Sometimes _cdn_provider detection includes multiple entries. We bias for
    -- the DNS detected entry which is the first entry. Missing or empty means
    -- no CDN was detected ('ORIGIN').
    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn_provider,
    CAST(JSON_EXTRACT(payload, '$.timings.ssl') AS INT64) AS tlstime,
    -- NOTE(review): SPLIT with an empty delimiter appears to yield one element
    -- per character, so this is the length of the raw sanList JSON text rather
    -- than the number of SAN entries. Downstream it is only tested for
    -- NULL-ness (i.e. "certificate details are present") - confirm intent.
    ARRAY_LENGTH(SPLIT(JSON_EXTRACT(payload, '$._securityDetails.sanList'), '')) AS sanLength
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client,
    requestid,
    page,
    url,
    is_main_document,
    cdn_provider,
    tlstime,
    sanLength
),

tls_requests AS (
  SELECT
    client,
    -- Collapse every detected provider into 'CDN'. Doing this before the
    -- final GROUP BY means the grouping key `cdn` cannot be confused with
    -- the per-provider column.
    IF(cdn_provider = 'ORIGIN', 'ORIGIN', 'CDN') AS cdn,
    is_main_document,
    tlstime
  FROM
    deduped_requests
  WHERE
    -- Rows with a TLS timing of -1 (presumably "no TLS negotiation on this
    -- request" - confirm) or without sanList details are left out.
    tlstime != -1 AND
    sanLength IS NOT NULL
)

SELECT
  client,
  cdn,
  is_main_document,
  COUNT(0) AS requests,
  APPROX_QUANTILES(tlstime, 1000)[OFFSET(100)] AS p10,
  APPROX_QUANTILES(tlstime, 1000)[OFFSET(250)] AS p25,
  APPROX_QUANTILES(tlstime, 1000)[OFFSET(500)] AS p50,
  APPROX_QUANTILES(tlstime, 1000)[OFFSET(750)] AS p75,
  APPROX_QUANTILES(tlstime, 1000)[OFFSET(900)] AS p90
FROM
  tls_requests
GROUP BY
  client,
  cdn,
  is_main_document
ORDER BY
  requests DESC

0 commit comments

Comments
 (0)