Skip to content

Commit 9dc6c1a

Browse files
committed
Added new queries for analysis
1 parent 2e428ad commit 9dc6c1a

9 files changed

+946
-0
lines changed
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
#standardSQL
# Alt-Svc Protocol Analysis - Deep dive into Alternative Service headers
# Analyzes what protocols are advertised vs actually used
#
# Output: one row per (client, cdn, is_main_document, current_protocol) group with
# at least 100 requests whose response carried an Alt-Svc header.

WITH alt_svc_analysis AS (
  SELECT
    client,
    page,
    url,
    is_main_document,

    -- CDN detection: first entry of the comma-separated provider list,
    -- falling back to 'ORIGIN' when no provider is reported
    COALESCE(
      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
      'ORIGIN'
    ) AS cdn,

    -- Current protocol used: first available of _protocol, _tls_next_proto
    -- (ignoring the 'unknown' placeholder) and the HAR httpVersion
    UPPER(COALESCE(
      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
      NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
      NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
    )) AS current_protocol,

    -- Extract Alt-Svc header value (an arbitrary one if the header is repeated)
    (
      SELECT h.value
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'alt-svc'
      LIMIT 1
    ) AS alt_svc_header
  FROM `httparchive.crawl.requests`
  WHERE
    date = '2025-07-01' AND
    -- Only responses that carry an Alt-Svc header are analysed
    EXISTS(
      SELECT 1
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'alt-svc'
    )
),

protocol_extraction AS (
  SELECT
    client,
    cdn,
    is_main_document,
    current_protocol,
    alt_svc_header,

    -- NULL when the protocol is unknown, so such rows are counted as neither
    -- "using h3" nor "advertised but not used"
    (current_protocol LIKE '%H3%' OR current_protocol = 'HTTP/3') AS uses_h3,

    -- Advertised protocol ids. Each pattern is anchored to the start of an
    -- alt-value (beginning of the header or right after a comma) so that text
    -- inside a quoted alt-authority such as h2="h3.example.com:443" is not
    -- mistaken for a protocol id.
    -- h3 in any flavour (final "h3" as well as "h3-29", "h3-q050", ...)
    REGEXP_CONTAINS(LOWER(alt_svc_header), r'(?:^|,)\s*h3(?:-[^=,;\s]+)?\s*=') AS advertises_h3,
    -- Numbered IETF drafts only ("h3-27", "h3-29", ...)
    REGEXP_CONTAINS(LOWER(alt_svc_header), r'(?:^|,)\s*h3-\d+\s*=') AS advertises_h3_draft,
    -- "h2" and its cleartext variant "h2c"
    REGEXP_CONTAINS(LOWER(alt_svc_header), r'(?:^|,)\s*h2c?\s*=') AS advertises_h2,
    -- RFC 7838 requires "/" in a protocol id to be percent-encoded, so accept
    -- both the compliant "http%2F1.1" and the literal "http/1.1"
    REGEXP_CONTAINS(LOWER(alt_svc_header), r'(?:^|,)\s*http(?:/|%2f)1\.1\s*=') AS advertises_h1,

    -- Extract max-age values: first "ma" parameter, matched case-insensitively,
    -- optionally quoted; NULL when absent or not an integer
    SAFE_CAST(
      REGEXP_EXTRACT(LOWER(alt_svc_header), r'(?:^|[;,\s])ma="?(\d+)') AS INT64
    ) AS max_age_seconds,

    -- Check for clear directive: "clear" is only meaningful as the entire
    -- header value, so compare the whole (trimmed) value rather than
    -- searching for the substring
    TRIM(LOWER(alt_svc_header)) = 'clear' AS has_clear_directive
  FROM alt_svc_analysis
)

SELECT
  client,
  cdn,
  is_main_document,
  current_protocol,
  COUNT(*) AS total_requests,

  -- Protocol advertising analysis
  COUNTIF(advertises_h3) AS advertises_h3_count,
  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h3), COUNT(*)) * 100, 2) AS advertises_h3_pct,

  COUNTIF(advertises_h3_draft) AS advertises_h3_draft_count,
  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h3_draft), COUNT(*)) * 100, 2) AS advertises_h3_draft_pct,

  COUNTIF(advertises_h2) AS advertises_h2_count,
  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h2), COUNT(*)) * 100, 2) AS advertises_h2_pct,

  COUNTIF(advertises_h1) AS advertises_h1_count,
  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h1), COUNT(*)) * 100, 2) AS advertises_h1_pct,

  -- Protocol mismatch analysis
  COUNTIF(advertises_h3 AND NOT uses_h3) AS h3_advertised_not_used,
  COUNTIF(uses_h3) AS currently_using_h3,

  -- Max-age statistics (NULL max-age values are ignored by APPROX_QUANTILES)
  APPROX_QUANTILES(max_age_seconds, 100)[OFFSET(50)] AS median_max_age_seconds,
  APPROX_QUANTILES(max_age_seconds, 100)[OFFSET(90)] AS p90_max_age_seconds,

  -- Clear directive usage
  COUNTIF(has_clear_directive) AS clear_directive_count,
  ROUND(SAFE_DIVIDE(COUNTIF(has_clear_directive), COUNT(*)) * 100, 2) AS clear_directive_pct,

  -- Sample Alt-Svc headers for analysis
  ARRAY_AGG(alt_svc_header IGNORE NULLS LIMIT 5) AS sample_alt_svc_headers
FROM protocol_extraction
GROUP BY
  client,
  cdn,
  is_main_document,
  current_protocol
HAVING
  total_requests >= 100 -- Drop small groups whose percentages would be noisy
ORDER BY
  client DESC,
  total_requests DESC,
  advertises_h3_pct DESC
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
#standardSQL
# 20_brotli_compression_adoption.sql: Brotli and modern compression adoption by CDN
# Analyzes compression algorithm usage patterns and efficiency across CDN providers
#
# Rationale: Brotli compression offers 20-30% better compression than gzip for text resources.
# As it becomes more widely supported, we want to track which CDNs are leading adoption
# and how much bandwidth savings are being achieved. This is critical for performance
# and sustainability metrics.
#
# Output: one row per (client, cdn, content_type) group with at least 100 requests.

WITH compression_analysis AS (
  SELECT
    client,

    -- CDN detection: first entry of the comma-separated provider list,
    -- falling back to 'ORIGIN' when no provider is reported
    COALESCE(
      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
      'ORIGIN'
    ) AS cdn,

    -- Page and resource info
    page,
    url,
    is_main_document,

    -- Content type classification by URL extension; the first matching branch
    -- wins, and main documents without a more specific extension count as HTML
    CASE
      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(js|mjs)($|\?)') THEN 'JavaScript'
      WHEN REGEXP_CONTAINS(LOWER(url), r'\.css($|\?)') THEN 'CSS'
      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(html|htm)($|\?)') OR is_main_document THEN 'HTML'
      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(json)($|\?)') THEN 'JSON'
      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(svg)($|\?)') THEN 'SVG'
      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(woff2?|ttf|otf|eot)($|\?)') THEN 'Fonts'
      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(jpg|jpeg|png|gif|webp|avif)($|\?)') THEN 'Images'
      ELSE 'Other'
    END AS content_type,

    -- Compression detection from Content-Encoding header, normalised
    -- (lower-cased, trimmed) so the equality checks below are reliable
    (
      SELECT TRIM(LOWER(h.value))
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'content-encoding'
      LIMIT 1
    ) AS content_encoding,

    -- Vary header check (indicates dynamic compression support)
    EXISTS(
      SELECT 1
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'vary' AND LOWER(h.value) LIKE '%accept-encoding%'
    ) AS supports_dynamic_compression,

    -- Response size metrics.
    -- HAR response.bodySize is the size of the body as received (still encoded)
    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.bodySize') AS INT64) AS response_body_size,
    -- HAR response.content.size is the length of the decoded content; it only
    -- differs from bodySize when the response was compressed
    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.content.size') AS INT64) AS uncompressed_size,

    -- Transfer size (actual bytes transferred)
    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response._transferSize') AS INT64) AS transfer_size
  FROM `httparchive.crawl.requests`
  WHERE
    date = '2025-07-01' AND
    -- Focus on compressible content types. Main documents are kept even when
    -- their URL has no file extension (the common case); otherwise the
    -- is_main_document branch of the HTML classification above would rarely apply.
    (
      is_main_document OR
      REGEXP_CONTAINS(LOWER(url), r'\.(js|mjs|css|html|htm|json|svg|xml|txt)($|\?)')
    )
)

SELECT
  client,
  cdn,
  content_type,

  -- Volume metrics
  COUNT(DISTINCT page) AS total_pages,
  COUNT(0) AS total_requests,

  -- Compression type distribution
  COUNTIF(content_encoding = 'br') AS brotli_requests,
  COUNTIF(content_encoding = 'gzip') AS gzip_requests,
  COUNTIF(content_encoding = 'deflate') AS deflate_requests,
  COUNTIF(content_encoding IS NULL OR content_encoding = '') AS uncompressed_requests,
  COUNTIF(content_encoding NOT IN ('br', 'gzip', 'deflate', '') AND content_encoding IS NOT NULL) AS other_compression,

  -- Compression percentages
  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding = 'br'), COUNT(0)) * 100, 2) AS brotli_pct,
  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding = 'gzip'), COUNT(0)) * 100, 2) AS gzip_pct,
  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding = 'deflate'), COUNT(0)) * 100, 2) AS deflate_pct,
  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding IS NULL OR content_encoding = ''), COUNT(0)) * 100, 2) AS uncompressed_pct,

  -- Dynamic compression support
  COUNTIF(supports_dynamic_compression) AS dynamic_compression_count,
  ROUND(SAFE_DIVIDE(COUNTIF(supports_dynamic_compression), COUNT(0)) * 100, 2) AS dynamic_compression_pct,

  -- Size metrics (in KB); AVG skips the NULLs produced for non-matching rows
  ROUND(AVG(response_body_size) / 1024, 2) AS avg_response_size_kb,
  ROUND(AVG(CASE WHEN content_encoding = 'br' THEN response_body_size END) / 1024, 2) AS avg_brotli_size_kb,
  ROUND(AVG(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) / 1024, 2) AS avg_gzip_size_kb,
  ROUND(AVG(CASE WHEN content_encoding IS NULL OR content_encoding = '' THEN response_body_size END) / 1024, 2) AS avg_uncompressed_size_kb,

  -- Compression efficiency comparison: relative difference between the average
  -- gzip and average brotli body sizes. NOTE(review): the two averages are taken
  -- over different sets of resources, so this is indicative rather than a
  -- like-for-like saving.
  ROUND(
    SAFE_DIVIDE(
      AVG(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) -
      AVG(CASE WHEN content_encoding = 'br' THEN response_body_size END),
      AVG(CASE WHEN content_encoding = 'gzip' THEN response_body_size END)
    ) * 100, 2
  ) AS brotli_vs_gzip_savings_pct,

  -- Total data transfer metrics (GB, based on response body size)
  ROUND(SUM(response_body_size) / (1024 * 1024 * 1024), 2) AS total_gb_transferred,
  ROUND(SUM(CASE WHEN content_encoding = 'br' THEN response_body_size END) / (1024 * 1024 * 1024), 2) AS total_gb_brotli,
  ROUND(SUM(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) / (1024 * 1024 * 1024), 2) AS total_gb_gzip
FROM compression_analysis
GROUP BY
  client,
  cdn,
  content_type
HAVING
  total_requests >= 100 -- Minimum threshold for statistical relevance
ORDER BY
  client DESC,
  brotli_pct DESC,
  total_requests DESC
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
#standardSQL
# Cache-Control and CDN caching behavior analysis
# Analyzes cache directives and actual caching behavior
#
# Output: one row per (client, cdn, is_main_document) group with at least 1000
# successful (HTTP 200) responses.

WITH cache_analysis AS (
  SELECT
    client,
    is_main_document,

    -- CDN detection: first entry of the comma-separated provider list,
    -- falling back to 'ORIGIN' when no provider is reported
    COALESCE(
      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
      'ORIGIN'
    ) AS cdn,

    -- Header values are lower-cased once here so that every directive check and
    -- every value extraction below is case-insensitive in the same way
    -- (directive names such as "Max-Age" are not case-sensitive).
    (
      SELECT LOWER(h.value)
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'cache-control'
      LIMIT 1
    ) AS cache_control,

    -- Validation headers (only their presence is used)
    (
      SELECT h.value
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'etag'
      LIMIT 1
    ) AS etag,
    (
      SELECT h.value
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'last-modified'
      LIMIT 1
    ) AS last_modified,

    -- CDN-specific headers
    (
      SELECT LOWER(h.value)
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'x-cache'
      LIMIT 1
    ) AS x_cache,
    (
      SELECT TRIM(LOWER(h.value))
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'cf-cache-status'
      LIMIT 1
    ) AS cf_cache_status
  FROM `httparchive.crawl.requests`
  WHERE
    date = '2025-07-01' AND
    -- Focus on successful responses
    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.status') AS INT64) = 200
)

SELECT
  client,
  cdn,
  is_main_document,
  COUNT(0) AS total_requests,

  -- Cache-Control directives analysis
  COUNTIF(cache_control IS NOT NULL) AS has_cache_control,
  ROUND(SAFE_DIVIDE(COUNTIF(cache_control IS NOT NULL), COUNT(0)) * 100, 2) AS cache_control_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r'public')) AS is_public,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'public')), COUNT(0)) * 100, 2) AS public_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r'private')) AS is_private,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'private')), COUNT(0)) * 100, 2) AS private_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r'no-cache')) AS is_no_cache,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'no-cache')), COUNT(0)) * 100, 2) AS no_cache_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r'no-store')) AS is_no_store,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'no-store')), COUNT(0)) * 100, 2) AS no_store_pct,

  -- The value must be exactly zero: require a non-digit (or end of header) after
  -- the zeros so that e.g. "max-age=0600" is not reported as max-age=0
  COUNTIF(REGEXP_CONTAINS(cache_control, r'max-age=0+(?:\D|$)')) AS is_max_age_zero,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'max-age=0+(?:\D|$)')), COUNT(0)) * 100, 2) AS max_age_zero_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r'immutable')) AS is_immutable,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'immutable')), COUNT(0)) * 100, 2) AS immutable_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r's-maxage')) AS has_s_maxage,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r's-maxage')), COUNT(0)) * 100, 2) AS s_maxage_pct,

  -- Modern cache directives
  COUNTIF(REGEXP_CONTAINS(cache_control, r'stale-while-revalidate')) AS has_stale_while_revalidate,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'stale-while-revalidate')), COUNT(0)) * 100, 2) AS stale_while_revalidate_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r'stale-if-error')) AS has_stale_if_error,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'stale-if-error')), COUNT(0)) * 100, 2) AS stale_if_error_pct,

  -- Max-age value analysis (responses without the directive yield NULL and are
  -- ignored by APPROX_QUANTILES)
  APPROX_QUANTILES(
    SAFE_CAST(REGEXP_EXTRACT(cache_control, r'max-age=(\d+)') AS INT64), 100
  )[OFFSET(50)] AS median_max_age_seconds,

  APPROX_QUANTILES(
    SAFE_CAST(REGEXP_EXTRACT(cache_control, r's-maxage=(\d+)') AS INT64), 100
  )[OFFSET(50)] AS median_s_maxage_seconds,

  -- Cache hit/miss analysis from CDN headers; the hit rate is relative to
  -- responses whose X-Cache value mentions either "hit" or "miss"
  COUNTIF(REGEXP_CONTAINS(x_cache, r'hit')) AS cache_hits,
  COUNTIF(REGEXP_CONTAINS(x_cache, r'miss')) AS cache_misses,
  ROUND(SAFE_DIVIDE(
    COUNTIF(REGEXP_CONTAINS(x_cache, r'hit')),
    COUNTIF(REGEXP_CONTAINS(x_cache, r'hit|miss'))
  ) * 100, 2) AS cache_hit_rate_pct,

  -- Cloudflare-specific cache status (substring match)
  COUNTIF(REGEXP_CONTAINS(cf_cache_status, r'hit')) AS cf_cache_hits,
  COUNTIF(REGEXP_CONTAINS(cf_cache_status, r'miss')) AS cf_cache_misses,
  COUNTIF(REGEXP_CONTAINS(cf_cache_status, r'dynamic')) AS cf_cache_dynamic,

  -- Validation headers
  COUNTIF(etag IS NOT NULL) AS has_etag,
  ROUND(SAFE_DIVIDE(COUNTIF(etag IS NOT NULL), COUNT(0)) * 100, 2) AS etag_pct,

  COUNTIF(last_modified IS NOT NULL) AS has_last_modified,
  ROUND(SAFE_DIVIDE(COUNTIF(last_modified IS NOT NULL), COUNT(0)) * 100, 2) AS last_modified_pct,

  -- CDN cache hit indicators; this hit rate is relative to all responses that
  -- carry an X-Cache header at all
  COUNTIF(x_cache IS NOT NULL) AS has_x_cache,
  COUNTIF(REGEXP_CONTAINS(x_cache, r'hit')) AS cache_hit_count,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(x_cache, r'hit')), COUNTIF(x_cache IS NOT NULL)) * 100, 2) AS cache_hit_rate,

  -- Cloudflare specific: exact status match, compared on the lower-cased value
  -- so "HIT", "Hit" and "hit" are all counted
  COUNTIF(cf_cache_status = 'hit') AS cf_hits,
  COUNTIF(cf_cache_status = 'miss') AS cf_misses,
  ROUND(SAFE_DIVIDE(COUNTIF(cf_cache_status = 'hit'), COUNTIF(cf_cache_status IS NOT NULL)) * 100, 2) AS cf_hit_rate
FROM cache_analysis
GROUP BY
  client,
  cdn,
  is_main_document
HAVING
  total_requests >= 1000 -- Drop small groups whose percentages would be noisy
ORDER BY
  client DESC,
  total_requests DESC

0 commit comments

Comments
 (0)