Skip to content

Commit 8c708ca

Browse files
Sustainability 2025 queries (#4171)
* Migrate 2024 sustainability queries to 2025 crawl dataset
* Fix linter errors
* fix linter issues
* fix linter errors for green_third_party
* update indentation from 4 to 2
* fix linter issues
* fix CV04
* fix linter
* fix LT02
* fix CV04
* fix linter issues
* Remove limit
* update styles and fix cms_bytes_per_type
* Fix sql queries
* Fix not working SQL queries
* fix some queries
* fix PR reviews and linter issues with SQLFluff auto-fix
* fix linter issues
* Linting

---------

Co-authored-by: Barry Pollard <barrypollard@google.com>
1 parent 077eba8 commit 8c708ca

23 files changed

+1886
-0
lines changed
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
#standardSQL
# The distribution of cache header adoption on websites by client.
#
# Steps:
#   1. request_flags - one row per (client, url) with a boolean flag for each
#      caching header seen on any response header of that request.
#   2. client_totals - per-client counts of requests matching each flag
#      combination.
#   3. final SELECT  - the counts plus each count as a fraction of all
#      requests for that client.
#
# Notes:
#   * HTTP header names are case-insensitive, so names are lower-cased before
#     being compared.
#   * The UNNEST join yields no rows for a request whose response_headers
#     array is empty, so such requests are not part of total_requests.
#   * The "neither" count column is named ..._cc_and_expires while its pct
#     counterpart is named ..._cc_nor_expires; both names are kept as-is so
#     downstream consumers of the result are not broken.

WITH request_flags AS (
  SELECT
    client,
    url,
    -- A header only counts as "used" when it carries a non-blank value.
    LOGICAL_OR(
      LOWER(header.name) = 'expires' AND
      header.value IS NOT NULL AND
      TRIM(header.value) != ''
    ) AS uses_expires,
    LOGICAL_OR(
      LOWER(header.name) = 'cache-control' AND
      header.value IS NOT NULL AND
      TRIM(header.value) != ''
    ) AS uses_cache_control,
    -- max-age must be followed by a numeric value to count.
    LOGICAL_OR(
      LOWER(header.name) = 'cache-control' AND
      REGEXP_CONTAINS(header.value, r'(?i)max-age\s*=\s*[0-9]+')
    ) AS uses_max_age
  FROM
    `httparchive.crawl.requests`
  CROSS JOIN
    UNNEST(response_headers) AS header
  WHERE
    date = '2025-07-01'
  GROUP BY
    client,
    url
),

client_totals AS (
  SELECT
    client,
    COUNT(0) AS total_requests,

    COUNTIF(uses_cache_control) AS total_using_cache_control,
    COUNTIF(uses_max_age) AS total_using_max_age,
    COUNTIF(uses_expires) AS total_using_expires,
    COUNTIF(uses_max_age AND uses_expires) AS total_using_max_age_and_expires,
    COUNTIF(
      uses_cache_control AND uses_expires
    ) AS total_using_both_cc_and_expires,
    COUNTIF(
      NOT uses_cache_control AND NOT uses_expires
    ) AS total_using_neither_cc_and_expires,
    COUNTIF(
      uses_cache_control AND NOT uses_expires
    ) AS total_using_only_cache_control,
    COUNTIF(
      NOT uses_cache_control AND uses_expires
    ) AS total_using_only_expires
  FROM
    request_flags
  GROUP BY
    client
)

SELECT
  client,
  total_requests,

  total_using_cache_control,
  total_using_max_age,
  total_using_expires,
  total_using_max_age_and_expires,
  total_using_both_cc_and_expires,
  total_using_neither_cc_and_expires,
  total_using_only_cache_control,
  total_using_only_expires,

  -- Fractions in the range 0-1 (not multiplied by 100).
  total_using_cache_control / total_requests AS pct_cache_control,
  total_using_max_age / total_requests AS pct_using_max_age,
  total_using_expires / total_requests AS pct_using_expires,
  total_using_max_age_and_expires / total_requests
    AS pct_using_max_age_and_expires,
  total_using_both_cc_and_expires / total_requests
    AS pct_using_both_cc_and_expires,
  total_using_neither_cc_and_expires / total_requests
    AS pct_using_neither_cc_nor_expires,
  total_using_only_cache_control / total_requests
    AS pct_using_only_cache_control,
  total_using_only_expires / total_requests AS pct_using_only_expires
FROM
  client_totals
ORDER BY
  client;
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
#standardSQL
# The distribution of CDN adoption on websites by client.
#
# client_cdns builds, per client, the number of root pages (total) and one
# flat array holding every CDN name found on those pages. summary.cdn is read
# as a ', '-separated string, so a page can contribute several entries.
# The final SELECT counts how often each CDN name occurs and expresses that
# as a percentage of the client's root pages.

WITH client_cdns AS (
  SELECT
    client,
    COUNT(0) AS total,
    ARRAY_CONCAT_AGG(
      SPLIT(JSON_VALUE(summary.cdn), ', ')
    ) AS cdn_list
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE
  GROUP BY
    client
)

SELECT
  client,
  total,
  -- An empty CDN string is reported as 'No CDN'.
  IF(cdn_name = '', 'No CDN', cdn_name) AS cdn,
  COUNT(0) AS freq,
  ROUND(100 * COUNT(0) / total, 2) AS pct
FROM
  client_cdns
CROSS JOIN
  UNNEST(cdn_list) AS cdn_name
GROUP BY
  client,
  cdn,
  total
ORDER BY
  pct DESC,
  client ASC,
  cdn ASC;
Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
#standardSQL
# Median resource weights by CMS
#
# For every root page that uses a technology in the 'CMS' category, this
# script derives the page weight (total and per resource type) and an
# emissions estimate, then reports the per-client, per-CMS medians.

# Declare variables to calculate the carbon emissions of one byte
# Source: https://sustainablewebdesign.org/calculating-digital-emissions/
# NOTE(review): grid_intensity is presumably gCO2e/kWh and the six segment
# factors presumably kWh/GB - confirm against the source above.

DECLARE grid_intensity NUMERIC DEFAULT 494;
DECLARE embodied_emissions_data_centers NUMERIC DEFAULT 0.012;
DECLARE embodied_emissions_network NUMERIC DEFAULT 0.013;
DECLARE embodied_emissions_user_devices NUMERIC DEFAULT 0.081;
DECLARE operational_emissions_data_centers NUMERIC DEFAULT 0.055;
DECLARE operational_emissions_network NUMERIC DEFAULT 0.059;
DECLARE operational_emissions_user_devices NUMERIC DEFAULT 0.080;

WITH page_bytes AS (
  -- One row per (page, CMS technology) with the raw byte counters.
  SELECT
    client,
    tech.technology AS cms,
    INT64(summary.bytestotal) AS bytes_total,
    INT64(summary.byteshtml) AS bytes_html,
    INT64(summary.bytesjs) AS bytes_js,
    INT64(summary.bytescss) AS bytes_css,
    INT64(summary.bytesimg) AS bytes_img,
    INT64(summary.bytesfont) AS bytes_font,
    -- Total transfer size in GB (1024-based); every emissions figure below
    -- scales with this value.
    INT64(summary.bytestotal) / 1024 / 1024 / 1024 AS total_gb
  FROM
    `httparchive.crawl.pages`
  CROSS JOIN
    UNNEST(technologies) AS tech
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE AND
    'CMS' IN UNNEST(tech.categories)
),

page_components AS (
  SELECT
    client,
    cms,

    -- Sizes in KB
    bytes_total / 1024 AS total_kb,
    bytes_html / 1024 AS html_kb,
    bytes_js / 1024 AS js_kb,
    bytes_css / 1024 AS css_kb,
    bytes_img / 1024 AS img_kb,
    bytes_font / 1024 AS font_kb,

    -- Operational emissions per system segment
    total_gb * operational_emissions_data_centers *
    grid_intensity AS op_emissions_dc,
    total_gb * operational_emissions_network *
    grid_intensity AS op_emissions_networks,
    total_gb * operational_emissions_user_devices *
    grid_intensity AS op_emissions_devices,

    -- Embodied emissions per system segment
    total_gb * embodied_emissions_data_centers *
    grid_intensity AS em_emissions_dc,
    total_gb * embodied_emissions_network *
    grid_intensity AS em_emissions_networks,
    total_gb * embodied_emissions_user_devices *
    grid_intensity AS em_emissions_devices,

    -- Whole-page emissions (all six segments) used as the base that is
    -- split across resource types below.
    total_gb * (
      grid_intensity * (
        operational_emissions_data_centers +
        operational_emissions_network +
        operational_emissions_user_devices +
        embodied_emissions_data_centers +
        embodied_emissions_network +
        embodied_emissions_user_devices
      )
    ) AS splittable_emissions,

    -- Proportions of each resource type relative to total bytes.
    -- SAFE_DIVIDE returns NULL instead of raising a division-by-zero error
    -- when a page reports zero total bytes.
    SAFE_DIVIDE(bytes_html, bytes_total) AS html_proportion,
    SAFE_DIVIDE(bytes_js, bytes_total) AS js_proportion,
    SAFE_DIVIDE(bytes_css, bytes_total) AS css_proportion,
    SAFE_DIVIDE(bytes_img, bytes_total) AS img_proportion,
    SAFE_DIVIDE(bytes_font, bytes_total) AS font_proportion
  FROM
    page_bytes
),

cms_data AS (
  SELECT
    client,
    cms,
    total_kb,
    html_kb,
    js_kb,
    css_kb,
    img_kb,
    font_kb,

    -- Total emissions (operational, embodied, and both combined)
    op_emissions_dc +
    op_emissions_networks +
    op_emissions_devices AS total_operational_emissions,

    em_emissions_dc +
    em_emissions_networks +
    em_emissions_devices AS total_embodied_emissions,

    op_emissions_dc +
    op_emissions_networks +
    op_emissions_devices +
    em_emissions_dc +
    em_emissions_networks +
    em_emissions_devices AS total_emissions,

    -- Resource-specific emissions: the page's emissions weighted by the
    -- share of bytes of each resource type. NULL when the share is NULL.
    html_proportion * splittable_emissions AS total_html_emissions,
    js_proportion * splittable_emissions AS total_js_emissions,
    css_proportion * splittable_emissions AS total_css_emissions,
    img_proportion * splittable_emissions AS total_img_emissions,
    font_proportion * splittable_emissions AS total_font_emissions
  FROM
    page_components
)

SELECT
  client,
  cms,
  COUNT(0) AS pages,
  -- Median resource weights and emissions
  -- (APPROX_QUANTILES with 1000 buckets; offset 500 is the median).
  APPROX_QUANTILES(total_kb, 1000)[OFFSET(500)] AS median_total_kb,
  APPROX_QUANTILES(
    total_operational_emissions, 1000
  )[OFFSET(500)] AS median_operational_emissions,
  APPROX_QUANTILES(
    total_embodied_emissions, 1000
  )[OFFSET(500)] AS median_embodied_emissions,
  APPROX_QUANTILES(
    total_emissions, 1000
  )[OFFSET(500)] AS median_total_emissions,

  -- Resource-specific medians
  APPROX_QUANTILES(html_kb, 1000)[OFFSET(500)] AS median_html_kb,
  APPROX_QUANTILES(
    total_html_emissions, 1000
  )[OFFSET(500)] AS median_total_html_emissions,
  APPROX_QUANTILES(js_kb, 1000)[OFFSET(500)] AS median_js_kb,
  APPROX_QUANTILES(
    total_js_emissions, 1000
  )[OFFSET(500)] AS median_total_js_emissions,
  APPROX_QUANTILES(css_kb, 1000)[OFFSET(500)] AS median_css_kb,
  APPROX_QUANTILES(
    total_css_emissions, 1000
  )[OFFSET(500)] AS median_total_css_emissions,
  APPROX_QUANTILES(img_kb, 1000)[OFFSET(500)] AS median_img_kb,
  APPROX_QUANTILES(
    total_img_emissions, 1000
  )[OFFSET(500)] AS median_total_img_emissions,
  APPROX_QUANTILES(font_kb, 1000)[OFFSET(500)] AS median_font_kb,
  APPROX_QUANTILES(
    total_font_emissions, 1000
  )[OFFSET(500)] AS median_total_font_emissions
FROM
  cms_data
GROUP BY
  client,
  cms
ORDER BY
  pages DESC,
  cms ASC,
  client ASC;

0 commit comments

Comments
 (0)