Skip to content

Commit f2f949c

Browse files
christianliebeljcmpageltunetheweb
authored
Generative AI 2025 queries (#4302)
* Add SQL query to analyze CSS gradient usage This SQL query calculates the percentage of sites using CSS gradients over specific years, grouped by client and rank. * Add SQL query for robots.txt site analysis This SQL query calculates the percentage of sites with a valid robots.txt file, including various directives, based on data from the HTTP Archive crawl. * Add SQL script for user-agent analysis in robots.txt This SQL script calculates the percentage of sites mentioning a specific user-agent in their robots.txt file, categorized by rank bucket. Inspired by https://paulcalvano.com/2025-08-21-ai-bots-and-robots-txt/ * Create user-agent-years.sql inspired by https://paulcalvano.com/2025-08-21-ai-bots-and-robots-txt/ * Create gradient-by-client.sql * Add SQL query for gradient adoption analysis This SQL query analyzes the adoption of CSS gradients by clients over specific years, grouping results by rank and calculating the percentage of sites using gradients. * Update and rename gradient-by-client.sql to gradient.sql * Add SQL query for .ai domain ranking analysis * Add SQL script for analyzing web page platforms * Fix linter errors in SQL scripts * Apply suggestions from code review * Update sql/2025/generative-ai/gradient-client-rank.sql * more formatting --------- Co-authored-by: Jonathan Pagel <63317370+jcmpagel@users.noreply.github.com> Co-authored-by: Barry Pollard <barrypollard@google.com>
1 parent 5c3bd08 commit f2f949c

File tree

8 files changed

+670
-0
lines changed

8 files changed

+670
-0
lines changed

sql/2025/generative-ai/ai_tld.sql

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#standardSQL
# Distinct .ai hosts among root pages, per crawl date, client and exclusive
# rank bucket, comparing the 2022-06-01 crawl with the 2025-07-01 crawl.
# "Exclusive" means every row is assigned to the first (smallest) bucket whose
# upper bound is >= its rank, so buckets do not overlap.

WITH ai_hosts AS (
  SELECT
    date,
    client,
    rank,
    NET.HOST(page) AS host
  FROM
    `httparchive.crawl.pages`
  WHERE
    date IN ('2022-06-01', '2025-07-01') AND
    client IN ('desktop', 'mobile') AND
    is_root_page AND
    -- Later years go beyond 10,000,000; keep this cap so both crawls are comparable
    rank <= 10000000 AND
    ENDS_WITH(NET.HOST(page), '.ai')
)

SELECT
  date,
  client,
  -- No ELSE branch needed: ai_hosts already restricts rank to <= 10,000,000
  CASE
    WHEN rank <= 1000 THEN 1000
    WHEN rank <= 10000 THEN 10000
    WHEN rank <= 100000 THEN 100000
    WHEN rank <= 1000000 THEN 1000000
    WHEN rank <= 10000000 THEN 10000000
  END AS rank_bucket,
  COUNT(DISTINCT host) AS ai_domains
FROM
  ai_hosts
GROUP BY
  date,
  client,
  rank_bucket
ORDER BY
  date,
  client,
  rank_bucket;
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#standardSQL
-- Adoption of CSS gradients in custom_metrics.css_variables
-- Grouped by: year, client, rank bucket
--
-- A page counts as using a gradient when the JSON-serialized css_variables
-- metric contains "gradient(" (case-insensitive).
-- Rank buckets are cumulative: the cross join with rank_grouping combined with
-- `rank <= rank_grouping` places each page in every bucket at or above its rank.

WITH pages AS (
  SELECT
    EXTRACT(YEAR FROM date) AS year,
    client,
    rank_grouping,
    page,
    -- Evaluated once per row and reused for both the count and the ratio below
    REGEXP_CONTAINS(
      TO_JSON_STRING(custom_metrics.css_variables),
      r'(?i)gradient\('
    ) AS uses_gradient
  FROM
    `httparchive.crawl.pages`,
    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
  WHERE
    is_root_page AND
    rank <= rank_grouping AND
    date IN (
      DATE '2019-07-01',
      DATE '2020-08-01',
      DATE '2021-07-01',
      DATE '2022-07-01', -- CSS metrics exception
      DATE '2024-06-01',
      DATE '2025-07-01'
    )
)

SELECT
  year,
  client,
  rank_grouping,
  COUNT(DISTINCT page) AS total_sites,
  COUNT(DISTINCT IF(uses_gradient, page, NULL)) AS sites_using_gradient,
  -- SAFE_DIVIDE yields NULL instead of an error if a group has no pages
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(uses_gradient, page, NULL)),
    COUNT(DISTINCT page)
  ) AS pct_sites_using_gradient
FROM
  pages
GROUP BY
  year,
  client,
  rank_grouping
ORDER BY
  year,
  client,
  rank_grouping;
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#standardSQL
-- % of sites whose CSS variables include a gradient( ... ) per year, client, rank
-- Goal: use this as an indicator of vibe-coded websites, much as the phrase
-- "delve into" is used as an indicator for papers.
WITH pages AS (
  SELECT
    EXTRACT(YEAR FROM date) AS year,
    client,
    rank,
    page,
    -- Case-insensitive search for "gradient(" in the serialized metric
    REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\(') AS has_gradient
  FROM
    `httparchive.crawl.pages`
  WHERE
    is_root_page AND
    date IN (
      DATE '2019-07-01',
      DATE '2020-08-01',
      DATE '2021-07-01',
      DATE '2022-07-01',
      DATE '2024-06-01',
      DATE '2025-07-01'
    )
)

SELECT
  year,
  client,
  rank,
  COUNT(DISTINCT page) AS total_pages,
  COUNT(DISTINCT IF(has_gradient, page, NULL)) AS pages_with_gradient,
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(has_gradient, page, NULL)),
    COUNT(DISTINCT page)
  ) AS pct_with_gradient
FROM
  pages
GROUP BY
  year,
  client,
  rank
ORDER BY
  year,
  client,
  rank;
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
#standardSQL
-- Quarterly trend, over mobile root pages of all ranks, of design heuristics
-- detected in the css_variables custom metric: specific indigo/violet/purple
-- hex values, font names, gradients, common size values and shadow strings.
-- Every heuristic is reported as a count plus three ratios:
--   pct_all_*  : over all sites in the quarter
--   pct_vars_* : over sites that use CSS variables
--   pct_tw_*   : Tailwind sites matching the heuristic over all Tailwind sites
WITH raw_data AS (
  SELECT
    date,
    page,
    -- 1. TECHNOLOGY FLAGS
    -- CSS Variables: Exclude NULL, {}, '{"summary":{}}', and 'null' string
    (
      custom_metrics.css_variables IS NOT NULL AND
      TO_JSON_STRING(custom_metrics.css_variables) NOT IN ('{}', '{"summary":{}}', 'null')
    ) AS uses_css_vars,

    -- Tailwind: Check the array for the technology
    'Tailwind CSS' IN UNNEST(technologies.technology) AS uses_tailwind,

    -- Lower-cased serialized metric, so the regexes below can be case-insensitive
    LOWER(TO_JSON_STRING(custom_metrics.css_variables)) AS vars_str
  FROM
    `httparchive.crawl.pages`
  WHERE
    client = 'mobile' AND
    is_root_page AND
    -- NO RANK FILTER (Analyze the entire long-tail of the web)

    -- Quarterly Dates
    date IN UNNEST([
      DATE '2020-10-01',
      DATE '2021-01-01', DATE '2021-04-01', DATE '2021-07-01', DATE '2021-10-01',
      DATE '2022-01-01', DATE '2022-04-01', DATE '2022-07-01', DATE '2022-10-01',
      DATE '2023-01-01', DATE '2023-04-01', DATE '2023-07-01', DATE '2023-10-01',
      DATE '2024-01-01', DATE '2024-04-01', DATE '2024-07-01', DATE '2024-10-01',
      DATE '2025-01-01', DATE '2025-04-01', DATE '2025-07-01', DATE '2025-10-01'
    ])
),

-- Pre-calculate heuristics
flags AS (
  SELECT
    date,
    page,
    uses_css_vars,
    uses_tailwind,

    -- HEURISTIC BOOLEANS (Only true if uses_css_vars is also true)
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"#6366f1"')) AS has_indigo_500,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"(#6366f1|#8b5cf6|#a855f7)"')) AS has_ai_purples,
    -- Word-bounded so that tokens such as "interactive", "pointer" or "interval"
    -- are not counted as the Inter font; "intervariable" is still accepted.
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'\binter(variable)?\b')) AS has_inter,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'roboto')) AS has_roboto,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'system-ui')) AS has_system_ui,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'linear-gradient\(|radial-gradient\(')) AS has_gradient,
    -- NOTE(review): matches any variable whose whole value is one of these sizes,
    -- not only radius variables -- confirm this is the intended proxy
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"(2px|4px|6px|8px|12px|16px|0\.25rem|0\.5rem|0\.75rem|1rem|9999px)"')) AS has_radius,
    -- NOTE(review): "rgba(" also occurs in non-shadow colour values -- confirm
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'rgba\(|box-shadow')) AS has_shadow
  FROM
    raw_data
),

-- One row per quarter; every distinct-page count is computed exactly once here
counts AS (
  SELECT
    FORMAT_DATE('%Y-Q%Q', date) AS year_quarter,

    -- Denominators
    COUNT(DISTINCT page) AS total_sites,
    COUNT(DISTINCT IF(uses_css_vars, page, NULL)) AS sites_using_vars,
    COUNT(DISTINCT IF(uses_tailwind, page, NULL)) AS sites_using_tailwind,

    -- Numerators: all sites (cnt_*) and Tailwind sites only (cnt_tw_*)
    COUNT(DISTINCT IF(has_ai_purples, page, NULL)) AS cnt_ai_purples,
    COUNT(DISTINCT IF(has_ai_purples AND uses_tailwind, page, NULL)) AS cnt_tw_ai_purples,
    COUNT(DISTINCT IF(has_indigo_500, page, NULL)) AS cnt_indigo,
    COUNT(DISTINCT IF(has_indigo_500 AND uses_tailwind, page, NULL)) AS cnt_tw_indigo,
    COUNT(DISTINCT IF(has_inter, page, NULL)) AS cnt_inter,
    COUNT(DISTINCT IF(has_inter AND uses_tailwind, page, NULL)) AS cnt_tw_inter,
    COUNT(DISTINCT IF(has_roboto, page, NULL)) AS cnt_roboto,
    COUNT(DISTINCT IF(has_roboto AND uses_tailwind, page, NULL)) AS cnt_tw_roboto,
    COUNT(DISTINCT IF(has_system_ui, page, NULL)) AS cnt_system,
    COUNT(DISTINCT IF(has_system_ui AND uses_tailwind, page, NULL)) AS cnt_tw_system,
    COUNT(DISTINCT IF(has_gradient, page, NULL)) AS cnt_gradient,
    COUNT(DISTINCT IF(has_gradient AND uses_tailwind, page, NULL)) AS cnt_tw_gradient,
    COUNT(DISTINCT IF(has_radius, page, NULL)) AS cnt_radius,
    COUNT(DISTINCT IF(has_radius AND uses_tailwind, page, NULL)) AS cnt_tw_radius,
    COUNT(DISTINCT IF(has_shadow, page, NULL)) AS cnt_shadow,
    COUNT(DISTINCT IF(has_shadow AND uses_tailwind, page, NULL)) AS cnt_tw_shadow
  FROM
    flags
  GROUP BY
    year_quarter
)

-- IEEE_DIVIDE does not raise on a zero denominator (it returns NaN or +/-inf),
-- so quarters without any Tailwind or CSS-variable sites still produce a row.
SELECT
  year_quarter,

  -- 1. CONTEXT (Denominators)
  total_sites,
  sites_using_vars,
  sites_using_tailwind,

  -------------------------------------------------------------------------
  -- 2. "AI PURPLE" SPECTRUM (Indigo/Violet/Purple 500)
  -------------------------------------------------------------------------
  cnt_ai_purples,
  IEEE_DIVIDE(cnt_ai_purples, total_sites) AS pct_all_ai_purples,
  IEEE_DIVIDE(cnt_ai_purples, sites_using_vars) AS pct_vars_ai_purples,
  IEEE_DIVIDE(cnt_tw_ai_purples, sites_using_tailwind) AS pct_tw_ai_purples,

  -------------------------------------------------------------------------
  -- 3. SPECIFIC INDIGO 500 (#6366f1 Only)
  -------------------------------------------------------------------------
  cnt_indigo,
  IEEE_DIVIDE(cnt_indigo, total_sites) AS pct_all_indigo,
  IEEE_DIVIDE(cnt_indigo, sites_using_vars) AS pct_vars_indigo,
  IEEE_DIVIDE(cnt_tw_indigo, sites_using_tailwind) AS pct_tw_indigo,

  -------------------------------------------------------------------------
  -- 4. FONTS
  -------------------------------------------------------------------------
  -- Inter
  cnt_inter,
  IEEE_DIVIDE(cnt_inter, total_sites) AS pct_all_inter,
  IEEE_DIVIDE(cnt_inter, sites_using_vars) AS pct_vars_inter,
  IEEE_DIVIDE(cnt_tw_inter, sites_using_tailwind) AS pct_tw_inter,

  -- Roboto
  cnt_roboto,
  IEEE_DIVIDE(cnt_roboto, total_sites) AS pct_all_roboto,
  IEEE_DIVIDE(cnt_roboto, sites_using_vars) AS pct_vars_roboto,
  IEEE_DIVIDE(cnt_tw_roboto, sites_using_tailwind) AS pct_tw_roboto,

  -- System UI
  cnt_system,
  IEEE_DIVIDE(cnt_system, total_sites) AS pct_all_system,
  IEEE_DIVIDE(cnt_system, sites_using_vars) AS pct_vars_system,
  IEEE_DIVIDE(cnt_tw_system, sites_using_tailwind) AS pct_tw_system,

  -------------------------------------------------------------------------
  -- 5. UI ELEMENTS
  -------------------------------------------------------------------------
  -- Gradients
  cnt_gradient,
  IEEE_DIVIDE(cnt_gradient, total_sites) AS pct_all_gradient,
  IEEE_DIVIDE(cnt_gradient, sites_using_vars) AS pct_vars_gradient,
  IEEE_DIVIDE(cnt_tw_gradient, sites_using_tailwind) AS pct_tw_gradient,

  -- Radius
  cnt_radius,
  IEEE_DIVIDE(cnt_radius, total_sites) AS pct_all_radius,
  IEEE_DIVIDE(cnt_radius, sites_using_vars) AS pct_vars_radius,
  IEEE_DIVIDE(cnt_tw_radius, sites_using_tailwind) AS pct_tw_radius,

  -- Shadows
  cnt_shadow,
  IEEE_DIVIDE(cnt_shadow, total_sites) AS pct_all_shadow,
  IEEE_DIVIDE(cnt_shadow, sites_using_vars) AS pct_vars_shadow,
  IEEE_DIVIDE(cnt_tw_shadow, sites_using_tailwind) AS pct_tw_shadow
FROM
  counts
ORDER BY
  year_quarter;
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#standardSQL
-- % of sites whose robots.txt returns 200 and includes at least one counted
-- directive (allow, disallow, crawl_delay, noindex, sitemap or user_agent).
-- Because a directive is required, the percentage is smaller than the one
-- reported elsewhere (75% instead of 95%).
WITH robots AS (
  SELECT
    client,
    root_page,
    SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status,
    custom_metrics.robots_txt.record_counts.by_type AS by_type
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page
),

flagged AS (
  SELECT
    client,
    root_page,
    -- Missing or non-numeric counts are treated as 0 before summing
    (
      status = 200 AND
      (
        COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.allow') AS INT64), 0) +
        COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.disallow') AS INT64), 0) +
        COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.crawl_delay') AS INT64), 0) +
        COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.noindex') AS INT64), 0) +
        COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.sitemap') AS INT64), 0) +
        COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.user_agent') AS INT64), 0)
      ) > 0
    ) AS has_robots_txt
  FROM
    robots
)

SELECT
  client,
  COUNT(DISTINCT root_page) AS sites,
  COUNT(DISTINCT IF(has_robots_txt, root_page, NULL)) AS sites_with_robots_txt,
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(has_robots_txt, root_page, NULL)),
    COUNT(DISTINCT root_page)
  ) AS pct_sites_with_robots_txt
FROM
  flagged
GROUP BY
  client
ORDER BY
  client;

0 commit comments

Comments
 (0)