Skip to content

Commit a845fb0

Browse files
jazlan01, tunetheweb, abubakaraziz
authored
Third Parties 2025 (#4200)
* Initialized with queries from 2024
* Updated query for 2025 spec
* Updated date
* Temp changes to dates, 2025-06 instead of 2025-07
* Updated median number of third parties by rank query
* Updated top 100 third parties by number of websites query
* Updated third party domains per page by rank
* Updated percent of third parties by content type
* Added prevalence of consent signals in third party requests
* Updated queries for 2025
* Updated third parties table date
* Updated third-parties.md with the content
* Enable chapter
* Technical edit
* Lint SQL
* Retake image
* tunetheweb contributions
* Linting fixes
* Linting
* Tweaks
* Apply suggestions from code review
* Update src/content/en/2025/third-parties.md
* Update contributors
* Clean up images
* Most recently used queries
* Updated bio for Jazlan
* Added featured stats, sql file names
* Fixed linter errors
* text update
* Technical edit
* Cleaned up SQL
* Fixed sql file name for third party providers
* Added newline at the end of the file
* Update sql/2025/third-parties/number_of_third_parties_by_rank_and_category.sql
* Update sql/2025/third-parties/number_of_third_parties_by_rank_and_category.sql
* Incorporate feedback in text
* Update src/content/en/2025/third-parties.md with link to 2024 chapter

Co-authored-by: Barry Pollard <barrypollard@google.com>

---------

Co-authored-by: Barry Pollard <barrypollard@google.com>
Co-authored-by: AbuBakar Aziz <aziz313f@gmail.com>
1 parent b0111d0 commit a845fb0

30 files changed

+2172
-15
lines changed
Lines changed: 90 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,90 @@
#standardSQL
# Basic consent signal analysis (simplified version to ensure data returns)
#
# For each client, counts third-party requests made by pages ranked in the
# top 50K whose URL carries a consent-signal query parameter (USP, TCF or GPP),
# breaks that count down by signal type, and reports how many of those
# requests list redirects in their summary.
# All percentages use "requests with at least one consent signal" as the
# denominator, not all third-party requests.

WITH pages AS (
  -- Only the join keys are needed downstream; rank is used solely as a filter
  SELECT
    client,
    page
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    rank <= 50000 -- Expand to top 50K sites
),

-- Third-party requests on those pages (no redirect filtering).
-- Each consent-signal pattern is written exactly once here so that the flags
-- and the row filter in the next CTE can never drift apart.
third_party_requests AS (
  SELECT
    r.client,
    r.page,
    NET.REG_DOMAIN(r.url) AS url_domain,

    -- Extract consent signals
    REGEXP_CONTAINS(r.url, r'[?&]us_privacy=') AS has_usp_standard,
    REGEXP_CONTAINS(r.url, r'[?&](ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string)=') AS has_usp_nonstandard,
    REGEXP_CONTAINS(r.url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') AS has_tcf_standard,
    REGEXP_CONTAINS(r.url, r'[?&](gpp|gpp_sid)=') AS has_gpp_standard,

    -- Check if request has redirects.
    -- A missing key yields SQL NULL, whereas an explicit JSON null is a
    -- non-NULL value that serializes to 'null'; neither is a redirect list.
    -- NOTE(review): assumes `summary` is a JSON column - confirm against the table schema.
    JSON_EXTRACT(r.summary, '$.redirects') IS NOT NULL AND
    TO_JSON_STRING(JSON_EXTRACT(r.summary, '$.redirects')) NOT IN ('[]', 'null') AS has_redirects
  FROM
    `httparchive.crawl.requests` r
  INNER JOIN
    pages p
  ON
    r.client = p.client AND r.page = p.page
  WHERE
    r.date = '2025-07-01' AND
    NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) -- Third-party only
),

-- Keep only requests that carry at least one consent signal
consent_requests AS (
  SELECT
    client,
    page,
    url_domain,
    has_usp_standard,
    has_usp_nonstandard,
    has_tcf_standard,
    has_gpp_standard,
    has_redirects
  FROM
    third_party_requests
  WHERE
    has_usp_standard OR
    has_usp_nonstandard OR
    has_tcf_standard OR
    has_gpp_standard
)

-- Basic analysis
SELECT
  client,

  -- Overall counts
  COUNT(0) AS total_requests_with_consent_signals,
  COUNT(DISTINCT page) AS total_pages_with_consent_signals,
  COUNT(DISTINCT url_domain) AS total_domains_with_consent_signals,

  -- Signal type breakdown
  COUNTIF(has_usp_standard) AS usp_standard_requests,
  COUNTIF(has_usp_nonstandard) AS usp_nonstandard_requests,
  COUNTIF(has_tcf_standard) AS tcf_standard_requests,
  COUNTIF(has_gpp_standard) AS gpp_standard_requests,

  -- Percentage breakdown (a request may carry several signal types, so these can sum to more than 1)
  COUNTIF(has_usp_standard) / COUNT(0) AS pct_usp_standard,
  COUNTIF(has_usp_nonstandard) / COUNT(0) AS pct_usp_nonstandard,
  COUNTIF(has_tcf_standard) / COUNT(0) AS pct_tcf_standard,
  COUNTIF(has_gpp_standard) / COUNT(0) AS pct_gpp_standard,

  -- Redirect availability
  COUNTIF(has_redirects) AS requests_with_redirects,
  COUNTIF(has_redirects) / COUNT(0) AS pct_requests_with_redirects

FROM
  consent_requests
GROUP BY
  client
ORDER BY
  client
Lines changed: 195 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,195 @@
#standardSQL
# Consent signal prevalence broken down by third-party category
#
# For every client, rank grouping and third-party category, reports how many
# requests, pages and third-party domains carry each consent signal type
# (USP standard / USP non-standard / TCF / GPP / any), next to the totals for
# that category so the signal share can be read directly.
# Rank groupings are cumulative: a page is counted in every grouping whose
# threshold is greater than or equal to its rank.

WITH pages AS (
  SELECT
    client,
    page,
    rank
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01'
),

requests AS (
  SELECT
    client,
    page,
    url
  FROM
    `httparchive.crawl.requests`
  WHERE
    date = '2025-07-01'
),

-- Third-party domains (excluding the 'hosting' category) that were requested
-- by at least 50 distinct pages in this crawl
third_party AS (
  SELECT
    tp.domain,
    tp.canonicalDomain,
    tp.category,
    COUNT(DISTINCT r.page) AS page_usage
  FROM
    `httparchive.almanac.third_parties` tp
  INNER JOIN
    requests r
  ON
    NET.HOST(r.url) = NET.HOST(tp.domain)
  WHERE
    tp.date = '2025-07-01' AND
    tp.category != 'hosting'
  GROUP BY
    tp.domain,
    tp.canonicalDomain,
    tp.category
  HAVING
    page_usage >= 50
),

-- One row per third-party request and rank grouping, with consent signals
-- detected once. Totals and signal counts are both derived from this single
-- join instead of repeating the requests/pages/third_party join per metric.
third_party_requests AS (
  SELECT
    r.client,
    rank_grouping,
    tp.category,
    tp.canonicalDomain,
    r.page,

    -- Consent signal detection
    REGEXP_CONTAINS(r.url, r'[?&]us_privacy=') AS has_usp_standard,
    REGEXP_CONTAINS(r.url, r'[?&](ccpa|usp_consent|uspString|sst\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\.us_privacy|cnsnt|ccpaconsent|usp_string)=') AS has_usp_nonstandard,
    REGEXP_CONTAINS(r.url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') AS has_tcf_standard,
    REGEXP_CONTAINS(r.url, r'[?&](gpp|gpp_sid)=') AS has_gpp_standard
  FROM
    requests r
  INNER JOIN
    pages p
  ON
    r.client = p.client AND r.page = p.page
  INNER JOIN
    third_party tp
  ON
    NET.HOST(r.url) = NET.HOST(tp.domain)
  CROSS JOIN
    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
  WHERE
    p.rank <= rank_grouping
),

-- Add computed flag for any consent signal
flagged_requests AS (
  SELECT
    client,
    rank_grouping,
    category,
    canonicalDomain,
    page,
    has_usp_standard,
    has_usp_nonstandard,
    has_tcf_standard,
    has_gpp_standard,
    (has_usp_standard OR has_usp_nonstandard OR has_tcf_standard OR has_gpp_standard) AS has_any_consent_signal
  FROM
    third_party_requests
),

-- Aggregate category totals and consent signals in a single pass.
-- Requests without any signal contribute only to the totals: every flag is
-- FALSE for them, so COUNTIF skips them and the CASE expressions yield NULL,
-- which COUNT(DISTINCT ...) ignores.
category_aggregates AS (
  SELECT
    client,
    rank_grouping,
    category,

    -- Totals for the category (denominators)
    COUNT(0) AS total_category_requests,
    COUNT(DISTINCT page) AS total_category_pages,
    COUNT(DISTINCT canonicalDomain) AS total_category_domains,

    -- USP Standard metrics
    COUNTIF(has_usp_standard) AS usp_standard_requests,
    COUNT(DISTINCT CASE WHEN has_usp_standard THEN page END) AS usp_standard_pages,
    COUNT(DISTINCT CASE WHEN has_usp_standard THEN canonicalDomain END) AS usp_standard_domains,

    -- USP Non-Standard metrics
    COUNTIF(has_usp_nonstandard) AS usp_nonstandard_requests,
    COUNT(DISTINCT CASE WHEN has_usp_nonstandard THEN page END) AS usp_nonstandard_pages,
    COUNT(DISTINCT CASE WHEN has_usp_nonstandard THEN canonicalDomain END) AS usp_nonstandard_domains,

    -- TCF Standard metrics
    COUNTIF(has_tcf_standard) AS tcf_standard_requests,
    COUNT(DISTINCT CASE WHEN has_tcf_standard THEN page END) AS tcf_standard_pages,
    COUNT(DISTINCT CASE WHEN has_tcf_standard THEN canonicalDomain END) AS tcf_standard_domains,

    -- GPP Standard metrics
    COUNTIF(has_gpp_standard) AS gpp_standard_requests,
    COUNT(DISTINCT CASE WHEN has_gpp_standard THEN page END) AS gpp_standard_pages,
    COUNT(DISTINCT CASE WHEN has_gpp_standard THEN canonicalDomain END) AS gpp_standard_domains,

    -- Any consent signal metrics
    COUNTIF(has_any_consent_signal) AS any_consent_requests,
    COUNT(DISTINCT CASE WHEN has_any_consent_signal THEN page END) AS any_consent_pages,
    COUNT(DISTINCT CASE WHEN has_any_consent_signal THEN canonicalDomain END) AS any_consent_domains
  FROM
    flagged_requests
  GROUP BY
    client,
    rank_grouping,
    category
)

-- Final output using UNNEST to avoid repetitive UNION ALL:
-- each aggregated row is fanned out into one row per signal type
SELECT
  agg.client,
  agg.rank_grouping,
  agg.category,
  signal_data.signal_type,
  signal_data.requests_with_signal,
  agg.total_category_requests,
  signal_data.requests_with_signal / agg.total_category_requests AS pct_requests_with_signal,
  signal_data.pages_with_signal,
  agg.total_category_pages,
  signal_data.pages_with_signal / agg.total_category_pages AS pct_pages_with_signal,
  signal_data.domains_with_signal,
  agg.total_category_domains,
  signal_data.domains_with_signal / agg.total_category_domains AS pct_domains_with_signal
FROM
  category_aggregates agg
CROSS JOIN
  UNNEST([
    STRUCT('USP Standard' AS signal_type, agg.usp_standard_requests AS requests_with_signal, agg.usp_standard_pages AS pages_with_signal, agg.usp_standard_domains AS domains_with_signal),
    STRUCT('USP Non-Standard' AS signal_type, agg.usp_nonstandard_requests AS requests_with_signal, agg.usp_nonstandard_pages AS pages_with_signal, agg.usp_nonstandard_domains AS domains_with_signal),
    STRUCT('TCF Standard' AS signal_type, agg.tcf_standard_requests AS requests_with_signal, agg.tcf_standard_pages AS pages_with_signal, agg.tcf_standard_domains AS domains_with_signal),
    STRUCT('GPP Standard' AS signal_type, agg.gpp_standard_requests AS requests_with_signal, agg.gpp_standard_pages AS pages_with_signal, agg.gpp_standard_domains AS domains_with_signal),
    STRUCT('Any Consent Signal' AS signal_type, agg.any_consent_requests AS requests_with_signal, agg.any_consent_pages AS pages_with_signal, agg.any_consent_domains AS domains_with_signal)
  ]) AS signal_data
WHERE
  signal_data.requests_with_signal > 0 -- Only show categories with consent signals

ORDER BY
  client,
  rank_grouping,
  category,
  signal_type

0 commit comments

Comments
 (0)