@@ -8,83 +8,74 @@ WITH redirect_requests AS (
88 index,
99 response_headers,
1010 page
11- FROM ` httparchive.all .requests`
11+ FROM ` httparchive.crawl .requests`
1212 WHERE
1313 date = ' 2024-06-01' AND
1414 is_root_page = TRUE AND
1515 type NOT IN (' css' , ' image' , ' font' , ' video' , ' audio' ) AND
16- LEFT(JSON_VALUE (summary, ' $ .status' ), 1 ) = ' 3 ' AND
16+ ROUND(INT64 (summary .status ) / 100 ) = 3 AND
1717 index <= 2
), navigation_redirect AS (
  -- First hop: the request with index = 1 whose Location header points at a
  -- different registrable domain than the crawled page.
  -- Emits one row per matching Location header of that request.
  SELECT
    client,
    url,
    page,
    response_header.value AS navigation_redirect_location
  FROM redirect_requests,
    UNNEST(response_headers) AS response_header
  WHERE
    index = 1 AND
    -- Header names are matched case-insensitively. The literal has to be
    -- exactly 'location' (no padding), otherwise no header can ever match.
    LOWER(response_header.name) = 'location' AND
    -- If either side has no registrable domain, NET.REG_DOMAIN returns NULL,
    -- the comparison evaluates to NULL and the row is filtered out.
    NET.REG_DOMAIN(response_header.value) != NET.REG_DOMAIN(page)
), bounce_redirect AS (
  -- Second hop: the request with index = 2 that itself answers with a
  -- Location header. Emits one row per Location header of that request.
  SELECT
    client,
    url,
    page,
    response_header.value AS bounce_redirect_location,
    -- Full header array is passed through unchanged; none of the later CTEs
    -- visible in this query read it.
    response_headers
  FROM redirect_requests,
    UNNEST(response_headers) AS response_header
  WHERE
    index = 2 AND
    -- Header names are matched case-insensitively. The literal has to be
    -- exactly 'location' (no padding), otherwise no header can ever match.
    LOWER(response_header.name) = 'location'
), bounce_sequences AS (
  -- Pair every first-hop redirect with the second-hop redirect issued by the
  -- URL it pointed to (same client, same page), then count the distinct pages
  -- per client and first-hop registrable domain.
  SELECT
    nav.client,
    -- Despite the column name, this is the registrable domain of the first
    -- redirect target (NET.REG_DOMAIN), not its full hostname.
    NET.REG_DOMAIN(nav.navigation_redirect_location) AS bounce_hostname,
    COUNT(DISTINCT nav.page) AS number_of_pages
  FROM navigation_redirect AS nav
  -- Only sequences that actually have a second hop are counted. A LEFT JOIN
  -- here would be cancelled out by the IS NOT NULL filter below, so the join
  -- is stated as the inner join it effectively is.
  INNER JOIN bounce_redirect AS bounce
  ON
    nav.client = bounce.client AND
    nav.page = bounce.page AND
    nav.navigation_redirect_location = bounce.url
  WHERE bounce.bounce_redirect_location IS NOT NULL
  GROUP BY
    nav.client,
    bounce_hostname
), pages_total AS (
  -- Denominator for pct_pages: number of distinct root pages per client for
  -- the same crawl date that redirect_requests filters on.
  SELECT
    client,
    COUNT(DISTINCT page) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2024-06-01' AND
    is_root_page
  GROUP BY client
)

-- Report, for each client and bounce hostname, how many pages show the
-- two-hop redirect sequence and what share of that client's root pages this
-- represents. Only the 100 rows with the most pages (across all clients) are
-- returned.
SELECT
  seq.client,
  seq.bounce_hostname,
  seq.number_of_pages,
  seq.number_of_pages / totals.total_pages AS pct_pages
FROM bounce_sequences AS seq
INNER JOIN pages_total AS totals
ON seq.client = totals.client
ORDER BY seq.number_of_pages DESC
LIMIT 100
0 commit comments