Skip to content

Commit 89c51be

Browse files
committed
Merge branch 'main' of https://github.com/HTTPArchive/almanac.httparchive.org into production
2 parents a58c704 + 3f52951 commit 89c51be

46 files changed

Lines changed: 854 additions & 427 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

sql/2024/privacy/ads_accounts_distribution.sql

Lines changed: 0 additions & 64 deletions
This file was deleted.

sql/2024/privacy/ads_and_sellers_graph.sql

Lines changed: 0 additions & 114 deletions
This file was deleted.

sql/2024/privacy/ads_lines_distribution.sql

Lines changed: 0 additions & 45 deletions
This file was deleted.

sql/2024/privacy/common_ads_variables.sql

Lines changed: 0 additions & 29 deletions
This file was deleted.

sql/2024/privacy/number_of_websites_with_bounce_tracking.sql renamed to sql/2024/privacy/most_common_bounce_domains.sql

Lines changed: 26 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -8,83 +8,74 @@ WITH redirect_requests AS (
88
index,
99
response_headers,
1010
page
11-
FROM `httparchive.all.requests`
11+
FROM `httparchive.crawl.requests`
1212
WHERE
1313
date = '2024-06-01' AND
1414
is_root_page = TRUE AND
1515
type NOT IN ('css', 'image', 'font', 'video', 'audio') AND
16-
LEFT(JSON_VALUE(summary, '$.status'), 1) = '3' AND
16+
ROUND(INT64(summary.status) / 100) = 3 AND
1717
index <= 2
1818
), navigation_redirect AS (
1919
-- Find the first navigation redirect
2020
SELECT
2121
client,
2222
url,
2323
page,
24-
headers.value AS navigation_redirect_location
24+
response_header.value AS navigation_redirect_location
2525
FROM redirect_requests,
26-
UNNEST(response_headers) AS headers
26+
UNNEST(response_headers) AS response_header
2727
WHERE
2828
index = 1 AND
29-
LOWER(headers.name) = 'location' AND
30-
NET.REG_DOMAIN(page) != NET.REG_DOMAIN(headers.value)
29+
LOWER(response_header.name) = 'location' AND
30+
NET.REG_DOMAIN(response_header.value) != NET.REG_DOMAIN(page)
3131
), bounce_redirect AS (
3232
-- Find the second navigation redirect
3333
SELECT
3434
client,
3535
url,
3636
page,
37-
headers.value AS bounce_redirect_location,
37+
response_header.value AS bounce_redirect_location,
3838
response_headers
3939
FROM redirect_requests,
40-
UNNEST(response_headers) AS headers
40+
UNNEST(response_headers) AS response_header
4141
WHERE
4242
index = 2 AND
43-
LOWER(headers.name) = 'location' AND
44-
NET.REG_DOMAIN(headers.value) = NET.REG_DOMAIN(page)
45-
), bounce_redirect_with_cookies AS (
46-
-- Find the cookies set during the second navigation redirect
47-
SELECT
48-
client,
49-
url,
50-
page,
51-
bounce_redirect_location
52-
--response_headers.value AS bounce_tracking_cookies
53-
FROM bounce_redirect,
54-
UNNEST(response_headers) AS response_headers
55-
WHERE
56-
LOWER(response_headers.name) = 'set-cookie'
43+
LOWER(response_header.name) = 'location'
5744
), bounce_sequences AS (
5845
-- Combine the first and second navigation redirects
5946
SELECT
6047
nav.client,
61-
nav.page,
62-
nav.url AS navigation_url,
63-
nav.navigation_redirect_location,
64-
bounce.bounce_redirect_location
48+
NET.REG_DOMAIN(navigation_redirect_location) AS bounce_hostname,
49+
COUNT(DISTINCT nav.page) AS number_of_pages
6550
--ARRAY_AGG(bounce.bounce_tracking_cookies) AS bounce_tracking_cookies
6651
FROM navigation_redirect AS nav
67-
LEFT JOIN bounce_redirect_with_cookies AS bounce
52+
LEFT JOIN bounce_redirect AS bounce
6853
ON
6954
nav.client = bounce.client AND
7055
nav.page = bounce.page AND
7156
nav.navigation_redirect_location = bounce.url
7257
WHERE bounce_redirect_location IS NOT NULL
7358
GROUP BY
7459
nav.client,
75-
page,
76-
navigation_url,
77-
navigation_redirect_location,
78-
bounce_redirect_location
60+
bounce_hostname
61+
), pages_total AS (
62+
SELECT
63+
client,
64+
COUNT(DISTINCT page) AS total_pages
65+
FROM `httparchive.crawl.pages`
66+
WHERE date = '2024-06-01' AND
67+
is_root_page
68+
GROUP BY client
7969
)
8070

8171
-- Count the number of websites with bounce tracking per bounce hostname
8272
SELECT
8373
client,
84-
NET.HOST(navigation_redirect_location) AS bounce_hostname,
85-
COUNT(DISTINCT page) AS number_of_pages
86-
--ARRAY_AGG(page LIMIT 2) AS page_examples
74+
bounce_hostname,
75+
number_of_pages,
76+
number_of_pages / total_pages AS pct_pages
8777
FROM bounce_sequences
88-
GROUP BY client, bounce_hostname
78+
JOIN pages_total
79+
USING (client)
8980
ORDER BY number_of_pages DESC
9081
LIMIT 100

0 commit comments

Comments
 (0)