Skip to content

Commit 8800cfc

Browse files
committed
Merge branch 'main' of github.com:HTTPArchive/almanac.httparchive.org into production
2 parents 118947a + 9855e25 commit 8800cfc

15 files changed

Lines changed: 322 additions & 23 deletions

sql/2024/cookies/0_extract_cookies.sql

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,11 @@ WITH intermediate_cookie AS (
4444
page,
4545
root_page,
4646
rank,
47-
JSON_VALUE(summary, '$.startedDateTime') AS startedDateTime,
47+
payload.startedDateTime AS startedDateTime,
4848
cookie
4949
FROM
50-
`httparchive.all.pages`,
51-
UNNEST(JSON_EXTRACT_ARRAY(custom_metrics, '$.cookies')) AS cookie
50+
`httparchive.crawl.pages`,
51+
UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.cookies)) AS cookie
5252
WHERE
5353
date = '2024-06-01'
5454
)
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
CREATE TEMP FUNCTION findAllInitiators(rootPage STRING, data ARRAY<STRUCT<root_page STRING, third_party STRING, initiator_etld STRING>>)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
  /**
   * Walks `rows` starting at `currentPage` and accumulates initiator_etld values.
   *
   * A row contributes when its root_page equals `currentPage` and its
   * initiator_etld is not yet in `seen`. Every newly found initiator is then
   * used as the next `currentPage`, so the walk continues until nothing new
   * turns up. The returned array may contain duplicates (two rows can carry the
   * same initiator within a single pass); the caller removes them.
   *
   * NOTE(review): the query in this file passes rows that all share one
   * root_page and has already dropped rows where initiator_etld equals that
   * root_page, so the recursive step looks like it can never match anything
   * there. Confirm whether matching on third_party was intended.
   */
  function collectInitiators(currentPage, seen, rows) {
    const fresh = [];
    for (const row of rows) {
      if (row.root_page === currentPage && !seen.includes(row.initiator_etld)) {
        fresh.push(row.initiator_etld);
      }
    }

    // Record this level's finds before descending so deeper levels skip them.
    let accumulated = [...seen, ...fresh];
    for (const initiator of fresh) {
      accumulated = collectInitiators(initiator, accumulated, rows);
    }

    return accumulated;
  }

  // A Set keeps the first occurrence of each value, in insertion order.
  return [...new Set(collectInitiators(rootPage, [], data))];
""";
26+
27+
28+
29+
CREATE TEMP FUNCTION mean_depth_and_next_element_after_gtm(input_array ARRAY<STRING>)
RETURNS STRUCT<mean_depth FLOAT64, next_elements ARRAY<STRING>>
LANGUAGE js AS """
  /**
   * Finds every element that directly follows a "googletagmanager.com" entry.
   *
   * Input:   input_array - list of domain strings (may be NULL or empty).
   * Returns: { mean_depth, next_elements }
   *   - next_elements: the element at position i + 1 for every i where
   *     input_array[i] is exactly 'googletagmanager.com'.
   *   - mean_depth: NULL when next_elements is empty; otherwise the mean of
   *     (k + 2) over k = 0 .. next_elements.length - 1.
   *
   * NOTE(review): k is the index inside next_elements, not the position in
   * input_array, so mean_depth depends only on how many matches were found
   * ((n + 3) / 2). Confirm this is the intended definition of "depth".
   */
  const GTM_DOMAIN = 'googletagmanager.com';

  // Treat a NULL array like an empty one instead of throwing on `.length`.
  const chain = input_array || [];

  const nextElements = [];
  // Stop one short of the end: the last element has no successor to capture.
  for (let i = 0; i < chain.length - 1; i++) {
    if (chain[i] === GTM_DOMAIN) {
      nextElements.push(chain[i + 1]);
    }
  }

  // No GTM entry with a successor: signal it with a NULL mean_depth.
  if (nextElements.length === 0) {
    return { mean_depth: null, next_elements: [] };
  }

  // nextElements is non-empty from here on, so the division is always safe.
  const depthSum = nextElements.reduce((sum, _, idx) => sum + (idx + 2), 0);

  return { mean_depth: depthSum / nextElements.length, next_elements: nextElements };
""";
55+
56+
57+
-- For each client, counts how many root pages produce each distinct list of
-- initiators that directly follow googletagmanager.com in the page's
-- initiator list. Pages without such a follower are excluded.
WITH third_party_requests AS (
  -- Requests whose registrable domain differs from the root page's.
  SELECT
    client,
    NET.REG_DOMAIN(root_page) AS root_page,
    NET.REG_DOMAIN(url) AS third_party,
    NET.REG_DOMAIN(JSON_VALUE(payload, '$._initiator')) AS initiator_etld
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01' AND
    NET.REG_DOMAIN(root_page) != NET.REG_DOMAIN(url)
),

data AS (
  -- TP interact with other tps: keep one row per distinct combination where
  -- the initiator is neither the requested third party nor the root page.
  -- Rows with a NULL initiator_etld drop out because the comparisons are not TRUE.
  SELECT
    client,
    root_page,
    third_party,
    initiator_etld
  FROM
    third_party_requests
  WHERE
    third_party != initiator_etld AND
    root_page != initiator_etld
  GROUP BY
    client,
    root_page,
    third_party,
    initiator_etld
),

page_initiators AS (
  -- One initiator list per (root_page, client).
  SELECT
    root_page,
    client,
    findAllInitiators(root_page, ARRAY_AGG(STRUCT(root_page, third_party, initiator_etld))) AS all_initiators
  FROM
    data
  GROUP BY
    root_page,
    client
),

gtm_followers AS (
  -- Wrapping the UDF call in a one-element array and unnesting it evaluates
  -- the function once per row while exposing both struct fields.
  SELECT
    client,
    result.next_elements AS next_elements_after_gtm
  FROM
    page_initiators
  CROSS JOIN
    UNNEST([mean_depth_and_next_element_after_gtm(all_initiators)]) AS result
  WHERE
    result.mean_depth IS NOT NULL
)

SELECT
  client,
  next_elements_after_gtm,
  COUNT(0) AS c
FROM
  gtm_followers
GROUP BY
  client,
  next_elements_after_gtm
ORDER BY
  c;

src/config/2024.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,7 @@
6363
"part": "I",
6464
"chapter_number": "8",
6565
"title": "Third Parties",
66-
"slug": "third-parties",
67-
"todo": true
66+
"slug": "third-parties"
6867
}
6968
]
7069
},

src/config/contributors.json

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -678,7 +678,8 @@
678678
"2024": [
679679
"developers",
680680
"editors",
681-
"committee"
681+
"committee",
682+
"reviewers"
682683
]
683684
},
684685
"twitter": "tunetheweb",
@@ -1014,9 +1015,11 @@
10141015
},
10151016
"ChrisBeeti": {
10161017
"github": "ChrisBeeti",
1017-
"name": "Chris Beeti",
1018+
"name": "Chris Böttger",
10181019
"teams": {
10191020
"2024": [
1021+
"analysts",
1022+
"authors",
10201023
"committee"
10211024
]
10221025
}
@@ -4428,9 +4431,11 @@
44284431
"name": "Tobias Urban",
44294432
"teams": {
44304433
"2024": [
4434+
"authors",
44314435
"committee"
44324436
]
4433-
}
4437+
},
4438+
"website": "https://internet-sicherheit.de/ueber-uns/team/alle-mitarbeiter/urban-tobias-2/"
44344439
},
44354440
"bobbyshaw": {
44364441
"avatar_url": "553566",
@@ -4690,6 +4695,19 @@
46904695
]
46914696
}
46924697
},
4698+
"Yash-Vekaria": {
4699+
"avatar_url": "30694521",
4700+
"github": "Yash-Vekaria",
4701+
"name": "Yash Vekaria",
4702+
"teams": {
4703+
"2024": [
4704+
"analysts",
4705+
"authors"
4706+
]
4707+
},
4708+
"twitter": "vekariayash",
4709+
"website": "https://yash-vekaria.github.io"
4710+
},
46934711
"yoavweiss": {
46944712
"avatar_url": "786187",
46954713
"github": "yoavweiss",
@@ -4811,6 +4829,17 @@
48114829
]
48124830
}
48134831
},
4832+
"zubairshafiq": {
4833+
"github": "zubairshafiq",
4834+
"name": "Zubair Shafiq",
4835+
"teams": {
4836+
"2024": [
4837+
"authors"
4838+
]
4839+
},
4840+
"twitter": "zubair_shafiq",
4841+
"website": "http://www.cs.ucdavis.edu/~zubair"
4842+
},
48144843
"Zuckjet": {
48154844
"avatar_url": "17976139",
48164845
"github": "Zuckjet",

src/config/last_updated.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -852,9 +852,9 @@
852852
"hash": "bb49d876d3e33811819746edc96ed447"
853853
},
854854
"en/2024/chapters/third-parties.html": {
855-
"date_published": "2024-11-11T00:00:00.000Z",
856-
"date_modified": "2024-11-16T00:00:00.000Z",
857-
"hash": "124fe4e80189dd401c4f4d0bfeb361dd"
855+
"date_published": "2024-11-21T00:00:00.000Z",
856+
"date_modified": "2024-11-21T00:00:00.000Z",
857+
"hash": "075bec99b73be68c6fa7b97b97808182"
858858
},
859859
"en/2024/chapters/webassembly.html": {
860860
"date_published": "2024-11-11T00:00:00.000Z",

0 commit comments

Comments
 (0)