|
| 1 | +-- Extract to the `httparchive.almanac.cookies` table the cookies that were set |
| 2 | +-- during the <DATE> crawl on <CLIENT>. Data in this table can then be queried |
| 3 | +-- more efficiently in consecutive queries without having to re-extract it every |
| 4 | +-- time. |
| 5 | + |
| 6 | + |
| 7 | +-- Code used by @tunetheweb to create the table |
| 8 | +-- see https://github.com/HTTPArchive/almanac.httparchive.org/pull/3741#discussion_r1823153262 |
| 9 | + |
| 10 | +-- CREATE TABLE `httparchive.almanac.cookies` |
| 11 | +-- ( |
| 12 | +-- date DATE, |
| 13 | +-- client STRING, |
| 14 | +-- page STRING, |
| 15 | +-- root_page STRING, |
| 16 | +-- rank INTEGER, |
| 17 | +-- startedDateTime STRING, |
| 18 | +-- firstPartyCookie BOOL, |
| 19 | +-- name STRING, |
| 20 | +-- domain STRING, |
| 21 | +-- path STRING, |
| 22 | +-- expires STRING, |
| 23 | +-- size STRING, |
| 24 | +-- httpOnly STRING, |
| 25 | +-- secure STRING, |
| 26 | +-- session STRING, |
| 27 | +-- sameSite STRING, |
| 28 | +-- sameParty STRING, |
| 29 | +-- partitionKey STRING, |
| 30 | +-- partitionKeyOpaque STRING |
| 31 | +-- ) |
| 32 | +-- PARTITION BY date |
| 33 | +-- CLUSTER BY |
| 34 | +-- client, rank, page |
| 35 | +-- AS |
| 36 | +-- ... |
| 37 | + |
-- Converts a date string into whole seconds since the Unix epoch.
-- Returns -1 when the string cannot be interpreted as a valid date.
CREATE TEMPORARY FUNCTION toTimestamp(date_string STRING)
RETURNS INT64
LANGUAGE js
AS '''
  try {
    // getTime() yields milliseconds; divide and round to get whole seconds.
    const epochSeconds = Math.round(new Date(date_string).getTime() / 1000);
    if (isNaN(epochSeconds)) {
      return -1;
    }
    return epochSeconds;
  } catch (e) {
    return -1;
  }
''';
| 47 | + |
-- Appends one row per cookie found in the `cookies` custom metric of every
-- page in the 2025-07-01 crawl to `httparchive.almanac.cookies`.
--
-- The target columns are listed explicitly so the statement does not depend on
-- the physical column order of the table.
--
-- NOTE(review): this is a plain INSERT, so running it more than once for the
-- same crawl date will append the same rows again.
INSERT INTO `httparchive.almanac.cookies` (
  date,
  client,
  page,
  root_page,
  rank,
  startedDateTime,
  firstPartyCookie,
  name,
  domain,
  path,
  expires,
  size,
  httpOnly,
  secure,
  session,
  sameSite,
  sameParty,
  partitionKey,
  partitionKeyOpaque
)
SELECT
  date,
  client,
  page,
  root_page,
  rank,
  -- Stored as the epoch-seconds value rendered as a string (-1 if unparseable).
  CAST(toTimestamp(JSON_VALUE(payload.startedDateTime)) AS STRING) AS startedDateTime,
  -- A cookie is first-party when the page host is the cookie's registrable
  -- domain itself (e.g. example.com) or one of its subdomains
  -- (e.g. www.example.com). The leading '.' in the suffix test prevents
  -- notexample.com from matching example.com.
  (
    NET.HOST(page) = NET.REG_DOMAIN(JSON_VALUE(cookie.domain)) OR
    ENDS_WITH(NET.HOST(page), '.' || NET.REG_DOMAIN(JSON_VALUE(cookie.domain)))
  ) AS firstPartyCookie,
  JSON_VALUE(cookie.name) AS name,
  JSON_VALUE(cookie.domain) AS domain,
  JSON_VALUE(cookie.path) AS path,
  JSON_VALUE(cookie.expires) AS expires,
  JSON_VALUE(cookie.size) AS size,
  JSON_VALUE(cookie.httpOnly) AS httpOnly,
  JSON_VALUE(cookie.secure) AS secure,
  JSON_VALUE(cookie.session) AS session,
  JSON_VALUE(cookie.sameSite) AS sameSite,
  JSON_VALUE(cookie.sameParty) AS sameParty,
  -- Serialize the JSON value as-is, mapping a JSON null to a SQL NULL.
  NULLIF(TO_JSON_STRING(cookie.partitionKey), 'null') AS partitionKey,
  NULLIF(TO_JSON_STRING(cookie.partitionKeyOpaque), 'null') AS partitionKeyOpaque
FROM
  `httparchive.crawl.pages`
CROSS JOIN
  UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.cookies)) AS cookie
WHERE
  date = DATE '2025-07-01';
0 commit comments