#standardSQL
# Internal and external link metrics by quantile and rank
#
# For every client, page type (is_root_page) and rank bucket this query reports:
#   * total_pages: the number of distinct pages in that group, and
#   * the 10th/25th/50th/75th/90th/100th percentiles of the rendered outgoing
#     anchor counts (same_site, same_property, other_property) read from
#     custom_metrics.wpt_bodies.anchors.
# Only the crawl dated 2024-06-01 is considered.

WITH page_metrics AS (
  # One row per (client, page, is_root_page, rank bucket).
  # Buckets are cumulative: a page is emitted once for every bucket whose
  # threshold is >= the page's rank. Filtering in WHERE (instead of building a
  # NULL bucket and discarding it in HAVING) also avoids reusing the name
  # `rank` for both the source column and the grouping alias.
  SELECT
    client,
    page,
    is_root_page,
    rank_bucket AS rank,
    ANY_VALUE(custom_metrics.wpt_bodies.anchors) AS anchors
  FROM httparchive.crawl.pages
  CROSS JOIN UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_bucket
  WHERE
    date = '2024-06-01' AND
    rank <= rank_bucket
  GROUP BY
    client,
    page,
    is_root_page,
    rank_bucket
),

metric_details AS (
  # Percentiles of outgoing link counts per client / page type / rank bucket.
  # APPROX_QUANTILES(x, 1000) yields 1001 boundaries (offsets 0..1000), so
  # percentile p lives at offset p * 10.
  SELECT
    client,
    is_root_page,
    rank,
    percentile,
    APPROX_QUANTILES(INT64(anchors.rendered.same_site), 1000)[OFFSET(percentile * 10)] AS outgoing_links_same_site,
    APPROX_QUANTILES(INT64(anchors.rendered.same_property), 1000)[OFFSET(percentile * 10)] AS outgoing_links_same_property,
    APPROX_QUANTILES(INT64(anchors.rendered.other_property), 1000)[OFFSET(percentile * 10)] AS outgoing_links_other_property
  FROM page_metrics
  CROSS JOIN UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
  GROUP BY
    client,
    is_root_page,
    rank,
    percentile
),

page_counts AS (
  # Number of distinct pages behind each client / page type / rank bucket.
  # Computed separately so the percentile fan-out above cannot inflate it.
  SELECT
    client,
    is_root_page,
    rank,
    COUNT(DISTINCT page) AS total_pages
  FROM page_metrics
  GROUP BY
    client,
    is_root_page,
    rank
)

SELECT
  client,
  is_root_page,
  rank,
  total_pages,
  percentile,
  outgoing_links_same_site,
  outgoing_links_same_property,
  outgoing_links_other_property
FROM metric_details
LEFT JOIN page_counts
USING (client, is_root_page, rank)
ORDER BY
  client,
  is_root_page,
  rank,
  percentile