Skip to content

Commit ddb33bb

Browse files
authored
Update outgoing_links_by_rank -2024.sql (#3891)
1 parent b7aeb29 commit ddb33bb

1 file changed

Lines changed: 55 additions & 61 deletions

File tree

Lines changed: 55 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -1,76 +1,70 @@
11
#standardSQL
22
# Internal and external link metrics by quantile and rank
3-
4-
CREATE TEMPORARY FUNCTION getOutgoingLinkMetrics(payload STRING)
5-
RETURNS STRUCT<
6-
same_site INT64,
7-
same_property INT64,
8-
other_property INT64
9-
> LANGUAGE js AS '''
10-
var result = {same_site: 0,
11-
same_property: 0,
12-
other_property: 0};
13-
14-
try {
15-
var $ = JSON.parse(payload);
16-
var wpt_bodies = JSON.parse($._wpt_bodies);
17-
18-
if (!wpt_bodies){
19-
return result;
20-
}
21-
22-
var anchors = wpt_bodies.anchors;
23-
24-
if (anchors){
25-
result.same_site = anchors.rendered.same_site;
26-
result.same_property = anchors.rendered.same_property;
27-
result.other_property = anchors.rendered.other_property;
28-
}
29-
30-
} catch (e) {}
31-
32-
return result;
33-
''';
34-
353
WITH page_metrics AS (
364
SELECT
375
client,
386
page,
39-
getOutgoingLinkMetrics(payload) AS outgoing_link_metrics,
40-
JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies'), '$.is_root_page') AS is_root_page
41-
FROM
42-
`httparchive.all.pages`
7+
is_root_page,
8+
IF(rank <= rank_bucket, rank_bucket, NULL) AS rank,
9+
ANY_VALUE(custom_metrics.wpt_bodies.anchors) AS anchors
10+
FROM httparchive.crawl.pages,
11+
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_bucket
4312
WHERE
44-
DATE = '2024-06-01'
13+
date = '2024-06-01'
14+
GROUP BY
15+
client,
16+
page,
17+
is_root_page,
18+
rank
19+
HAVING rank IS NOT NULL
20+
), metric_details AS (
21+
SELECT
22+
client,
23+
is_root_page,
24+
percentile,
25+
rank,
26+
APPROX_QUANTILES(INT64(anchors.rendered.same_site), 1000)[OFFSET(percentile * 10)] AS outgoing_links_same_site,
27+
APPROX_QUANTILES(INT64(anchors.rendered.same_property), 1000)[OFFSET(percentile * 10)] AS outgoing_links_same_property,
28+
APPROX_QUANTILES(INT64(anchors.rendered.other_property), 1000)[OFFSET(percentile * 10)] AS outgoing_links_other_property
29+
FROM page_metrics,
30+
UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
31+
GROUP BY
32+
client,
33+
is_root_page,
34+
rank,
35+
percentile
36+
ORDER BY
37+
client,
38+
is_root_page,
39+
rank,
40+
percentile
41+
), page_counts AS (
42+
SELECT
43+
client,
44+
is_root_page,
45+
rank,
46+
COUNT(DISTINCT page) AS total_pages
47+
FROM page_metrics
48+
GROUP BY
49+
client,
50+
is_root_page,
51+
rank
4552
)
4653

4754
SELECT
4855
client,
49-
CASE
50-
WHEN is_root_page = 'false' THEN 'Secondary Page'
51-
ELSE 'Homepage'
52-
END AS page_type,
56+
is_root_page,
57+
rank,
58+
total_pages,
5359
percentile,
54-
rank_grouping,
55-
CASE
56-
WHEN rank_grouping = 100000000 THEN 'all'
57-
ELSE FORMAT("%'d", rank_grouping)
58-
END AS ranking,
59-
COUNT(DISTINCT page) AS pages,
60-
APPROX_QUANTILES(outgoing_link_metrics.same_site, 1000)[OFFSET(percentile * 10)] AS outgoing_links_same_site,
61-
APPROX_QUANTILES(outgoing_link_metrics.same_property, 1000)[OFFSET(percentile * 10)] AS outgoing_links_same_property,
62-
APPROX_QUANTILES(outgoing_link_metrics.other_property, 1000)[OFFSET(percentile * 10)] AS outgoing_links_other_property
63-
FROM
64-
page_metrics,
65-
UNNEST([10, 25, 50, 75, 90, 100]) AS percentile,
66-
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
67-
GROUP BY
68-
client,
69-
page_type,
70-
rank_grouping,
71-
percentile
60+
outgoing_links_same_site,
61+
outgoing_links_same_property,
62+
outgoing_links_other_property
63+
FROM metric_details
64+
LEFT JOIN page_counts
65+
USING (client, is_root_page, rank)
7266
ORDER BY
7367
client,
74-
page_type,
75-
rank_grouping,
68+
is_root_page,
69+
rank,
7670
percentile

0 commit comments

Comments
 (0)