Skip to content

Commit 083de67

Browse files
henryp25 and tunetheweb
authored
SEO 2024 queries (#3791)
* Uploading Image loading property usage SQL * Added SQL files for SEO analysis in 2024 * Linting * Update sql/2024/seo/robots-text-size-2024.sql Co-authored-by: Barry Pollard <barrypollard@google.com> * Update sql/2024/seo/image-loading-property-usage-2024.sql Co-authored-by: Barry Pollard <barrypollard@google.com> * Update sql/2024/seo/lighthouse-seo-stats-2024.sql Co-authored-by: Barry Pollard <barrypollard@google.com> * Linting --------- Co-authored-by: Barry Pollard <barrypollard@google.com>
1 parent 7a80150 commit 083de67

32 files changed

Lines changed: 2490 additions & 0 deletions
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#standardSQL
# Anchor rel attribute usage
# This query reports if a rel attribute value was ever used on a page, and calculates various statistics.

# Extracts the set of rel attribute values seen on rendered anchors for a page.
CREATE TEMPORARY FUNCTION getRelStatsWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
  rel ARRAY<STRING>
> LANGUAGE js AS '''
var result = {rel: []};
// Function to retrieve only keys if value is >0
function getKey(dict){
  const arr = [],
    obj = Object.keys(dict);
  for (var x in obj){
    if(dict[obj[x]] > 0){
      arr.push(obj[x]);
    }
  }
  return arr;
}
try {
  var wpt_bodies = JSON.parse(wpt_bodies_string);
  if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;
  if (wpt_bodies.anchors && wpt_bodies.anchors.rendered && wpt_bodies.anchors.rendered.rel_attributes) {
    result.rel = getKey(wpt_bodies.anchors.rendered.rel_attributes);
  }
} catch (e) {}
return result;
''';

WITH rel_stats_table AS (
  SELECT
    client,
    root_page,
    page,
    CASE
      WHEN is_root_page = FALSE THEN 'Secondarypage'
      WHEN is_root_page = TRUE THEN 'Homepage'
      ELSE 'No Assigned Page'
    END
    AS is_root_page,
    getRelStatsWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS wpt_bodies_info
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  is_root_page,
  rel,
  COUNT(DISTINCT page) AS sites,
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  -- Fix: count DISTINCT pages in the numerator (the original used COUNT(0),
  -- i.e. raw rows) so pct is consistent with sites/total when the same page
  -- appears in more than one row.
  COUNT(DISTINCT page) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
  rel_stats_table,
  UNNEST(wpt_bodies_info.rel) AS rel
GROUP BY
  client,
  is_root_page,
  rel
ORDER BY
  sites DESC,
  rel,
  client DESC;
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#standardSQL
# Anchor same site occurrence stats
# This query aims to highlight sites with few same-site links, like SPAs.

# Extracts same-site and dynamic-navigation link counts from the _wpt_bodies payload.
CREATE TEMPORARY FUNCTION getLinkDescriptionsWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
  links_same_site INT64,
  links_window_location INT64,
  links_window_open INT64,
  links_href_javascript INT64
> LANGUAGE js AS '''
var result = {
  links_same_site: 0,
  links_window_location: 0,
  links_window_open: 0,
  links_href_javascript: 0
};
try {
  var wpt_bodies = JSON.parse(wpt_bodies_string);

  if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;

  if (wpt_bodies.anchors && wpt_bodies.anchors.rendered) {
    var anchors_rendered = wpt_bodies.anchors.rendered;

    result.links_same_site = anchors_rendered.same_site || 0;

    // Fix: guard the nested path. The original accessed
    // same_page.dynamic.onclick_attributes unguarded, so a payload missing
    // an intermediate object threw, and the catch block silently discarded
    // the remaining counters for that page.
    var dynamic = (anchors_rendered.same_page || {}).dynamic || {};
    var onclick_attributes = dynamic.onclick_attributes || {};

    result.links_window_location = onclick_attributes.window_location || 0;
    result.links_window_open = onclick_attributes.window_open || 0;
    result.links_href_javascript = dynamic.href_javascript || 0;
  }

} catch (e) {}
return result;
''';

WITH same_links_info AS (
  SELECT
    client,
    root_page,
    page,
    CASE
      WHEN is_root_page = FALSE THEN 'Secondarypage'
      WHEN is_root_page = TRUE THEN 'Homepage'
      ELSE 'No Assigned Page'
    END
    AS is_root_page,
    getLinkDescriptionsWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS wpt_bodies_info
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  wpt_bodies_info.links_same_site AS links_same_site,
  is_root_page,
  COUNT(DISTINCT page) AS sites, -- Counting all occurrences of links_same_site
  SAFE_DIVIDE(COUNT(0), COUNT(DISTINCT page)) AS pct_links_same_site, -- Percentage of same-site links
  AVG(wpt_bodies_info.links_window_location) AS avg_links_window_location,
  AVG(wpt_bodies_info.links_window_open) AS avg_links_window_open,
  AVG(wpt_bodies_info.links_href_javascript) AS avg_links_href_javascript,
  AVG(wpt_bodies_info.links_window_location + wpt_bodies_info.links_window_open + wpt_bodies_info.links_href_javascript) AS avg_links_any,
  MAX(wpt_bodies_info.links_window_location + wpt_bodies_info.links_window_open + wpt_bodies_info.links_href_javascript) AS max_links_any,
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct -- Secondary page percentage within group
FROM
  same_links_info
GROUP BY
  client,
  is_root_page,
  wpt_bodies_info.links_same_site
ORDER BY
  links_same_site ASC;
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#standardSQL
# Content-language usage via <meta http-equiv="content-language"> tags.
# Returns one row per (client, page type, language value); sentinel values
# "NO PAYLOAD", "NO TAG" and "ERROR ..." flag pages without usable data.
CREATE TEMPORARY FUNCTION getContentLanguagesAlmanac(almanac_string STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
var result = [];
try {
  var almanac = JSON.parse(almanac_string);

  if (Array.isArray(almanac) || typeof almanac != 'object') return ["NO PAYLOAD"];

  if (almanac && almanac["meta-nodes"] && almanac["meta-nodes"].nodes && almanac["meta-nodes"].nodes.filter) {
    result = almanac["meta-nodes"].nodes.filter(n => n["http-equiv"] && n["http-equiv"].toLowerCase().trim() == 'content-language' && n.content).map(am => am.content.toLowerCase().trim());
  }

  if (result.length === 0)
    result.push("NO TAG");

} catch (e) {result.push("ERROR "+e);} // results show some issues with the validity of the payload
return result;
''';
WITH content_language_usage AS (
  SELECT
    client,
    root_page,
    page,
    CASE
      WHEN is_root_page = FALSE THEN 'Secondarypage'
      WHEN is_root_page = TRUE THEN 'Homepage'
      ELSE 'No Assigned Page'
    END AS is_root_page,
    getContentLanguagesAlmanac(JSON_EXTRACT_SCALAR(payload, '$._almanac')) AS content_languages
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)
SELECT
  client,
  is_root_page,
  content_language,
  COUNT(DISTINCT page) AS sites,
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  -- Fix: count DISTINCT pages in the numerator (the original used COUNT(0)),
  -- since a page can emit the same language value more than once and would
  -- otherwise be double-counted relative to sites/total.
  COUNT(DISTINCT page) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
  content_language_usage,
  UNNEST(content_languages) AS content_language
GROUP BY
  client,
  is_root_page,
  content_language
ORDER BY
  sites DESC,
  client DESC;
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Monthly rates of origins with "good" Core Web Vitals, per device,
# from the CrUX device_summary materialized table.

# An origin passes a metric when at least 75% of its experiences are "good".
CREATE TEMP FUNCTION IS_GOOD (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
  good / (good + needs_improvement + poor) >= 0.75
);
# An origin has data for a metric when any experiences were recorded at all.
CREATE TEMP FUNCTION IS_NON_ZERO (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
  good + needs_improvement + poor > 0
);
SELECT
  date,
  device,
  # Origins with good LCP divided by origins with any LCP.
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)),
    COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))) AS pct_good_lcp,
  # Origins with good FID divided by origins with any FID.
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(IS_GOOD(fast_fid, avg_fid, slow_fid), origin, NULL)),
    COUNT(DISTINCT IF(IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL))) AS pct_good_fid,
  # Origins with good CLS divided by origins with any CLS.
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)),
    COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))) AS pct_good_cls,
  # Origins with good LCP, FID, and CLS divided by origins with any LCP, FID, and CLS.
  # FID is treated as optional: "IS NOT FALSE" lets origins with no FID data
  # (IS_GOOD returns NULL) still count as passing, which is why the
  # denominator below requires LCP and CLS data but deliberately omits FID.
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(
      IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND
      IS_GOOD(fast_fid, avg_fid, slow_fid) IS NOT FALSE AND
      IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)),
    COUNT(DISTINCT IF(
      IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND
      IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))) AS pct_good_cwv
FROM
  `chrome-ux-report.materialized.device_summary`
WHERE
  date BETWEEN '2019-11-01' AND '2024-06-01' AND
  device IN ('desktop', 'phone')
GROUP BY
  date,
  device
ORDER BY
  date DESC
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#standardSQL
# hreflang header usage

# Returns all the data we need from _wpt_bodies
CREATE TEMPORARY FUNCTION getHreflangWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
  hreflangs ARRAY<STRING>
> LANGUAGE js AS '''
var result = {
  hreflangs: []
};

try {
  var wpt_bodies = JSON.parse(wpt_bodies_string);

  if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;

  if (wpt_bodies.hreflangs && wpt_bodies.hreflangs.http_header && wpt_bodies.hreflangs.http_header.values) {
    result.hreflangs = wpt_bodies.hreflangs.http_header.values.map(v => v); // seems to fix a coercion issue!
  }

} catch (e) {}
return result;
''';

WITH hreflang_usage AS (
  SELECT
    client,
    root_page,
    page,
    CASE
      WHEN is_root_page = FALSE THEN 'Secondarypage'
      WHEN is_root_page = TRUE THEN 'Homepage'
      ELSE 'No Assigned Page'
    END AS is_root_page,
    getHreflangWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS hreflang_wpt_bodies_info
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'

)
SELECT
  client,
  is_root_page,
  -- Fix: group on the normalized value (the original grouped on the raw
  -- value while displaying the normalized one, splitting counts for values
  -- differing only in case/encoding across duplicate output rows).
  NORMALIZE_AND_CASEFOLD(raw_hreflang) AS hreflang,
  COUNT(DISTINCT page) AS sites,
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  -- DISTINCT pages in the numerator keeps pct consistent with sites/total.
  COUNT(DISTINCT page) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
  hreflang_usage,
  UNNEST(hreflang_wpt_bodies_info.hreflangs) AS raw_hreflang
GROUP BY
  hreflang,
  client,
  is_root_page
ORDER BY
  sites DESC,
  client DESC;
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#standardSQL
# hreflang link tag usage

# Returns all the data we need from _wpt_bodies
CREATE TEMPORARY FUNCTION getHreflangWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
  hreflangs ARRAY<STRING>
> LANGUAGE js AS '''
var result = {
  hreflangs: []
};

try {
  var wpt_bodies = JSON.parse(wpt_bodies_string);

  if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;

  if (wpt_bodies.hreflangs && wpt_bodies.hreflangs.rendered && wpt_bodies.hreflangs.rendered.values) {
    result.hreflangs = wpt_bodies.hreflangs.rendered.values.map(v => v); // seems to fix a coercion issue!
  }

} catch (e) {}
return result;
''';

WITH link_tag AS (
  SELECT
    client,
    root_page,
    page,
    CASE
      WHEN is_root_page = FALSE THEN 'Secondarypage'
      WHEN is_root_page = TRUE THEN 'Homepage'
      ELSE 'No Assigned Page'
    END AS is_root_page,
    getHreflangWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS hreflang_wpt_bodies_info
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)
SELECT
  client,
  is_root_page,
  -- Fix: group on the normalized value (the original grouped on the raw
  -- value while displaying the normalized one, splitting counts for values
  -- differing only in case/encoding across duplicate output rows).
  NORMALIZE_AND_CASEFOLD(raw_hreflang) AS hreflang,
  COUNT(DISTINCT page) AS sites,
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  -- DISTINCT pages in the numerator keeps pct consistent with sites/total.
  COUNT(DISTINCT page) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
  link_tag,
  UNNEST(hreflang_wpt_bodies_info.hreflangs) AS raw_hreflang
GROUP BY
  hreflang,
  is_root_page,
  client
ORDER BY
  -- Order by sites first (matching the sibling hreflang header query);
  -- ordering by client alone left row order non-deterministic.
  sites DESC,
  client DESC;
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Request header usage by page type for the 2024-06-01 crawl.
WITH subquery AS (
  SELECT
    client,
    page,
    request_headers,
    CASE
      WHEN is_root_page = FALSE THEN 'Secondarypage'
      WHEN is_root_page = TRUE THEN 'Homepage'
      ELSE 'No Assigned Page'
    END AS is_root_page
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  is_root_page,
  header.name AS request_header_name,
  header.value AS request_header_value,
  COUNT(DISTINCT page) AS sites,
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  -- Fix: the original divided raw row counts by a grand total over the whole
  -- result set (OVER ()), which did not match the per-(client, is_root_page)
  -- `total` column above. Use distinct pages over the same partition so
  -- pct = sites / total within each group.
  SAFE_DIVIDE(COUNT(DISTINCT page), SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page)) AS pct
FROM
  subquery,
  UNNEST(request_headers) AS header
GROUP BY
  client,
  is_root_page,
  header.name,
  header.value
ORDER BY
  sites DESC,
  client;

0 commit comments

Comments
 (0)