Skip to content

Commit f084df4

Browse files
guacatunethewebgithub-actions[bot]
authored
Markup 2024 Chapter (#3815)
* Add 2024 markup queries * Update Markup 2024 queries * Update Markup queries * Update markup.md * Update markdown.md * Smart quotes * Generate chapter * Generate images * Optimised images with calibre/image-actions * Add author * Lint SQL * Retake hi-res images * Update contributors file * Final edits --------- Co-authored-by: Barry Pollard <barrypollard@google.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 51b5f66 commit f084df4

50 files changed

Lines changed: 1693 additions & 12 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

sql/2024/markup/attributes.sql

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#standardSQL
2+
# pages almanac metrics grouped by device and element attribute use (frequency)
3+
# Parses the `almanac` custom metric JSON and returns one {name, freq} entry
# per key of its `attributes_used_on_elements` object.
# Returns an empty array when the input cannot be parsed, is not a JSON
# object, or has no truthy `attributes_used_on_elements` property.
CREATE TEMPORARY FUNCTION get_almanac_attribute_info(almanac_string STRING)
RETURNS ARRAY<STRUCT<name STRING, freq INT64>> LANGUAGE js AS '''
  try {
    const parsed = JSON.parse(almanac_string);
    const isObject = typeof parsed == 'object' && !Array.isArray(parsed);
    const usage = isObject ? parsed.attributes_used_on_elements : null;

    if (usage) {
      const pairs = [];
      for (const name of Object.keys(usage)) {
        pairs.push({name: name, freq: usage[name]});
      }
      return pairs;
    }
  } catch (e) {
    // Any parse or property-access failure falls through to the empty result.
  }
  return [];
''';
20+
# Top 1000 (client, attribute name) pairs ordered by the attribute's share of
# all attribute occurrences for that client.
#   pages / total_pages / pct_pages: distinct pages using the attribute vs. all pages of the client
#   freq / total / pct_ratio: summed occurrences of the attribute vs. all attribute occurrences of the client
WITH totals AS (
  SELECT
    client,
    COUNT(*) AS total
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client
), attributes AS (
  SELECT
    client,
    attr.name,
    COUNT(DISTINCT page) AS pages,
    ANY_VALUE(total) AS total_pages,
    COUNT(DISTINCT page) / ANY_VALUE(total) AS pct_pages,
    SUM(attr.freq) AS freq,
    SUM(SUM(attr.freq)) OVER per_client AS total,
    SUM(attr.freq) / SUM(SUM(attr.freq)) OVER per_client AS pct_ratio
  FROM
    `httparchive.all.pages`
  INNER JOIN
    totals
  USING
    (client)
  CROSS JOIN
    UNNEST(get_almanac_attribute_info(JSON_EXTRACT(custom_metrics, '$.almanac'))) AS attr
  WHERE
    date = '2024-06-01'
  GROUP BY
    client,
    attr.name
  WINDOW
    per_client AS (PARTITION BY client)
)

SELECT
  client,
  name,
  pages,
  total_pages,
  pct_pages,
  freq,
  total,
  pct_ratio
FROM
  attributes
ORDER BY
  pct_ratio DESC
LIMIT
  1000

sql/2024/markup/buttons.sql

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
-- Returns the keys of `buttons.types` from the `markup` custom metric JSON.
-- Appends the pseudo-type 'NO_TYPE' when `buttons.total` is greater than the
-- sum of the per-type counts. Returns an empty array when the JSON cannot be
-- parsed or does not have the expected `buttons.types` shape.
CREATE TEMPORARY FUNCTION get_markup_buttons_info(markup_string STRING)
RETURNS ARRAY<STRING> LANGUAGE js AS '''
  try {
    const buttons = JSON.parse(markup_string).buttons;
    const names = Object.keys(buttons.types);

    let typedCount = 0;
    for (const name of names) {
      typedCount += buttons.types[name];
    }

    if (buttons.total > typedCount) {
      names.push('NO_TYPE');
    }
    return names;
  } catch (e) {
    return [];
  }
''';
15+
-- Top 1000 (client, button type) pairs by share of pages.
--   page: distinct pages reporting the (normalized) button type
--   total: all pages of the client on the same date
--   pct_pages: page / total
WITH totals AS (
  SELECT
    client,
    COUNT(0) AS total
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client
)

SELECT
  client AS client,
  LOWER(TRIM(raw_button_type)) AS button_type,
  COUNT(DISTINCT page) AS page,
  ANY_VALUE(total) AS total,
  COUNT(DISTINCT page) / ANY_VALUE(total) AS pct_pages
FROM
  `httparchive.all.pages`
JOIN
  totals
USING
  (client),
  -- The UNNEST alias is deliberately different from the output column name:
  -- with both called `button_type`, `GROUP BY button_type` could mean either
  -- the raw value or the normalized one.
  UNNEST(get_markup_buttons_info(JSON_EXTRACT(custom_metrics, '$.markup'))) AS raw_button_type
WHERE
  date = '2024-06-01'
GROUP BY
  client,
  button_type  -- the normalized SELECT alias
ORDER BY
  pct_pages DESC
LIMIT
  1000

sql/2024/markup/comments.sql

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
-- Per client: number and share of pages whose `wpt_bodies.raw_html` metric
-- reports a comment count (resp. conditional comment count) above zero.
WITH raw_bodies AS (
  SELECT
    client,
    JSON_EXTRACT(custom_metrics, '$.wpt_bodies') AS wpt_bodies_json
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
), comments AS (
  SELECT
    client,
    CAST(JSON_VALUE(wpt_bodies_json, '$.raw_html.comment_count') AS INT64) AS num_comments,
    CAST(JSON_VALUE(wpt_bodies_json, '$.raw_html.conditional_comment_count') AS INT64) AS num_conditional_comments
  FROM
    raw_bodies
)

SELECT
  client,
  COUNTIF(num_comments > 0) AS num_comments,
  COUNTIF(num_conditional_comments > 0) AS num_conditional_comments,
  COUNT(*) AS total,
  COUNTIF(num_comments > 0) / COUNT(*) AS pct_comments,
  COUNTIF(num_conditional_comments > 0) / COUNT(*) AS pct_conditional_comments
FROM
  comments
GROUP BY
  client
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
-- Returns the value of one `content-encoding` response header (header name
-- matched case-insensitively), or NULL when no such header is present.
-- With several matching headers, which one is returned is not specified
-- (LIMIT 1 without ORDER BY).
CREATE TEMPORARY FUNCTION GET_CONTENT_ENCODING(response_headers ARRAY<STRUCT<name STRING, value STRING>>)
RETURNS STRING AS ((
  SELECT
    header.value
  FROM
    UNNEST(response_headers) AS header
  WHERE
    LOWER(header.name) = 'content-encoding'
  LIMIT 1
));
14+
-- Distribution of the `content-encoding` response header over main-document
-- requests, per crawl date and client.
--   freq: main documents with this content encoding
--   total: all main documents for the same date and client
--   pct: freq / total
SELECT
  date,
  client,
  GET_CONTENT_ENCODING(response_headers) AS content_encoding,
  COUNT(0) AS freq,
  SUM(COUNT(0)) OVER (PARTITION BY date, client) AS total,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY date, client) AS pct
FROM
  `httparchive.all.requests`
WHERE
  date = '2024-06-01' AND
  is_main_document
GROUP BY
  -- `date` is selected, so it must be a grouping key; BigQuery rejects a
  -- SELECT-list column that is neither grouped nor aggregated.
  date,
  client,
  content_encoding
ORDER BY
  pct DESC
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
-- Per crawl date and client: number and share of pages whose `wpt_bodies`
-- metric lists at least one name under
-- `web_components.rendered.customElements.names`.
WITH custom_elements AS (
  SELECT
    date,  -- must be projected here: the outer query selects and groups by it
    client,
    page,
    COALESCE(ARRAY_LENGTH(JSON_VALUE_ARRAY(JSON_EXTRACT(custom_metrics, '$.wpt_bodies'), '$.web_components.rendered.customElements.names')) > 0, FALSE) AS has_custom_elements
  FROM
    `httparchive.all.pages`
  WHERE
    date IN ('2022-06-01', '2023-06-01', '2024-06-01')
)

SELECT
  date,
  client,
  COUNT(0) AS total,
  COUNTIF(has_custom_elements) AS freq,
  COUNTIF(has_custom_elements) / COUNT(0) AS pct_custom_elements
FROM
  custom_elements
GROUP BY
  date, client
ORDER BY
  date ASC
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
-- JavaScript weight (KB, from `summary.bytesJS` / 1024) at the 10th, 25th,
-- 50th, 75th and 90th percentiles, split by client and by whether the page
-- lists any custom element names in its `wpt_bodies` metric.
WITH js_bytes AS (
  SELECT
    client,
    page,
    SAFE_CAST(JSON_EXTRACT(summary, '$.bytesJS') AS INT64) / 1024 AS kbytes_js
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
), custom_elements AS (
  SELECT
    client,
    page,
    IFNULL(
      ARRAY_LENGTH(
        JSON_VALUE_ARRAY(
          JSON_EXTRACT(custom_metrics, '$.wpt_bodies'),
          '$.web_components.rendered.customElements.names'
        )
      ) > 0,
      FALSE
    ) AS has_custom_elements
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

SELECT
  percentile,
  client,
  has_custom_elements,
  -- 1000 quantile buckets, so percentile p sits at offset p * 10.
  APPROX_QUANTILES(kbytes_js, 1000)[OFFSET(percentile * 10)] AS kbytes_js,
  COUNT(DISTINCT page) AS pages
FROM
  custom_elements
INNER JOIN
  js_bytes
USING
  (client, page)
CROSS JOIN
  UNNEST([10, 25, 50, 75, 90]) AS percentile
GROUP BY
  percentile,
  client,
  has_custom_elements
ORDER BY
  percentile,
  client,
  has_custom_elements
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
-- Parses the `almanac` custom metric JSON and returns one {name, freq} entry
-- for every key of `attributes_used_on_elements` that starts with 'data-'.
-- Returns an empty array when the input cannot be parsed, is not a JSON
-- object, or has no truthy `attributes_used_on_elements` property.
CREATE TEMPORARY FUNCTION get_almanac_attribute_info(almanac_string STRING)
RETURNS ARRAY<STRUCT<name STRING, freq INT64>> LANGUAGE js AS '''
  try {
    const parsed = JSON.parse(almanac_string);

    if (typeof parsed == 'object' && !Array.isArray(parsed) && parsed.attributes_used_on_elements) {
      const usage = parsed.attributes_used_on_elements;
      const dataAttrs = [];
      for (const name of Object.keys(usage)) {
        if (name.startsWith('data-')) {
          dataAttrs.push({name: name, freq: usage[name]});
        }
      }
      return dataAttrs;
    }
  } catch (e) {}
  return [];
''';
15+
-- Per client: number and share of pages for which
-- get_almanac_attribute_info returns at least one entry.
WITH totals AS (
  SELECT
    client,
    COUNT(*) AS total_pages
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client
)

SELECT
  client,
  COUNT(DISTINCT page) AS pages,
  ANY_VALUE(total_pages) AS total_pages,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages
FROM
  `httparchive.all.pages`
INNER JOIN
  totals
USING
  (client)
WHERE
  date = '2024-06-01' AND
  -- Keeps exactly the pages that would yield at least one row when the
  -- function result is unnested.
  ARRAY_LENGTH(get_almanac_attribute_info(JSON_EXTRACT(custom_metrics, '$.almanac'))) > 0
GROUP BY
  client
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
-- Parses the `almanac` custom metric JSON and returns one {name, freq} entry
-- for every key of `attributes_used_on_elements` that starts with 'data-'.
-- Returns an empty array when the input cannot be parsed, is not a JSON
-- object, or has no truthy `attributes_used_on_elements` property.
CREATE TEMPORARY FUNCTION get_almanac_attribute_info(almanac_string STRING)
RETURNS ARRAY<STRUCT<name STRING, freq INT64>> LANGUAGE js AS '''
  try {
    const parsed = JSON.parse(almanac_string);

    if (typeof parsed == 'object' && !Array.isArray(parsed) && parsed.attributes_used_on_elements) {
      const usage = parsed.attributes_used_on_elements;
      const dataAttrs = [];
      for (const name of Object.keys(usage)) {
        if (name.startsWith('data-')) {
          dataAttrs.push({name: name, freq: usage[name]});
        }
      }
      return dataAttrs;
    }
  } catch (e) {}
  return [];
''';
15+
-- Top 1000 (client, attribute name) pairs returned by
-- get_almanac_attribute_info, ordered by each attribute's share of all
-- returned attribute occurrences for that client.
--   pages / total_pages / pct_pages: distinct pages using the attribute vs. all pages of the client
--   freq / total / pct_ratio: summed occurrences of the attribute vs. all returned occurrences of the client
WITH totals AS (
  SELECT
    client,
    COUNT(*) AS total_pages
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client
), data_attrs AS (
  SELECT
    client,
    attr.name,
    COUNT(DISTINCT page) AS pages,
    ANY_VALUE(total_pages) AS total_pages,
    COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages,
    SUM(attr.freq) AS freq, # total count from all pages
    SUM(SUM(attr.freq)) OVER per_client AS total,
    SUM(attr.freq) / SUM(SUM(attr.freq)) OVER per_client AS pct_ratio
  FROM
    `httparchive.all.pages`
  INNER JOIN
    totals
  USING
    (client)
  CROSS JOIN
    UNNEST(get_almanac_attribute_info(JSON_EXTRACT(custom_metrics, '$.almanac'))) AS attr
  WHERE
    date = '2024-06-01'
  GROUP BY
    client,
    attr.name
  WINDOW
    per_client AS (PARTITION BY client)
)

SELECT
  client,
  name,
  pages,
  total_pages,
  pct_pages,
  freq,
  total,
  pct_ratio
FROM
  data_attrs
ORDER BY
  pct_ratio DESC
LIMIT
  1000

sql/2024/markup/distinct_lang.sql

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
-- Per client: number of distinct `lang` values found at
-- `almanac.html_node.lang`, compared case-insensitively and ignoring
-- surrounding whitespace. Pages without a value count as one '(not set)' bucket.
WITH langs AS (
  SELECT
    client,
    -- JSON_VALUE returns the unquoted scalar. JSON_EXTRACT would return the
    -- JSON-quoted text (e.g. "en "), whose quotes stop TRIM from removing
    -- whitespace around the value.
    TRIM(LOWER(JSON_VALUE(custom_metrics, '$.almanac.html_node.lang'))) AS lang
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  COUNT(DISTINCT IFNULL(lang, '(not set)')) AS distinct_lang_count
FROM
  langs
GROUP BY
  client
ORDER BY
  distinct_lang_count DESC;

sql/2024/markup/doctype.sql

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
-- Returns the `doctype` field of the page summary JSON as a plain string,
-- or NULL when the field is absent.
-- JSON_VALUE is used instead of JSON_EXTRACT because JSON_EXTRACT already
-- returns a STRING holding the JSON-quoted text (e.g. "html"); casting that
-- to STRING keeps the quotes, so callers cannot TRIM whitespace around the value.
CREATE TEMPORARY FUNCTION EXTRACT_DOCTYPE(summary STRING) RETURNS STRING AS (
  JSON_VALUE(summary, '$.doctype')
);
5+
-- Top 100 (client, doctype) pairs by share of the client's pages.
--   pages: pages with this normalized doctype
--   total: all pages of the client
--   pct_pages: pages / total
WITH normalized AS (
  SELECT
    client,
    # remove extra spaces and make lower case
    LOWER(REGEXP_REPLACE(TRIM(EXTRACT_DOCTYPE(summary)), r' +', ' ')) AS doctype
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  doctype,
  COUNT(*) AS pages,
  SUM(COUNT(*)) OVER (PARTITION BY client) AS total,
  COUNT(*) / SUM(COUNT(*)) OVER (PARTITION BY client) AS pct_pages
FROM
  normalized
GROUP BY
  client,
  doctype
ORDER BY
  pct_pages DESC
LIMIT
  100;

0 commit comments

Comments
 (0)