Skip to content

Commit fd87798

Browse files
Media 2024 queries (#3738)
* Add web almanac 2024 SQL queries * Update video_source_types.todo.sql Linter prefers ' * Update video_media.sql Fixing inconsistencies in linter * Update video_media.sql Linter * Update video_formats.sql removing TABLESAMPLE SYSTEM (0.001 PERCENT) --------- Co-authored-by: Mike Gifford <mike.gifford@civicactions.com>
1 parent 0a512e8 commit fd87798

58 files changed

Lines changed: 3018 additions & 0 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#standardSQL
2+
# what is the bpp of animated vs non-animated GIFs?
3+
# animated_gif_bpp.sql
4+
5+
6+
# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.29 million pixels, but BigQuery doesn't know that.
7+
CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING)
RETURNS INT64
LANGUAGE js AS r'''
  // Turns an ImageMagick pixel-count string (for example "1.29097M") into a
  // rounded whole number. Yields null for empty input or when the string does
  // not end in a number with an optional suffix.
  const SUFFIX_SCALE = { 'K': 1e3, 'M': 1e6, 'G': 1e9, 'T': 1e12 };

  if (!imageMagickNumberString) {
    return null;
  }

  const parts = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/);
  if (!parts || !parts[1]) {
    return null;
  }

  // A missing or unrecognised suffix leaves the number unscaled.
  const scale = SUFFIX_SCALE[parts[2]] || 1;
  return Math.round(parseFloat(parts[1]) * scale);
''';
28+
29+
# ImageMagick reports bytesizes in a friendly, human readable format. Just return bytes
30+
CREATE TEMPORARY FUNCTION magickBytes(imageMagickNumberString STRING)
RETURNS INT64
LANGUAGE js AS r'''
  // Converts an ImageMagick human-readable file size (for example "1.5KB")
  // into a byte count, using the decimal multiples listed below.
  //
  // Returns null when the size cannot be determined: missing/empty input, a
  // string that does not end in "<number><unit>", a unit that is not in the
  // table, or a numeric part that does not parse. Null is used instead of 0
  // or NaN so that callers dividing by pixel counts get NULL (ignored by
  // aggregates) rather than a bogus 0 or an invalid INT64.
  const multiples = {
    'B': 1,
    'KB': 1e3,
    'MB': 1e6,
    'GB': 1e9,
    'TB': 1e12
  };

  if (!imageMagickNumberString) {
    return null;
  }

  const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)$/);
  if (!matched) {
    return null;
  }

  // Own-property check: \w+ can capture digits or arbitrary words, and a
  // plain lookup would yield undefined (NaN after multiplying).
  const unit = matched[2];
  if (!Object.prototype.hasOwnProperty.call(multiples, unit)) {
    return null;
  }

  const bytes = Math.round(parseFloat(matched[1]) * multiples[unit]);
  return Number.isNaN(bytes) ? null : bytes;
''';
52+
53+
# One row per GIF response whose animation flag is known, with its bits per pixel.
WITH gifs AS (
  SELECT
    _TABLE_SUFFIX AS client,
    CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL) AS is_animated,
    # bytes * 8 = bits. The WHERE clause only keeps rows with more than one pixel,
    # so the divisor is never zero.
    (magickBytes(JSON_VALUE(payload, '$._image_details.magick.filesize')) * 8) /
    magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) AS bits_per_pixel
  FROM
    # Fully qualified so the query does not depend on the job's default project.
    `httparchive.requests.2024_06_01_*`
  WHERE
    JSON_VALUE(payload, '$._image_details.detected_type') = 'gif' AND
    magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1 AND
    JSON_VALUE(payload, '$._image_details.animated') IS NOT NULL
)

SELECT
  percentile,
  client,
  is_animated,
  # 1000 quantile buckets, so percentile p sits at offset p * 10.
  APPROX_QUANTILES(bits_per_pixel, 1000)[OFFSET(percentile * 10)] AS bpp
FROM
  gifs,
  UNNEST([10, 25, 50, 75, 90]) AS percentile
GROUP BY
  percentile,
  client,
  is_animated
ORDER BY
  percentile,
  client,
  is_animated
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#standardSQL
2+
# what percent of gifs are animated?
3+
# animated_gif_count.sql
4+
5+
# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.29 million pixels, but BigQuery doesn't know that.
6+
CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING)
RETURNS INT64
LANGUAGE js AS r'''
  // Turns an ImageMagick pixel-count string (for example "1.29097M") into a
  // rounded whole number. Yields null for empty input or when the string does
  // not end in a number with an optional suffix.
  const SUFFIX_SCALE = { 'K': 1e3, 'M': 1e6, 'G': 1e9, 'T': 1e12 };

  if (!imageMagickNumberString) {
    return null;
  }

  const parts = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/);
  if (!parts || !parts[1]) {
    return null;
  }

  // A missing or unrecognised suffix leaves the number unscaled.
  const scale = SUFFIX_SCALE[parts[2]] || 1;
  return Math.round(parseFloat(parts[1]) * scale);
''';
27+
28+
# Per client: how many GIF responses (more than one pixel) exist and what share is animated.
SELECT
  _TABLE_SUFFIX AS client,
  COUNT(0) AS total_gifs,
  COUNTIF(CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL)) AS total_animated_gifs,
  SAFE_DIVIDE(
    COUNTIF(CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL)),
    COUNT(0)
  ) AS pct_animated_gifs
FROM
  # Fully qualified so the query does not depend on the job's default project.
  `httparchive.requests.2024_06_01_*`
WHERE
  JSON_VALUE(payload, '$._image_details.detected_type') = 'gif' AND
  magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1
GROUP BY
  client
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#standardSQL
2+
# distribution of animated GIF framecounts
3+
# animated_gif_framecount.sql
4+
5+
# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.29 million pixels, but BigQuery doesn't know that.
6+
CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING)
RETURNS INT64
LANGUAGE js AS r'''
  // Turns an ImageMagick pixel-count string (for example "1.29097M") into a
  // rounded whole number. Yields null for empty input or when the string does
  // not end in a number with an optional suffix.
  const SUFFIX_SCALE = { 'K': 1e3, 'M': 1e6, 'G': 1e9, 'T': 1e12 };

  if (!imageMagickNumberString) {
    return null;
  }

  const parts = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/);
  if (!parts || !parts[1]) {
    return null;
  }

  // A missing or unrecognised suffix leaves the number unscaled.
  const scale = SUFFIX_SCALE[parts[2]] || 1;
  return Math.round(parseFloat(parts[1]) * scale);
''';
27+
28+
# One row per response that reports a GIF frame count and has more than one pixel.
WITH framecounts AS (
  SELECT
    _TABLE_SUFFIX AS client,
    CAST(JSON_VALUE(payload, '$._image_details.metadata.GIF.FrameCount') AS NUMERIC) AS framecount
  FROM
    # Fully qualified so the query does not depend on the job's default project.
    `httparchive.requests.2024_06_01_*`
  WHERE
    JSON_VALUE(payload, '$._image_details.metadata.GIF.FrameCount') IS NOT NULL AND
    magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1
)

SELECT
  percentile,
  client,
  # 1000 quantile buckets, so percentile p sits at offset p * 10 (100 -> the maximum).
  APPROX_QUANTILES(framecount, 1000)[OFFSET(percentile * 10)] AS framecount
FROM
  framecounts,
  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
GROUP BY
  percentile,
  client
ORDER BY
  percentile,
  client
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#standardSQL
2+
# What % of pages load at least one image?
3+
# ✅ at_least_one_image_request.sql
4+
5+
# Flag each page by whether it made any image request, then aggregate per client.
WITH pages AS (
  SELECT
    _TABLE_SUFFIX AS client,
    reqImg > 0 AS requested_image
  FROM
    `httparchive.summary_pages.2024_06_01_*`
)

SELECT
  client,
  COUNTIF(requested_image) AS atLeastOneImgReqCount,
  COUNT(0) AS total,
  SAFE_DIVIDE(COUNTIF(requested_image), COUNT(0)) AS atLeastOneImgReqPct
FROM
  pages
GROUP BY
  client
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#standardSQL
2+
# What % of pages have at least one <img>?
3+
# at_least_one_img.sql
4+
5+
CREATE TEMPORARY FUNCTION numberOfImages(images_string STRING)
RETURNS INT64
LANGUAGE js AS '''
  // Counts the entries of a JSON array of image records whose approximate
  // resource width AND height are both greater than 1.
  // Returns 0 when the input is null, is not valid JSON, or is not an array
  // (any exception while parsing/filtering is treated as "no images").
  try {
    return JSON.parse(images_string).filter(
      i =>
        parseInt(i.approximateResourceWidth) > 1 &&
        parseInt(i.approximateResourceHeight) > 1
    ).length;
  } catch {
    return 0;
  }
''';
14+
15+
# Flag each page by whether numberOfImages() found any image, then aggregate per client.
WITH page_images AS (
  SELECT
    _TABLE_SUFFIX AS client,
    numberOfImages(
      JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images')
    ) > 0 AS has_img
  FROM
    `httparchive.pages.2024_06_01_*`
)

SELECT
  client,
  COUNTIF(has_img) AS atLeastOneCount,
  COUNT(0) AS total,
  SAFE_DIVIDE(COUNTIF(has_img), COUNT(0)) AS atLeastOnePct
FROM
  page_images
GROUP BY
  client
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
#standardSQL
2+
# Measuring img loaded bytes and dimensions
3+
# bytes_and_dimensions.sql
4+
5+
CREATE TEMPORARY FUNCTION getSrcsetInfo(responsiveImagesJsonString STRING)
RETURNS ARRAY<STRUCT<imgURL STRING, approximateResourceWidth INT64, approximateResourceHeight INT64, byteSize INT64, bitsPerPixel NUMERIC, isPixel BOOL, isDataURL BOOL, resourceFormat STRING>>
LANGUAGE js AS '''
  // Parses a JSON array of image records and returns one struct per record
  // with its URL, floored dimensions and byte size, bits per pixel, a 1x1
  // "pixel" flag, a data-URL flag and a normalized format name.
  // Returns an empty array when the input is null, is not valid JSON, or is
  // not an array, so a single malformed payload cannot fail the whole query.

  // Maps an image to a short format name. Sources are tried in order:
  // data-URL mime type, content-type value, then the URL file extension.
  function pithyType( { contentType, url } ) {
    const subtypeMap = {
      'svg+xml': 'svg',
      'svgz': 'svg',
      'jpeg': 'jpg',
      'jfif': 'jpg',
      'x-png': 'png',
      'vnd.microsoft.icon': 'ico',
      'x-icon': 'ico',
      'jxr': 'jxr',
      'vnd.ms-photo': 'jxr',
      'hdp': 'jxr',
      'wdp': 'jxr',
      'jpf': 'jp2',
      'jpx': 'jp2',
      'jpm': 'jp2',
      'mj2': 'jp2',
      'x-jp2-container': 'jp2',
      'x-jp2-codestream': 'jp2',
      'x-jpeg2000-image': 'jp2',
      'heic': 'heif',
      'x-ms-bmp': 'bmp',
      'x-pict': 'pict',
      'tif': 'tiff',
      'x-tif': 'tiff',
      'x-tiff': 'tiff',
      'vnd.mozilla.apng': 'apng',
      // identities
      'apng': 'apng',
      'jpg': 'jpg',
      'jp2': 'jp2',
      'png': 'png',
      'gif': 'gif',
      'ico': 'ico',
      'webp': 'webp',
      'avif': 'avif',
      'tiff': 'tiff',
      'flif': 'flif',
      'heif': 'heif',
      'jxl': 'jxl',
      'avif-sequence': 'avif-sequence', // keep separate from single frames...
      'heic-sequence': 'heic-sequence',
      'bmp': 'bmp',
      'pict': 'pict'
    };

    // Looks the subtype up in subtypeMap; anything unmapped becomes 'unknown'.
    function normalizeSubtype( subtype ) {
      if ( subtypeMap[ subtype ] ) {
        return subtypeMap[ subtype ];
      }
      return 'unknown'; // switch between:
      // `subtype`
      //   to see everything, check if there's anything else worth capturing
      // `'unknown'`
      //   to make results manageable
    }

    // if it's a data url, take the mime type from there, done.
    if ( url &&
         typeof url === "string" ) {
      const match = url.toLowerCase().match( /^data:image\\/([\\w\\-\\.\\+]+)/ );
      if ( match && match[ 1 ] ) {
        return normalizeSubtype( match[ 1 ] );
      }
    }

    // if we get a content-type header, use it!
    if ( contentType &&
         typeof contentType === "string" ) {
      const match = contentType.toLowerCase().match( /image\\/([\\w\\-\\.\\+]+)/ );
      if ( match && match[ 1 ] ) {
        return normalizeSubtype( match[ 1 ] );
      }
    }

    // otherwise fall back to extension in the URL
    if ( url &&
         typeof url === "string" ) {
      const splitOnSlashes = url.split("/");
      if ( splitOnSlashes.length > 1 ) {
        const afterLastSlash = splitOnSlashes[ splitOnSlashes.length - 1 ],
              splitOnDots = afterLastSlash.split(".");
        if ( splitOnDots.length > 1 ) {
          return normalizeSubtype(
            splitOnDots[ splitOnDots.length - 1 ]
              .toLowerCase()
              .replace( /^(\\w+)[\\?\\&\\#].*/, '$1' ) // strip query params
          );
        }
      }
    }

    // otherwise throw up our hands
    return 'unknown';
  }

  let parsed;
  try {
    parsed = JSON.parse( responsiveImagesJsonString );
  } catch ( e ) {
    // Malformed JSON: report no images instead of aborting the query.
    return [];
  }
  if ( !Array.isArray( parsed ) ) {
    return [];
  }

  const dataRegEx = new RegExp('^data');
  return parsed
    // Skip null / non-object entries, which would throw on property access.
    .filter( d => d && typeof d === 'object' )
    .map( d => ({
      imgURL: d.url,
      // Missing numeric fields default to 0.
      approximateResourceWidth: Math.floor( d.approximateResourceWidth || 0 ),
      approximateResourceHeight: Math.floor( d.approximateResourceHeight || 0 ),
      byteSize: Math.floor( d.byteSize || 0 ),
      bitsPerPixel: parseFloat( d.bitsPerPixel || 0 ),
      isPixel: d.approximateResourceWidth == 1 && d.approximateResourceHeight == 1,
      isDataURL: dataRegEx.test(d.url),
      resourceFormat: pithyType({ contentType: d.mimeType, url: d.url })
    }) );
''';
120+
121+
# One row per image record returned by getSrcsetInfo() for each page.
WITH imgs AS (
  SELECT
    _TABLE_SUFFIX AS client,
    url AS pageURL,
    imgURL,
    approximateResourceWidth,
    approximateResourceHeight,
    byteSize,
    bitsPerPixel,
    isPixel,
    isDataURL,
    (approximateResourceWidth * approximateResourceHeight) / 1000000 AS megapixels,
    # getSrcsetInfo() defaults a missing height to 0 and the > 1 filter is only
    # applied in the next CTE, so guard the division here.
    SAFE_DIVIDE(approximateResourceWidth, approximateResourceHeight) AS aspectRatio,
    resourceFormat
  FROM
    `httparchive.pages.2024_06_01_*`,
    UNNEST(getSrcsetInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images')))
),

# Per-client quantile arrays (1000 buckets) over images larger than 1x1.
percentiles AS (
  SELECT
    client,
    APPROX_QUANTILES(approximateResourceWidth, 1000) AS resourceWidthPercentiles,
    APPROX_QUANTILES(approximateResourceHeight, 1000) AS resourceHeightPercentiles,
    APPROX_QUANTILES(aspectRatio, 1000) AS aspectRatioPercentiles,
    APPROX_QUANTILES(megapixels, 1000) AS megapixelsPercentiles,
    APPROX_QUANTILES(byteSize, 1000) AS byteSizePercentiles,
    APPROX_QUANTILES(bitsPerPixel, 1000) AS bitsPerPixelPercentiles,
    COUNT(0) AS imgCount
  FROM
    imgs
  WHERE
    approximateResourceWidth > 1 AND
    approximateResourceHeight > 1
  GROUP BY
    client
)

# With 1000 buckets, percentile p sits at offset p * 10.
SELECT
  percentile,
  client,
  imgCount,
  resourceWidthPercentiles[OFFSET(percentile * 10)] AS resourceWidth,
  resourceHeightPercentiles[OFFSET(percentile * 10)] AS resourceHeight,
  aspectRatioPercentiles[OFFSET(percentile * 10)] AS aspectRatio,
  megapixelsPercentiles[OFFSET(percentile * 10)] AS megapixels,
  byteSizePercentiles[OFFSET(percentile * 10)] AS byteSize,
  bitsPerPixelPercentiles[OFFSET(percentile * 10)] AS bitsPerPixel
FROM
  percentiles,
  UNNEST([0, 10, 25, 50, 75, 90, 100]) AS percentile
ORDER BY
  imgCount DESC,
  percentile

0 commit comments

Comments
 (0)