Skip to content

Commit 518c197

Browse files
committed
Merge branch 'main' of github.com:HTTPArchive/almanac.httparchive.org into production
2 parents 5ff6eda + f432f71 commit 518c197

92 files changed

Lines changed: 2039 additions & 79 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# standardSQL
2+
# classic_microformats_types.sql
3+
# Count Classic Microformats types
4+
5+
# Extracts the classic microformats reported for a page.
#
# rendered: JSON text expected to hold a `microformats_classic_types` array
#           whose entries carry `name` and `count` properties.
# Returns one STRUCT(name, count) per entry; an empty array when the input
# cannot be parsed or does not have that array.
CREATE TEMP FUNCTION getClassicMicroformatsTypes(rendered STRING)
RETURNS ARRAY<STRUCT<name STRING, count NUMERIC>>
LANGUAGE js AS """
  try {
    const parsed = JSON.parse(rendered);
    // Project each entry down to the two fields declared in the return type.
    return parsed.microformats_classic_types.map(({name, count}) => ({name, count}));
  } catch (e) {
    // Unparseable or unexpectedly shaped input contributes no rows.
    return [];
  }
""";
15+
16+
WITH
# One row per crawled page: the root page it belongs to and the classic
# microformats (name, count) extracted from the structured-data payload.
rendered_data AS (
  SELECT
    client,
    root_page AS url,
    getClassicMicroformatsTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS classic_microformats_types
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
),

# Denominator for pct_pages: number of distinct root pages per client.
page_totals AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS total_pages
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client
)

# Per client and microformat name: summed occurrence count, its share of all
# occurrences for that client, and the number/share of root pages using it.
#
# The unnested STRUCT is aliased `microformat` (not `classic_microformats_type`)
# so that `GROUP BY classic_microformats_type` can only refer to the output
# column, i.e. the microformat name, and never to the whole (name, count) STRUCT.
SELECT
  client,
  microformat.name AS classic_microformats_type,
  SUM(microformat.count) AS freq_microformat,
  SUM(SUM(microformat.count)) OVER (PARTITION BY client) AS total_microformat,
  SUM(microformat.count) / SUM(SUM(microformat.count)) OVER (PARTITION BY client) AS pct_microformat,
  COUNT(DISTINCT url) AS freq_pages,
  total_pages,
  COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
  rendered_data
CROSS JOIN
  UNNEST(classic_microformats_types) AS microformat
INNER JOIN
  page_totals
USING (client)
GROUP BY
  client,
  classic_microformats_type,
  total_pages
ORDER BY
  freq_microformat DESC,
  client
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# standardSQL
2+
# dublin_core_types.sql
3+
# Count Dublin Core types
4+
# Extracts the lower-cased Dublin Core names reported for a page.
#
# rendered: JSON text expected to hold a `dublin_core` array whose entries
#           carry a string `name` property.
# Returns one lower-cased name per usable entry. Entries without a string
# `name` are skipped individually, so a single malformed entry no longer
# discards the valid ones. Returns an empty array when the input cannot be
# parsed or `dublin_core` is not an array.
CREATE TEMP FUNCTION getDublinCoreTypes(rendered STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
  try {
    const entries = JSON.parse(rendered).dublin_core;
    if (!Array.isArray(entries)) return [];

    return entries
      .filter(entry => entry && typeof entry.name === 'string')
      .map(entry => entry.name.toLowerCase());
  } catch (e) {
    // Unparseable input (including NULL) contributes no rows.
    return [];
  }
""";
14+
15+
WITH
# Denominator for pct_pages: number of distinct root pages per client.
page_totals AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS total_pages
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client
),

# One row per crawled page: the root page it belongs to and the Dublin Core
# names extracted from the structured-data payload.
rendered_data AS (
  SELECT
    client,
    root_page AS url,
    getDublinCoreTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS dublin_core_types
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

# Per client and Dublin Core name:
#   count            - occurrences of this name
#   freq_dublin_core - occurrences of all names for the client (window total)
#   pct_dublin_core  - count / freq_dublin_core
#   freq_pages, total_pages, pct_pages - distinct root pages using the name
#                                        versus all root pages of the client
SELECT
  client,
  dublin_core_type,
  COUNT(dublin_core_type) AS count,
  SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS freq_dublin_core,
  COUNT(dublin_core_type) / SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS pct_dublin_core,
  COUNT(DISTINCT url) AS freq_pages,
  total_pages,
  COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
  rendered_data
CROSS JOIN
  UNNEST(dublin_core_types) AS dublin_core_type
INNER JOIN
  page_totals
USING (client)
GROUP BY
  client,
  dublin_core_type,
  total_pages
ORDER BY
  pct_dublin_core DESC,
  client
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# standardSQL
2+
# facebook_types.sql
3+
# Count Facebook types
4+
# Extracts the lower-cased Facebook (Open Graph style) property names reported
# for a page.
#
# rendered: JSON text expected to hold a `facebook` array whose entries carry
#           a string `property`.
# Returns one lower-cased property per usable entry. Entries without a string
# `property` are skipped individually, so a single malformed entry no longer
# discards the valid ones. Returns an empty array when the input cannot be
# parsed or `facebook` is not an array.
CREATE TEMP FUNCTION getFacebookTypes(rendered STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
  try {
    const entries = JSON.parse(rendered).facebook;
    if (!Array.isArray(entries)) return [];

    return entries
      .filter(entry => entry && typeof entry.property === 'string')
      .map(entry => entry.property.toLowerCase());
  } catch (e) {
    // Unparseable input (including NULL) contributes no rows.
    return [];
  }
""";
14+
15+
WITH
# Denominator for pct_pages: number of distinct root pages per client.
page_totals AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS total_pages
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client
),

# One row per crawled page: the root page it belongs to and the Facebook
# property names extracted from the structured-data payload.
rendered_data AS (
  SELECT
    client,
    root_page AS url,
    getFacebookTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS facebook_types
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

# Per client and Facebook property: occurrence count, its share of all
# occurrences for that client, and the number/share of root pages using it.
SELECT
  client,
  facebook_type,
  COUNT(facebook_type) AS freq_facebook,
  SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS total_facebook,
  COUNT(facebook_type) / SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS pct_facebook,
  COUNT(DISTINCT url) AS freq_pages,
  total_pages,
  COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
  rendered_data
CROSS JOIN
  UNNEST(facebook_types) AS facebook_type
INNER JOIN
  page_totals
USING (client)
GROUP BY
  client,
  facebook_type,
  total_pages
ORDER BY
  freq_facebook DESC,
  client
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# standardSQL
2+
# jsonld_contexts.sql
3+
# Count JSON-LD contexts
4+
# Collects every string-valued `@context` found anywhere in a page's JSON-LD.
#
# rendered: JSON text expected to hold a `jsonld_scripts` array, each element
#           being the raw text of one JSON-LD script.
# Returns the string `@context` values found at any nesting depth, in
# traversal order. Non-string contexts (objects, numbers, ...) are dropped.
# A script that is not valid JSON is skipped on its own; the function returns
# an empty array only when `rendered` itself is unusable.
CREATE TEMP FUNCTION getJSONLDContexts(rendered STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
  try {
    const arrayify = (value) => Array.isArray(value) ? value : [value];

    // Returns all values stored under `key` anywhere inside `o`.
    const getDeep = (key, o) => {
      if (Array.isArray(o)) return o.flatMap(child => getDeep(key, child));

      if (o instanceof Object) {
        return Object.entries(o).flatMap(([k, value]) => {
          // Always pass `key` down: without it the recursion searches for
          // nothing and nested contexts are silently missed.
          const nested = getDeep(key, value);
          return k === key ? [...arrayify(value), ...nested] : nested;
        });
      }

      return [];
    };

    return JSON.parse(rendered).jsonld_scripts
      .flatMap(jsonld_script => {
        try {
          return getDeep('@context', JSON.parse(jsonld_script));
        } catch (e) {
          // One malformed script must not discard the page's other scripts.
          return [];
        }
      })
      .filter(context => typeof context === 'string');
  } catch (e) {
    return [];
  }
""";
33+
34+
WITH
# One row per crawled page: the root page it belongs to and every string
# @context found in its JSON-LD scripts.
rendered_data AS (
  SELECT
    client,
    root_page AS url,
    getJSONLDContexts(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_contexts
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
),

# Denominator for pct_pages: number of distinct root pages per client.
page_totals AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS total_pages
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client
)

# Per client and registrable domain of the context: occurrence count, its
# share of all occurrences for that client, and the number/share of root pages.
#
# The unnested value is named `raw_context` so it cannot be confused with the
# output column `jsonld_context`; GROUP BY therefore groups by the registrable
# domain shown in the result rather than by the raw context string.
SELECT
  client,
  NET.REG_DOMAIN(raw_context) AS jsonld_context,
  COUNT(0) AS freq_jsonld_context,
  SUM(COUNT(0)) OVER (PARTITION BY client) AS total_jsonld_context,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_jsonld_context,
  COUNT(DISTINCT url) AS freq_pages,
  total_pages,
  COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
  rendered_data
CROSS JOIN
  UNNEST(jsonld_contexts) AS raw_context
INNER JOIN
  page_totals
USING (client)
GROUP BY
  client,
  jsonld_context,
  total_pages
ORDER BY
  pct_jsonld_context DESC,
  client
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# standardSQL
2+
# jsonld_depth_percentiles.sql
3+
# Find the most nested entity in a JSON-LD document
4+
# Walks a page's JSON-LD and emits one row per object encountered.
#
# rendered: JSON text expected to hold a `jsonld_scripts` array, each element
#           being the raw text of one JSON-LD script.
# Returns, for every object in every script:
#   _from        - type of the enclosing object (NULL at the top level)
#   relationship - property name under which the object was found (NULL at
#                  the top level)
#   _to          - the object's type: the @type registered for its @id
#                  anywhere in the page's scripts, else its own @type
#   depth        - nesting level, 0 for top-level objects
# A script that is not valid JSON is skipped on its own; the function returns
# an empty array only when `rendered` itself is unusable.
CREATE TEMP FUNCTION getJSONLDEntitiesRelationships(rendered STRING)
RETURNS ARRAY<STRUCT<_from STRING, relationship STRING, _to STRING, depth NUMERIC>>
LANGUAGE js AS """
  try {
    // @id -> @type, filled from all scripts so that references by @id can be
    // resolved to a type.
    const types = new Map();

    const loadTypes = (o) => {
      if (Array.isArray(o)) {
        o.forEach(loadTypes);
      } else if (o instanceof Object) {
        if (o['@id'] && o['@type']) {
          types.set(o['@id'], o['@type']);
        }

        Object.values(o).forEach(loadTypes);
      }
    };

    const getEntitiesAndRelationships = (o, _from, relationship, depth = 0) => {
      // Arrays are transparent: their items stay at the current depth.
      if (Array.isArray(o)) {
        return o.flatMap(value => getEntitiesAndRelationships(value, _from, relationship, depth));
      }

      if (o instanceof Object) {
        const type = types.get(o['@id']) || o['@type'];
        const children = Object.entries(o).flatMap(
          ([k, value]) => getEntitiesAndRelationships(value, type, k, depth + 1)
        );
        return [{_from, relationship, _to: type, depth}, ...children];
      }

      // Primitive values are not entities.
      return [];
    };

    // Parse each script separately so one malformed script does not discard
    // the whole page. Wrapping in an array keeps top-level JSON arrays intact.
    const jsonld_scripts = JSON.parse(rendered).jsonld_scripts.flatMap(jsonld_script => {
      try {
        return [JSON.parse(jsonld_script)];
      } catch (e) {
        return [];
      }
    });
    loadTypes(jsonld_scripts);

    return jsonld_scripts.flatMap(jsonld_script => getEntitiesAndRelationships(jsonld_script, undefined, undefined, 0));
  } catch (e) {
    return [];
  }
""";
44+
45+
WITH rendered_data AS (
  # One row per crawled page: the root page it belongs to and every JSON-LD
  # entity (with its nesting depth) found in the page's scripts.
  SELECT
    client,
    root_page AS url,
    getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_entities_relationships
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

# Depth distribution over all entities, per client and percentile.
SELECT
  client,
  percentile,
  # 1000 quantile buckets, so percentile p sits at offset p * 10.
  APPROX_QUANTILES(entity.depth, 1000)[OFFSET(percentile * 10)] AS depth,
  # Up to five example root pages contributing to this group.
  ARRAY_TO_STRING(ARRAY_AGG(DISTINCT url LIMIT 5), ' ') AS sample_urls
FROM
  rendered_data
CROSS JOIN
  UNNEST(jsonld_entities_relationships) AS entity
CROSS JOIN
  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
GROUP BY
  client,
  percentile
ORDER BY
  client,
  percentile

0 commit comments

Comments
 (0)