Skip to content

Commit f610a48

Browse files
JannisBushGJFR
andauthored
Security 2024 Queries (#3671)
* Add 2022 queries * Add updated bot detection query (issue HSTS technology) * Query update * Update Cryptominer + Crypto query * Port all cookie queries * CSP + CORP porting * More porting * Many updated queries * Update many queries * Update all remaining queries * Remove 2024 prefix * Add header value distributions (some headers) * Add Timing-Allow-Origin Header Usage No real security header as it can only worsen security (MDN classifies it as CORS header) thus not included in all queries, e.g., not included in feature_adoption_by_country.sql) * Add OAC queries * Add document.domain query * HTML Sanitization query * Security Features by category * Add server-timing query * Update security.txt query * Improve security.text query * Only consider "real" security.txt files for query * Add some insights into security.txt data (FPs vs FNs) * Fix typo * SQLfluff fix * Fix typo * Add a limit * Improve Note document.domain * Update cryptominer time period * Add missing lowercasing * Update note * Remove trailing whitespace * Add two more queries * Fix typo * Add WSS CSP query * Iframes attributes: change to custom_metrics + add more dates * Update server-timing parsing * FIx st query + add sampling data * Add Server-Timing overview query * Lint * Add query to measure use of disallowed CSP directives in <meta> * Fix linting issue --------- Co-authored-by: Gertjan Franken <gertjan.franken@kuleuven.be>
1 parent 5bdb523 commit f610a48

67 files changed

Lines changed: 3283 additions & 2 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

sql/2024/security/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414
- [📄 Planning doc][~google-doc]
1515
- [📊 Results sheet][~google-sheets]
1616
- [📝 Markdown file][~chapter-markdown]
17-
- [:book: 2021 chapter][~2021-chapter]
17+
- [:book: 2022 chapter][~2022-chapter]
1818

1919
[~google-doc]: https://docs.google.com/document/d/1jBGxgkBDIi9nDQ-n2eVFkwDZXk_9rQkLiKfnHkknsAs/edit
2020
[~google-sheets]: https://docs.google.com/spreadsheets/d/1b9IEGbfQjKCEaTBmcv_zyCyWEsq35StCa-dVOe6V1Cs/edit#gid=1778117656
2121
[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/security.md
22-
[~2021-chapter]: https://almanac.httparchive.org/en/2021/security
22+
[~2022-chapter]: https://almanac.httparchive.org/en/2022/security
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#standardSQL
2+
# Section: Attack Preventions - Bot protection services
3+
# Question: Which bot protection services are used most often on mobile and desktop sites?
4+
# Notes: The Wappalyzer 'Security' category mostly contains bot protection services such as reCAPTCHA and Cloudflare Bot Management
5+
# Issue: Due to some updates to wappalyzer the 'Security' category now also contains 'HSTS' (security header) and 'Really Simple SSL & Security' in significant numbers. Do we want to filter them out?
6+
# Per client and technology: number of root pages on which a Wappalyzer
# 'Security'-category technology was detected (freq), the number of root pages
# crawled for that client (total), and the resulting share (pct).
WITH page_totals AS (
  # Denominator: root pages crawled per client in the 2024-06-01 crawl.
  SELECT
    client,
    COUNT(0) AS total
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01' AND
    is_root_page
  GROUP BY
    client
)

SELECT
  client,
  t.technology,
  COUNT(0) AS freq,
  total,
  COUNT(0) / total AS pct
FROM
  `httparchive.all.pages`
CROSS JOIN
  UNNEST(technologies) AS t
CROSS JOIN
  UNNEST(t.categories) AS category
INNER JOIN
  page_totals
USING
  (client)
WHERE
  date = '2024-06-01' AND
  is_root_page AND
  category = 'Security'
GROUP BY
  client,
  total,
  t.technology
ORDER BY
  pct DESC
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#standardSQL
2+
# Section: Attack preventions - Preventing attacks using Clear-Site-Data
3+
# Question: Which Clear-Site-Data header values are most prevalent?
4+
# Notes: Many used values are still invalid (without quotes). We only count each host-value pair once.
5+
# Distribution of Clear-Site-Data header values per client. Each (host, value)
# pair is counted once; pct is relative to the sum of all per-value host counts
# of the same client. Only the 100 rows with the highest pct are returned.
WITH csd_responses AS (
  # One row per Clear-Site-Data response header seen on any request of a root page.
  # To restrict the analysis to main document responses (the majority of CSD
  # headers are set on them), add `is_main_document AND` to the filter below.
  SELECT
    client,
    NET.HOST(url) AS host,
    header.value AS csd_header
  FROM
    `httparchive.all.requests`
  CROSS JOIN
    UNNEST(response_headers) AS header
  WHERE
    date = '2024-06-01' AND
    is_root_page AND
    LOWER(header.name) = 'clear-site-data'
),

value_counts AS (
  # Number of distinct hosts that sent each header value, per client.
  SELECT
    client,
    csd_header,
    COUNT(DISTINCT host) AS freq
  FROM
    csd_responses
  GROUP BY
    client,
    csd_header
)

SELECT
  client,
  csd_header,
  SUM(freq) OVER (PARTITION BY client) AS total_csd_headers,
  freq,
  freq / SUM(freq) OVER (PARTITION BY client) AS pct
FROM
  value_counts
ORDER BY
  pct DESC
LIMIT
  100
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#standardSQL
2+
# Section: Attack Preventions - Preventing attacks using Cross-Origin policies
3+
# Question: Which are the most common COEP values?
4+
# Note: Considers headers of main document responses
5+
# Distribution of Cross-Origin-Embedder-Policy header values per client, taken
# from main document responses of root pages. Each (host, value) pair is counted
# once; pct is relative to the sum of all per-value host counts of the same
# client. Only the 100 rows with the highest pct are returned.
WITH coep_responses AS (
  # One row per COEP response header on a root page's main document response.
  SELECT
    client,
    NET.HOST(url) AS host,
    header.value AS coep_header
  FROM
    `httparchive.all.requests`
  CROSS JOIN
    UNNEST(response_headers) AS header
  WHERE
    date = '2024-06-01' AND
    is_root_page AND
    is_main_document AND
    LOWER(header.name) = 'cross-origin-embedder-policy'
),

value_counts AS (
  # Number of distinct hosts that sent each header value, per client.
  SELECT
    client,
    coep_header,
    COUNT(DISTINCT host) AS freq
  FROM
    coep_responses
  GROUP BY
    client,
    coep_header
)

SELECT
  client,
  coep_header,
  SUM(freq) OVER (PARTITION BY client) AS total_coep_headers,
  freq,
  freq / SUM(freq) OVER (PARTITION BY client) AS pct
FROM
  value_counts
ORDER BY
  pct DESC
LIMIT
  100
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
#standardSQL
2+
# Section: Cookies - Cookie Age
3+
# Question: How many cookies (total, hosts, pages) have negative Max-Age, Expires and real age (Max-Age has precedence over Expires) attributes?
4+
# Note: Query is expensive and slow (14TB). Query is inefficient (We create a result array of length 1 for each cookie-attribute for each cookie and then unnest it again; We could instead not use arrays and skip the unnesting).
5+
# Note: Some of the percentages are quite different to the old query; one of the two might be broken (difficult to compare as both cannot operate on a shared dataset)
6+
# getCookieAgeValues: extracts the age-related attributes of one Set-Cookie header value.
#
# Parameters:
#   cookie_value   - raw value of a single Set-Cookie response header.
#   epochOfRequest - time of the request; subtracted from the Expires date after
#                    that date is converted to seconds, so presumably epoch
#                    seconds (verify against the caller).
#
# Returns a JSON string {"maxAge": [...], "expires": [...], "realAge": [...]}.
# Each array holds zero or one element:
#   maxAge  - the number parsed from the Max-Age attribute, if present.
#   expires - Expires minus epochOfRequest, or null when that is not a safe integer
#             (e.g. the date could not be parsed).
#   realAge - Max-Age when it parsed to a number, otherwise the Expires offset
#             (Max-Age has precedence over Expires).
#
# Fix: the precedence check used to be a truthiness test, so `Max-Age=0` was
# skipped and the cookie was reported with its Expires offset (or with no
# realAge at all); an Expires offset of exactly 0 was dropped as well.
CREATE TEMPORARY FUNCTION getCookieAgeValues(cookie_value STRING, epochOfRequest NUMERIC)
RETURNS STRING DETERMINISTIC
LANGUAGE js AS '''
  const regexMaxAge = /max-age\\s*=\\s*(?<value>-*[0-9]+)/i;
  const regexExpires = /expires\\s*=\\s*(?<value>.*?)(;|$)/i;
  const result = {
    "maxAge": [],
    "expires": [],
    "realAge": []
  };

  // A value is usable as an age when the attribute was present and parsed to a number.
  // Zero is a valid age, so this must not be a truthiness check.
  const isUsable = value => value !== null && !Number.isNaN(value);

  let maxAge = null;
  let expires = null;

  const maxAgeMatch = regexMaxAge.exec(cookie_value);
  if (maxAgeMatch) {
    // Inputs such as "--5" match the pattern but yield NaN; JSON.stringify emits NaN as null.
    maxAge = Number(maxAgeMatch[1]);
    result["maxAge"].push(maxAge);
  }

  const expiresMatch = regexExpires.exec(cookie_value);
  if (expiresMatch) {
    // Date -> milliseconds -> seconds, then relative to the request time.
    expires = Math.round(Number(new Date(expiresMatch[1])) / 1000) - epochOfRequest;
    result["expires"].push(Number.isSafeInteger(expires) ? expires : null);
  }

  // Max-Age wins over Expires whenever it carries a number, including 0.
  if (isUsable(maxAge)) {
    result["realAge"].push(maxAge);
  } else if (isUsable(expires)) {
    result["realAge"].push(expires);
  }

  return JSON.stringify(result);
''';
36+
37+
# For each client and each of Max-Age, Expires and the effective ("real") age:
# how many cookies, pages and hosts carry a value <= 0, and what share that is of
# all cookies/pages/hosts that set the attribute at all.
# Note: the output columns are named "negative", but the predicate is <= 0, so a
# value of exactly 0 is counted as well.
WITH cookie_ages AS (
  # One row per Set-Cookie response header on any request of a root page.
  # age_json is the JSON string returned by getCookieAgeValues; its maxAge,
  # expires and realAge arrays are unnested by the CTEs below.
  SELECT
    client,
    page,
    NET.HOST(url) AS host,
    getCookieAgeValues(
      header.value,
      CAST(JSON_QUERY(summary, '$.startedDateTime') AS NUMERIC)
    ) AS age_json
  FROM
    `httparchive.all.requests`
  CROSS JOIN
    UNNEST(response_headers) AS header
  WHERE
    date = '2024-06-01' AND
    is_root_page AND
    LOWER(header.name) = 'set-cookie'
),

max_age_values AS (
  # Cookies that carry a Max-Age attribute.
  SELECT
    client,
    COUNTIF(is_non_positive) AS count_negative_max_age,
    COUNT(0) AS total_max_age_cookies,
    COUNT(DISTINCT IF(is_non_positive, page, NULL)) AS num_max_age_pages,
    COUNT(DISTINCT page) AS total_max_age_pages,
    COUNT(DISTINCT IF(is_non_positive, host, NULL)) AS num_max_age_hosts,
    COUNT(DISTINCT host) AS total_max_age_hosts
  FROM (
    SELECT
      client,
      page,
      host,
      # NULL when the value cannot be cast; such cookies only count towards the totals.
      SAFE_CAST(max_age_value AS NUMERIC) <= 0 AS is_non_positive
    FROM
      cookie_ages
    CROSS JOIN
      UNNEST(JSON_QUERY_ARRAY(age_json, '$.maxAge')) AS max_age_value
  )
  GROUP BY
    client
),

expires_values AS (
  # Cookies that carry an Expires attribute.
  SELECT
    client,
    COUNTIF(is_non_positive) AS count_negative_expires,
    COUNT(0) AS total_expires_cookies,
    COUNT(DISTINCT IF(is_non_positive, page, NULL)) AS num_expires_pages,
    COUNT(DISTINCT page) AS total_expires_pages,
    COUNT(DISTINCT IF(is_non_positive, host, NULL)) AS num_expires_hosts,
    COUNT(DISTINCT host) AS total_expires_hosts
  FROM (
    SELECT
      client,
      page,
      host,
      SAFE_CAST(expires_value AS NUMERIC) <= 0 AS is_non_positive
    FROM
      cookie_ages
    CROSS JOIN
      UNNEST(JSON_QUERY_ARRAY(age_json, '$.expires')) AS expires_value
  )
  GROUP BY
    client
),

real_age_values AS (
  # Cookies for which an effective age could be determined.
  SELECT
    client,
    COUNTIF(is_non_positive) AS count_negative_real_age,
    COUNT(0) AS total_real_age_cookies,
    COUNT(DISTINCT IF(is_non_positive, page, NULL)) AS num_real_age_pages,
    COUNT(DISTINCT page) AS total_real_age_pages,
    COUNT(DISTINCT IF(is_non_positive, host, NULL)) AS num_real_age_hosts,
    COUNT(DISTINCT host) AS total_real_age_hosts
  FROM (
    SELECT
      client,
      page,
      host,
      SAFE_CAST(real_age_value AS NUMERIC) <= 0 AS is_non_positive
    FROM
      cookie_ages
    CROSS JOIN
      UNNEST(JSON_QUERY_ARRAY(age_json, '$.realAge')) AS real_age_value
  )
  GROUP BY
    client
)

SELECT
  client,
  count_negative_max_age,
  count_negative_max_age / total_max_age_cookies AS pct_negative_max_age,
  num_max_age_pages,
  num_max_age_pages / total_max_age_pages AS pct_max_age_pages,
  num_max_age_hosts,
  num_max_age_hosts / total_max_age_hosts AS pct_max_age_hosts,
  count_negative_expires,
  count_negative_expires / total_expires_cookies AS pct_negative_expires,
  num_expires_pages,
  num_expires_pages / total_expires_pages AS pct_expires_pages,
  num_expires_hosts,
  num_expires_hosts / total_expires_hosts AS pct_expires_hosts,
  count_negative_real_age,
  count_negative_real_age / total_real_age_cookies AS pct_negative_real_age,
  num_real_age_pages,
  num_real_age_pages / total_real_age_pages AS pct_real_age_pages,
  num_real_age_hosts,
  num_real_age_hosts / total_real_age_hosts AS pct_real_age_hosts
FROM
  max_age_values
INNER JOIN
  expires_values
USING
  (client)
INNER JOIN
  real_age_values
USING
  (client)
ORDER BY
  client
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#standardSQL
2+
# Section: Cookies - Cookie Age
3+
# Question: How long are cookies valid? (Max-Age, Expires, Real Age)
4+
# Note: Only incorporates values that are larger than 0; cookies set over all requests on the root_page
5+
# Note: Could be combined with the other cookie queries to run the expensive response_header unnesting only once?
6+
# getCookieAgeValues: extracts the age-related attributes of one Set-Cookie header value.
#
# Parameters:
#   cookie_value   - raw value of a single Set-Cookie response header.
#   epochOfRequest - time of the request; subtracted from the Expires date after
#                    that date is converted to seconds, so presumably epoch
#                    seconds (verify against the caller).
#
# Returns a JSON string {"maxAge": [...], "expires": [...], "realAge": [...]}.
# Each array holds zero or one element:
#   maxAge  - the number parsed from the Max-Age attribute, if present.
#   expires - Expires minus epochOfRequest, or null when that is not a safe integer
#             (e.g. the date could not be parsed).
#   realAge - Max-Age when it parsed to a number, otherwise the Expires offset.
#
# Fix: the precedence check used to be a truthiness test, so a cookie with
# `Max-Age=0` and a future Expires date was reported with the Expires offset as
# its realAge and inflated the positive real-age distribution; an Expires
# offset of exactly 0 was dropped as well.
CREATE TEMPORARY FUNCTION getCookieAgeValues(cookie_value STRING, epochOfRequest NUMERIC)
RETURNS STRING DETERMINISTIC
LANGUAGE js AS '''
  const regexMaxAge = /max-age\\s*=\\s*(?<value>-*[0-9]+)/i;
  const regexExpires = /expires\\s*=\\s*(?<value>.*?)(;|$)/i;
  const result = {
    "maxAge": [],
    "expires": [],
    "realAge": []
  };

  // A value is usable as an age when the attribute was present and parsed to a number.
  // Zero is a valid age, so this must not be a truthiness check.
  const isUsable = value => value !== null && !Number.isNaN(value);

  let maxAge = null;
  let expires = null;

  const maxAgeMatch = regexMaxAge.exec(cookie_value);
  if (maxAgeMatch) {
    // Inputs such as "--5" match the pattern but yield NaN; JSON.stringify emits NaN as null.
    maxAge = Number(maxAgeMatch[1]);
    result["maxAge"].push(maxAge);
  }

  const expiresMatch = regexExpires.exec(cookie_value);
  if (expiresMatch) {
    // Date -> milliseconds -> seconds, then relative to the request time.
    expires = Math.round(Number(new Date(expiresMatch[1])) / 1000) - epochOfRequest;
    result["expires"].push(Number.isSafeInteger(expires) ? expires : null);
  }

  // Max-Age wins over Expires whenever it carries a number, including 0.
  if (isUsable(maxAge)) {
    result["realAge"].push(maxAge);
  } else if (isUsable(expires)) {
    result["realAge"].push(expires);
  }

  return JSON.stringify(result);
''';
36+
37+
# Percentiles (10, 25, 50, 75, 90, 100) per client of the positive Max-Age,
# Expires and effective ("real") age values of cookies set on root pages.
WITH cookie_ages AS (
  # One row per Set-Cookie response header on any request of a root page.
  # age_json is the JSON string returned by getCookieAgeValues; its maxAge,
  # expires and realAge arrays are unnested by the CTEs below.
  SELECT
    client,
    getCookieAgeValues(
      header.value,
      CAST(JSON_QUERY(summary, '$.startedDateTime') AS NUMERIC)
    ) AS age_json
  FROM
    `httparchive.all.requests`
  CROSS JOIN
    UNNEST(response_headers) AS header
  WHERE
    date = '2024-06-01' AND
    is_root_page AND
    LOWER(header.name) = 'set-cookie'
),

max_age_values AS (
  SELECT
    client,
    percentile,
    # 1000 quantile buckets, so percentile p sits at offset p * 10.
    APPROX_QUANTILES(age_value, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS max_age
  FROM (
    SELECT
      client,
      SAFE_CAST(max_age_value AS NUMERIC) AS age_value
    FROM
      cookie_ages
    CROSS JOIN
      UNNEST(JSON_QUERY_ARRAY(age_json, '$.maxAge')) AS max_age_value
  )
  CROSS JOIN
    UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
  WHERE
    age_value > 0
  GROUP BY
    client,
    percentile
),

expires_values AS (
  SELECT
    client,
    percentile,
    APPROX_QUANTILES(age_value, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS expires
  FROM (
    SELECT
      client,
      SAFE_CAST(expires_value AS NUMERIC) AS age_value
    FROM
      cookie_ages
    CROSS JOIN
      UNNEST(JSON_QUERY_ARRAY(age_json, '$.expires')) AS expires_value
  )
  CROSS JOIN
    UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
  WHERE
    age_value > 0
  GROUP BY
    client,
    percentile
),

real_age_values AS (
  SELECT
    client,
    percentile,
    APPROX_QUANTILES(age_value, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS real_age
  FROM (
    SELECT
      client,
      SAFE_CAST(real_age_value AS NUMERIC) AS age_value
    FROM
      cookie_ages
    CROSS JOIN
      UNNEST(JSON_QUERY_ARRAY(age_json, '$.realAge')) AS real_age_value
  )
  CROSS JOIN
    UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
  WHERE
    age_value > 0
  GROUP BY
    client,
    percentile
)

SELECT
  client,
  percentile,
  max_age,
  expires,
  real_age
FROM
  max_age_values
INNER JOIN
  expires_values
USING
  (client, percentile)
INNER JOIN
  real_age_values
USING
  (client, percentile)
ORDER BY
  client,
  percentile

0 commit comments

Comments
 (0)