Skip to content

Commit 11bebf2

Browse files
committed
Merge branch 'main' of github.com:HTTPArchive/almanac.httparchive.org into production
2 parents fbdebcb + 30b70dd commit 11bebf2

87 files changed

Lines changed: 1847 additions & 471 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/code-static-analysis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ jobs:
3535
uses: actions/checkout@v3
3636
- name: Set up Python 3.8
3737
if: ${{ matrix.language == 'python' }}
38-
uses: actions/setup-python@v4.2.0
38+
uses: actions/setup-python@v4.3.0
3939
with:
4040
python-version: '3.8'
4141
- name: Install dependencies

.github/workflows/linter.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
run: |
3232
echo "VALIDATE_ALL_CODEBASE=false" >> $GITHUB_ENV
3333
- name: Lint Code Base
34-
uses: github/super-linter@v4.9.6
34+
uses: github/super-linter@v4.9.7
3535
#uses: docker://github/super-linter:v4.7.1
3636
env:
3737
DEFAULT_BRANCH: main

.github/workflows/lintsql.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
# Full git history is needed to get a proper list of changed files within `super-linter`
2020
fetch-depth: 0
2121
- name: Set up Python 3.8
22-
uses: actions/setup-python@v4.2.0
22+
uses: actions/setup-python@v4.3.0
2323
with:
2424
python-version: '3.8'
2525
- name: Lint SQL code

.github/workflows/predeploy.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ jobs:
3636
with:
3737
node-version: '16'
3838
- name: Set up Python 3.8
39-
uses: actions/setup-python@v4.2.0
39+
uses: actions/setup-python@v4.3.0
4040
with:
4141
python-version: '3.8'
4242
- name: Install Asian Fonts

.github/workflows/test_website.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ jobs:
3030
with:
3131
node-version: '16'
3232
- name: Set up Python 3.8
33-
uses: actions/setup-python@v4.2.0
33+
uses: actions/setup-python@v4.3.0
3434
with:
3535
python-version: '3.8'
3636
- name: Run the website
@@ -40,7 +40,7 @@ jobs:
4040
- name: Use more complete checks for generated HTML linting
4141
run: cp -f .github/linters/.htmlhintrc_morechecks .github/linters/.htmlhintrc
4242
- name: Lint Generated HTML
43-
uses: github/super-linter@v4.9.6
43+
uses: github/super-linter@v4.9.7
4444
env:
4545
DEFAULT_BRANCH: main
4646
FILTER_REGEX_INCLUDE: src/static/html/.*

CONTRIBUTING.md

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,11 @@
11
# Contributing
22

3-
## 2022
3+
## 2023
44

5-
We haven't begun organizing the 2022 edition yet, but if you'd be interested to contribute, you can sign up to be notified when the project planning kicks off by filling out [this interest form](https://forms.gle/Bb2ZZj6Yw5RnJHbj8). Check off all teams that interest you and give a detailed description of what your areas of expertise are so we can reach out to you with targeted opportunities.
5+
We haven't begun organizing the 2023 edition yet, but if you'd be interested to contribute, you can sign up to be notified when the project planning kicks off by filling out [this interest form](https://forms.gle/zmk6wXfDrmkkKzXo8). Check off all teams that interest you and give a detailed description of what your areas of expertise are so we can reach out to you with targeted opportunities.
66

77
If you have any questions about contributing, you can start a discussion here on GitHub or visit the [`#web-almanac`](https://join.slack.com/t/httparchive/shared_invite/zt-45sgwmnb-eDEatOhqssqNAKxxOSLAaA) channel on Slack.
88

9-
10-
<!--
11-
## 2021
12-
Thanks for your interest in contributing to the Web Almanac! We're currently developing the 2021 edition and actively looking for contributors! If you'd like to contribute to a specific chapter, please see [the list of open chapters](https://github.com/HTTPArchive/almanac.httparchive.org/issues/2167). We're also looking for contributors to join the [translators](https://github.com/HTTPArchive/almanac.httparchive.org/issues/923), [developers](https://github.com/HTTPArchive/almanac.httparchive.org/issues/2172), and [designers](https://github.com/HTTPArchive/almanac.httparchive.org/issues/2173) teams.
13-
-->
149
![star-shaped Almanac character](https://almanac.httparchive.org/static/images/avatars/0.jpg)
1510

1611
There are several ways to contribute to the Web Almanac:

sql/.sqlfluff

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ fix_even_unparsable = False
5858
# This limit skips files over a certain character length
5959
# and warns the user what has happened.
6060
# Set this to 0 to disable.
61-
large_file_skip_char_limit = 40000
61+
large_file_skip_byte_limit = 40000
6262
# CPU processes to use while linting.
6363
# If positive, just implies number of processes.
6464
# If negative or zero, implies number_of_cpus - specified_number.

sql/2022/cdn/top_cdns.sql

Lines changed: 56 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,73 @@
11
#standardSQL
2-
# top_cdns.sql: Top CDNs used on the root HTML pages
2+
# top_cdns.sql: Top CDNs used
33
SELECT
4+
year,
45
client,
56
cdn,
67
COUNTIF(firstHtml) AS firstHtmlHits,
7-
SUM(COUNTIF(firstHtml)) OVER (PARTITION BY client) AS firstHtmlTotalHits,
8-
SAFE_DIVIDE(COUNTIF(firstHtml), SUM(COUNTIF(firstHtml)) OVER (PARTITION BY client)) AS firstHtmlHitsPct,
8+
SUM(COUNTIF(firstHtml)) OVER (PARTITION BY year, client) AS firstHtmlTotalHits,
9+
SAFE_DIVIDE(COUNTIF(firstHtml), SUM(COUNTIF(firstHtml)) OVER (PARTITION BY year, client)) AS firstHtmlHitsPct,
910

1011
COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain) AS subDomainHits,
11-
SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain)) OVER (PARTITION BY client) AS subDomainTotalHits,
12-
SAFE_DIVIDE(COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain), SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain)) OVER (PARTITION BY client)) AS subDomainHitsPct,
12+
SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain)) OVER (PARTITION BY year, client) AS subDomainTotalHits,
13+
SAFE_DIVIDE(COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain), SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain)) OVER (PARTITION BY year, client)) AS subDomainHitsPct,
1314

1415
COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain) AS thirdPartyHits,
15-
SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain)) OVER (PARTITION BY client) AS thirdPartyTotalHits,
16-
SAFE_DIVIDE(COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain), SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain)) OVER (PARTITION BY client)) AS thirdPartyHitsPct,
16+
SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain)) OVER (PARTITION BY year, client) AS thirdPartyTotalHits,
17+
SAFE_DIVIDE(COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain), SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain)) OVER (PARTITION BY year, client)) AS thirdPartyHitsPct,
1718

1819
COUNT(0) AS hits,
1920
SUM(COUNT(0)) OVER (PARTITION BY client) AS totalHits,
2021
SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY client)) AS hitsPct
2122
FROM
2223
(
2324
SELECT
25+
'2019' AS year,
26+
client,
27+
page,
28+
url,
29+
firstHtml,
30+
respBodySize,
31+
IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn, # sometimes _cdn provider detection includes multiple entries. we bias for the DNS detected entry which is the first entry
32+
NET.HOST(url) = NET.HOST(page) AS sameHost,
33+
NET.HOST(url) = NET.HOST(page) OR NET.REG_DOMAIN(url) = NET.REG_DOMAIN(page) AS sameDomain # if toplevel reg_domain will return NULL so we group this as sameDomain
34+
FROM
35+
`httparchive.almanac.requests`
36+
WHERE
37+
date = '2019-07-01'
38+
UNION ALL
39+
SELECT
40+
'2020' AS year,
41+
client,
42+
page,
43+
url,
44+
firstHtml,
45+
respBodySize,
46+
IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn, # sometimes _cdn provider detection includes multiple entries. we bias for the DNS detected entry which is the first entry
47+
NET.HOST(url) = NET.HOST(page) AS sameHost,
48+
NET.HOST(url) = NET.HOST(page) OR NET.REG_DOMAIN(url) = NET.REG_DOMAIN(page) AS sameDomain # if toplevel reg_domain will return NULL so we group this as sameDomain
49+
FROM
50+
`httparchive.almanac.requests`
51+
WHERE
52+
date = '2020-08-01'
53+
UNION ALL
54+
SELECT
55+
'2021' AS year,
56+
client,
57+
page,
58+
url,
59+
firstHtml,
60+
respBodySize,
61+
IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn, # sometimes _cdn provider detection includes multiple entries. we bias for the DNS detected entry which is the first entry
62+
NET.HOST(url) = NET.HOST(page) AS sameHost,
63+
NET.HOST(url) = NET.HOST(page) OR NET.REG_DOMAIN(url) = NET.REG_DOMAIN(page) AS sameDomain # if toplevel reg_domain will return NULL so we group this as sameDomain
64+
FROM
65+
`httparchive.almanac.requests`
66+
WHERE
67+
date = '2021-07-01'
68+
UNION ALL
69+
SELECT
70+
'2022' AS year,
2471
client,
2572
page,
2673
url,
@@ -35,8 +82,10 @@ FROM
3582
date = '2022-06-01'
3683
)
3784
GROUP BY
85+
year,
3886
client,
3987
cdn
4088
ORDER BY
89+
year DESC,
4190
client DESC,
4291
firstHtmlHits DESC

sql/2022/cdn/top_cdns_by_rank.sql

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22
# top_cdns_by_rank.sql: Top CDNs used on the root HTML pages by CrUX rank
33
SELECT
44
client,
5-
nested_rank,
5+
rank_grouping,
6+
CASE
7+
WHEN rank_grouping = 10000000 THEN 'all'
8+
ELSE FORMAT("%'d", rank_grouping)
9+
END AS ranking,
610
cdn,
711
COUNTIF(firstHtml) AS firstHtmlHits,
812
SUM(COUNTIF(firstHtml)) OVER (PARTITION BY client) AS firstHtmlTotalHits,
@@ -34,14 +38,14 @@ FROM (
3438
`httparchive.almanac.requests`
3539
WHERE
3640
date = '2022-06-01'),
37-
UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS nested_rank
41+
UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
3842
WHERE
39-
rank <= nested_rank
43+
rank <= rank_grouping
4044
GROUP BY
4145
client,
42-
nested_rank,
46+
rank_grouping,
4347
cdn
4448
ORDER BY
4549
client DESC,
46-
nested_rank,
50+
rank_grouping,
4751
firstHtmlHits DESC

sql/2022/third-parties/tao_by_third_party.sql

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ headers AS (
6161
requests.origin AS req_origin,
6262
pages.origin AS page_origin,
6363
get_tao(LOWER(respOtherHeaders)) AS timing_allow_origin,
64+
respOtherHeaders,
6465
third_party.category AS req_category
6566
FROM requests
6667
LEFT JOIN pages
@@ -72,18 +73,30 @@ headers AS (
7273
base AS (
7374
SELECT
7475
client,
76+
IF(respOtherHeaders LIKE '%timing-allow-origin = %', 1, 0) AS tao_header_present,
7577
IF(
7678
page_origin = req_origin OR
77-
timing_allow_origin = '*, ' OR
78-
STRPOS(timing_allow_origin, CONCAT(page_origin, ', ')) > 0,
79+
timing_allow_origin = '*' OR
80+
timing_allow_origin LIKE '*,%' OR
81+
timing_allow_origin LIKE '%,*' OR
82+
timing_allow_origin LIKE '%,*,%' OR
83+
timing_allow_origin LIKE '%, *,%' OR
84+
timing_allow_origin = page_origin OR
85+
timing_allow_origin LIKE page_origin || ',' OR
86+
timing_allow_origin LIKE '%,' || page_origin OR
87+
timing_allow_origin LIKE '%, ' || page_origin OR
88+
timing_allow_origin LIKE '%,' || page_origin || ',%' OR
89+
timing_allow_origin LIKE '%, ' || page_origin || ',%',
7990
1, 0) AS timing_allowed
8091
FROM headers
8192
)
8293

8394
SELECT
8495
client,
96+
SUM(tao_header_present) AS tao_requests,
8597
SUM(timing_allowed) AS timing_allowed_requests,
8698
COUNT(0) AS total_requests,
99+
SUM(tao_header_present) / COUNT(0) AS pct_tao_requests,
87100
SUM(timing_allowed) / COUNT(0) AS pct_timing_allowed_requests
88101
FROM
89102
base

0 commit comments

Comments
 (0)