|
1 | 1 | #standardSQL |
2 | | -# top_cdns.sql: Top CDNs used on the root HTML pages |
| 2 | +# top_cdns.sql: Top CDNs used per year and client, split by root HTML, subdomain and third-party requests |
3 | 3 | SELECT |
| 4 | + year, |
4 | 5 | client, |
5 | 6 | cdn, |
6 | 7 | COUNTIF(firstHtml) AS firstHtmlHits, |
7 | | - SUM(COUNTIF(firstHtml)) OVER (PARTITION BY client) AS firstHtmlTotalHits, |
8 | | - SAFE_DIVIDE(COUNTIF(firstHtml), SUM(COUNTIF(firstHtml)) OVER (PARTITION BY client)) AS firstHtmlHitsPct, |
| 8 | + SUM(COUNTIF(firstHtml)) OVER (PARTITION BY year, client) AS firstHtmlTotalHits, |
| 9 | + SAFE_DIVIDE(COUNTIF(firstHtml), SUM(COUNTIF(firstHtml)) OVER (PARTITION BY year, client)) AS firstHtmlHitsPct, |
9 | 10 |
|
10 | 11 | COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain) AS subDomainHits, |
11 | | - SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain)) OVER (PARTITION BY client) AS subDomainTotalHits, |
12 | | - SAFE_DIVIDE(COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain), SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain)) OVER (PARTITION BY client)) AS subDomainHitsPct, |
| 12 | + SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain)) OVER (PARTITION BY year, client) AS subDomainTotalHits, |
| 13 | + SAFE_DIVIDE(COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain), SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND sameDomain)) OVER (PARTITION BY year, client)) AS subDomainHitsPct, |
13 | 14 |
|
14 | 15 | COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain) AS thirdPartyHits, |
15 | | - SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain)) OVER (PARTITION BY client) AS thirdPartyTotalHits, |
16 | | - SAFE_DIVIDE(COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain), SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain)) OVER (PARTITION BY client)) AS thirdPartyHitsPct, |
| 16 | + SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain)) OVER (PARTITION BY year, client) AS thirdPartyTotalHits, |
| 17 | + SAFE_DIVIDE(COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain), SUM(COUNTIF(NOT firstHtml AND NOT sameHost AND NOT sameDomain)) OVER (PARTITION BY year, client)) AS thirdPartyHitsPct, |
17 | 18 |
|
18 | 19 | COUNT(0) AS hits, # all requests for this (year, client, cdn) group |
19 | 20 | SUM(COUNT(0)) OVER (PARTITION BY year, client) AS totalHits, # partition by year as well as client so totals are not pooled across the yearly crawls |
20 | 21 | SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY year, client)) AS hitsPct # share of that year's requests for the client, consistent with the other *Pct columns |
21 | 22 | FROM |
22 | 23 | ( |
23 | 24 | SELECT |
| 25 | + '2019' AS year, |
| 26 | + client, |
| 27 | + page, |
| 28 | + url, |
| 29 | + firstHtml, |
| 30 | + respBodySize, |
| 31 | + IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn, # sometimes _cdn provider detection includes multiple entries. we bias for the DNS detected entry which is the first entry |
| 32 | + NET.HOST(url) = NET.HOST(page) AS sameHost, |
| 33 | + NET.HOST(url) = NET.HOST(page) OR NET.REG_DOMAIN(url) = NET.REG_DOMAIN(page) AS sameDomain # if toplevel reg_domain will return NULL so we group this as sameDomain |
| 34 | + FROM |
| 35 | + `httparchive.almanac.requests` |
| 36 | + WHERE |
| 37 | + date = '2019-07-01' |
| 38 | + UNION ALL |
| 39 | + SELECT |
| 40 | + '2020' AS year, |
| 41 | + client, |
| 42 | + page, |
| 43 | + url, |
| 44 | + firstHtml, |
| 45 | + respBodySize, |
| 46 | + IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn, # sometimes _cdn provider detection includes multiple entries. we bias for the DNS detected entry which is the first entry |
| 47 | + NET.HOST(url) = NET.HOST(page) AS sameHost, |
| 48 | + NET.HOST(url) = NET.HOST(page) OR NET.REG_DOMAIN(url) = NET.REG_DOMAIN(page) AS sameDomain # if toplevel reg_domain will return NULL so we group this as sameDomain |
| 49 | + FROM |
| 50 | + `httparchive.almanac.requests` |
| 51 | + WHERE |
| 52 | + date = '2020-08-01' |
| 53 | + UNION ALL |
| 54 | + SELECT |
| 55 | + '2021' AS year, |
| 56 | + client, |
| 57 | + page, |
| 58 | + url, |
| 59 | + firstHtml, |
| 60 | + respBodySize, |
| 61 | + IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn, # sometimes _cdn provider detection includes multiple entries. we bias for the DNS detected entry which is the first entry |
| 62 | + NET.HOST(url) = NET.HOST(page) AS sameHost, |
| 63 | + NET.HOST(url) = NET.HOST(page) OR NET.REG_DOMAIN(url) = NET.REG_DOMAIN(page) AS sameDomain # if toplevel reg_domain will return NULL so we group this as sameDomain |
| 64 | + FROM |
| 65 | + `httparchive.almanac.requests` |
| 66 | + WHERE |
| 67 | + date = '2021-07-01' |
| 68 | + UNION ALL |
| 69 | + SELECT |
| 70 | + '2022' AS year, |
24 | 71 | client, |
25 | 72 | page, |
26 | 73 | url, |
|
35 | 82 | date = '2022-06-01' |
36 | 83 | ) |
37 | 84 | GROUP BY |
| 85 | + year, |
38 | 86 | client, |
39 | 87 | cdn |
40 | 88 | ORDER BY |
| 89 | + year DESC, |
41 | 90 | client DESC, |
42 | 91 | firstHtmlHits DESC |
0 commit comments