|
#standardSQL
# Cache-Control and CDN caching behavior analysis
#
# For the 2025-07-01 crawl, summarizes how responses with HTTP status 200 use
# Cache-Control directives, validators (ETag / Last-Modified) and CDN cache
# status headers (X-Cache, CF-Cache-Status).
#
# Output: one row per (client, cdn, is_main_document) group with at least
# 1000 requests. Each *_pct column is a percentage of all requests in the
# group unless its comment says otherwise; median_* columns are in seconds.

WITH cache_analysis AS (
  -- One row per request carrying only the fields the aggregate below reads.
  -- Header values that are pattern-matched later are lower-cased here, once,
  -- so every directive/status check downstream is case-insensitive.
  SELECT
    client,
    is_main_document,

    -- CDN detection: keep the text before the first comma of _cdn_provider;
    -- a missing or empty value is reported as 'ORIGIN'.
    IFNULL(
      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
      'ORIGIN'
    ) AS cdn,

    -- Response status; NULL when the value is absent or not an integer.
    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.status') AS INT64) AS status_code,

    -- Cache-Control is a list-valued field that may be split across several
    -- header lines, so all occurrences are joined (in their original order)
    -- instead of keeping an arbitrary one. NULL when the header is absent.
    (
      SELECT STRING_AGG(LOWER(hdr.value), ', ' ORDER BY hdr_pos)
      FROM UNNEST(response_headers) AS hdr WITH OFFSET AS hdr_pos
      WHERE LOWER(hdr.name) = 'cache-control'
    ) AS cache_control,

    -- Validators: only presence is checked downstream. ORDER BY the array
    -- offset makes the "first occurrence" choice deterministic.
    (
      SELECT hdr.value
      FROM UNNEST(response_headers) AS hdr WITH OFFSET AS hdr_pos
      WHERE LOWER(hdr.name) = 'etag'
      ORDER BY hdr_pos
      LIMIT 1
    ) AS etag,
    (
      SELECT hdr.value
      FROM UNNEST(response_headers) AS hdr WITH OFFSET AS hdr_pos
      WHERE LOWER(hdr.name) = 'last-modified'
      ORDER BY hdr_pos
      LIMIT 1
    ) AS last_modified,

    -- CDN cache status headers (first occurrence, lower-cased).
    (
      SELECT LOWER(hdr.value)
      FROM UNNEST(response_headers) AS hdr WITH OFFSET AS hdr_pos
      WHERE LOWER(hdr.name) = 'x-cache'
      ORDER BY hdr_pos
      LIMIT 1
    ) AS x_cache,
    (
      SELECT LOWER(hdr.value)
      FROM UNNEST(response_headers) AS hdr WITH OFFSET AS hdr_pos
      WHERE LOWER(hdr.name) = 'cf-cache-status'
      ORDER BY hdr_pos
      LIMIT 1
    ) AS cf_cache_status
  FROM `httparchive.crawl.requests`
  WHERE date = '2025-07-01'
)

SELECT
  client,
  cdn,
  is_main_document,
  COUNT(0) AS total_requests,

  -- Cache-Control directives analysis.
  -- REGEXP_CONTAINS yields NULL for a missing header, which COUNTIF does not count.
  COUNTIF(cache_control IS NOT NULL) AS has_cache_control,
  ROUND(SAFE_DIVIDE(COUNTIF(cache_control IS NOT NULL), COUNT(0)) * 100, 2) AS cache_control_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r'public')) AS is_public,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'public')), COUNT(0)) * 100, 2) AS public_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r'private')) AS is_private,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'private')), COUNT(0)) * 100, 2) AS private_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r'no-cache')) AS is_no_cache,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'no-cache')), COUNT(0)) * 100, 2) AS no_cache_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r'no-store')) AS is_no_store,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'no-store')), COUNT(0)) * 100, 2) AS no_store_pct,

  -- The digits after "max-age=" must be all zeros, so a value such as
  -- "max-age=0600" is not mistaken for a zero lifetime.
  COUNTIF(REGEXP_CONTAINS(cache_control, r'max-age=0+(?:[^0-9]|$)')) AS is_max_age_zero,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'max-age=0+(?:[^0-9]|$)')), COUNT(0)) * 100, 2) AS max_age_zero_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r'immutable')) AS is_immutable,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'immutable')), COUNT(0)) * 100, 2) AS immutable_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r's-maxage')) AS has_s_maxage,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r's-maxage')), COUNT(0)) * 100, 2) AS s_maxage_pct,

  -- Modern cache directives
  COUNTIF(REGEXP_CONTAINS(cache_control, r'stale-while-revalidate')) AS has_stale_while_revalidate,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'stale-while-revalidate')), COUNT(0)) * 100, 2) AS stale_while_revalidate_pct,

  COUNTIF(REGEXP_CONTAINS(cache_control, r'stale-if-error')) AS has_stale_if_error,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(cache_control, r'stale-if-error')), COUNT(0)) * 100, 2) AS stale_if_error_pct,

  -- Max-age value analysis: median over requests that declare the directive.
  -- Extraction runs on the lower-cased header, matching the counts above;
  -- values that do not fit INT64 become NULL and are skipped.
  APPROX_QUANTILES(
    SAFE_CAST(REGEXP_EXTRACT(cache_control, r'max-age=(\d+)') AS INT64), 100
  )[OFFSET(50)] AS median_max_age_seconds,

  APPROX_QUANTILES(
    SAFE_CAST(REGEXP_EXTRACT(cache_control, r's-maxage=(\d+)') AS INT64), 100
  )[OFFSET(50)] AS median_s_maxage_seconds,

  -- Cache hit/miss analysis from the X-Cache header.
  -- cache_hit_rate_pct: hits as a share of requests whose X-Cache mentions hit or miss.
  COUNTIF(REGEXP_CONTAINS(x_cache, r'hit')) AS cache_hits,
  COUNTIF(REGEXP_CONTAINS(x_cache, r'miss')) AS cache_misses,
  ROUND(SAFE_DIVIDE(
    COUNTIF(REGEXP_CONTAINS(x_cache, r'hit')),
    COUNTIF(REGEXP_CONTAINS(x_cache, r'hit|miss'))
  ) * 100, 2) AS cache_hit_rate_pct,

  -- Cloudflare-specific cache status (substring match)
  COUNTIF(REGEXP_CONTAINS(cf_cache_status, r'hit')) AS cf_cache_hits,
  COUNTIF(REGEXP_CONTAINS(cf_cache_status, r'miss')) AS cf_cache_misses,
  COUNTIF(REGEXP_CONTAINS(cf_cache_status, r'dynamic')) AS cf_cache_dynamic,

  -- Validation headers
  COUNTIF(etag IS NOT NULL) AS has_etag,
  ROUND(SAFE_DIVIDE(COUNTIF(etag IS NOT NULL), COUNT(0)) * 100, 2) AS etag_pct,

  COUNTIF(last_modified IS NOT NULL) AS has_last_modified,
  ROUND(SAFE_DIVIDE(COUNTIF(last_modified IS NOT NULL), COUNT(0)) * 100, 2) AS last_modified_pct,

  -- CDN cache hit indicators.
  -- cache_hit_rate: hits as a share of requests that carry any X-Cache header.
  COUNTIF(x_cache IS NOT NULL) AS has_x_cache,
  COUNTIF(REGEXP_CONTAINS(x_cache, r'hit')) AS cache_hit_count,
  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(x_cache, r'hit')), COUNTIF(x_cache IS NOT NULL)) * 100, 2) AS cache_hit_rate,

  -- Cloudflare specific (exact status match, case-insensitive).
  -- cf_hit_rate: hits as a share of requests that carry CF-Cache-Status.
  COUNTIF(cf_cache_status = 'hit') AS cf_hits,
  COUNTIF(cf_cache_status = 'miss') AS cf_misses,
  ROUND(SAFE_DIVIDE(COUNTIF(cf_cache_status = 'hit'), COUNTIF(cf_cache_status IS NOT NULL)) * 100, 2) AS cf_hit_rate
FROM cache_analysis
WHERE status_code = 200 -- Focus on successful responses
GROUP BY
  client,
  cdn,
  is_main_document
HAVING
  total_requests >= 1000 -- Drop small groups
ORDER BY
  client DESC,
  total_requests DESC
0 commit comments