From cb7b2c2f4b5be3a0fb3dd4e6807675cb603b2c78 Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Mon, 14 Sep 2020 11:34:26 -0400 Subject: [PATCH] PWA table utils (#1291) * pwa candidates * pwa candidates * pwa helpers * pwa readme --- sql/util/README.md | 6 ++++-- sql/util/manifests.sql | 33 +++++++++++++++++++++++++++++++++ sql/util/pwa_candidates.sql | 3 ++- sql/util/service_workers.sql | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 3 deletions(-) create mode 100644 sql/util/manifests.sql create mode 100644 sql/util/service_workers.sql diff --git a/sql/util/README.md b/sql/util/README.md index 811dec5a284..a273f05490f 100644 --- a/sql/util/README.md +++ b/sql/util/README.md @@ -18,6 +18,8 @@ These queries take the CSS response bodies and parse them using Rework CSS to ge This query copies the [Third Party Web](https://github.com/patrickhulce/third-party-web) category data. Coordinate with @patrickhulce to publish a new version of the table based on the latest HTTP Archive data, then append the results of this query to the `almanac.third_parties` table. -## [pwa_candidates.sql](./pwa_candidates.sql) +## [pwa_candidates.sql](./pwa_candidates.sql), [manifests.sql](./manifests.sql), [service_workers.sql](./service_workers.sql) -This query generates a list of candidate URLs for manifest and service worker files. It depends on the `summary_response_bodies` table but could just as easily query `response_bodies.YYYY_MM_DD_*` instead. Append the results to the `almanac.pwa_candidates` table with the latest HTTP Archive data. \ No newline at end of file +The `pwa_candidates.sql` query generates a list of candidate URLs for manifest and service worker files. It depends on the `summary_response_bodies` table but could just as easily query `response_bodies.YYYY_MM_DD_*` instead. Append the results to the `almanac.pwa_candidates` table with the latest HTTP Archive data. + +The `almanac.manifests` and `almanac.service_workers` tables depend on the `pwa_candidates` table. 
Running these queries will generate the latest data that can be appended to their respective tables. \ No newline at end of file diff --git a/sql/util/manifests.sql b/sql/util/manifests.sql new file mode 100644 index 00000000000..b4e57673df1 --- /dev/null +++ b/sql/util/manifests.sql @@ -0,0 +1,33 @@ +/* Response bodies for candidate manifest files in the 2020-08-01 data: rows of summary_response_bodies whose (date, client, page, url) match a de-duplicated (pwa_url, manifest_url) pair from pwa_candidates. */ SELECT + date, + client, + page, + url, + body +FROM ( + SELECT + * + FROM + `httparchive.almanac.summary_response_bodies` + WHERE + date = '2020-08-01') AS bodies +JOIN ( + SELECT + date, + client, + pwa_url AS page, + manifest_url AS url + FROM + `httparchive.almanac.pwa_candidates` + WHERE + date = '2020-08-01' + GROUP BY + date, + client, + page, + url) AS pwa +USING ( + date, + client, + page, + url) \ No newline at end of file diff --git a/sql/util/pwa_candidates.sql b/sql/util/pwa_candidates.sql index b9e0aa435e8..be98a412441 100644 --- a/sql/util/pwa_candidates.sql +++ b/sql/util/pwa_candidates.sql @@ -1,6 +1,7 @@ #standardSQL CREATE TEMPORARY FUNCTION pathResolve(path1 STRING, path2 STRING) RETURNS STRING LANGUAGE js AS """ + if (!path2) return null; function normalizeStringPosix(e,t){for(var n="",r=-1,i=0,l=void 0,o=!1,h=0;h<=e.length;++h){if(h2){for(var g=n.length-1,a=g;a>=0&&n.charCodeAt(a)!==SLASH;--a);if(a!==g){n=-1===a?"":n.slice(0,a),r=h,i=0,o=!1;continue}}else if(2===n.length||1===n.length){n="",r=h,i=0,o=!1;continue}t&&(n.length>0?n+="/..":n="..",o=!0)}else{var f=e.slice(r+1,h);n.length>0?n+="/"+f:n=f,o=!1}r=h,i=0}else l===DOT&&-1!==i?++i:i=-1}return n}function resolvePath(){for(var e=[],t=0;t=-1&&!r;l--){var o=void 0;l>=0?o=e[l]:(void 0===i&&(i=getCWD()),o=i),0!==o.length&&(n=o+"/"+n,r=o.charCodeAt(0)===SLASH)}return n=normalizeStringPosix(n,!r),r?"/"+n:n.length>0?n:"."}var SLASH=47,DOT=46,getCWD=function(){return""};if(/^https?:/.test(path2)){return path2;}if(/^\\//.test(path2)){return path1+path2.substr(1);}return resolvePath(path1, path2).replace(/^(https?:\\/)/, '$1/'); """; @@ -18,5 +19,5 @@ WHERE date = '2020-08-01' AND (REGEXP_EXTRACT(body, 
"navigator\\.serviceWorker\\.register\\s*\\(\\s*[\"']([^\\),\\s\"']+)") IS NOT NULL AND REGEXP_EXTRACT(body, "navigator\\.serviceWorker\\.register\\s*\\(\\s*[\"']([^\\),\\s\"']+)") != "/") - AND (REGEXP_EXTRACT(REGEXP_EXTRACT(body, "(]+rel=[\"']?manifest[\"']?[^>]+>)"), "href=[\"']?([^\\s\"'>]+)[\"']?") IS NOT NULL + OR (date = '2020-08-01' AND /* AND binds tighter than OR, so this branch must repeat the date filter or it would match manifest links from every date */ REGEXP_EXTRACT(REGEXP_EXTRACT(body, "(]+rel=[\"']?manifest[\"']?[^>]+>)"), "href=[\"']?([^\\s\"'>]+)[\"']?") IS NOT NULL AND REGEXP_EXTRACT(REGEXP_EXTRACT(body, "(]+rel=[\"']?manifest[\"']?[^>]+>)"), "href=[\"']?([^\\s\"'>]+)[\"']?") != "/") \ No newline at end of file diff --git a/sql/util/service_workers.sql b/sql/util/service_workers.sql new file mode 100644 index 00000000000..d140e20246e --- /dev/null +++ b/sql/util/service_workers.sql @@ -0,0 +1,33 @@ +/* Response bodies for candidate service worker scripts in the 2020-08-01 data: rows of summary_response_bodies whose (date, client, page, url) match a de-duplicated (pwa_url, sw_url) pair from pwa_candidates. */ SELECT + date, + client, + page, + url, + body +FROM ( + SELECT + * + FROM + `httparchive.almanac.summary_response_bodies` + WHERE + date = '2020-08-01') AS bodies +JOIN ( + SELECT + date, + client, + pwa_url AS page, + sw_url AS url + FROM + `httparchive.almanac.pwa_candidates` + WHERE + date = '2020-08-01' + GROUP BY + date, + client, + page, + url) AS pwa +USING ( + date, + client, + page, + url) \ No newline at end of file