Skip to content

Commit

Permalink
PWA table utils (#1291)
Browse files Browse the repository at this point in the history
* pwa candidates

* pwa candidates

* pwa helpers

* pwa readme
  • Loading branch information
rviscomi committed Sep 14, 2020
1 parent ae815da commit cb7b2c2
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 3 deletions.
6 changes: 4 additions & 2 deletions sql/util/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ These queries take the CSS response bodies and parse them using Rework CSS to ge

This query copies the [Third Party Web](https://github.com/patrickhulce/third-party-web) category data. Coordinate with @patrickhulce to publish a new version of the table based on the latest HTTP Archive data, then append the results of this query to the `almanac.third_parties` table.

## [pwa_candidates.sql](./pwa_candidates.sql)
## [pwa_candidates.sql](./pwa_candidates.sql), [manifests.sql](./manifests.sql), [service_workers.sql](./service_workers.sql)

This query generates a list of candidate URLs for manifest and service worker files. It depends on the `summary_response_bodies` table but could just as easily query `response_bodies.YYYY_MM_DD_*` instead. Append the results to the `almanac.pwa_candidates` table with the latest HTTP Archive data.
The `pwa_candidates.sql` query generates a list of candidate URLs for manifest and service worker files. It depends on the `summary_response_bodies` table but could just as easily query `response_bodies.YYYY_MM_DD_*` instead. Append the results to the `almanac.pwa_candidates` table with the latest HTTP Archive data.

The `almanac.manifests` and `almanac.service_workers` tables depend on the `pwa_candidates` table. After `pwa_candidates` has been updated, run `manifests.sql` and `service_workers.sql` to generate the latest data, then append each query's results to its respective table.
33 changes: 33 additions & 0 deletions sql/util/manifests.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#standardSQL
# Manifest response bodies for PWA candidate pages.
#
# Pairs every distinct (date, client, page, manifest URL) candidate recorded in
# `almanac.pwa_candidates` with the matching row of
# `almanac.summary_response_bodies` and returns that response's body.
# Output columns: date, client, page, url, body.
#
# Both CTEs are restricted to the same date; when refreshing the data, update
# the date literal in both places.
WITH bodies AS (
  # Only the join keys and the body are consumed by the final SELECT.
  SELECT
    date,
    client,
    page,
    url,
    body
  FROM
    `httparchive.almanac.summary_response_bodies`
  WHERE
    date = '2020-08-01'
),

manifest_candidates AS (
  # DISTINCT collapses repeated candidates so the join does not multiply rows
  # when the same (page, manifest URL) pair is recorded more than once.
  SELECT DISTINCT
    date,
    client,
    pwa_url AS page,
    manifest_url AS url
  FROM
    `httparchive.almanac.pwa_candidates`
  WHERE
    date = '2020-08-01'
)

SELECT
  date,
  client,
  page,
  url,
  body
FROM
  bodies
INNER JOIN
  manifest_candidates
USING
  (date, client, page, url)
3 changes: 2 additions & 1 deletion sql/util/pwa_candidates.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#standardSQL
# pathResolve(path1, path2): JavaScript UDF that resolves `path2` against the
# base `path1` and returns the resulting string.
#
# Behavior, in the order the checks run:
#   1. `path2` is falsy (NULL or empty string) -> returns NULL.
#   2. `path2` starts with "http:" or "https:" -> returned unchanged.
#   3. `path2` starts with "/" -> returns `path1` concatenated with `path2`
#      minus its leading slash.
#   4. Otherwise -> `path1` and `path2` are joined and normalized by the
#      embedded (minified) POSIX-style `resolvePath`/`normalizeStringPosix`
#      helpers; afterwards a single slash following a leading "http:"/"https:"
#      is expanded back to "//".
#
# The doubled backslashes in the regular expressions are SQL string escapes;
# the JavaScript engine receives a single backslash.
#
# NOTE(review): case 3 appends a root-relative `path2` to the whole of `path1`,
# not to its origin. This presumably relies on `path1` being an origin-only URL
# ending in "/" - confirm against the callers.
CREATE TEMPORARY FUNCTION pathResolve(path1 STRING, path2 STRING)
RETURNS STRING LANGUAGE js AS """
if (!path2) return null;
function normalizeStringPosix(e,t){for(var n="",r=-1,i=0,l=void 0,o=!1,h=0;h<=e.length;++h){if(h<e.length)l=e.charCodeAt(h);else{if(l===SLASH)break;l=SLASH}if(l===SLASH){if(r===h-1||1===i);else if(r!==h-1&&2===i){if(n.length<2||!o||n.charCodeAt(n.length-1)!==DOT||n.charCodeAt(n.length-2)!==DOT)if(n.length>2){for(var g=n.length-1,a=g;a>=0&&n.charCodeAt(a)!==SLASH;--a);if(a!==g){n=-1===a?"":n.slice(0,a),r=h,i=0,o=!1;continue}}else if(2===n.length||1===n.length){n="",r=h,i=0,o=!1;continue}t&&(n.length>0?n+="/..":n="..",o=!0)}else{var f=e.slice(r+1,h);n.length>0?n+="/"+f:n=f,o=!1}r=h,i=0}else l===DOT&&-1!==i?++i:i=-1}return n}function resolvePath(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];for(var n="",r=!1,i=void 0,l=e.length-1;l>=-1&&!r;l--){var o=void 0;l>=0?o=e[l]:(void 0===i&&(i=getCWD()),o=i),0!==o.length&&(n=o+"/"+n,r=o.charCodeAt(0)===SLASH)}return n=normalizeStringPosix(n,!r),r?"/"+n:n.length>0?n:"."}var SLASH=47,DOT=46,getCWD=function(){return""};if(/^https?:/.test(path2)){return path2;}if(/^\\//.test(path2)){return path1+path2.substr(1);}return resolvePath(path1, path2).replace(/^(https?:\\/)/, '$1/');
""";

Expand All @@ -18,5 +19,5 @@ WHERE
date = '2020-08-01' AND
(REGEXP_EXTRACT(body, "navigator\\.serviceWorker\\.register\\s*\\(\\s*[\"']([^\\),\\s\"']+)") IS NOT NULL
AND REGEXP_EXTRACT(body, "navigator\\.serviceWorker\\.register\\s*\\(\\s*[\"']([^\\),\\s\"']+)") != "/")
AND (REGEXP_EXTRACT(REGEXP_EXTRACT(body, "(<link[^>]+rel=[\"']?manifest[\"']?[^>]+>)"), "href=[\"']?([^\\s\"'>]+)[\"']?") IS NOT NULL
OR (REGEXP_EXTRACT(REGEXP_EXTRACT(body, "(<link[^>]+rel=[\"']?manifest[\"']?[^>]+>)"), "href=[\"']?([^\\s\"'>]+)[\"']?") IS NOT NULL
AND REGEXP_EXTRACT(REGEXP_EXTRACT(body, "(<link[^>]+rel=[\"']?manifest[\"']?[^>]+>)"), "href=[\"']?([^\\s\"'>]+)[\"']?") != "/")
33 changes: 33 additions & 0 deletions sql/util/service_workers.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#standardSQL
# Service worker response bodies for PWA candidate pages.
#
# Pairs every distinct (date, client, page, service worker URL) candidate
# recorded in `almanac.pwa_candidates` with the matching row of
# `almanac.summary_response_bodies` and returns that response's body.
# Output columns: date, client, page, url, body.
#
# Both CTEs are restricted to the same date; when refreshing the data, update
# the date literal in both places.
WITH bodies AS (
  # Only the join keys and the body are consumed by the final SELECT.
  SELECT
    date,
    client,
    page,
    url,
    body
  FROM
    `httparchive.almanac.summary_response_bodies`
  WHERE
    date = '2020-08-01'
),

service_worker_candidates AS (
  # DISTINCT collapses repeated candidates so the join does not multiply rows
  # when the same (page, service worker URL) pair is recorded more than once.
  SELECT DISTINCT
    date,
    client,
    pwa_url AS page,
    sw_url AS url
  FROM
    `httparchive.almanac.pwa_candidates`
  WHERE
    date = '2020-08-01'
)

SELECT
  date,
  client,
  page,
  url,
  body
FROM
  bodies
INNER JOIN
  service_worker_candidates
USING
  (date, client, page, url)

0 comments on commit cb7b2c2

Please sign in to comment.