Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ISSUE-28] Cache initial HTML page retrieval #46

Merged
merged 1 commit into from
Dec 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 116 additions & 19 deletions source/cache.d
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ import std.array : replace;
import std.base64 : Base64URL;
import std.conv : to;
import std.datetime : SysTime, Clock, days;
import std.file : exists, getcwd, readText, tempDir, write;
import std.file : exists, getcwd, readText, remove, tempDir, write;
import std.net.curl : get;
import std.path : buildPath;
import std.typecons : Flag, Yes, No;
import std.string : indexOf;

import helpers : StdoutLogger, parseID, parseQueryString;
import helpers : StdoutLogger, parseID, parseQueryString, parseBaseJSKey;
import parsers : parseBaseJSURL, YoutubeVideoURLExtractor, SimpleYoutubeVideoURLExtractor, AdvancedYoutubeVideoURLExtractor;

struct Cache
Expand All @@ -31,50 +31,67 @@ struct Cache
{
this(logger);
this.downloadAsString = downloadAsString;
this.forceRefresh = forceRefresh;
}

YoutubeVideoURLExtractor makeParser(string url, int itag)
{
string htmlCachePath = getCachePath(url) ~ ".html";
string baseJSCachePath = getCachePath(url) ~ ".js";
updateCache(url, htmlCachePath, baseJSCachePath, itag);

string htmlCachePath = getHTMLCachePath(url) ~ ".html";
updateHTMLCache(url, htmlCachePath, itag);
string html = htmlCachePath.readText();

string baseJSURL = html.parseBaseJSURL();
string baseJSCachePath = getBaseJSCachePath(baseJSURL) ~ ".js";
updateBaseJSCache(baseJSURL, baseJSCachePath, itag);
string baseJS = baseJSCachePath.readText();
if(html.indexOf("signatureCipher:") == -1)

if(html.indexOf("signatureCipher") == -1)
{
return new SimpleYoutubeVideoURLExtractor(html, baseJS, logger);
}
return new AdvancedYoutubeVideoURLExtractor(html, baseJS, logger);
}

private void updateCache(string url, string htmlCachePath, string baseJSCachePath, int itag)
private void updateHTMLCache(string url, string htmlCachePath, int itag)
{
bool shouldRedownload = forceRefresh || !htmlCachePath.exists() || isStale(htmlCachePath.readText(), itag);
if(shouldRedownload)
{
logger.display("Cache miss, downloading HTML...");
string html = this.downloadAsString(url);
htmlCachePath.write(html);
string baseJS = this.downloadAsString(html.parseBaseJSURL());
baseJSCachePath.write(baseJS);
}
else
{
logger.display("Cache hit, skipping HTML download...");
}
}

/**
 * Makes sure a copy of base.js is present at `baseJSCachePath`.
 *
 * When `forceRefresh` is set or the cache file does not exist, the script is
 * fetched from `url` through `downloadAsString` and written to the cache
 * file. Otherwise the existing file is left untouched. A log line is emitted
 * in both cases.
 *
 * Params:
 *   url = address base.js is downloaded from on a cache miss
 *   baseJSCachePath = file the script is cached in
 *   itag = not read by this method
 */
private void updateBaseJSCache(string url, string baseJSCachePath, int itag)
{
    // Cache hit: nothing to fetch unless a refresh was explicitly requested.
    if(!forceRefresh && baseJSCachePath.exists())
    {
        logger.display("base.js cache hit, skipping download...");
        return;
    }

    logger.display("base.js cache miss, downloading from " ~ url);
    string script = this.downloadAsString(url);
    baseJSCachePath.write(script);
}

private bool isStale(string html, int itag)
{
YoutubeVideoURLExtractor shallowParser = html.indexOf("signatureCipher:") == -1
YoutubeVideoURLExtractor shallowParser = html.indexOf("signatureCipher") == -1
? new SimpleYoutubeVideoURLExtractor(html, "", logger)
: new AdvancedYoutubeVideoURLExtractor(html, "", logger);
ulong expire = shallowParser.findExpirationTimestamp(itag);
return SysTime.fromUnixTime(expire) < Clock.currTime();
}

private string getCachePath(string url)
private string getHTMLCachePath(string url)
{
string cacheKey = url.parseID();
if(cacheKey == "")
Expand All @@ -84,14 +101,28 @@ struct Cache

return buildPath(cacheDirectory, cacheKey);
}

/**
 * Builds the cache file path (without extension) for a base.js URL.
 *
 * The file name is the key returned by `parseBaseJSKey`. When that key is
 * empty, the URL-safe Base64 encoding of the whole URL is used instead. The
 * result is located inside `cacheDirectory`.
 *
 * Params:
 *   url = URL of the base.js script
 *
 * Returns: `cacheDirectory` joined with the chosen file name
 */
private string getBaseJSCachePath(string url)
{
    string fileName = url.parseBaseJSKey();
    if(fileName.length == 0)
    {
        // No key could be parsed: fall back to an encoding of the full URL.
        fileName = Base64URL.encode(cast(ubyte[]) url);
    }

    return buildPath(cacheDirectory, fileName);
}
}

unittest
{
writeln("Given SimpleYoutubeVideoURLExtractor, when cache is stale, should redownload HTML");
bool downloadAttempted;
auto downloadAsString = delegate string(string url) {
downloadAttempted = true;
if(url == "https://youtu.be/zoz")
{
downloadAttempted = true;
}
return "zoz.html".readText();
};
auto cache = Cache(new StdoutLogger(), downloadAsString);
Expand All @@ -106,15 +137,17 @@ unittest
writeln("Given SimpleYoutubeVideoURLExtractor, when cache is fresh, should not download HTML");
bool downloadAttempted;
auto downloadAsString = delegate string(string url) {
downloadAttempted = true;
if(url == "https://youtu.be/zoz-fresh")
{
downloadAttempted = true;
}
return "zoz.html".readText();
};
SysTime tomorrow = Clock.currTime() + 1.days;
auto cache = Cache(new StdoutLogger(), downloadAsString);
cache.cacheDirectory = getcwd();

"zoz-fresh.html".write("zoz.html".readText().dup.replace("expire=1638935038", "expire=" ~ tomorrow.toUnixTime().to!string));
"zoz-fresh.js".write("base.min.js".readText());

auto parser = cache.makeParser("https://youtu.be/zoz-fresh", 18);
assert(!downloadAttempted);
Expand All @@ -125,7 +158,10 @@ unittest
writeln("Given AdvancedYoutubeVideoURLExtractor, when cache is stale, should redownload HTML");
bool downloadAttempted;
auto downloadAsString = delegate string(string url) {
downloadAttempted = true;
if(url == "https://youtu.be/dQw4w9WgXcQ")
{
downloadAttempted = true;
}
return "dQw4w9WgXcQ.html".readText();
};
auto cache = Cache(new StdoutLogger(), downloadAsString);
Expand All @@ -140,15 +176,17 @@ unittest
writeln("Given AdvancedYoutubeVideoURLExtractor, when cache is fresh, should not download HTML");
bool downloadAttempted;
auto downloadAsString = delegate string(string url) {
downloadAttempted = true;
if(url == "https://youtu.be/dQw4w9WgXcQ-fresh")
{
downloadAttempted = true;
}
return "dQw4w9WgXcQ-fresh.html".readText();
};
SysTime tomorrow = Clock.currTime() + 1.days;
auto cache = Cache(new StdoutLogger(), downloadAsString);
cache.cacheDirectory = getcwd();

//mock previously cached and fresh files
"dQw4w9WgXcQ-fresh.js".write("base.min.js".readText());
"dQw4w9WgXcQ-fresh.html".write(
"dQw4w9WgXcQ.html".readText().dup.replace("expire%3D1677997809", "expire%3D" ~ tomorrow.toUnixTime().to!string)
);
Expand All @@ -162,13 +200,72 @@ unittest
{
writeln("When forcing refresh, should download HTML");
bool downloadAttempted;
bool baseJSDownloadAttempted;
auto downloadAsString = delegate string(string url) {
downloadAttempted = true;
writeln("downloadAsString : ", url);
if(url == "https://youtu.be/zoz")
{
downloadAttempted = true;
}
if(url == "https://www.youtube.com/s/player/0c96dfd3/player_ias.vflset/ar_EG/base.js")
{
baseJSDownloadAttempted = true;
}
return "zoz.html".readText();
};
auto cache = Cache(new StdoutLogger(), downloadAsString, Yes.forceRefresh);
cache.cacheDirectory = getcwd();

auto parser = cache.makeParser("https://youtu.be/zoz", 18);
assert(downloadAttempted);
assert(baseJSDownloadAttempted);
}

// Verifies that makeParser reuses an existing base.js cache file instead of
// downloading the script again.
unittest
{
    writeln("When base.js is cached, should read from cache");
    // Seed the cache file named after the player key in the URL checked below.
    "0c96dfd3.js".write("base.min.js".readText());
    // Remove the seeded file afterwards so this test leaves nothing behind in
    // the working directory for other tests to trip over.
    scope(exit)
    {
        if("0c96dfd3.js".exists())
        {
            "0c96dfd3.js".remove();
        }
    }

    bool baseJSDownloadAttempted;
    auto downloadAsString = delegate string(string url) {
        if(url == "https://www.youtube.com/s/player/0c96dfd3/player_ias.vflset/ar_EG/base.js")
        {
            // Reaching this branch means the cache was bypassed.
            baseJSDownloadAttempted = true;
            return "0c96dfd3.js".readText();
        }
        return "zoz.html".readText();
    };
    auto cache = Cache(new StdoutLogger(), downloadAsString);
    cache.cacheDirectory = getcwd();

    auto parser = cache.makeParser("https://youtu.be/zoz", 18);
    assert(!baseJSDownloadAttempted);
}

// Verifies that makeParser downloads base.js when no cached copy is present.
unittest
{
    writeln("When base.js is not cached, should download it");
    enum cachedScript = "0c96dfd3.js";
    enum playerURL = "https://www.youtube.com/s/player/0c96dfd3/player_ias.vflset/ar_EG/base.js";

    // Start from a clean slate: make sure no cached base.js exists.
    if(cachedScript.exists())
    {
        cachedScript.remove();
    }
    // Delete the file written by makeParser once the test is over.
    scope(exit)
    {
        cachedScript.remove();
    }

    bool baseJSDownloadAttempted;
    auto downloadAsString = delegate string(string url) {
        if(url != playerURL)
        {
            return "zoz.html".readText();
        }
        baseJSDownloadAttempted = true;
        return "base.min.js".readText();
    };
    auto cache = Cache(new StdoutLogger(), downloadAsString);
    cache.cacheDirectory = getcwd();

    auto parser = cache.makeParser("https://youtu.be/zoz", 18);
    assert(baseJSDownloadAttempted);
}
21 changes: 21 additions & 0 deletions source/helpers.d
Original file line number Diff line number Diff line change
Expand Up @@ -220,3 +220,24 @@ unittest
assert("https://www.youtube.com/shorts/_tT2ldpZHek".parseID() == "_tT2ldpZHek");
assert("qlsdkqsldkj".parseID() == "");
}

/**
 * Extracts the player key from a YouTube base.js URL.
 *
 * A leading "https://" and a leading "www.youtube.com" are stripped when
 * present, the remainder is split on "/", and the element at index 3 is
 * returned. For "/s/player/0c96dfd3/player_ias.vflset/ar_EG/base.js" that is
 * "0c96dfd3".
 *
 * Params:
 *   url = absolute or host-relative URL of base.js
 *
 * Returns: the key, or "" when the path has too few segments to contain one
 */
string parseBaseJSKey(string url)
{
    if(url.startsWith("https://"))
    {
        url = url["https://".length .. $];
    }
    if(url.startsWith("www.youtube.com"))
    {
        url = url["www.youtube.com".length .. $];
    }

    // The leading "/" produces an empty first element, so the key sits at
    // index 3: ["", "s", "player", "<key>", ...].
    string[] segments = url.split("/");

    // Return "" instead of indexing out of bounds on short or malformed
    // input, so callers can test for an empty key.
    return segments.length > 3 ? segments[3] : "";
}

unittest
{
    assert("/s/player/0c96dfd3/player_ias.vflset/ar_EG/base.js".parseBaseJSKey() == "0c96dfd3");
    assert("https://www.youtube.com/s/player/0c96dfd3/player_ias.vflset/ar_EG/base.js".parseBaseJSKey() == "0c96dfd3");
    assert("www.youtube.com/s/player/0c96dfd3/player_ias.vflset/ar_EG/base.js".parseBaseJSKey() == "0c96dfd3");

    // Input without enough path segments yields an empty key rather than a range violation.
    assert("qlsdkqsldkj".parseBaseJSKey() == "");
    assert("".parseBaseJSKey() == "");
    assert("https://www.youtube.com".parseBaseJSKey() == "");
}
Loading