-
Notifications
You must be signed in to change notification settings - Fork 82
/
xkcdscrape.R
60 lines (48 loc) · 2.43 KB
/
xkcdscrape.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#####################################################################################
### The following function will go to the xkcd website, scrape the html, look for ###
### a particular part based on a div tag, snag it, and write it out to a file for ###
### subsequent text analysis. ###
### ###
### The function requires only one argument, number, indicating the number of ###
### comic strips. ###
#####################################################################################
### Inspiration from:
### https://github.com/CabbagesAndKings/xkcd-Topics/blob/master/scripts/getTranscripts.sh
xkcdscrape = function(number){
# create url string
url = paste0("http://xkcd.com/", number,'/')
# creates a single element list argument of unparsed html
x = scrape(url, parse=F)
x = as.character(x[[1]]) # not necessary, but simplifies things a bit
# find the point in the text matching the regex
transcript = regexpr('<div id="transcript" style="display: none">(.*?)</div>', x)
# writes what is sent to console (via cat) to a xkcd#.txt file
sink(paste0('xkcdRScrape/xkcd', number, '.txt')) # open connection to a file of the name formed by paste0; create the folder first.
cat(unlist(regmatches(x, transcript))) # send to the console the point specified by transcript in the html text
sink() # close connection
}
library(parallel)
cl = makeCluster(3) # nubmer of cores
clusterEvalQ(cl, library(scrapeR)) # load scrapeR package on the cores
clusterExport(cl, 'xkcdscrape') # export the function to the cores
n = 1283 # number of xkcd comics by late Oct 2013
# a standard non-parallel way to do it without an explicit loop and more straightforward/easier code
# sapply(1:n, xkcdscrape) # each number 1:n is fed to the function
# the parallized version (timed via system.time function)
system.time({
parSapply(cl, 1:n, xkcdscrape)
})
### Example loop for comparison; slow
# Check website for number published
# n = 1283
# library(scrapeR)
# system.time({
# for (i in 1:n){
# url = paste0("http://xkcd.com/", i,'/')
# x = scrape(url, parse=F)
# transcript = regexpr('<div id="transcript" style="display: none">(.*?)</div>', x[[1]])
# sink(paste0('xkcdRScrape/xkcd', i))
# cat(regmatches(x[[1]], transcript))
# sink()
# }
# })