This repository has been archived by the owner on Jul 8, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharchive-bot.hs
33 lines (27 loc) · 1.57 KB
/
archive-bot.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
{-# LANGUAGE ScopedTypeVariables #-}
module Main () where
import Data.List (isPrefixOf)
import Control.Concurrent (forkIO)
import System.IO.Unsafe (unsafeInterleaveIO)
import qualified Data.ByteString.Lazy.Char8 as B (ByteString(), getContents, unpack, words)
import Text.HTML.TagSoup (parseTags, Tag(TagOpen))
import Text.HTML.Download (openURL)
import Control.Exception as E (catch, SomeException)
main :: IO ()
main = mapM_ (forkIO . archiveURL) . concat =<< mapM fetchArticleURLs =<< fmap B.words B.getContents
uniq :: [String] -> [String]
uniq = filter (\x -> x `elem`
["http://wikimediafoundation.org/",
"http://wikimediafoundation.org/wiki/Deductibility_of_donations",
"http://wikimediafoundation.org/wiki/Fundraising",
"http://wikimediafoundation.org/wiki/Privacy_policy",
"http://www.mediawiki.org/",
"http://www.wikimediafoundation.org"])
fetchArticleURLs :: B.ByteString -> IO [String]
fetchArticleURLs article = E.catch (fmap extractURLs $ unsafeInterleaveIO $ openURL("http://en.wikipedia.org/wiki/" ++ B.unpack article))
(\(_ :: SomeException) -> return [])
extractURLs :: String -> [String]
extractURLs arg = uniq $ [x | TagOpen "a" atts <- (parseTags arg), (_,x) <- atts, "http://" `isPrefixOf` x]
archiveURL :: String -> IO ()
archiveURL url = E.catch (openURL("www.webcitation.org/archive?url=" ++ url ++ "&[email protected]") >> return ())
(\(_ :: SomeException) -> return ())