forked from mawenbao/gofeed
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.go
199 lines (181 loc) · 5.89 KB
/
crawler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
package main
import (
"io/ioutil"
"log"
"net/http"
"net/url"
"time"
)
// SendHttpRequest issues a GET request for cache.URL and returns the raw
// response. The caller is responsible for closing the response body.
//
// The request carries the GOFEED_AGENT user agent plus conditional headers
// built from the cache entry: Cache-Control (when cache.CacheControl is set),
// If-Modified-Since (when cache.LastModified is set) and If-None-Match (when
// cache.Etag is set). As a side effect cache.Date is overwritten with the
// current time just before the request is sent. httpTimeout bounds the whole
// exchange via http.Client.Timeout.
//
// Errors from building or sending the request are logged and returned.
func SendHttpRequest(cache *HtmlCache, httpTimeout time.Duration) (resp *http.Response, err error) {
	if *gVerbose {
		log.Printf("start to request %s", cache.URL)
	}

	request, err := http.NewRequest(http.MethodGet, cache.URL.String(), nil)
	if err != nil {
		log.Printf("[ERROR] failed to create http request for %s: %s", cache.URL, err)
		return nil, err
	}

	// Identify ourselves and attach whatever cache validators we have.
	header := request.Header
	header.Set("User-Agent", GOFEED_AGENT)
	if cache.CacheControl != "" {
		header.Set("Cache-Control", cache.CacheControl)
	}
	if cache.LastModified != nil {
		header.Set("If-Modified-Since", cache.LastModified.Format(http.TimeFormat))
	}
	if cache.Etag != "" {
		header.Set("If-None-Match", cache.Etag)
	}

	// The request time doubles as the cache date.
	requestedAt := time.Now()
	cache.Date = &requestedAt

	client := &http.Client{Timeout: httpTimeout}
	resp, err = client.Do(request)
	if err != nil {
		log.Printf("[ERROR] http client failed to send request to %s: %s", cache.URL.String(), err)
	}
	return resp, err
}
// ParseHttpResponse copies the body and the caching-related headers of resp
// into cache. It always closes resp.Body before returning.
//
// Behaviour by response:
//   - A parseable Date header replaces cache.Date; a missing or malformed one
//     leaves the existing cache.Date untouched.
//   - 304 Not Modified sets cache.Status to CACHE_NOT_MODIFIED and changes
//     nothing else.
//   - Any other status reads the body into cache.Html, turns every status
//     except CACHE_NEW into CACHE_MODIFIED, and refreshes CacheControl, Etag,
//     LastModified and Expires from the response. Headers that are absent (or,
//     for the two dates, unparseable) reset the corresponding field to its
//     zero value so stale validators are never kept alongside new content.
//
// The returned error is non-nil only when the body cannot be read; in that
// case cache.Html and cache.Status are left as they were. Malformed date
// headers are logged and otherwise ignored, so a sloppy server (for example
// one sending "Expires: 0") does not make the whole fetch fail.
func ParseHttpResponse(resp *http.Response, cache *HtmlCache) (err error) {
	defer resp.Body.Close()

	// headerTime returns the parsed value of an HTTP date header, or nil when
	// the header is missing or malformed.
	headerTime := func(name string) *time.Time {
		value := resp.Header.Get(name)
		if value == "" {
			return nil
		}
		parsed, perr := http.ParseTime(value)
		if perr != nil {
			log.Printf("[ERROR] error parsing http %s response header %s: %s", name, value, perr)
			return nil
		}
		return &parsed
	}

	// set cache date
	if dateStr := resp.Header.Get("Date"); dateStr != "" {
		if parsed, perr := http.ParseTime(dateStr); perr != nil {
			log.Printf("[ERROR] failed to parse http response Date header %s: %s", dateStr, perr)
		} else {
			cache.Date = &parsed
		}
	}

	if resp.StatusCode == http.StatusNotModified {
		// not modified, use cache
		cache.Status = CACHE_NOT_MODIFIED
		if *gVerbose {
			log.Printf("cache for %s not modified", cache.URL.String())
		}
		return nil
	}

	// Read the body before touching the cache so a failed read cannot leave
	// a truncated document behind.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Printf("[ERROR] failed to read response body for %s: %s", cache.URL.String(), err)
		return err
	}
	cache.Html = body

	// change status of expired cache to modified
	if cache.Status != CACHE_NEW {
		cache.Status = CACHE_MODIFIED
		if *gVerbose {
			log.Printf("cache for %s has been modified", cache.URL.String())
		}
	}

	// Header.Get yields "" for a missing header, which is the reset value.
	cache.CacheControl = resp.Header.Get("Cache-Control")
	cache.Etag = resp.Header.Get("Etag")
	cache.LastModified = headerTime("Last-Modified")
	cache.Expires = headerTime("Expires")
	return nil
}
// FetchHtml returns the HTML cache entry for normalURL, consulting the cache
// database named by feedTar.CacheDB before going to the network.
//
// Decision order:
//  1. No stored entry (or the lookup failed): start a fresh CACHE_NEW entry.
//  2. Stored entry older than feedTar.CacheLifetime (when that is positive):
//     delete it from the database and start a fresh CACHE_NEW entry. A failed
//     delete is returned as an error.
//  3. Stored entry still fresh according to its Cache-Control max-age or its
//     Expires time: return it without any network traffic.
//  4. Otherwise send a request via SendHttpRequest, carrying the entry's
//     validators, and parse the response via ParseHttpResponse.
//
// When the request fails and an old entry exists, the old entry is returned
// with a nil error if *gAlwaysUseCache is set; otherwise the error is
// returned. After a successful fetch a CACHE_NEW entry is inserted and a
// CACHE_MODIFIED entry is updated in the database; other statuses are not
// written back. A database write error is returned together with the
// (still usable) cache entry.
func FetchHtml(normalURL *url.URL, feedTar *FeedTarget) (cache *HtmlCache, err error) {
	dbPath := feedTar.CacheDB
	cacheLifetime := feedTar.CacheLifetime
	httpTimeout := feedTar.HttpTimeout
	rawURL := normalURL.String()

	// try to retrieve html from cache first
	cache, err = GetHtmlCacheByURL(dbPath, rawURL)
	switch {
	case cache == nil || err != nil:
		// cache not found
		cache = &HtmlCache{Status: CACHE_NEW}

	case cache.Date == nil:
		// Without a date neither the lifetime nor max-age can be evaluated;
		// revalidate instead of dereferencing a nil pointer.
		// NOTE(review): assumes stored entries may lack a date; confirm
		// against GetHtmlCacheByURL.
		if *gVerbose {
			log.Printf("cache for %s has no date, will revalidate it", rawURL)
		}

	case cacheLifetime > 0 && cache.Date.Add(cacheLifetime).Before(time.Now()):
		// cache is dead, remove it from cache database, and send new request
		log.Printf("cache for %s is dead, will remove it from cache database %s", rawURL, dbPath)
		err = DelHtmlCacheByURL(dbPath, rawURL)
		if err != nil {
			log.Printf("[ERROR] failed to remove dead cache for %s from database %s", rawURL, dbPath)
			return cache, err
		}
		cache = &HtmlCache{Status: CACHE_NEW}

	default:
		// cache is still alive, check if it has expired
		now := time.Now()
		maxAgeDeadline := cache.Date.Add(time.Second * ExtractMaxAge(cache.CacheControl))
		if now.Before(maxAgeDeadline) || (cache.Expires != nil && now.Before(*cache.Expires)) {
			// cache not expired, reuse it
			if *gDebug {
				log.Printf("[DEBUG] time.Now() %s for %s", now.Local().String(), cache.URL.String())
				log.Printf("[DEBUG] cache.Date %s for %s", cache.Date.Local().String(), cache.URL.String())
				log.Printf("[DEBUG] cache.CacheControl %s for %s", cache.CacheControl, cache.URL.String())
				log.Printf("[DEBUG] cache.Date + MaxAge %s for %s", maxAgeDeadline.Local().String(), cache.URL.String())
				if cache.Expires != nil {
					log.Printf("[DEBUG] cache.Expires %s for %s", cache.Expires.Local().String(), cache.URL.String())
				}
			}
			log.Printf("cache for %s has not expired", cache.URL.String())
			return cache, nil
		}
		// cache has expired
		if *gVerbose {
			log.Printf("cache for %s has expired", cache.URL.String())
		}
	}

	// cache not found, dead or expired, send new request
	cache.URL = normalURL
	resp, err := SendHttpRequest(cache, httpTimeout)
	if err != nil {
		if cache.Status == CACHE_NEW || !*gAlwaysUseCache {
			log.Printf("[ERROR] failed to get %s, just ignore it", rawURL)
			return cache, err
		}
		// just print a warning message, use old cache
		log.Printf("[WARN] failed to get %s, use cache instead", rawURL)
		return cache, nil
	}

	// parse http response (this also closes the response body)
	if err = ParseHttpResponse(resp, cache); err != nil {
		log.Printf("[ERROR] failed parsing response of %s: %s", cache.URL.String(), err)
		return cache, err
	}

	// Persist only new or modified entries; an unmodified cache is left alone.
	switch cache.Status {
	case CACHE_NEW:
		// save html cache
		err = PutHtmlCache(dbPath, []*HtmlCache{cache})
		if err != nil {
			log.Printf("[ERROR] failed to save new cache for %s: %s", cache.URL.String(), err)
		}
	case CACHE_MODIFIED:
		// update html cache
		err = UpdateHtmlCache(dbPath, []*HtmlCache{cache})
		if err != nil {
			log.Printf("[ERROR] failed to update cache for %s: %s", cache.URL.String(), err)
		}
	}
	return cache, err
}