Skip to content

Commit f150610

Browse files
committed
init repo
0 parents  commit f150610

File tree

3 files changed

+368
-0
lines changed

3 files changed

+368
-0
lines changed

LICENSE

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
(The MIT License)
2+
3+
Copyright (c) 2017 Florian Carrere <[email protected]>
4+
5+
Permission is hereby granted, free of charge, to any person obtaining
6+
a copy of this software and associated documentation files (the
7+
'Software'), to deal in the Software without restriction, including
8+
without limitation the rights to use, copy, modify, merge, publish,
9+
distribute, sublicense, and/or sell copies of the Software, and to
10+
permit persons to whom the Software is furnished to do so, subject to
11+
the following conditions:
12+
13+
The above copyright notice and this permission notice shall be
14+
included in all copies or substantial portions of the Software.
15+
16+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
17+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE

README.md

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# goscraper
2+
[Golang](http://golang.org/) package to quickly return a preview of a webpage; you can easily get its title, description, and images.
3+
4+
## Usage
5+
func main() {
6+
s, err := goscraper.Scrape("https://www.w3.org/", 5)
7+
if err != nil {
8+
fmt.Println(err)
9+
return
10+
}
11+
fmt.Printf("Title : %s\n", s.Preview.Title)
12+
fmt.Printf("Description : %s\n", s.Preview.Description)
13+
fmt.Printf("Image: %s\n", s.Preview.Images[0])
14+
fmt.Printf("Url : %s\n", s.Preview.Link)
15+
}
16+
17+
output:
18+
19+
`Title : World Wide Web Consortium (W3C)
20+
Description : The World Wide Web Consortium (W3C) is an international community where Member organizations, a full-time staff, and the public work together to develop Web standards.
21+
Image: https://www.w3.org/2008/site/images/logo-w3c-mobile-lg
22+
Url : https://www.w3.org/`
23+
24+
25+
## License
26+
27+
Goscraper is licensed under the [MIT License](./LICENSE).

goscraper.go

+319
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,319 @@
1+
package goscraper
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
"io"
7+
"net/http"
8+
"net/url"
9+
"regexp"
10+
"strings"
11+
12+
"golang.org/x/net/html"
13+
)
14+
15+
// EscapedFragment is the query-string key of the AJAX-crawling scheme
// ("_escaped_fragment_="): a "#!" URL is rewritten with this key to
// request the pre-rendered version of the page.
var (
	EscapedFragment string = "_escaped_fragment_="
)

// Scraper fetches a web page and follows canonical-link and
// escaped-fragment redirects up to MaxRedirect times.
type Scraper struct {
	Url                *url.URL // URL currently being scraped
	EscapedFragmentUrl *url.URL // URL rewritten with _escaped_fragment_, nil if not applicable
	MaxRedirect        int      // remaining redirect budget, decremented per fetch
}

// Document holds the raw body of a fetched page together with the
// preview extracted from it.
type Document struct {
	Body    bytes.Buffer
	Preview DocumentPreview
}

// DocumentPreview is the summary extracted from a page: title,
// description, image URLs, and the page's final link.
type DocumentPreview struct {
	Title       string
	Description string
	Images      []string
	Link        string
}
36+
37+
func Scrape(uri string, maxRedirect int) (*Document, error) {
38+
u, err := url.Parse(uri)
39+
if err != nil {
40+
return nil, err
41+
}
42+
return (&Scraper{Url: u, MaxRedirect: maxRedirect}).Scrape()
43+
}
44+
45+
func (scraper *Scraper) getUrl() string {
46+
if scraper.EscapedFragmentUrl != nil {
47+
return scraper.EscapedFragmentUrl.String()
48+
}
49+
return scraper.Url.String()
50+
}
51+
52+
func (scraper *Scraper) toFragmentUrl() error {
53+
re := regexp.MustCompile("#!(.*)")
54+
unescapedurl, err := url.QueryUnescape(scraper.Url.String())
55+
if err != nil {
56+
return err
57+
}
58+
matches := re.FindStringSubmatch(unescapedurl)
59+
if len(matches) > 1 {
60+
escapedFragment := EscapedFragment
61+
for _, r := range matches[1] {
62+
b := byte(r)
63+
if avoidByte(b) {
64+
continue
65+
}
66+
if escapeByte(b) {
67+
escapedFragment += url.QueryEscape(string(r))
68+
} else {
69+
escapedFragment += string(r)
70+
}
71+
}
72+
73+
p := "?"
74+
if len(scraper.Url.Query()) > 0 {
75+
p = "&"
76+
}
77+
fragmentUrl, err := url.Parse(strings.Replace(unescapedurl, matches[0], p+escapedFragment, 1))
78+
if err != nil {
79+
return err
80+
}
81+
scraper.EscapedFragmentUrl = fragmentUrl
82+
} else {
83+
p := "?"
84+
if len(scraper.Url.Query()) > 0 {
85+
p = "&"
86+
}
87+
fragmentUrl, err := url.Parse(unescapedurl + p + EscapedFragment)
88+
if err != nil {
89+
return err
90+
}
91+
scraper.EscapedFragmentUrl = fragmentUrl
92+
}
93+
return nil
94+
}
95+
96+
func (scraper *Scraper) getDocument() (*Document, error) {
97+
scraper.MaxRedirect -= 1
98+
if strings.Contains(scraper.Url.String(), "#!") {
99+
scraper.toFragmentUrl()
100+
}
101+
if strings.Contains(scraper.Url.String(), EscapedFragment) {
102+
scraper.EscapedFragmentUrl = scraper.Url
103+
}
104+
105+
req, err := http.NewRequest("GET", scraper.getUrl(), nil)
106+
if err != nil {
107+
return nil, err
108+
}
109+
req.Header.Add("User-Agent", "GoScraper")
110+
111+
resp, err := http.DefaultClient.Do(req)
112+
if resp != nil {
113+
defer resp.Body.Close()
114+
}
115+
if err != nil {
116+
return nil, err
117+
}
118+
119+
dst := bytes.Buffer{}
120+
_, err = io.Copy(&dst, resp.Body)
121+
if err != nil {
122+
return nil, err
123+
}
124+
if resp.Request.URL.String() != scraper.getUrl() {
125+
scraper.EscapedFragmentUrl = nil
126+
scraper.Url = resp.Request.URL
127+
}
128+
doc := &Document{Body: dst, Preview: DocumentPreview{Link: scraper.Url.String()}}
129+
130+
return doc, nil
131+
}
132+
133+
func (scraper *Scraper) parseDocument(doc *Document) error {
134+
t := html.NewTokenizer(&doc.Body)
135+
var ogImage bool
136+
var headPassed bool
137+
var hasFragment bool
138+
var hasCanonical bool
139+
var canonicalUrl *url.URL
140+
doc.Preview.Images = []string{}
141+
// saves previews' link in case that <link rel="canonical"> is found after <meta property="og:url">
142+
link := doc.Preview.Link
143+
for {
144+
tokenType := t.Next()
145+
if tokenType == html.ErrorToken {
146+
return nil
147+
}
148+
if tokenType != html.SelfClosingTagToken && tokenType != html.StartTagToken && tokenType != html.EndTagToken {
149+
continue
150+
}
151+
token := t.Token()
152+
153+
switch token.Data {
154+
case "head":
155+
if tokenType == html.EndTagToken {
156+
headPassed = true
157+
}
158+
case "body":
159+
headPassed = true
160+
161+
case "link":
162+
var canonical bool
163+
var href string
164+
for _, attr := range token.Attr {
165+
if cleanStr(attr.Key) == "rel" && cleanStr(attr.Val) == "canonical" {
166+
canonical = true
167+
}
168+
if cleanStr(attr.Key) == "href" {
169+
href = attr.Val
170+
}
171+
if len(href) > 0 && canonical && link != href {
172+
hasCanonical = true
173+
var err error
174+
canonicalUrl, err = url.Parse(href)
175+
if err != nil {
176+
return err
177+
}
178+
}
179+
}
180+
181+
case "meta":
182+
if len(token.Attr) != 2 {
183+
break
184+
}
185+
if metaFragment(token) && scraper.EscapedFragmentUrl == nil {
186+
hasFragment = true
187+
}
188+
var property string
189+
var content string
190+
for _, attr := range token.Attr {
191+
if cleanStr(attr.Key) == "property" || cleanStr(attr.Key) == "name" {
192+
property = attr.Val
193+
}
194+
if cleanStr(attr.Key) == "content" {
195+
content = attr.Val
196+
}
197+
}
198+
switch cleanStr(property) {
199+
case "og:title":
200+
doc.Preview.Title = content
201+
case "og:description":
202+
doc.Preview.Description = content
203+
case "description":
204+
if len(doc.Preview.Description) == 0 {
205+
doc.Preview.Description = content
206+
}
207+
case "og:url":
208+
doc.Preview.Link = content
209+
case "og:image":
210+
ogImage = true
211+
doc.Preview.Images = []string{content}
212+
213+
}
214+
215+
case "title":
216+
if tokenType == html.StartTagToken {
217+
t.Next()
218+
token = t.Token()
219+
if len(doc.Preview.Title) == 0 {
220+
doc.Preview.Title = token.Data
221+
}
222+
}
223+
224+
case "img":
225+
for _, attr := range token.Attr {
226+
if cleanStr(attr.Key) == "src" {
227+
imgUrl, err := url.Parse(attr.Val)
228+
if err != nil {
229+
return err
230+
}
231+
if !imgUrl.IsAbs() {
232+
doc.Preview.Images = append(doc.Preview.Images, fmt.Sprintf("%s://%s%s", scraper.Url.Scheme, scraper.Url.Host, imgUrl.Path))
233+
} else {
234+
doc.Preview.Images = append(doc.Preview.Images, attr.Val)
235+
}
236+
237+
}
238+
}
239+
}
240+
241+
if hasCanonical && headPassed && scraper.MaxRedirect > 0 {
242+
scraper.Url = canonicalUrl
243+
scraper.EscapedFragmentUrl = nil
244+
fdoc, err := scraper.getDocument()
245+
if err != nil {
246+
return err
247+
}
248+
*doc = *fdoc
249+
return scraper.parseDocument(doc)
250+
}
251+
252+
if hasFragment && headPassed && scraper.MaxRedirect > 0 {
253+
scraper.toFragmentUrl()
254+
fdoc, err := scraper.getDocument()
255+
if err != nil {
256+
return err
257+
}
258+
*doc = *fdoc
259+
return scraper.parseDocument(doc)
260+
}
261+
262+
if len(doc.Preview.Title) > 0 && len(doc.Preview.Description) > 0 && len(doc.Preview.Title) > 0 && ogImage && headPassed {
263+
return nil
264+
}
265+
266+
}
267+
268+
return nil
269+
}
270+
271+
func (scraper *Scraper) Scrape() (*Document, error) {
272+
doc, err := scraper.getDocument()
273+
if err != nil {
274+
return nil, err
275+
}
276+
err = scraper.parseDocument(doc)
277+
if err != nil {
278+
return nil, err
279+
}
280+
return doc, nil
281+
}
282+
283+
// avoidByte reports whether b is an ASCII control byte (0x00-0x1F or
// DEL 0x7F) that is dropped entirely when building an escaped fragment.
func avoidByte(b byte) bool {
	// A byte is unsigned, so the original "i >= 0" test was always true;
	// the int conversion was likewise unnecessary.
	return b <= 31 || b == 127
}
290+
291+
// escapeByte reports whether b must be percent-escaped in an escaped
// fragment: space, '#', '%', '&', '+', or any byte >= 127.
func escapeByte(b byte) bool {
	// "i <= 255" always holds for a byte, so that bound was redundant;
	// named byte literals replace the original magic numbers
	// (32, 35, 37, 38, 43).
	switch b {
	case ' ', '#', '%', '&', '+':
		return true
	}
	return b >= 127
}
298+
299+
func metaFragment(token html.Token) bool {
300+
var name string
301+
var content string
302+
303+
for _, attr := range token.Attr {
304+
if cleanStr(attr.Key) == "name" {
305+
name = attr.Val
306+
}
307+
if cleanStr(attr.Key) == "content" {
308+
content = attr.Val
309+
}
310+
}
311+
if name == "fragment" && content == "!" {
312+
return true
313+
}
314+
return false
315+
}
316+
317+
// cleanStr normalizes an attribute key or value for comparison:
// surrounding whitespace removed, then lower-cased.
func cleanStr(str string) string {
	trimmed := strings.TrimSpace(str)
	return strings.ToLower(trimmed)
}

0 commit comments

Comments
 (0)